From a8e2e75cd5afcafa7bb3f8136062e38c21fd6c41 Mon Sep 17 00:00:00 2001 From: Balachandar Namasivayam Date: Thu, 10 Jan 2019 10:28:32 -0800 Subject: [PATCH] Re-enable CheckDesiredClasses after making necessary changes for multi-region setup. Fixed a couple of bugs 1) A rare race condition where a worker is being roles even after it died. 2) Fix how RoleFitness is calculated for TLog and LogRouter. Only worst fitness is compared to see if a better fit is available. --- fdbrpc/Locality.h | 2 +- fdbserver/ClusterController.actor.cpp | 54 +++---- .../workloads/ConsistencyCheck.actor.cpp | 141 +++++++++++++----- 3 files changed, 135 insertions(+), 62 deletions(-) diff --git a/fdbrpc/Locality.h b/fdbrpc/Locality.h index bf5aabb9c4..e98b894dd4 100644 --- a/fdbrpc/Locality.h +++ b/fdbrpc/Locality.h @@ -28,7 +28,7 @@ struct ProcessClass { // This enum is stored in restartInfo.ini for upgrade tests, so be very careful about changing the existing items! enum ClassType { UnsetClass, StorageClass, TransactionClass, ResolutionClass, TesterClass, ProxyClass, MasterClass, StatelessClass, LogClass, ClusterControllerClass, LogRouterClass, InvalidClass = -1 }; enum Fitness { BestFit, GoodFit, UnsetFit, OkayFit, WorstFit, ExcludeFit, NeverAssign }; //cannot be larger than 7 because of leader election mask - enum ClusterRole { Storage, TLog, Proxy, Master, Resolver, LogRouter, ClusterController }; + enum ClusterRole { Storage, TLog, Proxy, Master, Resolver, LogRouter, ClusterController, NoRole }; enum ClassSource { CommandLineSource, AutoSource, DBSource, InvalidSource = -1 }; int16_t _class; int16_t _source; diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index e7a8b36bcc..17900c2fc5 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -438,17 +438,18 @@ public: struct RoleFitness { ProcessClass::Fitness bestFit; ProcessClass::Fitness worstFit; + ProcessClass::ClusterRole role; int count; - RoleFitness(int bestFit, int worstFit, int count) : bestFit((ProcessClass::Fitness)bestFit), worstFit((ProcessClass::Fitness)worstFit), count(count) {} + RoleFitness(int bestFit, int worstFit, int count, ProcessClass::ClusterRole role) : bestFit((ProcessClass::Fitness)bestFit), worstFit((ProcessClass::Fitness)worstFit), count(count), role(role) {} - RoleFitness(int fitness, int count) : bestFit((ProcessClass::Fitness)fitness), worstFit((ProcessClass::Fitness)fitness), count(count) {} + RoleFitness(int fitness, int count, ProcessClass::ClusterRole role) : bestFit((ProcessClass::Fitness)fitness), worstFit((ProcessClass::Fitness)fitness), count(count), role(role) {} - RoleFitness() : bestFit(ProcessClass::NeverAssign), worstFit(ProcessClass::NeverAssign), count(0) {} + RoleFitness() : bestFit(ProcessClass::NeverAssign), worstFit(ProcessClass::NeverAssign), role(ProcessClass::NoRole), count(0) {} - RoleFitness(RoleFitness first, RoleFitness second) : bestFit(std::min(first.worstFit, second.worstFit)), worstFit(std::max(first.worstFit, second.worstFit)), count(first.count + second.count) {} + RoleFitness(RoleFitness first, RoleFitness second, ProcessClass::ClusterRole role) : bestFit(std::min(first.worstFit, second.worstFit)), worstFit(std::max(first.worstFit, second.worstFit)), count(first.count + second.count), role(role) { } - RoleFitness( vector> workers, ProcessClass::ClusterRole role ) { + RoleFitness( vector> workers, ProcessClass::ClusterRole role ) : role(role) { worstFit = ProcessClass::BestFit; bestFit = ProcessClass::NeverAssign; for(auto it : workers) { @@ -459,7 +460,7 @@ public: count = workers.size(); } - RoleFitness( std::vector classes, ProcessClass::ClusterRole role ) { + RoleFitness( std::vector classes, ProcessClass::ClusterRole role ) : role(role) { worstFit = ProcessClass::BestFit; bestFit = ProcessClass::NeverAssign; for(auto it : classes) { @@ -472,7 +473,8 @@ public: bool operator < (RoleFitness const& r) const { if (worstFit != r.worstFit) return worstFit < r.worstFit; - if (bestFit != r.bestFit) return bestFit < r.bestFit; + // FIXME: TLog recruitment process does not guarantee the best fit is not worsened. + if ((role != ProcessClass::TLog || role != ProcessClass::LogRouter) && bestFit != r.bestFit) return bestFit < r.bestFit; return count > r.count; } @@ -489,7 +491,7 @@ public: bool operator == (RoleFitness const& r) const { return worstFit == r.worstFit && bestFit == r.bestFit && count == r.count; } - std::string toString() const { return format("%d %d &d", bestFit, worstFit, count); } + std::string toString() const { return format("%d %d %d", bestFit, worstFit, count); } }; std::set>> getDatacenters( DatabaseConfiguration const& conf, bool checkStable = false ) { @@ -532,8 +534,8 @@ public: } if( now() - remoteStartTime.get() < SERVER_KNOBS->WAIT_FOR_GOOD_REMOTE_RECRUITMENT_DELAY && - ( ( RoleFitness(SERVER_KNOBS->EXPECTED_TLOG_FITNESS, req.configuration.getDesiredRemoteLogs()).betterCount(RoleFitness(remoteLogs, ProcessClass::TLog)) ) || - ( RoleFitness(SERVER_KNOBS->EXPECTED_LOG_ROUTER_FITNESS, req.logRouterCount).betterCount(RoleFitness(logRouters, ProcessClass::LogRouter)) ) ) ) { + ( ( RoleFitness(SERVER_KNOBS->EXPECTED_TLOG_FITNESS, req.configuration.getDesiredRemoteLogs(), ProcessClass::TLog).betterCount(RoleFitness(remoteLogs, ProcessClass::TLog)) ) || + ( RoleFitness(SERVER_KNOBS->EXPECTED_LOG_ROUTER_FITNESS, req.logRouterCount, ProcessClass::LogRouter).betterCount(RoleFitness(logRouters, ProcessClass::LogRouter)) ) ) ) { throw operation_failed(); } @@ -600,10 +602,10 @@ public: } if( now() - startTime < SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY && - ( RoleFitness(SERVER_KNOBS->EXPECTED_TLOG_FITNESS, req.configuration.getDesiredLogs()).betterCount(RoleFitness(tlogs, ProcessClass::TLog)) || - ( region.satelliteTLogReplicationFactor > 0 && RoleFitness(SERVER_KNOBS->EXPECTED_TLOG_FITNESS, req.configuration.getDesiredSatelliteLogs(dcId)).betterCount(RoleFitness(satelliteLogs, ProcessClass::TLog)) ) || - RoleFitness(SERVER_KNOBS->EXPECTED_PROXY_FITNESS, req.configuration.getDesiredProxies()).betterCount(RoleFitness(proxies, ProcessClass::Proxy)) || - RoleFitness(SERVER_KNOBS->EXPECTED_RESOLVER_FITNESS, req.configuration.getDesiredResolvers()).betterCount(RoleFitness(resolvers, ProcessClass::Resolver)) ) ) { + ( RoleFitness(SERVER_KNOBS->EXPECTED_TLOG_FITNESS, req.configuration.getDesiredLogs(), ProcessClass::TLog).betterCount(RoleFitness(tlogs, ProcessClass::TLog)) || + ( region.satelliteTLogReplicationFactor > 0 && RoleFitness(SERVER_KNOBS->EXPECTED_TLOG_FITNESS, req.configuration.getDesiredSatelliteLogs(dcId), ProcessClass::TLog).betterCount(RoleFitness(satelliteLogs, ProcessClass::TLog)) ) || + RoleFitness(SERVER_KNOBS->EXPECTED_PROXY_FITNESS, req.configuration.getDesiredProxies(), ProcessClass::Proxy).betterCount(RoleFitness(proxies, ProcessClass::Proxy)) || + RoleFitness(SERVER_KNOBS->EXPECTED_RESOLVER_FITNESS, req.configuration.getDesiredResolvers(), ProcessClass::Resolver).betterCount(RoleFitness(resolvers, ProcessClass::Resolver)) ) ) { return operation_failed(); } @@ -705,7 +707,7 @@ public: proxies.push_back(first_proxy.worker); resolvers.push_back(first_resolver.worker); - auto fitness = RoleFitness( RoleFitness(proxies, ProcessClass::Proxy), RoleFitness(resolvers, ProcessClass::Resolver) ); + auto fitness = RoleFitness( RoleFitness(proxies, ProcessClass::Proxy), RoleFitness(resolvers, ProcessClass::Resolver), ProcessClass::NoRole ); if(dcId == clusterControllerDcId) { bestFitness = fitness; @@ -750,8 +752,8 @@ public: .detail("DesiredResolvers", req.configuration.getDesiredResolvers()).detail("ActualResolvers", result.resolvers.size()); if( now() - startTime < SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY && - ( RoleFitness(SERVER_KNOBS->EXPECTED_TLOG_FITNESS, req.configuration.getDesiredLogs()).betterCount(RoleFitness(tlogs, ProcessClass::TLog)) || - RoleFitness(std::min(SERVER_KNOBS->EXPECTED_PROXY_FITNESS, SERVER_KNOBS->EXPECTED_RESOLVER_FITNESS), std::max(SERVER_KNOBS->EXPECTED_PROXY_FITNESS, SERVER_KNOBS->EXPECTED_RESOLVER_FITNESS), req.configuration.getDesiredProxies()+req.configuration.getDesiredResolvers()).betterCount(bestFitness) ) ) { + ( RoleFitness(SERVER_KNOBS->EXPECTED_TLOG_FITNESS, req.configuration.getDesiredLogs(), ProcessClass::TLog).betterCount(RoleFitness(tlogs, ProcessClass::TLog)) || + RoleFitness(std::min(SERVER_KNOBS->EXPECTED_PROXY_FITNESS, SERVER_KNOBS->EXPECTED_RESOLVER_FITNESS), std::max(SERVER_KNOBS->EXPECTED_PROXY_FITNESS, SERVER_KNOBS->EXPECTED_RESOLVER_FITNESS), req.configuration.getDesiredProxies()+req.configuration.getDesiredResolvers(), ProcessClass::NoRole).betterCount(bestFitness) ) ) { throw operation_failed(); } @@ -945,10 +947,11 @@ public: return false; RoleFitness oldRemoteTLogFit(remote_tlogs, ProcessClass::TLog); - RoleFitness newRemoteTLogFit((db.config.usableRegions > 1 && dbi.recoveryState == RecoveryState::FULLY_RECOVERED) ? getWorkersForTlogs(db.config, db.config.getRemoteTLogReplicationFactor(), db.config.getDesiredRemoteLogs(), db.config.getRemoteTLogPolicy(), id_used, true, remoteDC) : remote_tlogs, ProcessClass::TLog); - + RoleFitness newRemoteTLogFit( + (db.config.usableRegions > 1 && dbi.recoveryState == RecoveryState::FULLY_RECOVERED) ? + getWorkersForTlogs(db.config, db.config.getRemoteTLogReplicationFactor(), db.config.getDesiredRemoteLogs(), db.config.getRemoteTLogPolicy(), id_used, true, remoteDC) + : remote_tlogs, ProcessClass::TLog); if(oldRemoteTLogFit < newRemoteTLogFit) return false; - int oldRouterCount = oldTLogFit.count * std::max(1, db.config.desiredLogRouterCount / std::max(1,oldTLogFit.count)); int newRouterCount = newTLogFit.count * std::max(1, db.config.desiredLogRouterCount / std::max(1,newTLogFit.count)); RoleFitness oldLogRoutersFit(log_routers, ProcessClass::LogRouter); @@ -960,11 +963,9 @@ public: if(newLogRoutersFit.count < newRouterCount) { newLogRoutersFit.worstFit = ProcessClass::NeverAssign; } - if(oldLogRoutersFit < newLogRoutersFit) return false; - // Check proxy/resolver fitness - RoleFitness oldInFit(RoleFitness(proxyClasses, ProcessClass::Proxy), RoleFitness(resolverClasses, ProcessClass::Resolver)); + RoleFitness oldInFit(RoleFitness(proxyClasses, ProcessClass::Proxy), RoleFitness(resolverClasses, ProcessClass::Resolver), ProcessClass::NoRole); auto first_resolver = getWorkerForRoleInDatacenter( clusterControllerDcId, ProcessClass::Resolver, ProcessClass::ExcludeFit, db.config, id_used, true ); auto first_proxy = getWorkerForRoleInDatacenter( clusterControllerDcId, ProcessClass::Proxy, ProcessClass::ExcludeFit, db.config, id_used, true ); @@ -974,10 +975,8 @@ public: proxies.push_back(first_proxy.worker); resolvers.push_back(first_resolver.worker); - RoleFitness newInFit(RoleFitness(proxies, ProcessClass::Proxy), RoleFitness(resolvers, ProcessClass::Resolver)); - + RoleFitness newInFit(RoleFitness(proxies, ProcessClass::Proxy), RoleFitness(resolvers, ProcessClass::Resolver), ProcessClass::NoRole); if(oldInFit.betterFitness(newInFit)) return false; - if(oldTLogFit > newTLogFit || oldInFit > newInFit || (oldSatelliteFallback && !newSatelliteFallback) || oldSatelliteTLogFit > newSatelliteTLogFit || oldRemoteTLogFit > newRemoteTLogFit || oldLogRoutersFit > newLogRoutersFit) { TraceEvent("BetterMasterExists", id).detail("OldMasterFit", oldMasterFit).detail("NewMasterFit", mworker.fitness) .detail("OldTLogFit", oldTLogFit.toString()).detail("NewTLogFit", newTLogFit.toString()) @@ -1321,6 +1320,9 @@ ACTOR Future rebootAndCheck( ClusterControllerData* cluster, Optional workerAvailabilityWatch( WorkerInterface worker, ProcessClass startingClass, ClusterControllerData* cluster ) { state Future failed = worker.address() == g_network->getLocalAddress() ? Never() : waitFailureClient( worker.waitFailure, SERVER_KNOBS->WORKER_FAILURE_TIME ); cluster->updateWorkerList.set( worker.locality.processId(), ProcessData(worker.locality, startingClass, worker.address()) ); + // This switching avoids a race where the worker can be added to id_worker map after the workerAvailabilityWatch fails for the worker. + wait(delay(0)); + loop { choose { when( wait( IFailureMonitor::failureMonitor().onStateEqual( worker.storage.getEndpoint(), FailureStatus(IFailureMonitor::failureMonitor().getState( worker.storage.getEndpoint() ).isAvailable()) ) ) ) { diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 047aee1cac..ca06faab63 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -242,10 +242,9 @@ struct ConsistencyCheckWorkload : TestWorkload bool hasExtraStores = wait( self->checkForExtraDataStores(cx, self) ); //Check that each machine is operating as its desired class - //FIXME: re-enable - //bool usingDesiredClasses = wait(self->checkUsingDesiredClasses(cx, self)); - //if(!usingDesiredClasses) - // self->testFailure("Cluster has machine(s) not using requested classes"); + bool usingDesiredClasses = wait(self->checkUsingDesiredClasses(cx, self)); + if(!usingDesiredClasses) + self->testFailure("Cluster has machine(s) not using requested classes"); bool workerListCorrect = wait( self->checkWorkerList(cx, self) ); if(!workerListCorrect) @@ -1202,7 +1201,7 @@ struct ConsistencyCheckWorkload : TestWorkload return true; } - static ProcessClass::Fitness getBestAvailableFitness(std::set& availableClassTypes, ProcessClass::ClusterRole role) { + static ProcessClass::Fitness getBestAvailableFitness(std::vector& availableClassTypes, ProcessClass::ClusterRole role) { ProcessClass::Fitness bestAvailableFitness = ProcessClass::NeverAssign; for (auto classType : availableClassTypes) { bestAvailableFitness = std::min(bestAvailableFitness, ProcessClass(classType, ProcessClass::InvalidSource).machineClassFitness(role)); @@ -1211,67 +1210,139 @@ struct ConsistencyCheckWorkload : TestWorkload return bestAvailableFitness; } + template + static std::string getOptionalString(Optional opt) { + if (opt.present()) + return opt.get().toString(); + return "NotSet"; + } + + typedef std::pair WorkerClassPair; //Returns true if all machines in the cluster that specified a desired class are operating in that class - ACTOR Future checkUsingDesiredClasses(Database cx, ConsistencyCheckWorkload *self) - { - state vector> allWorkers = wait( getWorkers( self->dbInfo ) ); - state vector> nonExcludedWorkers = wait( getWorkers( self->dbInfo, GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY ) ); - state vector storageServers = wait( getStorageServers( cx ) ); + ACTOR Future checkUsingDesiredClasses(Database cx, ConsistencyCheckWorkload *self) { + state Optional expectedPrimaryDcId; + state Optional expectedRemoteDcId; + state DatabaseConfiguration config = wait(getDatabaseConfiguration(cx)); + state vector allWorkers = wait(getWorkers(self->dbInfo)); + state vector nonExcludedWorkers = wait(getWorkers(self->dbInfo, GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY)); auto& db = self->dbInfo->get(); - - std::set allClassTypes; - std::map allWorkerProcessMap; + + std::map allWorkerProcessMap; + std::map, std::vector> dcToAllClassTypes; for (auto worker : allWorkers) { - allClassTypes.insert(worker.second.classType()); - allWorkerProcessMap[worker.first.address()] = worker.second; + allWorkerProcessMap[worker.first.address()] = worker; + Optional dc = worker.first.locality._data[LocalityData::keyDcId]; + if (!dcToAllClassTypes.count(dc)) + dcToAllClassTypes.insert({}); + dcToAllClassTypes[dc].push_back(worker.second.classType()); } - std::set nonExcludedClassTypes; - std::map nonExcludedWorkerProcessMap; + std::map nonExcludedWorkerProcessMap; + std::map, std::vector> dcToNonExcludedClassTypes; for (auto worker : nonExcludedWorkers) { - nonExcludedClassTypes.insert(worker.second.classType()); - nonExcludedWorkerProcessMap[worker.first.address()] = worker.second; + nonExcludedWorkerProcessMap[worker.first.address()] = worker; + Optional dc = worker.first.locality._data[LocalityData::keyDcId]; + if (!dcToNonExcludedClassTypes.count(dc)) + dcToNonExcludedClassTypes.insert({}); + dcToNonExcludedClassTypes[dc].push_back(worker.second.classType()); } - // Check cluster controller - ProcessClass::Fitness bestClusterControllerFitness = getBestAvailableFitness(nonExcludedClassTypes, ProcessClass::ClusterController); - if (!nonExcludedWorkerProcessMap.count(db.clusterInterface.clientInterface.address()) || nonExcludedWorkerProcessMap[db.clusterInterface.clientInterface.address()].machineClassFitness(ProcessClass::ClusterController) != bestClusterControllerFitness) { - TraceEvent("ConsistencyCheck_ClusterControllerNotBest").detail("BestClusterControllerFitness", bestClusterControllerFitness).detail("ExistingClusterControllerFit", nonExcludedWorkerProcessMap.count(db.clusterInterface.clientInterface.address()) ? nonExcludedWorkerProcessMap[db.clusterInterface.clientInterface.address()].machineClassFitness(ProcessClass::ClusterController) : -1); + if (!allWorkerProcessMap.count(db.clusterInterface.clientInterface.address())) { + TraceEvent("ConsistencyCheck_CCNotInWorkerList").detail("CCAddress", db.clusterInterface.clientInterface.address().toString()); + return false; + } + if (!allWorkerProcessMap.count(db.master.address())) { + TraceEvent("ConsistencyCheck_MasterNotInWorkerList").detail("MasterAddress", db.master.address().toString()); return false; } - // Check master - ProcessClass::Fitness bestMasterFitness = getBestAvailableFitness(nonExcludedClassTypes, ProcessClass::Master); + Optional ccDcId = allWorkerProcessMap[db.clusterInterface.clientInterface.address()].first.locality._data[LocalityData::keyDcId]; + Optional masterDcId = allWorkerProcessMap[db.master.address()].first.locality._data[LocalityData::keyDcId]; + + if (ccDcId != masterDcId) { + TraceEvent("ConsistencyCheck_CCAndMasterNotInSameDC").detail("ClusterControllerDcId", getOptionalString(ccDcId)).detail("MasterDcId", getOptionalString(masterDcId)); + return false; + } + // Check if master and cluster controller are in the desired DC for fearless cluster when running under simulation + // FIXME: g_simulator.datacenterDead could return false positives. Relaxing checks until it is fixed. + if (g_network->isSimulated() && config.usableRegions> 1 && g_simulator.primaryDcId.present() && + !g_simulator.datacenterDead(g_simulator.primaryDcId) && !g_simulator.datacenterDead(g_simulator.remoteDcId)) { + expectedPrimaryDcId = config.regions[0].dcId; + expectedRemoteDcId = config.regions[1].dcId; + // If the priorities are equal, either could be the primary + if (config.regions[0].priority == config.regions[1].priority) { + expectedPrimaryDcId = masterDcId; + expectedRemoteDcId = config.regions[0].dcId == expectedPrimaryDcId.get() ? config.regions[1].dcId : config.regions[0].dcId; + } + + if (ccDcId != expectedPrimaryDcId) { + TraceEvent("ConsistencyCheck_ClusterControllerDcNotBest").detail("PreferredDcId", getOptionalString(expectedPrimaryDcId)).detail("ExistingDcId", getOptionalString(ccDcId)); + return false; + } + if (masterDcId != expectedPrimaryDcId) { + TraceEvent("ConsistencyCheck_MasterDcNotBest").detail("PreferredDcId", getOptionalString(expectedPrimaryDcId)).detail("ExistingDcId", getOptionalString(masterDcId)); + return false; + } + } + + // Check CC + ProcessClass::Fitness bestClusterControllerFitness = getBestAvailableFitness(dcToNonExcludedClassTypes[ccDcId], ProcessClass::ClusterController); + if (!nonExcludedWorkerProcessMap.count(db.clusterInterface.clientInterface.address()) || nonExcludedWorkerProcessMap[db.clusterInterface.clientInterface.address()].second.machineClassFitness(ProcessClass::ClusterController) != bestClusterControllerFitness) { + TraceEvent("ConsistencyCheck_ClusterControllerNotBest").detail("BestClusterControllerFitness", bestClusterControllerFitness).detail("ExistingClusterControllerFit", nonExcludedWorkerProcessMap.count(db.clusterInterface.clientInterface.address()) ? nonExcludedWorkerProcessMap[db.clusterInterface.clientInterface.address()].second.machineClassFitness(ProcessClass::ClusterController) : -1); + return false; + } + + // Check Master + ProcessClass::Fitness bestMasterFitness = getBestAvailableFitness(dcToNonExcludedClassTypes[masterDcId], ProcessClass::Master); if (bestMasterFitness == ProcessClass::NeverAssign) { - bestMasterFitness = getBestAvailableFitness(allClassTypes, ProcessClass::Master); + bestMasterFitness = getBestAvailableFitness(dcToAllClassTypes[masterDcId], ProcessClass::Master); if (bestMasterFitness != ProcessClass::NeverAssign) { bestMasterFitness = ProcessClass::ExcludeFit; } } - if (!allWorkerProcessMap.count(db.master.address()) || (!nonExcludedWorkerProcessMap.count(db.master.address()) && bestMasterFitness != ProcessClass::ExcludeFit) || nonExcludedWorkerProcessMap[db.master.address()].machineClassFitness(ProcessClass::Master) != bestMasterFitness) { - TraceEvent("ConsistencyCheck_MasterNotBest").detail("BestMasterFitness", bestMasterFitness).detail("ExistingMasterFit", nonExcludedWorkerProcessMap.count(db.master.address()) ? nonExcludedWorkerProcessMap[db.master.address()].machineClassFitness(ProcessClass::Master) : -1); + if ((!nonExcludedWorkerProcessMap.count(db.master.address()) && bestMasterFitness != ProcessClass::ExcludeFit) || nonExcludedWorkerProcessMap[db.master.address()].second.machineClassFitness(ProcessClass::Master) != bestMasterFitness) { + TraceEvent("ConsistencyCheck_MasterNotBest").detail("BestMasterFitness", bestMasterFitness).detail("ExistingMasterFit", nonExcludedWorkerProcessMap.count(db.master.address()) ? nonExcludedWorkerProcessMap[db.master.address()].second.machineClassFitness(ProcessClass::Master) : -1); return false; } - // Check master proxy - ProcessClass::Fitness bestMasterProxyFitness = getBestAvailableFitness(nonExcludedClassTypes, ProcessClass::Proxy); + // Check proxy + ProcessClass::Fitness bestMasterProxyFitness = getBestAvailableFitness(dcToNonExcludedClassTypes[masterDcId], ProcessClass::Proxy); for (auto masterProxy : db.client.proxies) { - if (!nonExcludedWorkerProcessMap.count(masterProxy.address()) || nonExcludedWorkerProcessMap[masterProxy.address()].machineClassFitness(ProcessClass::Proxy) != bestMasterProxyFitness) { - TraceEvent("ConsistencyCheck_ProxyNotBest").detail("BestMasterProxyFitness", bestMasterProxyFitness).detail("ExistingMasterProxyFitness", nonExcludedWorkerProcessMap.count(masterProxy.address()) ? nonExcludedWorkerProcessMap[masterProxy.address()].machineClassFitness(ProcessClass::Proxy) : -1); + if (!nonExcludedWorkerProcessMap.count(masterProxy.address()) || nonExcludedWorkerProcessMap[masterProxy.address()].second.machineClassFitness(ProcessClass::Proxy) != bestMasterProxyFitness) { + TraceEvent("ConsistencyCheck_ProxyNotBest").detail("BestMasterProxyFitness", bestMasterProxyFitness).detail("ExistingMasterProxyFitness", nonExcludedWorkerProcessMap.count(masterProxy.address()) ? nonExcludedWorkerProcessMap[masterProxy.address()].second.machineClassFitness(ProcessClass::Proxy) : -1); return false; } } // Check resolver - ProcessClass::Fitness bestResolverFitness = getBestAvailableFitness(nonExcludedClassTypes, ProcessClass::Resolver); + ProcessClass::Fitness bestResolverFitness = getBestAvailableFitness(dcToNonExcludedClassTypes[masterDcId], ProcessClass::Resolver); for (auto resolver : db.resolvers) { - if (!nonExcludedWorkerProcessMap.count(resolver.address()) || nonExcludedWorkerProcessMap[resolver.address()].machineClassFitness(ProcessClass::Resolver) != bestResolverFitness) { - TraceEvent("ConsistencyCheck_ResolverNotBest").detail("BestResolverFitness", bestResolverFitness).detail("ExistingResolverFitness", nonExcludedWorkerProcessMap.count(resolver.address()) ? nonExcludedWorkerProcessMap[resolver.address()].machineClassFitness(ProcessClass::Resolver) : -1); + if (!nonExcludedWorkerProcessMap.count(resolver.address()) || nonExcludedWorkerProcessMap[resolver.address()].second.machineClassFitness(ProcessClass::Resolver) != bestResolverFitness) { + TraceEvent("ConsistencyCheck_ResolverNotBest").detail("BestResolverFitness", bestResolverFitness).detail("ExistingResolverFitness", nonExcludedWorkerProcessMap.count(resolver.address()) ? nonExcludedWorkerProcessMap[resolver.address()].second.machineClassFitness(ProcessClass::Resolver) : -1); return false; } } + // Check LogRouter + if (g_network->isSimulated() && config.usableRegions> 1 && g_simulator.primaryDcId.present() && + !g_simulator.datacenterDead(g_simulator.primaryDcId) && !g_simulator.datacenterDead(g_simulator.remoteDcId)) { + for (auto &tlogSet : db.logSystemConfig.tLogs) { + if (!tlogSet.isLocal && tlogSet.logRouters.size()) { + for (auto &logRouter : tlogSet.logRouters) { + if (!nonExcludedWorkerProcessMap.count(logRouter.interf().address())) { + TraceEvent("ConsistencyCheck_LogRouterNotInNonExcludedWorkers").detail("Id", logRouter.id()); + return false; + } + if (logRouter.interf().locality.dcId() != expectedRemoteDcId) { + TraceEvent("ConsistencyCheck_LogRouterNotBestDC").detail("expectedDC", getOptionalString(expectedRemoteDcId)).detail("ActualDC", getOptionalString(logRouter.interf().locality.dcId())); + return false; + } + } + } + } + } + // TODO: Check Tlog return true;