diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 26ae8e99b4..667891a14b 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -559,7 +559,6 @@ Future storageServerTracker( Future teamTracker( struct DDTeamCollection* const& self, Reference const& team, bool const& badTeam ); - struct DDTeamCollection : ReferenceCounted { enum { REQUESTING_WORKER = 0, GETTING_WORKER = 1, GETTING_STORAGE = 2 }; @@ -647,24 +646,27 @@ struct DDTeamCollection : ReferenceCounted { return result && resultEntries.size() == 0; } - DDTeamCollection( - Database const& cx, - UID masterId, - MoveKeysLock const& lock, - PromiseStream const& output, - Reference const& shardsAffectedByTeamFailure, - DatabaseConfiguration configuration, - std::vector> includedDCs, - Optional>> otherTrackedDCs, - Optional> >> const& serverChanges, - Future readyToStart, Reference> zeroHealthyTeams, bool primary, - Reference> processingUnhealthy) - :cx(cx), masterId(masterId), lock(lock), output(output), shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), doBuildTeams(true), teamBuilder( Void() ), badTeamRemover( Void() ), redundantTeamRemover( Void() ), - configuration(configuration), serverChanges(serverChanges), readyToStart(readyToStart), checkTeamDelay( delay( SERVER_KNOBS->CHECK_TEAM_DELAY, TaskDataDistribution) ), - initialFailureReactionDelay( delayed( readyToStart, SERVER_KNOBS->INITIAL_FAILURE_REACTION_DELAY, TaskDataDistribution ) ), healthyTeamCount( 0 ), storageServerSet(new LocalityMap()), - initializationDoneActor(logOnCompletion(readyToStart && initialFailureReactionDelay, this)), optimalTeamCount( 0 ), recruitingStream(0), restartRecruiting( SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY ), - unhealthyServers(0), includedDCs(includedDCs), otherTrackedDCs(otherTrackedDCs), zeroHealthyTeams(zeroHealthyTeams), zeroOptimalTeams(true), primary(primary), processingUnhealthy(processingUnhealthy) - { + DDTeamCollection(Database const& cx, UID masterId, MoveKeysLock const& lock, + PromiseStream const& output, + Reference const& shardsAffectedByTeamFailure, + DatabaseConfiguration configuration, std::vector> includedDCs, + Optional>> otherTrackedDCs, + Optional>>> const& serverChanges, + Future readyToStart, Reference> zeroHealthyTeams, bool primary, + Reference> processingUnhealthy) + : cx(cx), masterId(masterId), lock(lock), output(output), + shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), doBuildTeams(true), teamBuilder(Void()), + badTeamRemover(Void()), redundantTeamRemover(Void()), configuration(configuration), + serverChanges(serverChanges), readyToStart(readyToStart), + checkTeamDelay(delay(SERVER_KNOBS->CHECK_TEAM_DELAY, TaskDataDistribution)), + initialFailureReactionDelay( + delayed(readyToStart, SERVER_KNOBS->INITIAL_FAILURE_REACTION_DELAY, TaskDataDistribution)), + healthyTeamCount(0), storageServerSet(new LocalityMap()), + initializationDoneActor(logOnCompletion(readyToStart && initialFailureReactionDelay, this)), + optimalTeamCount(0), recruitingStream(0), restartRecruiting(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY), + unhealthyServers(0), includedDCs(includedDCs), otherTrackedDCs(otherTrackedDCs), + zeroHealthyTeams(zeroHealthyTeams), zeroOptimalTeams(true), primary(primary), + processingUnhealthy(processingUnhealthy) { if(!primary || configuration.usableRegions == 1) { TraceEvent("DDTrackerStarting", masterId) .detail( "State", "Inactive" ) @@ -967,14 +969,6 @@ struct DDTeamCollection : ReferenceCounted { serverIds.push_back(*tempMap->getObject(it)); } self->addTeam(serverIds.begin(), serverIds.end(), true); -// if ( !self->redundantTeamRemover.isReady() ) { -// wait( self->redundantTeamRemover ); -// } -// if ( self->redundantTeamRemover.isReady() ) { -// self->redundantTeamRemover = teamRemover(self); -// self->addActor.send(self->redundantTeamRemover); -// } - self->traceTeamCollectionInfo(); } } else { serverIds.clear(); @@ -990,10 +984,12 @@ struct DDTeamCollection : ReferenceCounted { // NOTE: We may add extra teams in the above logic. // Kick off the teamRemover to run in the future to clean up redundant teams if we end up with too many teams - if ( self->redundantTeamRemover.isReady() ) { - self->redundantTeamRemover = teamRemover(self); - self->addActor.send(self->redundantTeamRemover); + if (self->redundantTeamRemover.isReady()) { + self->redundantTeamRemover = teamRemover(self); + self->addActor.send(self->redundantTeamRemover); } + // Trace and record the current number of teams for correctness test + self->traceTeamCollectionInfo(); return Void(); } @@ -1014,7 +1010,7 @@ struct DDTeamCollection : ReferenceCounted { self->traceTeamCollectionInfo(); wait( yield() ); } - if ( self->redundantTeamRemover.isReady() ) { + if (self->redundantTeamRemover.isReady()) { self->redundantTeamRemover = teamRemover(self); self->addActor.send(self->redundantTeamRemover); } @@ -1118,12 +1114,13 @@ struct DDTeamCollection : ReferenceCounted { addTeam(newTeamServers, isInitialTeam); } - void addTeam(const vector>& newTeamServers, bool isInitialTeam, bool redundantTeam = false) { + void addTeam(const vector>& newTeamServers, bool isInitialTeam, + bool redundantTeam = false) { Reference teamInfo(new TCTeamInfo(newTeamServers)); - bool badTeam = !satisfiesPolicy(teamInfo->servers) || teamInfo->servers.size() != configuration.storageTeamSize || redundantTeam; + bool badTeam = !satisfiesPolicy(teamInfo->servers) || + teamInfo->servers.size() != configuration.storageTeamSize || redundantTeam; - //TODO: MT upgrade: add a bool to force it to be a badTeam teamInfo->tracker = teamTracker(this, teamInfo, badTeam); // ASSERT( teamInfo->serverIDs.size() > 0 ); //team can be empty at DB initialization if (badTeam) { @@ -1213,9 +1210,10 @@ struct DDTeamCollection : ReferenceCounted { } void traceConfigInfo() { - TraceEvent("DDConfig").detail("StorageTeamSize", configuration.storageTeamSize) + TraceEvent("DDConfig") + .detail("StorageTeamSize", configuration.storageTeamSize) .detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) - .detail("MaxTeamsPerServer", SERVER_KNOBS->MAX_TEAMS_PER_SERVER); + .detail("MaxTeamsPerServer", SERVER_KNOBS->MAX_TEAMS_PER_SERVER); } void traceServerInfo() { @@ -1273,9 +1271,10 @@ struct DDTeamCollection : ReferenceCounted { TraceEvent("MachineTeamInfo").detail("Size", machineTeams.size()); for (auto& team : machineTeams) { - TraceEvent("MachineTeamInfo").detail("TeamIndex", i++) - .detail("MachineIDs", team->getMachineIDsStr()) - .detail("ServerTeamNumber", team->serverTeams.size()); + TraceEvent("MachineTeamInfo") + .detail("TeamIndex", i++) + .detail("MachineIDs", team->getMachineIDsStr()) + .detail("ServerTeamNumber", team->serverTeams.size()); } } @@ -1605,7 +1604,7 @@ struct DDTeamCollection : ReferenceCounted { int calculateHealthyServerCount() { int serverCount = 0; - for (auto i =server_info.begin(); i != server_info.end(); ++i) { + for (auto i = server_info.begin(); i != server_info.end(); ++i) { if (!server_status.get(i->first).isUnhealthy()) { ++serverCount; } @@ -1615,7 +1614,7 @@ struct DDTeamCollection : ReferenceCounted { int calculateHealthyMachineCount() { int totalHealthyMachineCount = 0; - for (auto &m : machine_info) { + for (auto& m : machine_info) { if (isMachineHealthy(m.second)) { ++totalHealthyMachineCount; } @@ -1624,28 +1623,29 @@ struct DDTeamCollection : ReferenceCounted { return totalHealthyMachineCount; } - //Sanity check - bool isServerTeamNumberCorrect(Reference &mt) { + // Sanity check + bool isServerTeamNumberCorrect(Reference& mt) { int num = 0; bool ret = true; - for (auto &team : teams) { + for (auto& team : teams) { if (team->machineTeam->machineIDs == mt->machineIDs) { ++num; } } - if ( ret == false ) { - TraceEvent(SevError, "ServerTeamNumberOnMachineIncorrect").detail("MachineTeam", mt->getMachineIDsStr()) - .detail("ServerTeamsSize", mt->serverTeams.size()).detail("CountedServerTeamNumber", num); + if (ret == false) { + TraceEvent(SevError, "ServerTeamNumberOnMachineIncorrect") + .detail("MachineTeam", mt->getMachineIDsStr()) + .detail("ServerTeamsSize", mt->serverTeams.size()) + .detail("CountedServerTeamNumber", num); } return ret; } - // Find the machine team with the least number of server teams - int getMachineTeamWithLeastProcessTeams(Reference &ret) { + int getMachineTeamWithLeastProcessTeams(Reference& ret) { int minNumProcessTeams = std::numeric_limits::max(); - for (auto &mt : machineTeams) { + for (auto& mt : machineTeams) { ASSERT(isServerTeamNumberCorrect(mt)); if (mt->serverTeams.size() < minNumProcessTeams) { minNumProcessTeams = mt->serverTeams.size(); @@ -1657,8 +1657,8 @@ struct DDTeamCollection : ReferenceCounted { } int getHealthyMachineTeamCount() { - int healthyTeamCount = 0; - for (auto mt = machineTeams.begin(); mt != machineTeams.end(); ++mt) { + int healthyTeamCount = 0; + for (auto mt = machineTeams.begin(); mt != machineTeams.end(); ++mt) { ASSERT((*mt)->machines.size() == configuration.storageTeamSize); if (isMachineTeamHealthy(*mt)) { @@ -1667,7 +1667,7 @@ struct DDTeamCollection : ReferenceCounted { } return healthyTeamCount; - } + } // Create server teams based on machine teams // Before the number of machine teams reaches the threshold, build a machine team for each server team @@ -1806,8 +1806,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) .detail("TotalHealthyMachine", totalHealthyMachineCount) - .trackLatest( "TeamCollectionInfo" ); - + .trackLatest("TeamCollectionInfo"); return addedTeams; } @@ -1815,8 +1814,8 @@ struct DDTeamCollection : ReferenceCounted { // Check if the number of server (and machine teams) is larger than the maximum allowed number void traceTeamCollectionInfo() { int totalHealthyServerCount = calculateHealthyServerCount(); - int desiredServerTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyServerCount; - int maxServerTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyServerCount; + int desiredServerTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyServerCount; + int maxServerTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyServerCount; int totalHealthyMachineCount = calculateHealthyMachineCount(); int desiredMachineTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyMachineCount; @@ -1836,10 +1835,10 @@ struct DDTeamCollection : ReferenceCounted { .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) .detail("TotalHealthyMachine", totalHealthyMachineCount) - .trackLatest( "TeamCollectionInfo" ); + .trackLatest("TeamCollectionInfo"); // Debug purpose - if ( healthyMachineTeamCount > desiredMachineTeams || machineTeams.size() > maxMachineTeams ) { + if (healthyMachineTeamCount > desiredMachineTeams || machineTeams.size() > maxMachineTeams) { // When the number of machine teams is over the limit, print out the current team info. traceAllInfo(true); } @@ -1929,19 +1928,19 @@ struct DDTeamCollection : ReferenceCounted { int healthyMachineTeamCount = self->getHealthyMachineTeamCount(); TraceEvent("TeamCollectionInfo", self->masterId) - .detail("Primary", self->primary) - .detail("AddedTeamNumber", 0) - .detail("AimToBuildTeamNumber", teamsToBuild) - .detail("CurrentTeamNumber", self->teams.size()) - .detail("DesiredTeamNumber", desiredTeams) - .detail("MaxTeamNumber", maxTeams) - .detail("StorageTeamSize", self->configuration.storageTeamSize) - .detail("CurrentMachineTeamNumber", self->machineTeams.size()) - .detail("CurrentHealthyMachineTeamNumber", healthyMachineTeamCount) - .detail("DesiredMachineTeams", desiredMachineTeams) - .detail("MaxMachineTeams", maxMachineTeams) - .detail("TotalHealthyMachine", totalHealthyMachineCount) - .trackLatest( "TeamCollectionInfo" ); + .detail("Primary", self->primary) + .detail("AddedTeamNumber", 0) + .detail("AimToBuildTeamNumber", teamsToBuild) + .detail("CurrentTeamNumber", self->teams.size()) + .detail("DesiredTeamNumber", desiredTeams) + .detail("MaxTeamNumber", maxTeams) + .detail("StorageTeamSize", self->configuration.storageTeamSize) + .detail("CurrentMachineTeamNumber", self->machineTeams.size()) + .detail("CurrentHealthyMachineTeamNumber", healthyMachineTeamCount) + .detail("DesiredMachineTeams", desiredMachineTeams) + .detail("MaxMachineTeams", maxMachineTeams) + .detail("TotalHealthyMachine", totalHealthyMachineCount) + .trackLatest("TeamCollectionInfo"); } } @@ -2128,8 +2127,8 @@ struct DDTeamCollection : ReferenceCounted { } } // remove machine team on each machine - for (auto &machine: machine_info) { - for(int i = 0; i < machine.second->machineTeams.size(); ++i) { + for (auto& machine : machine_info) { + for (int i = 0; i < machine.second->machineTeams.size(); ++i) { if (machine.second->machineTeams[i]->machineIDs == targetMT->machineIDs) { machine.second->machineTeams[i--] = machine.second->machineTeams.back(); machine.second->machineTeams.pop_back(); @@ -2238,17 +2237,19 @@ struct DDTeamCollection : ReferenceCounted { ACTOR Future removeBadTeams(DDTeamCollection* self) { wait(self->initialFailureReactionDelay); loop { - while(self->zeroHealthyTeams->get() || self->processingUnhealthy->get()) { + while (self->zeroHealthyTeams->get() || self->processingUnhealthy->get()) { wait(self->zeroHealthyTeams->onChange() || self->processingUnhealthy->onChange()); } - wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY, TaskLowPriority)); //After the team trackers wait on the initial failure reaction delay, they yield. We want to make sure every tracker has had the opportunity to send their relocations to the queue. - if(!self->zeroHealthyTeams->get() && !self->processingUnhealthy->get()) { + // After the team trackers wait on the initial failure reaction delay, they yield. + // We want to make sure every tracker has had the opportunity to send their relocations to the queue. + wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY, TaskLowPriority)); + if (!self->zeroHealthyTeams->get() && !self->processingUnhealthy->get()) { break; } } wait(self->addSubsetComplete.getFuture()); TraceEvent("DDRemovingBadTeams", self->masterId).detail("Primary", self->primary); - for(auto it : self->badTeams) { + for (auto it : self->badTeams) { it->tracker.cancel(); } self->badTeams.clear(); @@ -2260,17 +2261,19 @@ ACTOR Future teamRemover(DDTeamCollection* self) { loop { // Wait on processingUnhealthy as removeBadTeams() does loop { - while(self->zeroHealthyTeams->get() || self->processingUnhealthy->get()) { + while (self->zeroHealthyTeams->get() || self->processingUnhealthy->get()) { wait(self->zeroHealthyTeams->onChange() || self->processingUnhealthy->onChange()); } - wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY, TaskLowPriority)); //After the team trackers wait on the initial failure reaction delay, they yield. We want to make sure every tracker has had the opportunity to send their relocations to the queue. - if(!self->zeroHealthyTeams->get() && !self->processingUnhealthy->get()) { + // After the team trackers wait on the initial failure reaction delay, they yield. + // We want to make sure every tracker has had the opportunity to send their relocations to the queue. + wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY, TaskLowPriority)); + if (!self->zeroHealthyTeams->get() && !self->processingUnhealthy->get()) { break; } } // Wait for the badTeamRemover() to avoid the potential race between adding the bad team (add the team tracker) // and remove bad team (cancel the team tracker). - wait( self->badTeamRemover ); + wait(self->badTeamRemover); // From this point, all machine teams and server teams should be healthy, because we wait above // until processingUnhealthy is done. @@ -2278,22 +2281,23 @@ ACTOR Future teamRemover(DDTeamCollection* self) { // Check if all machines are healthy, if not, we wait for 1 second and loop back. // Eventually, all machines will become healthy. if (totalHealthyMachineCount != self->machine_info.size()) { - wait( delay(1.0) ); + wait(delay(1.0)); continue; } // Sanity check all machine teams are healthy int currentHealthyMTCount = self->getHealthyMachineTeamCount(); if (currentHealthyMTCount != self->machineTeams.size()) { - TraceEvent(SevError, "InvalidAssumption").detail("TotalHealthyMachineCount", totalHealthyMachineCount) - .detail("MachineNumber", self->machine_info.size() ) - .detail("CurrentHealthyMTCount", currentHealthyMTCount) - .detail("MachineTeamNumber", self->machineTeams.size()); + TraceEvent(SevError, "InvalidAssumption") + .detail("TotalHealthyMachineCount", totalHealthyMachineCount) + .detail("MachineNumber", self->machine_info.size()) + .detail("CurrentHealthyMTCount", currentHealthyMTCount) + .detail("MachineTeamNumber", self->machineTeams.size()); self->traceAllInfo(true); } int desiredMachineTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * self->machine_info.size(); - int totalMTCount= self->machineTeams.size(); // all machine teams should be healthy teams at this point + int totalMTCount = self->machineTeams.size(); // all machine teams should be healthy teams at this point if (totalMTCount > desiredMachineTeams) { // Pick the machine team with the least number of server teams and mark it undesired @@ -2312,21 +2316,23 @@ ACTOR Future teamRemover(DDTeamCollection* self) { // The team will be marked as a bad team bool foundTeam = self->removeTeam(team); ASSERT(foundTeam == true); - //removeTeam() has side effect of swapping the last element to the current pos in the serverTeams vector in the machine team. + // removeTeam() has side effect of swapping the last element to the current pos + // in the serverTeams vector in the machine team. --teamIndex; self->addTeam(team->servers, true, true); TEST(true); } - if ( self->badTeamRemover.isReady() ) { + if (self->badTeamRemover.isReady()) { self->badTeamRemover = removeBadTeams(self); self->addActor.send(self->badTeamRemover); } - TraceEvent("TeamRemover").detail("MachineTeamToRemove", mt->getMachineIDsStr()) - .detail("NumProcessTeamsOnTheMachineTeam", minNumProcessTeams) - .detail("CurrentMachineTeamNumber", self->machineTeams.size()) - .detail("DesiredMachineTeam", desiredMachineTeams); + TraceEvent("TeamRemover") + .detail("MachineTeamToRemove", mt->getMachineIDsStr()) + .detail("NumProcessTeamsOnTheMachineTeam", minNumProcessTeams) + .detail("CurrentMachineTeamNumber", self->machineTeams.size()) + .detail("DesiredMachineTeam", desiredMachineTeams); // Remove the machine team bool foundRemovedMachineTeam = self->removeMachineTeam(mt); @@ -2336,11 +2342,11 @@ ACTOR Future teamRemover(DDTeamCollection* self) { if (numMachineTeamRemoved > 0) { // Only trace the information when we remove a machine team TraceEvent("TeamRemoverDone") - .detail("HealthyMachineNumber", totalHealthyMachineCount) - .detail("CurrentHealthyMachineTeamNumber", currentHealthyMTCount) - .detail("CurrentMachineTeamNumber", self->machineTeams.size()) - .detail("DesiredMachineTeam", desiredMachineTeams) - .detail("NumMachineTeamRemoved", numMachineTeamRemoved); + .detail("HealthyMachineNumber", totalHealthyMachineCount) + .detail("CurrentHealthyMachineTeamNumber", currentHealthyMTCount) + .detail("CurrentMachineTeamNumber", self->machineTeams.size()) + .detail("DesiredMachineTeam", desiredMachineTeams) + .detail("NumMachineTeamRemoved", numMachineTeamRemoved); self->traceTeamCollectionInfo(); } diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 9795bc7062..d410b67c27 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -120,7 +120,7 @@ public: double FREE_SPACE_RATIO_CUTOFF; double FREE_SPACE_RATIO_DD_CUTOFF; int DESIRED_TEAMS_PER_SERVER; - int MAX_TEAMS_PER_SERVER;; + int MAX_TEAMS_PER_SERVER; int64_t DD_SHARD_SIZE_GRANULARITY; int64_t DD_SHARD_SIZE_GRANULARITY_SIM; int DD_MOVE_KEYS_PARALLELISM; diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 06367214b3..755b7ddf32 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -252,15 +252,15 @@ ACTOR Future getDataDistributionQueueSize( Database cx, Reference getTeamCollectionValid( Database cx, WorkerInterface masterWorker) { +// Gets if the number of process and machine teams does not exceed the maximum allowed number of teams +ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface masterWorker) { state int attempts = 0; loop { try { TraceEvent("GetTeamCollectionValid").detail("Stage", "ContactingMaster"); - TraceEventFields teamCollectionInfoMessage = wait( timeoutError(masterWorker.eventLogRequest.getReply( - EventLogRequest( LiteralStringRef("TeamCollectionInfo") ) ), 1.0 ) ); + TraceEventFields teamCollectionInfoMessage = wait(timeoutError( + masterWorker.eventLogRequest.getReply(EventLogRequest(LiteralStringRef("TeamCollectionInfo"))), 1.0)); TraceEvent("GetTeamCollectionValid").detail("Stage", "GotString"); @@ -274,47 +274,48 @@ ACTOR Future getTeamCollectionValid( Database cx, WorkerInterface masterWo sscanf(teamCollectionInfoMessage.getValue("CurrentTeamNumber").c_str(), "%lld", ¤tTeamNumber); sscanf(teamCollectionInfoMessage.getValue("DesiredTeamNumber").c_str(), "%lld", &desiredTeamNumber); sscanf(teamCollectionInfoMessage.getValue("MaxTeamNumber").c_str(), "%lld", &maxTeamNumber); - sscanf(teamCollectionInfoMessage.getValue("CurrentMachineTeamNumber").c_str(), "%lld", ¤tMachineTeamNumber); - sscanf(teamCollectionInfoMessage.getValue("CurrentHealthyMachineTeamNumber").c_str(), "%lld", &healthyMachineTeamCount); - sscanf(teamCollectionInfoMessage.getValue("DesiredMachineTeams").c_str(), "%lld", &desiredMachineTeamNumber); + sscanf(teamCollectionInfoMessage.getValue("CurrentMachineTeamNumber").c_str(), "%lld", + ¤tMachineTeamNumber); + sscanf(teamCollectionInfoMessage.getValue("CurrentHealthyMachineTeamNumber").c_str(), "%lld", + &healthyMachineTeamCount); + sscanf(teamCollectionInfoMessage.getValue("DesiredMachineTeams").c_str(), "%lld", + &desiredMachineTeamNumber); sscanf(teamCollectionInfoMessage.getValue("MaxMachineTeams").c_str(), "%lld", &maxMachineTeamNumber); - //if (currentTeamNumber > desiredTeamNumber || currentMachineTeamNumber > desiredMachineTeamNumber) { if (currentMachineTeamNumber > maxMachineTeamNumber || healthyMachineTeamCount > desiredMachineTeamNumber) { -// printf("getTeamCollectionValid: currentTeamNumber:%ld, desiredTeamNumber:%ld, maxTeamNumber:%ld currentMachineTeamNumber:%ld, desiredMachineTeamNumber:%ld, maxMachineTeamNumber:%ld\n", -// currentTeamNumber, desiredTeamNumber, maxTeamNumber, currentMachineTeamNumber, desiredMachineTeamNumber, maxMachineTeamNumber); TraceEvent("GetTeamCollectionValid") .detail("CurrentTeamNumber", currentTeamNumber) - .detail("DesiredTeamNumber", desiredTeamNumber) - .detail("MaxTeamNumber", maxTeamNumber) - .detail("CurrentHealthyMachineTeamNumber", healthyMachineTeamCount) - .detail("DesiredMachineTeams", desiredMachineTeamNumber) - .detail("CurrentMachineTeamNumber", currentMachineTeamNumber) - .detail("MaxMachineTeams", maxMachineTeamNumber); + .detail("DesiredTeamNumber", desiredTeamNumber) + .detail("MaxTeamNumber", maxTeamNumber) + .detail("CurrentHealthyMachineTeamNumber", healthyMachineTeamCount) + .detail("DesiredMachineTeams", desiredMachineTeamNumber) + .detail("CurrentMachineTeamNumber", currentMachineTeamNumber) + .detail("MaxMachineTeams", maxMachineTeamNumber); return false; } else { return true; } - } catch( Error &e ) { - TraceEvent("QuietDatabaseFailure", masterWorker.id()).detail("Reason", "Failed to extract GetTeamCollectionValid information"); + } catch (Error& e) { + TraceEvent("QuietDatabaseFailure", masterWorker.id()) + .detail("Reason", "Failed to extract GetTeamCollectionValid information"); attempts++; - if ( attempts > 10 ) { - TraceEvent("QuietDatabaseNoTeamCollectionInfo", masterWorker.id()).detail("Reason", "Had never called build team to build any team"); + if (attempts > 10) { + TraceEvent("QuietDatabaseNoTeamCollectionInfo", masterWorker.id()) + .detail("Reason", "Had never called build team to build any team"); return true; } - //throw; - wait( delay(10.0) ); + // throw; + wait(delay(10.0)); } }; - } -//Gets if the number of process and machine teams does not exceed the maximum allowed number of teams -//Convenience method that first finds the master worker from a zookeeper interface -ACTOR Future getTeamCollectionValid( Database cx, Reference> dbInfo) { +// Gets if the number of process and machine teams does not exceed the maximum allowed number of teams +// Convenience method that first finds the master worker from a zookeeper interface +ACTOR Future getTeamCollectionValid(Database cx, Reference> dbInfo) { WorkerInterface masterWorker = wait(getMasterWorker(cx, dbInfo)); - bool valid = wait(getTeamCollectionValid( cx, masterWorker)); + bool valid = wait(getTeamCollectionValid(cx, masterWorker)); return valid; } @@ -401,22 +402,22 @@ ACTOR Future waitForQuietDatabase( Database cx, Reference dataDistributionActive = getDataDistributionActive( cx, masterWorker ); state Future storageServersRecruiting = getStorageServersRecruiting ( cx, dbInfo, masterWorker ); - wait( success( dataInFlight ) && success( tLogQueueSize ) && success( dataDistributionQueueSize ) - && success( teamCollectionValid ) && success( storageQueueSize ) - && success( dataDistributionActive ) && success( storageServersRecruiting ) ); + wait(success(dataInFlight) && success(tLogQueueSize) && success(dataDistributionQueueSize) && + success(teamCollectionValid) && success(storageQueueSize) && success(dataDistributionActive) && + success(storageServersRecruiting)); TraceEvent(("QuietDatabase" + phase).c_str()) - .detail("DataInFlight", dataInFlight.get()) - .detail("MaxTLogQueueSize", tLogQueueSize.get()) - .detail("DataDistributionQueueSize", dataDistributionQueueSize.get()) - .detail("TeamCollectionValid", teamCollectionValid.get()) - .detail("MaxStorageQueueSize", storageQueueSize.get()) - .detail("DataDistributionActive", dataDistributionActive.get()) - .detail("StorageServersRecruiting", storageServersRecruiting.get()); + .detail("DataInFlight", dataInFlight.get()) + .detail("MaxTLogQueueSize", tLogQueueSize.get()) + .detail("DataDistributionQueueSize", dataDistributionQueueSize.get()) + .detail("TeamCollectionValid", teamCollectionValid.get()) + .detail("MaxStorageQueueSize", storageQueueSize.get()) + .detail("DataDistributionActive", dataDistributionActive.get()) + .detail("StorageServersRecruiting", storageServersRecruiting.get()); - if ( dataInFlight.get() > dataInFlightGate || tLogQueueSize.get() > maxTLogQueueGate - || dataDistributionQueueSize.get() > maxDataDistributionQueueSize || storageQueueSize.get() > maxStorageServerQueueGate - || dataDistributionActive.get() == false || storageServersRecruiting.get() == true - || teamCollectionValid.get() == false) { + if (dataInFlight.get() > dataInFlightGate || tLogQueueSize.get() > maxTLogQueueGate || + dataDistributionQueueSize.get() > maxDataDistributionQueueSize || + storageQueueSize.get() > maxStorageServerQueueGate || dataDistributionActive.get() == false || + storageServersRecruiting.get() == true || teamCollectionValid.get() == false) { wait( delay( 1.0 ) ); numSuccesses = 0; diff --git a/fdbserver/QuietDatabase.h b/fdbserver/QuietDatabase.h index f21388b38b..faefbf13c8 100644 --- a/fdbserver/QuietDatabase.h +++ b/fdbserver/QuietDatabase.h @@ -32,8 +32,8 @@ Future getDataInFlight( Database const& cx, Reference getMaxTLogQueueSize( Database const& cx, Reference> const& ); Future getMaxStorageServerQueueSize( Database const& cx, Reference> const& ); Future getDataDistributionQueueSize( Database const &cx, Reference> const&, bool const& reportInFlight ); -Future getTeamCollectionValid( Database const &cx, WorkerInterface const& ); -Future getTeamCollectionValid( Database const &cx, Reference> const&); +Future getTeamCollectionValid(Database const& cx, WorkerInterface const&); +Future getTeamCollectionValid(Database const& cx, Reference> const&); Future> getStorageServers( Database const& cx, bool const &use_system_priority = false); Future>> getWorkers( Reference> const& dbInfo, int const& flags = 0 ); Future getMasterWorker( Database const& cx, Reference> const& dbInfo ); diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 8c8d811495..1810a6a7ef 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -206,10 +206,10 @@ struct ConsistencyCheckWorkload : TestWorkload self->testFailure("Non-zero data distribution queue/in-flight size"); } - //Check that the number of process (and machine) teams is no larger than the allowed maximum number of teams + // Check that the number of process (and machine) teams is no larger than + // the allowed maximum number of teams bool teamCollectionValid = wait(getTeamCollectionValid(cx, self->dbInfo)); - if (!teamCollectionValid) - { + if (!teamCollectionValid) { TraceEvent(SevError, "ConsistencyCheck_TooManyTeams"); self->testFailure("The number of process or machine teams is larger than the allowed maximum number of teams"); }