From 2b73c89e98f4129943cb4d428f5895df74475a43 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Tue, 5 Feb 2019 14:18:25 -0800 Subject: [PATCH] TeamCollection: Test the number of teams Call the traceTeamCollectionInfo function to record the team numbers when we add a team directly from the shard information, instead of using addTeamsBestOf logic. --- fdbserver/DataDistribution.actor.cpp | 100 ++++++++++++++++-- fdbserver/QuietDatabase.actor.cpp | 68 +++++++----- fdbserver/tester.actor.cpp | 2 + .../workloads/ConsistencyCheck.actor.cpp | 2 +- 4 files changed, 136 insertions(+), 36 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 50287b78d5..21ca1e7204 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -39,6 +39,16 @@ class TCTeamInfo; struct TCMachineInfo; class TCMachineTeamInfo; +struct TeamBuildInfo { + int currentServerTeamNum; + int desiredServerTeamNum; + int maxServerTeamNum; + + int currentMachineTeamNum; + int desiredMachineTeamNum; + int maxMachineTeamNum; +}; + struct TCServerInfo : public ReferenceCounted { UID id; StorageServerInterface lastKnownInterface; @@ -957,6 +967,7 @@ struct DDTeamCollection : ReferenceCounted { serverIds.push_back(*tempMap->getObject(it)); } self->addTeam(serverIds.begin(), serverIds.end(), true); + self->traceTeamCollectionInfo(); } } else { serverIds.clear(); @@ -985,6 +996,7 @@ struct DDTeamCollection : ReferenceCounted { state std::set>::iterator teamIterEnd = self->primary ? 
initTeams->primaryTeams.end() : initTeams->remoteTeams.end(); for(; teamIter != teamIterEnd; ++teamIter) { self->addTeam(teamIter->begin(), teamIter->end(), true); + self->traceTeamCollectionInfo(); wait( yield() ); } @@ -1560,6 +1572,27 @@ struct DDTeamCollection : ReferenceCounted { return true; } + int calculateHealthyServerCount() { + int serverCount = 0; + for (auto i =server_info.begin(); i != server_info.end(); ++i) { + if (!server_status.get(i->first).isUnhealthy()) { + ++serverCount; + } + } + return serverCount; + } + + int calculateHealthyMachineCount() { + int totalHealthyMachineCount = 0; + for (auto m : machine_info) { + if (isMachineHealthy(m.second)) { + ++totalHealthyMachineCount; + } + } + + return totalHealthyMachineCount; + } + // Create server teams based on machine teams // Before the number of machine teams reaches the threshold, build a machine team for each server team // When it reaches the threshold, first try to build a server team with existing machine teams; if failed, @@ -1585,12 +1618,7 @@ struct DDTeamCollection : ReferenceCounted { ++totalMachineTeamCount; } - int totalHealthyMachineCount = 0; - for (auto m : machine_info) { - if (isMachineHealthy(m.second)) { - ++totalHealthyMachineCount; - } - } + int totalHealthyMachineCount = calculateHealthyMachineCount(); int desiredMachineTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyMachineCount; int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount; @@ -1687,7 +1715,13 @@ struct DDTeamCollection : ReferenceCounted { } } - TraceEvent("AddTeamsBestOf", masterId) +// teamBuildInfo.currentServerTeamNum = teams.size(); +// teamBuildInfo.currentMachineTeamNum = machineTeams.size(); +// teamBuildInfo.desiredMachineTeamNum = desiredMachineTeams; +// teamBuildInfo.maxMachineTeamNum = maxMachineTeams; + + + TraceEvent("TeamCollectionInfo", masterId) .detail("Primary", primary) .detail("AddedTeamNumber", addedTeams) .detail("AimToBuildTeamNumber", 
teamsToBuild) @@ -1699,12 +1733,38 @@ struct DDTeamCollection : ReferenceCounted { .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) .detail("TotalHealthyMachine", totalHealthyMachineCount) - .trackLatest( "AddTeamsBestOf" ); + .trackLatest( "TeamCollectionInfo" ); return addedTeams; } + // Check if the number of server (and machine teams) is larger than the maximum allowed number + void traceTeamCollectionInfo() { + int totalHealthyServerCount = calculateHealthyServerCount(); + int desiredServerTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyServerCount; + int maxServerTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyServerCount; + + int totalHealthyMachineCount = calculateHealthyMachineCount(); + int desiredMachineTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyMachineCount; + int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount; + + TraceEvent("TeamCollectionInfo", masterId) + .detail("Primary", primary) + .detail("AddedTeamNumber", 0) + .detail("AimToBuildTeamNumber", 0) + .detail("CurrentTeamNumber", teams.size()) + .detail("DesiredTeamNumber", desiredServerTeams) + .detail("MaxTeamNumber", maxServerTeams) + .detail("StorageTeamSize", configuration.storageTeamSize) + .detail("CurrentMachineTeamNumber", machineTeams.size()) + .detail("DesiredMachineTeams", desiredMachineTeams) + .detail("MaxMachineTeams", maxMachineTeams) + .detail("TotalHealthyMachine", totalHealthyMachineCount) + .trackLatest( "TeamCollectionInfo" ); + + } + // Use the current set of known processes (from server_info) to compute an optimized set of storage server teams. 
// The following are guarantees of the process: // - Each newly-built team will meet the replication policy @@ -1770,6 +1830,10 @@ struct DDTeamCollection : ReferenceCounted { state vector> builtTeams; +// state struct TeamBuildInfo teamBuildInfo; +// teamBuildInfo.desiredServerTeamNum = desiredTeams; +// teamBuildInfo.maxServerTeamNum = maxTeams; + int addedTeams = self->addTeamsBestOf(teamsToBuild, desiredTeams, maxTeams); if (addedTeams <= 0 && self->teams.size() == 0) { TraceEvent(SevWarn, "NoTeamAfterBuildTeam") @@ -1778,6 +1842,25 @@ struct DDTeamCollection : ReferenceCounted { // Debug: set true for traceAllInfo() to print out more information self->traceAllInfo(); } + } else { + int totalHealthyMachineCount = self->calculateHealthyMachineCount(); + + int desiredMachineTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyMachineCount; + int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount; + + TraceEvent("TeamCollectionInfo", self->masterId) + .detail("Primary", self->primary) + .detail("AddedTeamNumber", 0) + .detail("AimToBuildTeamNumber", teamsToBuild) + .detail("CurrentTeamNumber", self->teams.size()) + .detail("DesiredTeamNumber", desiredTeams) + .detail("MaxTeamNumber", maxTeams) + .detail("StorageTeamSize", self->configuration.storageTeamSize) + .detail("CurrentMachineTeamNumber", self->machineTeams.size()) + .detail("DesiredMachineTeams", desiredMachineTeams) + .detail("MaxMachineTeams", maxMachineTeams) + .detail("TotalHealthyMachine", totalHealthyMachineCount) + .trackLatest( "TeamCollectionInfo" ); } } @@ -2687,6 +2770,7 @@ ACTOR Future storageServerTracker( for(auto it : newBadTeams) { if( self->removeTeam(it) ) { self->addTeam(it->servers, true); + self->traceTeamCollectionInfo(); addedNewBadTeam = true; } } diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 3dd9eab0bb..3e87041c8d 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ 
-254,39 +254,53 @@ ACTOR Future getDataDistributionQueueSize( Database cx, Reference getTeamCollectionValid( Database cx, WorkerInterface masterWorker) { - try { - TraceEvent("GetTeamCollectionValid").detail("Stage", "ContactingMaster"); + state int attempts = 0; + loop { + try { + TraceEvent("GetTeamCollectionValid").detail("Stage", "ContactingMaster"); - TraceEventFields addTeamsBestOfMessage = wait( timeoutError(masterWorker.eventLogRequest.getReply( - EventLogRequest( LiteralStringRef("AddTeamsBestOf") ) ), 1.0 ) ); + TraceEventFields teamCollectionInfoMessage = wait( timeoutError(masterWorker.eventLogRequest.getReply( + EventLogRequest( LiteralStringRef("TeamCollectionInfo") ) ), 1.0 ) ); - TraceEvent("GetTeamCollectionValid").detail("Stage", "GotString"); + TraceEvent("GetTeamCollectionValid").detail("Stage", "GotString"); - int64_t currentTeamNumber; - int64_t desiredTeamNumber; - int64_t maxTeamNumber; - int64_t currentMachineTeamNumber; - int64_t desiredMachineTeamNumber; - int64_t maxMachineTeamNumber; - sscanf(addTeamsBestOfMessage.getValue("CurrentTeamNumber").c_str(), "%lld", &currentTeamNumber); - sscanf(addTeamsBestOfMessage.getValue("DesiredTeamNumber").c_str(), "%lld", &desiredTeamNumber); - sscanf(addTeamsBestOfMessage.getValue("MaxTeamNumber").c_str(), "%lld", &maxTeamNumber); - sscanf(addTeamsBestOfMessage.getValue("CurrentMachineTeamNumber").c_str(), "%lld", &currentMachineTeamNumber); - sscanf(addTeamsBestOfMessage.getValue("DesiredMachineTeams").c_str(), "%lld", &desiredMachineTeamNumber); - sscanf(addTeamsBestOfMessage.getValue("MaxMachineTeams").c_str(), "%lld", &maxMachineTeamNumber); + int64_t currentTeamNumber; + int64_t desiredTeamNumber; + int64_t maxTeamNumber; + int64_t currentMachineTeamNumber; + int64_t desiredMachineTeamNumber; + int64_t maxMachineTeamNumber; + sscanf(teamCollectionInfoMessage.getValue("CurrentTeamNumber").c_str(), "%lld", &currentTeamNumber); + sscanf(teamCollectionInfoMessage.getValue("DesiredTeamNumber").c_str(), "%lld",
&desiredTeamNumber); + sscanf(teamCollectionInfoMessage.getValue("MaxTeamNumber").c_str(), "%lld", &maxTeamNumber); + sscanf(teamCollectionInfoMessage.getValue("CurrentMachineTeamNumber").c_str(), "%lld", &currentMachineTeamNumber); + sscanf(teamCollectionInfoMessage.getValue("DesiredMachineTeams").c_str(), "%lld", &desiredMachineTeamNumber); + sscanf(teamCollectionInfoMessage.getValue("MaxMachineTeams").c_str(), "%lld", &maxMachineTeamNumber); - if (currentTeamNumber > maxTeamNumber || currentMachineTeamNumber > maxMachineTeamNumber) { - printf("getTeamCollectionValid: currentTeamNumber:%ld, desiredTeamNumber:%ld, maxTeamNumber:%ld currentMachineTeamNumber:%ld, desiredMachineTeamNumber:%ld, maxMachineTeamNumber:%ld.", - currentTeamNumber, desiredTeamNumber, maxTeamNumber, currentMachineTeamNumber, desiredMachineTeamNumber, maxMachineTeamNumber); - return false; - } else { - return true; + if (currentTeamNumber > maxTeamNumber || currentMachineTeamNumber > maxMachineTeamNumber) { + printf("getTeamCollectionValid: currentTeamNumber:%ld, desiredTeamNumber:%ld, maxTeamNumber:%ld currentMachineTeamNumber:%ld, desiredMachineTeamNumber:%ld, maxMachineTeamNumber:%ld.", + currentTeamNumber, desiredTeamNumber, maxTeamNumber, currentMachineTeamNumber, desiredMachineTeamNumber, maxMachineTeamNumber); + TraceEvent("GetTeamCollectionValid").detail("CurrentTeamNumber", currentTeamNumber) + .detail("DesiredTeamNumber", desiredTeamNumber).detail("MaxTeamNumber", maxTeamNumber) + .detail("CurrentMachineTeamNumber", currentMachineTeamNumber).detail("DesiredMachineTeams", desiredMachineTeamNumber) + .detail("MaxMachineTeams", maxMachineTeamNumber); + return false; + } else { + return true; + } + + } catch( Error &e ) { + TraceEvent("QuietDatabaseFailure", masterWorker.id()).detail("Reason", "Failed to extract GetTeamCollectionValid information"); + attempts++; + if ( attempts > 10 ) { + TraceEvent("QuietDatabaseNoTeamCollectionInfo", masterWorker.id()).detail("Reason", "Had never called
build team to build any team"); + return true; + } + //throw; + wait( delay(5.0) ); } + }; - } catch( Error &e ) { - TraceEvent("QuietDatabaseFailure", masterWorker.id()).detail("Reason", "Failed to extract GetTeamCollectionValid information"); - throw; - } } //Gets if the number of process and machine teams does not exceed the maximum allowed number of teams diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 49eae1b034..111b1809ad 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -690,6 +690,8 @@ ACTOR Future runWorkload( Database cx, std::vector< Test } } + printf("MX: success:%d failure:%d\n", success, failure); + if( spec.phases & TestWorkload::METRICS ) { state std::vector< Future> > metricTasks; printf("fetching metrics (%s)...\n", printable(spec.title).c_str()); diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 1c29be4338..8c8d811495 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -210,7 +210,7 @@ struct ConsistencyCheckWorkload : TestWorkload bool teamCollectionValid = wait(getTeamCollectionValid(cx, self->dbInfo)); if (!teamCollectionValid) { - TraceEvent("ConsistencyCheck_TooManyTeams"); + TraceEvent(SevError, "ConsistencyCheck_TooManyTeams"); self->testFailure("The number of process or machine teams is larger than the allowed maximum number of teams"); }