TeamTracker: Fix bug in counting optimalTeamCount

When a teamTracker is cancelled, e.g., by the redundant teamRemover or the
badTeamRemover, we should decrement optimalTeamCount if the team is considered
an optimal team, i.e., every member's machine fitness is no worse than unset
and the team is healthy.
This commit is contained in:
Meng Xu 2019-07-11 14:41:17 -07:00
parent 4c32593f59
commit 221e6945db
2 changed files with 22 additions and 13 deletions

View File

@ -1287,7 +1287,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
} }
// To enable verbose debug info, set shouldPrint to true // To enable verbose debug info, set shouldPrint to true
void traceAllInfo(bool shouldPrint = true) { void traceAllInfo(bool shouldPrint = false) {
if (!shouldPrint) return; if (!shouldPrint) return;
TraceEvent("TraceAllInfo").detail("Primary", primary); TraceEvent("TraceAllInfo").detail("Primary", primary);
@ -1425,7 +1426,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
score += server->machine->machineTeams.size(); score += server->machine->machineTeams.size();
Standalone<StringRef> machine_id = server->lastKnownInterface.locality.zoneId().get(); Standalone<StringRef> machine_id = server->lastKnownInterface.locality.zoneId().get();
machineIDs.push_back(machine_id); machineIDs.push_back(machine_id);
TraceEvent("MachineTeamDebug\n").detail("MachineTeamMember", machine_id); TraceEvent("MachineTeamDebugDetail").detail("MachineTeamMember", machine_id);
} }
@ -2637,6 +2638,8 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
state bool lastZeroHealthy = self->zeroHealthyTeams->get(); state bool lastZeroHealthy = self->zeroHealthyTeams->get();
state bool firstCheck = true; state bool firstCheck = true;
state bool optimal;
if(logTeamEvents) { if(logTeamEvents) {
TraceEvent("TeamTrackerStarting", self->distributorId).detail("Reason", "Initial wait complete (sc)").detail("Team", team->getDesc()); TraceEvent("TeamTrackerStarting", self->distributorId).detail("Reason", "Initial wait complete (sc)").detail("Team", team->getDesc());
} }
@ -2678,7 +2681,7 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
bool healthy = !badTeam && !anyUndesired && serversLeft == self->configuration.storageTeamSize; bool healthy = !badTeam && !anyUndesired && serversLeft == self->configuration.storageTeamSize;
team->setHealthy( healthy ); // Unhealthy teams won't be chosen by bestTeam team->setHealthy( healthy ); // Unhealthy teams won't be chosen by bestTeam
bool optimal = team->isOptimal() && healthy; optimal = team->isOptimal() && healthy;
bool recheck = !healthy && (lastReady != self->initialFailureReactionDelay.isReady() || (lastZeroHealthy && !self->zeroHealthyTeams->get())); bool recheck = !healthy && (lastReady != self->initialFailureReactionDelay.isReady() || (lastZeroHealthy && !self->zeroHealthyTeams->get()));
lastReady = self->initialFailureReactionDelay.isReady(); lastReady = self->initialFailureReactionDelay.isReady();
@ -2855,7 +2858,7 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
TraceEvent("TeamTrackerStopping", self->distributorId).detail("Team", team->getDesc()); TraceEvent("TeamTrackerStopping", self->distributorId).detail("Team", team->getDesc());
} }
self->priority_teams[team->getPriority()]--; self->priority_teams[team->getPriority()]--;
if( team->isHealthy() ) { if(team->isHealthy()) {
self->healthyTeamCount--; self->healthyTeamCount--;
ASSERT( self->healthyTeamCount >= 0 ); ASSERT( self->healthyTeamCount >= 0 );
@ -2864,6 +2867,11 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
self->zeroHealthyTeams->set(true); self->zeroHealthyTeams->set(true);
} }
} }
if (optimal) {
self->optimalTeamCount--;
ASSERT( self->optimalTeamCount >= 0 );
self->zeroOptimalTeams.set(self->optimalTeamCount == 0);
}
throw; throw;
} }
} }
@ -3200,7 +3208,8 @@ ACTOR Future<Void> storageServerTracker(
} }
if( server->lastKnownClass.machineClassFitness( ProcessClass::Storage ) > ProcessClass::UnsetFit ) { if( server->lastKnownClass.machineClassFitness( ProcessClass::Storage ) > ProcessClass::UnsetFit ) {
if( self->optimalTeamCount > 0 ) { // We see a case optimalTeamCount = 1, while healthyTeamCount = 0 in 3 data_hall configuration
if( self->optimalTeamCount > 0 ) { //&& self->healthyTeamCount > 0
TraceEvent(SevWarn, "UndesiredStorageServer", self->distributorId) TraceEvent(SevWarn, "UndesiredStorageServer", self->distributorId)
.detail("Server", server->id) .detail("Server", server->id)
.detail("OptimalTeamCount", self->optimalTeamCount) .detail("OptimalTeamCount", self->optimalTeamCount)

View File

@ -303,15 +303,15 @@ ACTOR Future<bool> getTeamCollectionValid(Database cx, WorkerInterface dataDistr
// Get storage policy // Get storage policy
//state std::string replicationPolicyName = g_simulator.storagePolicy->name(); // Across //state std::string replicationPolicyName = g_simulator.storagePolicy->name(); // Across
state std::string replicationPolicyInfo = g_simulator.storagePolicy->info(); // state std::string replicationPolicyInfo = g_simulator.storagePolicy->info();
// machineID is zoneid by default. If machine concept is not zoneid (say it is data_hall), // // machineID is zoneid by default. If machine concept is not zoneid (say it is data_hall),
// the machine-team logic needs to carefully change to assigne machineID as a different type (say data_hall) // // the machine-team logic needs to carefully change to assigne machineID as a different type (say data_hall)
state bool isMachineIDZoneID = replicationPolicyInfo.find("zoneid") != string::npos; // state bool isMachineIDZoneID = replicationPolicyInfo.find("zoneid") != std::string::npos;
if (!isMachineIDZoneID) { // if (!isMachineIDZoneID) {
TraceEvent(SevWarnAlways, "MachineIDIsNotZoneID"); // TraceEvent(SevWarnAlways, "MachineIDIsNotZoneID");
return true; // return true;
} // }
// The if condition should be consistent with the condition in serverTeamRemover() and // The if condition should be consistent with the condition in serverTeamRemover() and
// machineTeamRemover() that decides if redundant teams exist. // machineTeamRemover() that decides if redundant teams exist.