TeamTracker: Fix bug in counting optimalTeamCount
When a teamTracker is cancelled, e.g., by the redundant teamRemover or badTeamRemover, we should decrease optimalTeamCount if the team is considered an optimal team, i.e., every member's machine fitness is no worse than unset and the team is healthy.
This commit is contained in:
parent
4c32593f59
commit
221e6945db
|
@ -1287,7 +1287,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
||||||
}
|
}
|
||||||
|
|
||||||
// To enable verbose debug info, set shouldPrint to true
|
// To enable verbose debug info, set shouldPrint to true
|
||||||
void traceAllInfo(bool shouldPrint = true) {
|
void traceAllInfo(bool shouldPrint = false) {
|
||||||
|
|
||||||
if (!shouldPrint) return;
|
if (!shouldPrint) return;
|
||||||
|
|
||||||
TraceEvent("TraceAllInfo").detail("Primary", primary);
|
TraceEvent("TraceAllInfo").detail("Primary", primary);
|
||||||
|
@ -1425,7 +1426,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
||||||
score += server->machine->machineTeams.size();
|
score += server->machine->machineTeams.size();
|
||||||
Standalone<StringRef> machine_id = server->lastKnownInterface.locality.zoneId().get();
|
Standalone<StringRef> machine_id = server->lastKnownInterface.locality.zoneId().get();
|
||||||
machineIDs.push_back(machine_id);
|
machineIDs.push_back(machine_id);
|
||||||
TraceEvent("MachineTeamDebug\n").detail("MachineTeamMember", machine_id);
|
TraceEvent("MachineTeamDebugDetail").detail("MachineTeamMember", machine_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -2637,6 +2638,8 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
|
||||||
state bool lastZeroHealthy = self->zeroHealthyTeams->get();
|
state bool lastZeroHealthy = self->zeroHealthyTeams->get();
|
||||||
state bool firstCheck = true;
|
state bool firstCheck = true;
|
||||||
|
|
||||||
|
state bool optimal;
|
||||||
|
|
||||||
if(logTeamEvents) {
|
if(logTeamEvents) {
|
||||||
TraceEvent("TeamTrackerStarting", self->distributorId).detail("Reason", "Initial wait complete (sc)").detail("Team", team->getDesc());
|
TraceEvent("TeamTrackerStarting", self->distributorId).detail("Reason", "Initial wait complete (sc)").detail("Team", team->getDesc());
|
||||||
}
|
}
|
||||||
|
@ -2678,7 +2681,7 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
|
||||||
|
|
||||||
bool healthy = !badTeam && !anyUndesired && serversLeft == self->configuration.storageTeamSize;
|
bool healthy = !badTeam && !anyUndesired && serversLeft == self->configuration.storageTeamSize;
|
||||||
team->setHealthy( healthy ); // Unhealthy teams won't be chosen by bestTeam
|
team->setHealthy( healthy ); // Unhealthy teams won't be chosen by bestTeam
|
||||||
bool optimal = team->isOptimal() && healthy;
|
optimal = team->isOptimal() && healthy;
|
||||||
bool recheck = !healthy && (lastReady != self->initialFailureReactionDelay.isReady() || (lastZeroHealthy && !self->zeroHealthyTeams->get()));
|
bool recheck = !healthy && (lastReady != self->initialFailureReactionDelay.isReady() || (lastZeroHealthy && !self->zeroHealthyTeams->get()));
|
||||||
|
|
||||||
lastReady = self->initialFailureReactionDelay.isReady();
|
lastReady = self->initialFailureReactionDelay.isReady();
|
||||||
|
@ -2855,7 +2858,7 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
|
||||||
TraceEvent("TeamTrackerStopping", self->distributorId).detail("Team", team->getDesc());
|
TraceEvent("TeamTrackerStopping", self->distributorId).detail("Team", team->getDesc());
|
||||||
}
|
}
|
||||||
self->priority_teams[team->getPriority()]--;
|
self->priority_teams[team->getPriority()]--;
|
||||||
if( team->isHealthy() ) {
|
if(team->isHealthy()) {
|
||||||
self->healthyTeamCount--;
|
self->healthyTeamCount--;
|
||||||
ASSERT( self->healthyTeamCount >= 0 );
|
ASSERT( self->healthyTeamCount >= 0 );
|
||||||
|
|
||||||
|
@ -2864,6 +2867,11 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
|
||||||
self->zeroHealthyTeams->set(true);
|
self->zeroHealthyTeams->set(true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (optimal) {
|
||||||
|
self->optimalTeamCount--;
|
||||||
|
ASSERT( self->optimalTeamCount >= 0 );
|
||||||
|
self->zeroOptimalTeams.set(self->optimalTeamCount == 0);
|
||||||
|
}
|
||||||
throw;
|
throw;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3200,7 +3208,8 @@ ACTOR Future<Void> storageServerTracker(
|
||||||
}
|
}
|
||||||
|
|
||||||
if( server->lastKnownClass.machineClassFitness( ProcessClass::Storage ) > ProcessClass::UnsetFit ) {
|
if( server->lastKnownClass.machineClassFitness( ProcessClass::Storage ) > ProcessClass::UnsetFit ) {
|
||||||
if( self->optimalTeamCount > 0 ) {
|
// We see a case optimalTeamCount = 1, while healthyTeamCount = 0 in 3 data_hall configuration
|
||||||
|
if( self->optimalTeamCount > 0 ) { //&& self->healthyTeamCount > 0
|
||||||
TraceEvent(SevWarn, "UndesiredStorageServer", self->distributorId)
|
TraceEvent(SevWarn, "UndesiredStorageServer", self->distributorId)
|
||||||
.detail("Server", server->id)
|
.detail("Server", server->id)
|
||||||
.detail("OptimalTeamCount", self->optimalTeamCount)
|
.detail("OptimalTeamCount", self->optimalTeamCount)
|
||||||
|
|
|
@ -303,15 +303,15 @@ ACTOR Future<bool> getTeamCollectionValid(Database cx, WorkerInterface dataDistr
|
||||||
|
|
||||||
// Get storage policy
|
// Get storage policy
|
||||||
//state std::string replicationPolicyName = g_simulator.storagePolicy->name(); // Across
|
//state std::string replicationPolicyName = g_simulator.storagePolicy->name(); // Across
|
||||||
state std::string replicationPolicyInfo = g_simulator.storagePolicy->info();
|
// state std::string replicationPolicyInfo = g_simulator.storagePolicy->info();
|
||||||
// machineID is zoneid by default. If machine concept is not zoneid (say it is data_hall),
|
// // machineID is zoneid by default. If machine concept is not zoneid (say it is data_hall),
|
||||||
// the machine-team logic needs to carefully change to assigne machineID as a different type (say data_hall)
|
// // the machine-team logic needs to carefully change to assigne machineID as a different type (say data_hall)
|
||||||
state bool isMachineIDZoneID = replicationPolicyInfo.find("zoneid") != string::npos;
|
// state bool isMachineIDZoneID = replicationPolicyInfo.find("zoneid") != std::string::npos;
|
||||||
|
|
||||||
if (!isMachineIDZoneID) {
|
// if (!isMachineIDZoneID) {
|
||||||
TraceEvent(SevWarnAlways, "MachineIDIsNotZoneID");
|
// TraceEvent(SevWarnAlways, "MachineIDIsNotZoneID");
|
||||||
return true;
|
// return true;
|
||||||
}
|
// }
|
||||||
|
|
||||||
// The if condition should be consistent with the condition in serverTeamRemover() and
|
// The if condition should be consistent with the condition in serverTeamRemover() and
|
||||||
// machineTeamRemover() that decides if redundant teams exist.
|
// machineTeamRemover() that decides if redundant teams exist.
|
||||||
|
|
Loading…
Reference in New Issue