TeamTracker: Fix bug in counting optimalTeamCount
When a teamTracker is cancelled, e.g., by the redundant teamRemover or the badTeamRemover, we should decrease optimalTeamCount if the team is considered optimal, i.e., every member's machine fitness is no worse than unset and the team is healthy.
This commit is contained in:
parent 4c32593f59
commit 221e6945db
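
As context for the commit message, the "optimal team" condition can be read as the sketch below. This is illustrative only, not the actual DDTeamCollection code; the getServers() accessor is assumed for illustration, while lastKnownClass, machineClassFitness( ProcessClass::Storage ), ProcessClass::UnsetFit, and isHealthy() are taken from the diff that follows.

// Illustrative sketch of the "optimal team" test described above (hypothetical helper).
bool isConsideredOptimal(Reference<TCTeamInfo> team) {
	for (auto& server : team->getServers()) {   // getServers() is assumed for illustration
		// Every member's machine fitness for the storage role must be no worse than UnsetFit ...
		if (server->lastKnownClass.machineClassFitness(ProcessClass::Storage) > ProcessClass::UnsetFit)
			return false;
	}
	// ... and the team itself must currently be healthy.
	return team->isHealthy();
}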
@@ -1287,7 +1287,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
 	}

-	void traceAllInfo(bool shouldPrint = true) {
+	// To enable verbose debug info, set shouldPrint to true
+	void traceAllInfo(bool shouldPrint = false) {

 		if (!shouldPrint) return;

 		TraceEvent("TraceAllInfo").detail("Primary", primary);
@@ -1425,7 +1426,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
 			score += server->machine->machineTeams.size();
 			Standalone<StringRef> machine_id = server->lastKnownInterface.locality.zoneId().get();
 			machineIDs.push_back(machine_id);
-			TraceEvent("MachineTeamDebug\n").detail("MachineTeamMember", machine_id);
+			TraceEvent("MachineTeamDebugDetail").detail("MachineTeamMember", machine_id);
 		}

@@ -2637,6 +2638,8 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
 	state bool lastZeroHealthy = self->zeroHealthyTeams->get();
 	state bool firstCheck = true;

+	state bool optimal;
+
 	if(logTeamEvents) {
 		TraceEvent("TeamTrackerStarting", self->distributorId).detail("Reason", "Initial wait complete (sc)").detail("Team", team->getDesc());
 	}
@@ -2678,7 +2681,7 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea

 			bool healthy = !badTeam && !anyUndesired && serversLeft == self->configuration.storageTeamSize;
 			team->setHealthy( healthy );	// Unhealthy teams won't be chosen by bestTeam
-			bool optimal = team->isOptimal() && healthy;
+			optimal = team->isOptimal() && healthy;
 			bool recheck = !healthy && (lastReady != self->initialFailureReactionDelay.isReady() || (lastZeroHealthy && !self->zeroHealthyTeams->get()));

 			lastReady = self->initialFailureReactionDelay.isReady();
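
A note for readers unfamiliar with Flow, FoundationDB's actor compiler: only variables declared with the state keyword persist across wait() points and remain visible in the actor's error/cancellation path, which is why optimal is hoisted out of the loop body in the two hunks above. The following is a minimal illustrative sketch of that pattern, not code from this commit:

// Hypothetical actor showing why `optimal` must be a state variable: a plain local declared
// inside the loop would not survive the wait() and would be out of scope in the catch block
// that runs when the actor is cancelled.
ACTOR Future<Void> exampleTracker(DDTeamCollection* self, Reference<TCTeamInfo> team) {
	state bool optimal = false;
	try {
		loop {
			optimal = team->isOptimal() && team->isHealthy();
			wait( delay(1.0) );
		}
	} catch (Error& e) {
		if (optimal) {
			self->optimalTeamCount--;   // undo this team's contribution when the tracker stops
		}
		throw;
	}
}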
@@ -2855,7 +2858,7 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
 			TraceEvent("TeamTrackerStopping", self->distributorId).detail("Team", team->getDesc());
 		}
 		self->priority_teams[team->getPriority()]--;
-		if( team->isHealthy() ) {
+		if(team->isHealthy()) {
 			self->healthyTeamCount--;
 			ASSERT( self->healthyTeamCount >= 0 );

@@ -2864,6 +2867,11 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
 				self->zeroHealthyTeams->set(true);
 			}
 		}
+		if (optimal) {
+			self->optimalTeamCount--;
+			ASSERT( self->optimalTeamCount >= 0 );
+			self->zeroOptimalTeams.set(self->optimalTeamCount == 0);
+		}
 		throw;
 	}
 }

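The zeroOptimalTeams flag updated in the hunk above appears to be an AsyncVar<bool>-style member (it is written with set(self->optimalTeamCount == 0), which matches that interface). As an aside, a typical Flow consumer of such a flag might look like the hypothetical sketch below; waitForOptimalTeam is not part of this commit:

// Hypothetical consumer of an AsyncVar<bool>-style flag such as zeroOptimalTeams:
// returns once at least one optimal team exists again.
ACTOR Future<Void> waitForOptimalTeam(DDTeamCollection* self) {
	loop {
		if (!self->zeroOptimalTeams.get()) return Void();   // some team is currently optimal
		wait( self->zeroOptimalTeams.onChange() );          // wake up when the flag changes
	}
}
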
@@ -3200,7 +3208,8 @@ ACTOR Future<Void> storageServerTracker(
 			}

 			if( server->lastKnownClass.machineClassFitness( ProcessClass::Storage ) > ProcessClass::UnsetFit ) {
-				if( self->optimalTeamCount > 0 ) {
+				// We see a case optimalTeamCount = 1, while healthyTeamCount = 0 in 3 data_hall configuration
+				if( self->optimalTeamCount > 0 ) { //&& self->healthyTeamCount > 0
 					TraceEvent(SevWarn, "UndesiredStorageServer", self->distributorId)
 						.detail("Server", server->id)
 						.detail("OptimalTeamCount", self->optimalTeamCount)

@@ -303,15 +303,15 @@ ACTOR Future<bool> getTeamCollectionValid(Database cx, WorkerInterface dataDistr

 	// Get storage policy
 	//state std::string replicationPolicyName = g_simulator.storagePolicy->name(); // Across
-	state std::string replicationPolicyInfo = g_simulator.storagePolicy->info();
-	// machineID is zoneid by default. If machine concept is not zoneid (say it is data_hall),
-	// the machine-team logic needs to carefully change to assigne machineID as a different type (say data_hall)
-	state bool isMachineIDZoneID = replicationPolicyInfo.find("zoneid") != string::npos;
+	// state std::string replicationPolicyInfo = g_simulator.storagePolicy->info();
+	// // machineID is zoneid by default. If machine concept is not zoneid (say it is data_hall),
+	// // the machine-team logic needs to carefully change to assigne machineID as a different type (say data_hall)
+	// state bool isMachineIDZoneID = replicationPolicyInfo.find("zoneid") != std::string::npos;

-	if (!isMachineIDZoneID) {
-		TraceEvent(SevWarnAlways, "MachineIDIsNotZoneID");
-		return true;
-	}
+	// if (!isMachineIDZoneID) {
+	// 	TraceEvent(SevWarnAlways, "MachineIDIsNotZoneID");
+	// 	return true;
+	// }

 	// The if condition should be consistent with the condition in serverTeamRemover() and
 	// machineTeamRemover() that decides if redundant teams exist.