Merge pull request #1764 from xumengpanda/mengxu/release-61/DD-ensure-new-machines-have-teams-PR

[Release 6.1 Patch] Ensure new added machines are used to build teams
This commit is contained in:
Evan Tschannen 2019-07-02 14:03:35 -07:00 committed by GitHub
commit b2e6b25496
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 178 additions and 33 deletions

View File

@ -2,6 +2,14 @@
Release Notes
#############
6.1.11
======
Fixes
-----
* Ensure newly added machines are used to build teams and host data from existing machines when a cluster is expanded. `(PR #1764) <https://github.com/apple/foundationdb/pull/1764>`_
6.1.10
======
@ -14,6 +22,7 @@ Fixes
-----
* The ``fdbrestore`` commands ``abort``, ``wait``, and ``status`` would use a default cluster file instead of the destination cluster file argument. `(PR #1701) <https://github.com/apple/foundationdb/pull/1701>`_
* Ensure newly added machines are used to build teams and host data from existing machines when a cluster is expanded. `(PR #1764) <https://github.com/apple/foundationdb/pull/1764>`_
6.1.9
=====

View File

@ -1310,7 +1310,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
// Five steps to create each machine team, which are documented in the function
// Reuse ReplicationPolicy selectReplicas func to select machine team
// return number of added machine teams
int addBestMachineTeams(int targetMachineTeamsToBuild) {
int addBestMachineTeams(int targetMachineTeamsToBuild, int remainingMachineTeamBudget) {
int addedMachineTeams = 0;
int totalServerIndex = 0;
int machineTeamsToBuild = 0;
@ -1329,7 +1329,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
int loopCount = 0;
// Add a team in each iteration
while (addedMachineTeams < machineTeamsToBuild) {
while (addedMachineTeams < machineTeamsToBuild || addedMachineTeams < remainingMachineTeamBudget) {
// Step 2: Get least used machines from which we choose machines as a machine team
std::vector<Reference<TCMachineInfo>> leastUsedMachines; // A less used machine has less number of teams
int minTeamCount = std::numeric_limits<int>::max();
@ -1379,6 +1379,8 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
// that have the least-utilized server
team.clear();
auto success = machineLocalityMap.selectReplicas(configuration.storagePolicy, forcedAttributes, team);
// NOTE: selectReplicas() should always return success when storageTeamSize = 1
ASSERT_WE_THINK(configuration.storageTeamSize > 1 || (configuration.storageTeamSize == 1 && success));
if (!success) {
break;
}
@ -1432,6 +1434,9 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
addMachineTeam(machines);
addedMachineTeams++;
// Update the remaining machine team budget because the budget may decrease by
// any value between 1 and storageTeamSize
remainingMachineTeamBudget = getRemainingMachineTeamBudget();
} else {
TraceEvent(SevWarn, "DataDistributionBuildTeams", distributorId)
.detail("Primary", primary)
@ -1591,6 +1596,32 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
return totalHealthyMachineCount;
}
// Reports the (fewest, most) server teams that any healthy server in server_info belongs to.
// Servers whose status is unhealthy are ignored. When no healthy server exists the result is
// (std::numeric_limits<int64_t>::max(), 0), i.e. the untouched initial values.
std::pair<int64_t, int64_t> calculateMinMaxServerTeamNumOnServer() {
	int64_t fewestTeams = std::numeric_limits<int64_t>::max();
	int64_t mostTeams = 0;
	for (auto& entry : server_info) {
		if (!server_status.get(entry.first).isUnhealthy()) {
			const int64_t teamCount = static_cast<int64_t>(entry.second->teams.size());
			if (teamCount < fewestTeams) {
				fewestTeams = teamCount;
			}
			if (teamCount > mostTeams) {
				mostTeams = teamCount;
			}
		}
	}
	return std::make_pair(fewestTeams, mostTeams);
}
// Reports the (fewest, most) machine teams that any healthy machine in machine_info belongs to.
// Machines for which isMachineHealthy() is false are ignored. When no healthy machine exists the
// result is (std::numeric_limits<int64_t>::max(), 0), i.e. the untouched initial values.
std::pair<int64_t, int64_t> calculateMinMaxMachineTeamNumOnMachine() {
	int64_t fewestTeams = std::numeric_limits<int64_t>::max();
	int64_t mostTeams = 0;
	for (auto& entry : machine_info) {
		if (isMachineHealthy(entry.second)) {
			const int64_t teamCount = static_cast<int64_t>(entry.second->machineTeams.size());
			if (teamCount < fewestTeams) {
				fewestTeams = teamCount;
			}
			if (teamCount > mostTeams) {
				mostTeams = teamCount;
			}
		}
	}
	return std::make_pair(fewestTeams, mostTeams);
}
// Sanity check
bool isServerTeamNumberCorrect(Reference<TCMachineTeamInfo>& mt) {
int num = 0;
@ -1641,12 +1672,41 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
return healthyTeamCount;
}
// Each machine is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER machine teams.
// The remaining machine team budget is the sum, over every machine in machine_info, of how many
// machine teams that machine is short of SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER. Machines that
// already meet or exceed the target contribute 0.
int getRemainingMachineTeamBudget() {
	int budget = 0;
	for (auto& m : machine_info) {
		int machineTeamCount = m.second->machineTeams.size();
		int shortfall = (int)(SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER - machineTeamCount);
		if (shortfall > 0) {
			budget += shortfall;
		}
	}
	// The budget is over-provisioned on purpose: when a new machine team is built we do not know
	// how many times it will be counted against the budget. For example, when a new machine is
	// added, a new machine team consumes only 1 unit of the budget.
	return budget;
}
// Each server is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER server teams.
// The remaining team budget is the sum, over every server in server_info, of how many teams that
// server is short of SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER. Servers that already meet or exceed
// the target contribute 0.
int getRemainingServerTeamBudget() {
	int budget = 0;
	for (auto& s : server_info) {
		int numValidTeams = s.second->teams.size();
		int shortfall = (int)(SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER - numValidTeams);
		if (shortfall > 0) {
			budget += shortfall;
		}
	}
	return budget;
}
// Create server teams based on machine teams
// Before the number of machine teams reaches the threshold, build a machine team for each server team
// When it reaches the threshold, first try to build a server team with existing machine teams; if failed,
// build an extra machine team and record the event in trace
int addTeamsBestOf(int teamsToBuild, int desiredTeamNumber, int maxTeamNumber) {
ASSERT(teamsToBuild > 0);
int addTeamsBestOf(int teamsToBuild, int desiredTeamNumber, int maxTeamNumber, int remainingTeamBudget) {
ASSERT(teamsToBuild >= 0);
ASSERT_WE_THINK(machine_info.size() > 0 || server_info.size() == 0);
int addedMachineTeams = 0;
@ -1657,27 +1717,28 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
// When we change configuration, we may have machine teams with storageTeamSize in the old configuration.
int healthyMachineTeamCount = getHealthyMachineTeamCount();
int totalMachineTeamCount = machineTeams.size();
int totalHealthyMachineCount = calculateHealthyMachineCount();
int remainingMachineTeamBudget = getRemainingMachineTeamBudget();
int desiredMachineTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyMachineCount;
int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount;
// machineTeamsToBuild mimics how the teamsToBuild is calculated in buildTeams()
int machineTeamsToBuild =
std::min(desiredMachineTeams - healthyMachineTeamCount, maxMachineTeams - totalMachineTeamCount);
int machineTeamsToBuild = std::max(
0, std::min(desiredMachineTeams - healthyMachineTeamCount, maxMachineTeams - totalMachineTeamCount));
TraceEvent("BuildMachineTeams")
.detail("TotalHealthyMachine", totalHealthyMachineCount)
.detail("HealthyMachineTeamCount", healthyMachineTeamCount)
.detail("DesiredMachineTeams", desiredMachineTeams)
.detail("MaxMachineTeams", maxMachineTeams)
.detail("MachineTeamsToBuild", machineTeamsToBuild);
.detail("MachineTeamsToBuild", machineTeamsToBuild)
.detail("RemainingMachineTeamBudget", remainingMachineTeamBudget);
// Pre-build all machine teams until we have the desired number of machine teams
if (machineTeamsToBuild > 0) {
addedMachineTeams = addBestMachineTeams(machineTeamsToBuild);
if (machineTeamsToBuild > 0 || remainingMachineTeamBudget > 0) {
addedMachineTeams = addBestMachineTeams(machineTeamsToBuild, remainingMachineTeamBudget);
}
while (addedTeams < teamsToBuild) {
while (addedTeams < teamsToBuild || addedTeams < remainingTeamBudget) {
// Step 1: Create 1 best machine team
std::vector<UID> bestServerTeam;
int bestScore = std::numeric_limits<int>::max();
@ -1754,6 +1815,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
// Step 4: Add the server team
addTeam(bestServerTeam.begin(), bestServerTeam.end(), false);
addedTeams++;
remainingTeamBudget = getRemainingServerTeamBudget();
if (++loopCount > 2 * teamsToBuild * (configuration.storageTeamSize + 1)) {
break;
@ -1762,10 +1824,14 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
healthyMachineTeamCount = getHealthyMachineTeamCount();
std::pair<uint64_t, uint64_t> minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer();
std::pair<uint64_t, uint64_t> minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine();
TraceEvent("TeamCollectionInfo", distributorId)
.detail("Primary", primary)
.detail("AddedTeamNumber", addedTeams)
.detail("AimToBuildTeamNumber", teamsToBuild)
.detail("RemainingTeamBudget", remainingTeamBudget)
.detail("CurrentTeamNumber", teams.size())
.detail("DesiredTeamNumber", desiredTeamNumber)
.detail("MaxTeamNumber", maxTeamNumber)
@ -1775,6 +1841,11 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
.detail("DesiredMachineTeams", desiredMachineTeams)
.detail("MaxMachineTeams", maxMachineTeams)
.detail("TotalHealthyMachine", totalHealthyMachineCount)
.detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first)
.detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second)
.detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first)
.detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second)
.detail("DoBuildTeams", doBuildTeams)
.trackLatest("TeamCollectionInfo");
return addedTeams;
@ -1791,10 +1862,14 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount;
int healthyMachineTeamCount = getHealthyMachineTeamCount();
std::pair<uint64_t, uint64_t> minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer();
std::pair<uint64_t, uint64_t> minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine();
TraceEvent("TeamCollectionInfo", distributorId)
.detail("Primary", primary)
.detail("AddedTeamNumber", 0)
.detail("AimToBuildTeamNumber", 0)
.detail("RemainingTeamBudget", 0)
.detail("CurrentTeamNumber", teams.size())
.detail("DesiredTeamNumber", desiredServerTeams)
.detail("MaxTeamNumber", maxServerTeams)
@ -1804,14 +1879,22 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
.detail("DesiredMachineTeams", desiredMachineTeams)
.detail("MaxMachineTeams", maxMachineTeams)
.detail("TotalHealthyMachine", totalHealthyMachineCount)
.detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first)
.detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second)
.detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first)
.detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second)
.detail("DoBuildTeams", doBuildTeams)
.trackLatest("TeamCollectionInfo");
// Debug purpose
// if (healthyMachineTeamCount > desiredMachineTeams || machineTeams.size() > maxMachineTeams) {
// // When the number of machine teams is over the limit, print out the current team info.
// traceAllInfo(true);
// }
// Advance time so that we will not have multiple TeamCollectionInfo at the same time, otherwise
// simulation test will randomly pick one TeamCollectionInfo trace, which could be the one before build teams
// wait(delay(0.01));
// Debug purpose
// if (healthyMachineTeamCount > desiredMachineTeams || machineTeams.size() > maxMachineTeams) {
// // When the number of machine teams is over the limit, print out the current team info.
// traceAllInfo(true);
// }
}
// Use the current set of known processes (from server_info) to compute an optimized set of storage server teams.
@ -1859,10 +1942,14 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
totalTeamCount++;
}
}
// Each server is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams;
// remainingTeamBudget is the number of teams needed to ensure every server has
// SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams
int remainingTeamBudget = self->getRemainingServerTeamBudget();
// teamsToBuild is calculated such that we will not build too many teams in the situation
// when all (or most of) teams become unhealthy temporarily and then healthy again
state int teamsToBuild = std::min(desiredTeams - teamCount, maxTeams - totalTeamCount);
state int teamsToBuild = std::max(0, std::min(desiredTeams - teamCount, maxTeams - totalTeamCount));
TraceEvent("BuildTeamsBegin", self->distributorId)
.detail("TeamsToBuild", teamsToBuild)
@ -1879,13 +1966,13 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
.detail("MachineCount", self->machine_info.size())
.detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER);
if (teamsToBuild > 0) {
if (teamsToBuild > 0 || remainingTeamBudget > 0) {
state vector<std::vector<UID>> builtTeams;
// addTeamsBestOf() will not add more teams than needed.
// If the team number is more than the desired, the extra teams are added in the code path when
// a team is added as an initial team
int addedTeams = self->addTeamsBestOf(teamsToBuild, desiredTeams, maxTeams);
int addedTeams = self->addTeamsBestOf(teamsToBuild, desiredTeams, maxTeams, remainingTeamBudget);
if (addedTeams <= 0 && self->teams.size() == 0) {
TraceEvent(SevWarn, "NoTeamAfterBuildTeam")
@ -1901,10 +1988,14 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount;
int healthyMachineTeamCount = self->getHealthyMachineTeamCount();
std::pair<uint64_t, uint64_t> minMaxTeamNumberOnServer = self->calculateMinMaxServerTeamNumOnServer();
std::pair<uint64_t, uint64_t> minMaxMachineTeamNumberOnMachine = self->calculateMinMaxMachineTeamNumOnMachine();
TraceEvent("TeamCollectionInfo", self->distributorId)
.detail("Primary", self->primary)
.detail("AddedTeamNumber", 0)
.detail("AimToBuildTeamNumber", teamsToBuild)
.detail("RemainingTeamBudget", remainingTeamBudget)
.detail("CurrentTeamNumber", self->teams.size())
.detail("DesiredTeamNumber", desiredTeams)
.detail("MaxTeamNumber", maxTeams)
@ -1914,6 +2005,11 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
.detail("DesiredMachineTeams", desiredMachineTeams)
.detail("MaxMachineTeams", maxMachineTeams)
.detail("TotalHealthyMachine", totalHealthyMachineCount)
.detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first)
.detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second)
.detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first)
.detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second)
.detail("DoBuildTeams", self->doBuildTeams)
.trackLatest("TeamCollectionInfo");
}
}
@ -2311,6 +2407,16 @@ ACTOR Future<Void> teamRemover(DDTeamCollection* self) {
team = mt->serverTeams[teamIndex];
ASSERT(team->machineTeam->machineIDs == mt->machineIDs); // Sanity check
// Check if a server will have 0 team after the team is removed
for (auto& s : team->getServers()) {
if (s->teams.size() == 0) {
TraceEvent(SevError, "TeamRemoverTooAggressive")
.detail("Server", s->id)
.detail("Team", team->getServerIDsStr());
self->traceAllInfo(true);
}
}
// The team will be marked as a bad team
bool foundTeam = self->removeTeam(team);
ASSERT(foundTeam == true);
@ -2956,11 +3062,14 @@ ACTOR Future<Void> storageServerTracker(
if(hasWrongStoreTypeOrDC)
self->restartRecruiting.trigger();
if ( lastIsUnhealthy && !status.isUnhealthy() && !server->teams.size() ) {
if (lastIsUnhealthy && !status.isUnhealthy() &&
server->teams.size() < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) {
self->doBuildTeams = true;
self->restartTeamBuilder.trigger(); // This does not trigger building teams if there exist healthy teams
}
lastIsUnhealthy = status.isUnhealthy();
state bool recordTeamCollectionInfo = false;
choose {
when( wait( failureTracker ) ) {
// The server is failed AND all data has been removed from it, so permanently remove it.
@ -3064,7 +3173,8 @@ ACTOR Future<Void> storageServerTracker(
self->badTeamRemover = removeBadTeams(self);
self->addActor.send(self->badTeamRemover);
// The team number changes, so we need to update the team number info
self->traceTeamCollectionInfo();
// self->traceTeamCollectionInfo();
recordTeamCollectionInfo = true;
}
}
@ -3072,10 +3182,14 @@ ACTOR Future<Void> storageServerTracker(
// We rely on the old failureTracker being actorCancelled since the old actor now has a pointer to an invalid location
status = ServerStatus( status.isFailed, status.isUndesired, server->lastKnownInterface.locality );
// self->traceTeamCollectionInfo();
recordTeamCollectionInfo = true;
//Restart the storeTracker for the new interface
storeTracker = keyValueStoreTypeTracker(self, server);
hasWrongStoreTypeOrDC = false;
self->doBuildTeams = true;
self->restartTeamBuilder.trigger();
if(restartRecruiting)
self->restartRecruiting.trigger();
}
@ -3096,6 +3210,10 @@ ACTOR Future<Void> storageServerTracker(
server->wakeUpTracker = Promise<Void>();
}
}
if (recordTeamCollectionInfo) {
self->traceTeamCollectionInfo();
}
}
} catch( Error &e ) {
if (e.code() != error_code_actor_cancelled && errorOut.canBeSet())
@ -3845,7 +3963,7 @@ TEST_CASE("DataDistribution/AddTeamsBestOf/UseMachineID") {
Reference<IReplicationPolicy> policy = Reference<IReplicationPolicy>(new PolicyAcross(teamSize, "zoneid", Reference<IReplicationPolicy>(new PolicyOne())));
state DDTeamCollection* collection = testMachineTeamCollection(teamSize, policy, processSize);
int result = collection->addTeamsBestOf(30, desiredTeams, maxTeams);
collection->addTeamsBestOf(30, desiredTeams, maxTeams, 30);
ASSERT(collection->sanityCheckTeams() == true);
@ -3870,8 +3988,8 @@ TEST_CASE("DataDistribution/AddTeamsBestOf/NotUseMachineID") {
return Void();
}
collection->addBestMachineTeams(30); // Create machine teams to help debug
int result = collection->addTeamsBestOf(30, desiredTeams, maxTeams);
collection->addBestMachineTeams(30, 30); // Create machine teams to help debug
collection->addTeamsBestOf(30, desiredTeams, maxTeams, 30);
collection->sanityCheckTeams(); // Server team may happen to be on the same machine team, although unlikely
if (collection) delete (collection);
@ -3886,7 +4004,7 @@ TEST_CASE("DataDistribution/AddAllTeams/isExhaustive") {
state int maxTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * processSize;
state DDTeamCollection* collection = testTeamCollection(3, policy, processSize);
int result = collection->addTeamsBestOf(200, desiredTeams, maxTeams);
int result = collection->addTeamsBestOf(200, desiredTeams, maxTeams, 200);
delete(collection);
@ -3906,11 +4024,11 @@ TEST_CASE("/DataDistribution/AddAllTeams/withLimit") {
state DDTeamCollection* collection = testTeamCollection(3, policy, processSize);
int result = collection->addTeamsBestOf(10, desiredTeams, maxTeams);
int result = collection->addTeamsBestOf(10, desiredTeams, maxTeams, 10);
delete(collection);
ASSERT(result == 10);
ASSERT(result >= 10);
return Void();
}
@ -3926,9 +4044,9 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/SkippingBusyServers") {
collection->addTeam(std::set<UID>({ UID(1, 0), UID(2, 0), UID(3, 0) }), true);
collection->addTeam(std::set<UID>({ UID(1, 0), UID(3, 0), UID(4, 0) }), true);
int result = collection->addTeamsBestOf(8, desiredTeams, maxTeams);
int result = collection->addTeamsBestOf(8, desiredTeams, maxTeams, 8);
ASSERT(result == 8);
ASSERT(result >= 8);
for(auto process = collection->server_info.begin(); process != collection->server_info.end(); process++) {
auto teamCount = process->second->teams.size();
@ -3956,8 +4074,8 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/NotEnoughServers") {
collection->addTeam(std::set<UID>({ UID(1, 0), UID(2, 0), UID(3, 0) }), true);
collection->addTeam(std::set<UID>({ UID(1, 0), UID(3, 0), UID(4, 0) }), true);
int resultMachineTeams = collection->addBestMachineTeams(10);
int result = collection->addTeamsBestOf(10, desiredTeams, maxTeams);
collection->addBestMachineTeams(10, 10);
int result = collection->addTeamsBestOf(10, desiredTeams, maxTeams, 10);
if (collection->machineTeams.size() != 10 || result != 8) {
collection->traceAllInfo(true); // Debug message

View File

@ -290,6 +290,15 @@ ACTOR Future<bool> getTeamCollectionValid(Database cx, WorkerInterface dataDistr
int64_t desiredMachineTeamNumber = boost::lexical_cast<int64_t>(teamCollectionInfoMessage.getValue("DesiredMachineTeams"));
int64_t maxMachineTeamNumber = boost::lexical_cast<int64_t>(teamCollectionInfoMessage.getValue("MaxMachineTeams"));
int64_t minServerTeamOnServer =
boost::lexical_cast<int64_t>(teamCollectionInfoMessage.getValue("MinTeamNumberOnServer"));
int64_t maxServerTeamOnServer =
boost::lexical_cast<int64_t>(teamCollectionInfoMessage.getValue("MaxTeamNumberOnServer"));
int64_t minMachineTeamOnMachine =
boost::lexical_cast<int64_t>(teamCollectionInfoMessage.getValue("MinMachineTeamNumberOnMachine"));
int64_t maxMachineTeamOnMachine =
boost::lexical_cast<int64_t>(teamCollectionInfoMessage.getValue("MaxMachineTeamNumberOnMachine"));
// Team number is always valid when we disable teamRemover. This avoids false positive in simulation test
if (SERVER_KNOBS->TR_FLAG_DISABLE_TEAM_REMOVER) {
TraceEvent("GetTeamCollectionValid")
@ -299,7 +308,10 @@ ACTOR Future<bool> getTeamCollectionValid(Database cx, WorkerInterface dataDistr
// The if condition should be consistent with the condition in teamRemover() that decides
// if redundant teams exist.
if (healthyMachineTeamCount > desiredMachineTeamNumber) {
if (healthyMachineTeamCount > desiredMachineTeamNumber ||
(minMachineTeamOnMachine <= 0 && SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER == 3)) {
// When DESIRED_TEAMS_PER_SERVER == 1, we see minMachineTeamOnMachine can be 0 in one out of 30k test
// cases. Only check DESIRED_TEAMS_PER_SERVER == 3 for now since it is the most commonly used configuration.
TraceEvent("GetTeamCollectionValid")
.detail("CurrentTeamNumber", currentTeamNumber)
.detail("DesiredTeamNumber", desiredTeamNumber)
@ -307,7 +319,13 @@ ACTOR Future<bool> getTeamCollectionValid(Database cx, WorkerInterface dataDistr
.detail("CurrentHealthyMachineTeamNumber", healthyMachineTeamCount)
.detail("DesiredMachineTeams", desiredMachineTeamNumber)
.detail("CurrentMachineTeamNumber", currentMachineTeamNumber)
.detail("MaxMachineTeams", maxMachineTeamNumber);
.detail("MaxMachineTeams", maxMachineTeamNumber)
.detail("MinTeamNumberOnServer", minServerTeamOnServer)
.detail("MaxTeamNumberOnServer", maxServerTeamOnServer)
.detail("MinMachineTeamNumberOnMachine", minMachineTeamOnMachine)
.detail("MaxMachineTeamNumberOnMachine", maxMachineTeamOnMachine)
.detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER)
.detail("MaxTeamsPerServer", SERVER_KNOBS->MAX_TEAMS_PER_SERVER);
return false;
} else {
return true;