From 0baae134f6228d7f835c4f85cd3b3716b109656d Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Fri, 28 Jun 2019 15:59:47 -0700 Subject: [PATCH] TeamCollectionInfo: Resolve review comments --- documentation/sphinx/source/release-notes.rst | 8 ++++ fdbserver/DataDistribution.actor.cpp | 38 ++++++------------- fdbserver/QuietDatabase.actor.cpp | 4 +- 3 files changed, 23 insertions(+), 27 deletions(-) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 6ca2490060..11dd64fbd0 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -2,6 +2,14 @@ Release Notes ############# +6.1.11 +====== + +Fixes +----- + +* Ensure new added machines are used to build teams and host data from existing machines when a cluster is expanded. `(PR #1764) `_ + 6.1.10 ====== diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 11ce0063ff..d474850cc3 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1343,12 +1343,7 @@ struct DDTeamCollection : ReferenceCounted { // Invariant: We only create correct size machine teams. // When configuration (e.g., team size) is changed, the DDTeamCollection will be destroyed and rebuilt // so that the invariant will not be violated. - int teamCount = 0; - for (auto& mt : machine.second->machineTeams) { - if (isMachineTeamHealthy(mt)) { - ++teamCount; - } - } + int teamCount = machine.second->machineTeams.size(); if (teamCount < minTeamCount) { leastUsedMachines.clear(); @@ -1385,7 +1380,8 @@ struct DDTeamCollection : ReferenceCounted { // that have the least-utilized server team.clear(); auto success = machineLocalityMap.selectReplicas(configuration.storagePolicy, forcedAttributes, team); - // NOTE: selectReplicas() returns false always when storageTeamSize == 1 + // NOTE: selectReplicas() should always return success when storageTeamSize = 1 + ASSERT_WE_THINK ( configuration.storageTeamSize > 1 || (configuration.storageTeamSize == 1 && success) ); if (!success && configuration.storageTeamSize > 1) { break; } @@ -1605,9 +1601,9 @@ struct DDTeamCollection : ReferenceCounted { uint32_t minTeamNumber = std::numeric_limits::max(); uint32_t maxTeamNumber = std::numeric_limits::min(); for (auto& server : server_info) { - // if ( server_status.get(server.first).isUnhealthy() ) { - // continue; - // } + if ( server_status.get(server.first).isUnhealthy() ) { + continue; + } if (server.second->teams.size() < minTeamNumber) { minTeamNumber = server.second->teams.size(); } @@ -1622,9 +1618,9 @@ struct DDTeamCollection : ReferenceCounted { int minTeamNumber = std::numeric_limits::max(); int maxTeamNumber = 0; for (auto& machine : machine_info) { - // if ( !isMachineHealthy(machine.second) ) { - // continue; - // } + if ( !isMachineHealthy(machine.second) ) { + continue; + } if (machine.second->machineTeams.size() < minTeamNumber) { minTeamNumber = machine.second->machineTeams.size(); } @@ -1691,12 +1687,7 @@ struct DDTeamCollection : ReferenceCounted { int getRemainingMachineTeamBudget() { int remainingMachineTeamBudget = 0; for (auto& m : machine_info) { - int healthyMTCount = 0; - for (auto& mt : m.second->machineTeams) { - if (isMachineTeamHealthy(mt)) { - ++healthyMTCount; - } - } + int healthyMTCount = m.second->machineTeams.size(); remainingMachineTeamBudget += std::max(0, (int)(SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER - healthyMTCount)); } @@ -1712,12 +1703,7 @@ struct DDTeamCollection : ReferenceCounted { // SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams int remainingTeamBudget = 0; for (auto& s : server_info) { - int numValidTeams = 0; - for (auto& team : s.second->teams) { - if (!team->isWrongConfiguration() && team->isHealthy()) { - ++numValidTeams; - } - } + int numValidTeams = s.second->teams.size(); remainingTeamBudget += std::max(0, (int)(SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER - numValidTeams)); } @@ -1757,7 +1743,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("MachineTeamsToBuild", machineTeamsToBuild) .detail("RemainingMachineTeamBudget", remainingMachineTeamBudget); // Pre-build all machine teams until we have the desired number of machine teams - if (machineTeamsToBuild > 0) { + if (machineTeamsToBuild > 0 || remainingMachineTeamBudget > 0) { addedMachineTeams = addBestMachineTeams(machineTeamsToBuild, remainingMachineTeamBudget); } diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 886f7ad099..0f33faa40c 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -308,7 +308,9 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr // The if condition should be consistent with the condition in teamRemover() that decides // if redundant teams exist. - if (healthyMachineTeamCount > desiredMachineTeamNumber || minMachineTeamOnMachine <= 0 ) { + if (healthyMachineTeamCount > desiredMachineTeamNumber || (minMachineTeamOnMachine <= 0 && SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER == 3)) { + // When DESIRED_TEAMS_PER_SERVER == 1, we see minMachineTeamOnMachine can be 0 in one out of 30k test cases. + // Only check DESIRED_TEAMS_PER_SERVER == 3 for now since it is mostly used configuration. TraceEvent("GetTeamCollectionValid") .detail("CurrentTeamNumber", currentTeamNumber) .detail("DesiredTeamNumber", desiredTeamNumber)