Add serverTeamRemover to remove redundant server teams
This commit is contained in:
parent
716494ed9f
commit
599fcb2e6d
|
@ -592,6 +592,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
|||
Promise<Void> addSubsetComplete;
|
||||
Future<Void> badTeamRemover;
|
||||
Future<Void> redundantMachineTeamRemover;
|
||||
Future<Void> redundantServerTeamRemover;
|
||||
|
||||
Reference<LocalitySet> storageServerSet;
|
||||
std::vector<LocalityEntry> forcedEntries, resultEntries;
|
||||
|
@ -633,7 +634,7 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
|||
Reference<AsyncVar<bool>> processingUnhealthy)
|
||||
: cx(cx), distributorId(distributorId), lock(lock), output(output),
|
||||
shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), doBuildTeams(true), teamBuilder(Void()),
|
||||
badTeamRemover(Void()), redundantMachineTeamRemover(Void()), configuration(configuration),
|
||||
badTeamRemover(Void()), redundantMachineTeamRemover(Void()), redundantServerTeamRemover(Void()), configuration(configuration),
|
||||
readyToStart(readyToStart), clearHealthyZoneFuture(Void()),
|
||||
checkTeamDelay(delay(SERVER_KNOBS->CHECK_TEAM_DELAY, TaskDataDistribution)),
|
||||
initialFailureReactionDelay(
|
||||
|
@ -1626,6 +1627,25 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
|
|||
return std::pair<Reference<TCMachineTeamInfo>, int>(retMT, minNumProcessTeams);
|
||||
}
|
||||
|
||||
// Walk every team in `teams`; for each, add up server->teams.size() over its member servers.
|
||||
// Return the team with the largest total together with that total. Ties keep the first team seen.
|
||||
// When no team has a positive total, the returned Reference stays default-constructed and the count is 0.
|
||||
std::pair<Reference<TCTeamInfo>, int> getServerTeamWithMostProcessTeams() {
|
||||
Reference<TCTeamInfo> busiestTeam;
|
||||
int busiestTeamTotal = 0;
|
||||
|
||||
for (auto& candidate : teams) {
|
||||
int memberTeamTotal = 0;
|
||||
for (auto& member : candidate->getServers()) {
|
||||
memberTeamTotal += member->teams.size();
|
||||
}
|
||||
// Strictly-greater comparison: an equal total never displaces the current pick
|
||||
if (memberTeamTotal <= busiestTeamTotal) {
|
||||
continue;
|
||||
}
|
||||
busiestTeamTotal = memberTeamTotal;
|
||||
busiestTeam = candidate;
|
||||
}
|
||||
|
||||
return std::make_pair(busiestTeam, busiestTeamTotal);
|
||||
}
|
||||
|
||||
int getHealthyMachineTeamCount() {
|
||||
int healthyTeamCount = 0;
|
||||
for (auto mt = machineTeams.begin(); mt != machineTeams.end(); ++mt) {
|
||||
|
@ -2264,7 +2284,7 @@ ACTOR Future<Void> machineTeamRemover(DDTeamCollection* self) {
|
|||
state int numMachineTeamRemoved = 0;
|
||||
loop {
|
||||
// In case the machineTeamRemover cause problems in production, we can disable it
|
||||
if (SERVER_KNOBS->TR_FLAG_DISABLE_TEAM_REMOVER) {
|
||||
if (SERVER_KNOBS->TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER) {
|
||||
return Void(); // Directly return Void()
|
||||
}
|
||||
|
||||
|
@ -2362,6 +2382,79 @@ ACTOR Future<Void> machineTeamRemover(DDTeamCollection* self) {
|
|||
}
|
||||
}
|
||||
|
||||
// Remove the server team whose members have the most number of process teams
|
||||
// until the total number of server teams is no larger than the desired number.
|
||||
// Each loop iteration waits for waitUntilHealthy(self), a TR_REMOVE_SERVER_TEAM_DELAY delay and the
|
||||
// current badTeamRemover future, then removes at most one team. The desired number is
|
||||
// DESIRED_TEAMS_PER_SERVER * healthyServerCount. The actor returns as soon as it observes
|
||||
// TR_FLAG_DISABLE_SERVER_TEAM_REMOVER set at the top of an iteration.
|
||||
ACTOR Future<Void> serverTeamRemover(DDTeamCollection* self) {
|
||||
// Number of teams removed since the last "ServerTeamRemoverDone" trace
|
||||
state int numServerTeamRemoved = 0;
|
||||
loop {
|
||||
// In case the serverTeamRemover causes problems in production, we can disable it
|
||||
if (SERVER_KNOBS->TR_FLAG_DISABLE_SERVER_TEAM_REMOVER) {
|
||||
return Void(); // Directly return Void()
|
||||
}
|
||||
|
||||
wait(waitUntilHealthy(self));
|
||||
|
||||
// To avoid removing server teams too fast, which is unlikely to happen though
|
||||
wait( delay(SERVER_KNOBS->TR_REMOVE_SERVER_TEAM_DELAY) );
|
||||
|
||||
// Wait for the badTeamRemover() to avoid the potential race between adding the bad team (add the team tracker)
|
||||
// and remove bad team (cancel the team tracker).
|
||||
wait(self->badTeamRemover);
|
||||
|
||||
state int healthyServerCount = self->calculateHealthyServerCount();
|
||||
// Check if all servers are healthy; if not, loop back. The next iteration repeats the waits above
|
||||
// (including the TR_REMOVE_SERVER_TEAM_DELAY delay) before checking again.
|
||||
if (healthyServerCount != self->server_info.size()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// From this point, every server in server_info was counted as healthy by calculateHealthyServerCount().
|
||||
// NOTE(review): presumably all server teams are healthy here too, given the waitUntilHealthy() above -- confirm.
|
||||
|
||||
// In most cases, all server teams should be healthy teams at this point.
|
||||
int desiredServerTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * healthyServerCount;
|
||||
int totalSTCount = self->teams.size();
|
||||
|
||||
if (totalSTCount > desiredServerTeams) {
|
||||
// Pick the server team whose members are on the most number of server teams, and mark it undesired
|
||||
state std::pair<Reference<TCTeamInfo>, int> foundSTInfo = self->getServerTeamWithMostProcessTeams();
|
||||
state Reference<TCTeamInfo> st = foundSTInfo.first;
|
||||
state int maxNumProcessTeams = foundSTInfo.second;
|
||||
ASSERT(st.isValid());
|
||||
// Take the team out of the collection and re-add it with both flags set to true.
|
||||
// NOTE(review): the two `true` arguments presumably mark the team as bad and redundant -- confirm against addTeam().
|
||||
bool foundTeam = self->removeTeam(st);
|
||||
ASSERT(foundTeam == true);
|
||||
self->addTeam(st->getServers(), true, true);
|
||||
TEST(true);
|
||||
|
||||
self->doBuildTeams = true;
|
||||
|
||||
// Start a new removeBadTeams() actor only when the previous one has finished
|
||||
if (self->badTeamRemover.isReady()) {
|
||||
self->badTeamRemover = removeBadTeams(self);
|
||||
self->addActor.send(self->badTeamRemover);
|
||||
}
|
||||
|
||||
TraceEvent("ServerTeamRemover")
|
||||
.detail("ServerTeamToRemove", st->getServerIDsStr())
|
||||
.detail("NumProcessTeamsOnTheServerTeam", maxNumProcessTeams)
|
||||
.detail("CurrentServerTeamNumber", self->teams.size())
|
||||
.detail("DesiredTeam", desiredServerTeams);
|
||||
|
||||
numServerTeamRemoved++;
|
||||
} else {
|
||||
if (numServerTeamRemoved > 0) {
|
||||
// Only trace the information when we have removed a server team
|
||||
TraceEvent("ServerTeamRemoverDone")
|
||||
.detail("HealthyServerNumber", healthyServerCount)
|
||||
.detail("CurrentServerTeamNumber", self->teams.size())
|
||||
.detail("DesiredServerTeam", desiredServerTeams)
|
||||
.detail("NumServerTeamRemoved", numServerTeamRemoved);
|
||||
self->traceTeamCollectionInfo();
|
||||
// Reset the counter; otherwise this summary would be traced again on every later iteration
|
||||
numServerTeamRemoved = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Track a team and issue RelocateShards when the level of degradation changes
|
||||
// A badTeam can be unhealthy or just a redundantTeam removed by machineTeamRemover() or serverTeamRemover()
|
||||
ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> team, bool badTeam, bool redundantTeam) {
|
||||
|
@ -3336,6 +3429,10 @@ ACTOR Future<Void> dataDistributionTeamCollection(
|
|||
self->redundantMachineTeamRemover = machineTeamRemover(self);
|
||||
self->addActor.send(self->redundantMachineTeamRemover);
|
||||
}
|
||||
if (self->redundantServerTeamRemover.isReady()) {
|
||||
self->redundantServerTeamRemover = serverTeamRemover(self);
|
||||
self->addActor.send(self->redundantServerTeamRemover);
|
||||
}
|
||||
self->traceTeamCollectionInfo();
|
||||
|
||||
if(self->includedDCs.size()) {
|
||||
|
|
|
@ -179,8 +179,10 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
|
|||
init( DD_ZERO_HEALTHY_TEAM_DELAY, 1.0 );
|
||||
|
||||
// TeamRemover
|
||||
TR_FLAG_DISABLE_TEAM_REMOVER = false; if( randomize && BUGGIFY ) TR_FLAG_DISABLE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
|
||||
TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = false; if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
|
||||
init( TR_REMOVE_MACHINE_TEAM_DELAY, 60.0 ); if( randomize && BUGGIFY ) TR_REMOVE_MACHINE_TEAM_DELAY = deterministicRandom()->random01() * 60.0;
|
||||
TR_FLAG_DISABLE_SERVER_TEAM_REMOVER = false; if( randomize && BUGGIFY ) TR_FLAG_DISABLE_SERVER_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
|
||||
init( TR_REMOVE_SERVER_TEAM_DELAY, 60.0 ); if( randomize && BUGGIFY ) TR_REMOVE_SERVER_TEAM_DELAY = deterministicRandom()->random01() * 60.0;
|
||||
|
||||
// Redwood Storage Engine
|
||||
init( PREFIX_TREE_IMMEDIATE_KEY_SIZE_LIMIT, 30 );
|
||||
|
|
|
@ -141,9 +141,12 @@ public:
|
|||
double DEBOUNCE_RECRUITING_DELAY;
|
||||
|
||||
// TeamRemover to remove redundant teams
|
||||
bool TR_FLAG_DISABLE_TEAM_REMOVER; // disable the teamRemover actor
|
||||
bool TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER; // disable the machineTeamRemover actor
|
||||
double TR_REMOVE_MACHINE_TEAM_DELAY; // wait for the specified time before try to remove next machine team
|
||||
|
||||
bool TR_FLAG_DISABLE_SERVER_TEAM_REMOVER; // disable the serverTeamRemover actor
|
||||
double TR_REMOVE_SERVER_TEAM_DELAY; // wait for the specified time before try to remove next server team
|
||||
|
||||
double DD_FAILURE_TIME;
|
||||
double DD_ZERO_HEALTHY_TEAM_DELAY;
|
||||
|
||||
|
|
|
@ -291,10 +291,12 @@ ACTOR Future<bool> getTeamCollectionValid(Database cx, WorkerInterface dataDistr
|
|||
int64_t desiredMachineTeamNumber = boost::lexical_cast<int64_t>(teamCollectionInfoMessage.getValue("DesiredMachineTeams"));
|
||||
int64_t maxMachineTeamNumber = boost::lexical_cast<int64_t>(teamCollectionInfoMessage.getValue("MaxMachineTeams"));
|
||||
|
||||
// TODO: Get finer granularity check
|
||||
// Team number is always valid when we disable teamRemover. This avoids false positive in simulation test
|
||||
if (SERVER_KNOBS->TR_FLAG_DISABLE_TEAM_REMOVER) {
|
||||
if (SERVER_KNOBS->TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER || SERVER_KNOBS->TR_FLAG_DISABLE_SERVER_TEAM_REMOVER) {
|
||||
TraceEvent("GetTeamCollectionValid")
|
||||
.detail("KnobsTeamRemoverDisabled", SERVER_KNOBS->TR_FLAG_DISABLE_TEAM_REMOVER);
|
||||
.detail("KnobsMachineTeamRemoverDisabled", SERVER_KNOBS->TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER)
|
||||
.detail("KnobsServerTeamRemoverDisabled", SERVER_KNOBS->TR_FLAG_DISABLE_SERVER_TEAM_REMOVER);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue