ServerTeamRemover runs after machineTeamRemover finishes

If serverTeamRemover removes a team before machineTeamRemover brings
the machine team number down to the desired number, DD may create a new
team (due to teams removed by serverTeamRemover), which may be removed
later by machineTeamRemover. This causes unnnecessary extra data movement.
This commit is contained in:
Meng Xu 2019-07-19 16:48:50 -07:00
parent 64bee63dbc
commit b001a9ebe8
3 changed files with 13 additions and 3 deletions

View File

@ -2377,16 +2377,24 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
}
};
ACTOR Future<Void> waitUntilHealthy(DDTeamCollection* self) {
ACTOR Future<Void> waitUntilHealthy(DDTeamCollection* self, double extraDelay = 0) {
state int waitCount = 0;
loop {
while(self->zeroHealthyTeams->get() || self->processingUnhealthy->get()) {
// processingUnhealthy: true when there exists data movement
TraceEvent("WaitUntilHealthyStalled", self->distributorId).detail("Primary", self->primary).detail("ZeroHealthy", self->zeroHealthyTeams->get()).detail("ProcessingUnhealthy", self->processingUnhealthy->get());
wait(self->zeroHealthyTeams->onChange() || self->processingUnhealthy->onChange());
waitCount = 0;
}
wait(delay(SERVER_KNOBS->DD_STALL_CHECK_DELAY, TaskPriority::Low)); //After the team trackers wait on the initial failure reaction delay, they yield. We want to make sure every tracker has had the opportunity to send their relocations to the queue.
if(!self->zeroHealthyTeams->get() && !self->processingUnhealthy->get()) {
return Void();
if (extraDelay <= 0.01 || waitCount >= 1) {
// Return healthy if we do not need extraDelay or when DD are healthy in at least two consecutive check
return Void();
} else {
wait(delay(extraDelay, TaskPriority::Low));
waitCount++;
}
}
}
}
@ -2537,7 +2545,7 @@ ACTOR Future<Void> serverTeamRemover(DDTeamCollection* self) {
// To avoid removing server teams too fast, which is unlikely happen though
wait(delay(removeServerTeamDelay));
wait(waitUntilHealthy(self));
wait(waitUntilHealthy(self, SERVER_KNOBS->TR_REMOVE_SERVER_TEAM_EXTRA_DELAY));
// Wait for the badTeamRemover() to avoid the potential race between
// adding the bad team (add the team tracker) and remove bad team (cancel the team tracker).
wait(self->badTeamRemover);

View File

@ -185,6 +185,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
TR_FLAG_REMOVE_MT_WITH_MOST_TEAMS = true; if( randomize && BUGGIFY ) TR_FLAG_REMOVE_MT_WITH_MOST_TEAMS = deterministicRandom()->random01() < 0.1 ? true : false;
TR_FLAG_DISABLE_SERVER_TEAM_REMOVER = false; if( randomize && BUGGIFY ) TR_FLAG_DISABLE_SERVER_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
init( TR_REMOVE_SERVER_TEAM_DELAY, 60.0 ); if( randomize && BUGGIFY ) TR_REMOVE_SERVER_TEAM_DELAY = deterministicRandom()->random01() * 60.0;
init( TR_REMOVE_SERVER_TEAM_EXTRA_DELAY, 5.0 ); if( randomize && BUGGIFY ) TR_REMOVE_SERVER_TEAM_EXTRA_DELAY = deterministicRandom()->random01() * 10.0;
// Redwood Storage Engine
init( PREFIX_TREE_IMMEDIATE_KEY_SIZE_LIMIT, 30 );

View File

@ -148,6 +148,7 @@ public:
bool TR_FLAG_DISABLE_SERVER_TEAM_REMOVER; // disable the serverTeamRemover actor
double TR_REMOVE_SERVER_TEAM_DELAY; // wait for the specified time before try to remove next server team
double TR_REMOVE_SERVER_TEAM_EXTRA_DELAY; // serverTeamRemover waits for the delay and check DD healthyness again to ensure it runs after machineTeamRemover
double DD_FAILURE_TIME;
double DD_ZERO_HEALTHY_TEAM_DELAY;