ServerTeamRemover runs after machineTeamRemover finishes
If serverTeamRemover removes a server team before machineTeamRemover has brought the number of machine teams down to the desired count, DD may create a new team (to replace one removed by serverTeamRemover) that machineTeamRemover later removes. This causes unnecessary extra data movement.
commit b001a9ebe8 (parent 64bee63dbc)
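To make the new wait discipline concrete, here is a minimal standalone sketch (plain C++ with std::thread sleeps, not FoundationDB flow/actor code) of the pattern the first hunk below adds to waitUntilHealthy(): with a positive extraDelay, only report healthy after two consecutive healthy observations separated by that delay. isHealthy() and the delay values are hypothetical stand-ins for the DD health signals and knobs.

#include <chrono>
#include <iostream>
#include <thread>

static bool isHealthy() {
    // Stand-in for !zeroHealthyTeams && !processingUnhealthy; here the system
    // simply becomes healthy ~300 ms after startup.
    static const auto start = std::chrono::steady_clock::now();
    return std::chrono::steady_clock::now() - start > std::chrono::milliseconds(300);
}

static void waitUntilHealthy(double extraDelaySeconds) {
    int waitCount = 0; // consecutive healthy observations so far
    for (;;) {
        while (!isHealthy()) {
            waitCount = 0; // any unhealthy observation resets the streak
            std::this_thread::sleep_for(std::chrono::milliseconds(50));
        }
        if (extraDelaySeconds <= 0.01 || waitCount >= 1) {
            return; // no extra delay requested, or healthy on two consecutive checks
        }
        std::this_thread::sleep_for(std::chrono::duration<double>(extraDelaySeconds));
        ++waitCount; // healthy once; loop around and check again after the delay
    }
}

int main() {
    waitUntilHealthy(/*extraDelaySeconds=*/0.5);
    std::cout << "observed healthy on two consecutive checks\n";
}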
@@ -2377,16 +2377,24 @@ struct DDTeamCollection : ReferenceCounted<DDTeamCollection> {
 	}
 };
 
-ACTOR Future<Void> waitUntilHealthy(DDTeamCollection* self) {
+ACTOR Future<Void> waitUntilHealthy(DDTeamCollection* self, double extraDelay = 0) {
+	state int waitCount = 0;
 	loop {
 		while(self->zeroHealthyTeams->get() || self->processingUnhealthy->get()) {
 			// processingUnhealthy: true when there exists data movement
 			TraceEvent("WaitUntilHealthyStalled", self->distributorId).detail("Primary", self->primary).detail("ZeroHealthy", self->zeroHealthyTeams->get()).detail("ProcessingUnhealthy", self->processingUnhealthy->get());
 			wait(self->zeroHealthyTeams->onChange() || self->processingUnhealthy->onChange());
+			waitCount = 0;
 		}
 		wait(delay(SERVER_KNOBS->DD_STALL_CHECK_DELAY, TaskPriority::Low)); // After the team trackers wait on the initial failure reaction delay, they yield. We want to make sure every tracker has had the opportunity to send their relocations to the queue.
 		if(!self->zeroHealthyTeams->get() && !self->processingUnhealthy->get()) {
-			return Void();
+			if (extraDelay <= 0.01 || waitCount >= 1) {
+				// Return healthy if no extra delay is needed, or when DD has been healthy on at least two consecutive checks
+				return Void();
+			} else {
+				wait(delay(extraDelay, TaskPriority::Low));
+				waitCount++;
+			}
 		}
 	}
 }
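Note how the two additions interact: waitCount is reset whenever DD is observed unhealthy, so with a positive extraDelay the function returns only after DD has been seen healthy on two consecutive checks separated by extraDelay. Callers that pass no extraDelay (the default of 0) keep the previous single-check behavior.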
@@ -2537,7 +2545,7 @@ ACTOR Future<Void> serverTeamRemover(DDTeamCollection* self) {
 		// To avoid removing server teams too fast, which is unlikely to happen anyway
 		wait(delay(removeServerTeamDelay));
 
-		wait(waitUntilHealthy(self));
+		wait(waitUntilHealthy(self, SERVER_KNOBS->TR_REMOVE_SERVER_TEAM_EXTRA_DELAY));
 		// Wait for badTeamRemover() to avoid a potential race between
 		// adding a bad team (adding its team tracker) and removing a bad team (cancelling its team tracker).
 		wait(self->badTeamRemover);
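With this change, serverTeamRemover waits TR_REMOVE_SERVER_TEAM_EXTRA_DELAY (5 seconds by default, see the knob below) and rechecks DD health before removing a server team, which gives machineTeamRemover the opportunity to finish reducing the machine team count first.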
@@ -185,6 +185,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
 	TR_FLAG_REMOVE_MT_WITH_MOST_TEAMS = true; if( randomize && BUGGIFY ) TR_FLAG_REMOVE_MT_WITH_MOST_TEAMS = deterministicRandom()->random01() < 0.1 ? true : false;
 	TR_FLAG_DISABLE_SERVER_TEAM_REMOVER = false; if( randomize && BUGGIFY ) TR_FLAG_DISABLE_SERVER_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default; disables the serverTeamRemover when true
 	init( TR_REMOVE_SERVER_TEAM_DELAY, 60.0 ); if( randomize && BUGGIFY ) TR_REMOVE_SERVER_TEAM_DELAY = deterministicRandom()->random01() * 60.0;
+	init( TR_REMOVE_SERVER_TEAM_EXTRA_DELAY, 5.0 ); if( randomize && BUGGIFY ) TR_REMOVE_SERVER_TEAM_EXTRA_DELAY = deterministicRandom()->random01() * 10.0;
 
 	// Redwood Storage Engine
 	init( PREFIX_TREE_IMMEDIATE_KEY_SIZE_LIMIT, 30 );
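The following is a minimal sketch of the knob-randomization pattern in the hunk above (assumptions: std::mt19937 stands in for FDB's deterministicRandom(), and a `buggify` flag for the BUGGIFY fault-injection macro). In production the knob keeps its default of 5.0; in randomized simulation it is drawn uniformly from [0, 10), which exercises both the "no extra delay" fast path (extraDelay <= 0.01) and the double-check path in waitUntilHealthy().

#include <iostream>
#include <random>

double initRemoveServerTeamExtraDelay(bool randomize, bool buggify, uint32_t seed) {
    double delay = 5.0; // production default
    if (randomize && buggify) {
        std::mt19937 rng(seed); // seeded, so a given simulation run is reproducible
        delay = std::uniform_real_distribution<double>(0.0, 10.0)(rng);
    }
    return delay;
}

int main() {
    std::cout << initRemoveServerTeamExtraDelay(false, false, 42) << "\n"; // 5.0
    std::cout << initRemoveServerTeamExtraDelay(true, true, 42) << "\n";   // in [0, 10)
}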
@@ -148,6 +148,7 @@ public:
 
 	bool TR_FLAG_DISABLE_SERVER_TEAM_REMOVER; // disable the serverTeamRemover actor
 	double TR_REMOVE_SERVER_TEAM_DELAY; // wait for the specified time before trying to remove the next server team
+	double TR_REMOVE_SERVER_TEAM_EXTRA_DELAY; // serverTeamRemover waits for the delay and checks DD healthiness again, to ensure it runs after machineTeamRemover
 
 	double DD_FAILURE_TIME;
 	double DD_ZERO_HEALTHY_TEAM_DELAY;