TeamCollection: Add knobs for team remover
Added three knobs to control the team remover: (1) bool TR_FLAG_DISABLE_TEAM_REMOVER: disables the teamRemover actor; (2) double TR_REMOVE_MACHINE_TEAM_DELAY: waits for the specified time before trying to remove the next machine team; (3) double TR_WAIT_FOR_ALL_MACHINES_HEALTHY_DELAY: waits before checking whether all machines are healthy.
This commit is contained in:
parent
01e55e43bd
commit
5481851e82
|
@ -2268,6 +2268,11 @@ ACTOR Future<Void> removeBadTeams(DDTeamCollection* self) {
|
|||
ACTOR Future<Void> teamRemover(DDTeamCollection* self) {
|
||||
state int numMachineTeamRemoved = 0;
|
||||
loop {
|
||||
// In case the teamRemover cause problems in production, we can disable it
|
||||
if (SERVER_KNOBS->TR_FLAG_DISABLE_TEAM_REMOVER) {
|
||||
break; // directly return Void()
|
||||
}
|
||||
|
||||
// Wait on processingUnhealthy as removeBadTeams() does
|
||||
loop {
|
||||
while (self->zeroHealthyTeams->get() || self->processingUnhealthy->get()) {
|
||||
|
@ -2280,6 +2285,10 @@ ACTOR Future<Void> teamRemover(DDTeamCollection* self) {
|
|||
break;
|
||||
}
|
||||
}
|
||||
// To avoid removing machine teams too fast, although that is unlikely to happen
|
||||
wait( delay(SERVER_KNOBS->TR_REMOVE_MACHINE_TEAM_DELAY) );
|
||||
|
||||
|
||||
// Wait for the badTeamRemover() to avoid the potential race between adding the bad team (add the team tracker)
|
||||
// and remove bad team (cancel the team tracker).
|
||||
wait(self->badTeamRemover);
|
||||
|
@ -2289,7 +2298,7 @@ ACTOR Future<Void> teamRemover(DDTeamCollection* self) {
|
|||
// Check if all machines are healthy, if not, we wait for 1 second and loop back.
|
||||
// Eventually, all machines will become healthy.
|
||||
if (totalHealthyMachineCount != self->machine_info.size()) {
|
||||
wait(delay(1.0));
|
||||
wait( delay(SERVER_KNOBS->TR_WAIT_FOR_ALL_MACHINES_HEALTHY_DELAY) );
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
|
@ -141,7 +141,6 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
|
|||
init( ALL_DATA_REMOVED_DELAY, 1.0 );
|
||||
init( INITIAL_FAILURE_REACTION_DELAY, 30.0 ); if( randomize && BUGGIFY ) INITIAL_FAILURE_REACTION_DELAY = 0.0;
|
||||
init( CHECK_TEAM_DELAY, 30.0 );
|
||||
init( CHECK_REDUNDANT_TEAM_DELAY, 30.0 ); // seconds
|
||||
init( LOG_ON_COMPLETION_DELAY, DD_QUEUE_LOGGING_INTERVAL );
|
||||
init( BEST_TEAM_MAX_TEAM_TRIES, 10 );
|
||||
init( BEST_TEAM_OPTION_COUNT, 4 );
|
||||
|
@ -167,6 +166,11 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
|
|||
init( MOVEKEYS_LOCK_POLLING_DELAY, 5.0 );
|
||||
init( DEBOUNCE_RECRUITING_DELAY, 5.0 );
|
||||
|
||||
// TeamRemover
|
||||
TR_FLAG_DISABLE_TEAM_REMOVER = false; if( randomize && BUGGIFY ) TR_FLAG_DISABLE_TEAM_REMOVER = g_random->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
|
||||
init( TR_REMOVE_MACHINE_TEAM_DELAY, 1.0 ); if( randomize && BUGGIFY ) TR_REMOVE_MACHINE_TEAM_DELAY = g_random->random01() * 60.0;
|
||||
init( TR_WAIT_FOR_ALL_MACHINES_HEALTHY_DELAY, 1.0 ); if( randomize && BUGGIFY ) TR_WAIT_FOR_ALL_MACHINES_HEALTHY_DELAY = g_random->random01() * 60.0;
|
||||
|
||||
// Redwood Storage Engine
|
||||
init( PREFIX_TREE_IMMEDIATE_KEY_SIZE_LIMIT, 30 );
|
||||
init( PREFIX_TREE_IMMEDIATE_KEY_SIZE_MIN, 0 );
|
||||
|
|
|
@ -129,6 +129,10 @@ public:
|
|||
int64_t DD_LOCATION_CACHE_SIZE;
|
||||
double MOVEKEYS_LOCK_POLLING_DELAY;
|
||||
double DEBOUNCE_RECRUITING_DELAY;
|
||||
// TeamRemover to remove redundant teams
|
||||
bool TR_FLAG_DISABLE_TEAM_REMOVER; // disable the teamRemover actor
|
||||
double TR_REMOVE_MACHINE_TEAM_DELAY; // wait for the specified time before try to remove next machine team
|
||||
double TR_WAIT_FOR_ALL_MACHINES_HEALTHY_DELAY; // wait before checking if all machines are healthy
|
||||
|
||||
// Redwood Storage Engine
|
||||
int PREFIX_TREE_IMMEDIATE_KEY_SIZE_LIMIT;
|
||||
|
|
|
@ -209,7 +209,7 @@ struct ConsistencyCheckWorkload : TestWorkload
|
|||
// Check that the number of process (and machine) teams is no larger than
|
||||
// the allowed maximum number of teams
|
||||
bool teamCollectionValid = wait(getTeamCollectionValid(cx, self->dbInfo));
|
||||
if (!teamCollectionValid) {
|
||||
if (!teamCollectionValid && SERVER_KNOBS->TR_FLAG_DISABLE_TEAM_REMOVER == false) {
|
||||
TraceEvent(SevError, "ConsistencyCheck_TooManyTeams");
|
||||
self->testFailure("The number of process or machine teams is larger than the allowed maximum number of teams");
|
||||
}
|
||||
|
|
|
@ -68,7 +68,7 @@ namespace std {
|
|||
|
||||
class IRandom {
|
||||
public:
|
||||
virtual double random01() = 0;
|
||||
virtual double random01() = 0; // return random value in [0, 1]
|
||||
virtual int randomInt(int min, int maxPlusOne) = 0;
|
||||
virtual int64_t randomInt64(int64_t min, int64_t maxPlusOne) = 0;
|
||||
virtual uint32_t randomUInt32() = 0;
|
||||
|
|
Loading…
Reference in New Issue