TeamCollection: Add knobs for team remover

Added three knobs to control the team remover:

bool TR_FLAG_DISABLE_TEAM_REMOVER:
	Disable the teamRemover actor
double TR_REMOVE_MACHINE_TEAM_DELAY:
	Wait for the specified time before trying to remove the next machine team
double TR_WAIT_FOR_ALL_MACHINES_HEALTHY_DELAY:
	Wait before checking if all machines are healthy
This commit is contained in:
Meng Xu 2019-02-13 15:11:53 -08:00
parent 01e55e43bd
commit 5481851e82
5 changed files with 21 additions and 4 deletions

View File

@ -2268,6 +2268,11 @@ ACTOR Future<Void> removeBadTeams(DDTeamCollection* self) {
ACTOR Future<Void> teamRemover(DDTeamCollection* self) {
state int numMachineTeamRemoved = 0;
loop {
// In case the teamRemover causes problems in production, we can disable it
if (SERVER_KNOBS->TR_FLAG_DISABLE_TEAM_REMOVER) {
break; // directly return Void()
}
// Wait on processingUnhealthy as removeBadTeams() does
loop {
while (self->zeroHealthyTeams->get() || self->processingUnhealthy->get()) {
@ -2280,6 +2285,10 @@ ACTOR Future<Void> teamRemover(DDTeamCollection* self) {
break;
}
}
// To avoid removing machine teams too fast, although that is unlikely to happen
wait( delay(SERVER_KNOBS->TR_REMOVE_MACHINE_TEAM_DELAY) );
// Wait for the badTeamRemover() to avoid the potential race between adding the bad team (add the team tracker)
// and remove bad team (cancel the team tracker).
wait(self->badTeamRemover);
@ -2289,7 +2298,7 @@ ACTOR Future<Void> teamRemover(DDTeamCollection* self) {
// Check if all machines are healthy, if not, we wait for 1 second and loop back.
// Eventually, all machines will become healthy.
if (totalHealthyMachineCount != self->machine_info.size()) {
wait(delay(1.0));
wait( delay(SERVER_KNOBS->TR_WAIT_FOR_ALL_MACHINES_HEALTHY_DELAY) );
continue;
}

View File

@ -141,7 +141,6 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( ALL_DATA_REMOVED_DELAY, 1.0 );
init( INITIAL_FAILURE_REACTION_DELAY, 30.0 ); if( randomize && BUGGIFY ) INITIAL_FAILURE_REACTION_DELAY = 0.0;
init( CHECK_TEAM_DELAY, 30.0 );
init( CHECK_REDUNDANT_TEAM_DELAY, 30.0 ); // seconds
init( LOG_ON_COMPLETION_DELAY, DD_QUEUE_LOGGING_INTERVAL );
init( BEST_TEAM_MAX_TEAM_TRIES, 10 );
init( BEST_TEAM_OPTION_COUNT, 4 );
@ -167,6 +166,11 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
init( MOVEKEYS_LOCK_POLLING_DELAY, 5.0 );
init( DEBOUNCE_RECRUITING_DELAY, 5.0 );
// TeamRemover
TR_FLAG_DISABLE_TEAM_REMOVER = false; if( randomize && BUGGIFY ) TR_FLAG_DISABLE_TEAM_REMOVER = g_random->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
init( TR_REMOVE_MACHINE_TEAM_DELAY, 1.0 ); if( randomize && BUGGIFY ) TR_REMOVE_MACHINE_TEAM_DELAY = g_random->random01() * 60.0;
init( TR_WAIT_FOR_ALL_MACHINES_HEALTHY_DELAY, 1.0 ); if( randomize && BUGGIFY ) TR_WAIT_FOR_ALL_MACHINES_HEALTHY_DELAY = g_random->random01() * 60.0;
// Redwood Storage Engine
init( PREFIX_TREE_IMMEDIATE_KEY_SIZE_LIMIT, 30 );
init( PREFIX_TREE_IMMEDIATE_KEY_SIZE_MIN, 0 );

View File

@ -129,6 +129,10 @@ public:
int64_t DD_LOCATION_CACHE_SIZE;
double MOVEKEYS_LOCK_POLLING_DELAY;
double DEBOUNCE_RECRUITING_DELAY;
// TeamRemover to remove redundant teams
bool TR_FLAG_DISABLE_TEAM_REMOVER; // disable the teamRemover actor
double TR_REMOVE_MACHINE_TEAM_DELAY; // wait for the specified time before try to remove next machine team
double TR_WAIT_FOR_ALL_MACHINES_HEALTHY_DELAY; // wait before checking if all machines are healthy
// Redwood Storage Engine
int PREFIX_TREE_IMMEDIATE_KEY_SIZE_LIMIT;

View File

@ -209,7 +209,7 @@ struct ConsistencyCheckWorkload : TestWorkload
// Check that the number of process (and machine) teams is no larger than
// the allowed maximum number of teams
bool teamCollectionValid = wait(getTeamCollectionValid(cx, self->dbInfo));
if (!teamCollectionValid) {
if (!teamCollectionValid && SERVER_KNOBS->TR_FLAG_DISABLE_TEAM_REMOVER == false) {
TraceEvent(SevError, "ConsistencyCheck_TooManyTeams");
self->testFailure("The number of process or machine teams is larger than the allowed maximum number of teams");
}

View File

@ -68,7 +68,7 @@ namespace std {
class IRandom {
public:
virtual double random01() = 0;
virtual double random01() = 0; // return random value in [0, 1]
virtual int randomInt(int min, int maxPlusOne) = 0;
virtual int64_t randomInt64(int64_t min, int64_t maxPlusOne) = 0;
virtual uint32_t randomUInt32() = 0;