fix: the cluster controller did not treat a master that shares the cluster controller's process as a bad placement in every location where that check is needed
fix: the cluster controller waited too long for good recruitment locations, which added too much time to recoveries of clusters that do not use machine classes
This commit is contained in:
parent
080a454051
commit
b7dde88029
|
@ -734,7 +734,7 @@ public:
|
|||
|
||||
if ( oldMasterFit < newMasterFit )
|
||||
return false;
|
||||
if ( oldMasterFit > newMasterFit )
|
||||
if ( oldMasterFit > newMasterFit || ( dbi.master.locality.processId() == clusterControllerProcessId && mworker.first.locality.processId() != clusterControllerProcessId ) )
|
||||
return true;
|
||||
|
||||
// Check tLog fitness
|
||||
|
@ -840,7 +840,8 @@ ACTOR Future<Void> clusterWatchDatabase( ClusterControllerData* cluster, Cluster
|
|||
state double recoveryStart = now();
|
||||
TraceEvent("CCWDB", cluster->id).detail("Recruiting", "Master");
|
||||
state std::pair<WorkerInterface, ProcessClass> masterWorker = cluster->getMasterWorker(db->config);
|
||||
if( masterWorker.second.machineClassFitness( ProcessClass::Master ) > SERVER_KNOBS->EXPECTED_MASTER_FITNESS && now() - cluster->startTime < SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY ) {
|
||||
if( ( masterWorker.second.machineClassFitness( ProcessClass::Master ) > SERVER_KNOBS->EXPECTED_MASTER_FITNESS || masterWorker.first.locality.processId() == cluster->clusterControllerProcessId )
|
||||
&& now() - cluster->startTime < SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY ) {
|
||||
TraceEvent("CCWDB", cluster->id).detail("Fitness", masterWorker.second.machineClassFitness( ProcessClass::Master ));
|
||||
Void _ = wait( delay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY) );
|
||||
continue;
|
||||
|
|
|
@ -259,8 +259,8 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
|
|||
init( SIM_SHUTDOWN_TIMEOUT, 10 );
|
||||
init( SHUTDOWN_TIMEOUT, 600 ); if( randomize && BUGGIFY ) SHUTDOWN_TIMEOUT = 60.0;
|
||||
init( MASTER_SPIN_DELAY, 1.0 ); if( randomize && BUGGIFY ) MASTER_SPIN_DELAY = 10.0;
|
||||
init( WAIT_FOR_GOOD_RECRUITMENT_DELAY, 1.0 );
|
||||
init( ATTEMPT_RECRUITMENT_DELAY, 0.05 );
|
||||
init( WAIT_FOR_GOOD_RECRUITMENT_DELAY, 0.1 );
|
||||
init( ATTEMPT_RECRUITMENT_DELAY, 0.35 );
|
||||
init( WORKER_FAILURE_TIME, 1.0 ); if( randomize && BUGGIFY ) WORKER_FAILURE_TIME = 10.0;
|
||||
init( CHECK_BETTER_MASTER_INTERVAL, 1.0 ); if( randomize && BUGGIFY ) CHECK_BETTER_MASTER_INTERVAL = 0.001;
|
||||
init( INCOMPATIBLE_PEERS_LOGGING_INTERVAL, 600 ); if( randomize && BUGGIFY ) INCOMPATIBLE_PEERS_LOGGING_INTERVAL = 60.0;
|
||||
|
|
Loading…
Reference in New Issue