From b7dde8802940a27388687f33e5e9e8eff4001672 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 6 Feb 2018 11:30:05 -0800 Subject: [PATCH] fix: the cluster controller did not consider the master sharing the same process as the cluster controller as bad in all needed locations, and it waited too long for good recruitment locations, which would add too much time to recoveries of clusters that do not use machine classes --- fdbserver/ClusterController.actor.cpp | 5 +++-- fdbserver/Knobs.cpp | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index eb9cd17b34..ecc635ce33 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -734,7 +734,7 @@ public: if ( oldMasterFit < newMasterFit ) return false; - if ( oldMasterFit > newMasterFit ) + if ( oldMasterFit > newMasterFit || ( dbi.master.locality.processId() == clusterControllerProcessId && mworker.first.locality.processId() != clusterControllerProcessId ) ) return true; // Check tLog fitness @@ -840,7 +840,8 @@ ACTOR Future clusterWatchDatabase( ClusterControllerData* cluster, Cluster state double recoveryStart = now(); TraceEvent("CCWDB", cluster->id).detail("Recruiting", "Master"); state std::pair masterWorker = cluster->getMasterWorker(db->config); - if( masterWorker.second.machineClassFitness( ProcessClass::Master ) > SERVER_KNOBS->EXPECTED_MASTER_FITNESS && now() - cluster->startTime < SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY ) { + if( ( masterWorker.second.machineClassFitness( ProcessClass::Master ) > SERVER_KNOBS->EXPECTED_MASTER_FITNESS || masterWorker.first.locality.processId() == cluster->clusterControllerProcessId ) + && now() - cluster->startTime < SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY ) { TraceEvent("CCWDB", cluster->id).detail("Fitness", masterWorker.second.machineClassFitness( ProcessClass::Master )); Void _ = wait( 
delay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY) ); continue; diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index ebf4db028e..80e0e0351a 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -259,8 +259,8 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( SIM_SHUTDOWN_TIMEOUT, 10 ); init( SHUTDOWN_TIMEOUT, 600 ); if( randomize && BUGGIFY ) SHUTDOWN_TIMEOUT = 60.0; init( MASTER_SPIN_DELAY, 1.0 ); if( randomize && BUGGIFY ) MASTER_SPIN_DELAY = 10.0; - init( WAIT_FOR_GOOD_RECRUITMENT_DELAY, 1.0 ); - init( ATTEMPT_RECRUITMENT_DELAY, 0.05 ); + init( WAIT_FOR_GOOD_RECRUITMENT_DELAY, 0.1 ); + init( ATTEMPT_RECRUITMENT_DELAY, 0.035 ); init( WORKER_FAILURE_TIME, 1.0 ); if( randomize && BUGGIFY ) WORKER_FAILURE_TIME = 10.0; init( CHECK_BETTER_MASTER_INTERVAL, 1.0 ); if( randomize && BUGGIFY ) CHECK_BETTER_MASTER_INTERVAL = 0.001; init( INCOMPATIBLE_PEERS_LOGGING_INTERVAL, 600 ); if( randomize && BUGGIFY ) INCOMPATIBLE_PEERS_LOGGING_INTERVAL = 60.0;