The cluster controller will increase its failure monitoring delay after there have been many unfinishedRecoveries

Evan Tschannen 2018-08-31 10:51:55 -07:00
parent 04d998978c
commit e60c668853
3 changed files with 24 additions and 6 deletions

fdbclient/Knobs.cpp

@@ -39,6 +39,8 @@ ClientKnobs::ClientKnobs(bool randomize) {
 	init( FAILURE_MIN_DELAY, 4.0 ); if( randomize && BUGGIFY ) FAILURE_MIN_DELAY = 1.0;
 	init( FAILURE_TIMEOUT_DELAY, FAILURE_MIN_DELAY );
 	init( CLIENT_FAILURE_TIMEOUT_DELAY, FAILURE_MIN_DELAY );
+	init( FAILURE_EMERGENCY_DELAY, 60.0 );
+	init( FAILURE_MAX_GENERATIONS, 4 );
 	// wrong_shard_server sometimes comes from the only nonfailed server, so we need to avoid a fast spin

fdbclient/Knobs.h

@@ -38,6 +38,8 @@ public:
 	double FAILURE_MIN_DELAY;
 	double FAILURE_TIMEOUT_DELAY;
 	double CLIENT_FAILURE_TIMEOUT_DELAY;
+	double FAILURE_EMERGENCY_DELAY;
+	double FAILURE_MAX_GENERATIONS;
 	// wrong_shard_server sometimes comes from the only nonfailed server, so we need to avoid a fast spin
 	double WRONG_SHARD_SERVER_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is mostly wrong (e.g. dumping the database after a test)

fdbserver/ClusterController.actor.cpp

@@ -94,8 +94,10 @@ public:
 		DatabaseConfiguration config;   // Asynchronously updated via master registration
 		DatabaseConfiguration fullyRecoveredConfig;
 		Database db;
+		int unfinishedRecoveries;
+		int logGenerations;

-		DBInfo() : masterRegistrationCount(0), recoveryStalled(false), forceRecovery(false),
+		DBInfo() : masterRegistrationCount(0), recoveryStalled(false), forceRecovery(false), unfinishedRecoveries(0), logGenerations(0),
 			clientInfo( new AsyncVar<ClientDBInfo>( ClientDBInfo() ) ),
 			serverInfo( new AsyncVar<ServerDBInfo>( ServerDBInfo( LiteralStringRef("DB") ) ) ),
 			db( DatabaseContext::create( clientInfo, Future<Void>(), LocalityData(), true, TaskDefaultEndpoint, true ) )  // SOMEDAY: Locality!
@@ -1059,6 +1061,7 @@ ACTOR Future<Void> clusterWatchDatabase( ClusterControllerData* cluster, Cluster
 			rmq.forceRecovery = db->forceRecovery;

 			cluster->masterProcessId = masterWorker.worker.first.locality.processId();
+			cluster->db.unfinishedRecoveries++;
 			ErrorOr<MasterInterface> newMaster = wait( masterWorker.worker.first.master.tryGetReply( rmq ) );
 			if (newMaster.present()) {
 				TraceEvent("CCWDB", cluster->id).detail("Recruited", newMaster.get().id());
@@ -1345,7 +1348,7 @@ struct FailureStatusInfo {
 };

 //The failure monitor client relies on the fact that the failure detection server will not declare itself failed
-ACTOR Future<Void> failureDetectionServer( UID uniqueID, FutureStream< FailureMonitoringRequest > requests ) {
+ACTOR Future<Void> failureDetectionServer( UID uniqueID, ClusterControllerData::DBInfo* db, FutureStream< FailureMonitoringRequest > requests ) {
 	state Version currentVersion = 0;
 	state std::map<NetworkAddress, FailureStatusInfo> currentStatus;	// The status at currentVersion
 	state std::deque<SystemFailureStatus> statusHistory;	// The last change in statusHistory is from currentVersion-1 to currentVersion
@@ -1443,13 +1446,16 @@ ACTOR Future<Void> failureDetectionServer( UID uniqueID, FutureStream< FailureMo
 			//TraceEvent("FailureDetectionPoll", uniqueID).detail("PivotDelay", pivotDelay).detail("Clients", currentStatus.size());
 			//TraceEvent("FailureDetectionAcceptableDelay").detail("Delay", acceptableDelay1000);

+			bool tooManyLogGenerations = std::max(db->unfinishedRecoveries, db->logGenerations) > CLIENT_KNOBS->FAILURE_MAX_GENERATIONS;
 			for(auto it = currentStatus.begin(); it != currentStatus.end(); ) {
 				double delay = t - it->second.lastRequestTime;
-				if ( it->first != g_network->getLocalAddress() && ( delay > pivotDelay * 2 + FLOW_KNOBS->SERVER_REQUEST_INTERVAL + CLIENT_KNOBS->FAILURE_MIN_DELAY || delay > CLIENT_KNOBS->FAILURE_MAX_DELAY ) ) {
+				if ( it->first != g_network->getLocalAddress() && ( tooManyLogGenerations ?
+					( delay > CLIENT_KNOBS->FAILURE_EMERGENCY_DELAY ) :
+					( delay > pivotDelay * 2 + FLOW_KNOBS->SERVER_REQUEST_INTERVAL + CLIENT_KNOBS->FAILURE_MIN_DELAY || delay > CLIENT_KNOBS->FAILURE_MAX_DELAY ) ) ) {
 					//printf("Failure Detection Server: Status of '%s' is now '%s' after %f sec\n", it->first.toString().c_str(), "Failed", now() - it->second.lastRequestTime);
 					TraceEvent("FailureDetectionStatus", uniqueID).detail("System", it->first).detail("Status","Failed").detail("Why", "Timeout").detail("LastRequestAge", delay)
-						.detail("PivotDelay", pivotDelay);
+						.detail("PivotDelay", pivotDelay).detail("UnfinishedRecoveries", db->unfinishedRecoveries).detail("LogGenerations", db->logGenerations);
 					statusHistory.push_back( SystemFailureStatus( it->first, FailureStatus(true) ) );
 					++currentVersion;
 					it = currentStatus.erase(it);
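
This hunk is the heart of the change: once std::max(unfinishedRecoveries, logGenerations) exceeds FAILURE_MAX_GENERATIONS, the server abandons the adaptive pivot-based timeout and only declares a peer failed after the much longer FAILURE_EMERGENCY_DELAY. A self-contained sketch of the decision, with stand-in constants for the knobs (FAILURE_EMERGENCY_DELAY and FAILURE_MAX_GENERATIONS match this commit; FAILURE_MAX_DELAY and SERVER_REQUEST_INTERVAL values are assumptions for illustration):

#include <algorithm>

const double FAILURE_MIN_DELAY = 4.0;
const double FAILURE_MAX_DELAY = 10.0;        // assumed value, not from this diff
const double SERVER_REQUEST_INTERVAL = 1.0;   // assumed value, not from this diff
const double FAILURE_EMERGENCY_DELAY = 60.0;
const int    FAILURE_MAX_GENERATIONS = 4;

// Mirrors the branch added above: `delay` is how long a client has gone
// without polling, `pivotDelay` is the adaptive baseline from recent polls.
bool shouldDeclareFailed(double delay, double pivotDelay, int unfinishedRecoveries, int logGenerations) {
	bool tooManyLogGenerations = std::max(unfinishedRecoveries, logGenerations) > FAILURE_MAX_GENERATIONS;
	if (tooManyLogGenerations)
		return delay > FAILURE_EMERGENCY_DELAY; // emergency mode: tolerate a full minute of silence
	return delay > pivotDelay * 2 + SERVER_REQUEST_INTERVAL + FAILURE_MIN_DELAY
	    || delay > FAILURE_MAX_DELAY;           // normal adaptive timeout
}

With the assumed values and a pivotDelay of about one second, the normal path fails a peer after roughly seven seconds of silence, while emergency mode waits a full minute, giving a struggling cluster time to finish a recovery rather than kicking off yet another one.
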
@@ -1555,6 +1561,14 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c
 		return;
 	}

+	if ( req.recoveryState == RecoveryState::FULLY_RECOVERED ) {
+		self->db.unfinishedRecoveries = 0;
+		self->db.logGenerations = 0;
+		ASSERT( !req.logSystemConfig.oldTLogs.size() );
+	} else {
+		self->db.logGenerations = std::max<int>(self->db.logGenerations, req.logSystemConfig.oldTLogs.size());
+	}
+
 	db->masterRegistrationCount = req.registrationCount;
 	db->recoveryStalled = req.recoveryStalled;
 	if ( req.configuration.present() ) {
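
This is the bookkeeping side of the change: only a registration in FULLY_RECOVERED clears both counters (at which point no old tlog generations can remain, hence the ASSERT), while any earlier recovery state ratchets logGenerations up to the current count of old tlog sets. A condensed sketch with simplified stand-in types (the real ones live in ClusterController.actor.cpp):

#include <algorithm>
#include <cassert>
#include <cstddef>

// Simplified stand-in for the counters on ClusterControllerData::DBInfo.
struct DBInfoCounters {
	int unfinishedRecoveries = 0;
	int logGenerations = 0;
};

void onMasterRegistration(DBInfoCounters& db, bool fullyRecovered, std::size_t oldTLogCount) {
	if (fullyRecovered) {
		db.unfinishedRecoveries = 0;
		db.logGenerations = 0;
		assert(oldTLogCount == 0); // mirrors ASSERT( !req.logSystemConfig.oldTLogs.size() )
	} else {
		// A recovery in progress keeps old tlog generations alive; remember the peak.
		db.logGenerations = std::max<int>(db.logGenerations, (int)oldTLogCount);
	}
}
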
@@ -2135,7 +2149,7 @@ ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf,
 	state Future<ErrorOr<Void>> error = errorOr( actorCollection( addActor.getFuture() ) );
 	auto pSelf = &self;

-	addActor.send( failureDetectionServer( self.id, interf.clientInterface.failureMonitoring.getFuture() ) );
+	addActor.send( failureDetectionServer( self.id, &self.db, interf.clientInterface.failureMonitoring.getFuture() ) );
 	addActor.send( clusterWatchDatabase( &self, &self.db ) );  // Start the master database
 	addActor.send( self.updateWorkerList.init( self.db.db ) );
 	addActor.send( statusServer( interf.clientInterface.databaseStatus.getFuture(), &self, coordinators));