A version that passes 100K simulations

Zhe Wu 2023-07-26 11:12:46 -07:00
parent dcfbb55ad5
commit 3056702f2b
6 changed files with 38 additions and 12 deletions

View File

@@ -686,14 +686,14 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( REPLACE_INTERFACE_CHECK_DELAY, 5.0 );
init( COORDINATOR_REGISTER_INTERVAL, 5.0 );
init( CLIENT_REGISTER_INTERVAL, 600.0 );
-init( CC_ENABLE_WORKER_HEALTH_MONITOR, true );
-init( CC_WORKER_HEALTH_CHECKING_INTERVAL, 30.0 );
+init( CC_ENABLE_WORKER_HEALTH_MONITOR, false );
+init( CC_WORKER_HEALTH_CHECKING_INTERVAL, 60.0 );
init( CC_DEGRADED_LINK_EXPIRATION_INTERVAL, 300.0 );
-init( CC_MIN_DEGRADATION_INTERVAL, 60.0 );
+init( CC_MIN_DEGRADATION_INTERVAL, 120.0 );
init( ENCRYPT_KEY_PROXY_FAILURE_TIME, 0.1 ); if ( isSimulated ) ENCRYPT_KEY_PROXY_FAILURE_TIME = 1.0 + deterministicRandom()->random01();
init( CC_DEGRADED_PEER_DEGREE_TO_EXCLUDE, 3 );
init( CC_MAX_EXCLUSION_DUE_TO_HEALTH, 2 );
-init( CC_HEALTH_TRIGGER_RECOVERY, true );
+init( CC_HEALTH_TRIGGER_RECOVERY, false );
init( CC_TRACKING_HEALTH_RECOVERY_INTERVAL, 3600.0 );
init( CC_MAX_HEALTH_RECOVERY_COUNT, 5 );
init( CC_HEALTH_TRIGGER_FAILOVER, false );
@@ -974,7 +974,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( MIN_DELAY_CC_WORST_FIT_CANDIDACY_SECONDS, 10.0 );
init( MAX_DELAY_CC_WORST_FIT_CANDIDACY_SECONDS, 30.0 );
init( DBINFO_FAILED_DELAY, 1.0 );
-init( ENABLE_WORKER_HEALTH_MONITOR, true ); if ( randomize && BUGGIFY ) ENABLE_WORKER_HEALTH_MONITOR = true;
+init( ENABLE_WORKER_HEALTH_MONITOR, false ); if ( randomize && BUGGIFY ) ENABLE_WORKER_HEALTH_MONITOR = true;
init( WORKER_HEALTH_MONITOR_INTERVAL, 60.0 );
init( PEER_LATENCY_CHECK_MIN_POPULATION, 30 );
init( PEER_LATENCY_DEGRADATION_PERCENTILE, 0.50 );
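
The ENABLE_WORKER_HEALTH_MONITOR line keeps its `if ( randomize && BUGGIFY ) ENABLE_WORKER_HEALTH_MONITOR = true;` tail, so the monitor now defaults to off but randomized simulation can still flip it on and keep exercising the code path. A minimal stand-alone sketch of that pattern (plain C++; the Knobs struct, probability, and randomize flag below are illustrative stand-ins, not the actual ServerKnobs/BUGGIFY machinery):

#include <iostream>
#include <random>

// Illustrative knob container; FoundationDB's real knobs are set up through
// the init(...) calls shown in the diff above.
struct Knobs {
    bool enableWorkerHealthMonitor = false; // new production default
    double workerHealthMonitorInterval = 60.0;

    void initialize(bool randomize, std::mt19937& rng) {
        // "BUGGIFY"-style override: in randomized simulation runs the knob is
        // occasionally turned back on so the feature still gets test coverage.
        if (randomize && std::bernoulli_distribution(0.25)(rng)) {
            enableWorkerHealthMonitor = true;
        }
    }
};

int main() {
    std::mt19937 rng(std::random_device{}());
    Knobs knobs;
    knobs.initialize(/*randomize=*/true, rng);
    std::cout << "worker health monitor enabled: " << std::boolalpha
              << knobs.enableWorkerHealthMonitor << "\n";
}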

View File

@@ -129,7 +129,6 @@ bool ClusterControllerData::transactionSystemContainsDegradedServers() {
}
}
-/*
if (recoveryData->recoveryState < RecoveryState::ACCEPTING_COMMITS) {
for (const auto& tlog : recoveryData->recruitment.tLogs) {
if (tlog.addresses().contains(server)) {
@@ -141,7 +140,7 @@ bool ClusterControllerData::transactionSystemContainsDegradedServers() {
return true;
}
}
-}*/
+}
for (const auto& proxy : dbi.client.grvProxies) {
if (proxy.addresses().contains(server)) {
@@ -758,6 +757,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
}
ACTOR Future<Void> doCheckOutstandingRequests(ClusterControllerData* self) {
+TraceEvent("ZZZZZdoCheckOutstandingRequestsCall").log();
try {
wait(delay(SERVER_KNOBS->CHECK_OUTSTANDING_INTERVAL));
while (now() - self->lastRecruitTime < SERVER_KNOBS->SINGLETON_RECRUIT_BME_DELAY ||
@@ -788,6 +788,7 @@ ACTOR Future<Void> doCheckOutstandingRequests(ClusterControllerData* self) {
TraceEvent(SevError, "CheckOutstandingError").error(e);
}
}
+TraceEvent("ZZZZZdoCheckOutstandingRequestsReturn").log();
return Void();
}
@@ -808,6 +809,7 @@ ACTOR Future<Void> doCheckOutstandingRemoteRequests(ClusterControllerData* self)
}
void checkOutstandingRequests(ClusterControllerData* self) {
+TraceEvent("ZZZZZCallCheckOutstandingRequests").log();
if (self->outstandingRemoteRequestChecker.isReady()) {
self->outstandingRemoteRequestChecker = doCheckOutstandingRemoteRequests(self);
}
@@ -2848,11 +2850,13 @@ ACTOR Future<Void> workerHealthMonitor(ClusterControllerData* self) {
// Compare `self->degradationInfo` with `self->excludedDegradedServers` and remove those that have
// recovered.
+bool hasRecoveredServer = false;
for (auto it = self->excludedDegradedServers.begin(); it != self->excludedDegradedServers.end();) {
if (self->degradationInfo.degradedServers.find(*it) == self->degradationInfo.degradedServers.end() &&
self->degradationInfo.disconnectedServers.find(*it) ==
self->degradationInfo.disconnectedServers.end()) {
self->excludedDegradedServers.erase(it++);
+hasRecoveredServer = true;
} else {
++it;
}
@@ -2911,6 +2915,10 @@ ACTOR Future<Void> workerHealthMonitor(ClusterControllerData* self) {
}
}
+if (hasRecoveredServer) {
+checkOutstandingRequests(self);
+}
wait(delay(SERVER_KNOBS->CC_WORKER_HEALTH_CHECKING_INTERVAL));
} catch (Error& e) {
TraceEvent(SevWarnAlways, "ClusterControllerHealthMonitorError").error(e);
@@ -3097,7 +3105,7 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
for (auto const& [id, worker] : self.id_worker) {
if ((req.flags & GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY) &&
-self.db.config.isExcludedServer(worker.details.interf.addresses())) {
+(self.db.config.isExcludedServer(worker.details.interf.addresses()) || self.isExcludedDegradedServer(worker.details.interf.addresses()))) {
continue;
}
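
The workerHealthMonitor change above introduces `hasRecoveredServer`: while walking `excludedDegradedServers`, entries that are no longer degraded or disconnected are erased, and if anything was erased the controller re-runs checkOutstandingRequests so recruitment can use the recovered capacity. A stand-alone sketch of that bookkeeping (simplified types and hypothetical addresses, not the actual ClusterControllerData members):

#include <iostream>
#include <set>
#include <string>

int main() {
    // Stand-ins for the excluded / degraded / disconnected server sets the
    // cluster controller tracks.
    std::set<std::string> excluded     = { "10.0.0.1:4500", "10.0.0.2:4500" };
    std::set<std::string> degraded     = { "10.0.0.2:4500" };
    std::set<std::string> disconnected = {};

    bool hasRecoveredServer = false;
    for (auto it = excluded.begin(); it != excluded.end();) {
        if (degraded.count(*it) == 0 && disconnected.count(*it) == 0) {
            it = excluded.erase(it); // safe erase-while-iterating on std::set
            hasRecoveredServer = true;
        } else {
            ++it;
        }
    }

    if (hasRecoveredServer) {
        // In the commit this is checkOutstandingRequests(self), letting the
        // controller reconsider outstanding recruitment requests.
        std::cout << "re-check outstanding recruitment requests\n";
    }
    std::cout << "still excluded: " << excluded.size() << "\n";
}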

View File

@@ -1496,6 +1496,9 @@ public:
for (auto& it : id_worker) {
auto fitness = it.second.details.processClass.machineClassFitness(role);
+if (role == ProcessClass::ClusterRole::Master) {
+TraceEvent("ZZZZZZGetMasterRoleFitness").detail("Worker", it.second.details.interf.address()).detail("Fitness", fitness);
+}
if (conf.isExcludedServer(it.second.details.interf.addresses()) ||
isExcludedDegradedServer(it.second.details.interf.addresses())) {
fitness = std::max(fitness, ProcessClass::ExcludeFit);
@@ -2388,6 +2391,7 @@ public:
// This function returns true when the cluster controller determines it is worth forcing
// a cluster recovery in order to change the recruited processes in the transaction subsystem.
bool betterMasterExists() {
+TraceEvent("ZZZZZCallingBetterMasterExist").log();
const ServerDBInfo dbi = db.serverInfo->get();
if (dbi.recoveryState < RecoveryState::ACCEPTING_COMMITS) {
@@ -2577,6 +2581,7 @@ public:
;
return false;
}
+TraceEvent("ZZZZZZMasterFit").detail("Old", oldMasterFit).detail("New", newMasterFit);
if (oldMasterFit > newMasterFit || (dbi.master.locality.processId() == clusterControllerProcessId &&
mworker.worker.interf.locality.processId() != clusterControllerProcessId)) {
TraceEvent("BetterMasterExists", id)

View File

@@ -129,14 +129,14 @@ struct ClogTlogWorkload : TestWorkload {
}
ACTOR static Future<Void> excludeFailedLog(ClogTlogWorkload* self, Database cx) {
-state Future<Void> timeout = delay(300);
+state Future<Void> timeout = delay(30);
loop choose {
when(wait(self->dbInfo->onChange())) {
if (self->dbInfo->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS) {
return Void();
}
-timeout = delay(300);
+timeout = delay(30);
}
when(wait(timeout)) {
// recovery state hasn't changed in 30s, exclude the failed tlog
@@ -157,7 +157,6 @@ struct ClogTlogWorkload : TestWorkload {
self->useDisconnection = true;
}
-state double workloadEnd = now() + self->testDuration - 10;
// Let cycle workload issue some transactions.
wait(delay(20.0));
@@ -166,6 +165,7 @@ struct ClogTlogWorkload : TestWorkload {
}
double startTime = now();
+state double workloadEnd = now() + self->testDuration - 10;
TraceEvent("ClogTlog").detail("StartTime", startTime).detail("EndTime", workloadEnd);
// Clog and wait for recovery to happen
@@ -174,8 +174,14 @@ struct ClogTlogWorkload : TestWorkload {
wait(self->dbInfo->onChange());
}
+state bool useGrayFailureToRecover = false;
+if (deterministicRandom()->coinflip() && self->useDisconnection) {
+TraceEvent("ClogTlogUseGrayFailreToRecover").log();
+useGrayFailureToRecover = true;
+}
// start exclusion and wait for fully recovery
// state Future<Void> excludeLog = excludeFailedLog(self, cx);
+state Future<Void> excludeLog = useGrayFailureToRecover ? Never() : excludeFailedLog(self, cx);
state Future<Void> onChange = self->dbInfo->onChange();
loop choose {
when(wait(onChange)) {

View File

@@ -1394,6 +1394,7 @@ struct ConsistencyCheckWorkload : TestWorkload {
std::map<NetworkAddress, WorkerDetails> nonExcludedWorkerProcessMap;
std::map<Optional<Key>, std::vector<ProcessClass::ClassType>> dcToNonExcludedClassTypes;
for (const auto& worker : nonExcludedWorkers) {
+TraceEvent("ZZZZZZNonExcludedWorkerClass").detail("Worker", worker.interf.address()).detail("Class", worker.processClass.classType());
nonExcludedWorkerProcessMap[worker.interf.address()] = worker;
Optional<Key> dc = worker.interf.locality.dcId();
if (!dcToNonExcludedClassTypes.count(dc))

View File

@@ -5,10 +5,16 @@ machineCount = 20
commitProxyCount = 4
config = 'triple'
desiredTLogCount = 6
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4, 5]
[[knobs]]
enable_worker_health_monitor = true
cc_enable_worker_health_monitor = true
cc_worker_health_checking_interval = 15
cc_min_degradation_interval = 30
cc_health_trigger_recovery = true
peer_latency_degradation_threshold = 1
[[test]]
testTitle = 'ClogTlog'