Add waitForUnreliableExtraStoreReboot to wait for extra store to reboot

Meng Xu 2020-03-12 10:18:31 -07:00
parent d87ed92f78
commit a9136f3f72
3 changed files with 99 additions and 13 deletions


@@ -45,9 +45,9 @@ public:
void choosePrimaryAddress() {
if(addresses.secondaryAddress.present() && !g_network->getLocalAddresses().secondaryAddress.present() && (addresses.address.isTLS() != g_network->getLocalAddresses().address.isTLS())) {
if (addresses.address.isTLS()) {
TraceEvent(SevWarn, "MXDEBUGChoosePrimaryAddressSwap").detail("PrimaryAddressWillBeTLS", addresses.secondaryAddress.get().isTLS()).backtrace();
}
// if (addresses.address.isTLS()) {
// TraceEvent(SevWarn, "MXDEBUGChoosePrimaryAddressSwap").detail("PrimaryAddressWillBeTLS", addresses.secondaryAddress.get().isTLS()).backtrace();
// }
std::swap(addresses.address, addresses.secondaryAddress.get());
}
}
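For reference, the swap above only happens when the peer advertises both a TLS and a non-TLS address, the local process listens on a single address, and the peer's current primary does not match the local TLS mode. A minimal standalone sketch of that rule, using simplified stand-in types rather than the real FlowTransport/NetworkAddressList API:

// Simplified sketch; Address and AddressList here are stand-ins, not FDB types.
#include <cassert>
#include <optional>
#include <utility>

struct Address { bool tls = false; };

struct AddressList {
    Address address;                        // current primary
    std::optional<Address> secondaryAddress;
};

void choosePrimaryAddress(AddressList& peer, const AddressList& local) {
    // Swap only when the peer advertises two addresses, we listen on one,
    // and the peer's current primary does not match our TLS mode.
    if (peer.secondaryAddress && !local.secondaryAddress &&
        peer.address.tls != local.address.tls) {
        std::swap(peer.address, *peer.secondaryAddress);
    }
}

int main() {
    AddressList peer{ Address{true}, Address{false} };  // peer primary is TLS, secondary is plain
    AddressList local{ Address{false}, std::nullopt };  // we listen on a single non-TLS address
    choosePrimaryAddress(peer, local);
    assert(!peer.address.tls);                          // primary now matches our non-TLS mode
    return 0;
}

Swapping in place keeps the rest of the connection logic unchanged; it only reorders which of the two advertised addresses is dialed first.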


@@ -400,6 +400,8 @@ std::vector< DiskStore > getDiskStores( std::string folder ) {
return result;
}
// Register the worker interface with the cluster controller (CC), and
// re-register the worker whenever a key role's interface (e.g., CC, DD, Ratekeeper) changes.
ACTOR Future<Void> registrationClient(
Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> ccInterface,
WorkerInterface interf,
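The comment added above summarizes registrationClient's job: send a RegisterWorkerRequest to the current cluster controller and send it again whenever the controller (or another key role it reports) changes. A much-simplified, hypothetical sketch of that re-registration loop, using a plain thread and a generation counter in place of the AsyncVar<Optional<ClusterControllerFullInterface>>:

// Hypothetical sketch only; not flow actors and not the real RegisterWorkerRequest API.
#include <atomic>
#include <chrono>
#include <cstdio>
#include <functional>
#include <thread>

std::atomic<int> controllerGeneration{0};   // bumped whenever the cluster controller changes

void sendRegisterWorker(int generation) {
    std::printf("registering with controller generation %d\n", generation);
}

void registrationClient(std::atomic<bool>& stop) {
    int registeredWith = -1;
    while (!stop) {
        int current = controllerGeneration.load();
        if (current != registeredWith) {    // controller (or another key role) changed
            sendRegisterWorker(current);    // re-register the worker interface
            registeredWith = current;
        }
        std::this_thread::sleep_for(std::chrono::milliseconds(50));
    }
}

int main() {
    std::atomic<bool> stop{false};
    std::thread t(registrationClient, std::ref(stop));
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
    controllerGeneration++;                 // simulate a cluster controller change
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
    stop = true;
    t.join();
    return 0;
}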
@@ -424,7 +426,7 @@ ACTOR Future<Void> registrationClient(
Future<RegisterWorkerReply> registrationReply = ccInterface->get().present() ? brokenPromiseToNever( ccInterface->get().get().registerWorker.getReply(request) ) : Never();
choose {
when ( RegisterWorkerReply reply = wait( registrationReply )) {
processClass = reply.processClass;
processClass = reply.processClass;
asyncPriorityInfo->set( reply.priorityInfo );
if(!reply.storageCache.present()) {
@@ -434,7 +436,7 @@ ACTOR Future<Void> registrationClient(
StorageServerInterface recruited;
recruited.locality = locality;
recruited.initEndpoints();
std::map<std::string, std::string> details;
startRole( Role::STORAGE_CACHE, recruited.id(), interf.id(), details );
@@ -1127,7 +1129,7 @@ ACTOR Future<Void> workerServer(
Future<Void> backupProcess = backupWorker(recruited, req, dbInfo);
errorForwarders.add(forwardError(errors, Role::BACKUP, recruited.id(), backupProcess));
TraceEvent("Backup_InitRequest", req.reqId).detail("BackupId", recruited.id());
TraceEvent("BackupInitRequest", req.reqId).detail("BackupId", recruited.id());
InitializeBackupReply reply(recruited, req.backupEpoch);
req.reply.send(reply);
}


@@ -34,6 +34,9 @@
#include "fdbclient/ManagementAPI.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
//#define SevCCheckInfo SevVerbose
#define SevCCheckInfo SevInfo
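These two lines make the checker's trace severity a one-line compile-time switch: SevInfo while debugging this change, SevVerbose to quiet the per-process chatter again. A rough standalone sketch of the same toggle, with a plain printf logger standing in for the real TraceEvent machinery:

// Generic sketch of the compile-time severity toggle; not the real Trace API.
#include <cstdio>

enum Severity { SevVerbose = 5, SevInfo = 10, SevWarn = 20, SevError = 40 };

// Flip this one line (as the diff does) to enable or silence the checker's traces.
//#define SevCCheckInfo SevVerbose
#define SevCCheckInfo SevInfo

constexpr Severity kMinLoggedSeverity = SevInfo;

void trace(Severity sev, const char* event) {
    if (sev >= kMinLoggedSeverity) std::printf("[%d] %s\n", sev, event);
}

int main() {
    trace(SevCCheckInfo, "StatefulProcess");   // logged at SevInfo, dropped at SevVerbose
    return 0;
}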
struct ConsistencyCheckWorkload : TestWorkload
{
//Whether or not we should perform checks that will only pass if the database is in a quiescent state
@@ -292,6 +295,7 @@ struct ConsistencyCheckWorkload : TestWorkload
}
wait(::success(self->checkForStorage(cx, configuration, self)));
wait(::success(self->waitForUnreliableExtraStoreReboot(cx, self)));
wait(::success(self->checkForExtraDataStores(cx, self)));
//Check that each machine is operating as its desired class
@@ -1167,9 +1171,89 @@ struct ConsistencyCheckWorkload : TestWorkload
return true;
}
ACTOR Future<bool> waitForUnreliableExtraStoreReboot(Database cx, ConsistencyCheckWorkload *self) {
state int waitCount = 0;
loop {
state std::vector<WorkerDetails> workers = wait( getWorkers( self->dbInfo ) );
state std::vector<StorageServerInterface> storageServers = wait( getStorageServers( cx ) );
state std::vector<WorkerInterface> coordWorkers = wait(getCoordWorkers(cx, self->dbInfo));
auto& db = self->dbInfo->get();
state std::vector<TLogInterface> logs = db.logSystemConfig.allPresentLogs();
state std::vector<WorkerDetails>::iterator itr;
state bool foundExtraDataStore = false;
state std::vector<struct ProcessInfo*> protectedProcessesToKill;
state std::map<NetworkAddress, std::set<UID>> statefulProcesses;
for (const auto& ss : storageServers) {
statefulProcesses[ss.address()].insert(ss.id());
// Add both addresses so that we will not mistakenly trigger ConsistencyCheck_ExtraDataStore
if (ss.secondaryAddress().present()) {
statefulProcesses[ss.secondaryAddress().get()].insert(ss.id());
}
TraceEvent(SevCCheckInfo, "StatefulProcess").detail("StorageServer", ss.id()).detail("PrimaryAddress", ss.address().toString()).detail("SecondaryAddress", ss.secondaryAddress().present() ? ss.secondaryAddress().get().toString() : "Unset");
}
for (const auto& log : logs) {
statefulProcesses[log.address()].insert(log.id());
if (log.secondaryAddress().present()) {
statefulProcesses[log.secondaryAddress().get()].insert(log.id());
}
TraceEvent(SevCCheckInfo, "StatefulProcess").detail("Log", log.id()).detail("PrimaryAddress", log.address().toString()).detail("SecondaryAddress", log.secondaryAddress().present() ? log.secondaryAddress().get().toString() : "Unset");
}
// Coordinators are also stateful processes
for (const auto& cWorker : coordWorkers) {
statefulProcesses[cWorker.address()].insert(cWorker.id());
if (cWorker.secondaryAddress().present()) {
statefulProcesses[cWorker.secondaryAddress().get()].insert(cWorker.id());
}
TraceEvent(SevCCheckInfo, "StatefulProcess").detail("Coordinator", cWorker.id()).detail("PrimaryAddress", cWorker.address().toString()).detail("SecondaryAddress", cWorker.secondaryAddress().present() ? cWorker.secondaryAddress().get().toString() : "Unset");
}
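// At this point statefulProcesses maps every known stateful address (primary and secondary)
// to the set of store UIDs that legitimately live there; any disk store a worker reports
// that is not in this set is an extra store.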
// Wait for any extra store on an unreliable process (i.e., one that is rebooting) to finish rebooting.
// Otherwise, the test would try to kill the process holding the extra store, which may be protected;
// killing a protected process causes the test to fail.
state bool protectedExtraStoreUnreliable = false;
for(itr = workers.begin(); itr != workers.end(); ++itr) {
ErrorOr<Standalone<VectorRef<UID>>> stores = wait(itr->interf.diskStoreRequest.getReplyUnlessFailedFor(DiskStoreRequest(false), 2, 0));
if(stores.isError()) {
TraceEvent("ConsistencyCheck_GetDataStoreFailure").error(stores.getError()).detail("Address", itr->interf.address());
self->testFailure("Failed to get data stores");
return false;
}
TraceEvent(SevCCheckInfo, "CheckProtectedExtraStoreRebootProgress").detail("Worker", itr->interf.id().toString()).detail("PrimaryAddress", itr->interf.address().toString()).detail("SecondaryAddress", itr->interf.secondaryAddress().present() ? itr->interf.secondaryAddress().get().toString() : "Unset");
for (const auto& id : stores.get()) {
if (statefulProcesses[itr->interf.address()].count(id)) {
continue;
} else {
if(g_network->isSimulated()) {
auto p = g_simulator.getProcessByAddress(itr->interf.address());
if (g_simulator.protectedAddresses.count(p->address) && !p->isReliable()) {
protectedExtraStoreUnreliable = true;
break;
}
}
}
}
if (protectedExtraStoreUnreliable) {
break;
}
}
if (protectedExtraStoreUnreliable) {
wait(delay(10.0));
waitCount++;
} else {
// No protected extra store is still rebooting; safe to proceed to checkForExtraDataStores
break;
}
if (waitCount > 20) {
TraceEvent(SevError, "ProtectedExtraStoreUnreliableStuck").detail("ExpectedBehavior", "Extra store should be cleaned up after process reboot");
break;
}
}
return waitCount <= 20;
}
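The actor above is a bounded poll: re-scan the cluster, and if an extra store still sits on a protected process that is unreliable (rebooting), wait 10 seconds and try again, giving up with a SevError trace after 20 rounds. The same pattern as a small standalone sketch, with std::this_thread standing in for flow's delay():

// Standalone sketch of the bounded polling loop; plain C++ rather than a flow actor.
#include <chrono>
#include <functional>
#include <thread>

// Poll `stillBlocked` every `interval`; give up after `maxWaits` rounds.
// Returns true if the blocker cleared in time, false if we hit the cap.
bool waitUntilClear(const std::function<bool()>& stillBlocked,
                    std::chrono::seconds interval, int maxWaits) {
    int waitCount = 0;
    while (stillBlocked()) {
        if (waitCount >= maxWaits) return false;  // stuck: report failure
        std::this_thread::sleep_for(interval);
        ++waitCount;
    }
    return true;
}

int main() {
    int reboots = 3;
    // Pretend the "unreliable extra store" finishes rebooting after three polls.
    bool ok = waitUntilClear([&] { return reboots-- > 0; },
                             std::chrono::seconds(0), 20);
    return ok ? 0 : 1;
}

Capping the retries at 20 rounds of 10 seconds keeps a permanently stuck reboot from hanging the whole consistency check.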
ACTOR Future<bool> checkForExtraDataStores(Database cx, ConsistencyCheckWorkload *self) {
state vector<WorkerDetails> workers = wait( getWorkers( self->dbInfo ) );
state vector<StorageServerInterface> storageServers = wait( getStorageServers( cx ) );
state std::vector<WorkerDetails> workers = wait( getWorkers( self->dbInfo ) );
state std::vector<StorageServerInterface> storageServers = wait( getStorageServers( cx ) );
state std::vector<WorkerInterface> coordWorkers = wait(getCoordWorkers(cx, self->dbInfo));
auto& db = self->dbInfo->get();
state std::vector<TLogInterface> logs = db.logSystemConfig.allPresentLogs();
@@ -1185,19 +1269,22 @@ struct ConsistencyCheckWorkload : TestWorkload
if (ss.secondaryAddress().present()) {
statefulProcesses[ss.secondaryAddress().get()].insert(ss.id());
}
TraceEvent(SevCCheckInfo, "StatefulProcess").detail("StorageServer", ss.id()).detail("PrimaryAddress", ss.address().toString()).detail("SecondaryAddress", ss.secondaryAddress().present() ? ss.secondaryAddress().get().toString() : "Unset");
}
for (const auto& log : logs) {
statefulProcesses[log.address()].insert(log.id());
if (log.secondaryAddress().present()) {
statefulProcesses[log.secondaryAddress().get()].insert(log.id());
}
TraceEvent(SevCCheckInfo, "StatefulProcess").detail("Log", log.id()).detail("PrimaryAddress", log.address().toString()).detail("SecondaryAddress", log.secondaryAddress().present() ? log.secondaryAddress().get().toString() : "Unset");
}
// Coordinators are also stateful processes
for (const auto& cWorker: coordWorkers) {
for (const auto& cWorker : coordWorkers) {
statefulProcesses[cWorker.address()].insert(cWorker.id());
if (cWorker.secondaryAddress().present()) {
statefulProcesses[cWorker.secondaryAddress().get()].insert(cWorker.id());
}
TraceEvent(SevCCheckInfo, "StatefulProcess").detail("Coordinator", cWorker.id()).detail("PrimaryAddress", cWorker.address().toString()).detail("SecondaryAddress", cWorker.secondaryAddress().present() ? cWorker.secondaryAddress().get().toString() : "Unset");
}
for(itr = workers.begin(); itr != workers.end(); ++itr) {
@@ -1208,6 +1295,7 @@ struct ConsistencyCheckWorkload : TestWorkload
return false;
}
TraceEvent(SevCCheckInfo, "ConsistencyCheck_ExtraDataStore").detail("Worker", itr->interf.id().toString()).detail("PrimaryAddress", itr->interf.address().toString()).detail("SecondaryAddress", itr->interf.secondaryAddress().present() ? itr->interf.secondaryAddress().get().toString() : "Unset");
for (const auto& id : stores.get()) {
// if (statefulProcesses[itr->interf.address()].count(id)) {
// continue;
@@ -1227,10 +1315,6 @@ struct ConsistencyCheckWorkload : TestWorkload
.detail("Reliable", p->isReliable())
.detail("ReliableInfo", p->getReliableInfo())
.detail("KillOrRebootProcess", p->address);
// if (g_simulator.protectedAddresses.count(machine->address)) {
// protectedProcessesToKill.push_back(p);
// continue;
// }
if(p->isReliable()) {
g_simulator.rebootProcess(p, ISimulator::RebootProcess);
} else {