Updated help message and changed existing workload to use the mark-as-failed feature

Jon Fu 2019-08-07 16:44:52 -07:00
parent e65800c0df
commit 807b02551e
7 changed files with 34 additions and 26 deletions

View File

@@ -477,11 +477,14 @@ void initHelp() {
"change cluster coordinators or description",
"If 'auto' is specified, coordinator addresses will be choosen automatically to support the configured redundancy level. (If the current set of coordinators are healthy and already support the redundancy level, nothing will be changed.)\n\nOtherwise, sets the coordinators to the list of IP:port pairs specified by <ADDRESS>+. An fdbserver process must be running on each of the specified addresses.\n\ne.g. coordinators 10.0.0.1:4000 10.0.0.2:4000 10.0.0.3:4000\n\nIf 'description=desc' is specified then the description field in the cluster\nfile is changed to desc, which must match [A-Za-z0-9_]+.");
helpMap["exclude"] =
CommandHelp("exclude [no_wait] <ADDRESS>*", "exclude servers from the database",
CommandHelp("exclude [FORCE] [permanent] [no_wait] <ADDRESS>*", "exclude servers from the database",
"If no addresses are specified, lists the set of excluded servers.\n\nFor each IP address or "
"IP:port pair in <ADDRESS>*, adds the address to the set of excluded servers then waits until all "
"database state has been safely moved away from the specified servers. If 'no_wait' is set, the "
"command returns \nimmediately without checking if the exclusions have completed successfully.");
"command returns \nimmediately without checking if the exclusions have completed successfully.\n"
"If 'FORCE' is set, the command does not perform safety checks before excluding.\n"
"If 'permanent' is set, the tLog queue is dropped pre-emptively before waiting\n"
"for data movement to finish and the server cannot be included again.");
helpMap["include"] = CommandHelp(
"include all|<ADDRESS>*",
"permit previously-excluded servers to rejoin the database",
@@ -2132,7 +2135,7 @@ ACTOR Future<bool> exclude( Database db, std::vector<StringRef> tokens, Referenc
wait( makeInterruptable(excludeServers(db,addresses,permanentlyFailed)) );
if (waitForAllExcluded) {
if (waitForAllExcluded && !permanentlyFailed) {
printf("Waiting for state to be removed from all excluded servers. This may take a while.\n");
printf("(Interrupting this wait with CTRL+C will not cancel the data movement.)\n");
}
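A minimal standalone sketch of the flag handling implied by the two hunks above (this is not the fdbcli parser; the variable names force, permanentlyFailed, and waitForAllExcluded simply mirror the diff, and everything else is assumed):

#include <iostream>
#include <string>
#include <vector>

// Sketch only: separating the new exclude flags from addresses.
int main() {
    std::vector<std::string> tokens = { "FORCE", "permanent", "10.0.0.1:4000" };
    bool force = false, permanentlyFailed = false, waitForAllExcluded = true;
    std::vector<std::string> addresses;
    for (const auto& t : tokens) {
        if (t == "FORCE") force = true;                      // skip safety checks
        else if (t == "permanent") permanentlyFailed = true; // mark-as-failed path
        else if (t == "no_wait") waitForAllExcluded = false; // return immediately
        else addresses.push_back(t);
    }
    (void)force; // would gate the pre-exclusion safety checks in the real command
    // Mirrors the second hunk: the "waiting" message is printed only when we
    // wait and the servers are not being marked as permanently failed.
    if (waitForAllExcluded && !permanentlyFailed)
        std::cout << "Waiting for state to be removed from all excluded servers.\n";
    return 0;
}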

View File

@@ -1335,6 +1335,7 @@ ACTOR static Future<vector<AddressExclusion>> getExcludedServers( Transaction* t
if (a.isValid())
exclusions.push_back( a );
}
uniquify(exclusions);
return exclusions;
}
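uniquify() deduplicates the exclusions collected from the key range before they are returned; an illustrative stand-in (not the actual flow utility) is simply sort-then-unique:

#include <algorithm>
#include <cassert>
#include <vector>

// Illustrative stand-in for uniquify(): sort, then drop adjacent duplicates
// so each excluded address appears only once.
template <class T>
void uniquify(std::vector<T>& v) {
    std::sort(v.begin(), v.end());
    v.erase(std::unique(v.begin(), v.end()), v.end());
}

int main() {
    std::vector<int> exclusions = { 3, 1, 3, 2, 1 };
    uniquify(exclusions);
    assert((exclusions == std::vector<int>{ 1, 2, 3 }));
    return 0;
}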

View File

@@ -931,8 +931,8 @@ ACTOR Future<Void> removeKeysFromFailedServer(Database cx, UID serverID, MoveKey
wait( checkMoveKeysLock(&tr, lock) );
TraceEvent("RemoveKeysFromFailedServerLocked").detail("ServerID", serverID).detail("Version", tr.getReadVersion().get());
// Get all values of keyServers and remove serverID from every occurrence
// Very inefficient going over every entry in keyServers
// No shortcut because keyServers and serverKeys are not guaranteed same shard boundaries (change this?)
// FIXME: Very inefficient going over every entry in keyServers, concern in violating 5s transaction limit
// No shortcut because keyServers and serverKeys are not guaranteed same shard boundaries
state Standalone<RangeResultRef> keyServers = wait( krmGetRanges(&tr, keyServersPrefix, allKeys) );
state KeyValueRef* it = keyServers.begin();
for ( ; it != keyServers.end() ; ++it) {
@@ -940,34 +940,27 @@ ACTOR Future<Void> removeKeysFromFailedServer(Database cx, UID serverID, MoveKey
state vector<UID> dest;
decodeKeyServersValue(it->value, src, dest);
TraceEvent("FailedServerCheckpoint1.0")
.detail("Key", keyServersKey(it->key));
for (UID i : src) {
TraceEvent("FailedServerCheckpoint1.0Src")
.detail("UID", i);
}
for (UID i : dest) {
TraceEvent("FailedServerCheckpoint1.0Dest")
.detail("UID", i);
}
.detail("Key", keyServersKey(it->key))
.detail("SrcSize", src.size())
.detail("Src", describe(src))
.detail("DestSize", dest.size())
.detail("Dest", describe(dest));
// // The failed server is not present
// if (std::find(src.begin(), src.end(), serverID) == src.end() && std::find(dest.begin(), dest.end(), serverID) == dest.end() ) {
// continue;
// }
// Update the vectors to remove failed server then set the value again
// Dest is usually empty, but keep this in case there is parallel data movement (?)
// Dest is usually empty, but keep this in case there is parallel data movement
src.erase(std::remove(src.begin(), src.end(), serverID), src.end());
dest.erase(std::remove(dest.begin(), dest.end(), serverID), dest.end());
TraceEvent("FailedServerCheckpoint1.1")
.detail("Key", keyServersKey(it->key));
for (UID i : src) {
TraceEvent("FailedServerCheckpoint1.1Src")
.detail("UID", i);
}
for (UID i : dest) {
TraceEvent("FailedServerCheckpoint1.1Dest")
.detail("UID", i);
}
.detail("Key", keyServersKey(it->key))
.detail("SrcSize", src.size())
.detail("Src", describe(src))
.detail("DestSize", dest.size())
.detail("Dest", describe(dest));;
tr.set(keyServersKey(it->key), keyServersValue(src, dest));
}
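The rewritten trace events summarize src and dest with describe() instead of emitting one event per UID, and the failed server is stripped from both vectors with the standard erase-remove idiom. A self-contained illustration (strings stand in for UIDs, and this describe() is only similar in spirit to the flow helper):

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Joins elements for logging, loosely modeled on flow's describe().
static std::string describe(const std::vector<std::string>& v) {
    std::string out;
    for (const auto& s : v) out += (out.empty() ? "" : ",") + s;
    return out;
}

int main() {
    std::vector<std::string> src = { "srvA", "failedSrv", "srvB" };
    std::vector<std::string> dest = { "failedSrv" };
    const std::string serverID = "failedSrv";

    // Erase-remove idiom: remove() shifts the kept elements forward,
    // erase() trims the leftover tail.
    src.erase(std::remove(src.begin(), src.end(), serverID), src.end());
    dest.erase(std::remove(dest.begin(), dest.end(), serverID), dest.end());

    std::cout << "SrcSize=" << src.size() << " Src=" << describe(src) << "\n";
    std::cout << "DestSize=" << dest.size() << " Dest=" << describe(dest) << "\n";
    return 0;
}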

View File

@@ -401,11 +401,16 @@ struct RemoveServersSafelyWorkload : TestWorkload {
state std::vector<ISimulator::ProcessInfo*> killProcArray;
state std::vector<AddressExclusion> toKillArray;
state std::vector<AddressExclusion> toKillMarkFailedArray;
std::copy(toKill.begin(), toKill.end(), std::back_inserter(toKillArray));
killProcArray = self->getProcesses(toKill);
if (toKillArray.size()) {
toKillMarkFailedArray.push_back(deterministicRandom()->randomChoice(toKillArray));
}
TraceEvent("RemoveAndKill", functionId).detail("Step", "Activate Server Exclusion").detail("KillAddrs", toKill.size()).detail("KillProcs", killProcArray.size()).detail("MissingProcs", toKill.size()!=killProcArray.size()).detail("ToKill", describe(toKill)).detail("Addresses", describe(toKillArray)).detail("ClusterAvailable", g_simulator.isAvailable());
wait( excludeServers( cx, toKillMarkFailedArray, true ) );
wait( excludeServers( cx, toKillArray ) );
// We need to skip at least the quorum change if there's nothing to kill, because there might not be enough servers left
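In the workload, exactly one of the processes slated for killing is also excluded through the mark-as-failed path, while the full set goes through an ordinary exclusion. A simplified sketch of that selection (a seeded std::mt19937 stands in for the simulator's deterministicRandom(), and the addresses are made up):

#include <iostream>
#include <random>
#include <string>
#include <vector>

int main() {
    // Made-up addresses; the workload derives these from the kill set.
    std::vector<std::string> toKillArray = { "10.0.0.1:4000", "10.0.0.2:4000", "10.0.0.3:4000" };
    std::vector<std::string> toKillMarkFailedArray;

    // Pick exactly one victim for the mark-as-failed exclusion, as
    // randomChoice() does in the diff above.
    std::mt19937 rng(42);
    if (!toKillArray.empty()) {
        std::uniform_int_distribution<size_t> pick(0, toKillArray.size() - 1);
        toKillMarkFailedArray.push_back(toKillArray[pick(rng)]);
    }

    // The workload then calls excludeServers twice:
    //   excludeServers(cx, toKillMarkFailedArray, true)  // mark as failed
    //   excludeServers(cx, toKillArray)                  // ordinary exclusion
    std::cout << "Mark-as-failed victim: " << toKillMarkFailedArray.front() << "\n";
    return 0;
}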

View File

@@ -29,4 +29,6 @@ testTitle=SwizzledCausalConsistencyTest
minDelay=0
maxDelay=100
kill1Timeout=30
kill2Timeout=6000
kill2Timeout=6000
minimumReplication=2

View File

@@ -39,4 +39,6 @@ testTitle=DDBalance_test
minDelay=0
maxDelay=100
kill1Timeout=30
kill2Timeout=6000
kill2Timeout=6000
minimumReplication=2

View File

@@ -43,3 +43,5 @@ testTitle=DDBalance_test
testName=Status
testDuration=30.0
minimumReplication=2