Updated help message and changed existing workload to use the mark-as-failed feature

Jon Fu 2019-08-07 16:44:52 -07:00
parent e65800c0df
commit 807b02551e
7 changed files with 34 additions and 26 deletions

View File

@@ -477,11 +477,14 @@ void initHelp() {
"change cluster coordinators or description",
"If 'auto' is specified, coordinator addresses will be choosen automatically to support the configured redundancy level. (If the current set of coordinators are healthy and already support the redundancy level, nothing will be changed.)\n\nOtherwise, sets the coordinators to the list of IP:port pairs specified by <ADDRESS>+. An fdbserver process must be running on each of the specified addresses.\n\ne.g. coordinators 10.0.0.1:4000 10.0.0.2:4000 10.0.0.3:4000\n\nIf 'description=desc' is specified then the description field in the cluster\nfile is changed to desc, which must match [A-Za-z0-9_]+.");
helpMap["exclude"] =
CommandHelp("exclude [no_wait] <ADDRESS>*", "exclude servers from the database",
CommandHelp("exclude [FORCE] [permanent] [no_wait] <ADDRESS>*", "exclude servers from the database",
"If no addresses are specified, lists the set of excluded servers.\n\nFor each IP address or "
"IP:port pair in <ADDRESS>*, adds the address to the set of excluded servers then waits until all "
"database state has been safely moved away from the specified servers. If 'no_wait' is set, the "
"command returns \nimmediately without checking if the exclusions have completed successfully.");
"command returns \nimmediately without checking if the exclusions have completed successfully.\n"
"If 'FORCE' is set, the command does not perform safety checks before excluding.\n"
"If 'permanent' is set, the tLog queue is dropped pre-emptively before waiting\n"
"for data movement to finish and the server cannot be included again.");
helpMap["include"] = CommandHelp(
"include all|<ADDRESS>*",
"permit previously-excluded servers to rejoin the database",
@@ -2132,7 +2135,7 @@ ACTOR Future<bool> exclude( Database db, std::vector<StringRef> tokens, Referenc
wait( makeInterruptable(excludeServers(db,addresses,permanentlyFailed)) );
if (waitForAllExcluded) {
if (waitForAllExcluded && !permanentlyFailed) {
printf("Waiting for state to be removed from all excluded servers. This may take a while.\n");
printf("(Interrupting this wait with CTRL+C will not cancel the data movement.)\n");
}
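A minimal standalone sketch of the flag handling implied by the two hunks above (this is not the fdbcli parser; the variable names force, permanentlyFailed, and waitForAllExcluded simply mirror the diff, and everything else is assumed):

#include <iostream>
#include <string>
#include <vector>

// Sketch only: separating the new exclude flags from addresses.
int main() {
    std::vector<std::string> tokens = { "FORCE", "permanent", "10.0.0.1:4000" };
    bool force = false, permanentlyFailed = false, waitForAllExcluded = true;
    std::vector<std::string> addresses;
    for (const auto& t : tokens) {
        if (t == "FORCE") force = true;                      // skip safety checks
        else if (t == "permanent") permanentlyFailed = true; // mark-as-failed path
        else if (t == "no_wait") waitForAllExcluded = false; // return immediately
        else addresses.push_back(t);
    }
    (void)force; // would gate the pre-exclusion safety checks in the real command
    // Mirrors the second hunk: the "waiting" message is printed only when we
    // wait and the servers are not being marked as permanently failed.
    if (waitForAllExcluded && !permanentlyFailed)
        std::cout << "Waiting for state to be removed from all excluded servers.\n";
    return 0;
}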

View File

@@ -1335,6 +1335,7 @@ ACTOR static Future<vector<AddressExclusion>> getExcludedServers( Transaction* t
if (a.isValid())
exclusions.push_back( a );
}
uniquify(exclusions);
return exclusions;
}
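uniquify() deduplicates the exclusions collected from the key range before they are returned; an illustrative stand-in (not the actual flow utility) is simply sort-then-unique:

#include <algorithm>
#include <cassert>
#include <vector>

// Illustrative stand-in for uniquify(): sort, then drop adjacent duplicates
// so each excluded address appears only once.
template <class T>
void uniquify(std::vector<T>& v) {
    std::sort(v.begin(), v.end());
    v.erase(std::unique(v.begin(), v.end()), v.end());
}

int main() {
    std::vector<int> exclusions = { 3, 1, 3, 2, 1 };
    uniquify(exclusions);
    assert((exclusions == std::vector<int>{ 1, 2, 3 }));
    return 0;
}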

View File

@@ -931,8 +931,8 @@ ACTOR Future<Void> removeKeysFromFailedServer(Database cx, UID serverID, MoveKey
wait( checkMoveKeysLock(&tr, lock) );
TraceEvent("RemoveKeysFromFailedServerLocked").detail("ServerID", serverID).detail("Version", tr.getReadVersion().get());
// Get all values of keyServers and remove serverID from every occurrence
// Very inefficient going over every entry in keyServers
// No shortcut because keyServers and serverKeys are not guaranteed same shard boundaries (change this?)
// FIXME: Very inefficient going over every entry in keyServers, concern in violating 5s transaction limit
// No shortcut because keyServers and serverKeys are not guaranteed same shard boundaries
state Standalone<RangeResultRef> keyServers = wait( krmGetRanges(&tr, keyServersPrefix, allKeys) );
state KeyValueRef* it = keyServers.begin();
for ( ; it != keyServers.end() ; ++it) {
@@ -940,34 +940,27 @@ ACTOR Future<Void> removeKeysFromFailedServer(Database cx, UID serverID, MoveKey
state vector<UID> dest;
decodeKeyServersValue(it->value, src, dest);
TraceEvent("FailedServerCheckpoint1.0")
.detail("Key", keyServersKey(it->key));
for (UID i : src) {
TraceEvent("FailedServerCheckpoint1.0Src")
.detail("UID", i);
}
for (UID i : dest) {
TraceEvent("FailedServerCheckpoint1.0Dest")
.detail("UID", i);
}
.detail("Key", keyServersKey(it->key))
.detail("SrcSize", src.size())
.detail("Src", describe(src))
.detail("DestSize", dest.size())
.detail("Dest", describe(dest));
// // The failed server is not present
// if (std::find(src.begin(), src.end(), serverID) == src.end() && std::find(dest.begin(), dest.end(), serverID) == dest.end() ) {
// continue;
// }
// Update the vectors to remove failed server then set the value again
// Dest is usually empty, but keep this in case there is parallel data movement (?)
// Dest is usually empty, but keep this in case there is parallel data movement
src.erase(std::remove(src.begin(), src.end(), serverID), src.end());
dest.erase(std::remove(dest.begin(), dest.end(), serverID), dest.end());
TraceEvent("FailedServerCheckpoint1.1")
.detail("Key", keyServersKey(it->key));
for (UID i : src) {
TraceEvent("FailedServerCheckpoint1.1Src")
.detail("UID", i);
}
for (UID i : dest) {
TraceEvent("FailedServerCheckpoint1.1Dest")
.detail("UID", i);
}
.detail("Key", keyServersKey(it->key))
.detail("SrcSize", src.size())
.detail("Src", describe(src))
.detail("DestSize", dest.size())
.detail("Dest", describe(dest));;
tr.set(keyServersKey(it->key), keyServersValue(src, dest));
}
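The rewritten trace events summarize src and dest with describe() instead of emitting one event per UID, and the failed server is stripped from both vectors with the standard erase-remove idiom. A self-contained illustration (strings stand in for UIDs, and this describe() is only similar in spirit to the flow helper):

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Joins elements for logging, loosely modeled on flow's describe().
static std::string describe(const std::vector<std::string>& v) {
    std::string out;
    for (const auto& s : v) out += (out.empty() ? "" : ",") + s;
    return out;
}

int main() {
    std::vector<std::string> src = { "srvA", "failedSrv", "srvB" };
    std::vector<std::string> dest = { "failedSrv" };
    const std::string serverID = "failedSrv";

    // Erase-remove idiom: remove() shifts the kept elements forward,
    // erase() trims the leftover tail.
    src.erase(std::remove(src.begin(), src.end(), serverID), src.end());
    dest.erase(std::remove(dest.begin(), dest.end(), serverID), dest.end());

    std::cout << "SrcSize=" << src.size() << " Src=" << describe(src) << "\n";
    std::cout << "DestSize=" << dest.size() << " Dest=" << describe(dest) << "\n";
    return 0;
}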

View File

@@ -401,11 +401,16 @@ struct RemoveServersSafelyWorkload : TestWorkload {
state std::vector<ISimulator::ProcessInfo*> killProcArray;
state std::vector<AddressExclusion> toKillArray;
state std::vector<AddressExclusion> toKillMarkFailedArray;
std::copy(toKill.begin(), toKill.end(), std::back_inserter(toKillArray));
killProcArray = self->getProcesses(toKill);
if (toKillArray.size()) {
toKillMarkFailedArray.push_back(deterministicRandom()->randomChoice(toKillArray));
}
TraceEvent("RemoveAndKill", functionId).detail("Step", "Activate Server Exclusion").detail("KillAddrs", toKill.size()).detail("KillProcs", killProcArray.size()).detail("MissingProcs", toKill.size()!=killProcArray.size()).detail("ToKill", describe(toKill)).detail("Addresses", describe(toKillArray)).detail("ClusterAvailable", g_simulator.isAvailable());
wait( excludeServers( cx, toKillMarkFailedArray, true ) );
wait( excludeServers( cx, toKillArray ) );
// We need to skip at least the quorum change if there's nothing to kill, because there might not be enough servers left
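In the workload, exactly one of the processes slated for killing is also excluded through the mark-as-failed path, while the full set goes through an ordinary exclusion. A simplified sketch of that selection (a seeded std::mt19937 stands in for the simulator's deterministicRandom(), and the addresses are made up):

#include <iostream>
#include <random>
#include <string>
#include <vector>

int main() {
    // Made-up addresses; the workload derives these from the kill set.
    std::vector<std::string> toKillArray = { "10.0.0.1:4000", "10.0.0.2:4000", "10.0.0.3:4000" };
    std::vector<std::string> toKillMarkFailedArray;

    // Pick exactly one victim for the mark-as-failed exclusion, as
    // randomChoice() does in the diff above.
    std::mt19937 rng(42);
    if (!toKillArray.empty()) {
        std::uniform_int_distribution<size_t> pick(0, toKillArray.size() - 1);
        toKillMarkFailedArray.push_back(toKillArray[pick(rng)]);
    }

    // The workload then calls excludeServers twice:
    //   excludeServers(cx, toKillMarkFailedArray, true)  // mark as failed
    //   excludeServers(cx, toKillArray)                  // ordinary exclusion
    std::cout << "Mark-as-failed victim: " << toKillMarkFailedArray.front() << "\n";
    return 0;
}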

View File

@@ -29,4 +29,6 @@ testTitle=SwizzledCausalConsistencyTest
minDelay=0
maxDelay=100
kill1Timeout=30
kill2Timeout=6000
kill2Timeout=6000
minimumReplication=2

View File

@@ -39,4 +39,6 @@ testTitle=DDBalance_test
minDelay=0
maxDelay=100
kill1Timeout=30
kill2Timeout=6000
kill2Timeout=6000
minimumReplication=2

View File

@@ -43,3 +43,5 @@ testTitle=DDBalance_test
testName=Status
testDuration=30.0
minimumReplication=2