Added more trace lines and a timeout to the safety check in the test workload

This commit is contained in:
Jon Fu 2019-08-21 11:52:44 -07:00
parent a757e66327
commit 3666c0c776
4 changed files with 18 additions and 2 deletions

View File

@ -3449,6 +3449,7 @@ ACTOR Future<bool> checkSafeExclusions(Database cx, vector<AddressExclusion> exc
}
}
}
TraceEvent("ExclusionSafetyCheckCoordinators");
state ClientCoordinators coordinatorList(cx->getConnectionFile());
state vector<Future<Optional<LeaderInfo>>> leaderServers;
for (int i = 0; i < coordinatorList.clientLeaderServers.size(); i++) {
@ -3474,7 +3475,7 @@ ACTOR Future<bool> checkSafeExclusions(Database cx, vector<AddressExclusion> exc
}
int faultTolerance = (leaderServers.size() - 1) / 2 - coordinatorsUnavailable;
bool coordinatorCheck = (attemptCoordinatorExclude <= faultTolerance);
TraceEvent("ExclusionSafetyCheck")
TraceEvent("ExclusionSafetyCheckFinish")
.detail("CoordinatorListSize", leaderServers.size())
.detail("NumExclusions", exclusions.size())
.detail("FaultTolerance", faultTolerance)

View File

@ -4315,6 +4315,7 @@ ACTOR Future<Void> ddExclusionSafetyCheck(DistributorExclusionSafetyCheckRequest
break;
}
}
TraceEvent("DDExclusionSafetyCheckFinish");
req.reply.send(safe);
return Void();
}

View File

@ -1552,6 +1552,7 @@ ACTOR Future<Void> proxyCheckSafeExclusion(Reference<AsyncVar<ServerDBInfo>> db,
throw e;
}
}
TraceEvent("SafetyCheckMasterProxyFinish");
req.reply.send(safe);
return Void();
}

View File

@ -408,13 +408,26 @@ struct RemoveServersSafelyWorkload : TestWorkload {
killProcArray = self->getProcesses(toKill);
if (safeKillSet) {
loop {
state bool safe = false;
auto failSet = random_subset(toKillArray, deterministicRandom()->randomInt(1, toKillArray.size() / 2 + 2));
toKillMarkFailedArray.resize(failSet.size());
std::copy(failSet.begin(), failSet.end(), toKillMarkFailedArray.begin());
TraceEvent("RemoveAndKill", functionId)
.detail("Step", "SafetyCheck")
.detail("Exclusions", describe(toKillMarkFailedArray));
bool safe = wait(checkSafeExclusions(cx, toKillMarkFailedArray));
loop {
choose {
when(bool _safe = wait(checkSafeExclusions(cx, toKillMarkFailedArray))) {
safe = _safe;
break;
}
when(wait(delay(5.0))) {
TraceEvent("RemoveAndKill", functionId)
.detail("Step", "SafetyCheckTimedOut")
.detail("Exclusions", describe(toKillMarkFailedArray));
}
}
}
if (safe) break;
}
}