Added a wait to check whether the master proxies have changed, and put in a few more trace events

This commit is contained in:
Jon Fu 2019-08-20 14:43:48 -07:00
parent b9c73632e7
commit 04d514c483
4 changed files with 24 additions and 7 deletions

View File

@ -3434,9 +3434,21 @@ ACTOR Future<Void> snapCreate(Database cx, StringRef snapCmd, UID snapUID) {
}
ACTOR Future<bool> checkSafeExclusions(Database cx, vector<AddressExclusion> exclusions) {
ExclusionSafetyCheckRequest req(exclusions);
state bool ddCheck =
wait(loadBalance(cx->getMasterProxies(false), &MasterProxyInterface::exclusionSafetyCheckReq, req, cx->taskID));
TraceEvent("ExclusionSafetyCheckBegin")
.detail("NumExclusion", exclusions.size())
.detail("Exclusions", describe(exclusions));
state ExclusionSafetyCheckRequest req(exclusions);
state bool ddCheck;
loop {
choose {
when(wait(cx->onMasterProxiesChanged())) {}
when(bool _ddCheck = wait(loadBalance(cx->getMasterProxies(false),
&MasterProxyInterface::exclusionSafetyCheckReq, req, cx->taskID))) {
ddCheck = _ddCheck;
break;
}
}
}
state ClientCoordinators coordinatorList(cx->getConnectionFile());
state vector<Future<Optional<LeaderInfo>>> leaderServers;
for (int i = 0; i < coordinatorList.clientLeaderServers.size(); i++) {
@ -3461,12 +3473,14 @@ ACTOR Future<bool> checkSafeExclusions(Database cx, vector<AddressExclusion> exc
}
}
int faultTolerance = (leaderServers.size() - 1) / 2 - coordinatorsUnavailable;
bool coordinatorCheck = (attemptCoordinatorExclude <= faultTolerance);
TraceEvent("ExclusionSafetyCheck")
.detail("CoordinatorListSize", leaderServers.size())
.detail("NumExclusions", exclusions.size())
.detail("FaultTolerance", faultTolerance)
.detail("AttemptCoordinatorExclude", attemptCoordinatorExclude);
.detail("AttemptCoordinatorExclude", attemptCoordinatorExclude)
.detail("CoordinatorCheck", coordinatorCheck)
.detail("DataDistributorCheck", ddCheck);
bool coordinatorCheck = (attemptCoordinatorExclude <= faultTolerance);
return (ddCheck && coordinatorCheck);
}

View File

@ -4284,7 +4284,9 @@ ACTOR Future<Void> ddSnapCreate(DistributorSnapRequest snapReq, Reference<AsyncV
ACTOR Future<Void> ddExclusionSafetyCheck(DistributorExclusionSafetyCheckRequest req, Reference<DDTeamCollection> tc,
Database cx) {
TraceEvent("DDExclusionSafetyCheckBegin");
if (!tc.isValid()) {
TraceEvent("DDExclusionSafetyCheckTeamCollectionInvalid");
req.reply.send(false);
return Void();
}

View File

@ -1531,6 +1531,7 @@ ACTOR Future<Void> proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co
}
ACTOR Future<Void> proxyCheckSafeExclusion(Reference<AsyncVar<ServerDBInfo>> db, ExclusionSafetyCheckRequest req) {
TraceEvent("SafetyCheckMasterProxyBegin");
if (!db->get().distributor.present()) {
TraceEvent(SevWarnAlways, "DataDistributorNotPresent").detail("Operation", "ExclusionSafetyCheck");
req.reply.send(false);

View File

@ -412,8 +412,8 @@ struct RemoveServersSafelyWorkload : TestWorkload {
toKillMarkFailedArray.resize(failSet.size());
std::copy(failSet.begin(), failSet.end(), toKillMarkFailedArray.begin());
TraceEvent("RemoveAndKill", functionId)
.detail("Step", "Safety Check")
.detail("Exclusions", describe(toKillMarkFailedArray));
.detail("Step", "SafetyCheck")
.detail("Exclusions", describe(toKillMarkFailedArray));
bool safe = wait(checkSafeExclusions(cx, toKillMarkFailedArray));
if (safe) break;
}