Merge pull request #1956 from etschannen/master

Minor bug fixes
This commit is contained in:
Evan Tschannen 2019-08-05 17:00:51 -07:00 committed by GitHub
commit 1c730baedc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 19 additions and 15 deletions

View File

@ -2723,6 +2723,11 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
}
}
// Failed server should not trigger DD if SS failures are set to be ignored
if (!badTeam && self->healthyZone.get().present() && (self->healthyZone.get().get() == ignoreSSFailuresZoneString)) {
ASSERT_WE_THINK(serversLeft == self->configuration.storageTeamSize);
}
if( !self->initialFailureReactionDelay.isReady() ) {
change.push_back( self->initialFailureReactionDelay );
}
@ -2880,11 +2885,6 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
rs.keys = shards[i];
rs.priority = maxPriority;
// Failed server should not trigger DD if SS failures are set to be ignored
if (rs.priority == PRIORITY_TEAM_UNHEALTHY) {
ASSERT_WE_THINK(!(!badTeam && self->healthyZone.get().present() &&
(self->healthyZone.get().get() == ignoreSSFailuresZoneString)));
}
self->output.send(rs);
if(deterministicRandom()->random01() < 0.01) {
TraceEvent("SendRelocateToDDQx100", self->distributorId)

View File

@ -35,16 +35,21 @@ static std::set<int> const& normalAttritionErrors() {
return s;
}
ACTOR Future<Void> resetHealthyZoneAfter(Database cx, double duration) {
ACTOR Future<bool> ignoreSSFailuresForDuration(Database cx, double duration) {
// The duration passed to setHealthyZone below (0) doesn't matter since this won't time out
TraceEvent("IgnoreSSFailureStart");
bool _ = wait(setHealthyZone(cx, ignoreSSFailuresZoneString, 0));
TraceEvent("IgnoreSSFailureWait");
wait(delay(duration));
TraceEvent("IgnoreSSFailureClear");
state Transaction tr(cx);
state Future<Void> delayF = delay(duration);
loop {
try {
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
wait(delayF);
tr.clear(healthyZoneKey);
wait(tr.commit());
return Void();
TraceEvent("IgnoreSSFailureComplete");
return true;
} catch (Error& e) {
wait(tr.onError(e));
}
@ -61,6 +66,7 @@ struct MachineAttritionWorkload : TestWorkload {
bool replacement;
bool waitForVersion;
bool allowFaultInjection;
Future<bool> ignoreSSFailures;
// This is set in setup from the list of workers when the cluster is started
std::vector<LocalityData> machines;
@ -78,6 +84,7 @@ struct MachineAttritionWorkload : TestWorkload {
replacement = getOption( options, LiteralStringRef("replacement"), reboot && deterministicRandom()->random01() < 0.5 );
waitForVersion = getOption( options, LiteralStringRef("waitForVersion"), false );
allowFaultInjection = getOption( options, LiteralStringRef("allowFaultInjection"), true );
ignoreSSFailures = true;
}
static vector<ISimulator::ProcessInfo*> getServers() {
@ -121,7 +128,7 @@ struct MachineAttritionWorkload : TestWorkload {
throw please_reboot();
return Void();
}
virtual Future<bool> check( Database const& cx ) { return true; }
virtual Future<bool> check( Database const& cx ) { return ignoreSSFailures; }
virtual void getMetrics( vector<PerfMetric>& m ) {
}
@ -185,7 +192,6 @@ struct MachineAttritionWorkload : TestWorkload {
// decide on a machine to kill
state LocalityData targetMachine = self->machines.back();
state Future<Void> resetHealthyZone = Future<Void>(Void());
if(BUGGIFY_WITH_PROB(0.01)) {
TEST(true); //Marked a zone for maintenance before killing it
bool _ =
@ -193,9 +199,7 @@ struct MachineAttritionWorkload : TestWorkload {
// }
} else if (BUGGIFY_WITH_PROB(0.005)) {
TEST(true); // Disable DD for all storage server failures
bool _ = wait(setHealthyZone(cx, ignoreSSFailuresZoneString,
0)); // duration doesn't matter since this won't timeout
resetHealthyZone = resetHealthyZoneAfter(cx, deterministicRandom()->random01() * 5);
self->ignoreSSFailures = ignoreSSFailuresForDuration(cx, deterministicRandom()->random01() * 5);
}
TraceEvent("Assassination").detail("TargetMachine", targetMachine.toString())
@ -226,7 +230,7 @@ struct MachineAttritionWorkload : TestWorkload {
if(!self->replacement)
self->machines.pop_back();
wait(delay(meanDelay - delayBeforeKill) && resetHealthyZone);
wait(delay(meanDelay - delayBeforeKill) && success(self->ignoreSSFailures));
delayBeforeKill = deterministicRandom()->random01() * meanDelay;
TraceEvent("WorkerKillAfterMeanDelay").detail("DelayBeforeKill", delayBeforeKill);