commit
1c730baedc
|
@ -2723,6 +2723,11 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
|
|||
}
|
||||
}
|
||||
|
||||
// Failed server should not trigger DD if SS failures are set to be ignored
|
||||
if (!badTeam && self->healthyZone.get().present() && (self->healthyZone.get().get() == ignoreSSFailuresZoneString)) {
|
||||
ASSERT_WE_THINK(serversLeft == self->configuration.storageTeamSize);
|
||||
}
|
||||
|
||||
if( !self->initialFailureReactionDelay.isReady() ) {
|
||||
change.push_back( self->initialFailureReactionDelay );
|
||||
}
|
||||
|
@ -2880,11 +2885,6 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
|
|||
rs.keys = shards[i];
|
||||
rs.priority = maxPriority;
|
||||
|
||||
// Failed server should not trigger DD if SS failures are set to be ignored
|
||||
if (rs.priority == PRIORITY_TEAM_UNHEALTHY) {
|
||||
ASSERT_WE_THINK(!(!badTeam && self->healthyZone.get().present() &&
|
||||
(self->healthyZone.get().get() == ignoreSSFailuresZoneString)));
|
||||
}
|
||||
self->output.send(rs);
|
||||
if(deterministicRandom()->random01() < 0.01) {
|
||||
TraceEvent("SendRelocateToDDQx100", self->distributorId)
|
||||
|
|
|
@ -35,16 +35,21 @@ static std::set<int> const& normalAttritionErrors() {
|
|||
return s;
|
||||
}
|
||||
|
||||
ACTOR Future<Void> resetHealthyZoneAfter(Database cx, double duration) {
|
||||
ACTOR Future<bool> ignoreSSFailuresForDuration(Database cx, double duration) {
|
||||
// duration doesn't matter since this won't timeout
|
||||
TraceEvent("IgnoreSSFailureStart");
|
||||
bool _ = wait(setHealthyZone(cx, ignoreSSFailuresZoneString, 0));
|
||||
TraceEvent("IgnoreSSFailureWait");
|
||||
wait(delay(duration));
|
||||
TraceEvent("IgnoreSSFailureClear");
|
||||
state Transaction tr(cx);
|
||||
state Future<Void> delayF = delay(duration);
|
||||
loop {
|
||||
try {
|
||||
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
|
||||
wait(delayF);
|
||||
tr.clear(healthyZoneKey);
|
||||
wait(tr.commit());
|
||||
return Void();
|
||||
TraceEvent("IgnoreSSFailureComplete");
|
||||
return true;
|
||||
} catch (Error& e) {
|
||||
wait(tr.onError(e));
|
||||
}
|
||||
|
@ -61,6 +66,7 @@ struct MachineAttritionWorkload : TestWorkload {
|
|||
bool replacement;
|
||||
bool waitForVersion;
|
||||
bool allowFaultInjection;
|
||||
Future<bool> ignoreSSFailures;
|
||||
|
||||
// This is set in setup from the list of workers when the cluster is started
|
||||
std::vector<LocalityData> machines;
|
||||
|
@ -78,6 +84,7 @@ struct MachineAttritionWorkload : TestWorkload {
|
|||
replacement = getOption( options, LiteralStringRef("replacement"), reboot && deterministicRandom()->random01() < 0.5 );
|
||||
waitForVersion = getOption( options, LiteralStringRef("waitForVersion"), false );
|
||||
allowFaultInjection = getOption( options, LiteralStringRef("allowFaultInjection"), true );
|
||||
ignoreSSFailures = true;
|
||||
}
|
||||
|
||||
static vector<ISimulator::ProcessInfo*> getServers() {
|
||||
|
@ -121,7 +128,7 @@ struct MachineAttritionWorkload : TestWorkload {
|
|||
throw please_reboot();
|
||||
return Void();
|
||||
}
|
||||
virtual Future<bool> check( Database const& cx ) { return true; }
|
||||
virtual Future<bool> check( Database const& cx ) { return ignoreSSFailures; }
|
||||
virtual void getMetrics( vector<PerfMetric>& m ) {
|
||||
}
|
||||
|
||||
|
@ -185,7 +192,6 @@ struct MachineAttritionWorkload : TestWorkload {
|
|||
|
||||
// decide on a machine to kill
|
||||
state LocalityData targetMachine = self->machines.back();
|
||||
state Future<Void> resetHealthyZone = Future<Void>(Void());
|
||||
if(BUGGIFY_WITH_PROB(0.01)) {
|
||||
TEST(true); //Marked a zone for maintenance before killing it
|
||||
bool _ =
|
||||
|
@ -193,9 +199,7 @@ struct MachineAttritionWorkload : TestWorkload {
|
|||
// }
|
||||
} else if (BUGGIFY_WITH_PROB(0.005)) {
|
||||
TEST(true); // Disable DD for all storage server failures
|
||||
bool _ = wait(setHealthyZone(cx, ignoreSSFailuresZoneString,
|
||||
0)); // duration doesn't matter since this won't timeout
|
||||
resetHealthyZone = resetHealthyZoneAfter(cx, deterministicRandom()->random01() * 5);
|
||||
self->ignoreSSFailures = ignoreSSFailuresForDuration(cx, deterministicRandom()->random01() * 5);
|
||||
}
|
||||
|
||||
TraceEvent("Assassination").detail("TargetMachine", targetMachine.toString())
|
||||
|
@ -226,7 +230,7 @@ struct MachineAttritionWorkload : TestWorkload {
|
|||
if(!self->replacement)
|
||||
self->machines.pop_back();
|
||||
|
||||
wait(delay(meanDelay - delayBeforeKill) && resetHealthyZone);
|
||||
wait(delay(meanDelay - delayBeforeKill) && success(self->ignoreSSFailures));
|
||||
|
||||
delayBeforeKill = deterministicRandom()->random01() * meanDelay;
|
||||
TraceEvent("WorkerKillAfterMeanDelay").detail("DelayBeforeKill", delayBeforeKill);
|
||||
|
|
Loading…
Reference in New Issue