commit
1c730baedc
|
@ -2723,6 +2723,11 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Failed server should not trigger DD if SS failures are set to be ignored
|
||||||
|
if (!badTeam && self->healthyZone.get().present() && (self->healthyZone.get().get() == ignoreSSFailuresZoneString)) {
|
||||||
|
ASSERT_WE_THINK(serversLeft == self->configuration.storageTeamSize);
|
||||||
|
}
|
||||||
|
|
||||||
if( !self->initialFailureReactionDelay.isReady() ) {
|
if( !self->initialFailureReactionDelay.isReady() ) {
|
||||||
change.push_back( self->initialFailureReactionDelay );
|
change.push_back( self->initialFailureReactionDelay );
|
||||||
}
|
}
|
||||||
|
@ -2880,11 +2885,6 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
|
||||||
rs.keys = shards[i];
|
rs.keys = shards[i];
|
||||||
rs.priority = maxPriority;
|
rs.priority = maxPriority;
|
||||||
|
|
||||||
// Failed server should not trigger DD if SS failures are set to be ignored
|
|
||||||
if (rs.priority == PRIORITY_TEAM_UNHEALTHY) {
|
|
||||||
ASSERT_WE_THINK(!(!badTeam && self->healthyZone.get().present() &&
|
|
||||||
(self->healthyZone.get().get() == ignoreSSFailuresZoneString)));
|
|
||||||
}
|
|
||||||
self->output.send(rs);
|
self->output.send(rs);
|
||||||
if(deterministicRandom()->random01() < 0.01) {
|
if(deterministicRandom()->random01() < 0.01) {
|
||||||
TraceEvent("SendRelocateToDDQx100", self->distributorId)
|
TraceEvent("SendRelocateToDDQx100", self->distributorId)
|
||||||
|
|
|
@ -35,16 +35,21 @@ static std::set<int> const& normalAttritionErrors() {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
ACTOR Future<Void> resetHealthyZoneAfter(Database cx, double duration) {
|
ACTOR Future<bool> ignoreSSFailuresForDuration(Database cx, double duration) {
|
||||||
|
// duration doesn't matter since this won't timeout
|
||||||
|
TraceEvent("IgnoreSSFailureStart");
|
||||||
|
bool _ = wait(setHealthyZone(cx, ignoreSSFailuresZoneString, 0));
|
||||||
|
TraceEvent("IgnoreSSFailureWait");
|
||||||
|
wait(delay(duration));
|
||||||
|
TraceEvent("IgnoreSSFailureClear");
|
||||||
state Transaction tr(cx);
|
state Transaction tr(cx);
|
||||||
state Future<Void> delayF = delay(duration);
|
|
||||||
loop {
|
loop {
|
||||||
try {
|
try {
|
||||||
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
|
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
|
||||||
wait(delayF);
|
|
||||||
tr.clear(healthyZoneKey);
|
tr.clear(healthyZoneKey);
|
||||||
wait(tr.commit());
|
wait(tr.commit());
|
||||||
return Void();
|
TraceEvent("IgnoreSSFailureComplete");
|
||||||
|
return true;
|
||||||
} catch (Error& e) {
|
} catch (Error& e) {
|
||||||
wait(tr.onError(e));
|
wait(tr.onError(e));
|
||||||
}
|
}
|
||||||
|
@ -61,6 +66,7 @@ struct MachineAttritionWorkload : TestWorkload {
|
||||||
bool replacement;
|
bool replacement;
|
||||||
bool waitForVersion;
|
bool waitForVersion;
|
||||||
bool allowFaultInjection;
|
bool allowFaultInjection;
|
||||||
|
Future<bool> ignoreSSFailures;
|
||||||
|
|
||||||
// This is set in setup from the list of workers when the cluster is started
|
// This is set in setup from the list of workers when the cluster is started
|
||||||
std::vector<LocalityData> machines;
|
std::vector<LocalityData> machines;
|
||||||
|
@ -78,6 +84,7 @@ struct MachineAttritionWorkload : TestWorkload {
|
||||||
replacement = getOption( options, LiteralStringRef("replacement"), reboot && deterministicRandom()->random01() < 0.5 );
|
replacement = getOption( options, LiteralStringRef("replacement"), reboot && deterministicRandom()->random01() < 0.5 );
|
||||||
waitForVersion = getOption( options, LiteralStringRef("waitForVersion"), false );
|
waitForVersion = getOption( options, LiteralStringRef("waitForVersion"), false );
|
||||||
allowFaultInjection = getOption( options, LiteralStringRef("allowFaultInjection"), true );
|
allowFaultInjection = getOption( options, LiteralStringRef("allowFaultInjection"), true );
|
||||||
|
ignoreSSFailures = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static vector<ISimulator::ProcessInfo*> getServers() {
|
static vector<ISimulator::ProcessInfo*> getServers() {
|
||||||
|
@ -121,7 +128,7 @@ struct MachineAttritionWorkload : TestWorkload {
|
||||||
throw please_reboot();
|
throw please_reboot();
|
||||||
return Void();
|
return Void();
|
||||||
}
|
}
|
||||||
virtual Future<bool> check( Database const& cx ) { return true; }
|
virtual Future<bool> check( Database const& cx ) { return ignoreSSFailures; }
|
||||||
virtual void getMetrics( vector<PerfMetric>& m ) {
|
virtual void getMetrics( vector<PerfMetric>& m ) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -185,7 +192,6 @@ struct MachineAttritionWorkload : TestWorkload {
|
||||||
|
|
||||||
// decide on a machine to kill
|
// decide on a machine to kill
|
||||||
state LocalityData targetMachine = self->machines.back();
|
state LocalityData targetMachine = self->machines.back();
|
||||||
state Future<Void> resetHealthyZone = Future<Void>(Void());
|
|
||||||
if(BUGGIFY_WITH_PROB(0.01)) {
|
if(BUGGIFY_WITH_PROB(0.01)) {
|
||||||
TEST(true); //Marked a zone for maintenance before killing it
|
TEST(true); //Marked a zone for maintenance before killing it
|
||||||
bool _ =
|
bool _ =
|
||||||
|
@ -193,9 +199,7 @@ struct MachineAttritionWorkload : TestWorkload {
|
||||||
// }
|
// }
|
||||||
} else if (BUGGIFY_WITH_PROB(0.005)) {
|
} else if (BUGGIFY_WITH_PROB(0.005)) {
|
||||||
TEST(true); // Disable DD for all storage server failures
|
TEST(true); // Disable DD for all storage server failures
|
||||||
bool _ = wait(setHealthyZone(cx, ignoreSSFailuresZoneString,
|
self->ignoreSSFailures = ignoreSSFailuresForDuration(cx, deterministicRandom()->random01() * 5);
|
||||||
0)); // duration doesn't matter since this won't timeout
|
|
||||||
resetHealthyZone = resetHealthyZoneAfter(cx, deterministicRandom()->random01() * 5);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
TraceEvent("Assassination").detail("TargetMachine", targetMachine.toString())
|
TraceEvent("Assassination").detail("TargetMachine", targetMachine.toString())
|
||||||
|
@ -226,7 +230,7 @@ struct MachineAttritionWorkload : TestWorkload {
|
||||||
if(!self->replacement)
|
if(!self->replacement)
|
||||||
self->machines.pop_back();
|
self->machines.pop_back();
|
||||||
|
|
||||||
wait(delay(meanDelay - delayBeforeKill) && resetHealthyZone);
|
wait(delay(meanDelay - delayBeforeKill) && success(self->ignoreSSFailures));
|
||||||
|
|
||||||
delayBeforeKill = deterministicRandom()->random01() * meanDelay;
|
delayBeforeKill = deterministicRandom()->random01() * meanDelay;
|
||||||
TraceEvent("WorkerKillAfterMeanDelay").detail("DelayBeforeKill", delayBeforeKill);
|
TraceEvent("WorkerKillAfterMeanDelay").detail("DelayBeforeKill", delayBeforeKill);
|
||||||
|
|
Loading…
Reference in New Issue