Merge pull request #1956 from etschannen/master

Minor bug fixes
2019-08-05 17:00:51 -07:00 · 2019-08-05 17:00:51 -07:00 · 1c730baedc
parent 9923472636 5dc4c80d44
commit 1c730baedc
2 changed files with 19 additions and 15 deletions
--- a/fdbserver/DataDistribution.actor.cpp
+++ b/fdbserver/DataDistribution.actor.cpp
@ -2723,6 +2723,11 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
 				}
 			}

+			// Failed server should not trigger DD if SS failures are set to be ignored
+			if (!badTeam && self->healthyZone.get().present() && (self->healthyZone.get().get() == ignoreSSFailuresZoneString)) {
+				ASSERT_WE_THINK(serversLeft == self->configuration.storageTeamSize);
+			}
+
 			if( !self->initialFailureReactionDelay.isReady() ) {
 				change.push_back( self->initialFailureReactionDelay );
 			}
@ -2880,11 +2885,6 @@ ACTOR Future<Void> teamTracker(DDTeamCollection* self, Reference<TCTeamInfo> tea
 						rs.keys = shards[i];
 						rs.priority = maxPriority;

-						// Failed server should not trigger DD if SS failures are set to be ignored
-						if (rs.priority == PRIORITY_TEAM_UNHEALTHY) {
-							ASSERT_WE_THINK(!(!badTeam && self->healthyZone.get().present() &&
-							                  (self->healthyZone.get().get() == ignoreSSFailuresZoneString)));
-						}
 						self->output.send(rs);
 						if(deterministicRandom()->random01() < 0.01) {
 							TraceEvent("SendRelocateToDDQx100", self->distributorId)
--- a/fdbserver/workloads/MachineAttrition.actor.cpp
+++ b/fdbserver/workloads/MachineAttrition.actor.cpp
@ -35,16 +35,21 @@ static std::set<int> const& normalAttritionErrors() {
 	return s;
 }

-ACTOR Future<Void> resetHealthyZoneAfter(Database cx, double duration) {
+ACTOR Future<bool> ignoreSSFailuresForDuration(Database cx, double duration) {
+	// duration doesn't matter since this won't timeout
+	TraceEvent("IgnoreSSFailureStart");
+	bool _ = wait(setHealthyZone(cx, ignoreSSFailuresZoneString, 0)); 
+	TraceEvent("IgnoreSSFailureWait");
+	wait(delay(duration));
+	TraceEvent("IgnoreSSFailureClear");
 	state Transaction tr(cx);
-	state Future<Void> delayF = delay(duration);
 	loop {
 		try {
 			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
-			wait(delayF);
 			tr.clear(healthyZoneKey);
 			wait(tr.commit());
-			return Void();
+			TraceEvent("IgnoreSSFailureComplete");
+			return true;
 		} catch (Error& e) {
 			wait(tr.onError(e));
 		}
@ -61,6 +66,7 @@ struct MachineAttritionWorkload : TestWorkload {
 	bool replacement;
 	bool waitForVersion;
 	bool allowFaultInjection;
+	Future<bool> ignoreSSFailures;

 	// This is set in setup from the list of workers when the cluster is started
 	std::vector<LocalityData> machines;
@ -78,6 +84,7 @@ struct MachineAttritionWorkload : TestWorkload {
 		replacement = getOption( options, LiteralStringRef("replacement"), reboot && deterministicRandom()->random01() < 0.5 );
 		waitForVersion = getOption( options, LiteralStringRef("waitForVersion"), false );
 		allowFaultInjection = getOption( options, LiteralStringRef("allowFaultInjection"), true );
+		ignoreSSFailures = true;
 	}

 	static vector<ISimulator::ProcessInfo*> getServers() {
@ -121,7 +128,7 @@ struct MachineAttritionWorkload : TestWorkload {
 			throw please_reboot();
 		return Void();
 	}
-	virtual Future<bool> check( Database const& cx ) { return true; }
+	virtual Future<bool> check( Database const& cx ) { return ignoreSSFailures; }
 	virtual void getMetrics( vector<PerfMetric>& m ) {
 	}

@ -185,7 +192,6 @@ struct MachineAttritionWorkload : TestWorkload {

 				// decide on a machine to kill
 				state LocalityData targetMachine = self->machines.back();
-				state Future<Void> resetHealthyZone = Future<Void>(Void());
 				if(BUGGIFY_WITH_PROB(0.01)) {
 					TEST(true); //Marked a zone for maintenance before killing it
 					bool _ =
@ -193,9 +199,7 @@ struct MachineAttritionWorkload : TestWorkload {
 					// }
 				} else if (BUGGIFY_WITH_PROB(0.005)) {
 					TEST(true); // Disable DD for all storage server failures
-					bool _ = wait(setHealthyZone(cx, ignoreSSFailuresZoneString,
-					                             0)); // duration doesn't matter since this won't timeout
-					resetHealthyZone = resetHealthyZoneAfter(cx, deterministicRandom()->random01() * 5);
+					self->ignoreSSFailures = ignoreSSFailuresForDuration(cx, deterministicRandom()->random01() * 5);
 				}

 				TraceEvent("Assassination").detail("TargetMachine", targetMachine.toString())
@ -226,7 +230,7 @@ struct MachineAttritionWorkload : TestWorkload {
 				if(!self->replacement)
 					self->machines.pop_back();

-				wait(delay(meanDelay - delayBeforeKill) && resetHealthyZone);
+				wait(delay(meanDelay - delayBeforeKill) && success(self->ignoreSSFailures));

 				delayBeforeKill = deterministicRandom()->random01() * meanDelay;
 				TraceEvent("WorkerKillAfterMeanDelay").detail("DelayBeforeKill", delayBeforeKill);