fix: do not force a recovery if the master was already in the other region (and therefore already recovered)

fix: reboot the remaining DC, because any storage server rejoins that were rolled back will cause that server to be unusable
2018-09-28 12:10:04 -07:00 · 2018-09-28 12:10:04 -07:00 · b560b94ebc
parent 77e2fb787e
commit b560b94ebc
1 changed files with 22 additions and 14 deletions
--- a/fdbserver/workloads/KillRegion.actor.cpp
+++ b/fdbserver/workloads/KillRegion.actor.cpp
@ -79,25 +79,33 @@ struct KillRegionWorkload : TestWorkload {
 		g_simulator.killDataCenter( LiteralStringRef("2"), ISimulator::RebootAndDelete, true );
 		g_simulator.killDataCenter( LiteralStringRef("4"), ISimulator::RebootAndDelete, true );

+		state bool first = true;
 		loop {
+			state Transaction tr(cx);
+			loop {
+				try {
+					tr.addWriteConflictRange(KeyRangeRef(LiteralStringRef(""), LiteralStringRef("\x00")));
+					choose {
+						when( Void _ = wait(tr.commit()) ) {
+							TraceEvent("ForceRecovery_Complete");
+							g_simulator.killDataCenter( LiteralStringRef("1"), ISimulator::Reboot );
+							g_simulator.killDataCenter( LiteralStringRef("3"), ISimulator::Reboot );
+							g_simulator.killDataCenter( LiteralStringRef("5"), ISimulator::Reboot );
+							return Void();
+						}
+						when( Void _ = wait(delay(first ? 5.0 : 120.0)) ) {
+							break;
+						}
+					}
+				} catch( Error &e ) {
+					Void _ = wait( tr.onError(e) );
+				}
+			}
 			TraceEvent("ForceRecovery_Begin");
 			Void _ = wait( forceRecovery(cx->cluster->getConnectionFile()) );
+			first = false;
 			TraceEvent("ForceRecovery_Attempted");
-			state Transaction tr(cx);
-			try {
-				choose {
-					when( Version _ = wait(tr.getReadVersion()) ) {
-						TraceEvent("ForceRecovery_Complete");
-						break;
-					}
-					when( Void _ = wait(delay(120.0)) ) {}
-				}
-			} catch( Error &e ) {
-				Void _ = wait( tr.onError(e) );
-			}
 		}
-
-		return Void();
 	}
 };