Remove explicit degraded peer recovery since this may be a false positive

2022-06-21 08:24:22 -07:00 · 2022-06-21 08:24:22 -07:00 · 3cb587edfb
parent c2752dc773
commit 3cb587edfb
2 changed files with 19 additions and 27 deletions
--- a/fdbserver/ClusterController.actor.cpp
+++ b/fdbserver/ClusterController.actor.cpp
@ -2779,11 +2779,14 @@ TEST_CASE("/fdbserver/clustercontroller/updateWorkerHealth") {
 		ASSERT(health.degradedPeers.find(badPeer1) != health.degradedPeers.end());
 		ASSERT_EQ(health.degradedPeers[badPeer1].startTime, health.degradedPeers[badPeer1].lastRefreshTime);
 		ASSERT(health.degradedPeers.find(badPeer2) != health.degradedPeers.end());
+		ASSERT_EQ(health.degradedPeers[badPeer2].startTime, health.degradedPeers[badPeer2].lastRefreshTime);
 	}

 	// Create a `UpdateWorkerHealthRequest` with two bad peers, one from the previous test and a new one.
 	// The one from the previous test should have lastRefreshTime updated.
-	// The other one from the previous test not included in this test should be removed.
+	// The other one from the previous test not included in this test should not be removed.
+	state double previousStartTime;
+	state double previousRefreshTime;
 	{
 		// Advance time so that now() is guaranteed to return a larger value than before.
 		wait(delay(0.001));
@ -2794,20 +2797,31 @@ TEST_CASE("/fdbserver/clustercontroller/updateWorkerHealth") {
 		data.updateWorkerHealth(req);
 		ASSERT(data.workerHealth.find(workerAddress) != data.workerHealth.end());
 		auto& health = data.workerHealth[workerAddress];
-		ASSERT_EQ(health.degradedPeers.size(), 2);
+		ASSERT_EQ(health.degradedPeers.size(), 3);
 		ASSERT(health.degradedPeers.find(badPeer1) != health.degradedPeers.end());
 		ASSERT_LT(health.degradedPeers[badPeer1].startTime, health.degradedPeers[badPeer1].lastRefreshTime);
-		ASSERT(health.degradedPeers.find(badPeer2) == health.degradedPeers.end());
+		ASSERT(health.degradedPeers.find(badPeer2) != health.degradedPeers.end());
+		ASSERT_EQ(health.degradedPeers[badPeer2].startTime, health.degradedPeers[badPeer2].lastRefreshTime);
+		ASSERT_EQ(health.degradedPeers[badPeer2].startTime, health.degradedPeers[badPeer1].startTime);
 		ASSERT(health.degradedPeers.find(badPeer3) != health.degradedPeers.end());
+		ASSERT_EQ(health.degradedPeers[badPeer3].startTime, health.degradedPeers[badPeer3].lastRefreshTime);
+		previousStartTime = health.degradedPeers[badPeer3].startTime;
+		previousRefreshTime = health.degradedPeers[badPeer3].lastRefreshTime;
 	}

-	// Create a `UpdateWorkerHealthRequest` with empty `degradedPeers`, which should remove the worker from
+	// Create a `UpdateWorkerHealthRequest` with empty `degradedPeers`, which should not remove the worker from
 	// `workerHealth`.
 	{
+		wait(delay(0.001));
 		UpdateWorkerHealthRequest req;
 		req.address = workerAddress;
 		data.updateWorkerHealth(req);
-		ASSERT(data.workerHealth.find(workerAddress) == data.workerHealth.end());
+		ASSERT(data.workerHealth.find(workerAddress) != data.workerHealth.end());
+		auto& health = data.workerHealth[workerAddress];
+		ASSERT_EQ(health.degradedPeers.size(), 3);
+		ASSERT(health.degradedPeers.find(badPeer3) != health.degradedPeers.end());
+		ASSERT_EQ(health.degradedPeers[badPeer3].startTime, previousStartTime);
+		ASSERT_EQ(health.degradedPeers[badPeer3].lastRefreshTime, previousRefreshTime);
 	}

 	return Void();
--- a/fdbserver/ClusterController.actor.h
+++ b/fdbserver/ClusterController.actor.h
@ -2907,13 +2907,6 @@ public:
 		    .detail("WorkerAddress", req.address)
 		    .detail("DegradedPeers", degradedPeersString);

-		// `req.degradedPeers` contains the latest peer performance view from the worker. Clear the worker if the
-		// requested worker doesn't see any degraded peers.
-		if (req.degradedPeers.empty()) {
-			workerHealth.erase(req.address);
-			return;
-		}
-
 		double currentTime = now();

 		// Current `workerHealth` doesn't have any information about the incoming worker. Add the worker into
@ -2931,21 +2924,6 @@ public:

 		auto& health = workerHealth[req.address];

-		// First, remove any degraded peers recorded in the `workerHealth`, but aren't in the incoming request. These
-		// machines network performance should have recovered.
-		std::unordered_set<NetworkAddress> recoveredPeers;
-		for (const auto& [peer, times] : health.degradedPeers) {
-			recoveredPeers.insert(peer);
-		}
-		for (const auto& peer : req.degradedPeers) {
-			if (recoveredPeers.find(peer) != recoveredPeers.end()) {
-				recoveredPeers.erase(peer);
-			}
-		}
-		for (const auto& peer : recoveredPeers) {
-			health.degradedPeers.erase(peer);
-		}
-
 		// Update the worker's degradedPeers.
 		for (const auto& peer : req.degradedPeers) {
 			auto it = health.degradedPeers.find(peer);