Remove explicit degraded peer recovery since this may be false positive

This commit is contained in:
Zhe Wu 2022-06-21 08:24:22 -07:00
parent c2752dc773
commit 3cb587edfb
2 changed files with 19 additions and 27 deletions

View File

@ -2779,11 +2779,14 @@ TEST_CASE("/fdbserver/clustercontroller/updateWorkerHealth") {
ASSERT(health.degradedPeers.find(badPeer1) != health.degradedPeers.end());
ASSERT_EQ(health.degradedPeers[badPeer1].startTime, health.degradedPeers[badPeer1].lastRefreshTime);
ASSERT(health.degradedPeers.find(badPeer2) != health.degradedPeers.end());
ASSERT_EQ(health.degradedPeers[badPeer2].startTime, health.degradedPeers[badPeer2].lastRefreshTime);
}
// Create a `UpdateWorkerHealthRequest` with two bad peers, one from the previous test and a new one.
// The one from the previous test should have lastRefreshTime updated.
// The other one from the previous test not included in this test should be removed.
// The other one from the previous test not included in this test should not be removed.
state double previousStartTime;
state double previousRefreshTime;
{
// Make the time to move so that now() guarantees to return a larger value than before.
wait(delay(0.001));
@ -2794,20 +2797,31 @@ TEST_CASE("/fdbserver/clustercontroller/updateWorkerHealth") {
data.updateWorkerHealth(req);
ASSERT(data.workerHealth.find(workerAddress) != data.workerHealth.end());
auto& health = data.workerHealth[workerAddress];
ASSERT_EQ(health.degradedPeers.size(), 2);
ASSERT_EQ(health.degradedPeers.size(), 3);
ASSERT(health.degradedPeers.find(badPeer1) != health.degradedPeers.end());
ASSERT_LT(health.degradedPeers[badPeer1].startTime, health.degradedPeers[badPeer1].lastRefreshTime);
ASSERT(health.degradedPeers.find(badPeer2) == health.degradedPeers.end());
ASSERT(health.degradedPeers.find(badPeer2) != health.degradedPeers.end());
ASSERT_EQ(health.degradedPeers[badPeer2].startTime, health.degradedPeers[badPeer2].lastRefreshTime);
ASSERT_EQ(health.degradedPeers[badPeer2].startTime, health.degradedPeers[badPeer1].startTime);
ASSERT(health.degradedPeers.find(badPeer3) != health.degradedPeers.end());
ASSERT_EQ(health.degradedPeers[badPeer3].startTime, health.degradedPeers[badPeer3].lastRefreshTime);
previousStartTime = health.degradedPeers[badPeer3].startTime;
previousRefreshTime = health.degradedPeers[badPeer3].lastRefreshTime;
}
// Create a `UpdateWorkerHealthRequest` with empty `degradedPeers`, which should remove the worker from
// Create a `UpdateWorkerHealthRequest` with empty `degradedPeers`, which should not remove the worker from
// `workerHealth`.
{
wait(delay(0.001));
UpdateWorkerHealthRequest req;
req.address = workerAddress;
data.updateWorkerHealth(req);
ASSERT(data.workerHealth.find(workerAddress) == data.workerHealth.end());
ASSERT(data.workerHealth.find(workerAddress) != data.workerHealth.end());
auto& health = data.workerHealth[workerAddress];
ASSERT_EQ(health.degradedPeers.size(), 3);
ASSERT(health.degradedPeers.find(badPeer3) != health.degradedPeers.end());
ASSERT_EQ(health.degradedPeers[badPeer3].startTime, previousStartTime);
ASSERT_EQ(health.degradedPeers[badPeer3].lastRefreshTime, previousRefreshTime);
}
return Void();

View File

@ -2907,13 +2907,6 @@ public:
.detail("WorkerAddress", req.address)
.detail("DegradedPeers", degradedPeersString);
// `req.degradedPeers` contains the latest peer performance view from the worker. Clear the worker if the
// requested worker doesn't see any degraded peers.
if (req.degradedPeers.empty()) {
workerHealth.erase(req.address);
return;
}
double currentTime = now();
// Current `workerHealth` doesn't have any information about the incoming worker. Add the worker into
@ -2931,21 +2924,6 @@ public:
auto& health = workerHealth[req.address];
// First, remove any degraded peers recorded in the `workerHealth`, but aren't in the incoming request. These
// machines network performance should have recovered.
std::unordered_set<NetworkAddress> recoveredPeers;
for (const auto& [peer, times] : health.degradedPeers) {
recoveredPeers.insert(peer);
}
for (const auto& peer : req.degradedPeers) {
if (recoveredPeers.find(peer) != recoveredPeers.end()) {
recoveredPeers.erase(peer);
}
}
for (const auto& peer : recoveredPeers) {
health.degradedPeers.erase(peer);
}
// Update the worker's degradedPeers.
for (const auto& peer : req.degradedPeers) {
auto it = health.degradedPeers.find(peer);