Cluster controller removes recovered peers in gray failure
This commit is contained in:
parent
d1bafa7514
commit
b89a50a37d
|
@ -3252,6 +3252,18 @@ TEST_CASE("/fdbserver/clustercontroller/updateWorkerHealth") {
|
|||
ASSERT_EQ(health.disconnectedPeers[badPeer3].lastRefreshTime, previousRefreshTime);
|
||||
}
|
||||
|
||||
// Make badPeer1 a recovered peer, and CC should remove it from `workerAddress` bad peers.
|
||||
{
|
||||
wait(delay(0.001));
|
||||
UpdateWorkerHealthRequest req;
|
||||
req.address = workerAddress;
|
||||
req.recoveredPeers.push_back(badPeer1);
|
||||
data.updateWorkerHealth(req);
|
||||
auto& health = data.workerHealth[workerAddress];
|
||||
ASSERT(health.degradedPeers.find(badPeer1) == health.degradedPeers.end());
|
||||
ASSERT(health.disconnectedPeers.find(badPeer1) == health.disconnectedPeers.end());
|
||||
}
|
||||
|
||||
return Void();
|
||||
}
|
||||
|
||||
|
|
|
@ -2971,16 +2971,28 @@ public:
|
|||
for (int i = 0; i < req.disconnectedPeers.size(); ++i) {
|
||||
disconnectedPeersString += (i == 0 ? "" : " ") + req.disconnectedPeers[i].toString();
|
||||
}
|
||||
std::string recoveredPeersString;
|
||||
for (int i = 0; i < req.recoveredPeers.size(); ++i) {
|
||||
recoveredPeersString += (i == 0 ? "" : " ") + req.recoveredPeers[i].toString();
|
||||
}
|
||||
TraceEvent("ClusterControllerUpdateWorkerHealth")
|
||||
.detail("WorkerAddress", req.address)
|
||||
.detail("DegradedPeers", degradedPeersString)
|
||||
.detail("DisconnectedPeers", disconnectedPeersString);
|
||||
.detail("DisconnectedPeers", disconnectedPeersString)
|
||||
.detail("RecoveredPeers", recoveredPeersString);
|
||||
|
||||
double currentTime = now();
|
||||
|
||||
// Current `workerHealth` doesn't have any information about the incoming worker. Add the worker into
|
||||
// `workerHealth`.
|
||||
if (workerHealth.find(req.address) == workerHealth.end()) {
|
||||
if (req.degradedPeers.empty() && req.disconnectedPeers.empty()) {
|
||||
// This request doesn't report any new degradation. Although it may contain recovered peers, since
|
||||
// `workerHealth` doesn't record any information on this address, those recovered peers have already
|
||||
// been considered recovered.
|
||||
return;
|
||||
}
|
||||
|
||||
workerHealth[req.address] = {};
|
||||
for (const auto& degradedPeer : req.degradedPeers) {
|
||||
workerHealth[req.address].degradedPeers[degradedPeer] = { currentTime, currentTime };
|
||||
|
@ -2990,6 +3002,8 @@ public:
|
|||
workerHealth[req.address].disconnectedPeers[degradedPeer] = { currentTime, currentTime };
|
||||
}
|
||||
|
||||
// We can return directly here since we just created the health info for this address and there shouldn't be
|
||||
// any recovered peers.
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -2997,6 +3011,16 @@ public:
|
|||
|
||||
auto& health = workerHealth[req.address];
|
||||
|
||||
// Remove any recovered peers.
|
||||
for (const auto& peer : req.recoveredPeers) {
|
||||
TraceEvent("ClusterControllerReceivedPeerRecovering")
|
||||
.suppressFor(10.0)
|
||||
.detail("Worker", req.address)
|
||||
.detail("Peer", peer);
|
||||
health.degradedPeers.erase(peer);
|
||||
health.disconnectedPeers.erase(peer);
|
||||
}
|
||||
|
||||
// Update the worker's degradedPeers.
|
||||
for (const auto& peer : req.degradedPeers) {
|
||||
auto it = health.degradedPeers.find(peer);
|
||||
|
|
Loading…
Reference in New Issue