From 1afef44f195f69c3dc621cd4c3879ba27e6f0d9d Mon Sep 17 00:00:00 2001
From: Evan Tschannen
Date: Tue, 23 Nov 2021 15:26:38 -0800
Subject: [PATCH] fix: self->shardsAffectedByTeamFailure->moveShard must be called without any waits after getting the destination team or we could miss failure notifications for the storage servers in the destination team

---
 fdbserver/DataDistributionQueue.actor.cpp | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp
index d3ed343da5..fcbc73dbed 100644
--- a/fdbserver/DataDistributionQueue.actor.cpp
+++ b/fdbserver/DataDistributionQueue.actor.cpp
@@ -1027,8 +1027,18 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd,
 					// bestTeam.second = false if the bestTeam in the teamCollection (in the DC) does not have any
 					// server that hosts the relocateData. This is possible, for example, in a fearless configuration
 					// when the remote DC is just brought up.
-					std::pair<Optional<Reference<IDataDistributionTeam>>, bool> bestTeam =
-					    wait(brokenPromiseToNever(self->teamCollections[tciIndex].getTeam.getReply(req)));
+					Future<std::pair<Optional<Reference<IDataDistributionTeam>>, bool>> fbestTeam =
+					    brokenPromiseToNever(self->teamCollections[tciIndex].getTeam.getReply(req));
+					state bool bestTeamReady = fbestTeam.isReady();
+					std::pair<Optional<Reference<IDataDistributionTeam>>, bool> bestTeam = wait(fbestTeam);
+					if (tciIndex > 0 && !bestTeamReady) {
+						// self->shardsAffectedByTeamFailure->moveShard must be called without any waits after getting
+						// the destination team or we could miss failure notifications for the storage servers in the
+						// destination team
+						TraceEvent("BestTeamNotReady");
+						foundTeams = false;
+						break;
+					}
 					// If a DC has no healthy team, we stop checking the other DCs until
 					// the unhealthy DC is healthy again or is excluded.
 					if (!bestTeam.first.present()) {