fix: self->shardsAffectedByTeamFailure->moveShard must be called without any waits after getting the destination team; otherwise we could miss failure notifications for the storage servers in the destination team

2021-11-23 15:26:38 -08:00 · 2021-11-23 15:26:38 -08:00 · 1afef44f19
parent 01c37a053a
commit 1afef44f19
1 changed files with 12 additions and 2 deletions
--- a/fdbserver/DataDistributionQueue.actor.cpp
+++ b/fdbserver/DataDistributionQueue.actor.cpp
@@ -1027,8 +1027,18 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd,
 					// bestTeam.second = false if the bestTeam in the teamCollection (in the DC) does not have any
 					// server that hosts the relocateData. This is possible, for example, in a fearless configuration
 					// when the remote DC is just brought up.
-					std::pair<Optional<Reference<IDataDistributionTeam>>, bool> bestTeam =
-					    wait(brokenPromiseToNever(self->teamCollections[tciIndex].getTeam.getReply(req)));
+					Future<std::pair<Optional<Reference<IDataDistributionTeam>>, bool>> fbestTeam =
+					    brokenPromiseToNever(self->teamCollections[tciIndex].getTeam.getReply(req));
+					state bool bestTeamReady = fbestTeam.isReady();
+					std::pair<Optional<Reference<IDataDistributionTeam>>, bool> bestTeam = wait(fbestTeam);
+					if (tciIndex > 0 && !bestTeamReady) {
+						// self->shardsAffectedByTeamFailure->moveShard must be called without any waits after getting
+						// the destination team or we could miss failure notifications for the storage servers in the
+						// destination team
+						TraceEvent("BestTeamNotReady");
+						foundTeams = false;
+						break;
+					}
 					// If a DC has no healthy team, we stop checking the other DCs until
 					// the unhealthy DC is healthy again or is excluded.
 					if (!bestTeam.first.present()) {