Fix hang due to distributor death in QuietDatabase

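It's possible that after the tester obtains the data distributor, that distributor
dies and a new one is recruited. Because the tester is still contacting the old one,
it becomes stuck. The tester now also waits on dbInfo changes while its requests are
outstanding and, if the distributor is absent or has a different ID, starts the check over.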
This commit is contained in:
Jingyu Zhou 2019-08-11 09:46:13 -07:00
parent 73824faf65
commit 6c6a553dcc
1 changed file with 20 additions and 4 deletions

View File

@ -535,7 +535,7 @@ ACTOR Future<Void> waitForQuietDatabase( Database cx, Reference<AsyncVar<ServerD
try {
TraceEvent("QuietDatabaseWaitingOnDataDistributor");
WorkerInterface distributorWorker = wait( getDataDistributorWorker( cx, dbInfo ) );
UID distributorUID = dbInfo->get().distributor.get().id();
state UID distributorUID = dbInfo->get().distributor.get().id();
TraceEvent("QuietDatabaseGotDataDistributor", distributorUID).detail("Locality", distributorWorker.locality.toString());
state Future<int64_t> dataInFlight = getDataInFlight( cx, distributorWorker);
@ -546,9 +546,25 @@ ACTOR Future<Void> waitForQuietDatabase( Database cx, Reference<AsyncVar<ServerD
state Future<bool> dataDistributionActive = getDataDistributionActive( cx, distributorWorker );
state Future<bool> storageServersRecruiting = getStorageServersRecruiting ( cx, distributorWorker, distributorUID );
wait(success(dataInFlight) && success(tLogQueueInfo) && success(dataDistributionQueueSize) &&
success(teamCollectionValid) && success(storageQueueSize) && success(dataDistributionActive) &&
success(storageServersRecruiting));
state bool distributorChanged = false;
loop choose {
when(wait(dbInfo->onChange())) {
if (!dbInfo->get().distributor.present() ||
dbInfo->get().distributor.get().id() != distributorUID) {
TraceEvent("QuietDatabaseDataDistributorChanged");
distributorChanged = true;
break;
}
}
when(wait(success(dataInFlight) && success(tLogQueueInfo) && success(dataDistributionQueueSize) &&
success(teamCollectionValid) && success(storageQueueSize) &&
success(dataDistributionActive) && success(storageServersRecruiting))) {
break;
}
}
if (distributorChanged) {
continue;
}
TraceEvent(("QuietDatabase" + phase).c_str())
.detail("DataInFlight", dataInFlight.get())