Fix race between backup worker removal and setting

The master waits until all backup workers have been recruited and then sets them
in a batch. However, a backup worker could remove itself before the master sets
it. As a result, the worker is not removed, the oldest backup epoch can't
advance, and the TLog can't be popped.
This commit is contained in:
Jingyu Zhou 2020-04-19 21:39:47 -07:00
parent 70221a25d7
commit 0823091423
1 changed files with 10 additions and 0 deletions

View File

@ -188,6 +188,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
bool remoteLogsWrittenToCoreState;
bool hasRemoteServers;
AsyncTrigger backupWorkerChanged;
std::set<UID> removedBackupWorkers; // Workers that are removed before setting them.
Optional<Version> recoverAt;
Optional<Version> recoveredAt;
@ -1399,6 +1400,10 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
LogEpoch logsetEpoch = this->epoch;
oldestBackupEpoch = this->epoch;
for (const auto& reply : replies) {
if (removedBackupWorkers.count(reply.interf.id()) > 0) {
removedBackupWorkers.erase(reply.interf.id());
continue;
}
Reference<AsyncVar<OptionalInterface<BackupInterface>>> worker(new AsyncVar<OptionalInterface<BackupInterface>>(OptionalInterface<BackupInterface>(reply.interf)));
if (reply.backupEpoch != logsetEpoch) {
// find the logset from oldLogData
@ -1408,6 +1413,9 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
ASSERT(logset.isValid());
}
logset->backupWorkers.push_back(worker);
TraceEvent("AddBackupWorker", dbgid)
.detail("Epoch", logsetEpoch)
.detail("BackupWorkerID", reply.interf.id());
}
TraceEvent("SetOldestBackupEpoch", dbgid).detail("Epoch", oldestBackupEpoch);
backupWorkerChanged.trigger();
@ -1434,6 +1442,8 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
}
}
backupWorkerChanged.trigger();
} else {
removedBackupWorkers.insert(req.workerUID);
}
TraceEvent("RemoveBackupWorker", dbgid)