Fix race between a backup worker removing itself and the master setting it
The master waits for the recruitment of all backup workers to finish and then sets them in a batch. However, a backup worker could remove itself before the master sets it. As a result, the worker is never removed, the oldest backup epoch can't advance, and the TLog can't be popped.
This commit is contained in:
parent
70221a25d7
commit
0823091423
|
@ -188,6 +188,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
|
|||
bool remoteLogsWrittenToCoreState;
|
||||
bool hasRemoteServers;
|
||||
AsyncTrigger backupWorkerChanged;
|
||||
std::set<UID> removedBackupWorkers; // Workers that are removed before setting them.
|
||||
|
||||
Optional<Version> recoverAt;
|
||||
Optional<Version> recoveredAt;
|
||||
|
@ -1399,6 +1400,10 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
|
|||
LogEpoch logsetEpoch = this->epoch;
|
||||
oldestBackupEpoch = this->epoch;
|
||||
for (const auto& reply : replies) {
|
||||
if (removedBackupWorkers.count(reply.interf.id()) > 0) {
|
||||
removedBackupWorkers.erase(reply.interf.id());
|
||||
continue;
|
||||
}
|
||||
Reference<AsyncVar<OptionalInterface<BackupInterface>>> worker(new AsyncVar<OptionalInterface<BackupInterface>>(OptionalInterface<BackupInterface>(reply.interf)));
|
||||
if (reply.backupEpoch != logsetEpoch) {
|
||||
// find the logset from oldLogData
|
||||
|
@ -1408,6 +1413,9 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
|
|||
ASSERT(logset.isValid());
|
||||
}
|
||||
logset->backupWorkers.push_back(worker);
|
||||
TraceEvent("AddBackupWorker", dbgid)
|
||||
.detail("Epoch", logsetEpoch)
|
||||
.detail("BackupWorkerID", reply.interf.id());
|
||||
}
|
||||
TraceEvent("SetOldestBackupEpoch", dbgid).detail("Epoch", oldestBackupEpoch);
|
||||
backupWorkerChanged.trigger();
|
||||
|
@ -1434,6 +1442,8 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
|
|||
}
|
||||
}
|
||||
backupWorkerChanged.trigger();
|
||||
} else {
|
||||
removedBackupWorkers.insert(req.workerUID);
|
||||
}
|
||||
|
||||
TraceEvent("RemoveBackupWorker", dbgid)
|
||||
|
|
Loading…
Reference in New Issue