Fix a race between submitting and aborting a backup
Aborting a backup immediately after submitting it may cause a rare race condition, which results in a BackupCorrectnessLeftoverVersionKey error. Specifically, in StartFullBackupTaskFunc, the 1st Txn sets the destUid at the source database and the 2nd Txn writes the dest DB. An abort can come after the 1st Txn succeeds and clear the config range, so that the 2nd Txn above would fail. Because the 2nd Txn didn't write destUid, the 3rd Txn of abort can't read the source DB for the correct latestVersionKey, which contains the destUid value. The fix is to let the 1st Txn of abort wait until destUid becomes valid.
This commit is contained in:
parent
d7420bb8c7
commit
bfd3328448
|
@ -1530,6 +1530,7 @@ namespace dbBackup {
|
|||
wait(tr->commit());
|
||||
break;
|
||||
} catch (Error &e) {
|
||||
TraceEvent("SetDestUidOrBeginVersionError").error(e, true);
|
||||
wait(tr->onError(e));
|
||||
}
|
||||
}
|
||||
|
@ -2117,14 +2118,18 @@ public:
|
|||
state Future<UID> destUidFuture = backupAgent->getDestUid(tr, logUid);
|
||||
wait(success(statusFuture) && success(destUidFuture));
|
||||
|
||||
UID destUid = destUidFuture.get();
|
||||
if (destUid.isValid()) {
|
||||
destUidValue = BinaryWriter::toValue(destUid, Unversioned());
|
||||
}
|
||||
int status = statusFuture.get();
|
||||
if (!backupAgent->isRunnable((BackupAgentBase::enumState)status)) {
|
||||
throw backup_unneeded();
|
||||
}
|
||||
UID destUid = destUidFuture.get();
|
||||
if (destUid.isValid()) {
|
||||
destUidValue = BinaryWriter::toValue(destUid, Unversioned());
|
||||
} else {
|
||||
// Give DR task a chance to update destUid to avoid the problem of
|
||||
// leftover version key.
|
||||
throw not_committed();
|
||||
}
|
||||
|
||||
Optional<Value> _backupUid = wait(tr->get(backupAgent->states.get(logUidValue).pack(DatabaseBackupAgent::keyFolderId)));
|
||||
backupUid = _backupUid.get();
|
||||
|
@ -2144,6 +2149,7 @@ public:
|
|||
break;
|
||||
}
|
||||
catch (Error &e) {
|
||||
TraceEvent("DBA_AbortError").error(e, true);
|
||||
wait(tr->onError(e));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -339,6 +339,7 @@ struct BackupToDBCorrectnessWorkload : TestWorkload {
|
|||
|
||||
try {
|
||||
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
|
||||
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
|
||||
|
||||
// Check the left over tasks
|
||||
// We have to wait for the list to empty since an abort and get status
|
||||
|
|
|
@ -148,6 +148,7 @@ struct BackupToDBUpgradeWorkload : TestWorkload {
|
|||
|
||||
try {
|
||||
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
|
||||
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
|
||||
|
||||
// Check the left over tasks
|
||||
// We have to wait for the list to empty since an abort and get status
|
||||
|
|
Loading…
Reference in New Issue