Fix a race between submitting and aborting a backup

Aborting a backup immediately after submitting it may cause a rare race
condition, which results in a BackupCorrectnessLeftoverVersionKey error.

Specifically, in StartFullBackupTaskFunc, the 1st Txn sets the destUid at the
source database and the 2nd Txn writes to the dest DB.

An abort can come after the 1st Txn succeeds and clear the config range so
that the 2nd Txn above fails. Because the 2nd Txn didn't write destUid, the
3rd Txn of abort can't read the correct source DB for latestVersionKey, which
contains the destUid value.

The fix is to make the 1st Txn of abort wait until destUid becomes valid.
This commit is contained in:
Jingyu Zhou 2020-10-18 16:46:41 -07:00
parent d7420bb8c7
commit bfd3328448
3 changed files with 12 additions and 4 deletions

View File

@ -1530,6 +1530,7 @@ namespace dbBackup {
wait(tr->commit());
break;
} catch (Error &e) {
TraceEvent("SetDestUidOrBeginVersionError").error(e, true);
wait(tr->onError(e));
}
}
@ -2117,14 +2118,18 @@ public:
state Future<UID> destUidFuture = backupAgent->getDestUid(tr, logUid);
wait(success(statusFuture) && success(destUidFuture));
UID destUid = destUidFuture.get();
if (destUid.isValid()) {
destUidValue = BinaryWriter::toValue(destUid, Unversioned());
}
int status = statusFuture.get();
if (!backupAgent->isRunnable((BackupAgentBase::enumState)status)) {
throw backup_unneeded();
}
UID destUid = destUidFuture.get();
if (destUid.isValid()) {
destUidValue = BinaryWriter::toValue(destUid, Unversioned());
} else {
// Give DR task a chance to update destUid to avoid the problem of
// leftover version key.
throw not_committed();
}
Optional<Value> _backupUid = wait(tr->get(backupAgent->states.get(logUidValue).pack(DatabaseBackupAgent::keyFolderId)));
backupUid = _backupUid.get();
@ -2144,6 +2149,7 @@ public:
break;
}
catch (Error &e) {
TraceEvent("DBA_AbortError").error(e, true);
wait(tr->onError(e));
}
}

View File

@ -339,6 +339,7 @@ struct BackupToDBCorrectnessWorkload : TestWorkload {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
// Check the left over tasks
// We have to wait for the list to empty since an abort and get status

View File

@ -148,6 +148,7 @@ struct BackupToDBUpgradeWorkload : TestWorkload {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
// Check the left over tasks
// We have to wait for the list to empty since an abort and get status