Stop backup workers before clearing DB in parallel restore workload

Otherwise, the mutations that clear the DB can be picked up by still-running
backup workers and then applied during restore, causing the restore to fail.
This commit is contained in:
Jingyu Zhou 2020-04-11 10:23:53 -07:00
parent 7e5551ea19
commit 4e128328f7
2 changed files with 16 additions and 7 deletions

View File

@ -179,7 +179,9 @@ struct BackupData {
config.startedBackupWorkers().set(tr, workers.get());
}
for (auto p : workers.get()) {
TraceEvent("BackupWorkerDebug", self->myId).detail("Epoch", p.first).detail("TagID", p.second);
TraceEvent("BackupWorkerDebugTag", self->myId)
.detail("Epoch", p.first)
.detail("TagID", p.second);
}
wait(tr->commit());

View File

@ -21,6 +21,7 @@
#include "fdbrpc/simulator.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/BackupContainer.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbserver/workloads/BulkSetup.actor.h"
#include "fdbclient/RestoreWorkerInterface.actor.h"
@ -421,6 +422,11 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
// wait(attemptDirtyRestore(self, cx, &backupAgent, StringRef(lastBackupContainer->getURL()),
// randomID));
}
// We must ensure no backup workers are running, otherwise the clear DB
// below can be picked up by backup workers and applied during restore.
wait(success(changeConfig(cx, "backup_worker_enabled:=0", true)));
// Clear DB before restore
wait(runRYWTransaction(cx, [=](Reference<ReadYourWritesTransaction> tr) -> Future<Void> {
for (auto& kvrange : self->backupRanges) tr->clear(kvrange);
@ -437,12 +443,6 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
BackupDescription desc = wait(container->describeBackup());
ASSERT(self->usePartitionedLogs == desc.partitioned);
TraceEvent("BAFRW_Restore", randomID)
.detail("LastBackupContainer", lastBackupContainer->getURL())
.detail("MinRestorableVersion", desc.minRestorableVersion.get())
.detail("MaxRestorableVersion", desc.maxRestorableVersion.get())
.detail("ContiguousLogEnd", desc.contiguousLogEnd.get());
state Version targetVersion = -1;
if (desc.maxRestorableVersion.present()) {
if (deterministicRandom()->random01() < 0.1) {
@ -461,6 +461,13 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
}
}
TraceEvent("BAFRW_Restore", randomID)
.detail("LastBackupContainer", lastBackupContainer->getURL())
.detail("MinRestorableVersion", desc.minRestorableVersion.get())
.detail("MaxRestorableVersion", desc.maxRestorableVersion.get())
.detail("ContiguousLogEnd", desc.contiguousLogEnd.get())
.detail("TargetVersion", targetVersion);
state std::vector<Future<Version>> restores;
state std::vector<Standalone<StringRef>> restoreTags;