Merge pull request #2901 from jzhou77/backup-cmd

Add pause/resume for new backups
This commit is contained in:
Meng Xu 2020-04-07 16:53:51 -07:00 committed by GitHub
commit 867f734d8f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 80 additions and 12 deletions

View File

@ -2051,8 +2051,8 @@ ACTOR Future<Void> discontinueBackup(Database db, std::string tagName, bool wait
ACTOR Future<Void> changeBackupResumed(Database db, bool pause) { ACTOR Future<Void> changeBackupResumed(Database db, bool pause) {
try { try {
state FileBackupAgent backupAgent; FileBackupAgent backupAgent;
wait(backupAgent.taskBucket->changePause(db, pause)); wait(backupAgent.changePause(db, pause));
printf("All backup agents have been %s.\n", pause ? "paused" : "resumed"); printf("All backup agents have been %s.\n", pause ? "paused" : "resumed");
} }
catch (Error& e) { catch (Error& e) {

View File

@ -362,6 +362,9 @@ public:
Future<bool> checkActive(Database cx) { return taskBucket->checkActive(cx); } Future<bool> checkActive(Database cx) { return taskBucket->checkActive(cx); }
// If "pause" is true, pause all backups; otherwise, resume all.
Future<Void> changePause(Database db, bool pause);
friend class FileBackupAgentImpl; friend class FileBackupAgentImpl;
static const int dataFooterSize; static const int dataFooterSize;

View File

@ -4049,6 +4049,28 @@ public:
return Void(); return Void();
} }
ACTOR static Future<Void> changePause(FileBackupAgent* backupAgent, Database db, bool pause) {
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(db));
state Future<Void> change = backupAgent->taskBucket->changePause(db, pause);
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
try {
tr->set(backupPausedKey, pause ? LiteralStringRef("1") : LiteralStringRef("0"));
wait(tr->commit());
break;
} catch (Error& e) {
wait(tr->onError(e));
}
}
wait(change);
TraceEvent("FileBackupAgentChangePaused").detail("Action", pause ? "Paused" : "Resumed");
return Void();
}
struct TimestampedVersion { struct TimestampedVersion {
Optional<Version> version; Optional<Version> version;
Optional<int64_t> epochs; Optional<int64_t> epochs;
@ -4652,3 +4674,7 @@ void FileBackupAgent::setLastRestorable(Reference<ReadYourWritesTransaction> tr,
Future<int> FileBackupAgent::waitBackup(Database cx, std::string tagName, bool stopWhenDone, Reference<IBackupContainer> *pContainer, UID *pUID) { Future<int> FileBackupAgent::waitBackup(Database cx, std::string tagName, bool stopWhenDone, Reference<IBackupContainer> *pContainer, UID *pUID) {
return FileBackupAgentImpl::waitBackup(this, cx, tagName, stopWhenDone, pContainer, pUID); return FileBackupAgentImpl::waitBackup(this, cx, tagName, stopWhenDone, pContainer, pUID);
} }
Future<Void> FileBackupAgent::changePause(Database db, bool pause) {
return FileBackupAgentImpl::changePause(this, db, pause);
}

View File

@ -501,6 +501,7 @@ const KeyRangeRef backupProgressKeys(LiteralStringRef("\xff\x02/backupProgress/"
LiteralStringRef("\xff\x02/backupProgress0")); LiteralStringRef("\xff\x02/backupProgress0"));
const KeyRef backupProgressPrefix = backupProgressKeys.begin; const KeyRef backupProgressPrefix = backupProgressKeys.begin;
const KeyRef backupStartedKey = LiteralStringRef("\xff\x02/backupStarted"); const KeyRef backupStartedKey = LiteralStringRef("\xff\x02/backupStarted");
extern const KeyRef backupPausedKey = LiteralStringRef("\xff\x02/backupPaused");
const Key backupProgressKeyFor(UID workerID) { const Key backupProgressKeyFor(UID workerID) {
BinaryWriter wr(Unversioned()); BinaryWriter wr(Unversioned());

View File

@ -179,7 +179,7 @@ const Value workerListValue( ProcessData const& );
Key decodeWorkerListKey( KeyRef const& ); Key decodeWorkerListKey( KeyRef const& );
ProcessData decodeWorkerListValue( ValueRef const& ); ProcessData decodeWorkerListValue( ValueRef const& );
// "\xff/backupProgress/[[workerID]]" := "[[WorkerBackupStatus]]" // "\xff\x02/backupProgress/[[workerID]]" := "[[WorkerBackupStatus]]"
extern const KeyRangeRef backupProgressKeys; extern const KeyRangeRef backupProgressKeys;
extern const KeyRef backupProgressPrefix; extern const KeyRef backupProgressPrefix;
const Key backupProgressKeyFor(UID workerID); const Key backupProgressKeyFor(UID workerID);
@ -187,11 +187,16 @@ const Value backupProgressValue(const WorkerBackupStatus& status);
UID decodeBackupProgressKey(const KeyRef& key); UID decodeBackupProgressKey(const KeyRef& key);
WorkerBackupStatus decodeBackupProgressValue(const ValueRef& value); WorkerBackupStatus decodeBackupProgressValue(const ValueRef& value);
// "\xff/backupStarted" := "[[vector<UID,Version1>]]" // The key to signal backup workers a new backup job is submitted.
// "\xff\x02/backupStarted" := "[[vector<UID,Version1>]]"
extern const KeyRef backupStartedKey; extern const KeyRef backupStartedKey;
Value encodeBackupStartedValue(const std::vector<std::pair<UID, Version>>& ids); Value encodeBackupStartedValue(const std::vector<std::pair<UID, Version>>& ids);
std::vector<std::pair<UID, Version>> decodeBackupStartedValue(const ValueRef& value); std::vector<std::pair<UID, Version>> decodeBackupStartedValue(const ValueRef& value);
// The key to signal backup workers that they should pause or resume.
// "\xff\x02/backupPaused" := "[[0|1]]"
extern const KeyRef backupPausedKey;
extern const KeyRef coordinatorsKey; extern const KeyRef coordinatorsKey;
extern const KeyRef logsKey; extern const KeyRef logsKey;
extern const KeyRef minRequiredCommitVersionKey; extern const KeyRef minRequiredCommitVersionKey;

View File

@ -82,6 +82,7 @@ struct BackupData {
bool pulling = false; bool pulling = false;
bool stopped = false; bool stopped = false;
bool exitEarly = false; // If the worker is on an old epoch and all backups starts a version >= the endVersion bool exitEarly = false; // If the worker is on an old epoch and all backups starts a version >= the endVersion
AsyncVar<bool> paused; // Track if "backupPausedKey" is set.
struct PerBackupInfo { struct PerBackupInfo {
PerBackupInfo() = default; PerBackupInfo() = default;
@ -223,7 +224,7 @@ struct BackupData {
: myId(id), tag(req.routerTag), totalTags(req.totalTags), startVersion(req.startVersion), : myId(id), tag(req.routerTag), totalTags(req.totalTags), startVersion(req.startVersion),
endVersion(req.endVersion), recruitedEpoch(req.recruitedEpoch), backupEpoch(req.backupEpoch), endVersion(req.endVersion), recruitedEpoch(req.recruitedEpoch), backupEpoch(req.backupEpoch),
minKnownCommittedVersion(invalidVersion), savedVersion(req.startVersion - 1), minKnownCommittedVersion(invalidVersion), savedVersion(req.startVersion - 1),
cc("BackupWorker", myId.toString()), pulledVersion(0) { cc("BackupWorker", myId.toString()), pulledVersion(0), paused(false) {
cx = openDBOnServer(db, TaskPriority::DefaultEndpoint, true, true); cx = openDBOnServer(db, TaskPriority::DefaultEndpoint, true, true);
specialCounter(cc, "SavedVersion", [this]() { return this->savedVersion; }); specialCounter(cc, "SavedVersion", [this]() { return this->savedVersion; });
@ -797,6 +798,10 @@ ACTOR Future<Void> pullAsyncData(BackupData* self) {
TraceEvent("BackupWorkerPull", self->myId); TraceEvent("BackupWorkerPull", self->myId);
loop { loop {
while (self->paused.get()) {
wait(self->paused.onChange());
}
loop choose { loop choose {
when (wait(r ? r->getMore(TaskPriority::TLogCommit) : Never())) { when (wait(r ? r->getMore(TaskPriority::TLogCommit) : Never())) {
break; break;
@ -856,7 +861,7 @@ ACTOR Future<Void> monitorBackupKeyOrPullData(BackupData* self, bool keyPresent)
wait(self->pulledVersion.whenAtLeast(currentVersion)); wait(self->pulledVersion.whenAtLeast(currentVersion));
pullFinished = Future<Void>(); // cancels pullAsyncData() pullFinished = Future<Void>(); // cancels pullAsyncData()
self->pulling = false; self->pulling = false;
TraceEvent("BackupWorkerPaused", self->myId); TraceEvent("BackupWorkerPaused", self->myId).detail("Reson", "NoBackup");
} else { } else {
// Backup key is not present, enter this NOOP POP mode. // Backup key is not present, enter this NOOP POP mode.
state Future<Version> committedVersion = self->getMinKnownCommittedVersion(); state Future<Version> committedVersion = self->getMinKnownCommittedVersion();
@ -899,6 +904,33 @@ ACTOR Future<Void> checkRemoved(Reference<AsyncVar<ServerDBInfo>> db, LogEpoch r
} }
} }
ACTOR static Future<Void> monitorWorkerPause(BackupData* self) {
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(self->cx));
state Future<Void> watch;
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<Value> value = wait(tr->get(backupPausedKey));
bool paused = value.present() && value.get() == LiteralStringRef("1");
if (self->paused.get() != paused) {
TraceEvent(paused ? "BackupWorkerPaused" : "BackupWorkerResumed", self->myId);
self->paused.set(paused);
}
watch = tr->watch(backupPausedKey);
wait(tr->commit());
wait(watch);
tr->reset();
} catch (Error& e) {
wait(tr->onError(e));
}
}
}
ACTOR Future<Void> backupWorker(BackupInterface interf, InitializeBackupRequest req, ACTOR Future<Void> backupWorker(BackupInterface interf, InitializeBackupRequest req,
Reference<AsyncVar<ServerDBInfo>> db) { Reference<AsyncVar<ServerDBInfo>> db) {
state BackupData self(interf.id(), db, req); state BackupData self(interf.id(), db, req);
@ -921,6 +953,7 @@ ACTOR Future<Void> backupWorker(BackupInterface interf, InitializeBackupRequest
if (req.recruitedEpoch == req.backupEpoch && req.routerTag.id == 0) { if (req.recruitedEpoch == req.backupEpoch && req.routerTag.id == 0) {
addActor.send(monitorBackupProgress(&self)); addActor.send(monitorBackupProgress(&self));
} }
addActor.send(monitorWorkerPause(&self));
// Check if backup key is present to avoid race between this check and // Check if backup key is present to avoid race between this check and
// noop pop as well as upload data: pop or skip upload before knowing // noop pop as well as upload data: pop or skip upload before knowing

View File

@ -136,9 +136,9 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload {
ACTOR static Future<Void> changePaused(Database cx, FileBackupAgent* backupAgent) { ACTOR static Future<Void> changePaused(Database cx, FileBackupAgent* backupAgent) {
loop { loop {
wait(backupAgent->taskBucket->changePause(cx, true)); wait(backupAgent->changePause(cx, true));
wait(delay(30 * deterministicRandom()->random01())); wait(delay(30 * deterministicRandom()->random01()));
wait(backupAgent->taskBucket->changePause(cx, false)); wait(backupAgent->changePause(cx, false));
wait(delay(120 * deterministicRandom()->random01())); wait(delay(120 * deterministicRandom()->random01()));
} }
} }

View File

@ -180,10 +180,10 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload {
ACTOR static Future<Void> changePaused(Database cx, FileBackupAgent* backupAgent) { ACTOR static Future<Void> changePaused(Database cx, FileBackupAgent* backupAgent) {
loop { loop {
wait( backupAgent->taskBucket->changePause(cx, true) ); wait(backupAgent->changePause(cx, true));
wait( delay(30*deterministicRandom()->random01()) ); wait(delay(30 * deterministicRandom()->random01()));
wait( backupAgent->taskBucket->changePause(cx, false) ); wait(backupAgent->changePause(cx, false));
wait( delay(120*deterministicRandom()->random01()) ); wait(delay(120 * deterministicRandom()->random01()));
} }
} }