Add delay for master to recruit backup workers

This delay ensures that the old epoch's backup workers can save their progress
in the database. Otherwise, the new master could attempt to recruit backup
workers for the old epoch on version ranges that have already been popped. As
a result, the logs would lose data.
This commit is contained in:
Jingyu Zhou 2020-03-05 11:34:37 -08:00
parent b8c362cf44
commit 15437ffb53
4 changed files with 7 additions and 1 deletions

View File

@ -373,6 +373,8 @@ ACTOR Future<Void> saveProgress(BackupData* self, Version backupVersion) {
loop {
try {
// It's critical to save progress immediately so that after a master
// recovery, the new master can know the progress so far.
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
@ -466,7 +468,6 @@ ACTOR Future<Void> saveMutationsToFile(BackupData* self, Version popVersion, int
it->second.lastSavedVersion, popVersion + 1, blockSize, self->tag.id, self->totalTags));
it++;
}
ASSERT(!activeUids.empty());
keyRangeMap.coalesce(allKeys);
wait(waitForAll(logFileFutures));

View File

@ -355,6 +355,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs, bool isSimula
init( PROVISIONAL_START_DELAY, 1.0 );
init( PROVISIONAL_MAX_DELAY, 60.0 );
init( PROVISIONAL_DELAY_GROWTH, 1.5 );
init( SECONDS_BEFORE_RECRUIT_BACKUP_WORKER, 4.0 );
// Resolver
init( SAMPLE_OFFSET_PER_KEY, 100 );

View File

@ -292,6 +292,7 @@ public:
double PROVISIONAL_START_DELAY;
double PROVISIONAL_DELAY_GROWTH;
double PROVISIONAL_MAX_DELAY;
double SECONDS_BEFORE_RECRUIT_BACKUP_WORKER;
// Resolver
int64_t KEY_BYTES_PER_SAMPLE;

View File

@ -1241,6 +1241,9 @@ ACTOR Future<Void> configurationMonitor(Reference<MasterData> self, Database cx)
ACTOR static Future<Void> recruitBackupWorkers(Reference<MasterData> self, Database cx) {
ASSERT(self->backupWorkers.size() > 0);
// Avoid race between a backup worker's save progress and the reads below.
wait(delay(SERVER_KNOBS->SECONDS_BEFORE_RECRUIT_BACKUP_WORKER));
state LogEpoch epoch = self->cstate.myDBState.recoveryCount;
state Reference<BackupProgress> backupProgress(
new BackupProgress(self->dbgid, self->logSystem->getOldEpochTagsVersionsInfo()));