Address review comments for PR #1625

Jingyu Zhou 2019-08-14 14:19:50 -07:00
parent 116608a0a7
commit 85c4a4e422
7 changed files with 26 additions and 47 deletions

View File

@@ -57,10 +57,16 @@ struct BackupData {
logger = traceCounters("BackupWorkerMetrics", myId, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc,
"BackupWorkerMetrics");
}
void pop() {
const Tag popTag = logSystem.get()->getPseudoPopTag(tag, ProcessClass::BackupClass);
logSystem.get()->pop(savedVersion, popTag);
}
};
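The new BackupData::pop() replaces the polling popData actor removed further down: the worker now pops through the log system's pseudo pop tag exactly when its saved version advances. Below is a minimal sketch of why pseudo pop tags matter (assumed semantics; PseudoTagPopper and the locality names are hypothetical, not FDB code): several consumers share one real tag, so the real pop can only advance to the minimum version acknowledged across all of them.

// Hypothetical sketch, not FDB's implementation: pseudo pop tags let several
// consumers (e.g. a log router and a backup worker) share one real tag. Each
// consumer pops under its own pseudo locality; the real tag is popped only up
// to the minimum version acknowledged by all registered consumers.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <map>

using Version = int64_t;
enum class PseudoLocality { LogRouterMapped, Backup };

class PseudoTagPopper {
    std::map<PseudoLocality, Version> popped_; // per-consumer popped version
public:
    void registerConsumer(PseudoLocality loc) { popped_[loc] = 0; }

    // Record one consumer's pop; return the version the real tag may be
    // popped to, i.e. the minimum across all registered consumers.
    Version pop(PseudoLocality loc, Version version) {
        popped_[loc] = std::max(popped_[loc], version);
        Version minV = popped_.begin()->second;
        for (const auto& entry : popped_) minV = std::min(minV, entry.second);
        return minV;
    }
};

int main() {
    PseudoTagPopper popper;
    popper.registerConsumer(PseudoLocality::LogRouterMapped);
    popper.registerConsumer(PseudoLocality::Backup);
    popper.pop(PseudoLocality::Backup, 500);
    std::cout << popper.pop(PseudoLocality::LogRouterMapped, 300) << "\n"; // 300
    std::cout << popper.pop(PseudoLocality::Backup, 800) << "\n";          // 300
    std::cout << popper.pop(PseudoLocality::LogRouterMapped, 900) << "\n"; // 800
}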
ACTOR Future<Void> saveProgress(BackupData* self, Version backupVersion) {
state Transaction tr(self->cx);
state Key key = backupProgressKeyFor(self->myId);
loop {
try {
@@ -69,7 +75,8 @@ ACTOR Future<Void> saveProgress(BackupData* self, Version backupVersion) {
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
WorkerBackupStatus status(self->backupEpoch, backupVersion, self->tag);
tr.set(backupProgressKeyFor(self->myId), backupProgressValue(status));
tr.set(key, backupProgressValue(status));
tr.addReadConflictRange(singleKeyRange(key));
wait(tr.commit());
return Void();
} catch (Error& e) {
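Besides hoisting the key into a variable, saveProgress now adds a read conflict range on the key it blind-writes. The toy optimistic-concurrency model below (hypothetical Committer/Txn types, not FDB code) shows the effect: a blind write has an empty read set and never conflicts, whereas putting the written key into the read conflict set makes a stale worker's commit abort instead of silently overwriting newer progress.

// Toy model (not FDB code): optimistic concurrency checks a transaction's
// READ conflict set against writes committed after its read version. A blind
// write has an empty read set and never conflicts; adding the written key to
// the read set (like tr.addReadConflictRange) turns a lost update into a
// detectable conflict.
#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <string>

using Version = int64_t;

struct Txn {
    Version readVersion;
    std::set<std::string> readConflictKeys; // like tr.addReadConflictRange
    std::map<std::string, std::string> writes;
};

class Committer {
    std::map<std::string, Version> lastWrite_; // key -> commit version
    Version version_ = 0;
public:
    Version now() const { return version_; }
    bool commit(const Txn& t) {
        // Conflict check: did anyone write a key we "read" after we started?
        for (const auto& k : t.readConflictKeys) {
            auto it = lastWrite_.find(k);
            if (it != lastWrite_.end() && it->second > t.readVersion)
                return false; // not committed: caller retries with a newer read version
        }
        ++version_;
        for (const auto& kv : t.writes) lastWrite_[kv.first] = version_;
        return true;
    }
};

int main() {
    Committer db;
    Txn stale{db.now(), {"progress"}, {{"progress", "v100"}}};
    Txn fresh{db.now(), {"progress"}, {{"progress", "v200"}}};
    std::cout << db.commit(fresh) << "\n"; // 1: commits first
    std::cout << db.commit(stale) << "\n"; // 0: conflicts instead of clobbering
}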
@@ -89,6 +96,10 @@ ACTOR Future<Void> uploadData(BackupData* self) {
return Void();
}
// TODO: knobify the delay of 20s. This delay is sensitive, as it is the
// lag a TLog might have.
state Future<Void> uploadDelay = delay(20);
if (self->messages.empty()) {
// Even though messages is empty, we still want to advance popVersion.
popVersion = std::max(popVersion, self->lastSeenVersion);
@@ -108,9 +119,6 @@ ACTOR Future<Void> uploadData(BackupData* self) {
if (popVersion > self->savedVersion) {
savedProgress = saveProgress(self, popVersion);
}
// TODO: knobify the delay of 20s. This delay is sensitive, as it is the
// lag a TLog might have.
state Future<Void> uploadDelay = delay(20);
loop choose {
when(wait(uploadDelay)) {
@@ -120,6 +128,7 @@ ACTOR Future<Void> uploadData(BackupData* self) {
when(wait(savedProgress)) {
TraceEvent("BackupWorkerSavedProgress", self->myId).detail("Version", popVersion);
self->savedVersion = std::max(popVersion, self->savedVersion);
self->pop();
savedProgress = Never(); // Still wait until uploadDelay expires
}
}
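The pop now happens inline, right after the progress save commits (and again below when a new log system arrives). A tiny model of the ordering invariant this appears to enforce — upload, then save progress durably, then pop — assumed from the surrounding code; BackupWorkerModel is illustrative only:

// Illustrative model (not FDB code) of the invariant: never pop the TLogs
// past a version whose progress has not been durably saved. Popping first
// could discard mutations that a restarted worker would still need, because
// its recorded progress would point before the popped data.
#include <cassert>
#include <cstdint>

using Version = int64_t;

struct BackupWorkerModel {
    Version uploaded = 0; // highest version written to backup storage
    Version saved = 0;    // highest version committed to the progress key
    Version popped = 0;   // highest version popped from the TLogs

    void upload(Version v) { uploaded = v; }
    void saveProgress() { saved = uploaded; } // must commit before pop()
    void pop() {
        popped = saved; // pop only what has been saved
        assert(popped <= saved && saved <= uploaded);
    }
};

int main() {
    BackupWorkerModel w;
    w.upload(100);
    w.saveProgress(); // like wait(savedProgress) succeeding above
    w.pop();          // like self->pop() after BackupWorkerSavedProgress
}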
@@ -172,18 +181,6 @@ ACTOR Future<Void> pullAsyncData(BackupData* self) {
}
}
// We need to pop data faster than the upload interval to avoid accumulating
// mutations in TLogs.
ACTOR Future<Void> popData(BackupData* self) {
loop {
if (self->logSystem.get()) {
const Tag popTag = self->logSystem.get()->getPseudoPopTag(self->tag, ProcessClass::BackupClass);
self->logSystem.get()->pop(self->savedVersion, popTag);
}
wait(delay(2.0)); // TODO: knobify this delay
}
}
ACTOR Future<Void> checkRemoved(Reference<AsyncVar<ServerDBInfo>> db, LogEpoch recoveryCount,
BackupData* self) {
loop {
@@ -219,7 +216,7 @@ ACTOR Future<Void> backupWorker(BackupInterface interf, InitializeBackupRequest
try {
addActor.send(pullAsyncData(&self));
addActor.send(uploadData(&self));
addActor.send(popData(&self));
addActor.send(checkRemoved(db, req.recruitedEpoch, &self));
addActor.send(waitFailureServer(interf.waitFailure.getFuture()));
loop choose {
@@ -228,6 +225,7 @@ ACTOR Future<Void> backupWorker(BackupInterface interf, InitializeBackupRequest
Reference<ILogSystem> ls = ILogSystem::fromServerDBInfo(self.myId, db->get(), true);
if (ls && ls->hasPseudoLocality(tagLocalityBackup)) {
self.logSystem.set(ls);
self.pop();
TraceEvent("BackupWorkerLogSystem", interf.id())
.detail("HasBackupLocality", true)
.detail("Tag", self.tag.toString());
@@ -237,12 +235,6 @@ ACTOR Future<Void> backupWorker(BackupInterface interf, InitializeBackupRequest
.detail("Tag", self.tag.toString());
}
}
when(HaltBackupRequest req = waitNext(interf.haltBackup.getFuture())) {
req.reply.send(Void());
TraceEvent("BackupWorkerHalted", interf.id()).detail("ReqID", req.requesterID);
break;
}
when(wait(checkRemoved(db, req.recruitedEpoch, &self))) {}
when(wait(onDone)) {
// TODO: when backup is done, we don't exit because master would think
// this worker failed and start recovery. Should fix the protocol between

View File

@@ -29,7 +29,6 @@
struct BackupInterface {
constexpr static FileIdentifier file_identifier = 6762745;
RequestStream<ReplyPromise<Void>> waitFailure;
RequestStream<struct HaltBackupRequest> haltBackup;
struct LocalityData locality;
LogEpoch backupEpoch;
UID myId;
@@ -41,7 +40,7 @@ struct BackupInterface {
void initEndpoints() {}
UID id() const { return myId; }
NetworkAddress address() const { return waitFailure.getEndpoint().getPrimaryAddress(); }
UID getToken() const { return haltBackup.getEndpoint().token; }
UID getToken() const { return waitFailure.getEndpoint().token; }
bool operator== (const BackupInterface& r) const {
return id() == r.id();
}
@@ -51,21 +50,7 @@ struct BackupInterface {
template <class Archive>
void serialize(Archive& ar) {
serializer(ar, waitFailure, haltBackup, locality, backupEpoch, myId);
}
};
struct HaltBackupRequest {
constexpr static FileIdentifier file_identifier = 4212277;
UID requesterID;
ReplyPromise<Void> reply;
HaltBackupRequest() = default;
explicit HaltBackupRequest(UID uid) : requesterID(uid) {}
template<class Ar>
void serialize(Ar& ar) {
serializer(ar, requesterID, reply);
serializer(ar, waitFailure, locality, backupEpoch, myId);
}
};
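With haltBackup and HaltBackupRequest gone, BackupInterface's serialize call shrinks to the remaining fields. The snippet below is a toy reimplementation of the serializer(ar, fields...) idiom (not flow's actual flat_buffers machinery; Writer and BackupInterfaceLike are made up) showing why dropping a member is a one-line change to the field list.

// Toy sketch of the serializer(ar, fields...) idiom, not FDB's flat_buffers:
// a variadic template applies the archive to each field in order, so adding
// or removing a field is a one-line edit to the field list -- exactly what
// this commit does when it drops haltBackup.
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>

struct Writer {
    std::ostringstream out;
    template <class T>
    void apply(const T& t) { out << t << ' '; }
};

template <class Archive>
void serializer(Archive&) {}

template <class Archive, class First, class... Rest>
void serializer(Archive& ar, First& first, Rest&... rest) {
    ar.apply(first);
    serializer(ar, rest...);
}

struct BackupInterfaceLike {
    static constexpr uint64_t file_identifier = 6762745; // unique per type
    uint64_t backupEpoch = 7;
    std::string myId = "worker-1";

    template <class Archive>
    void serialize(Archive& ar) {
        // One field list; dropping a member here removes it from the wire.
        serializer(ar, backupEpoch, myId);
    }
};

int main() {
    Writer w;
    BackupInterfaceLike b;
    b.serialize(w);
    std::cout << w.out.str() << "\n"; // "7 worker-1 "
}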

View File

@@ -593,12 +593,13 @@ ACTOR Future<Void> waitForQuietDatabase( Database cx, Reference<AsyncVar<ServerD
if(++numSuccesses == 3) {
TraceEvent(("QuietDatabase" + phase + "Done").c_str());
break;
} else {
wait(delay(2.0));
}
else
wait(delay( 2.0 ) );
}
} catch (Error& e) {
if( e.code() != error_code_actor_cancelled && e.code() != error_code_attribute_not_found && e.code() != error_code_timed_out)
if (e.code() != error_code_actor_cancelled && e.code() != error_code_attribute_not_found &&
e.code() != error_code_timed_out)
TraceEvent(("QuietDatabase" + phase + "Error").c_str()).error(e);
//Client invalid operation occurs if we don't get back a message from one of the servers, often corrected by retrying

View File

@@ -26,6 +26,7 @@
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/RunTransaction.actor.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/FDBTypes.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/TLogInterface.h"
#include "fdbserver/Knobs.h"
@@ -995,7 +996,6 @@ ACTOR Future<Void> tLogPopCore( TLogData* self, Tag inputTag, Version to, Refere
}
state Tag tag(tagLocality, inputTag.id);
auto tagData = logData->getTagData(tag);
TraceEvent("TLogPop0", logData->logId).detail("Tag", tag.toString()).detail("To", to).detail("UpTo", upTo).detail("Popped", tagData ? tagData->popped : -1);
if (!tagData) {
tagData = logData->createTagData(tag, upTo, true, true, false);
} else if (upTo > tagData->popped) {
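For context on what tLogPopCore maintains (the debug TraceEvent is simply removed here): popping a tag tells the TLog that data for that tag below the given version can be discarded, and pops only move forward — hence the upTo > tagData->popped guard. A toy model under those assumed semantics (TagData here is illustrative, not the real struct):

// Toy model of TLog pop semantics (assumed): popping a tag to version v
// tells the TLog that data for that tag below v is no longer needed and can
// be discarded; pops are monotone, so a smaller upTo is a no-op.
#include <cstdint>
#include <iostream>
#include <map>
#include <string>

using Version = int64_t;

struct TagData {
    Version popped = 0;
    std::map<Version, std::string> messages; // version -> payload

    void pop(Version upTo) {
        if (upTo <= popped) return; // pops only move forward
        popped = upTo;
        messages.erase(messages.begin(), messages.lower_bound(upTo));
    }
};

int main() {
    TagData t;
    t.messages = {{10, "a"}, {20, "b"}, {30, "c"}};
    t.pop(25);
    std::cout << t.messages.size() << "\n"; // 1: only version 30 remains
    t.pop(5);                                // stale pop, ignored
    std::cout << t.popped << "\n";           // 25
}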

View File

@@ -190,6 +190,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
AsyncVar<bool> recoveryCompleteWrittenToCoreState;
bool remoteLogsWrittenToCoreState;
bool hasRemoteServers;
AsyncTrigger backupWorkerChanged;
Optional<Version> recoverAt;
Optional<Version> recoveredAt;
@@ -258,7 +259,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
}
break;
default:
default: // This should be an error at the caller site.
break;
}
return tag;
@@ -465,6 +466,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
}
changes.push_back(self->recoveryCompleteWrittenToCoreState.onChange());
changes.push_back(self->backupWorkerChanged.onTrigger());
ASSERT( failed.size() >= 1 );
wait( quorum(changes, 1) || tagError<Void>( quorum( failed, 1 ), master_tlog_failed() ) );
@@ -1375,6 +1377,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
}
logset->backupWorkers.push_back(worker);
}
backupWorkerChanged.trigger();
}
ACTOR static Future<Void> monitorLog(Reference<AsyncVar<OptionalInterface<TLogInterface>>> logServer, Reference<AsyncVar<bool>> failed) {
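backupWorkerChanged is an AsyncTrigger: addBackupWorkers fires it, and the monitoring code above adds onTrigger() to its change set so it wakes up instead of polling. A single-threaded sketch of the assumed trigger/onTrigger semantics (not flow's implementation):

// Single-threaded sketch (assumed semantics, not flow's AsyncTrigger):
// onTrigger() returns a future that becomes ready at the next trigger(),
// so waiters learn that "something changed" without polling.
#include <future>
#include <iostream>
#include <vector>

class AsyncTrigger {
    std::vector<std::promise<void>> waiters_;
public:
    std::future<void> onTrigger() {
        waiters_.emplace_back();
        return waiters_.back().get_future();
    }
    void trigger() {
        for (auto& p : waiters_) p.set_value(); // wake everyone waiting now
        waiters_.clear();
    }
};

int main() {
    AsyncTrigger backupWorkerChanged;
    auto changed = backupWorkerChanged.onTrigger();
    // ... addBackupWorkers(...) installs new workers, then:
    backupWorkerChanged.trigger();
    changed.wait(); // returns immediately; the change was observed
    std::cout << "backup workers changed\n";
}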

View File

@@ -1267,7 +1267,6 @@ ACTOR Future<std::map<LogEpoch, std::map<Tag, Version>>> getBackupProgress(Refer
.detail("Version", status.version)
.detail("Tag", status.tag.toString());
}
wait(tr.commit());
return progress;
} catch (Error& e) {
wait(tr.onError(e));
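The wait(tr.commit()) is dropped because getBackupProgress only reads: a read-only transaction returns its results at the read version and has nothing to make durable, so committing it only adds a round trip. The shape of the pattern, with a hypothetical Transaction stub (illustrative, not the real API):

// Hypothetical sketch of a read-only retry loop (Transaction is a stub, not
// the FDB API): the body returns its results directly; commit() is only
// needed when there are writes to make durable and conflict-check.
#include <functional>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

struct Transaction {
    const std::map<std::string, std::string>& snapshot;
    std::string get(const std::string& k) const {
        auto it = snapshot.find(k);
        return it == snapshot.end() ? "" : it->second;
    }
    void onError(const std::exception&) { /* reset state, then retry */ }
};

template <class T>
T runReadOnly(const std::map<std::string, std::string>& db,
              const std::function<T(Transaction&)>& body) {
    Transaction tr{db};
    for (;;) {
        try {
            return body(tr); // no commit: nothing was written
        } catch (const std::exception& e) {
            tr.onError(e);
        }
    }
}

int main() {
    std::map<std::string, std::string> db{{"backupProgress/worker-1", "v42"}};
    auto v = runReadOnly<std::string>(
        db, [](Transaction& tr) { return tr.get("backupProgress/worker-1"); });
    std::cout << v << "\n"; // v42
}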

View File

@@ -1121,7 +1121,6 @@ ACTOR Future<Void> workerServer(
startRole(Role::BACKUP, recruited.id(), interf.id());
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.haltBackup);
Future<Void> backupProcess = backupWorker(recruited, req, dbInfo);
errorForwarders.add(forwardError(errors, Role::BACKUP, recruited.id(), backupProcess));