Address review comments for PR #1625
This commit is contained in:
parent
116608a0a7
commit
85c4a4e422
|
@ -57,10 +57,16 @@ struct BackupData {
|
|||
logger = traceCounters("BackupWorkerMetrics", myId, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc,
|
||||
"BackupWorkerMetrics");
|
||||
}
|
||||
|
||||
void pop() {
|
||||
const Tag popTag = logSystem.get()->getPseudoPopTag(tag, ProcessClass::BackupClass);
|
||||
logSystem.get()->pop(savedVersion, popTag);
|
||||
}
|
||||
};
|
||||
|
||||
ACTOR Future<Void> saveProgress(BackupData* self, Version backupVersion) {
|
||||
state Transaction tr(self->cx);
|
||||
state Key key = backupProgressKeyFor(self->myId);
|
||||
|
||||
loop {
|
||||
try {
|
||||
|
@ -69,7 +75,8 @@ ACTOR Future<Void> saveProgress(BackupData* self, Version backupVersion) {
|
|||
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
|
||||
|
||||
WorkerBackupStatus status(self->backupEpoch, backupVersion, self->tag);
|
||||
tr.set(backupProgressKeyFor(self->myId), backupProgressValue(status));
|
||||
tr.set(key, backupProgressValue(status));
|
||||
tr.addReadConflictRange(singleKeyRange(key));
|
||||
wait(tr.commit());
|
||||
return Void();
|
||||
} catch (Error& e) {
|
||||
|
@ -89,6 +96,10 @@ ACTOR Future<Void> uploadData(BackupData* self) {
|
|||
return Void();
|
||||
}
|
||||
|
||||
// TODO: knobify the delay of 20s. This delay is sensitive, as it is the
|
||||
// lag TLog might have.
|
||||
state Future<Void> uploadDelay = delay(20);
|
||||
|
||||
if (self->messages.empty()) {
|
||||
// Even though messages is empty, we still want to advance popVersion.
|
||||
popVersion = std::max(popVersion, self->lastSeenVersion);
|
||||
|
@ -108,9 +119,6 @@ ACTOR Future<Void> uploadData(BackupData* self) {
|
|||
if (popVersion > self->savedVersion) {
|
||||
savedProgress = saveProgress(self, popVersion);
|
||||
}
|
||||
// TODO: knobify the delay of 20s. This delay is sensitive, as it is the
|
||||
// lag TLog might have.
|
||||
state Future<Void> uploadDelay = delay(20);
|
||||
|
||||
loop choose {
|
||||
when(wait(uploadDelay)) {
|
||||
|
@ -120,6 +128,7 @@ ACTOR Future<Void> uploadData(BackupData* self) {
|
|||
when(wait(savedProgress)) {
|
||||
TraceEvent("BackupWorkerSavedProgress", self->myId).detail("Version", popVersion);
|
||||
self->savedVersion = std::max(popVersion, self->savedVersion);
|
||||
self->pop();
|
||||
savedProgress = Never(); // Still wait until uploadDelay expires
|
||||
}
|
||||
}
|
||||
|
@ -172,18 +181,6 @@ ACTOR Future<Void> pullAsyncData(BackupData* self) {
|
|||
}
|
||||
}
|
||||
|
||||
// We need to pop data faster than the upload interval to avoid accumulate
|
||||
// mutations in TLogs.
|
||||
ACTOR Future<Void> popData(BackupData* self) {
|
||||
loop {
|
||||
if (self->logSystem.get()) {
|
||||
const Tag popTag = self->logSystem.get()->getPseudoPopTag(self->tag, ProcessClass::BackupClass);
|
||||
self->logSystem.get()->pop(self->savedVersion, popTag);
|
||||
}
|
||||
wait(delay(2.0)); // TODO: knobify this delay
|
||||
}
|
||||
}
|
||||
|
||||
ACTOR Future<Void> checkRemoved(Reference<AsyncVar<ServerDBInfo>> db, LogEpoch recoveryCount,
|
||||
BackupData* self) {
|
||||
loop {
|
||||
|
@ -219,7 +216,7 @@ ACTOR Future<Void> backupWorker(BackupInterface interf, InitializeBackupRequest
|
|||
try {
|
||||
addActor.send(pullAsyncData(&self));
|
||||
addActor.send(uploadData(&self));
|
||||
addActor.send(popData(&self));
|
||||
addActor.send(checkRemoved(db, req.recruitedEpoch, &self));
|
||||
addActor.send(waitFailureServer(interf.waitFailure.getFuture()));
|
||||
|
||||
loop choose {
|
||||
|
@ -228,6 +225,7 @@ ACTOR Future<Void> backupWorker(BackupInterface interf, InitializeBackupRequest
|
|||
Reference<ILogSystem> ls = ILogSystem::fromServerDBInfo(self.myId, db->get(), true);
|
||||
if (ls && ls->hasPseudoLocality(tagLocalityBackup)) {
|
||||
self.logSystem.set(ls);
|
||||
self.pop();
|
||||
TraceEvent("BackupWorkerLogSystem", interf.id())
|
||||
.detail("HasBackupLocality", true)
|
||||
.detail("Tag", self.tag.toString());
|
||||
|
@ -237,12 +235,6 @@ ACTOR Future<Void> backupWorker(BackupInterface interf, InitializeBackupRequest
|
|||
.detail("Tag", self.tag.toString());
|
||||
}
|
||||
}
|
||||
when(HaltBackupRequest req = waitNext(interf.haltBackup.getFuture())) {
|
||||
req.reply.send(Void());
|
||||
TraceEvent("BackupWorkerHalted", interf.id()).detail("ReqID", req.requesterID);
|
||||
break;
|
||||
}
|
||||
when(wait(checkRemoved(db, req.recruitedEpoch, &self))) {}
|
||||
when(wait(onDone)) {
|
||||
// TODO: when backup is done, we don't exit because master would think
|
||||
// this worker failed and start recovery. Should fix the protocol between
|
||||
|
|
|
@ -29,7 +29,6 @@
|
|||
struct BackupInterface {
|
||||
constexpr static FileIdentifier file_identifier = 6762745;
|
||||
RequestStream<ReplyPromise<Void>> waitFailure;
|
||||
RequestStream<struct HaltBackupRequest> haltBackup;
|
||||
struct LocalityData locality;
|
||||
LogEpoch backupEpoch;
|
||||
UID myId;
|
||||
|
@ -41,7 +40,7 @@ struct BackupInterface {
|
|||
void initEndpoints() {}
|
||||
UID id() const { return myId; }
|
||||
NetworkAddress address() const { return waitFailure.getEndpoint().getPrimaryAddress(); }
|
||||
UID getToken() const { return haltBackup.getEndpoint().token; }
|
||||
UID getToken() const { return waitFailure.getEndpoint().token; }
|
||||
bool operator== (const BackupInterface& r) const {
|
||||
return id() == r.id();
|
||||
}
|
||||
|
@ -51,21 +50,7 @@ struct BackupInterface {
|
|||
|
||||
template <class Archive>
|
||||
void serialize(Archive& ar) {
|
||||
serializer(ar, waitFailure, haltBackup, locality, backupEpoch, myId);
|
||||
}
|
||||
};
|
||||
|
||||
struct HaltBackupRequest {
|
||||
constexpr static FileIdentifier file_identifier = 4212277;
|
||||
UID requesterID;
|
||||
ReplyPromise<Void> reply;
|
||||
|
||||
HaltBackupRequest() = default;
|
||||
explicit HaltBackupRequest(UID uid) : requesterID(uid) {}
|
||||
|
||||
template<class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, requesterID, reply);
|
||||
serializer(ar, waitFailure, locality, backupEpoch, myId);
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -593,12 +593,13 @@ ACTOR Future<Void> waitForQuietDatabase( Database cx, Reference<AsyncVar<ServerD
|
|||
if(++numSuccesses == 3) {
|
||||
TraceEvent(("QuietDatabase" + phase + "Done").c_str());
|
||||
break;
|
||||
} else {
|
||||
wait(delay(2.0));
|
||||
}
|
||||
else
|
||||
wait(delay( 2.0 ) );
|
||||
}
|
||||
} catch (Error& e) {
|
||||
if( e.code() != error_code_actor_cancelled && e.code() != error_code_attribute_not_found && e.code() != error_code_timed_out)
|
||||
if (e.code() != error_code_actor_cancelled && e.code() != error_code_attribute_not_found &&
|
||||
e.code() != error_code_timed_out)
|
||||
TraceEvent(("QuietDatabase" + phase + "Error").c_str()).error(e);
|
||||
|
||||
//Client invalid operation occurs if we don't get back a message from one of the servers, often corrected by retrying
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
#include "fdbclient/KeyRangeMap.h"
|
||||
#include "fdbclient/RunTransaction.actor.h"
|
||||
#include "fdbclient/SystemData.h"
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
#include "fdbserver/WorkerInterface.actor.h"
|
||||
#include "fdbserver/TLogInterface.h"
|
||||
#include "fdbserver/Knobs.h"
|
||||
|
@ -995,7 +996,6 @@ ACTOR Future<Void> tLogPopCore( TLogData* self, Tag inputTag, Version to, Refere
|
|||
}
|
||||
state Tag tag(tagLocality, inputTag.id);
|
||||
auto tagData = logData->getTagData(tag);
|
||||
TraceEvent("TLogPop0", logData->logId).detail("Tag", tag.toString()).detail("To", to).detail("UpTo", upTo).detail("Popped", tagData ? tagData->popped : -1);
|
||||
if (!tagData) {
|
||||
tagData = logData->createTagData(tag, upTo, true, true, false);
|
||||
} else if (upTo > tagData->popped) {
|
||||
|
|
|
@ -190,6 +190,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
|
|||
AsyncVar<bool> recoveryCompleteWrittenToCoreState;
|
||||
bool remoteLogsWrittenToCoreState;
|
||||
bool hasRemoteServers;
|
||||
AsyncTrigger backupWorkerChanged;
|
||||
|
||||
Optional<Version> recoverAt;
|
||||
Optional<Version> recoveredAt;
|
||||
|
@ -258,7 +259,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
|
|||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
default: // This should be an error at caller site.
|
||||
break;
|
||||
}
|
||||
return tag;
|
||||
|
@ -465,6 +466,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
|
|||
}
|
||||
|
||||
changes.push_back(self->recoveryCompleteWrittenToCoreState.onChange());
|
||||
changes.push_back(self->backupWorkerChanged.onTrigger());
|
||||
|
||||
ASSERT( failed.size() >= 1 );
|
||||
wait( quorum(changes, 1) || tagError<Void>( quorum( failed, 1 ), master_tlog_failed() ) );
|
||||
|
@ -1375,6 +1377,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
|
|||
}
|
||||
logset->backupWorkers.push_back(worker);
|
||||
}
|
||||
backupWorkerChanged.trigger();
|
||||
}
|
||||
|
||||
ACTOR static Future<Void> monitorLog(Reference<AsyncVar<OptionalInterface<TLogInterface>>> logServer, Reference<AsyncVar<bool>> failed) {
|
||||
|
|
|
@ -1267,7 +1267,6 @@ ACTOR Future<std::map<LogEpoch, std::map<Tag, Version>>> getBackupProgress(Refer
|
|||
.detail("Version", status.version)
|
||||
.detail("Tag", status.tag.toString());
|
||||
}
|
||||
wait(tr.commit());
|
||||
return progress;
|
||||
} catch (Error& e) {
|
||||
wait(tr.onError(e));
|
||||
|
|
|
@ -1121,7 +1121,6 @@ ACTOR Future<Void> workerServer(
|
|||
|
||||
startRole(Role::BACKUP, recruited.id(), interf.id());
|
||||
DUMPTOKEN(recruited.waitFailure);
|
||||
DUMPTOKEN(recruited.haltBackup);
|
||||
|
||||
Future<Void> backupProcess = backupWorker(recruited, req, dbInfo);
|
||||
errorForwarders.add(forwardError(errors, Role::BACKUP, recruited.id(), backupProcess));
|
||||
|
|
Loading…
Reference in New Issue