FastRestore: Resolve review comments

1) Do not keep restore role data (e.g., masterData) in restore worker;
2) Change function parameter list by only passing in the needed variables in role data;
3) Remove unneccessary files vector from masterData;
4) Change typos in comments and some functions name.
This commit is contained in:
Meng Xu 2019-07-24 16:59:05 -07:00
parent 701676dbd2
commit f1741aa90d
10 changed files with 179 additions and 183 deletions

View File

@ -37,7 +37,9 @@
ACTOR static Future<Void> handleSendMutationVectorRequest(RestoreSendMutationVectorVersionedRequest req, Reference<RestoreApplierData> self);
ACTOR static Future<Void> handleApplyToDBRequest(RestoreVersionBatchRequest req, Reference<RestoreApplierData> self, Database cx);
ACTOR Future<Void> restoreApplierCore(Reference<RestoreApplierData> self, RestoreApplierInterface applierInterf, Database cx) {
ACTOR Future<Void> restoreApplierCore(RestoreApplierInterface applierInterf, int nodeIndex, Database cx) {
state Reference<RestoreApplierData> self = Reference<RestoreApplierData>( new RestoreApplierData(applierInterf.id(), nodeIndex) );
state ActorCollection actors(false);
state Future<Void> exitRole = Never();
state double lastLoopTopTime;

View File

@ -127,7 +127,7 @@ struct RestoreApplierData : RestoreRoleData, public ReferenceCounted<RestoreAppl
};
ACTOR Future<Void> restoreApplierCore(Reference<RestoreApplierData> self, RestoreApplierInterface applierInterf, Database cx);
ACTOR Future<Void> restoreApplierCore(RestoreApplierInterface applierInterf, int nodeIndex, Database cx);
#include "flow/unactorcompiler.h"

View File

@ -32,9 +32,10 @@ bool isRangeMutation(MutationRef m);
void splitMutation(Reference<RestoreLoaderData> self, MutationRef m, Arena& mvector_arena, VectorRef<MutationRef>& mvector, Arena& nodeIDs_arena, VectorRef<UID>& nodeIDs) ;
void _parseSerializedMutation(VersionedMutationsMap *kvOps, SerializedMutationListMap *mutationMap, bool isSampling = false);
ACTOR Future<Void> handleRestoreSysInfoRequest(RestoreSysInfoRequest req, Reference<RestoreLoaderData> self);
ACTOR Future<Void> handleSetApplierKeyRangeVectorRequest(RestoreSetApplierKeyRangeVectorRequest req, Reference<RestoreLoaderData> self);
ACTOR Future<Void> handleLoadFileRequest(RestoreLoadFileRequest req, Reference<RestoreLoaderData> self, bool isSampling = false);
ACTOR Future<Void> registerMutationsToApplier(Reference<RestoreLoaderData> self, VersionedMutationsMap *kvOps, bool isRangeFile, Version startVersion, Version endVersion);
ACTOR Future<Void> sendMutationsToApplier(Reference<RestoreLoaderData> self, VersionedMutationsMap *kvOps, bool isRangeFile, Version startVersion, Version endVersion);
ACTOR static Future<Void> _parseLogFileToMutationsOnLoader(SerializedMutationListMap *mutationMap,
std::map<Standalone<StringRef>, uint32_t> *mutationPartMap,
Reference<IBackupContainer> bc, Version version,
@ -46,7 +47,9 @@ ACTOR static Future<Void> _parseRangeFileToMutationsOnLoader(VersionedMutationsM
std::string fileName, int64_t readOffset_input, int64_t readLen_input,KeyRange restoreRange);
ACTOR Future<Void> restoreLoaderCore(Reference<RestoreLoaderData> self, RestoreLoaderInterface loaderInterf, Database cx) {
ACTOR Future<Void> restoreLoaderCore(RestoreLoaderInterface loaderInterf, int nodeIndex, Database cx) {
state Reference<RestoreLoaderData> self = Reference<RestoreLoaderData>( new RestoreLoaderData(loaderInterf.id(), nodeIndex) );
state ActorCollection actors(false);
state Future<Void> exitRole = Never();
state double lastLoopTopTime;
@ -67,6 +70,10 @@ ACTOR Future<Void> restoreLoaderCore(Reference<RestoreLoaderData> self, RestoreL
requestTypeStr = "heartbeat";
actors.add(handleHeartbeat(req, loaderInterf.id()));
}
when ( RestoreSysInfoRequest req = waitNext(loaderInterf.updateRestoreSysInfo.getFuture()) ) {
requestTypeStr = "updateRestoreSysInfo";
actors.add( handleRestoreSysInfoRequest(req, self) );
}
when ( RestoreSetApplierKeyRangeVectorRequest req = waitNext(loaderInterf.setApplierKeyRangeVectorRequest.getFuture()) ) {
requestTypeStr = "setApplierKeyRangeVectorRequest";
actors.add(handleSetApplierKeyRangeVectorRequest(req, self));
@ -98,6 +105,24 @@ ACTOR Future<Void> restoreLoaderCore(Reference<RestoreLoaderData> self, RestoreL
return Void();
}
// Assume: Only update the local data if it (applierInterf) has not been set
ACTOR Future<Void> handleRestoreSysInfoRequest(RestoreSysInfoRequest req, Reference<RestoreLoaderData> self) {
TraceEvent("FastRestore").detail("HandleRestoreSysInfoRequest", self->id());
ASSERT(self.isValid());
// The loader has received the appliers interfaces
if ( !self->appliersInterf.empty() ) {
req.reply.send(RestoreCommonReply(self->id()));
return Void();
}
self->appliersInterf = req.sysInfo.appliers;
req.reply.send(RestoreCommonReply(self->id()) );
return Void();
}
ACTOR Future<Void> handleSetApplierKeyRangeVectorRequest(RestoreSetApplierKeyRangeVectorRequest req, Reference<RestoreLoaderData> self) {
// Idempodent operation. OK to re-execute the duplicate cmd
if ( self->range2Applier.empty() ) {
@ -139,7 +164,7 @@ ACTOR Future<Void> _processLoadingParam(LoadingParam param, Reference<RestoreLoa
_parseSerializedMutation(&kvOps, &mutationMap);
}
wait( registerMutationsToApplier(self, &kvOps, param.isRangeFile, param.prevVersion, param.endVersion) ); // Send the parsed mutation to applier who will apply the mutation to DB
wait( sendMutationsToApplier(self, &kvOps, param.isRangeFile, param.prevVersion, param.endVersion) ); // Send the parsed mutation to applier who will apply the mutation to DB
TraceEvent("FastRestore").detail("Loader", self->id()).detail("FinishLoadingFile", param.filename);
@ -160,14 +185,14 @@ ACTOR Future<Void> handleLoadFileRequest(RestoreLoadFileRequest req, Reference<R
}
// TODO: This function can be revised better
ACTOR Future<Void> registerMutationsToApplier(Reference<RestoreLoaderData> self,
ACTOR Future<Void> sendMutationsToApplier(Reference<RestoreLoaderData> self,
VersionedMutationsMap *pkvOps,
bool isRangeFile, Version startVersion, Version endVersion) {
state VersionedMutationsMap &kvOps = *pkvOps;
state int kvCount = 0;
state int splitMutationIndex = 0;
TraceEvent("FastRestore").detail("RegisterMutationToApplier", self->id()).detail("IsRangeFile", isRangeFile)
TraceEvent("FastRestore").detail("SendMutationToApplier", self->id()).detail("IsRangeFile", isRangeFile)
.detail("StartVersion", startVersion).detail("EndVersion", endVersion);
// Ensure there is a mutation request sent at endVersion, so that applier can advance its notifiedVersion
@ -233,7 +258,7 @@ ACTOR Future<Void> registerMutationsToApplier(Reference<RestoreLoaderData> self,
}
} // Mutations at the same version
// Register the mutations to appliers for each version
// Send the mutations to appliers for each version
for (auto &applierID : applierIDs) {
requests.push_back( std::make_pair(applierID, RestoreSendMutationVectorVersionedRequest(prevVersion, commitVersion, isRangeFile, applierMutationsBuffer[applierID])) );
applierMutationsBuffer[applierID].pop_front(applierMutationsBuffer[applierID].size());
@ -245,7 +270,7 @@ ACTOR Future<Void> registerMutationsToApplier(Reference<RestoreLoaderData> self,
prevVersion = commitVersion;
} // all versions of mutations
TraceEvent("FastRestore").detail("LoaderRegisterMutationOnAppliers", kvCount);
TraceEvent("FastRestore").detail("LoaderSendMutationOnAppliers", kvCount);
return Void();
}
@ -433,7 +458,7 @@ ACTOR static Future<Void> _parseRangeFileToMutationsOnLoader(VersionedMutationsM
while(rangeStart < rangeEnd && !restoreRange.contains(blockData[rangeStart].key)) {
++rangeStart;
}
// Side end backwaself, stop if something at (rangeEnd-1) is found in range
// Side end from back, stop if something at (rangeEnd-1) is found in range
while(rangeEnd > rangeStart && !restoreRange.contains(blockData[rangeEnd - 1].key)) {
--rangeEnd;
}

View File

@ -62,7 +62,7 @@ struct RestoreLoaderData : RestoreRoleData, public ReferenceCounted<RestoreLoade
role = RestoreRole::Loader;
}
~RestoreLoaderData() {} = default;
~RestoreLoaderData() = default;
std::string describeNode() {
std::stringstream ss;
@ -101,7 +101,7 @@ struct RestoreLoaderData : RestoreRoleData, public ReferenceCounted<RestoreLoade
};
ACTOR Future<Void> restoreLoaderCore(Reference<RestoreLoaderData> self, RestoreLoaderInterface loaderInterf, Database cx);
ACTOR Future<Void> restoreLoaderCore(RestoreLoaderInterface loaderInterf, int nodeIndex, Database cx);
#include "flow/unactorcompiler.h"
#endif

View File

@ -36,10 +36,14 @@
#include "flow/actorcompiler.h" // This must be the last #include.
ACTOR static Future<Void> _clearDB(Database cx);
ACTOR static Future<Void> _collectBackupFiles(Reference<RestoreMasterData> self, Database cx, RestoreRequest request);
ACTOR static Future<Void> _collectBackupFiles(Reference<IBackupContainer> bc, std::vector<RestoreFileFR>* output_files, Database cx, RestoreRequest request);
ACTOR static Future<Version> processRestoreRequest(RestoreRequest request, Reference<RestoreMasterData> self, Database cx);
ACTOR static Future<Void> startProcessRestoreRequests(Reference<RestoreMasterData> self, Database cx);
ACTOR static Future<Void> distributeWorkloadPerVersionBatch(Reference<RestoreMasterData> self, Database cx, RestoreRequest request, VersionBatch versionBatch);
ACTOR static Future<Void> recruitRestoreRoles(Reference<RestoreWorkerData> masterWorker, Reference<RestoreMasterData> masterData);
ACTOR static Future<Void> distributeRestoreSysInfo(Reference<RestoreWorkerData> masterWorker, Reference<RestoreMasterData> masterData);
ACTOR static Future<Standalone<VectorRef<RestoreRequest>>> collectRestoreRequests(Database cx);
ACTOR static Future<Void> initializeVersionBatch(Reference<RestoreMasterData> self);
@ -49,6 +53,80 @@ ACTOR static Future<Void> notifyRestoreCompleted(Reference<RestoreMasterData> se
void dummySampleWorkload(Reference<RestoreMasterData> self);
ACTOR Future<Void> startRestoreMaster(Reference<RestoreWorkerData> masterWorker, Database cx) {
state Reference<RestoreMasterData> self = Reference<RestoreMasterData>(new RestoreMasterData());
// recruitRestoreRoles must come after masterWorker has finished collectWorkerInterface
wait( recruitRestoreRoles(masterWorker, self) );
wait( distributeRestoreSysInfo(masterWorker, self) );
wait( startProcessRestoreRequests(self, cx) );
return Void();
}
// RestoreWorker that has restore master role: Recruite a role for each worker
ACTOR Future<Void> recruitRestoreRoles(Reference<RestoreWorkerData> masterWorker, Reference<RestoreMasterData> masterData) {
TraceEvent("FastRestore").detail("RecruitRestoreRoles", masterWorker->workerInterfaces.size())
.detail("NumLoaders", opConfig.num_loaders).detail("NumAppliers", opConfig.num_appliers);
ASSERT( masterData.isValid() );
ASSERT( opConfig.num_loaders > 0 && opConfig.num_appliers > 0 );
ASSERT( opConfig.num_loaders + opConfig.num_appliers <= masterWorker->workerInterfaces.size() ); // We assign 1 role per worker for now
// Assign a role to each worker
state int nodeIndex = 0;
state RestoreRole role;
std::map<UID, RestoreRecruitRoleRequest> requests;
for (auto &workerInterf : masterWorker->workerInterfaces) {
if ( nodeIndex >= 0 && nodeIndex < opConfig.num_appliers ) {
// [0, numApplier) are appliers
role = RestoreRole::Applier;
} else if ( nodeIndex >= opConfig.num_appliers && nodeIndex < opConfig.num_loaders + opConfig.num_appliers ) {
// [numApplier, numApplier + numLoader) are loaders
role = RestoreRole::Loader;
}
TraceEvent("FastRestore").detail("Role", getRoleStr(role)).detail("WorkerNode", workerInterf.first);
requests[workerInterf.first] = RestoreRecruitRoleRequest(role, nodeIndex);
nodeIndex++;
}
state std::vector<RestoreRecruitRoleReply> replies;
wait( getBatchReplies(&RestoreWorkerInterface::recruitRole, masterWorker->workerInterfaces, requests, &replies) );
for (auto& reply : replies) {
if ( reply.role == RestoreRole::Applier ) {
ASSERT_WE_THINK(reply.applier.present());
masterData->appliersInterf[reply.applier.get().id()] = reply.applier.get();
} else if ( reply.role == RestoreRole::Loader ) {
ASSERT_WE_THINK(reply.loader.present());
masterData->loadersInterf[reply.loader.get().id()] = reply.loader.get();
} else {
TraceEvent(SevError, "FastRestore").detail("RecruitRestoreRoles_InvalidRole", reply.role);
}
}
TraceEvent("FastRestore").detail("RecruitRestoreRolesDone", masterWorker->workerInterfaces.size());
return Void();
}
ACTOR Future<Void> distributeRestoreSysInfo(Reference<RestoreWorkerData> masterWorker, Reference<RestoreMasterData> masterData) {
ASSERT( masterData.isValid() );
ASSERT( !masterData->loadersInterf.empty() );
RestoreSysInfo sysInfo(masterData->appliersInterf);
std::vector<std::pair<UID, RestoreSysInfoRequest>> requests;
for (auto &loader : masterData->loadersInterf) {
requests.push_back( std::make_pair(loader.first, RestoreSysInfoRequest(sysInfo)) );
}
TraceEvent("FastRestore").detail("DistributeRestoreSysInfoToLoaders", masterData->loadersInterf.size());
wait( sendBatchRequests(&RestoreLoaderInterface::updateRestoreSysInfo, masterData->loadersInterf, requests) );
return Void();
}
// The server of the restore master. It drives the restore progress with the following steps:
// 1) Lock database and clear the normal keyspace
// 2) Wait on each RestoreRequest, which is sent by RestoreAgent operated by DBA
@ -58,10 +136,8 @@ void dummySampleWorkload(Reference<RestoreMasterData> self);
// 3.3) Construct requests of which file should be loaded by which loader, and send requests to loaders;
// 4) After process all restore requests, finish restore by cleaning up the restore related system key
// and ask all restore roles to quit.
ACTOR Future<Void> startRestoreMaster(Reference<RestoreMasterData> self, Database cx) {
state int checkNum = 0;
ACTOR Future<Void> startProcessRestoreRequests(Reference<RestoreMasterData> self, Database cx) {
state UID randomUID = g_random->randomUniqueID();
TraceEvent("FastRestore").detail("RestoreMaster", "WaitOnRestoreRequests");
state Standalone<VectorRef<RestoreRequest>> restoreRequests = wait( collectRestoreRequests(cx) );
@ -70,20 +146,25 @@ ACTOR Future<Void> startRestoreMaster(Reference<RestoreMasterData> self, Databas
wait( _clearDB(cx) );
// Step: Perform the restore requests
for ( auto &it : restoreRequests ) {
TraceEvent("FastRestore").detail("RestoreRequestInfo", it.toString());
Version ver = wait( processRestoreRequest(it, self, cx) );
state int restoreIndex = 0;
try {
for ( restoreIndex = 0; restoreIndex < restoreRequests.size(); restoreIndex++ ) {
RestoreRequest& request = restoreRequests[restoreIndex];
TraceEvent("FastRestore").detail("RestoreRequestInfo", request.toString());
Version ver = wait( processRestoreRequest(request, self, cx) );
}
} catch(Error &e) {
TraceEvent(SevError, "FastRestoreFailed").detail("RestoreRequest", restoreRequests[restoreIndex].toString());
}
// Step: Notify all restore requests have been handled by cleaning up the restore keys
wait( notifyRestoreCompleted(self, cx) );
try {
wait( unlockDatabase(cx,randomUID) );
} catch(Error &e) {
printf(" unlockDB fails. uid:%s\n", randomUID.toString().c_str());
TraceEvent(SevError, "UnlockDBFailed").detail("UID", randomUID.toString());
}
TraceEvent("FastRestore").detail("RestoreMasterComplete", self->id());
@ -91,9 +172,15 @@ ACTOR Future<Void> startRestoreMaster(Reference<RestoreMasterData> self, Databas
}
ACTOR static Future<Version> processRestoreRequest(RestoreRequest request, Reference<RestoreMasterData> self, Database cx) {
wait( _collectBackupFiles(self, cx, request) );
self->constructFilesWithVersionRange();
self->buildVersionBatches();
state std::vector<RestoreFileFR> files;
state std::vector<RestoreFileFR> allFiles;
self->initBackupContainer(request.url);
wait( _collectBackupFiles(self->bc, &files, cx, request) ); // Get all backup files' description and save them to files
self->constructFilesWithVersionRange(files, allFiles); // Assign modified files to allFiles
self->buildVersionBatches(allFiles, self->versionBatches); // Divide files into version batches
state std::map<Version, VersionBatch>::iterator versionBatch;
for (versionBatch = self->versionBatches.begin(); versionBatch != self->versionBatches.end(); versionBatch++) {
wait( initializeVersionBatch(self) );
@ -237,10 +324,10 @@ ACTOR static Future<Standalone<VectorRef<RestoreRequest>>> collectRestoreRequest
return restoreRequests;
}
// NOTE: This function can now get the backup file descriptors
ACTOR static Future<Void> _collectBackupFiles(Reference<RestoreMasterData> self, Database cx, RestoreRequest request) {
self->initBackupContainer(request.url);
state BackupDescription desc = wait(self->bc->describeBackup());
// Collect the backup files' description into output_files by reading the backupContainer bc.
ACTOR static Future<Void> _collectBackupFiles(Reference<IBackupContainer> bc, std::vector<RestoreFileFR>* output_files, Database cx, RestoreRequest request) {
state std::vector<RestoreFileFR> &files = *output_files;
state BackupDescription desc = wait(bc->describeBackup());
// TODO: Delete this and see if it works
wait(desc.resolveVersionTimes(cx));
@ -249,27 +336,27 @@ ACTOR static Future<Void> _collectBackupFiles(Reference<RestoreMasterData> self,
if(request.targetVersion == invalidVersion && desc.maxRestorableVersion.present())
request.targetVersion = desc.maxRestorableVersion.get();
Optional<RestorableFileSet> restorable = wait(self->bc->getRestoreSet(request.targetVersion));
Optional<RestorableFileSet> restorable = wait(bc->getRestoreSet(request.targetVersion));
if(!restorable.present()) {
TraceEvent(SevWarn, "FastRestore").detail("NotRestorable", request.targetVersion);
throw restore_missing_data();
}
if (!self->files.empty()) {
TraceEvent(SevError, "FastRestore").detail("ClearOldFiles", self->files.size());
self->files.clear();
if (!files.empty()) {
TraceEvent(SevError, "FastRestore").detail("ClearOldFiles", files.size());
files.clear();
}
for(const RangeFile &f : restorable.get().ranges) {
TraceEvent("FastRestore").detail("RangeFile", f.toString());
RestoreFileFR file(f.version, f.fileName, true, f.blockSize, f.fileSize, f.version, f.version);
self->files.push_back(file);
files.push_back(file);
}
for(const LogFile &f : restorable.get().logs) {
TraceEvent("FastRestore").detail("LogFile", f.toString());
RestoreFileFR file(f.beginVersion, f.fileName, false, f.blockSize, f.fileSize, f.endVersion, f.beginVersion);
self->files.push_back(file);
files.push_back(file);
}
return Void();

View File

@ -36,6 +36,7 @@
#include "fdbserver/CoordinationInterface.h"
#include "fdbserver/RestoreUtil.h"
#include "fdbserver/RestoreRoleCommon.actor.h"
#include "fdbserver/RestoreWorker.actor.h"
#include "flow/actorcompiler.h" // has to be last include
@ -57,10 +58,6 @@ struct RestoreMasterData : RestoreRoleData, public ReferenceCounted<RestoreMast
std::map<Standalone<KeyRef>, UID> range2Applier; // KeyRef is the inclusive lower bound of the key range the applier (UID) is responsible for
std::map<Version, VersionBatch> versionBatches; // key is the beginVersion of the version batch
// Temporary variables to hold files and data to restore
std::vector<RestoreFileFR> allFiles; // All backup files to be processed in all version batches
std::vector<RestoreFileFR> files; // Backup files to be parsed and applied: range and log files in 1 version batch
int batchIndex;
Reference<IBackupContainer> bc; // Backup container is used to read backup files
@ -81,7 +78,8 @@ struct RestoreMasterData : RestoreRoleData, public ReferenceCounted<RestoreMast
return ss.str();
}
void buildVersionBatches() {
// Split allFiles into multiple versionBatches based on files' version
void buildVersionBatches(const std::vector<RestoreFileFR>& allFiles, std::map<Version, VersionBatch>& versionBatches) {
// A version batch includes a log file
// Because log file's verion range does not overlap, we use log file's version range as the version range of a version batch
// Create a version batch for a log file
@ -133,7 +131,8 @@ struct RestoreMasterData : RestoreRoleData, public ReferenceCounted<RestoreMast
}
}
void constructFilesWithVersionRange() {
// Parse file's name to get beginVersion and endVersion of the file; and assign files to allFiles
void constructFilesWithVersionRange(std::vector<RestoreFileFR> &files, std::vector<RestoreFileFR>& allFiles) {
printf("[INFO] constructFilesWithVersionRange for num_files:%ld\n", files.size());
allFiles.clear();
for (int i = 0; i < files.size(); i++) {
@ -176,7 +175,7 @@ struct RestoreMasterData : RestoreRoleData, public ReferenceCounted<RestoreMast
}
};
ACTOR Future<Void> startRestoreMaster(Reference<RestoreMasterData> self, Database cx);
ACTOR Future<Void> startRestoreMaster(Reference<RestoreWorkerData> masterWorker, Database cx);
#include "flow/unactorcompiler.h"
#endif

View File

@ -37,7 +37,7 @@ struct RestoreWorkerData;
// id is the id of the worker to be monitored
// This actor is used for both restore loader and restore applier
ACTOR Future<Void> handleHeartbeat(RestoreSimpleRequest req, UID id) {
wait( delay(g_random->random01() + 0.01) ); // Random jitter reduces heat beat monitor's pressure
wait( delayJittered(5.0) ); // Random jitter reduces heat beat monitor's pressure
req.reply.send(RestoreCommonReply(id));
return Void();

View File

@ -35,12 +35,12 @@
#include "flow/genericactors.actor.h"
#include "flow/Hash3.h"
#include "flow/ActorCollection.h"
#include "fdbserver/RestoreUtil.h"
#include "fdbserver/RestoreWorkerInterface.actor.h"
#include "fdbserver/RestoreCommon.actor.h"
#include "fdbserver/RestoreRoleCommon.actor.h"
#include "fdbserver/RestoreLoader.actor.h"
#include "fdbserver/RestoreApplier.actor.h"
// #include "fdbserver/RestoreUtil.h"
// #include "fdbserver/RestoreWorkerInterface.actor.h"
// #include "fdbserver/RestoreCommon.actor.h"
// #include "fdbserver/RestoreRoleCommon.actor.h"
// #include "fdbserver/RestoreLoader.actor.h"
// #include "fdbserver/RestoreApplier.actor.h"
#include "fdbserver/RestoreMaster.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
@ -61,43 +61,12 @@ ACTOR Future<Void> handlerTerminateWorkerRequest(RestoreSimpleRequest req, Refer
ACTOR Future<Void> monitorWorkerLiveness(Reference<RestoreWorkerData> self);
ACTOR Future<Void> handleRecruitRoleRequest(RestoreRecruitRoleRequest req, Reference<RestoreWorkerData> self, ActorCollection *actors, Database cx);
ACTOR Future<Void> collectRestoreWorkerInterface(Reference<RestoreWorkerData> self, Database cx, int min_num_workers = 2);
ACTOR Future<Void> recruitRestoreRoles(Reference<RestoreWorkerData> self);
ACTOR Future<Void> monitorleader(Reference<AsyncVar<RestoreWorkerInterface>> leader, Database cx, RestoreWorkerInterface myWorkerInterf);
ACTOR Future<Void> startRestoreWorkerLeader(Reference<RestoreWorkerData> self, RestoreWorkerInterface workerInterf, Database cx);
ACTOR Future<Void> handleRestoreSysInfoRequest(RestoreSysInfoRequest req, Reference<RestoreWorkerData> self);
template<> Tuple Codec<ERestoreState>::pack(ERestoreState const &val);
template<> ERestoreState Codec<ERestoreState>::unpack(Tuple const &val);
// Each restore worker (a process) is assigned for a role.
// MAYBE Later: We will support multiple restore roles on a worker
struct RestoreWorkerData : NonCopyable, public ReferenceCounted<RestoreWorkerData> {
UID workerID;
std::map<UID, RestoreWorkerInterface> workerInterfaces; // UID is worker's node id, RestoreWorkerInterface is worker's communication workerInterface
// Restore Roles
Optional<RestoreLoaderInterface> loaderInterf;
Reference<RestoreLoaderData> loaderData;
Optional<RestoreApplierInterface> applierInterf;
Reference<RestoreApplierData> applierData;
Reference<RestoreMasterData> masterData;
uint32_t inProgressFlag = 0; // To avoid race between duplicate message delivery that invokes the same actor multiple times
UID id() const { return workerID; };
RestoreWorkerData() = default;
~RestoreWorkerData() {
printf("[Exit] Worker:%s RestoreWorkerData is deleted\n", workerID.toString().c_str());
}
std::string describeNode() {
std::stringstream ss;
ss << "RestoreWorker workerID:" << workerID.toString();
return ss.str();
}
};
// Remove the worker interface from restoreWorkerKey and remove its roles interfaces from their keys.
ACTOR Future<Void> handlerTerminateWorkerRequest(RestoreSimpleRequest req, Reference<RestoreWorkerData> self, RestoreWorkerInterface workerInterf, Database cx) {
@ -135,9 +104,8 @@ ACTOR Future<Void> handleRecruitRoleRequest(RestoreRecruitRoleRequest req, Refer
DUMPTOKEN(recruited.initVersionBatch);
DUMPTOKEN(recruited.collectRestoreRoleInterfaces);
DUMPTOKEN(recruited.finishRestore);
self->loaderData = Reference<RestoreLoaderData>( new RestoreLoaderData(self->loaderInterf.get().id(), req.nodeIndex) );
actors->add( restoreLoaderCore(self->loaderData, self->loaderInterf.get(), cx) );
TraceEvent("FastRestore").detail("LoaderRecruited", self->loaderData->id());
actors->add( restoreLoaderCore(self->loaderInterf.get(), req.nodeIndex, cx) );
TraceEvent("FastRestore").detail("RecruitedLoaderNodeIndex", req.nodeIndex);
req.reply.send(RestoreRecruitRoleReply(self->id(), RestoreRole::Loader, self->loaderInterf.get()));
} else if (req.role == RestoreRole::Applier) {
ASSERT( !self->applierInterf.present() );
@ -149,9 +117,8 @@ ACTOR Future<Void> handleRecruitRoleRequest(RestoreRecruitRoleRequest req, Refer
DUMPTOKEN(recruited.initVersionBatch);
DUMPTOKEN(recruited.collectRestoreRoleInterfaces);
DUMPTOKEN(recruited.finishRestore);
self->applierData = Reference<RestoreApplierData>( new RestoreApplierData(self->applierInterf.get().id(), req.nodeIndex) );
actors->add( restoreApplierCore(self->applierData, self->applierInterf.get(), cx) );
TraceEvent("FastRestore").detail("ApplierRecruited", self->applierData->id());
actors->add( restoreApplierCore(self->applierInterf.get(), req.nodeIndex, cx) );
TraceEvent("FastRestore").detail("RecruitedApplierNodeIndex", req.nodeIndex);
req.reply.send(RestoreRecruitRoleReply(self->id(), RestoreRole::Applier, self->applierInterf.get()));
} else {
TraceEvent(SevError, "FastRestore").detail("HandleRecruitRoleRequest", "UnknownRole"); //.detail("Request", req.printable());
@ -160,26 +127,6 @@ ACTOR Future<Void> handleRecruitRoleRequest(RestoreRecruitRoleRequest req, Refer
return Void();
}
// Assume: Only update the local data if it (applierInterf) has not been set
ACTOR Future<Void> handleRestoreSysInfoRequest(RestoreSysInfoRequest req, Reference<RestoreWorkerData> self) {
TraceEvent("FastRestore").detail("HandleRestoreSysInfoRequest", self->id());
// Applier does not need to know appliers interfaces
if ( !self->loaderData.isValid() ) {
req.reply.send(RestoreCommonReply(self->id()));
return Void();
}
// The loader has received the appliers interfaces
if ( !self->loaderData->appliersInterf.empty() ) {
req.reply.send(RestoreCommonReply(self->id()));
return Void();
}
self->loaderData->appliersInterf = req.sysInfo.appliers;
req.reply.send(RestoreCommonReply(self->id()) );
return Void();
}
// Read restoreWorkersKeys from DB to get each restore worker's restore workerInterface and set it to self->workerInterfaces
// This is done before we assign restore roles for restore workers
@ -242,69 +189,8 @@ void initRestoreWorkerConfig() {
.detail("TxnBatchSize", opConfig.transactionBatchSizeThreshold);
}
// RestoreWorker that has restore master role: Recruite a role for each worker
ACTOR Future<Void> recruitRestoreRoles(Reference<RestoreWorkerData> self) {
TraceEvent("FastRestore").detail("RecruitRestoreRoles", self->workerInterfaces.size())
.detail("NumLoaders", opConfig.num_loaders).detail("NumAppliers", opConfig.num_appliers);
ASSERT( self->masterData.isValid() );
ASSERT( opConfig.num_loaders > 0 && opConfig.num_appliers > 0 );
ASSERT( opConfig.num_loaders + opConfig.num_appliers <= self->workerInterfaces.size() ); // We assign 1 role per worker for now
// Assign a role to each worker
state int nodeIndex = 0;
state RestoreRole role;
std::map<UID, RestoreRecruitRoleRequest> requests;
for (auto &workerInterf : self->workerInterfaces) {
if ( nodeIndex >= 0 && nodeIndex < opConfig.num_appliers ) {
// [0, numApplier) are appliers
role = RestoreRole::Applier;
} else if ( nodeIndex >= opConfig.num_appliers && nodeIndex < opConfig.num_loaders + opConfig.num_appliers ) {
// [numApplier, numApplier + numLoader) are loaders
role = RestoreRole::Loader;
}
TraceEvent("FastRestore").detail("Role", getRoleStr(role)).detail("WorkerNode", workerInterf.first);
requests[workerInterf.first] = RestoreRecruitRoleRequest(role, nodeIndex);
nodeIndex++;
}
state std::vector<RestoreRecruitRoleReply> replies;
wait( getBatchReplies(&RestoreWorkerInterface::recruitRole, self->workerInterfaces, requests, &replies) );
for (auto& reply : replies) {
if ( reply.role == RestoreRole::Applier ) {
ASSERT_WE_THINK(reply.applier.present());
self->masterData->appliersInterf[reply.applier.get().id()] = reply.applier.get();
} else if ( reply.role == RestoreRole::Loader ) {
ASSERT_WE_THINK(reply.loader.present());
self->masterData->loadersInterf[reply.loader.get().id()] = reply.loader.get();
} else {
TraceEvent(SevError, "FastRestore").detail("RecruitRestoreRoles_InvalidRole", reply.role);
}
}
TraceEvent("FastRestore").detail("RecruitRestoreRolesDone", self->workerInterfaces.size());
return Void();
}
ACTOR Future<Void> distributeRestoreSysInfo(Reference<RestoreWorkerData> self) {
ASSERT( self->masterData.isValid() );
ASSERT( !self->masterData->loadersInterf.empty() );
RestoreSysInfo sysInfo(self->masterData->appliersInterf);
std::vector<std::pair<UID, RestoreSysInfoRequest>> requests;
for (auto &worker : self->workerInterfaces) {
requests.push_back( std::make_pair(worker.first, RestoreSysInfoRequest(sysInfo)) );
}
TraceEvent("FastRestore").detail("DistributeRestoreSysInfo", self->workerInterfaces.size());
wait( sendBatchRequests(&RestoreWorkerInterface::updateRestoreSysInfo, self->workerInterfaces, requests) );
return Void();
}
// RestoreWorkerLeader is the worker that runs RestoreMaster role
ACTOR Future<Void> startRestoreWorkerLeader(Reference<RestoreWorkerData> self, RestoreWorkerInterface workerInterf, Database cx) {
self->masterData = Reference<RestoreMasterData>(new RestoreMasterData());
// We must wait for enough time to make sure all restore workers have registered their workerInterfaces into the DB
printf("[INFO][Master] NodeID:%s Restore master waits for agents to register their workerKeys\n",
workerInterf.id().toString().c_str());
@ -316,12 +202,7 @@ ACTOR Future<Void> startRestoreWorkerLeader(Reference<RestoreWorkerData> self, R
// TODO: Needs to keep this monitor's future. May use actorCollection
state Future<Void> workersFailureMonitor = monitorWorkerLiveness(self);
// recruitRestoreRoles must be after collectWorkerInterface
wait( recruitRestoreRoles(self) );
wait( distributeRestoreSysInfo(self) );
wait( startRestoreMaster(self->masterData, cx) );
wait( startRestoreMaster(self, cx) );
return Void();
}
@ -351,10 +232,6 @@ ACTOR Future<Void> startRestoreWorker(Reference<RestoreWorkerData> self, Restore
requestTypeStr = "recruitRole";
actors.add( handleRecruitRoleRequest(req, self, &actors, cx) );
}
when ( RestoreSysInfoRequest req = waitNext(interf.updateRestoreSysInfo.getFuture()) ) {
requestTypeStr = "updateRestoreSysInfo";
actors.add( handleRestoreSysInfoRequest(req, self) );
}
when ( RestoreSimpleRequest req = waitNext(interf.terminateWorker.getFuture()) ) {
// Destroy the worker at the end of the restore
requestTypeStr = "terminateWorker";

View File

@ -57,11 +57,14 @@ struct RestoreSysInfo;
struct RestoreApplierInterface;
// RestoreSysInfo includes information each (type of) restore roles should know.
// At this moment, it only include appliers. We keep the name for future extension.
// TODO: If it turns out this struct only has appliers in the final version, we will rename it to a more specific name, e.g., AppliersMap
struct RestoreSysInfo {
std::map<UID, RestoreApplierInterface> appliers;
RestoreSysInfo() = default;
explicit RestoreSysInfo(std::map<UID, RestoreApplierInterface> appliers) : appliers(appliers) {}
explicit RestoreSysInfo(const std::map<UID, RestoreApplierInterface> appliers) : appliers(appliers) {}
template <class Ar>
void serialize(Ar& ar) {
@ -74,7 +77,6 @@ struct RestoreWorkerInterface {
RequestStream<RestoreSimpleRequest> heartbeat;
RequestStream<RestoreRecruitRoleRequest> recruitRole;
RequestStream<RestoreSysInfoRequest> updateRestoreSysInfo;
RequestStream<RestoreSimpleRequest> terminateWorker;
bool operator == (RestoreWorkerInterface const& r) const { return id() == r.id(); }
@ -87,7 +89,6 @@ struct RestoreWorkerInterface {
void initEndpoints() {
heartbeat.getEndpoint( TaskClusterController );
recruitRole.getEndpoint( TaskClusterController );// Q: Why do we need this?
updateRestoreSysInfo.getEndpoint(TaskClusterController);
terminateWorker.getEndpoint( TaskClusterController );
interfID = g_random->randomUniqueID();
@ -95,7 +96,7 @@ struct RestoreWorkerInterface {
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, interfID, heartbeat, updateRestoreSysInfo, recruitRole, terminateWorker);
serializer(ar, interfID, heartbeat, recruitRole, terminateWorker);
}
};
@ -125,6 +126,7 @@ struct RestoreRoleInterface {
struct RestoreLoaderInterface : RestoreRoleInterface {
RequestStream<RestoreSimpleRequest> heartbeat;
RequestStream<RestoreSysInfoRequest> updateRestoreSysInfo;
RequestStream<RestoreSetApplierKeyRangeVectorRequest> setApplierKeyRangeVectorRequest;
RequestStream<RestoreLoadFileRequest> loadFile;
RequestStream<RestoreVersionBatchRequest> initVersionBatch;
@ -143,6 +145,7 @@ struct RestoreLoaderInterface : RestoreRoleInterface {
void initEndpoints() {
heartbeat.getEndpoint( TaskClusterController );
updateRestoreSysInfo.getEndpoint( TaskClusterController );
setApplierKeyRangeVectorRequest.getEndpoint( TaskClusterController );
loadFile.getEndpoint( TaskClusterController );
initVersionBatch.getEndpoint( TaskClusterController );
@ -152,7 +155,7 @@ struct RestoreLoaderInterface : RestoreRoleInterface {
template <class Ar>
void serialize( Ar& ar ) {
serializer(ar, * (RestoreRoleInterface*) this, heartbeat,
serializer(ar, * (RestoreRoleInterface*) this, heartbeat, updateRestoreSysInfo,
setApplierKeyRangeVectorRequest, loadFile,
initVersionBatch, collectRestoreRoleInterfaces, finishRestore);
}

View File

@ -206,7 +206,10 @@
<ActorCompiler Include="RestoreRoleCommon.actor.h">
<EnableCompile>false</EnableCompile>
</ActorCompiler>
<ActorCompiler Include="RestoreMaster.actor.h">
<ActorCompiler Include="RestoreWorker.actor.h">
<EnableCompile>false</EnableCompile>
</ActorCompiler>
<ActorCompiler Include="RestoreMaster.actor.h">
<EnableCompile>false</EnableCompile>
</ActorCompiler>
<ActorCompiler Include="RestoreLoader.actor.h">