From a08a6776f5bba6bcc15c72202fa1fd1ce2aafb47 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 9 May 2019 20:55:44 -0700 Subject: [PATCH] FastRestore: Refactor to smaller components The current code uses one restore interface to handle the work for all restore roles, i.e., master, loader and applier. This makes it harder to review or maintain or scale. This commit splits the restore into multiple roles by mimicking the FDB transaction system: 1) It uses a RestoreWorker as the process to host restore roles; This commit assumes one restore role per RestoreWorker; but it should be easy to extend to support multiple roles per RestoreWorker; 2) It creates 3 restore roles: RestoreMaster: Coordinate the restore process and send commands to the other two roles; RestoreLoader: Parse backup files to mutations and send mutations to appliers; RestoreApplier: Sort received mutations and apply them to DB in order. Compilable version. To be tested for correctness. --- fdbclient/BackupContainer.h | 5 + fdbclient/SystemData.cpp | 62 +- fdbclient/SystemData.h | 19 +- fdbserver/Restore.actor.cpp | 4154 ++----------------------- fdbserver/RestoreApplier.actor.cpp | 450 +++ fdbserver/RestoreApplier.actor.h | 147 +- fdbserver/RestoreCommon.actor.h | 2 +- fdbserver/RestoreLoader.actor.cpp | 1129 +++++++ fdbserver/RestoreLoader.actor.h | 108 +- fdbserver/RestoreMaster.actor.cpp | 1326 ++++++++ fdbserver/RestoreMaster.actor.h | 264 ++ fdbserver/RestoreRoleCommon.actor.cpp | 324 ++ fdbserver/RestoreRoleCommon.actor.h | 200 ++ fdbserver/RestoreUtil.actor.cpp | 70 + fdbserver/RestoreUtil.h | 146 + fdbserver/RestoreWorkerInterface.h | 247 +- fdbserver/fdbserver.vcxproj | 12 +- 17 files changed, 4623 insertions(+), 4042 deletions(-) create mode 100644 fdbserver/RestoreMaster.actor.cpp create mode 100644 fdbserver/RestoreMaster.actor.h create mode 100644 fdbserver/RestoreRoleCommon.actor.cpp create mode 100644 fdbserver/RestoreRoleCommon.actor.h create mode 100644 fdbserver/RestoreUtil.actor.cpp 
create mode 100644 fdbserver/RestoreUtil.h diff --git a/fdbclient/BackupContainer.h b/fdbclient/BackupContainer.h index 75e209216f..e4f6ebf1de 100644 --- a/fdbclient/BackupContainer.h +++ b/fdbclient/BackupContainer.h @@ -18,6 +18,8 @@ * limitations under the License. */ +#ifndef FDBCLIENT_BackupContainer_H +#define FDBCLIENT_BackupContainer_H #pragma once #include "flow/flow.h" @@ -27,6 +29,8 @@ #include "fdbclient/ReadYourWrites.h" #include +class ReadYourWritesTransaction; + Future> timeKeeperEpochsFromVersion(Version const &v, Reference const &tr); Future timeKeeperVersionFromDatetime(std::string const &datetime, Database const &db); @@ -255,3 +259,4 @@ private: std::string URL; }; +#endif \ No newline at end of file diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 6c6ea5b071..ebf078748b 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -601,6 +601,14 @@ const KeyRangeRef restoreWorkersKeys( LiteralStringRef("\xff\x02/restoreWorkers/"), LiteralStringRef("\xff\x02/restoreWorkers0") ); +const KeyRangeRef restoreLoaderKeys( + LiteralStringRef("\xff\x02/restoreLoaders/"), + LiteralStringRef("\xff\x02/restoreLoaders0") +); +const KeyRangeRef restoreApplierKeys( + LiteralStringRef("\xff\x02/restoreAppliers/"), + LiteralStringRef("\xff\x02/restoreAppliers0") +); const KeyRef restoreStatusKey = LiteralStringRef("\xff\x02/restoreStatus/"); @@ -611,24 +619,64 @@ const KeyRangeRef restoreRequestKeys( LiteralStringRef("\xff\x02/restoreRequests0") ); -// Encode restore agent key for agentID -const Key restoreWorkerKeyFor( UID const& agentID ) { +// Encode restore worker key for workerID +const Key restoreWorkerKeyFor( UID const& workerID ) { BinaryWriter wr(Unversioned()); wr.serializeBytes( restoreWorkersKeys.begin ); - wr << agentID; + wr << workerID; + return wr.toValue(); +} + +// Encode restore role (loader or applier) for roleID +const Key restoreLoaderKeyFor( UID const& roleID ) { + BinaryWriter wr(Unversioned()); + 
wr.serializeBytes( restoreLoaderKeys.begin ); + wr << roleID; + return wr.toValue(); +} + +const Key restoreApplierKeyFor( UID const& roleID ) { + BinaryWriter wr(Unversioned()); + wr.serializeBytes( restoreApplierKeys.begin ); + wr << roleID; return wr.toValue(); } // Encode restore agent value - -const Value restoreCommandInterfaceValue( RestoreInterface const& cmdInterf ) { +const Value restoreWorkerInterfaceValue( RestoreWorkerInterface const& cmdInterf ) { BinaryWriter wr(IncludeVersion()); wr << cmdInterf; return wr.toValue(); } -RestoreInterface decodeRestoreCommandInterfaceValue( ValueRef const& value ) { - RestoreInterface s; +RestoreWorkerInterface decodeRestoreWorkerInterfaceValue( ValueRef const& value ) { + RestoreWorkerInterface s; + BinaryReader reader( value, IncludeVersion() ); + reader >> s; + return s; +} + +const Value restoreLoaderInterfaceValue( RestoreLoaderInterface const& cmdInterf ) { + BinaryWriter wr(IncludeVersion()); + wr << cmdInterf; + return wr.toValue(); +} + +RestoreLoaderInterface decodeRestoreLoaderInterfaceValue( ValueRef const& value ) { + RestoreLoaderInterface s; + BinaryReader reader( value, IncludeVersion() ); + reader >> s; + return s; +} + +const Value restoreApplierInterfaceValue( RestoreApplierInterface const& cmdInterf ) { + BinaryWriter wr(IncludeVersion()); + wr << cmdInterf; + return wr.toValue(); +} + +RestoreApplierInterface decodeRestoreApplierInterfaceValue( ValueRef const& value ) { + RestoreApplierInterface s; BinaryReader reader( value, IncludeVersion() ); reader >> s; return s; diff --git a/fdbclient/SystemData.h b/fdbclient/SystemData.h index 70342b68ae..f3b8174fe9 100644 --- a/fdbclient/SystemData.h +++ b/fdbclient/SystemData.h @@ -27,6 +27,9 @@ #include "fdbclient/FDBTypes.h" #include "fdbclient/StorageServerInterface.h" #include "fdbserver/RestoreWorkerInterface.h" +struct RestoreLoaderInterface; +struct RestoreApplierInterface; +struct RestoreMasterInterface; extern const KeyRangeRef normalKeys; // '' 
to systemKeys.begin extern const KeyRangeRef systemKeys; // [FF] to [FF][FF] @@ -275,6 +278,9 @@ extern const KeyRangeRef monitorConfKeys; extern const KeyRef restoreLeaderKey; extern const KeyRangeRef restoreWorkersKeys; +extern const KeyRangeRef restoreRolesKeys; +extern const KeyRangeRef restoreLoaderKeys; +extern const KeyRangeRef restoreApplierKeys; extern const KeyRef restoreStatusKey; @@ -282,9 +288,16 @@ extern const KeyRef restoreRequestTriggerKey; extern const KeyRef restoreRequestDoneKey; extern const KeyRangeRef restoreRequestKeys; -const Key restoreWorkerKeyFor( UID const& agentID ); -const Value restoreCommandInterfaceValue( RestoreInterface const& server ); -RestoreInterface decodeRestoreCommandInterfaceValue( ValueRef const& value ); +const Key restoreWorkerKeyFor( UID const& workerID ); +const Key restoreLoaderKeyFor( UID const& roleID ); +const Key restoreApplierKeyFor( UID const& roleID ); + +const Value restoreWorkerInterfaceValue(RestoreWorkerInterface const& server ); +RestoreWorkerInterface decodeRestoreWorkerInterfaceValue( ValueRef const& value ); +const Value restoreLoaderInterfaceValue(RestoreLoaderInterface const& server ); +RestoreLoaderInterface decodeRestoreLoaderInterfaceValue( ValueRef const& value ); +const Value restoreApplierInterfaceValue(RestoreApplierInterface const& server ); +RestoreApplierInterface decodeRestoreApplierInterfaceValue( ValueRef const& value ); // MX: parallel restore const Value restoreRequestTriggerValue (int const numRequests); diff --git a/fdbserver/Restore.actor.cpp b/fdbserver/Restore.actor.cpp index 6fa7a80efc..dd73d11e2b 100644 --- a/fdbserver/Restore.actor.cpp +++ b/fdbserver/Restore.actor.cpp @@ -18,7 +18,7 @@ * limitations under the License. 
*/ -#include "fdbserver/RestoreWorkerInterface.h" + #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/SystemData.h" @@ -39,7 +39,14 @@ #include #include +#include "flow/ActorCollection.h" +#include "fdbserver/RestoreUtil.h" +#include "fdbserver/RestoreWorkerInterface.h" #include "fdbserver/RestoreCommon.actor.h" +#include "fdbserver/RestoreRoleCommon.actor.h" +#include "fdbserver/RestoreLoader.actor.h" +#include "fdbserver/RestoreApplier.actor.h" +#include "fdbserver/RestoreMaster.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. @@ -52,95 +59,25 @@ double loadBatchSizeThresholdB = loadBatchSizeMB * 1024 * 1024; double mutationVectorThreshold = 100; // Bytes // correctness passed when the value is 1 double transactionBatchSizeThreshold = 512; // Byte +int restoreStatusIndex = 0; + class RestoreConfig; -struct RestoreData; // Only declare the struct exist but we cannot use its field +struct RestoreWorkerData; // Only declare the struct exist but we cannot use its field -// Forward declaration -ACTOR Future registerMutationsToApplier(Reference rd); -ACTOR Future registerMutationsToMasterApplier(Reference rd); -ACTOR Future notifyApplierToApplyMutations(Reference rd); -ACTOR Future notifyWorkersToSetWorkersInterface(Reference rd); -ACTOR Future configureRoles(Reference rd); -ACTOR Future notifyWorkersToSetWorkersInterface(Reference rd); -ACTOR Future handleSendSampleMutationVectorRequest(RestoreSendMutationVectorRequest req, Reference rd, RestoreInterface interf); -ACTOR Future handleFinishRestoreReq(RestoreSimpleRequest req, Reference rd, RestoreInterface interf, Database cx); +// Forward declaration +void initRestoreWorkerConfig(); -ACTOR Future workerCore( Reference rd, RestoreInterface ri, Database cx ); -ACTOR Future masterCore(Reference rd, RestoreInterface ri, Database cx); - -ACTOR static Future processRestoreRequest(RestoreInterface interf, Reference rd, Database cx, RestoreRequest request); -ACTOR static Future 
finishRestore(Reference rd, Database cx, Standalone> restoreRequests); -ACTOR static Future _clearDB(Reference tr); - -bool concatenateBackupMutationForLogFile(Reference rd, Standalone val_input, Standalone key_input); -void concatenateBackupMutation(Standalone val_input, Standalone key_input); -void registerBackupMutationForAll(Version empty); -bool isKVOpsSorted(Reference rd); -bool allOpsAreKnown(Reference rd); -void sanityCheckMutationOps(Reference rd); -void parseSerializedMutation(Reference rd, bool isSampling = false); -bool collectFilesForOneVersionBatch(Reference rd); - -// Helper class for reading restore data from a buffer and throwing the right errors. -// This struct is mostly copied from StringRefReader. We add a sanity check in this struct. -// TODO: Merge this struct with StringRefReader. -struct StringRefReaderMX { - StringRefReaderMX(StringRef s = StringRef(), Error e = Error()) : rptr(s.begin()), end(s.end()), failure_error(e), str_size(s.size()) {} - - // Return remainder of data as a StringRef - StringRef remainder() { - return StringRef(rptr, end - rptr); - } - - // Return a pointer to len bytes at the current read position and advance read pos - //Consume a little-Endian data. Since we only run on little-Endian machine, the data on storage is little Endian - const uint8_t * consume(unsigned int len) { - if(rptr == end && len != 0) - throw end_of_stream(); - const uint8_t *p = rptr; - rptr += len; - if(rptr > end) { - printf("[ERROR] StringRefReaderMX throw error! string length:%d\n", str_size); - printf("!!!!!!!!!!!![ERROR]!!!!!!!!!!!!!! Worker may die due to the error. Master will stuck when a worker die\n"); - throw failure_error; - } - return p; - } - - // Return a T from the current read position and advance read pos - template const T consume() { - return *(const T *)consume(sizeof(T)); - } - - // Functions for consuming big endian (network byte order) integers. 
- // Consumes a big endian number, swaps it to little endian, and returns it. - const int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume< int32_t>());} - const uint32_t consumeNetworkUInt32() { return bigEndian32( consume());} - - const int64_t consumeNetworkInt64() { return (int64_t)bigEndian64((uint32_t)consume< int64_t>());} - const uint64_t consumeNetworkUInt64() { return bigEndian64( consume());} - - bool eof() { return rptr == end; } - - const uint8_t *rptr, *end; - const int str_size; - Error failure_error; -}; +ACTOR Future handlerTerminateWorkerRequest(RestoreSimpleRequest req, Reference self, RestoreWorkerInterface workerInterf, Database cx); +ACTOR Future monitorWorkerLiveness(Reference self); +ACTOR Future commitRestoreRoleInterfaces(Reference self, Database cx); +ACTOR Future handleRecruitRoleRequest(RestoreRecruitRoleRequest req, Reference self, ActorCollection *actors, Database cx); +ACTOR Future collectRestoreWorkerInterface(Reference self, Database cx, int min_num_workers); +ACTOR Future recruitRestoreRoles(Reference self); bool debug_verbose = true; -void printGlobalNodeStatus(Reference); +void printGlobalNodeStatus(Reference); -std::vector RestoreRoleStr = {"Invalid", "Master", "Loader", "Applier"}; -int numRoles = RestoreRoleStr.size(); -std::string getRoleStr(RestoreRole role) { - if ( (int) role >= numRoles || (int) role < 0) { - printf("[ERROR] role:%d is out of scope\n", (int) role); - return "[Unset]"; - } - return RestoreRoleStr[(int)role]; -} - const char *RestoreCommandEnumStr[] = {"Init", "Set_Role", "Set_Role_Done", "Sample_Range_File", "Sample_Log_File", "Sample_File_Done", @@ -157,42 +94,6 @@ const char *RestoreCommandEnumStr[] = {"Init", template<> Tuple Codec::pack(ERestoreState const &val); // { return Tuple().append(val); } template<> ERestoreState Codec::unpack(Tuple const &val); // { return (ERestoreState)val.getInt(0); } -// CMDUID implementation -void CMDUID::initPhase(RestoreCommandEnum newPhase) { 
- printf("CMDID, current phase:%d, new phase:%d\n", phase, newPhase); - phase = (uint16_t) newPhase; - cmdID = 0; -} - -void CMDUID::nextPhase() { - phase++; - cmdID = 0; -} - -void CMDUID::nextCmd() { - cmdID++; -} - -RestoreCommandEnum CMDUID::getPhase() { - return (RestoreCommandEnum) phase; -} - -void CMDUID::setPhase(RestoreCommandEnum newPhase) { - phase = (uint16_t) newPhase; -} - -void CMDUID::setBatch(int newBatchIndex) { - batch = newBatchIndex; -} - -uint64_t CMDUID::getIndex() { - return cmdID; -} - -std::string CMDUID::toString() const { - return format("%04ld|%04ld|%016lld", batch, phase, cmdID); -} - // DEBUG_FAST_RESTORE is not used right now! #define DEBUG_FAST_RESTORE 1 @@ -203,740 +104,42 @@ std::string CMDUID::toString() const { #define dbprintf_rs(fmt, args...) #endif -// RestoreData is the context for each restore process (worker and master) -struct RestoreData : NonCopyable, public ReferenceCounted { - //---- Declare status structure which records the progress and status of each worker in each role - std::map workers_interface; // UID is worker's node id, RestoreInterface is worker's communication interface - UID masterApplier; //TODO: Remove this variable. The first version uses 1 applier to apply the mutations - RestoreNodeStatus localNodeStatus; //Each worker node (process) has one such variable. - std::vector globalNodeStatus; // status of all notes, excluding master node, stored in master node // May change to map, like servers_info +// Each restore worker (a process) is assigned for a role. +// MAYBE Later: We will support multiple restore roles on a worker +struct RestoreWorkerData : NonCopyable, public ReferenceCounted { + UID workerID; + std::map workers_workerInterface; // UID is worker's node id, RestoreWorkerInterface is worker's communication workerInterface - // range2Applier is in master and loader node. 
Loader node uses this to determine which applier a mutation should be sent - std::map, UID> range2Applier; // KeyRef is the inclusive lower bound of the key range the applier (UID) is responsible for - std::map, int> keyOpsCount; // The number of operations per key which is used to determine the key-range boundary for appliers - int numSampledMutations; // The total number of mutations received from sampled data. + // Restore Roles + Optional loaderInterf; + Reference loaderData; + Optional applierInterf; + Reference applierData; + Reference masterData; - struct ApplierStatus { // NOT USED //TODO: Remove this - UID id; - KeyRange keyRange; // the key range the applier is responsible for - // Applier state is changed at the following event - // Init: when applier's role is set - // Assigned: when applier is set for a key range to be respoinsible for - // Applying: when applier starts to apply the mutations to DB after receiving the cmd from loader - // Done: when applier has finished applying the mutation and notify the master. 
It will change to Assigned after Done - enum class ApplierState {Invalid = 0, Init = 1, Assigned, Applying, Done}; - ApplierState state; - }; - ApplierStatus applierStatus; + CMDUID cmdID; - // TODO: Record loading progress for (i) operators to check the restore status; (ii) recovering from node fault in the middle of restore + UID id() const { return workerID; }; - // Loader's state to handle the duplicate delivery of loading commands - std::map processedFiles; //first is filename of processed file, second is not used - std::map processedCmd; - bool inProgressApplyToDB = false; - uint32_t inProgressFlag = 0; - CMDUID cmdID; // Command id to record the progress - - - // Temporary variables to hold files and data to restore - std::vector allFiles; // All backup files to be processed in all version batches - std::vector files; // Backup files to be parsed and applied: range and log files in 1 version batch - std::map forbiddenVersions; // forbidden version range [first, second) - - // Temporary data structure for parsing range and log files into (version, ) - std::map>> kvOps; - // Must use StandAlone to save mutations, otherwise, the mutationref memory will be corrupted - std::map, Standalone> mutationMap; // Key is the unique identifier for a batch of mutation logs at the same version - std::map, uint32_t> mutationPartMap; // Record the most recent - - // In each version batch, we process the files in [curBackupFilesBeginIndex, curBackupFilesEndIndex] in RestoreData.allFiles. - long curBackupFilesBeginIndex; - long curBackupFilesEndIndex; - double totalWorkloadSize; - double curWorkloadSize; - int batchIndex; - - - Reference bc; // Backup container is used to read backup files - Key bcUrl; // The url used to get the bc - - // For master applier to hold the lower bound of key ranges for each appliers - std::vector> keyRangeLowerBounds; - - // Helper functions to set/clear the flag when a worker is in the middle of processing an actor. 
- void setInProgressFlag(RestoreCommandEnum phaseEnum) { - int phase = (int) phaseEnum; - ASSERT(phase < 32); - inProgressFlag |= (1UL << phase); + RestoreWorkerData() { + workerID = UID(); } - void clearInProgressFlag(RestoreCommandEnum phaseEnum) { - int phase = (int) phaseEnum; - ASSERT(phase < 32); - inProgressFlag &= ~(1UL << phase); + ~RestoreWorkerData() { + printf("[Exit] Worker:%s RestoreWorkerData is deleted\n", workerID.toString().c_str()); } - bool isInProgress(RestoreCommandEnum phaseEnum) { - int phase = (int) phaseEnum; - ASSERT(phase < 32); - return (inProgressFlag & (1UL << phase)); - } - - RestoreRole getRole() { - return localNodeStatus.role; - } - - bool isCmdProcessed(CMDUID const &cmdID) { - return processedCmd.find(cmdID) != processedCmd.end(); - } - - // Describe the node information std::string describeNode() { std::stringstream ss; - ss << "[Role:" << getRoleStr(localNodeStatus.role) << "] [NodeID:" << localNodeStatus.nodeID.toString().c_str() - << "] [NodeIndex:" << std::to_string(localNodeStatus.nodeIndex) << "]"; + ss << "RestoreWorker workerID:" << workerID.toString(); return ss.str(); } - - void resetPerVersionBatch() { - printf("[INFO]Node:%s resetPerVersionBatch\n", localNodeStatus.nodeID.toString().c_str()); - range2Applier.clear(); - keyOpsCount.clear(); - numSampledMutations = 0; - kvOps.clear(); - mutationMap.clear(); - mutationPartMap.clear(); - processedCmd.clear(); - inProgressApplyToDB = false; - files.clear(); // files are backup files for a version batch - curWorkloadSize = 0; - } - - vector getBusyAppliers() { - vector busyAppliers; - for (auto &app : range2Applier) { - busyAppliers.push_back(app.second); - } - return busyAppliers; - } - - RestoreData() { - cmdID.initPhase(RestoreCommandEnum::Init); - localNodeStatus.role = RestoreRole::Invalid; - localNodeStatus.nodeIndex = 0; - curBackupFilesBeginIndex = 0; - curBackupFilesEndIndex = 0; - totalWorkloadSize = 0; - curWorkloadSize = 0; - batchIndex = 0; - bc = 
Reference(); - bcUrl = StringRef(); - } - - ~RestoreData() { - printf("[Exit] NodeID:%s RestoreData is deleted\n", localNodeStatus.nodeID.toString().c_str()); - } }; -void printAppliersKeyRange(Reference rd) { - printf("[INFO] The mapping of KeyRange_start --> Applier ID\n"); - // applier type: std::map, UID> - for (auto &applier : rd->range2Applier) { - printf("\t[INFO]%s -> %s\n", getHexString(applier.first).c_str(), applier.second.toString().c_str()); - } -} - -//Print out the works_interface info -void printWorkersInterface(Reference rd) { - printf("[INFO] workers_interface info: num of workers:%ld\n", rd->workers_interface.size()); - int index = 0; - for (auto &interf : rd->workers_interface) { - printf("\t[INFO][Worker %d] NodeID:%s, Interface.id():%s\n", index, - interf.first.toString().c_str(), interf.second.id().toString().c_str()); - } -} - -// Return in the system -std::pair getNumLoaderAndApplier(Reference rd){ - int numLoaders = 0; - int numAppliers = 0; - for (int i = 0; i < rd->globalNodeStatus.size(); ++i) { - if (rd->globalNodeStatus[i].role == RestoreRole::Loader) { - numLoaders++; - } else if (rd->globalNodeStatus[i].role == RestoreRole::Applier) { - numAppliers++; - } else { - printf("[ERROR] unknown role: %d\n", rd->globalNodeStatus[i].role); - } - } - - if ( numLoaders + numAppliers != rd->globalNodeStatus.size() ) { - printf("[ERROR] Number of workers does not add up! 
numLoaders:%d, numApplier:%d, totalProcess:%ld\n", - numLoaders, numAppliers, rd->globalNodeStatus.size()); - } - - return std::make_pair(numLoaders, numAppliers); -} - -std::vector getWorkingApplierIDs(Reference rd) { - std::vector applierIDs; - for ( auto &applier : rd->range2Applier ) { - applierIDs.push_back(applier.second); - } - - ASSERT( !applierIDs.empty() ); - return applierIDs; -} - -std::vector getApplierIDs(Reference rd) { - std::vector applierIDs; - for (int i = 0; i < rd->globalNodeStatus.size(); ++i) { - if (rd->globalNodeStatus[i].role == RestoreRole::Applier) { - applierIDs.push_back(rd->globalNodeStatus[i].nodeID); - } - } - - // Check if there exist duplicate applier IDs, which should never occur - std::sort(applierIDs.begin(), applierIDs.end()); - bool unique = true; - for (int i = 1; i < applierIDs.size(); ++i) { - if (applierIDs[i-1] == applierIDs[i]) { - unique = false; - break; - } - } - if (!unique) { - fprintf(stderr, "[ERROR] Applier IDs are not unique! All worker IDs are as follows\n"); - printGlobalNodeStatus(rd); - } - - ASSERT( !applierIDs.empty() ); - return applierIDs; -} - -std::vector getLoaderIDs(Reference rd) { - std::vector loaderIDs; - for (int i = 0; i < rd->globalNodeStatus.size(); ++i) { - if (rd->globalNodeStatus[i].role == RestoreRole::Loader) { - loaderIDs.push_back(rd->globalNodeStatus[i].nodeID); - } - } - - // Check if there exist duplicate applier IDs, which should never occur - std::sort(loaderIDs.begin(), loaderIDs.end()); - bool unique = true; - for (int i = 1; i < loaderIDs.size(); ++i) { - if (loaderIDs[i-1] == loaderIDs[i]) { - unique = false; - break; - } - } - if (!unique) { - printf("[ERROR] Applier IDs are not unique! 
All worker IDs are as follows\n"); - printGlobalNodeStatus(rd); - } - - return loaderIDs; -} - -std::vector getWorkerIDs(Reference rd) { - std::vector workerIDs; - for (int i = 0; i < rd->globalNodeStatus.size(); ++i) { - if (rd->globalNodeStatus[i].role == RestoreRole::Loader || - rd->globalNodeStatus[i].role == RestoreRole::Applier) { - workerIDs.push_back(rd->globalNodeStatus[i].nodeID); - } - } - - // Check if there exist duplicate applier IDs, which should never occur - std::sort(workerIDs.begin(), workerIDs.end()); - bool unique = true; - for (int i = 1; i < workerIDs.size(); ++i) { - if (workerIDs[i-1] == workerIDs[i]) { - unique = false; - break; - } - } - if (!unique) { - printf("[ERROR] Applier IDs are not unique! All worker IDs are as follows\n"); - printGlobalNodeStatus(rd); - } - - return workerIDs; -} - -void printGlobalNodeStatus(Reference rd) { - printf("---Print globalNodeStatus---\n"); - printf("Number of entries:%ld\n", rd->globalNodeStatus.size()); - for(int i = 0; i < rd->globalNodeStatus.size(); ++i) { - printf("[Node:%d] %s, role:%s\n", i, rd->globalNodeStatus[i].toString().c_str(), - getRoleStr(rd->globalNodeStatus[i].role).c_str()); - } -} - -void printBackupFilesInfo(Reference rd) { - printf("[INFO] The backup files for current batch to load and apply: num:%ld\n", rd->files.size()); - for (int i = 0; i < rd->files.size(); ++i) { - printf("\t[INFO][File %d] %s\n", i, rd->files[i].toString().c_str()); - } -} - - -void printAllBackupFilesInfo(Reference rd) { - printf("[INFO] All backup files: num:%ld\n", rd->allFiles.size()); - for (int i = 0; i < rd->allFiles.size(); ++i) { - printf("\t[INFO][File %d] %s\n", i, rd->allFiles[i].toString().c_str()); - } -} - -void buildForbiddenVersionRange(Reference rd) { - - printf("[INFO] Build forbidden version ranges for all backup files: num:%ld\n", rd->allFiles.size()); - for (int i = 0; i < rd->allFiles.size(); ++i) { - if (!rd->allFiles[i].isRange) { - 
rd->forbiddenVersions.insert(std::make_pair(rd->allFiles[i].beginVersion, rd->allFiles[i].endVersion)); - } - } -} - -bool isForbiddenVersionRangeOverlapped(Reference rd) { - printf("[INFO] Check if forbidden version ranges is overlapped: num of ranges:%ld\n", rd->forbiddenVersions.size()); - if (rd->forbiddenVersions.empty()) { - return false; - } - - std::map::iterator prevRange = rd->forbiddenVersions.begin(); - std::map::iterator curRange = rd->forbiddenVersions.begin(); - curRange++; // Assume rd->forbiddenVersions has at least one element! - - while ( curRange != rd->forbiddenVersions.end() ) { - if ( curRange->first < prevRange->second ) { - return true; // overlapped - } - curRange++; - } - - return false; //not overlapped -} - -// endVersion is begin version for range file, because range file takes snapshot at the same version -// endVersion is the end version (excluded) for mutations recorded in log file -bool isVersionInForbiddenRange(Reference rd, Version endVersion, bool isRange) { - bool isForbidden = false; - for (auto &range : rd->forbiddenVersions) { - if ( isRange ) { //the range file includes mutations at the endVersion - if (endVersion >= range.first && endVersion < range.second) { - isForbidden = true; - break; - } - } else { // the log file does NOT include mutations at the endVersion - continue; // Log file's endVersion is always a valid version batch boundary as long as the forbidden version ranges do not overlap - } - } - - return isForbidden; -} - -void printForbiddenVersionRange(Reference rd) { - printf("[INFO] Number of forbidden version ranges:%ld\n", rd->forbiddenVersions.size()); - int i = 0; - for (auto &range : rd->forbiddenVersions) { - printf("\t[INFO][Range%d] [%ld, %ld)\n", i, range.first, range.second); - ++i; - } -} - -void constructFilesWithVersionRange(Reference rd) { - printf("[INFO] constructFilesWithVersionRange for num_files:%ld\n", rd->files.size()); - rd->allFiles.clear(); - for (int i = 0; i < rd->files.size(); i++) { 
- printf("\t[File:%d] Start %s\n", i, rd->files[i].toString().c_str()); - Version beginVersion = 0; - Version endVersion = 0; - if (rd->files[i].isRange) { - // No need to parse range filename to get endVersion - beginVersion = rd->files[i].version; - endVersion = beginVersion; - } else { // Log file - //Refer to pathToLogFile() in BackupContainer.actor.cpp - long blockSize, len; - int pos = rd->files[i].fileName.find_last_of("/"); - std::string fileName = rd->files[i].fileName.substr(pos); - printf("\t[File:%d] Log filename:%s, pos:%d\n", i, fileName.c_str(), pos); - sscanf(fileName.c_str(), "/log,%ld,%ld,%*[^,],%lu%ln", &beginVersion, &endVersion, &blockSize, &len); - printf("\t[File:%d] Log filename:%s produces beginVersion:%ld endVersion:%ld\n",i, fileName.c_str(), beginVersion, endVersion); - } - rd->files[i].beginVersion = beginVersion; - rd->files[i].endVersion = endVersion; - printf("\t[File:%d] End %s\n", i, rd->files[i].toString().c_str()); - ASSERT(beginVersion <= endVersion); - rd->allFiles.push_back(rd->files[i]); - // rd->allFiles.back().beginVersion = beginVersion; - // rd->allFiles.back().endVersion = endVersion; - } -} - - -//// --- Some common functions - ACTOR static Future _parseRangeFileToMutationsOnLoader(Reference rd, - Reference bc, Version version, - std::string fileName, int64_t readOffset_input, int64_t readLen_input, - KeyRange restoreRange, Key addPrefix, Key removePrefix) { - - state int64_t readOffset = readOffset_input; - state int64_t readLen = readLen_input; - - if ( debug_verbose ) { - printf("[VERBOSE_DEBUG] Parse range file and get mutations 1, bc:%lx\n", bc.getPtr()); - } - // The set of key value version is rangeFile.version. 
the key-value set in the same range file has the same version - Reference inFile = wait(bc->readFile(fileName)); - - if ( debug_verbose ) { - printf("[VERBOSE_DEBUG] Parse range file and get mutations 2\n"); - } - state Standalone> blockData = wait(parallelFileRestore::decodeRangeFileBlock(inFile, readOffset, readLen)); - - if ( debug_verbose ) { - printf("[VERBOSE_DEBUG] Parse range file and get mutations 3\n"); - int tmpi = 0; - for (tmpi = 0; tmpi < blockData.size(); tmpi++) { - printf("\t[VERBOSE_DEBUG] mutation: key:%s value:%s\n", blockData[tmpi].key.toString().c_str(), blockData[tmpi].value.toString().c_str()); - } - } - - // First and last key are the range for this file - state KeyRange fileRange = KeyRangeRef(blockData.front().key, blockData.back().key); - printf("[INFO] RangeFile:%s KeyRange:%s, restoreRange:%s\n", - fileName.c_str(), fileRange.toString().c_str(), restoreRange.toString().c_str()); - - // If fileRange doesn't intersect restore range then we're done. - if(!fileRange.intersects(restoreRange)) { - TraceEvent("ExtractApplyRangeFileToDB_MX").detail("NoIntersectRestoreRange", "FinishAndReturn"); - return Void(); - } - - // We know the file range intersects the restore range but there could still be keys outside the restore range. - // Find the subvector of kv pairs that intersect the restore range. Note that the first and last keys are just the range endpoints for this file - // The blockData's first and last entries are metadata, not the real data - int rangeStart = 1; //1 - int rangeEnd = blockData.size() -1; //blockData.size() - 1 // Q: the rangeStart and rangeEnd is [,)? 
- if ( debug_verbose ) { - printf("[VERBOSE_DEBUG] Range file decoded blockData\n"); - for (auto& data : blockData ) { - printf("\t[VERBOSE_DEBUG] data key:%s val:%s\n", data.key.toString().c_str(), data.value.toString().c_str()); - } - } - - // Slide start forward, stop if something in range is found - // Move rangeStart and rangeEnd until they is within restoreRange - while(rangeStart < rangeEnd && !restoreRange.contains(blockData[rangeStart].key)) { - if ( debug_verbose ) { - printf("[VERBOSE_DEBUG] rangeStart:%d key:%s is not in the range:%s\n", rangeStart, blockData[rangeStart].key.toString().c_str(), restoreRange.toString().c_str()); - } - ++rangeStart; - } - // Side end backward, stop if something in range is found - while(rangeEnd > rangeStart && !restoreRange.contains(blockData[rangeEnd - 1].key)) { - if ( debug_verbose ) { - printf("[VERBOSE_DEBUG] (rangeEnd:%d - 1) key:%s is not in the range:%s\n", rangeEnd, blockData[rangeStart].key.toString().c_str(), restoreRange.toString().c_str()); - } - --rangeEnd; - } - - // MX: now data only contains the kv mutation within restoreRange - state VectorRef data = blockData.slice(rangeStart, rangeEnd); - printf("[INFO] RangeFile:%s blockData entry size:%d recovered data size:%d\n", fileName.c_str(), blockData.size(), data.size()); - - // Shrink file range to be entirely within restoreRange and translate it to the new prefix - // First, use the untranslated file range to create the shrunk original file range which must be used in the kv range version map for applying mutations - state KeyRange originalFileRange = KeyRangeRef(std::max(fileRange.begin, restoreRange.begin), std::min(fileRange.end, restoreRange.end)); - - // Now shrink and translate fileRange - Key fileEnd = std::min(fileRange.end, restoreRange.end); - if(fileEnd == (removePrefix == StringRef() ? normalKeys.end : strinc(removePrefix)) ) { - fileEnd = addPrefix == StringRef() ? 
normalKeys.end : strinc(addPrefix); - } else { - fileEnd = fileEnd.removePrefix(removePrefix).withPrefix(addPrefix); - } - fileRange = KeyRangeRef(std::max(fileRange.begin, restoreRange.begin).removePrefix(removePrefix).withPrefix(addPrefix),fileEnd); - - state int start = 0; - state int end = data.size(); - //state int dataSizeLimit = BUGGIFY ? g_random->randomInt(256 * 1024, 10e6) : CLIENT_KNOBS->RESTORE_WRITE_TX_SIZE; - state int dataSizeLimit = CLIENT_KNOBS->RESTORE_WRITE_TX_SIZE; - state int kvCount = 0; - - //MX: This is where the key-value pair in range file is applied into DB - loop { - - state int i = start; - state int txBytes = 0; - state int iend = start; - - // find iend that results in the desired transaction size - for(; iend < end && txBytes < dataSizeLimit; ++iend) { - txBytes += data[iend].key.expectedSize(); - txBytes += data[iend].value.expectedSize(); - } - - - for(; i < iend; ++i) { - //MXX: print out the key value version, and operations. - if ( debug_verbose ) { - printf("RangeFile [key:%s, value:%s, version:%ld, op:set]\n", data[i].key.printable().c_str(), data[i].value.printable().c_str(), version); - } -// TraceEvent("PrintRangeFile_MX").detail("Key", data[i].key.printable()).detail("Value", data[i].value.printable()) -// .detail("Version", rangeFile.version).detail("Op", "set"); -//// printf("PrintRangeFile_MX: mType:set param1:%s param2:%s param1_size:%d, param2_size:%d\n", -//// getHexString(data[i].key.c_str(), getHexString(data[i].value).c_str(), data[i].key.size(), data[i].value.size()); - - //NOTE: Should NOT removePrefix and addPrefix for the backup data! - // In other words, the following operation is wrong: data[i].key.removePrefix(removePrefix).withPrefix(addPrefix) - MutationRef m(MutationRef::Type::SetValue, data[i].key, data[i].value); //ASSUME: all operation in range file is set. - ++kvCount; - - // TODO: we can commit the kv operation into DB. 
- // Right now, we cache all kv operations into kvOps, and apply all kv operations later in one place - if ( rd->kvOps.find(version) == rd->kvOps.end() ) { // Create the map's key if mutation m is the first on to be inserted - //kvOps.insert(std::make_pair(rangeFile.version, Standalone>(VectorRef()))); - rd->kvOps.insert(std::make_pair(version, VectorRef())); - } - - ASSERT(rd->kvOps.find(version) != rd->kvOps.end()); - rd->kvOps[version].push_back_deep(rd->kvOps[version].arena(), m); - - } - - // Commit succeeded, so advance starting point - start = i; - - if(start == end) { - //TraceEvent("ExtraApplyRangeFileToDB_MX").detail("Progress", "DoneApplyKVToDB"); - printf("[INFO][Loader] NodeID:%s Parse RangeFile:%s: the number of kv operations = %d\n", - rd->describeNode().c_str(), fileName.c_str(), kvCount); - return Void(); - } - } - - } - - ACTOR static Future _parseLogFileToMutationsOnLoader(Reference rd, - Reference bc, Version version, - std::string fileName, int64_t readOffset, int64_t readLen, - KeyRange restoreRange, Key addPrefix, Key removePrefix, - Key mutationLogPrefix) { - - // Step: concatenate the backuped param1 and param2 (KV) at the same version. - //state Key mutationLogPrefix = mutationLogPrefix; - //TraceEvent("ReadLogFileStart").detail("LogFileName", fileName); - state Reference inFile = wait(bc->readFile(fileName)); - //TraceEvent("ReadLogFileFinish").detail("LogFileName", fileName); - - printf("Parse log file:%s readOffset:%d readLen:%ld\n", fileName.c_str(), readOffset, readLen); - //TODO: NOTE: decodeLogFileBlock() should read block by block! based on my serial version. 
This applies to decode range file as well - state Standalone> data = wait(parallelFileRestore::decodeLogFileBlock(inFile, readOffset, readLen)); - //state Standalone> data = wait(fileBackup::decodeLogFileBlock_MX(inFile, readOffset, readLen)); //Decode log file - TraceEvent("ReadLogFileFinish").detail("LogFileName", fileName).detail("DecodedDataSize", data.contents().size()); - printf("ReadLogFile, raw data size:%d\n", data.size()); - - state int start = 0; - state int end = data.size(); - //state int dataSizeLimit = BUGGIFY ? g_random->randomInt(256 * 1024, 10e6) : CLIENT_KNOBS->RESTORE_WRITE_TX_SIZE; - state int dataSizeLimit = CLIENT_KNOBS->RESTORE_WRITE_TX_SIZE; - state int kvCount = 0; - state int numConcatenated = 0; - loop { - try { -// printf("Process start:%d where end=%d\n", start, end); - if(start == end) { - printf("ReadLogFile: finish reading the raw data and concatenating the mutation at the same version\n"); - break; - } - - state int i = start; - state int txBytes = 0; - for(; i < end && txBytes < dataSizeLimit; ++i) { - Key k = data[i].key.withPrefix(mutationLogPrefix); - ValueRef v = data[i].value; - txBytes += k.expectedSize(); - txBytes += v.expectedSize(); - //MXX: print out the key value version, and operations. - //printf("LogFile [key:%s, value:%s, version:%ld, op:NoOp]\n", k.printable().c_str(), v.printable().c_str(), logFile.version); - // printf("LogFile [KEY:%s, VALUE:%s, VERSION:%ld, op:NoOp]\n", getHexString(k).c_str(), getHexString(v).c_str(), logFile.version); - // printBackupMutationRefValueHex(v, " |\t"); - // printf("[DEBUG]||Concatenate backup mutation:fileInfo:%s, data:%d\n", logFile.toString().c_str(), i); - bool concatenated = concatenateBackupMutationForLogFile(rd, data[i].value, data[i].key); - numConcatenated += ( concatenated ? 1 : 0); - // //TODO: Decode the value to get the mutation type. Use NoOp to distinguish from range kv for now. 
- // MutationRef m(MutationRef::Type::NoOp, data[i].key, data[i].value); //ASSUME: all operation in log file is NoOp. - // if ( rd->kvOps.find(logFile.version) == rd->kvOps.end() ) { - // rd->kvOps.insert(std::make_pair(logFile.version, std::vector())); - // } else { - // rd->kvOps[logFile.version].push_back(m); - // } - } - - start = i; - - } catch(Error &e) { - if(e.code() == error_code_transaction_too_large) - dataSizeLimit /= 2; - } - } - - printf("[INFO] raw kv number:%d parsed from log file, concatenated:%d kv, num_log_versions:%d\n", data.size(), numConcatenated, rd->mutationMap.size()); - - return Void(); - } - - // Parse the kv pair (version, serialized_mutation), which are the results parsed from log file. - void parseSerializedMutation(Reference rd, bool isSampling) { - // Step: Parse the concatenated KV pairs into (version, ) pair - printf("[INFO] Parse the concatenated log data\n"); - std::string prefix = "||\t"; - std::stringstream ss; - const int version_size = 12; - const int header_size = 12; - int kvCount = 0; - - for ( auto& m : rd->mutationMap ) { - StringRef k = m.first.contents(); - StringRefReaderMX readerVersion(k, restore_corrupted_data()); - uint64_t commitVersion = readerVersion.consume(); // Consume little Endian data - - - StringRef val = m.second.contents(); - StringRefReaderMX reader(val, restore_corrupted_data()); - - int count_size = 0; - // Get the include version in the batch commit, which is not the commitVersion. - // commitVersion is in the key - uint64_t includeVersion = reader.consume(); - count_size += 8; - uint32_t val_length_decode = reader.consume(); //Parse little endian value, confirmed it is correct! 
- count_size += 4; - - if ( rd->kvOps.find(commitVersion) == rd->kvOps.end() ) { - rd->kvOps.insert(std::make_pair(commitVersion, VectorRef())); - } - - if ( debug_verbose ) { - printf("----------------------------------------------------------Register Backup Mutation into KVOPs version:%08lx\n", commitVersion); - printf("To decode value:%s\n", getHexString(val).c_str()); - } - // In sampling, the last mutation vector may be not complete, we do not concatenate for performance benefit - if ( val_length_decode != (val.size() - 12) ) { - //IF we see val.size() == 10000, It means val should be concatenated! The concatenation may fail to copy the data - if (isSampling) { - printf("[PARSE WARNING]!!! val_length_decode:%d != val.size:%d version:%ld(0x%lx)\n", val_length_decode, val.size(), - commitVersion, commitVersion); - printf("[PARSE WARNING] Skipped the mutation! OK for sampling workload but WRONG for restoring the workload\n"); - continue; - } else { - printf("[PARSE ERROR]!!! val_length_decode:%d != val.size:%d version:%ld(0x%lx)\n", val_length_decode, val.size(), - commitVersion, commitVersion); - } - } else { - if ( debug_verbose ) { - printf("[PARSE SUCCESS] val_length_decode:%d == (val.size:%d - 12)\n", val_length_decode, val.size()); - } - } - - // Get the mutation header - while (1) { - // stop when reach the end of the string - if(reader.eof() ) { //|| *reader.rptr == 0xFF - //printf("Finish decode the value\n"); - break; - } - - - uint32_t type = reader.consume();//reader.consumeNetworkUInt32(); - uint32_t kLen = reader.consume();//reader.consumeNetworkUInkvOps[t32(); - uint32_t vLen = reader.consume();//reader.consumeNetworkUInt32(); - const uint8_t *k = reader.consume(kLen); - const uint8_t *v = reader.consume(vLen); - count_size += 4 * 3 + kLen + vLen; - - MutationRef mutation((MutationRef::Type) type, KeyRef(k, kLen), KeyRef(v, vLen)); - rd->kvOps[commitVersion].push_back_deep(rd->kvOps[commitVersion].arena(), mutation); - kvCount++; - - if ( kLen < 0 
|| kLen > val.size() || vLen < 0 || vLen > val.size() ) { - printf("%s[PARSE ERROR]!!!! kLen:%d(0x%04x) vLen:%d(0x%04x)\n", prefix.c_str(), kLen, kLen, vLen, vLen); - } - - if ( debug_verbose ) { - printf("%s---LogFile parsed mutations. Prefix:[%d]: Version:%016lx Type:%d K:%s V:%s k_size:%d v_size:%d\n", prefix.c_str(), - kvCount, - commitVersion, type, getHexString(KeyRef(k, kLen)).c_str(), getHexString(KeyRef(v, vLen)).c_str(), kLen, vLen); - } - - } - // printf("----------------------------------------------------------\n"); - } - - printf("[INFO] Produces %d mutation operations from concatenated kv pairs that are parsed from log\n", kvCount); - -} - - -ACTOR Future setWorkerInterface(RestoreSimpleRequest req, Reference rd, RestoreInterface interf, Database cx) { - state Transaction tr(cx); - - while (rd->isInProgress(RestoreCommandEnum::Set_WorkerInterface)) { - printf("[DEBUG] NODE:%s setWorkerInterface wait for 5s\n", rd->describeNode().c_str()); - wait(delay(5.0)); - } - // Handle duplicate, assuming cmdUID is always unique for the same workload - if ( rd->isCmdProcessed(req.cmdID) ) { - printf("[DEBUG] NODE:%s skip duplicate cmd:%s\n", rd->describeNode().c_str(), req.cmdID.toString().c_str()); - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); - return Void(); - } - - rd->setInProgressFlag(RestoreCommandEnum::Set_WorkerInterface); - - state vector agents; // agents is cmdsInterf - printf("[INFO][Worker] Node:%s Get the interface for all workers\n", rd->describeNode().c_str()); - loop { - try { - rd->workers_interface.clear(); - tr.reset(); - tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr.setOption(FDBTransactionOptions::LOCK_AWARE); - Standalone agentValues = wait(tr.getRange(restoreWorkersKeys, CLIENT_KNOBS->TOO_MANY)); - ASSERT(!agentValues.more); - if(agentValues.size()) { - for(auto& it : agentValues) { - agents.push_back(BinaryReader::fromStringRef(it.value, IncludeVersion())); - // Save the RestoreInterface for the later 
operations - rd->workers_interface.insert(std::make_pair(agents.back().id(), agents.back())); - } - tr.commit(); - break; - } - } catch( Error &e ) { - printf("[WARNING] Node:%s setWorkerInterface() transaction error:%s\n", rd->describeNode().c_str(), e.what()); - wait( tr.onError(e) ); - } - printf("[WARNING] Node:%s setWorkerInterface should always succeed in the first loop! Something goes wrong!\n", rd->describeNode().c_str()); - wait ( delay(1.0) ); - }; - - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); - rd->processedCmd[req.cmdID] = 1; - rd->clearInProgressFlag(RestoreCommandEnum::Set_WorkerInterface); - - return Void(); - } - - -ACTOR Future handleFinishRestoreReq(RestoreSimpleRequest req, Reference rd, RestoreInterface interf, Database cx) { +// Restore worker +ACTOR Future handlerTerminateWorkerRequest(RestoreSimpleRequest req, Reference self, RestoreWorkerInterface workerInterf, Database cx) { state Transaction tr(cx); loop { @@ -944,1176 +147,59 @@ ACTOR Future handleFinishRestoreReq(RestoreSimpleRequest req, ReferenceloaderInterf.present() ) { + tr.clear(restoreLoaderKeyFor(self->loaderInterf.get().id())); + } + if ( self->applierInterf.present() ) { + tr.clear(restoreApplierKeyFor(self->applierInterf.get().id())); + } wait( tr.commit() ) ; - printf("Node:%s finish restore, clear the key for interf.id:%s and exit\n", rd->describeNode().c_str(), interf.id().toString().c_str()); - req.reply.send( RestoreCommonReply(interf.id(), req.cmdID) ); + printf("Node:%s finish restore, clear the interface keys for all roles on the worker (id:%s) and the worker itself. 
Then exit\n", self->describeNode().c_str(), workerInterf.id().toString().c_str()); + req.reply.send( RestoreCommonReply(workerInterf.id(), req.cmdID) ); break; } catch( Error &e ) { - printf("[WARNING] Node:%s finishRestoreHandler() transaction error:%s\n", rd->describeNode().c_str(), e.what()); + printf("[WARNING] Node:%s finishRestoreHandler() transaction error:%s\n", self->describeNode().c_str(), e.what()); wait( tr.onError(e) ); } }; - return Void(); } -// Read restoreWorkersKeys from DB to get each restore worker's restore interface and set it to rd->workers_interface - ACTOR Future collectWorkerInterface(Reference rd, Database cx, int min_num_workers) { - state Transaction tr(cx); - - state vector agents; // agents is cmdsInterf - - loop { - try { - rd->workers_interface.clear(); - agents.clear(); - tr.reset(); - tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr.setOption(FDBTransactionOptions::LOCK_AWARE); - Standalone agentValues = wait(tr.getRange(restoreWorkersKeys, CLIENT_KNOBS->TOO_MANY)); - ASSERT(!agentValues.more); - // If agentValues.size() < min_num_workers, we should wait for coming workers to register their interface before we read them once for all - if(agentValues.size() >= min_num_workers) { - for(auto& it : agentValues) { - agents.push_back(BinaryReader::fromStringRef(it.value, IncludeVersion())); - // Save the RestoreInterface for the later operations - rd->workers_interface.insert(std::make_pair(agents.back().id(), agents.back())); - printf("collectWorkerInterface, interface id:%s\n", agents.back().id().toString().c_str()); - } - break; - } - printf("%s:Wait for enough workers. 
Current num_workers:%d target num_workers:%d\n", - rd->describeNode().c_str(), agentValues.size(), min_num_workers); - wait( delay(5.0) ); - } catch( Error &e ) { - printf("[WARNING]%s: collectWorkerInterface transaction error:%s\n", rd->describeNode().c_str(), e.what()); - wait( tr.onError(e) ); - } - } - ASSERT(agents.size() >= min_num_workers); // ASSUMPTION: We must have at least 1 loader and 1 applier - - TraceEvent("FastRestore").detail("CollectWorkerInterfaceNumWorkers", rd->workers_interface.size()); - - return Void(); - } - - // Periodically send worker heartbeat to - ACTOR Future monitorWorkerLiveness(Reference rd) { - ASSERT( !rd->workers_interface.empty() ); + ACTOR Future monitorWorkerLiveness(Reference self) { + ASSERT( !self->workers_workerInterface.empty() ); state int wIndex = 0; - for (auto &workerInterf : rd->workers_interface) { - printf("[Worker:%d][UID:%s][Interf.NodeInfo:%s]\n", wIndex, workerInterf.first.toString().c_str(), workerInterf.second.nodeID.toString().c_str()); + for (auto &workerInterf : self->workers_workerInterface) { + printf("[Worker:%d][UID:%s][Interf.NodeInfo:%s]\n", wIndex, workerInterf.first.toString().c_str(), workerInterf.second.id().toString().c_str()); wIndex++; } state std::vector> cmdReplies; - state std::map::iterator workerInterf; + state std::map::iterator workerInterf; loop { wIndex = 0; - for ( workerInterf = rd->workers_interface.begin(); workerInterf != rd->workers_interface.end(); workerInterf++) { + self->cmdID.initPhase(RestoreCommandEnum::Heart_Beat); + for ( workerInterf = self->workers_workerInterface.begin(); workerInterf != self->workers_workerInterface.end(); workerInterf++) { + self->cmdID.nextCmd(); try { wait( delay(1.0) ); - cmdReplies.push_back( workerInterf->second.heartbeat.getReply(RestoreSimpleRequest(rd->cmdID)) ); + cmdReplies.push_back( workerInterf->second.heartbeat.getReply(RestoreSimpleRequest(self->cmdID)) ); std::vector reps = wait( timeoutError(getAll(cmdReplies), 
FastRestore_Failure_Timeout) ); cmdReplies.clear(); wIndex++; } catch (Error &e) { // Handle the command reply timeout error - fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. error code:%d, error message:%s\n", rd->describeNode().c_str(), - rd->cmdID.toString().c_str(), e.code(), e.what()); - printf("[Heartbeat: Node may be down][Worker:%d][UID:%s][Interf.NodeInfo:%s]\n", wIndex, workerInterf->first.toString().c_str(), workerInterf->second.nodeID.toString().c_str()); + fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. error code:%d, error message:%s\n", self->describeNode().c_str(), + self->cmdID.toString().c_str(), e.code(), e.what()); + printf("[Heartbeat: Node may be down][Worker:%d][UID:%s][Interf.NodeInfo:%s]\n", wIndex, workerInterf->first.toString().c_str(), workerInterf->second.id().toString().c_str()); } } wait( delay(30.0) ); } - - //return Void(); } -// Set roles (Loader or Applier) for workers and ask all workers to share their interface -// The master node's localNodeStatus has been set outside of this function -ACTOR Future configureRoles(Reference rd) { - printf("%s:Start configuring roles for workers\n", rd->describeNode().c_str()); - // Set up the role, and the global status for each node - int numNodes = rd->workers_interface.size(); - int numLoader = numNodes * ratio_loader_to_applier / (ratio_loader_to_applier + 1); - int numApplier = numNodes - numLoader; - if (numLoader <= 0 || numApplier <= 0) { - ASSERT( numLoader > 0 ); // Quick check in correctness - ASSERT( numApplier > 0 ); - fprintf(stderr, "[ERROR] not enough nodes for loader and applier. 
numLoader:%d, numApplier:%d, ratio_loader_to_applier:%d, numAgents:%d\n", numLoader, numApplier, ratio_loader_to_applier, numNodes); - } else { - printf("Node%s: Configure roles numWorkders:%d numLoader:%d numApplier:%d\n", rd->describeNode().c_str(), numNodes, numLoader, numApplier); - } - - rd->localNodeStatus.nodeIndex = 0; // Master has nodeIndex = 0 - - // The first numLoader nodes will be loader, and the rest nodes will be applier - int nodeIndex = 1; // worker's nodeIndex starts from 1 - for (auto &workerInterf : rd->workers_interface) { - // globalNodeStatus does not include the master's info because master holds globalNodeStatus - rd->globalNodeStatus.push_back(RestoreNodeStatus()); - rd->globalNodeStatus.back().nodeID = workerInterf.second.id(); - rd->globalNodeStatus.back().nodeIndex = nodeIndex; - if ( nodeIndex < numLoader + 1) { - rd->globalNodeStatus.back().init(RestoreRole::Loader); - } else { - rd->globalNodeStatus.back().init(RestoreRole::Applier); - } - nodeIndex++; - } - - // Set the last Applier as the master applier - rd->masterApplier = rd->globalNodeStatus.back().nodeID; - printf("masterApplier ID:%s\n", rd->masterApplier.toString().c_str()); - - // Notify each worker about the worker's role - state int index = 0; - state RestoreRole role; - state UID nodeID; - printf("Node:%s Start configuring roles for workers\n", rd->describeNode().c_str()); - rd->cmdID.initPhase(RestoreCommandEnum::Set_Role); - loop { - try { - wait(delay(1.0)); - std::vector> cmdReplies; - index = 0; - for (auto &workerInterf : rd->workers_interface) { - role = rd->globalNodeStatus[index].role; - nodeID = rd->globalNodeStatus[index].nodeID; - rd->cmdID.nextCmd(); - printf("[CMD:%s] Node:%s Set role (%s) to node (index=%d uid=%s)\n", rd->cmdID.toString().c_str(), rd->describeNode().c_str(), - getRoleStr(role).c_str(), index, nodeID.toString().c_str()); - cmdReplies.push_back( workerInterf.second.setRole.getReply(RestoreSetRoleRequest(rd->cmdID, role, index, 
rd->masterApplier)) ); - index++; - } - std::vector reps = wait( timeoutError(getAll(cmdReplies), FastRestore_Failure_Timeout) ); - printf("[SetRole] Finished\n"); - break; - } catch (Error &e) { - // Handle the command reply timeout error - fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. error code:%d, error message:%s\n", rd->describeNode().c_str(), - rd->cmdID.toString().c_str(), e.code(), e.what()); - printf("Node:%s waits on replies time out. Current phase: Set_Role, Retry all commands.\n", rd->describeNode().c_str()); - } - } - - // Sanity check roles configuration - std::pair numWorkers = getNumLoaderAndApplier(rd); - int numLoaders = numWorkers.first; - int numAppliers = numWorkers.second; - ASSERT( rd->globalNodeStatus.size() > 0 ); - ASSERT( numLoaders > 0 ); - ASSERT( numAppliers > 0 ); - - printf("Node:%s finish configure roles\n", rd->describeNode().c_str()); - - return Void(); -} - -// Ask each restore worker to share its restore interface -ACTOR Future notifyWorkersToSetWorkersInterface(Reference rd) { - state int index = 0; - loop { - try { - wait(delay(1.0)); - index = 0; - std::vector> cmdReplies; - for(auto& workersInterface : rd->workers_interface) { - rd->cmdID.nextCmd(); - printf("[CMD:%s] Node:%s setWorkerInterface for node (index=%d uid=%s)\n", - rd->cmdID.toString().c_str(), rd->describeNode().c_str(), - index, rd->globalNodeStatus[index].nodeID.toString().c_str()); - cmdReplies.push_back( workersInterface.second.setWorkerInterface.getReply(RestoreSimpleRequest(rd->cmdID)) ); - index++; - } - std::vector reps = wait( timeoutError(getAll(cmdReplies), FastRestore_Failure_Timeout) ); - printf("[setWorkerInterface] Finished\n"); - break; - } catch (Error &e) { - fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. error code:%d, error message:%s\n", rd->describeNode().c_str(), - rd->cmdID.toString().c_str(), e.code(), e.what()); - printf("Node:%s waits on replies time out. 
Current phase: setWorkerInterface, Retry all commands.\n", rd->describeNode().c_str()); - } - } - - return Void(); -} - -void printApplierKeyRangeInfo(std::map> appliers) { - printf("[INFO] appliers num:%ld\n", appliers.size()); - int index = 0; - for(auto &applier : appliers) { - printf("\t[INFO][Applier:%d] ID:%s --> KeyRange:%s\n", index, applier.first.toString().c_str(), applier.second.toString().c_str()); - } -} - -ACTOR Future assignKeyRangeToAppliers(Reference rd, Database cx) { //, VectorRef ret_agents - //construct the key range for each applier - std::vector lowerBounds; - std::vector> keyRanges; - std::vector applierIDs; - - // printf("[INFO] Node:%s, Assign key range to appliers. num_appliers:%ld\n", rd->describeNode().c_str(), rd->range2Applier.size()); - for (auto& applier : rd->range2Applier) { - lowerBounds.push_back(applier.first); - applierIDs.push_back(applier.second); - // printf("\t[INFO] ApplierID:%s lowerBound:%s\n", - // applierIDs.back().toString().c_str(), - // lowerBounds.back().toString().c_str()); - } - for (int i = 0; i < lowerBounds.size(); ++i) { - KeyRef startKey = lowerBounds[i]; - KeyRef endKey; - if ( i < lowerBounds.size() - 1) { - endKey = lowerBounds[i+1]; - } else { - endKey = normalKeys.end; - } - - if (startKey > endKey) { - fprintf(stderr, "ERROR at assignKeyRangeToAppliers, startKey:%s > endKey:%s\n", startKey.toString().c_str(), endKey.toString().c_str()); - } - - keyRanges.push_back(KeyRangeRef(startKey, endKey)); - } - - ASSERT( applierIDs.size() == keyRanges.size() ); - state std::map> appliers; - appliers.clear(); // If this function is called more than once in multiple version batches, appliers may carry over the data from earlier version batch - for (int i = 0; i < applierIDs.size(); ++i) { - if (appliers.find(applierIDs[i]) != appliers.end()) { - printf("[ERROR] ApplierID appear more than once. 
appliers size:%ld applierID: %s\n", - appliers.size(), applierIDs[i].toString().c_str()); - printApplierKeyRangeInfo(appliers); - } - ASSERT( appliers.find(applierIDs[i]) == appliers.end() ); // we should not have a duplicate applierID respoinsbile for multiple key ranges - appliers.insert(std::make_pair(applierIDs[i], keyRanges[i])); - } - - state std::vector> cmdReplies; - loop { - try { - cmdReplies.clear(); - rd->cmdID.initPhase(RestoreCommandEnum::Assign_Applier_KeyRange); - for (auto& applier : appliers) { - KeyRangeRef keyRange = applier.second; - UID nodeID = applier.first; - ASSERT(rd->workers_interface.find(nodeID) != rd->workers_interface.end()); - RestoreInterface& cmdInterf = rd->workers_interface[nodeID]; - printf("[CMD] Node:%s, Assign KeyRange:%s [begin:%s end:%s] to applier ID:%s\n", rd->describeNode().c_str(), - keyRange.toString().c_str(), - getHexString(keyRange.begin).c_str(), getHexString(keyRange.end).c_str(), - nodeID.toString().c_str()); - rd->cmdID.nextCmd(); - cmdReplies.push_back( cmdInterf.setApplierKeyRangeRequest.getReply(RestoreSetApplierKeyRangeRequest(rd->cmdID, nodeID, keyRange)) ); - - } - printf("[INFO] Wait for %ld applier to accept the cmd Assign_Applier_KeyRange\n", appliers.size()); - std::vector reps = wait( timeoutError(getAll(cmdReplies), FastRestore_Failure_Timeout) ); - for (int i = 0; i < reps.size(); ++i) { - printf("[INFO] Get reply:%s for Assign_Applier_KeyRange\n", - reps[i].toString().c_str()); - } - - break; - } catch (Error &e) { - if (e.code() != error_code_io_timeout) { - fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s timeout\n", rd->describeNode().c_str(), rd->cmdID.toString().c_str()); - } else { - fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. 
error code:%d, error message:%s\n", rd->describeNode().c_str(), - rd->cmdID.toString().c_str(), e.code(), e.what()); - } - } - } - - return Void(); -} - -// Notify loader about appliers' responsible key range -ACTOR Future notifyAppliersKeyRangeToLoader(Reference rd, Database cx) { - state std::vector loaders = getLoaderIDs(rd); - state std::vector> cmdReplies; - state Standalone> appliers; - state Standalone> ranges; - - state std::map, UID>::iterator applierRange; - for (applierRange = rd->range2Applier.begin(); applierRange != rd->range2Applier.end(); applierRange++) { - KeyRef beginRange = applierRange->first; - KeyRange range(KeyRangeRef(beginRange, beginRange)); // TODO: Use the end of key range - appliers.push_back(appliers.arena(), applierRange->second); - ranges.push_back(ranges.arena(), range); - } - - printf("Notify_Loader_ApplierKeyRange: number of appliers:%d\n", appliers.size()); - ASSERT( appliers.size() == ranges.size() && appliers.size() != 0 ); - - rd->cmdID.initPhase( RestoreCommandEnum::Notify_Loader_ApplierKeyRange ); - state UID nodeID; - state int i = 0; - for (i = 0; i < loaders.size(); ++i) { - nodeID = loaders[i]; - rd->cmdID.nextCmd(); - ASSERT(rd->workers_interface.find(nodeID) != rd->workers_interface.end()); - loop { - try { - cmdReplies.clear(); - RestoreInterface& cmdInterf = rd->workers_interface[nodeID]; - printf("[CMD] Node:%s Notify node:%s about appliers key range\n", rd->describeNode().c_str(), nodeID.toString().c_str()); - //cmdReplies.push_back( cmdInterf.setApplierKeyRangeRequest.getReply(RestoreSetApplierKeyRangeRequest(rd->cmdID, applierRange->second, range)) ); - cmdReplies.push_back( cmdInterf.setApplierKeyRangeVectorRequest.getReply(RestoreSetApplierKeyRangeVectorRequest(rd->cmdID, appliers, ranges)) ); - printf("[INFO] Wait for node:%s to accept the cmd Notify_Loader_ApplierKeyRange\n", nodeID.toString().c_str()); - std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout ) ); - for (int i 
= 0; i < reps.size(); ++i) { - printf("[INFO] Get reply:%s from Notify_Loader_ApplierKeyRange cmd for node.\n", - reps[i].toString().c_str()); - } - cmdReplies.clear(); - break; - } catch (Error &e) { - fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s timeout\n", rd->describeNode().c_str(), rd->cmdID.toString().c_str()); - } - } - } - - return Void(); -} - - -void printLowerBounds(std::vector> lowerBounds) { - if ( debug_verbose == false ) - return; - - printf("[INFO] Print out %ld keys in the lowerbounds\n", lowerBounds.size()); - for (int i = 0; i < lowerBounds.size(); i++) { - printf("\t[INFO][%d] %s\n", i, getHexString(lowerBounds[i]).c_str()); - } -} - -std::vector> _calculateAppliersKeyRanges(Reference rd, int numAppliers) { - ASSERT(numAppliers > 0); - std::vector> lowerBounds; - int numSampledMutations = 0; - for (auto &count : rd->keyOpsCount) { - numSampledMutations += count.second; - } - - //intervalLength = (numSampledMutations - remainder) / (numApplier - 1) - int intervalLength = std::max(numSampledMutations / numAppliers, 1); // minimal length is 1 - int curCount = 0; - int curInterval = 0; - - printf("[INFO] Node:%s calculateAppliersKeyRanges(): numSampledMutations:%d numAppliers:%d intervalLength:%d\n", - rd->describeNode().c_str(), - rd->numSampledMutations, numAppliers, intervalLength); - for (auto &count : rd->keyOpsCount) { - if (curCount >= curInterval * intervalLength) { - printf("[INFO] Node:%s calculateAppliersKeyRanges(): Add a new key range [%d]:%s: curCount:%d\n", - rd->describeNode().c_str(), curInterval, count.first.toString().c_str(), curCount); - lowerBounds.push_back(count.first); // The lower bound of the current key range - curInterval++; - } - curCount += count.second; - } - - if ( lowerBounds.size() != numAppliers ) { - printf("[WARNING] calculateAppliersKeyRanges() WE MAY NOT USE ALL APPLIERS efficiently! 
num_keyRanges:%ld numAppliers:%d\n", - lowerBounds.size(), numAppliers); - printLowerBounds(lowerBounds); - } - - //ASSERT(lowerBounds.size() <= numAppliers + 1); // We may have at most numAppliers + 1 key ranges - if ( lowerBounds.size() >= numAppliers ) { - printf("[WARNING] Key ranges number:%ld > numAppliers:%d. Merge the last ones\n", lowerBounds.size(), numAppliers); - } - - while ( lowerBounds.size() >= numAppliers ) { - printf("[WARNING] Key ranges number:%ld > numAppliers:%d. Merge the last ones\n", lowerBounds.size(), numAppliers); - lowerBounds.pop_back(); - } - - return lowerBounds; -} - -ACTOR Future>> collectRestoreRequests(Database cx) { - state int restoreId = 0; - state int checkNum = 0; - state Standalone> restoreRequests; - state Future watch4RestoreRequest; - - //wait for the restoreRequestTriggerKey to be set by the client/test workload - state ReadYourWritesTransaction tr2(cx); - - loop { - try { - tr2.reset(); // The transaction may fail! Must full reset the transaction - tr2.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr2.setOption(FDBTransactionOptions::LOCK_AWARE); - // Assumption: restoreRequestTriggerKey has not been set - // Question: What if restoreRequestTriggerKey has been set? we will stuck here? - // Question: Can the following code handle the situation? - // Note: restoreRequestTriggerKey may be set before the watch is set or may have a conflict when the client sets the same key - // when it happens, will we stuck at wait on the watch? - - watch4RestoreRequest = tr2.watch(restoreRequestTriggerKey); - wait(tr2.commit()); - printf("[INFO][Master] Finish setting up watch for restoreRequestTriggerKey\n"); - break; - } catch(Error &e) { - printf("[WARNING] Transaction for restore request in watch restoreRequestTriggerKey. Error:%s\n", e.name()); - wait(tr2.onError(e)); - } - }; - - - loop { - try { - tr2.reset(); // The transaction may fail! 
Must full reset the transaction - tr2.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr2.setOption(FDBTransactionOptions::LOCK_AWARE); - // Assumption: restoreRequestTriggerKey has not been set - // Before we wait on the watch, we must make sure the key is not there yet! - //printf("[INFO][Master] Make sure restoreRequestTriggerKey does not exist before we wait on the key\n"); - Optional triggerKey = wait( tr2.get(restoreRequestTriggerKey) ); - if ( triggerKey.present() ) { - printf("!!! restoreRequestTriggerKey (and restore requests) is set before restore agent waits on the request. Restore agent can immediately proceed\n"); - break; - } - wait(watch4RestoreRequest); - printf("[INFO][Master] restoreRequestTriggerKey watch is triggered\n"); - break; - } catch(Error &e) { - printf("[WARNING] Transaction for restore request at wait on watch restoreRequestTriggerKey. Error:%s\n", e.name()); - wait(tr2.onError(e)); - } - }; - - loop { - try { - tr2.reset(); - tr2.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr2.setOption(FDBTransactionOptions::LOCK_AWARE); - - state Optional numRequests = wait(tr2.get(restoreRequestTriggerKey)); - int num = decodeRestoreRequestTriggerValue(numRequests.get()); - //TraceEvent("RestoreRequestKey").detail("NumRequests", num); - printf("[INFO] RestoreRequestNum:%d\n", num); - - state Standalone restoreRequestValues = wait(tr2.getRange(restoreRequestKeys, CLIENT_KNOBS->TOO_MANY)); - printf("Restore worker get restoreRequest: %s\n", restoreRequestValues.toString().c_str()); - - ASSERT(!restoreRequestValues.more); - - if(restoreRequestValues.size()) { - for ( auto &it : restoreRequestValues ) { - printf("Now decode restore request value...\n"); - restoreRequests.push_back(restoreRequests.arena(), decodeRestoreRequestValue(it.value)); - } - } - break; - } catch(Error &e) { - printf("[WARNING] Transaction error: collect restore requests. 
Error:%s\n", e.name()); - wait(tr2.onError(e)); - } - }; - - return restoreRequests; -} - -void initBackupContainer(Reference rd, Key url) { - if ( rd->bcUrl == url && rd->bc.isValid() ) { - return; - } - printf("initBackupContainer, url:%s\n", url.toString().c_str()); - rd->bcUrl = url; - rd->bc = IBackupContainer::openContainer(url.toString()); - //state BackupDescription desc = wait(rd->bc->describeBackup()); - //return Void(); -} - -// NOTE: This function can now get the backup file descriptors -ACTOR static Future collectBackupFiles(Reference rd, Database cx, RestoreRequest request) { - state Key tagName = request.tagName; - state Key url = request.url; - state bool waitForComplete = request.waitForComplete; - state Version targetVersion = request.targetVersion; - state bool verbose = request.verbose; - state KeyRange range = request.range; - state Key addPrefix = request.addPrefix; - state Key removePrefix = request.removePrefix; - state bool lockDB = request.lockDB; - state UID randomUid = request.randomUid; - - ASSERT( lockDB == true ); - - initBackupContainer(rd, url); - - state Reference bc = rd->bc; - state BackupDescription desc = wait(bc->describeBackup()); - - wait(desc.resolveVersionTimes(cx)); - - printf("[INFO] Backup Description\n%s", desc.toString().c_str()); - printf("[INFO] Restore for url:%s, lockDB:%d\n", url.toString().c_str(), lockDB); - if(targetVersion == invalidVersion && desc.maxRestorableVersion.present()) - targetVersion = desc.maxRestorableVersion.get(); - - printf("[INFO] collectBackupFiles: now getting backup files for restore request: %s\n", request.toString().c_str()); - Optional restorable = wait(bc->getRestoreSet(targetVersion)); - - if(!restorable.present()) { - printf("[WARNING] restoreVersion:%ld (%lx) is not restorable!\n", targetVersion, targetVersion); - throw restore_missing_data(); - } - - if (!rd->files.empty()) { - printf("[WARNING] global files are not empty! files.size() is %ld. 
We forcely clear files\n", rd->files.size()); - rd->files.clear(); - } - - printf("[INFO] Found backup files: num of files:%ld\n", rd->files.size()); - for(const RangeFile &f : restorable.get().ranges) { - TraceEvent("FoundRangeFileMX").detail("FileInfo", f.toString()); - printf("[INFO] FoundRangeFile, fileInfo:%s\n", f.toString().c_str()); - RestoreFileFR file(f.version, f.fileName, true, f.blockSize, f.fileSize, f.version, f.version); - rd->files.push_back(file); - } - for(const LogFile &f : restorable.get().logs) { - TraceEvent("FoundLogFileMX").detail("FileInfo", f.toString()); - printf("[INFO] FoundLogFile, fileInfo:%s\n", f.toString().c_str()); - RestoreFileFR file(f.beginVersion, f.fileName, false, f.blockSize, f.fileSize, f.endVersion, f.beginVersion); - rd->files.push_back(file); - } - - printf("[INFO] Restoring backup to version: %lld\n", (long long) targetVersion); - - return Void(); -} - -// The manager that manage the control of sampling workload -ACTOR static Future sampleWorkload(Reference rd, RestoreRequest request, Reference restoreConfig, int64_t sampleMB_input) { - state Key tagName = request.tagName; - state Key url = request.url; - state bool waitForComplete = request.waitForComplete; - state Version targetVersion = request.targetVersion; - state bool verbose = request.verbose; - state KeyRange restoreRange = request.range; - state Key addPrefix = request.addPrefix; - state Key removePrefix = request.removePrefix; - state bool lockDB = request.lockDB; - state UID randomUid = request.randomUid; - state Key mutationLogPrefix = restoreConfig->mutationLogPrefix(); - - state bool allLoadReqsSent = false; - state std::vector loaderIDs = getLoaderIDs(rd); - state std::vector applierIDs = getApplierIDs(rd); - state std::vector finishedLoaderIDs; - state int64_t sampleMB = sampleMB_input; //100; - state int64_t sampleB = sampleMB * 1024 * 1024; // Sample a block for every sampleB bytes. 
// Should adjust this value differently for simulation mode and real mode - state int64_t curFileIndex = 0; - state int64_t curFileOffset = 0; - state int64_t loadSizeB = 0; - state int64_t loadingCmdIndex = 0; - state int64_t sampleIndex = 0; - state double totalBackupSizeB = 0; - state double samplePercent = 0.05; // sample 1 data block per samplePercent (0.01) of data. num_sample = 1 / samplePercent - - // We should sample 1% data - for (int i = 0; i < rd->files.size(); i++) { - totalBackupSizeB += rd->files[i].fileSize; - } - sampleB = std::max((int) (samplePercent * totalBackupSizeB), 10 * 1024 * 1024); // The minimal sample size is 10MB - printf("Node:%s totalBackupSizeB:%.1fB (%.1fMB) samplePercent:%.2f, sampleB:%ld\n", rd->describeNode().c_str(), - totalBackupSizeB, totalBackupSizeB / 1024 / 1024, samplePercent, sampleB); - - // Step: Distribute sampled file blocks to loaders to sample the mutations - rd->cmdID.initPhase(RestoreCommandEnum::Sample_Range_File); - curFileIndex = 0; - state CMDUID checkpointCMDUID = rd->cmdID; - state int checkpointCurFileIndex = curFileIndex; - state int64_t checkpointCurFileOffset = 0; - state std::vector> cmdReplies; - state RestoreCommandEnum cmdType; - loop { // For retry on timeout - try { - if ( allLoadReqsSent ) { - break; // All load requests have been handled - } - wait(delay(1.0)); - - cmdReplies.clear(); - - printf("[Sampling] Node:%s We will sample the workload among %ld backup files.\n", rd->describeNode().c_str(), rd->files.size()); - printf("[Sampling] Node:%s totalBackupSizeB:%.1fB (%.1fMB) samplePercent:%.2f, sampleB:%ld, loadSize:%dB sampleIndex:%ld\n", rd->describeNode().c_str(), - totalBackupSizeB, totalBackupSizeB / 1024 / 1024, samplePercent, sampleB, loadSizeB, sampleIndex); - for (auto &loaderID : loaderIDs) { - // Find the sample file - while ( curFileIndex < rd->files.size() && rd->files[curFileIndex].fileSize == 0 ) { - // NOTE: && rd->files[curFileIndex].cursor >= rd->files[curFileIndex].fileSize - 
printf("[Sampling] File %ld:%s filesize:%ld skip the file\n", curFileIndex, - rd->files[curFileIndex].fileName.c_str(), rd->files[curFileIndex].fileSize); - curFileOffset = 0; - curFileIndex++; - } - // Find the next sample point - while ( loadSizeB / sampleB < sampleIndex && curFileIndex < rd->files.size() ) { - if (rd->files[curFileIndex].fileSize == 0) { - // NOTE: && rd->files[curFileIndex].cursor >= rd->files[curFileIndex].fileSize - printf("[Sampling] File %ld:%s filesize:%ld skip the file\n", curFileIndex, - rd->files[curFileIndex].fileName.c_str(), rd->files[curFileIndex].fileSize); - curFileIndex++; - curFileOffset = 0; - continue; - } - if ( loadSizeB / sampleB >= sampleIndex ) { - break; - } - if (curFileIndex >= rd->files.size()) { - break; - } - loadSizeB += std::min( rd->files[curFileIndex].blockSize, std::max(rd->files[curFileIndex].fileSize - curFileOffset * rd->files[curFileIndex].blockSize, (int64_t) 0) ); - curFileOffset++; - if ( rd->files[curFileIndex].blockSize == 0 || curFileOffset >= rd->files[curFileIndex].fileSize / rd->files[curFileIndex].blockSize ) { - curFileOffset = 0; - curFileIndex++; - } - } - if ( curFileIndex >= rd->files.size() ) { - allLoadReqsSent = true; - break; - } - - //sampleIndex++; - - // Notify loader to sample the file - LoadingParam param; - param.url = request.url; - param.version = rd->files[curFileIndex].version; - param.filename = rd->files[curFileIndex].fileName; - param.offset = curFileOffset * rd->files[curFileIndex].blockSize; // The file offset in bytes - //param.length = std::min(rd->files[curFileIndex].fileSize - rd->files[curFileIndex].cursor, loadSizeB); - param.length = std::min(rd->files[curFileIndex].blockSize, std::max((int64_t)0, rd->files[curFileIndex].fileSize - param.offset)); - loadSizeB += param.length; - sampleIndex = std::ceil(loadSizeB / sampleB); - curFileOffset++; - - //loadSizeB = param.length; - param.blockSize = rd->files[curFileIndex].blockSize; - param.restoreRange = restoreRange; - 
param.addPrefix = addPrefix; - param.removePrefix = removePrefix; - param.mutationLogPrefix = mutationLogPrefix; - if ( !(param.length > 0 && param.offset >= 0 && param.offset < rd->files[curFileIndex].fileSize) ) { - printf("[ERROR] param: length:%ld offset:%ld fileSize:%ld for %ldth file:%s\n", - param.length, param.offset, rd->files[curFileIndex].fileSize, curFileIndex, - rd->files[curFileIndex].toString().c_str()); - } - - - printf("[Sampling][File:%ld] filename:%s offset:%ld blockSize:%ld filesize:%ld loadSize:%ldB sampleIndex:%ld\n", - curFileIndex, rd->files[curFileIndex].fileName.c_str(), curFileOffset, - rd->files[curFileIndex].blockSize, rd->files[curFileIndex].fileSize, - loadSizeB, sampleIndex); - - - ASSERT( param.length > 0 ); - ASSERT( param.offset >= 0 ); - ASSERT( param.offset <= rd->files[curFileIndex].fileSize ); - UID nodeID = loaderID; - - ASSERT(rd->workers_interface.find(nodeID) != rd->workers_interface.end()); - RestoreInterface& cmdInterf = rd->workers_interface[nodeID]; - printf("[Sampling][CMD] Node:%s Loading %s on node %s\n", - rd->describeNode().c_str(), param.toString().c_str(), nodeID.toString().c_str()); - - rd->cmdID.nextCmd(); // The cmd index is the i^th file (range or log file) to be processed - if (!rd->files[curFileIndex].isRange) { - cmdType = RestoreCommandEnum::Sample_Log_File; - rd->cmdID.setPhase(RestoreCommandEnum::Sample_Log_File); - cmdReplies.push_back( cmdInterf.sampleLogFile.getReply(RestoreLoadFileRequest(rd->cmdID, param)) ); - } else { - cmdType = RestoreCommandEnum::Sample_Range_File; - rd->cmdID.setPhase(RestoreCommandEnum::Sample_Range_File); - cmdReplies.push_back( cmdInterf.sampleRangeFile.getReply(RestoreLoadFileRequest(rd->cmdID, param)) ); - } - - printf("[Sampling] Master cmdType:%d cmdUID:%s isRange:%d destinationNode:%s\n", - (int) cmdType, rd->cmdID.toString().c_str(), (int) rd->files[curFileIndex].isRange, - nodeID.toString().c_str()); - - if (param.offset + param.length >= 
rd->files[curFileIndex].fileSize) { // Reach the end of the file - curFileIndex++; - curFileOffset = 0; - } - if ( curFileIndex >= rd->files.size() ) { - allLoadReqsSent = true; - break; - } - ++loadingCmdIndex; - } - - printf("[Sampling] Wait for %ld loaders to accept the cmd Sample_Range_File or Sample_Log_File\n", cmdReplies.size()); - - if ( !cmdReplies.empty() ) { - //TODO: change to getAny. NOTE: need to keep the still-waiting replies - std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout ) ); - //std::vector reps = wait( getAll(cmdReplies) ); - - finishedLoaderIDs.clear(); - for (int i = 0; i < reps.size(); ++i) { - printf("[Sampling][%d out of %d] Get reply:%s for Sample_Range_File or Sample_Log_File\n", - i, reps.size(), reps[i].toString().c_str()); - finishedLoaderIDs.push_back(reps[i].id); - //int64_t repLoadingCmdIndex = reps[i].cmdIndex; - } - loaderIDs = finishedLoaderIDs; - checkpointCMDUID = rd->cmdID; - checkpointCurFileIndex = curFileIndex; - checkpointCurFileOffset = curFileOffset; - } - - if (allLoadReqsSent) { - printf("[Sampling] allLoadReqsSent, sampling finished\n"); - break; // NOTE: need to change when change to wait on any cmdReplies - } - - } catch (Error &e) { - // Handle the command reply timeout error - fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. 
error code:%d, error message:%s\n", rd->describeNode().c_str(), - rd->cmdID.toString().c_str(), e.code(), e.what()); - rd->cmdID = checkpointCMDUID; - curFileIndex = checkpointCurFileIndex; - curFileOffset = checkpointCurFileOffset; - allLoadReqsSent = false; - printf("[Sampling][Waring] Retry at CMDID:%s curFileIndex:%ld\n", rd->cmdID.toString().c_str(), curFileIndex); - } - } - - wait(delay(1.0)); - - // Ask master applier to calculate the key ranges for appliers - state int numKeyRanges = 0; - loop { - try { - printf("[Sampling][CMD] Ask master applier %s for the key ranges for appliers\n", rd->masterApplier.toString().c_str()); - RestoreInterface& cmdInterf = rd->workers_interface[rd->masterApplier]; - ASSERT(applierIDs.size() > 0); - rd->cmdID.initPhase(RestoreCommandEnum::Calculate_Applier_KeyRange); - rd->cmdID.nextCmd(); - GetKeyRangeNumberReply rep = wait( timeoutError( - cmdInterf.calculateApplierKeyRange.getReply(RestoreCalculateApplierKeyRangeRequest(rd->cmdID, applierIDs.size())), FastRestore_Failure_Timeout) ); - printf("[Sampling][CMDRep] number of key ranges calculated by master applier:%d\n", rep.keyRangeNum); - numKeyRanges = rep.keyRangeNum; - - if (numKeyRanges <= 0 || numKeyRanges >= applierIDs.size() ) { - printf("[WARNING] Calculate_Applier_KeyRange receives wrong reply (numKeyRanges:%ld) from other phases. applierIDs.size:%d Retry Calculate_Applier_KeyRange\n", numKeyRanges, applierIDs.size()); - continue; - } - - if ( numKeyRanges < applierIDs.size() ) { - printf("[WARNING][Sampling] numKeyRanges:%d < appliers number:%ld. %ld appliers will not be used!\n", - numKeyRanges, applierIDs.size(), applierIDs.size() - numKeyRanges); - } - - break; - } catch (Error &e) { - // Handle the command reply timeout error - fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. 
error code:%d, error message:%s\n", rd->describeNode().c_str(), - rd->cmdID.toString().c_str(), e.code(), e.what()); - printf("[Sampling] [Warning] Retry on Calculate_Applier_KeyRange\n"); - } - } - - wait(delay(1.0)); - - // Ask master applier to return the key range for appliers - state std::vector> keyRangeReplies; - loop { - try { - rd->range2Applier.clear(); - keyRangeReplies.clear(); // In case error happens in try loop - rd->cmdID.initPhase(RestoreCommandEnum::Get_Applier_KeyRange); - //rd->cmdID.nextCmd(); - for (int i = 0; i < applierIDs.size() && i < numKeyRanges; ++i) { - UID applierID = applierIDs[i]; - rd->cmdID.nextCmd(); - printf("[Sampling][Master] Node:%s, CMDID:%s Ask masterApplier:%s for the lower boundary of the key range for applier:%s\n", - rd->describeNode().c_str(), rd->cmdID.toString().c_str(), - rd->masterApplier.toString().c_str(), applierID.toString().c_str()); - ASSERT(rd->workers_interface.find(rd->masterApplier) != rd->workers_interface.end()); - RestoreInterface& masterApplierCmdInterf = rd->workers_interface[rd->masterApplier]; - keyRangeReplies.push_back( masterApplierCmdInterf.getApplierKeyRangeRequest.getReply( - RestoreGetApplierKeyRangeRequest(rd->cmdID, i)) ); - } - std::vector reps = wait( timeoutError( getAll(keyRangeReplies), FastRestore_Failure_Timeout) ); - - ASSERT( reps.size() <= applierIDs.size() ); - - // TODO: Directly use the replied lowerBound and upperBound - for (int i = 0; i < reps.size() && i < numKeyRanges; ++i) { - UID applierID = applierIDs[i]; - Standalone lowerBound = reps[i].lowerBound; - // if (i < numKeyRanges) { - // lowerBound = reps[i].lowerBound; - // } else { - // lowerBound = normalKeys.end; - // } - - if (i == 0) { - lowerBound = LiteralStringRef("\x00"); // The first interval must starts with the smallest possible key - } - printf("[INFO] Node:%s Assign key-to-applier map: Key:%s -> applierID:%s\n", rd->describeNode().c_str(), - getHexString(lowerBound).c_str(), applierID.toString().c_str()); - 
rd->range2Applier.insert(std::make_pair(lowerBound, applierID)); - } - - break; - } catch (Error &e) { - // TODO: Handle the command reply timeout error - fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. error code:%d, error message:%s\n", rd->describeNode().c_str(), - rd->cmdID.toString().c_str(), e.code(), e.what()); - printf("[Sampling] [Warning] Retry on Get_Applier_KeyRange\n"); - } - } - printf("[Sampling] rd->range2Applier has been set. Its size is:%d\n", rd->range2Applier.size()); - printAppliersKeyRange(rd); - - wait(delay(1.0)); - - return Void(); - -} - -bool isBackupEmpty(Reference rd) { - for (int i = 0; i < rd->files.size(); ++i) { - if (rd->files[i].fileSize > 0) { - return false; - } - } - return true; -} - -// Distribution workload per version batch -ACTOR static Future distributeWorkloadPerVersionBatch(RestoreInterface interf, Reference rd, Database cx, RestoreRequest request, Reference restoreConfig) { - state Key tagName = request.tagName; - state Key url = request.url; - state bool waitForComplete = request.waitForComplete; - state Version targetVersion = request.targetVersion; - state bool verbose = request.verbose; - state KeyRange restoreRange = request.range; - state Key addPrefix = request.addPrefix; - state Key removePrefix = request.removePrefix; - state bool lockDB = request.lockDB; - state UID randomUid = request.randomUid; - state Key mutationLogPrefix = restoreConfig->mutationLogPrefix(); - - if ( isBackupEmpty(rd) ) { - printf("[WARNING] Node:%s distributeWorkloadPerVersionBatch() load an empty batch of backup. 
Print out the empty backup files info.\n", rd->describeNode().c_str()); - printBackupFilesInfo(rd); - return Void(); - } - - printf("[INFO] Node:%s mutationLogPrefix:%s (hex value:%s)\n", rd->describeNode().c_str(), mutationLogPrefix.toString().c_str(), getHexString(mutationLogPrefix).c_str()); - - // Determine the key range each applier is responsible for - std::pair numWorkers = getNumLoaderAndApplier(rd); - int numLoaders = numWorkers.first; - int numAppliers = numWorkers.second; - ASSERT( rd->globalNodeStatus.size() > 0 ); - ASSERT( numLoaders > 0 ); - ASSERT( numAppliers > 0 ); - - state int loadingSizeMB = 0; //numLoaders * 1000; //NOTE: We want to load the entire file in the first version, so we want to make this as large as possible - int64_t sampleSizeMB = 0; //loadingSizeMB / 100; // Will be overwritten. The sampleSizeMB will be calculated based on the batch size - - state double startTime = now(); - state double startTimeBeforeSampling = now(); - // TODO: WiP Sample backup files to determine the key range for appliers - wait( sampleWorkload(rd, request, restoreConfig, sampleSizeMB) ); - wait( delay(1.0) ); - - printf("[Progress] distributeWorkloadPerVersionBatch sampling time:%.2f seconds\n", now() - startTime); - state double startTimeAfterSampling = now(); - - // Notify each applier about the key range it is responsible for, and notify appliers to be ready to receive data - startTime = now(); - wait( assignKeyRangeToAppliers(rd, cx) ); - wait( delay(1.0) ); - printf("[Progress] distributeWorkloadPerVersionBatch assignKeyRangeToAppliers time:%.2f seconds\n", now() - startTime); - - startTime = now(); - wait( notifyAppliersKeyRangeToLoader(rd, cx) ); - wait( delay(1.0) ); - printf("[Progress] distributeWorkloadPerVersionBatch notifyAppliersKeyRangeToLoader time:%.2f seconds\n", now() - startTime); - - // Determine which backup data block (filename, offset, and length) each loader is responsible for and - // Notify the loader about the data block and send 
the cmd to the loader to start loading the data - // Wait for the ack from loader and repeats - - // Prepare the file's loading status - for (int i = 0; i < rd->files.size(); ++i) { - rd->files[i].cursor = 0; - } - - // Send loading cmd to available loaders whenever loaders become available - // NOTE: We must split the workload in the correct boundary: - // For range file, it's the block boundary; - // For log file, it is the version boundary. - // This is because - // (1) The set of mutations at a version may be encoded in multiple KV pairs in log files. - // We need to concatenate the related KVs to a big KV before we can parse the value into a vector of mutations at that version - // (2) The backuped KV are arranged in blocks in range file. - // For simplicity, we distribute at the granularity of files for now. - - state int loadSizeB = loadingSizeMB * 1024 * 1024; - state int loadingCmdIndex = 0; - state std::vector loaderIDs = getLoaderIDs(rd); - state std::vector applierIDs; - state std::vector finishedLoaderIDs = loaderIDs; - - - state int checkpointCurFileIndex = 0; - state long checkpointCurOffset = 0; - - startTime = now(); - // We should load log file before we do range file - state RestoreCommandEnum phaseType = RestoreCommandEnum::Assign_Loader_Log_File; - state std::vector> cmdReplies; - loop { - state int curFileIndex = 0; // The smallest index of the files that has not been FULLY loaded - state long curOffset = 0; - state bool allLoadReqsSent = false; - loop { - try { - if ( allLoadReqsSent ) { - break; // All load requests have been handled - } - wait(delay(1.0)); - - cmdReplies.clear(); - printf("[INFO] Number of backup files:%ld\n", rd->files.size()); - rd->cmdID.initPhase(phaseType); - for (auto &loaderID : loaderIDs) { - while ( curFileIndex < rd->files.size() && rd->files[curFileIndex].fileSize == 0 ) { - // NOTE: && rd->files[curFileIndex].cursor >= rd->files[curFileIndex].fileSize - printf("[INFO] File %ld:%s filesize:%ld skip the file\n", 
curFileIndex, - rd->files[curFileIndex].fileName.c_str(), rd->files[curFileIndex].fileSize); - curFileIndex++; - curOffset = 0; - } - if ( curFileIndex >= rd->files.size() ) { - allLoadReqsSent = true; - break; - } - LoadingParam param; - //rd->files[curFileIndex].cursor = 0; // This is a hacky way to make sure cursor is correct in current version when we load 1 file at a time - param.url = request.url; - param.version = rd->files[curFileIndex].version; - param.filename = rd->files[curFileIndex].fileName; - param.offset = curOffset; //rd->files[curFileIndex].cursor; - param.length = std::min(rd->files[curFileIndex].fileSize - curOffset, rd->files[curFileIndex].blockSize); - //param.length = rd->files[curFileIndex].fileSize; - loadSizeB = param.length; - param.blockSize = rd->files[curFileIndex].blockSize; - param.restoreRange = restoreRange; - param.addPrefix = addPrefix; - param.removePrefix = removePrefix; - param.mutationLogPrefix = mutationLogPrefix; - if ( !(param.length > 0 && param.offset >= 0 && param.offset < rd->files[curFileIndex].fileSize) ) { - printf("[ERROR] param: length:%ld offset:%ld fileSize:%ld for %ldth filename:%s\n", - param.length, param.offset, rd->files[curFileIndex].fileSize, curFileIndex, - rd->files[curFileIndex].fileName.c_str()); - } - ASSERT( param.length > 0 ); - ASSERT( param.offset >= 0 ); - ASSERT( param.offset < rd->files[curFileIndex].fileSize ); - rd->files[curFileIndex].cursor = rd->files[curFileIndex].cursor + param.length; - UID nodeID = loaderID; - // TODO: record the loading status - - ASSERT(rd->workers_interface.find(nodeID) != rd->workers_interface.end()); - RestoreInterface& cmdInterf = rd->workers_interface[nodeID]; - - RestoreCommandEnum cmdType = RestoreCommandEnum::Assign_Loader_Range_File; - if (rd->files[curFileIndex].isRange) { - cmdType = RestoreCommandEnum::Assign_Loader_Range_File; - rd->cmdID.setPhase(RestoreCommandEnum::Assign_Loader_Range_File); - } else { - cmdType = 
RestoreCommandEnum::Assign_Loader_Log_File; - rd->cmdID.setPhase(RestoreCommandEnum::Assign_Loader_Log_File); - } - - if ( (phaseType == RestoreCommandEnum::Assign_Loader_Log_File && rd->files[curFileIndex].isRange) - || (phaseType == RestoreCommandEnum::Assign_Loader_Range_File && !rd->files[curFileIndex].isRange) ) { - rd->files[curFileIndex].cursor = 0; - curFileIndex++; - curOffset = 0; - } else { // load the type of file in the phaseType - rd->cmdID.nextCmd(); - printf("[CMD] Loading fileIndex:%ld fileInfo:%s loadingParam:%s on node %s\n", - curFileIndex, rd->files[curFileIndex].toString().c_str(), - param.toString().c_str(), nodeID.toString().c_str()); // VERY USEFUL INFO - printf("[INFO] Node:%s CMDUID:%s cmdType:%d isRange:%d loaderNode:%s\n", rd->describeNode().c_str(), rd->cmdID.toString().c_str(), - (int) cmdType, (int) rd->files[curFileIndex].isRange, nodeID.toString().c_str()); - if (rd->files[curFileIndex].isRange) { - cmdReplies.push_back( cmdInterf.loadRangeFile.getReply(RestoreLoadFileRequest(rd->cmdID, param)) ); - } else { - cmdReplies.push_back( cmdInterf.loadLogFile.getReply(RestoreLoadFileRequest(rd->cmdID, param)) ); - } - curOffset += param.length; - - // Reach the end of the file - if ( param.length + param.offset >= rd->files[curFileIndex].fileSize ) { - curFileIndex++; - curOffset = 0; - } - - // if (param.length <= loadSizeB) { // Reach the end of the file - // ASSERT( rd->files[curFileIndex].cursor == rd->files[curFileIndex].fileSize ); - // curFileIndex++; - // } - } - - if ( curFileIndex >= rd->files.size() ) { - allLoadReqsSent = true; - break; - } - //++loadingCmdIndex; // Replaced by cmdUID - } - - printf("[INFO] Wait for %ld loaders to accept the cmd Assign_Loader_File\n", cmdReplies.size()); - - // Question: How to set reps to different value based on cmdReplies.empty()? - if ( !cmdReplies.empty() ) { - std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout ) ); //TODO: change to getAny. 
NOTE: need to keep the still-waiting replies - //std::vector reps = wait( getAll(cmdReplies) ); - - finishedLoaderIDs.clear(); - cmdReplies.clear(); - for (int i = 0; i < reps.size(); ++i) { - printf("[INFO] Get Ack reply:%s for Assign_Loader_File\n", - reps[i].toString().c_str()); - finishedLoaderIDs.push_back(reps[i].id); - //int64_t repLoadingCmdIndex = reps[i].cmdIndex; - } - //loaderIDs = finishedLoaderIDs; // loaderIDs are also used in enumerating all loaders. The finishedLoaderIDs can be different based on the getRply results - checkpointCurFileIndex = curFileIndex; // Save the previous success point - checkpointCurOffset = curOffset; - } - - // TODO: Let master print all nodes status. Note: We need a function to print out all nodes status - - if (allLoadReqsSent) { - printf("[INFO] allLoadReqsSent has finished.\n"); - break; // NOTE: need to change when change to wait on any cmdReplies - } - - } catch (Error &e) { - // TODO: Handle the command reply timeout error - fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. 
error code:%d, error message:%s\n", rd->describeNode().c_str(), - rd->cmdID.toString().c_str(), e.code(), e.what()); - curFileIndex = checkpointCurFileIndex; - curOffset = checkpointCurOffset; - } - } - - if (phaseType == RestoreCommandEnum::Assign_Loader_Log_File) { - phaseType = RestoreCommandEnum::Assign_Loader_Range_File; - } else if (phaseType == RestoreCommandEnum::Assign_Loader_Range_File) { - break; - } - } - - wait( delay(1.0) ); - printf("[Progress] distributeWorkloadPerVersionBatch loadFiles time:%.2f seconds\n", now() - startTime); - - ASSERT( cmdReplies.empty() ); - - wait( delay(5.0) ); - // Notify the applier to applly mutation to DB - - startTime = now(); - wait( notifyApplierToApplyMutations(rd) ); - printf("[Progress] distributeWorkloadPerVersionBatch applyToDB time:%.2f seconds\n", now() - startTime); - - state double endTime = now(); - - double runningTime = endTime - startTimeBeforeSampling; - printf("[Progress] Node:%s distributeWorkloadPerVersionBatch runningTime without sampling time:%.2f seconds, with sampling time:%.2f seconds\n", - rd->describeNode().c_str(), - runningTime, endTime - startTimeAfterSampling); - - return Void(); - -} - -ACTOR Future notifyApplierToApplyMutations(Reference rd) { - state std::vector appliers = getApplierIDs(rd); - state std::vector> cmdReplies; - loop { - try { - rd->cmdID.initPhase( RestoreCommandEnum::Apply_Mutation_To_DB ); - for (auto& nodeID : appliers) { - ASSERT(rd->workers_interface.find(nodeID) != rd->workers_interface.end()); - RestoreInterface& cmdInterf = rd->workers_interface[nodeID]; - printf("[CMD] Node:%s Notify node:%s to apply mutations to DB\n", rd->describeNode().c_str(), nodeID.toString().c_str()); - cmdReplies.push_back( cmdInterf.applyToDB.getReply(RestoreSimpleRequest(rd->cmdID)) ); - } - printf("[INFO] Wait for %ld appliers to apply mutations to DB\n", appliers.size()); - std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout ) ); - //std::vector reps = 
wait( getAll(cmdReplies) ); - printf("[INFO] %ld appliers finished applying mutations to DB\n", appliers.size()); - - cmdReplies.clear(); - - wait(delay(5.0)); - - break; - } catch (Error &e) { - fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. error code:%d, error message:%s\n", rd->describeNode().c_str(), - rd->cmdID.toString().c_str(), e.code(), e.what()); - } - } - - return Void(); -} - - -void sanityCheckMutationOps(Reference rd) { - if (rd->kvOps.empty()) - return; - - if ( isKVOpsSorted(rd) ) { - printf("[CORRECT] KVOps is sorted by version\n"); - } else { - printf("[ERROR]!!! KVOps is NOT sorted by version\n"); - } - - if ( allOpsAreKnown(rd) ) { - printf("[CORRECT] KVOps all operations are known.\n"); - } else { - printf("[ERROR]!!! KVOps has unknown mutation op. Exit...\n"); - } -} - -ACTOR Future sanityCheckRestoreOps(Reference rd, Database cx, UID uid) { - sanityCheckMutationOps(rd); - - state Reference tr(new ReadYourWritesTransaction(cx)); - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - - printf("Now apply KVOps to DB. start...\n"); - tr->reset(); - wait(checkDatabaseLock(tr, uid)); - wait(tr->commit()); - - return Void(); - -} void initRestoreWorkerConfig() { MIN_NUM_WORKERS = g_network->isSimulated() ? 
3 : 120; //10; // TODO: This can become a configuration param later @@ -2132,1958 +218,193 @@ void initRestoreWorkerConfig() { MIN_NUM_WORKERS, ratio_loader_to_applier, loadBatchSizeMB, loadBatchSizeThresholdB, transactionBatchSizeThreshold); } -ACTOR Future _restoreWorker(Database cx_input, LocalityData locality) { - state Database cx = cx_input; - state RestoreInterface interf; - interf.initEndpoints(); - state Optional leaderInterf; - //Global data for the worker - state Reference rd = Reference(new RestoreData()); - rd->localNodeStatus.nodeID = interf.id(); - initRestoreWorkerConfig(); +// Restore Worker +ACTOR Future commitRestoreRoleInterfaces(Reference self, Database cx) { + state ReadYourWritesTransaction tr(cx); + // For now, we assume only one role per restore worker + ASSERT( !(self->loaderInterf.present() && self->applierInterf.present()) ); - // Compete in registering its restoreInterface as the leader. - state Transaction tr(cx); loop { try { tr.reset(); tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::LOCK_AWARE); - Optional leader = wait(tr.get(restoreLeaderKey)); - if(leader.present()) { - leaderInterf = BinaryReader::fromStringRef(leader.get(), IncludeVersion()); - // NOTE: Handle the situation that the leader's commit of its key causes error(commit_unknown_result) - // In this situation, the leader will try to register its key again, which will never succeed. - // We should let leader escape from the infinite loop - if ( leaderInterf.get().id() == interf.id() ) { - printf("[Worker] NodeID:%s is the leader and has registered its key in commit_unknown_result error. 
Let it set the key again\n", - leaderInterf.get().id().toString().c_str()); - tr.set(restoreLeaderKey, BinaryWriter::toValue(interf, IncludeVersion())); - wait(tr.commit()); - // reset leaderInterf to invalid for the leader process - // because a process will not execute leader's logic unless leaderInterf is invalid - leaderInterf = Optional(); - break; - } - printf("[Worker] Leader key exists:%s. Worker registers its restore interface id:%s\n", - leaderInterf.get().id().toString().c_str(), interf.id().toString().c_str()); - tr.set(restoreWorkerKeyFor(interf.id()), restoreCommandInterfaceValue(interf)); - wait(tr.commit()); - break; + if ( self->loaderInterf.present() ) { + tr.set( restoreLoaderKeyFor(self->loaderInterf.get().id()), restoreLoaderInterfaceValue(self->loaderInterf.get()) ); } - printf("[Worker] NodeID:%s tries to register its interface as leader\n", interf.id().toString().c_str()); - tr.set(restoreLeaderKey, BinaryWriter::toValue(interf, IncludeVersion())); - wait(tr.commit()); + if ( self->applierInterf.present() ) { + tr.set( restoreApplierKeyFor(self->applierInterf.get().id()), restoreApplierInterfaceValue(self->applierInterf.get()) ); + } + wait (tr.commit() ); break; } catch( Error &e ) { - // ATTENTION: We may have error commit_unknown_result, the commit may or may not succeed! - // We must handle this error, otherwise, if the leader does not know its key has been registered, the leader will stuck here! 
- printf("[INFO] NodeID:%s restoreWorker select leader error, error code:%d error info:%s\n", - interf.id().toString().c_str(), e.code(), e.what()); + printf("[WARNING]%s: commitRestoreRoleInterfaces transaction error:%s\n", self->describeNode().c_str(), e.what()); wait( tr.onError(e) ); } } - //we are not the leader, so put our interface in the agent list - if(leaderInterf.present()) { - // Initialize the node's UID - //rd->localNodeStatus.nodeID = interf.id(); - wait( workerCore(rd, interf, cx) ); + return Void(); +} + +// Restore Worker +ACTOR Future handleRecruitRoleRequest(RestoreRecruitRoleRequest req, Reference self, ActorCollection *actors, Database cx) { + printf("[INFO][Worker] Node:%s get role %s\n", self->describeNode().c_str(), + getRoleStr(req.role).c_str()); + + if (req.role == RestoreRole::Loader) { + ASSERT( !self->loaderInterf.present() ); + self->loaderData = Reference(new RestoreLoaderData()); + self->loaderInterf = RestoreLoaderInterface(); + actors->add( restoreLoaderCore(self->loaderData, self->loaderInterf.get(), cx) ); + } else if (req.role == RestoreRole::Applier) { + ASSERT( !self->applierInterf.present() ); + self->applierData = Reference( new RestoreApplierData() ); + self->applierInterf = RestoreApplierInterface(); + actors->add( restoreApplierCore(self->applierData, self->applierInterf.get(), cx) ); } else { - wait( masterCore(rd, interf, cx) ); + TraceEvent(SevError, "FastRestore").detail("HandleRecruitRoleRequest", "UnknownRole"); //.detail("Request", req.printable()); } + wait( commitRestoreRoleInterfaces(self, cx) ); // Commit the interface after the interface is ready to accept requests + req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); + return Void(); } -ACTOR Future restoreWorker(Reference ccf, LocalityData locality) { - Database cx = Database::createDatabase(ccf->getFilename(), Database::API_VERSION_LATEST,locality); - wait(_restoreWorker(cx, locality)); - return Void(); -} -// ToDelete: If we can pass the 
correctness test -ACTOR static Future finishRestore(Reference rd, Database cx, Standalone> restoreRequests) { - // Make restore workers quit - state std::vector workersIDs = getWorkerIDs(rd); // All workers ID - state std::vector> cmdReplies; - state std::map::iterator workerInterf; - printGlobalNodeStatus(rd); +// Read restoreWorkersKeys from DB to get each restore worker's restore workerInterface and set it to self->workers_workerInterface +// This is done before we assign restore roles for restore workers + ACTOR Future collectRestoreWorkerInterface(Reference self, Database cx, int min_num_workers) { + state Transaction tr(cx); + + state vector agents; // agents is cmdsInterf + loop { try { - cmdReplies.clear(); - rd->cmdID.initPhase(RestoreCommandEnum::Finish_Restore); - - for ( workerInterf = rd->workers_interface.begin(); workerInterf != rd->workers_interface.end(); workerInterf++ ) { - if ( std::find(workersIDs.begin(), workersIDs.end(), workerInterf->first) == workersIDs.end() ) { - continue; // The workerInterf is not discovered at configureRoles and therefore not involve in restore + self->workers_workerInterface.clear(); + agents.clear(); + tr.reset(); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + Standalone agentValues = wait(tr.getRange(restoreWorkersKeys, CLIENT_KNOBS->TOO_MANY)); + ASSERT(!agentValues.more); + // If agentValues.size() < min_num_workers, we should wait for coming workers to register their workerInterface before we read them once for all + if(agentValues.size() >= min_num_workers) { + for(auto& it : agentValues) { + agents.push_back(BinaryReader::fromStringRef(it.value, IncludeVersion())); + // Save the RestoreWorkerInterface for the later operations + self->workers_workerInterface.insert(std::make_pair(agents.back().id(), agents.back())); + printf("collectWorkerInterface, workerInterface id:%s\n", agents.back().id().toString().c_str()); } - rd->cmdID.nextCmd(); - 
RestoreInterface &interf = workerInterf->second; - cmdReplies.push_back(interf.finishRestore.getReply(RestoreSimpleRequest(rd->cmdID))); - } - - if (!cmdReplies.empty()) { - std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout / 100 ) ); - //std::vector reps = wait( getAll(cmdReplies) ); - cmdReplies.clear(); - } - printf("All restore workers have quited\n"); - - break; - } catch(Error &e) { - printf("[ERROR] At sending finishRestore request. error code:%d message:%s. Retry...\n", e.code(), e.what()); - rd->workers_interface.clear(); - cmdReplies.clear(); - wait( collectWorkerInterface(rd, cx, 0) ); - } - } - - // Notify tester that the restore has finished - state ReadYourWritesTransaction tr3(cx); - loop { - try { - tr3.reset(); - tr3.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr3.setOption(FDBTransactionOptions::LOCK_AWARE); - tr3.clear(restoreRequestTriggerKey); - tr3.clear(restoreRequestKeys); - tr3.set(restoreRequestDoneKey, restoreRequestDoneValue(restoreRequests.size())); - wait(tr3.commit()); - TraceEvent("LeaderFinishRestoreRequest"); - printf("[INFO] RestoreLeader write restoreRequestDoneKey\n"); - - break; - } catch( Error &e ) { - TraceEvent("RestoreAgentLeaderErrorTr3").detail("ErrorCode", e.code()).detail("ErrorName", e.name()); - printf("[Error] RestoreLead operation on restoreRequestDoneKey, error:%s\n", e.what()); - wait( tr3.onError(e) ); - } - }; - - - // TODO: Validate that the range version map has exactly the restored ranges in it. This means that for any restore operation - // the ranges to restore must be within the backed up ranges, otherwise from the restore perspective it will appear that some - // key ranges were missing and so the backup set is incomplete and the restore has failed. - // This validation cannot be done currently because Restore only supports a single restore range but backups can have many ranges. 
- - // Clear the applyMutations stuff, including any unapplied mutations from versions beyond the restored version. - // restore.clearApplyMutationsKeys(tr); - - printf("[INFO] Notify the end of the restore\n"); - TraceEvent("NotifyRestoreFinished"); - - return Void(); -} - -////--- Restore functions -ACTOR static Future unlockDB(Database cx, UID uid) { - state Reference tr(new ReadYourWritesTransaction(cx)); - loop { - try { - tr->reset(); - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - printf("CheckDBlock:%s START\n", uid.toString().c_str()); - wait(checkDatabaseLock(tr, uid)); - printf("CheckDBlock:%s DONE\n", uid.toString().c_str()); - - printf("UnlockDB now. Start.\n"); - wait(unlockDatabase(tr, uid)); //NOTE: unlockDatabase didn't commit inside the function! - - printf("CheckDBlock:%s START\n", uid.toString().c_str()); - wait(checkDatabaseLock(tr, uid)); - printf("CheckDBlock:%s DONE\n", uid.toString().c_str()); - - printf("UnlockDB now. Commit.\n"); - wait( tr->commit() ); - - printf("UnlockDB now. Done.\n"); - break; - } catch( Error &e ) { - printf("Error when we unlockDB. 
Error:%s\n", e.what()); - wait(tr->onError(e)); - } - }; - - return Void(); - } - - struct FastRestoreStatus { - double curWorkloadSize; - double curRunningTime; - double curSpeed; - - double totalWorkloadSize; - double totalRunningTime; - double totalSpeed; -}; - -int restoreStatusIndex = 0; -ACTOR static Future registerStatus(Database cx, struct FastRestoreStatus status) { - state Reference tr(new ReadYourWritesTransaction(cx)); - loop { - try { - printf("[Restore_Status][%d] curWorkload:%.2f curRunningtime:%.2f curSpeed:%.2f totalWorkload:%.2f totalRunningTime:%.2f totalSpeed:%.2f\n", - restoreStatusIndex, status.curWorkloadSize, status.curRunningTime, status.curSpeed, status.totalWorkloadSize, status.totalRunningTime, status.totalSpeed); - - tr->reset(); - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - - tr->set(restoreStatusKeyFor(StringRef(std::string("curWorkload") + std::to_string(restoreStatusIndex))), restoreStatusValue(status.curWorkloadSize)); - tr->set(restoreStatusKeyFor(StringRef(std::string("curRunningTime") + std::to_string(restoreStatusIndex))), restoreStatusValue(status.curRunningTime)); - tr->set(restoreStatusKeyFor(StringRef(std::string("curSpeed") + std::to_string(restoreStatusIndex))), restoreStatusValue(status.curSpeed)); - - tr->set(restoreStatusKeyFor(StringRef(std::string("totalWorkload"))), restoreStatusValue(status.totalWorkloadSize)); - tr->set(restoreStatusKeyFor(StringRef(std::string("totalRunningTime"))), restoreStatusValue(status.totalRunningTime)); - tr->set(restoreStatusKeyFor(StringRef(std::string("totalSpeed"))), restoreStatusValue(status.totalSpeed)); - - wait( tr->commit() ); - restoreStatusIndex++; - - break; - } catch( Error &e ) { - printf("Transaction Error when we registerStatus. 
Error:%s\n", e.what()); - wait(tr->onError(e)); - } - }; - - return Void(); -} - - -ACTOR static Future _lockDB(Database cx, UID uid, bool lockDB) { - printf("[Lock] DB will be locked, uid:%s, lockDB:%d\n", uid.toString().c_str(), lockDB); - - ASSERT( lockDB ); - - loop { - try { - wait(lockDatabase(cx, uid)); - break; - } catch( Error &e ) { - printf("Transaction Error when we lockDB. Error:%s\n", e.what()); - wait(tr->onError(e)); - } - } - - state Reference tr(new ReadYourWritesTransaction(cx)); - loop { - try { - tr->reset(); - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - - wait(checkDatabaseLock(tr, uid)); - - tr->commit(); - break; - } catch( Error &e ) { - printf("Transaction Error when we lockDB. Error:%s\n", e.what()); - wait(tr->onError(e)); - } - } - - - return Void(); -} - -ACTOR static Future _clearDB(Reference tr) { - loop { - try { - tr->reset(); - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - tr->clear(normalKeys); - tr->commit(); - break; - } catch(Error &e) { - printf("Retry at clean up DB before restore. error code:%d message:%s. 
Retry...\n", e.code(), e.what()); - if(e.code() != error_code_restore_duplicate_tag) { - wait(tr->onError(e)); - } - } - } - - return Void(); -} - -ACTOR Future initializeVersionBatch(Reference rd, int batchIndex) { - rd->batchIndex = batchIndex; - state std::vector workerIDs = getWorkerIDs(rd); - state int index = 0; - loop { - try { - wait(delay(1.0)); - std::vector> cmdReplies; - rd->cmdID.initPhase(RestoreCommandEnum::RESET_VersionBatch); - for(auto& workerID : workerIDs) { - ASSERT( rd->workers_interface.find(workerID) != rd->workers_interface.end() ); - auto& cmdInterf = rd->workers_interface[workerID]; - RestoreRole role = rd->globalNodeStatus[index].role; - UID nodeID = rd->globalNodeStatus[index].nodeID; - rd->cmdID.nextCmd(); - printf("[CMD:%s] Node:%s Initialize version batch %d\n", rd->cmdID.toString().c_str(), rd->describeNode().c_str(), - batchIndex); - cmdReplies.push_back( cmdInterf.initVersionBatch.getReply(RestoreVersionBatchRequest(rd->cmdID, batchIndex)) ); - index++; - } - std::vector reps = wait( timeoutError(getAll(cmdReplies), FastRestore_Failure_Timeout) ); - printf("Initilaize Version Batch done\n"); - - break; - } catch (Error &e) { - // TODO: Handle the command reply timeout error - if (e.code() != error_code_io_timeout) { - fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s timeout\n", rd->describeNode().c_str(), rd->cmdID.toString().c_str()); - } else { - fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. error code:%d, error message:%s\n", rd->describeNode().c_str(), - rd->cmdID.toString().c_str(), e.code(), e.what()); - } - - printf("Node:%s waits on replies time out. Current phase: Set_Role, Retry all commands.\n", rd->describeNode().c_str()); - } - } - - return Void(); -} - -// Collect the set of backup files to be used for a version batch -// Return true if there is still files to be restored; false otherwise. 
-// This function will change the process' RestoreData -bool collectFilesForOneVersionBatch(Reference rd) { - rd->files.clear(); - rd->curWorkloadSize = 0; - Version endVersion = -1; - bool isRange = false; - bool validVersion = false; - // Step: Find backup files in each version batch and restore them. - while ( rd->curBackupFilesBeginIndex < rd->allFiles.size() ) { - // Find the curBackupFilesEndIndex, such that the to-be-loaded files size (curWorkloadSize) is as close to loadBatchSizeThresholdB as possible, - // and curBackupFilesEndIndex must not belong to the forbidden version range! - if ( rd->curBackupFilesEndIndex < rd->allFiles.size() ) { - endVersion = rd->allFiles[rd->curBackupFilesEndIndex].endVersion; - isRange = rd->allFiles[rd->curBackupFilesEndIndex].isRange; - validVersion = !isVersionInForbiddenRange(rd, endVersion, isRange); - rd->curWorkloadSize += rd->allFiles[rd->curBackupFilesEndIndex].fileSize; - printf("[DEBUG][Batch:%d] Calculate backup files for a version batch: endVersion:%lld isRange:%d validVersion:%d curWorkloadSize:%.2fB curBackupFilesBeginIndex:%ld curBackupFilesEndIndex:%ld, files.size:%ld\n", - rd->batchIndex, (long long) endVersion, isRange, validVersion, rd->curWorkloadSize , rd->curBackupFilesBeginIndex, rd->curBackupFilesEndIndex, rd->allFiles.size()); - } - if ( (validVersion && rd->curWorkloadSize >= loadBatchSizeThresholdB) || rd->curBackupFilesEndIndex >= rd->allFiles.size() ) { - if ( rd->curBackupFilesEndIndex >= rd->allFiles.size() && rd->curWorkloadSize <= 0 ) { - printf("Restore finishes: curBackupFilesEndIndex:%ld, allFiles.size:%ld, curWorkloadSize:%.2f\n", - rd->curBackupFilesEndIndex, rd->allFiles.size(), rd->curWorkloadSize ); - //break; // return result - } - // Construct the files [curBackupFilesBeginIndex, curBackupFilesEndIndex] - //rd->resetPerVersionBatch(); - //rd->cmdID.setBatch(rd->batchIndex); - if ( rd->curBackupFilesBeginIndex < rd->allFiles.size()) { - for (int fileIndex = 
rd->curBackupFilesBeginIndex; fileIndex <= rd->curBackupFilesEndIndex && fileIndex < rd->allFiles.size(); fileIndex++) { - rd->files.push_back(rd->allFiles[fileIndex]); - } - } - printBackupFilesInfo(rd); - rd->totalWorkloadSize += rd->curWorkloadSize; - break; - } else if (validVersion && rd->curWorkloadSize < loadBatchSizeThresholdB) { - rd->curBackupFilesEndIndex++; - } else if (!validVersion && rd->curWorkloadSize < loadBatchSizeThresholdB) { - rd->curBackupFilesEndIndex++; - } else if (!validVersion && rd->curWorkloadSize >= loadBatchSizeThresholdB) { - // Now: just move to the next file. We will eventually find a valid version but load more than loadBatchSizeThresholdB - printf("[WARNING] The loading batch size will be larger than expected! curBatchSize:%.2fB, expectedBatchSize:%2.fB, endVersion:%ld\n", - rd->curWorkloadSize, loadBatchSizeThresholdB, endVersion); - rd->curBackupFilesEndIndex++; - // TODO: Roll back to find a valid version - } - } - - return (rd->files.size() > 0); -} - -ACTOR static Future processRestoreRequest(RestoreInterface interf, Reference rd, Database cx, RestoreRequest request) { - state Key tagName = request.tagName; - state Key url = request.url; - state bool waitForComplete = request.waitForComplete; - state Version targetVersion = request.targetVersion; - state bool verbose = request.verbose; - state KeyRange range = request.range; - state Key addPrefix = request.addPrefix; - state Key removePrefix = request.removePrefix; - state bool lockDB = request.lockDB; - state UID randomUid = request.randomUid; - - //MX: Lock DB if it is not locked - printf("RestoreRequest lockDB:%d\n", lockDB); - if ( lockDB == false ) { - printf("[WARNING] RestoreRequest lockDB:%d; we will overwrite request.lockDB to true and forcely lock db\n", lockDB); - lockDB = true; - request.lockDB = true; - } - - state long curBackupFilesBeginIndex = 0; - state long curBackupFilesEndIndex = 0; - - state double totalWorkloadSize = 0; - state double totalRunningTime 
= 0; // seconds - state double curRunningTime = 0; // seconds - state double curStartTime = 0; - state double curEndTime = 0; - state double curWorkloadSize = 0; //Bytes - - - state Reference tr(new ReadYourWritesTransaction(cx)); - state Reference restoreConfig(new RestoreConfig(randomUid)); - - // lock DB for restore - wait( _lockDB(cx, randomUid, lockDB) ); - wait( _clearDB(tr) ); - - // Step: Collect all backup files - printf("===========Restore request start!===========\n"); - state double startTime = now(); - wait( collectBackupFiles(rd, cx, request) ); - printf("[Perf] Node:%s collectBackupFiles takes %.2f seconds\n", rd->describeNode().c_str(), now() - startTime); - constructFilesWithVersionRange(rd); - rd->files.clear(); // Ensure no mistakely use rd->files - - // Sort the backup files based on end version. - sort(rd->allFiles.begin(), rd->allFiles.end()); - printAllBackupFilesInfo(rd); - - buildForbiddenVersionRange(rd); - printForbiddenVersionRange(rd); - if ( isForbiddenVersionRangeOverlapped(rd) ) { - fprintf(stderr, "[ERROR] forbidden version ranges are overlapped! 
Check out the forbidden version range above\n"); - } - - rd->batchIndex = 0; - state int prevBatchIndex = 0; - state long prevCurBackupFilesBeginIndex = 0; - state long prevCurBackupFilesEndIndex = 0; - state double prevCurWorkloadSize = 0; - state double prevtotalWorkloadSize = 0; - - loop { - try { - curStartTime = now(); - rd->files.clear(); - rd->resetPerVersionBatch(); - rd->cmdID.setBatch(rd->batchIndex); - // Checkpoint the progress of the previous version batch - prevBatchIndex = rd->batchIndex; - prevCurBackupFilesBeginIndex = rd->curBackupFilesBeginIndex; - prevCurBackupFilesEndIndex = rd->curBackupFilesEndIndex; - prevCurWorkloadSize = rd->curWorkloadSize; - prevtotalWorkloadSize = rd->totalWorkloadSize; - - bool hasBackupFilesToProcess = collectFilesForOneVersionBatch(rd); - if ( !hasBackupFilesToProcess ) { // No more backup files to restore - printf("No backup files to process any more\n"); break; } - - printf("[Progress][Start version batch] Node:%s, restoreBatchIndex:%d, curWorkloadSize:%.2f------\n", rd->describeNode().c_str(), rd->batchIndex, rd->curWorkloadSize); - wait( initializeVersionBatch(rd, rd->batchIndex) ); - - wait( delay(1.0) ); - - wait( distributeWorkloadPerVersionBatch(interf, rd, cx, request, restoreConfig) ); - - curEndTime = now(); - curRunningTime = curEndTime - curStartTime; - ASSERT(curRunningTime >= 0); - totalRunningTime += curRunningTime; - - struct FastRestoreStatus status; - status.curRunningTime = curRunningTime; - status.curWorkloadSize = rd->curWorkloadSize; - status.curSpeed = rd->curWorkloadSize / curRunningTime; - status.totalRunningTime = totalRunningTime; - status.totalWorkloadSize = rd->totalWorkloadSize; - status.totalSpeed = rd->totalWorkloadSize / totalRunningTime; - - printf("[Progress][Finish version batch] restoreBatchIndex:%d, curWorkloadSize:%.2f B, curWorkload:%.2f B curRunningtime:%.2f s curSpeed:%.2f B/s totalWorkload:%.2f B totalRunningTime:%.2f s totalSpeed:%.2f B/s\n", - rd->batchIndex, 
rd->curWorkloadSize, - status.curWorkloadSize, status.curRunningTime, status.curSpeed, status.totalWorkloadSize, status.totalRunningTime, status.totalSpeed); - - wait( registerStatus(cx, status) ); - printf("[Progress] Finish 1 version batch. curBackupFilesBeginIndex:%ld curBackupFilesEndIndex:%ld allFiles.size():%ld", - rd->curBackupFilesBeginIndex, rd->curBackupFilesEndIndex, rd->allFiles.size()); - - rd->curBackupFilesBeginIndex = rd->curBackupFilesEndIndex + 1; - rd->curBackupFilesEndIndex++; - rd->curWorkloadSize = 0; - rd->batchIndex++; - - } catch(Error &e) { - fprintf(stdout, "!!![MAY HAVE BUG] Reset the version batch state to the start of the current version batch, due to error:%s\n", e.what()); - if(e.code() != error_code_restore_duplicate_tag) { - wait(tr->onError(e)); - } - rd->batchIndex = prevBatchIndex; - rd->curBackupFilesBeginIndex = prevCurBackupFilesBeginIndex; - rd->curBackupFilesEndIndex = prevCurBackupFilesEndIndex; - rd->curWorkloadSize = prevCurWorkloadSize; - rd->totalWorkloadSize = prevtotalWorkloadSize; + printf("%s:Wait for enough workers. 
Current num_workers:%d target num_workers:%d\n", + self->describeNode().c_str(), agentValues.size(), min_num_workers); + wait( delay(5.0) ); + } catch( Error &e ) { + printf("[WARNING]%s: collectWorkerInterface transaction error:%s\n", self->describeNode().c_str(), e.what()); + wait( tr.onError(e) ); } } + ASSERT(agents.size() >= min_num_workers); // ASSUMPTION: We must have at least 1 loader and 1 applier - // Unlock DB at the end of handling the restore request - - wait( unlockDB(cx, randomUid) ); - printf("Finish restore uid:%s \n", randomUid.toString().c_str()); + TraceEvent("FastRestore").detail("CollectWorkerInterfaceNumWorkers", self->workers_workerInterface.size()); - return targetVersion; -} - -//-------Helper functions -std::string getHexString(StringRef input) { - std::stringstream ss; - for (int i = 0; itype, - getHexString(iter->param1).c_str(), getHexString(iter->param2).c_str(), iter->param1.size(), iter->param2.size()); - } -} +// RestoreWorker that has restore master role: Recruite a role for each worker +ACTOR Future recruitRestoreRoles(Reference self) { + printf("%s:Start configuring roles for workers\n", self->describeNode().c_str()); + ASSERT( self->masterData.isValid() ); -//TODO: Print out the backup mutation log value. The backup log value (i.e., the value in the kv pair) has the following format -//version(12B)|mutationRef|MutationRef|.... -//A mutationRef has the format: |type_4B|param1_size_4B|param2_size_4B|param1|param2. -//Note: The data is stored in little endian! You need to convert it to BigEndian so that you know how long the param1 and param2 is and how to format them! 
-void printBackupMutationRefValueHex(Standalone val_input, std::string prefix) { - std::stringstream ss; - const int version_size = 12; - const int header_size = 12; - StringRef val = val_input.contents(); - StringRefReaderMX reader(val, restore_corrupted_data()); - - int count_size = 0; - // Get the version - uint64_t version = reader.consume(); - count_size += 8; - uint32_t val_length_decode = reader.consume(); - count_size += 4; - - printf("----------------------------------------------------------\n"); - printf("To decode value:%s\n", getHexString(val).c_str()); - if ( val_length_decode != (val.size() - 12) ) { - fprintf(stderr, "%s[PARSE ERROR]!!! val_length_decode:%d != val.size:%d\n", prefix.c_str(), val_length_decode, val.size()); + // Set up the role, and the global status for each node + int numNodes = self->workers_workerInterface.size(); + state int numLoader = numNodes * ratio_loader_to_applier / (ratio_loader_to_applier + 1); + int numApplier = numNodes - numLoader; + if (numLoader <= 0 || numApplier <= 0) { + ASSERT( numLoader > 0 ); // Quick check in correctness + ASSERT( numApplier > 0 ); + fprintf(stderr, "[ERROR] not enough nodes for loader and applier. 
numLoader:%d, numApplier:%d, ratio_loader_to_applier:%d, numAgents:%d\n", numLoader, numApplier, ratio_loader_to_applier, numNodes); } else { - if ( debug_verbose ) { - printf("%s[PARSE SUCCESS] val_length_decode:%d == (val.size:%d - 12)\n", prefix.c_str(), val_length_decode, val.size()); - } + printf("Node%s: Configure roles numWorkders:%d numLoader:%d numApplier:%d\n", self->describeNode().c_str(), numNodes, numLoader, numApplier); } - // Get the mutation header - while (1) { - // stop when reach the end of the string - if(reader.eof() ) { //|| *reader.rptr == 0xFFCheckRestoreRequestDoneErrorMX - //printf("Finish decode the value\n"); - break; - } - - - uint32_t type = reader.consume();//reader.consumeNetworkUInt32(); - uint32_t kLen = reader.consume();//reader.consumeNetworkUInt32(); - uint32_t vLen = reader.consume();//reader.consumeNetworkUInt32(); - const uint8_t *k = reader.consume(kLen); - const uint8_t *v = reader.consume(vLen); - count_size += 4 * 3 + kLen + vLen; - - if ( kLen < 0 || kLen > val.size() || vLen < 0 || vLen > val.size() ) { - fprintf(stderr, "%s[PARSE ERROR]!!!! 
kLen:%d(0x%04x) vLen:%d(0x%04x)\n", prefix.c_str(), kLen, kLen, vLen, vLen); - } - - if ( debug_verbose ) { - printf("%s---DedodeBackupMutation: Type:%d K:%s V:%s k_size:%d v_size:%d\n", prefix.c_str(), - type, getHexString(KeyRef(k, kLen)).c_str(), getHexString(KeyRef(v, vLen)).c_str(), kLen, vLen); - } - - } - if ( debug_verbose ) { - printf("----------------------------------------------------------\n"); - } -} - -void printBackupLogKeyHex(Standalone key_input, std::string prefix) { - std::stringstream ss; - const int version_size = 12; - const int header_size = 12; - StringRef val = key_input.contents(); - StringRefReaderMX reader(val, restore_corrupted_data()); - - int count_size = 0; - // Get the version - uint64_t version = reader.consume(); - count_size += 8; - uint32_t val_length_decode = reader.consume(); - count_size += 4; - - printf("----------------------------------------------------------\n"); - printf("To decode value:%s\n", getHexString(val).c_str()); - if ( val_length_decode != (val.size() - 12) ) { - fprintf(stderr, "%s[PARSE ERROR]!!! val_length_decode:%d != val.size:%d\n", prefix.c_str(), val_length_decode, val.size()); - } else { - printf("%s[PARSE SUCCESS] val_length_decode:%d == (val.size:%d - 12)\n", prefix.c_str(), val_length_decode, val.size()); - } - - // Get the mutation header - while (1) { - // stop when reach the end of the string - if(reader.eof() ) { //|| *reader.rptr == 0xFF - //printf("Finish decode the value\n"); - break; - } - - - uint32_t type = reader.consume();//reader.consumeNetworkUInt32(); - uint32_t kLen = reader.consume();//reader.consumeNetworkUInt32(); - uint32_t vLen = reader.consume();//reader.consumeNetworkUInt32(); - const uint8_t *k = reader.consume(kLen); - const uint8_t *v = reader.consume(vLen); - count_size += 4 * 3 + kLen + vLen; - - if ( kLen < 0 || kLen > val.size() || vLen < 0 || vLen > val.size() ) { - printf("%s[PARSE ERROR]!!!! 
kLen:%d(0x%04x) vLen:%d(0x%04x)\n", prefix.c_str(), kLen, kLen, vLen, vLen); - } - - printf("%s---DedoceBackupMutation: Type:%d K:%s V:%s k_size:%d v_size:%d\n", prefix.c_str(), - type, getHexString(KeyRef(k, kLen)).c_str(), getHexString(KeyRef(v, vLen)).c_str(), kLen, vLen); - - } - printf("----------------------------------------------------------\n"); -} - -void printKVOps(Reference rd) { - std::string typeStr = "MSet"; - TraceEvent("PrintKVOPs").detail("MapSize", rd->kvOps.size()); - printf("PrintKVOPs num_of_version:%ld\n", rd->kvOps.size()); - for ( auto it = rd->kvOps.begin(); it != rd->kvOps.end(); ++it ) { - TraceEvent("PrintKVOPs\t").detail("Version", it->first).detail("OpNum", it->second.size()); - printf("PrintKVOPs Version:%08lx num_of_ops:%d\n", it->first, it->second.size()); - for ( auto m = it->second.begin(); m != it->second.end(); ++m ) { - if ( m->type >= MutationRef::Type::SetValue && m->type <= MutationRef::Type::MAX_ATOMIC_OP ) - typeStr = typeString[m->type]; - else { - printf("PrintKVOPs MutationType:%d is out of range\n", m->type); - } - - printf("\tPrintKVOPs Version:%016lx MType:%s K:%s, V:%s K_size:%d V_size:%d\n", it->first, typeStr.c_str(), - getHexString(m->param1).c_str(), getHexString(m->param2).c_str(), m->param1.size(), m->param2.size()); - - TraceEvent("PrintKVOPs\t\t").detail("Version", it->first) - .detail("MType", m->type).detail("MTypeStr", typeStr) - .detail("MKey", getHexString(m->param1)) - .detail("MValueSize", m->param2.size()) - .detail("MValue", getHexString(m->param2)); - } - } -} - -// Sanity check if KVOps is sorted -bool isKVOpsSorted(Reference rd) { - bool ret = true; - auto prev = rd->kvOps.begin(); - for ( auto it = rd->kvOps.begin(); it != rd->kvOps.end(); ++it ) { - if ( prev->first > it->first ) { - ret = false; - break; - } - prev = it; - } - return ret; -} - -bool allOpsAreKnown(Reference rd) { - bool ret = true; - for ( auto it = rd->kvOps.begin(); it != rd->kvOps.end(); ++it ) { - for ( auto m = 
it->second.begin(); m != it->second.end(); ++m ) { - if ( m->type == MutationRef::SetValue || m->type == MutationRef::ClearRange - || isAtomicOp((MutationRef::Type) m->type) ) - continue; - else { - printf("[ERROR] Unknown mutation type:%d\n", m->type); - ret = false; - } - } - - } - - return ret; -} - -//key_input format: [logRangeMutation.first][hash_value_of_commit_version:1B][bigEndian64(commitVersion)][bigEndian32(part)] -bool concatenateBackupMutationForLogFile(Reference rd, Standalone val_input, Standalone key_input) { - std::string prefix = "||\t"; - std::stringstream ss; - const int version_size = 12; - const int header_size = 12; - StringRef val = val_input.contents(); - StringRefReaderMX reader(val, restore_corrupted_data()); - StringRefReaderMX readerKey(key_input, restore_corrupted_data()); //read key_input! - int logRangeMutationFirstLength = key_input.size() - 1 - 8 - 4; - bool concatenated = false; - - if ( logRangeMutationFirstLength < 0 ) { - printf("[ERROR]!!! logRangeMutationFirstLength:%ld < 0, key_input.size:%ld\n", logRangeMutationFirstLength, key_input.size()); - } - - if ( debug_verbose ) { - printf("[DEBUG] Process key_input:%s\n", getHexKey(key_input, logRangeMutationFirstLength).c_str()); - } - - //PARSE key - Standalone id_old = key_input.substr(0, key_input.size() - 4); //Used to sanity check the decoding of key is correct - Standalone partStr = key_input.substr(key_input.size() - 4, 4); //part - StringRefReaderMX readerPart(partStr, restore_corrupted_data()); - uint32_t part_direct = readerPart.consumeNetworkUInt32(); //Consume a bigEndian value - if ( debug_verbose ) { - printf("[DEBUG] Process prefix:%s and partStr:%s part_direct:%08x fromm key_input:%s, size:%ld\n", - getHexKey(id_old, logRangeMutationFirstLength).c_str(), - getHexString(partStr).c_str(), - part_direct, - getHexKey(key_input, logRangeMutationFirstLength).c_str(), - key_input.size()); - } - - StringRef longRangeMutationFirst; - - if ( logRangeMutationFirstLength > 0 
) { - printf("readerKey consumes %dB\n", logRangeMutationFirstLength); - longRangeMutationFirst = StringRef(readerKey.consume(logRangeMutationFirstLength), logRangeMutationFirstLength); - } - - uint8_t hashValue = readerKey.consume(); - uint64_t commitVersion = readerKey.consumeNetworkUInt64(); // Consume big Endian value encoded in log file, commitVersion is in littleEndian - uint64_t commitVersionBE = bigEndian64(commitVersion); - uint32_t part = readerKey.consumeNetworkUInt32(); //Consume big Endian value encoded in log file - uint32_t partBE = bigEndian32(part); - Standalone id2 = longRangeMutationFirst.withSuffix(StringRef(&hashValue,1)).withSuffix(StringRef((uint8_t*) &commitVersion, 8)); - - //Use commitVersion as id - Standalone id = StringRef((uint8_t*) &commitVersion, 8); - - if ( debug_verbose ) { - printf("[DEBUG] key_input_size:%d longRangeMutationFirst:%s hashValue:%02x commitVersion:%016lx (BigEndian:%016lx) part:%08x (BigEndian:%08x), part_direct:%08x mutationMap.size:%ld\n", - key_input.size(), longRangeMutationFirst.printable().c_str(), hashValue, - commitVersion, commitVersionBE, - part, partBE, - part_direct, rd->mutationMap.size()); - } - - if ( rd->mutationMap.find(id) == rd->mutationMap.end() ) { - rd->mutationMap.insert(std::make_pair(id, val_input)); - if ( part_direct != 0 ) { - printf("[ERROR]!!! part:%d != 0 for key_input:%s\n", part_direct, getHexString(key_input).c_str()); - } - rd->mutationPartMap.insert(std::make_pair(id, part_direct)); - } else { // concatenate the val string -// printf("[INFO] Concatenate the log's val string at version:%ld\n", id.toString().c_str()); - rd->mutationMap[id] = rd->mutationMap[id].contents().withSuffix(val_input.contents()); //Assign the new Areana to the map's value - if ( part_direct != (rd->mutationPartMap[id] + 1) ) { - printf("[ERROR]!!! 
current part id:%d new part_direct:%d is not the next integer of key_input:%s\n", rd->mutationPartMap[id], part_direct, getHexString(key_input).c_str()); - printf("[HINT] Check if the same range or log file has been processed more than once!\n"); - } - if ( part_direct != part ) { - printf("part_direct:%08x != part:%08x\n", part_direct, part); - } - rd->mutationPartMap[id] = part_direct; - concatenated = true; - } - - return concatenated; -} - -bool isRangeMutation(MutationRef m) { - if (m.type == MutationRef::Type::ClearRange) { - if (m.type == MutationRef::Type::DebugKeyRange) { - printf("[ERROR] DebugKeyRange mutation is in backup data unexpectedly. We still handle it as a range mutation; the suspicious mutation:%s\n", m.toString().c_str()); - } - return true; - } else { - if ( !(m.type == MutationRef::Type::SetValue || - isAtomicOp((MutationRef::Type) m.type)) ) { - printf("[ERROR] %s mutation is in backup data unexpectedly. We still handle it as a key mutation; the suspicious mutation:%s\n", typeString[m.type], m.toString().c_str()); - - } - return false; - } -} - -void splitMutation(Reference rd, MutationRef m, Arena& mvector_arena, VectorRef mvector, Arena& nodeIDs_arena, VectorRef nodeIDs) { - // mvector[i] should be mapped to nodeID[i] - ASSERT(mvector.empty()); - ASSERT(nodeIDs.empty()); - // key range [m->param1, m->param2) - //std::map, UID>; - std::map, UID>::iterator itlow, itup; //we will return [itlow, itup) - itlow = rd->range2Applier.lower_bound(m.param1); // lower_bound returns the iterator that is >= m.param1 - if ( itlow != rd->range2Applier.begin()) { // m.param1 is not the smallest key \00 - // (itlow-1) is the node whose key range includes m.param1 - --itlow; - } else { - if (m.param1 != LiteralStringRef("\00")) { - printf("[ERROR] splitMutation has bug on range mutation:%s\n", m.toString().c_str()); - } - } - - itup = rd->range2Applier.upper_bound(m.param2); // upper_bound returns the iterator that is > m.param2; return rmap::end if no keys 
are considered to go after m.param2. - ASSERT( itup == rd->range2Applier.end() || itup->first >= m.param2 ); - // Now adjust for the case: example: mutation range is [a, d); we have applier's ranges' inclusive lower bound values are: a, b, c, d, e; upper_bound(d) returns itup to e, but we want itup to d. - --itup; - ASSERT( itup->first <= m.param2 ); - if ( itup->first < m.param2 ) { - ++itup; //make sure itup is >= m.param2, that is, itup is the next key range >= m.param2 - } - - while (itlow->first < itup->first) { - MutationRef curm; //current mutation - curm.type = m.type; - curm.param1 = itlow->first; - itlow++; - if (itlow == rd->range2Applier.end()) { - curm.param2 = normalKeys.end; - } else { - curm.param2 = itlow->first; - } - mvector.push_back(mvector_arena, curm); - - nodeIDs.push_back(nodeIDs_arena, itlow->second); - } - - return; -} - -ACTOR Future registerMutationsToApplier(Reference rd) { - printf("[INFO][Loader] Node:%s rd->masterApplier:%s, hasApplierInterface:%d registerMutationsToApplier\n", - rd->describeNode().c_str(), rd->masterApplier.toString().c_str(), - rd->workers_interface.find(rd->masterApplier) != rd->workers_interface.end()); - - state RestoreInterface applierCmdInterf; // = rd->workers_interface[rd->masterApplier]; - state int packMutationNum = 0; - state int packMutationThreshold = 10; - state int kvCount = 0; - state std::vector> cmdReplies; - - state int splitMutationIndex = 0; - - printAppliersKeyRange(rd); - - //state double mutationVectorThreshold = 1;//1024 * 10; // Bytes. 
- state std::map>> applierMutationsBuffer; // The mutation vector to be sent to each applier - state std::map applierMutationsSize; // buffered mutation vector size for each applier - // Initialize the above two maps - state std::vector applierIDs = getWorkingApplierIDs(rd); + // Assign a role to each worker + state int nodeIndex = 0; + state RestoreRole role; + state UID nodeID; + printf("Node:%s Start configuring roles for workers\n", self->describeNode().c_str()); + self->cmdID.initPhase(RestoreCommandEnum::Set_Role); loop { try { - packMutationNum = 0; - splitMutationIndex = 0; - kvCount = 0; - state std::map>>::iterator kvOp; - rd->cmdID.initPhase(RestoreCommandEnum::Loader_Send_Mutations_To_Applier); - // In case try-catch has error and loop back - applierMutationsBuffer.clear(); - applierMutationsSize.clear(); - for (auto &applierID : applierIDs) { - applierMutationsBuffer[applierID] = Standalone>(VectorRef()); - applierMutationsSize[applierID] = 0.0; - } - for ( kvOp = rd->kvOps.begin(); kvOp != rd->kvOps.end(); kvOp++) { - state uint64_t commitVersion = kvOp->first; - state int mIndex; - state MutationRef kvm; - for (mIndex = 0; mIndex < kvOp->second.size(); mIndex++) { - kvm = kvOp->second[mIndex]; - if ( debug_verbose ) { - printf("[VERBOSE_DEBUG] mutation to sent to applier, mutation:%s\n", kvm.toString().c_str()); - } - // Send the mutation to applier - if (isRangeMutation(kvm)) { - // Because using a vector of mutations causes overhead, and the range mutation should happen rarely; - // We handle the range mutation and key mutation differently for the benefit of avoiding memory copy - state Standalone> mvector; - state Standalone> nodeIDs; - // '' Bug may be here! The splitMutation() may be wrong! 
- splitMutation(rd, kvm, mvector.arena(), mvector.contents(), nodeIDs.arena(), nodeIDs.contents()); - ASSERT(mvector.size() == nodeIDs.size()); - - for (splitMutationIndex = 0; splitMutationIndex < mvector.size(); splitMutationIndex++ ) { - MutationRef mutation = mvector[splitMutationIndex]; - UID applierID = nodeIDs[splitMutationIndex]; - applierCmdInterf = rd->workers_interface[applierID]; - applierMutationsBuffer[applierID].push_back(applierMutationsBuffer[applierID].arena(), mutation); // Q: Maybe push_back_deep()? - applierMutationsSize[applierID] += mutation.expectedSize(); - - kvCount++; - } - - for (auto &applierID : applierIDs) { - if ( applierMutationsSize[applierID] >= mutationVectorThreshold ) { - rd->cmdID.nextCmd(); - cmdReplies.push_back(applierCmdInterf.sendMutationVector.getReply( - RestoreSendMutationVectorRequest(rd->cmdID, commitVersion, applierMutationsBuffer[applierID]))); - applierMutationsBuffer[applierID].pop_front(applierMutationsBuffer[applierID].size()); - applierMutationsSize[applierID] = 0; - - printf("[INFO][Loader] Waits for applier to receive %ld range mutations\n", cmdReplies.size()); - std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout ) ); - cmdReplies.clear(); - } - } - } else { // mutation operates on a particular key - std::map, UID>::iterator itlow = rd->range2Applier.lower_bound(kvm.param1); // lower_bound returns the iterator that is >= m.param1 - // make sure itlow->first <= m.param1 - if ( itlow == rd->range2Applier.end() || itlow->first > kvm.param1 ) { - --itlow; - } - ASSERT( itlow->first <= kvm.param1 ); - MutationRef mutation = kvm; - UID applierID = itlow->second; - applierCmdInterf = rd->workers_interface[applierID]; - kvCount++; - - applierMutationsBuffer[applierID].push_back(applierMutationsBuffer[applierID].arena(), mutation); // Q: Maybe push_back_deep()? 
- applierMutationsSize[applierID] += mutation.expectedSize(); - if ( applierMutationsSize[applierID] >= mutationVectorThreshold ) { - rd->cmdID.nextCmd(); - cmdReplies.push_back(applierCmdInterf.sendMutationVector.getReply( - RestoreSendMutationVectorRequest(rd->cmdID, commitVersion, applierMutationsBuffer[applierID]))); - applierMutationsBuffer[applierID].pop_front(applierMutationsBuffer[applierID].size()); - applierMutationsSize[applierID] = 0; - - printf("[INFO][Loader] Waits for applier to receive %ld range mutations\n", cmdReplies.size()); - std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout ) ); - cmdReplies.clear(); - } - } + std::vector> cmdReplies; + for (auto &workerInterf : self->workers_workerInterface) { + if ( nodeIndex < numLoader ) { + role = RestoreRole::Loader; + } else { + role = RestoreRole::Applier; } - + nodeID = workerInterf.first; + self->cmdID.nextCmd(); + printf("[CMD:%s] Node:%s Set role (%s) to node (index=%d uid=%s)\n", self->cmdID.toString().c_str(), self->describeNode().c_str(), + getRoleStr(role).c_str(), nodeIndex, nodeID.toString().c_str()); + cmdReplies.push_back( workerInterf.second.recruitRole.getReply(RestoreRecruitRoleRequest(self->cmdID, role, nodeIndex)) ); + nodeIndex++; } - - // In case the mutation vector is not larger than mutationVectorThreshold - printf("[DEBUG][Loader] sendMutationVector sends the remaining applierMutationsBuffer, applierIDs.size:%d\n", applierIDs.size()); - for (auto &applierID : applierIDs) { - if (applierMutationsBuffer[applierID].empty()) { //&& applierMutationsSize[applierID] >= 1 - continue; - } - printf("[DEBUG][Loader] sendMutationVector for applierID:%s\n", applierID.toString().c_str()); - rd->cmdID.nextCmd(); - cmdReplies.push_back(applierCmdInterf.sendMutationVector.getReply( - RestoreSendMutationVectorRequest(rd->cmdID, commitVersion, applierMutationsBuffer[applierID]))); - 
applierMutationsBuffer[applierID].pop_front(applierMutationsBuffer[applierID].size()); - applierMutationsSize[applierID] = 0; - printf("[INFO][Loader] Waits for applier to receive %ld range mutations\n", cmdReplies.size()); - std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout ) ); // Q: We need to wait for each reply, otherwise, correctness has error. Why? - cmdReplies.clear(); - } - - if (!cmdReplies.empty()) { - printf("[INFO][Loader] Last Waits for applier to receive %ld range mutations\n", cmdReplies.size()); - std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout ) ); - //std::vector reps = wait( getAll(cmdReplies) ); - cmdReplies.clear(); - } - printf("[Summary][Loader] Node:%s Last CMDUID:%s produces %d mutation operations\n", - rd->describeNode().c_str(), rd->cmdID.toString().c_str(), kvCount); - + std::vector reps = wait( timeoutError(getAll(cmdReplies), FastRestore_Failure_Timeout) ); + printf("[RecruitRestoreRoles] Finished\n"); break; - } catch (Error &e) { // Handle the command reply timeout error - fprintf(stdout, "[ERROR] registerMutationsToApplier Node:%s, Commands before cmdID:%s error. 
error code:%d, error message:%s\n", rd->describeNode().c_str(), - rd->cmdID.toString().c_str(), e.code(), e.what()); - } - }; - - return Void(); -} - -// Loader: Register sampled mutations -ACTOR Future registerMutationsToMasterApplier(Reference rd) { - printf("[Sampling] Node:%s registerMutationsToMaster() rd->masterApplier:%s, hasApplierInterface:%d\n", - rd->describeNode().c_str(), rd->masterApplier.toString().c_str(), - rd->workers_interface.find(rd->masterApplier) != rd->workers_interface.end()); - - ASSERT(rd->workers_interface.find(rd->masterApplier) != rd->workers_interface.end()); - - state RestoreInterface applierCmdInterf = rd->workers_interface[rd->masterApplier]; - state UID applierID = rd->masterApplier; - state int packMutationNum = 0; - state int packMutationThreshold = 1; - state int kvCount = 0; - state std::vector> cmdReplies; - - state int splitMutationIndex = 0; - state std::map>>::iterator kvOp; - state int mIndex; - state uint64_t commitVersion; - state MutationRef kvm; - - state Standalone> mutationsBuffer; // The mutation vector to be sent to master applier - state double mutationsSize = 0; - //state double mutationVectorThreshold = 1; //1024 * 10; // Bytes - loop { - try { - cmdReplies.clear(); - mutationsBuffer.pop_front(mutationsBuffer.size()); - mutationsSize = 0; - packMutationNum = 0; - rd->cmdID.initPhase(RestoreCommandEnum::Loader_Send_Sample_Mutation_To_Applier); - // TODO: Consider using a different EndPoint for loader and applier communication. 
- // Otherwise, applier may receive loader's message while applier is waiting for master to assign key-range - for ( kvOp = rd->kvOps.begin(); kvOp != rd->kvOps.end(); kvOp++) { - commitVersion = kvOp->first; - - for (mIndex = 0; mIndex < kvOp->second.size(); mIndex++) { - kvm = kvOp->second[mIndex]; - rd->cmdID.nextCmd(); - if ( debug_verbose || true ) { // Debug deterministic bug - printf("[VERBOSE_DEBUG] send mutation to applier, mIndex:%d mutation:%s\n", mIndex, kvm.toString().c_str()); - } - mutationsBuffer.push_back(mutationsBuffer.arena(), kvm); - mutationsSize += kvm.expectedSize(); - if ( mutationsSize >= mutationVectorThreshold ) { - rd->cmdID.nextCmd(); - cmdReplies.push_back(applierCmdInterf.sendSampleMutationVector.getReply( - RestoreSendMutationVectorRequest(rd->cmdID, commitVersion, mutationsBuffer))); - mutationsBuffer.pop_front(mutationsBuffer.size()); - mutationsSize = 0; - if ( debug_verbose ) { - printf("[INFO][Loader] Waits for master applier to receive %ld mutations\n", mutationsBuffer.size()); - } - std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout ) ); - //std::vector reps = wait( getAll(cmdReplies) ); - cmdReplies.clear(); - } - - kvCount++; - } - } - - // The leftover mutationVector whose size is < mutationVectorThreshold - if ( mutationsSize > 0 ) { - rd->cmdID.nextCmd(); - cmdReplies.push_back(applierCmdInterf.sendSampleMutationVector.getReply( - RestoreSendMutationVectorRequest(rd->cmdID, commitVersion, mutationsBuffer))); - mutationsBuffer.pop_front(mutationsBuffer.size()); - mutationsSize = 0; - } - - - if (!cmdReplies.empty()) { - printf("[INFO][Loader] Last waits for master applier to receive %ld mutations\n", mutationsBuffer.size()); - //std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout) ); - std::vector reps = wait( getAll(cmdReplies) ); - cmdReplies.clear(); - } - - printf("[Sample Summary][Loader] Node:%s produces %d mutation operations\n", 
rd->describeNode().c_str(), kvCount); - break; - } catch (Error &e) { - // TODO: Handle the command reply timeout error - if (e.code() != error_code_io_timeout) { - fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s timeout\n", rd->describeNode().c_str(), rd->cmdID.toString().c_str()); - } else { - fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. error code:%d, error message:%s\n", rd->describeNode().c_str(), - rd->cmdID.toString().c_str(), e.code(), e.what()); - } - printf("[WARNING] Node:%s timeout at waiting on replies of Loader_Send_Sample_Mutation_To_Applier. Retry...\n", rd->describeNode().c_str()); + fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. error code:%d, error message:%s\n", self->describeNode().c_str(), + self->cmdID.toString().c_str(), e.code(), e.what()); + printf("Node:%s waits on replies time out. Current phase: Set_Role, Retry all commands.\n", self->describeNode().c_str()); } } return Void(); } -ACTOR Future handleHeartbeat(RestoreSimpleRequest req, Reference rd, RestoreInterface interf) { - // wait( delay(1.0) ); - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); - return Void(); -} - -ACTOR Future handleVersionBatchRequest(RestoreVersionBatchRequest req, Reference rd, RestoreInterface interf) { - // wait( delay(1.0) ); - printf("[Batch:%d] Node:%s Start...\n", req.batchID, rd->describeNode().c_str()); - while (rd->isInProgress(RestoreCommandEnum::RESET_VersionBatch)) { - printf("[DEBUG] NODE:%s handleVersionBatchRequest wait for 5s\n", rd->describeNode().c_str()); - wait(delay(5.0)); - } - - // Handle duplicate, assuming cmdUID is always unique for the same workload - if ( rd->isCmdProcessed(req.cmdID) ) { - printf("[DEBUG] NODE:%s skip duplicate cmd:%s\n", rd->describeNode().c_str(), req.cmdID.toString().c_str()); - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); - return Void(); - } - - rd->setInProgressFlag(RestoreCommandEnum::RESET_VersionBatch); - - 
rd->resetPerVersionBatch(); - rd->processedFiles.clear(); - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); - - rd->processedCmd[req.cmdID] = 1; - rd->clearInProgressFlag(RestoreCommandEnum::RESET_VersionBatch); - - // This actor never returns. You may cancel it in master - return Void(); -} - -ACTOR Future handleSetRoleRequest(RestoreSetRoleRequest req, Reference rd, RestoreInterface interf) { - // wait( delay(1.0) ); - rd->localNodeStatus.init(req.role); - rd->localNodeStatus.nodeID = interf.id(); - rd->localNodeStatus.nodeIndex = req.nodeIndex; - rd->masterApplier = req.masterApplierID; - printf("[INFO][Worker] Node:%s get role %s\n", rd->describeNode().c_str(), - getRoleStr(rd->localNodeStatus.role).c_str()); - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); - - // This actor never returns. You may cancel it in master - return Void(); -} - - -ACTOR Future handleSampleRangeFileRequest(RestoreLoadFileRequest req, Reference rd, RestoreInterface interf) { - //printf("[INFO] Node:%s Got Restore Command: cmdID:%s.\n", rd->describeNode().c_str(), req.cmdID.toString().c_str()); - - state LoadingParam param = req.param; - state int beginBlock = 0; - state int j = 0; - state int readLen = 0; - state int64_t readOffset = param.offset; - - while (rd->isInProgress(RestoreCommandEnum::Sample_Range_File)) { - printf("[DEBUG] NODE:%s sampleRangeFile wait for 5s\n", rd->describeNode().c_str()); - wait(delay(5.0)); - } - - // Handle duplicate, assuming cmdUID is always unique for the same workload - if ( rd->isCmdProcessed(req.cmdID) ) { - printf("[DEBUG] NODE:%s skip duplicate cmd:%s\n", rd->describeNode().c_str(), req.cmdID.toString().c_str()); - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); - return Void(); - } - - rd->setInProgressFlag(RestoreCommandEnum::Sample_Range_File); - printf("[Sample_Range_File][Loader] Node: %s, loading param:%s\n", - rd->describeNode().c_str(), param.toString().c_str()); - - // TODO: This can be expensive - 
state Reference bc = rd->bc; - printf("[INFO] node:%s open backup container for url:%s\n", - rd->describeNode().c_str(), - param.url.toString().c_str()); - - - rd->kvOps.clear(); //Clear kvOps so that kvOps only hold mutations for the current data block. We will send all mutations in kvOps to applier - rd->mutationMap.clear(); - rd->mutationPartMap.clear(); - - ASSERT( param.blockSize > 0 ); - //state std::vector> fileParserFutures; - if (param.offset % param.blockSize != 0) { - printf("[WARNING] Parse file not at block boundary! param.offset:%ld param.blocksize:%ld, remainder:%ld\n", - param.offset, param.blockSize, param.offset % param.blockSize); - } - - ASSERT( param.offset + param.blockSize >= param.length ); // We only sample one data block or less (at the end of the file) of a file. - for (j = param.offset; j < param.length; j += param.blockSize) { - readOffset = j; - readLen = std::min(param.blockSize, param.length - j); - wait( _parseRangeFileToMutationsOnLoader(rd, bc, param.version, param.filename, readOffset, readLen, param.restoreRange, param.addPrefix, param.removePrefix) ); - ++beginBlock; - } - - printf("[Sampling][Loader] Node:%s finishes sample Range file:%s\n", rd->describeNode().c_str(), param.filename.c_str()); - // TODO: Send to applier to apply the mutations - printf("[Sampling][Loader] Node:%s will send sampled mutations to applier\n", rd->describeNode().c_str()); - wait( registerMutationsToMasterApplier(rd) ); // Send the parsed mutation to applier who will apply the mutation to DB - - //rd->processedFiles.insert(std::make_pair(param.filename, 1)); - - //TODO: Send ack to master that loader has finished loading the data - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); - rd->processedCmd[req.cmdID] = 1; // Record the processed comand to handle duplicate command - //rd->kvOps.clear(); - - rd->clearInProgressFlag(RestoreCommandEnum::Sample_Range_File); - - return Void(); -} - -ACTOR Future 
handleSampleLogFileRequest(RestoreLoadFileRequest req, Reference rd, RestoreInterface interf) { - state LoadingParam param = req.param; - state int beginBlock = 0; - state int j = 0; - state int readLen = 0; - state int64_t readOffset = param.offset; - - while (rd->isInProgress(RestoreCommandEnum::Sample_Log_File)) { - printf("[DEBUG] NODE:%s sampleLogFile wait for 5s\n", rd->describeNode().c_str()); - wait(delay(5.0)); - } - - // Handle duplicate message - if ( rd->isCmdProcessed(req.cmdID) ) { - printf("[DEBUG] NODE:%s skip duplicate cmd:%s\n", rd->describeNode().c_str(), req.cmdID.toString().c_str()); - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); - return Void(); - } - - rd->setInProgressFlag(RestoreCommandEnum::Sample_Log_File); - printf("[Sample_Log_File][Loader] Node: %s, loading param:%s\n", rd->describeNode().c_str(), param.toString().c_str()); - - // TODO: Expensive operation - state Reference bc = rd->bc; - printf("[Sampling][Loader] Node:%s open backup container for url:%s\n", - rd->describeNode().c_str(), - param.url.toString().c_str()); - printf("[Sampling][Loader] Node:%s filename:%s blockSize:%ld\n", - rd->describeNode().c_str(), - param.filename.c_str(), param.blockSize); - - rd->kvOps.clear(); //Clear kvOps so that kvOps only hold mutations for the current data block. We will send all mutations in kvOps to applier - rd->mutationMap.clear(); - rd->mutationPartMap.clear(); - - ASSERT( param.blockSize > 0 ); - //state std::vector> fileParserFutures; - if (param.offset % param.blockSize != 0) { - printf("[WARNING] Parse file not at block boundary! 
param.offset:%ld param.blocksize:%ld, remainder:%ld\n", - param.offset, param.blockSize, param.offset % param.blockSize); - } - ASSERT( param.offset + param.blockSize >= param.length ); // Assumption: Only sample one data block or less - for (j = param.offset; j < param.length; j += param.blockSize) { - readOffset = j; - readLen = std::min(param.blockSize, param.length - j); - // NOTE: Log file holds set of blocks of data. We need to parse the data block by block and get the kv pair(version, serialized_mutations) - // The set of mutations at the same version may be splitted into multiple kv pairs ACROSS multiple data blocks when the size of serialized_mutations is larger than 20000. - wait( _parseLogFileToMutationsOnLoader(rd, bc, param.version, param.filename, readOffset, readLen, param.restoreRange, param.addPrefix, param.removePrefix, param.mutationLogPrefix) ); - ++beginBlock; - } - printf("[Sampling][Loader] Node:%s finishes parsing the data block into kv pairs (version, serialized_mutations) for file:%s\n", rd->describeNode().c_str(), param.filename.c_str()); - parseSerializedMutation(rd, true); - - printf("[Sampling][Loader] Node:%s finishes process Log file:%s\n", rd->describeNode().c_str(), param.filename.c_str()); - printf("[Sampling][Loader] Node:%s will send log mutations to applier\n", rd->describeNode().c_str()); - wait( registerMutationsToMasterApplier(rd) ); // Send the parsed mutation to applier who will apply the mutation to DB - - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); // master node is waiting - rd->processedFiles.insert(std::make_pair(param.filename, 1)); - rd->processedCmd[req.cmdID] = 1; - - rd->clearInProgressFlag(RestoreCommandEnum::Sample_Log_File); - - return Void(); -} - -ACTOR Future handleCalculateApplierKeyRangeRequest(RestoreCalculateApplierKeyRangeRequest req, Reference rd, RestoreInterface interf) { - state int numMutations = 0; - state std::vector> keyRangeLowerBounds; - - while 
(rd->isInProgress(RestoreCommandEnum::Calculate_Applier_KeyRange)) { - printf("[DEBUG] NODE:%s Calculate_Applier_KeyRange wait for 5s\n", rd->describeNode().c_str()); - wait(delay(5.0)); - } - - wait( delay(1.0) ); - // Handle duplicate message - // We need to recalculate the value for duplicate message! Because the reply to duplicate message may arrive earlier! - if (rd->isCmdProcessed(req.cmdID) && !keyRangeLowerBounds.empty() ) { - printf("[DEBUG] Node:%s skip duplicate cmd:%s\n", rd->describeNode().c_str(), req.cmdID.toString().c_str()); - req.reply.send(GetKeyRangeNumberReply(keyRangeLowerBounds.size())); - return Void(); - } - rd->setInProgressFlag(RestoreCommandEnum::Calculate_Applier_KeyRange); - - // Applier will calculate applier key range - printf("[INFO][Applier] CMD:%s, Node:%s Calculate key ranges for %d appliers\n", - req.cmdID.toString().c_str(), rd->describeNode().c_str(), req.numAppliers); - - //ASSERT(req.cmd == (RestoreCommandEnum) req.cmdID.phase); - if ( keyRangeLowerBounds.empty() ) { - keyRangeLowerBounds = _calculateAppliersKeyRanges(rd, req.numAppliers); // keyRangeIndex is the number of key ranges requested - rd->keyRangeLowerBounds = keyRangeLowerBounds; - } - - printf("[INFO][Applier] CMD:%s, NodeID:%s: num of key ranges:%ld\n", - rd->cmdID.toString().c_str(), rd->describeNode().c_str(), keyRangeLowerBounds.size()); - req.reply.send(GetKeyRangeNumberReply(keyRangeLowerBounds.size())); - rd->processedCmd[req.cmdID] = 1; // We should not skip this command in the following phase. 
Otherwise, the handler in other phases may return a wrong number of appliers - rd->clearInProgressFlag(RestoreCommandEnum::Calculate_Applier_KeyRange); - - return Void(); -} - -ACTOR Future handleGetApplierKeyRangeRequest(RestoreGetApplierKeyRangeRequest req, Reference rd, RestoreInterface interf) { - state int numMutations = 0; - //state std::vector> keyRangeLowerBounds = rd->keyRangeLowerBounds; - - while (rd->isInProgress(RestoreCommandEnum::Get_Applier_KeyRange)) { - printf("[DEBUG] NODE:%s Calculate_Applier_KeyRange wait for 5s\n", rd->describeNode().c_str()); - wait(delay(5.0)); - } - - wait( delay(1.0) ); - //NOTE: Must reply a valid lowerBound and upperBound! Otherwise, the master will receive an invalid value! - // if (rd->isCmdProcessed(req.cmdID) ) { - // printf("[DEBUG] Node:%s skip duplicate cmd:%s\n", rd->describeNode().c_str(), req.cmdID.toString().c_str()); - // req.reply.send(GetKeyRangeReply(interf.id(), req.cmdID)); // Must wait until the previous command returns - // return Void(); - // } - rd->setInProgressFlag(RestoreCommandEnum::Get_Applier_KeyRange); - - if ( req.applierIndex < 0 || req.applierIndex >= rd->keyRangeLowerBounds.size() ) { - printf("[INFO][Applier] NodeID:%s Get_Applier_KeyRange keyRangeIndex is out of range. keyIndex:%d keyRagneSize:%ld\n", - rd->describeNode().c_str(), req.applierIndex, rd->keyRangeLowerBounds.size()); - } - //ASSERT(req.cmd == (RestoreCommandEnum) req.cmdID.phase); - - printf("[INFO][Applier] NodeID:%s replies Get_Applier_KeyRange. keyRangeIndex:%d lower_bound_of_keyRange:%s\n", - rd->describeNode().c_str(), req.applierIndex, getHexString(rd->keyRangeLowerBounds[req.applierIndex]).c_str()); - - KeyRef lowerBound = rd->keyRangeLowerBounds[req.applierIndex]; - KeyRef upperBound = (req.applierIndex + 1) < rd->keyRangeLowerBounds.size() ? 
rd->keyRangeLowerBounds[req.applierIndex+1] : normalKeys.end; - - req.reply.send(GetKeyRangeReply(interf.id(), req.cmdID, req.applierIndex, lowerBound, upperBound)); - rd->clearInProgressFlag(RestoreCommandEnum::Get_Applier_KeyRange); - - return Void(); - -} - -// Assign key range to applier -ACTOR Future handleSetApplierKeyRangeRequest(RestoreSetApplierKeyRangeRequest req, Reference rd, RestoreInterface interf) { - // Idempodent operation. OK to re-execute the duplicate cmd - // The applier should remember the key range it is responsible for - //ASSERT(req.cmd == (RestoreCommandEnum) req.cmdID.phase); - //rd->applierStatus.keyRange = req.range; - while (rd->isInProgress(RestoreCommandEnum::Assign_Applier_KeyRange)) { - printf("[DEBUG] NODE:%s handleSetApplierKeyRangeRequest wait for 1s\n", rd->describeNode().c_str()); - wait(delay(1.0)); - } - if ( rd->isCmdProcessed(req.cmdID) ) { - req.reply.send(RestoreCommonReply(interf.id(),req.cmdID)); - return Void(); - } - rd->setInProgressFlag(RestoreCommandEnum::Assign_Applier_KeyRange); - - rd->range2Applier[req.range.begin] = req.applierID; - - rd->processedCmd[req.cmdID] = 1; - rd->clearInProgressFlag(RestoreCommandEnum::Assign_Applier_KeyRange); - - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); - - return Void(); -} - -ACTOR Future handleSetApplierKeyRangeVectorRequest(RestoreSetApplierKeyRangeVectorRequest req, Reference rd, RestoreInterface interf) { - // Idempodent operation. 
OK to re-execute the duplicate cmd - // The applier should remember the key range it is responsible for - //ASSERT(req.cmd == (RestoreCommandEnum) req.cmdID.phase); - //rd->applierStatus.keyRange = req.range; - while (rd->isInProgress(RestoreCommandEnum::Notify_Loader_ApplierKeyRange)) { - printf("[DEBUG] NODE:%s handleSetApplierKeyRangeVectorRequest wait for 1s\n", rd->describeNode().c_str()); - wait(delay(1.0)); - } - if ( rd->isCmdProcessed(req.cmdID) ) { - req.reply.send(RestoreCommonReply(interf.id(),req.cmdID)); - return Void(); - } - rd->setInProgressFlag(RestoreCommandEnum::Notify_Loader_ApplierKeyRange); - - VectorRef appliers = req.applierIDs; - VectorRef ranges = req.ranges; - for ( int i = 0; i < appliers.size(); i++ ) { - rd->range2Applier[ranges[i].begin] = appliers[i]; - } - - rd->processedCmd[req.cmdID] = 1; - rd->clearInProgressFlag(RestoreCommandEnum::Notify_Loader_ApplierKeyRange); - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); - - return Void(); -} - -ACTOR Future handleLoadRangeFileRequest(RestoreLoadFileRequest req, Reference rd, RestoreInterface interf) { - //printf("[INFO] Worker Node:%s starts handleLoadRangeFileRequest\n", rd->describeNode().c_str()); - - state LoadingParam param; - state int64_t beginBlock = 0; - state int64_t j = 0; - state int64_t readLen = 0; - state int64_t readOffset = 0; - state Reference bc; - - param = req.param; - beginBlock = 0; - j = 0; - readLen = 0; - readOffset = 0; - readOffset = param.offset; - - while (rd->isInProgress(RestoreCommandEnum::Assign_Loader_Range_File)) { - printf("[DEBUG] NODE:%s loadRangeFile wait for 5s\n", rd->describeNode().c_str()); - wait(delay(5.0)); - } - - //Note: handle duplicate message delivery - if (rd->processedFiles.find(param.filename) != rd->processedFiles.end() || - rd->isCmdProcessed(req.cmdID)) { - // printf("[WARNING]Node:%s, CMDUID:%s file:%s is delivered more than once! 
Reply directly without loading the file\n", - // rd->describeNode().c_str(), req.cmdID.toString().c_str(), - // param.filename.c_str()); - req.reply.send(RestoreCommonReply(interf.id(),req.cmdID)); - return Void(); - } - - rd->setInProgressFlag(RestoreCommandEnum::Assign_Loader_Range_File); - - printf("[INFO][Loader] Node:%s, CMDUID:%s Execute: Assign_Loader_Range_File, role: %s, loading param:%s\n", - rd->describeNode().c_str(), req.cmdID.toString().c_str(), - getRoleStr(rd->localNodeStatus.role).c_str(), - param.toString().c_str()); - - bc = rd->bc; - // printf("[INFO] Node:%s CMDUID:%s open backup container for url:%s\n", - // rd->describeNode().c_str(), req.cmdID.toString().c_str(), - // param.url.toString().c_str()); - - - rd->kvOps.clear(); //Clear kvOps so that kvOps only hold mutations for the current data block. We will send all mutations in kvOps to applier - rd->mutationMap.clear(); - rd->mutationPartMap.clear(); - - ASSERT( param.blockSize > 0 ); - //state std::vector> fileParserFutures; - if (param.offset % param.blockSize != 0) { - printf("[WARNING] Parse file not at block boundary! 
param.offset:%ld param.blocksize:%ld, remainder:%ld\n", - param.offset, param.blockSize, param.offset % param.blockSize); - } - for (j = param.offset; j < param.length; j += param.blockSize) { - readOffset = j; - readLen = std::min(param.blockSize, param.length - j); - printf("[DEBUG_TMP] _parseRangeFileToMutationsOnLoader starts\n"); - wait( _parseRangeFileToMutationsOnLoader(rd, bc, param.version, param.filename, readOffset, readLen, param.restoreRange, param.addPrefix, param.removePrefix) ); - printf("[DEBUG_TMP] _parseRangeFileToMutationsOnLoader ends\n"); - ++beginBlock; - } - - printf("[INFO][Loader] Node:%s CMDUID:%s finishes process Range file:%s\n", - rd->describeNode().c_str(), req.cmdID.toString().c_str(), - param.filename.c_str()); - // TODO: Send to applier to apply the mutations - // printf("[INFO][Loader] Node:%s CMDUID:%s will send range mutations to applier\n", - // rd->describeNode().c_str(), rd->cmdID.toString().c_str()); - wait( registerMutationsToApplier(rd) ); // Send the parsed mutation to applier who will apply the mutation to DB - wait ( delay(1.0) ); - - rd->processedFiles[param.filename] = 1; - rd->processedCmd[req.cmdID] = 1; - - rd->clearInProgressFlag(RestoreCommandEnum::Assign_Loader_Range_File); - printf("[INFO][Loader] Node:%s CMDUID:%s clear inProgressFlag :%lx for Assign_Loader_Range_File.\n", - rd->describeNode().c_str(), req.cmdID.toString().c_str(), rd->inProgressFlag); - - //Send ack to master that loader has finished loading the data - printf("[INFO][Loader] Node:%s CMDUID:%s send ack.\n", - rd->describeNode().c_str(), rd->cmdID.toString().c_str()); - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); - - return Void(); - -} - - -ACTOR Future handleLoadLogFileRequest(RestoreLoadFileRequest req, Reference rd, RestoreInterface interf) { - printf("[INFO] Worker Node:%s starts handleLoadLogFileRequest\n", rd->describeNode().c_str()); - - state LoadingParam param; - state int64_t beginBlock = 0; - state int64_t j = 0; - 
state int64_t readLen = 0; - state int64_t readOffset = 0; - state Reference bc; - - param = req.param; - beginBlock = 0; - j = 0; - readLen = 0; - readOffset = 0; - readOffset = param.offset; - - while (rd->isInProgress(RestoreCommandEnum::Assign_Loader_Log_File)) { - printf("[DEBUG] NODE:%s loadLogFile wait for 5s\n", rd->describeNode().c_str()); - wait(delay(5.0)); - } - - //Note: handle duplicate message delivery - if (rd->processedFiles.find(param.filename) != rd->processedFiles.end() - || rd->isCmdProcessed(req.cmdID)) { - printf("[WARNING] Node:%s CMDUID:%s file:%s is delivered more than once! Reply directly without loading the file\n", - rd->describeNode().c_str(), req.cmdID.toString().c_str(), - param.filename.c_str()); - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); - return Void(); - } - - rd->setInProgressFlag(RestoreCommandEnum::Assign_Loader_Log_File); - - printf("[INFO][Loader] Node:%s CMDUID:%s Assign_Loader_Log_File role: %s, loading param:%s\n", - rd->describeNode().c_str(), req.cmdID.toString().c_str(), - getRoleStr(rd->localNodeStatus.role).c_str(), - param.toString().c_str()); - - bc = rd->bc; - printf("[INFO][Loader] Node:%s CMDUID:%s open backup container for url:%s\n", - rd->describeNode().c_str(), req.cmdID.toString().c_str(), - param.url.toString().c_str()); - printf("[INFO][Loader] Node:%s CMDUID:%s filename:%s blockSize:%ld\n", - rd->describeNode().c_str(), req.cmdID.toString().c_str(), - param.filename.c_str(), param.blockSize); - - rd->kvOps.clear(); //Clear kvOps so that kvOps only hold mutations for the current data block. We will send all mutations in kvOps to applier - rd->mutationMap.clear(); - rd->mutationPartMap.clear(); - - ASSERT( param.blockSize > 0 ); - //state std::vector> fileParserFutures; - if (param.offset % param.blockSize != 0) { - printf("[WARNING] Parse file not at block boundary! 
param.offset:%ld param.blocksize:%ld, remainder:%ld\n", - param.offset, param.blockSize, param.offset % param.blockSize); - } - for (j = param.offset; j < param.length; j += param.blockSize) { - readOffset = j; - readLen = std::min(param.blockSize, param.length - j); - // NOTE: Log file holds set of blocks of data. We need to parse the data block by block and get the kv pair(version, serialized_mutations) - // The set of mutations at the same version may be splitted into multiple kv pairs ACROSS multiple data blocks when the size of serialized_mutations is larger than 20000. - wait( _parseLogFileToMutationsOnLoader(rd, bc, param.version, param.filename, readOffset, readLen, param.restoreRange, param.addPrefix, param.removePrefix, param.mutationLogPrefix) ); - ++beginBlock; - } - printf("[INFO][Loader] Node:%s CMDUID:%s finishes parsing the data block into kv pairs (version, serialized_mutations) for file:%s\n", - rd->describeNode().c_str(), req.cmdID.toString().c_str(), - param.filename.c_str()); - parseSerializedMutation(rd, false); - - printf("[INFO][Loader] Node:%s CMDUID:%s finishes process Log file:%s\n", - rd->describeNode().c_str(), req.cmdID.toString().c_str(), - param.filename.c_str()); - printf("[INFO][Loader] Node:%s CMDUID:%s will send log mutations to applier\n", - rd->describeNode().c_str(), req.cmdID.toString().c_str()); - wait( registerMutationsToApplier(rd) ); // Send the parsed mutation to applier who will apply the mutation to DB - - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); // master node is waiting - rd->processedFiles[param.filename] = 1; - rd->processedCmd[req.cmdID] = 1; - - rd->clearInProgressFlag(RestoreCommandEnum::Assign_Loader_Log_File); - - return Void(); -} - -// Applier receive mutation from loader -ACTOR Future handleSendMutationVectorRequest(RestoreSendMutationVectorRequest req, Reference rd, RestoreInterface interf) { - state int numMutations = 0; - - //wait( delay(1.0) ); //Q: Why adding this delay will cause 
segmentation fault? - if ( debug_verbose ) { - printf("[VERBOSE_DEBUG] Node:%s receive mutation number:%d\n", rd->describeNode().c_str(), req.mutations.size()); - } - - // NOTE: We have insert operation to rd->kvOps. For the same worker, we should only allow one actor of this kind to run at any time! - // Otherwise, race condition may happen! - while (rd->isInProgress(RestoreCommandEnum::Loader_Send_Mutations_To_Applier)) { - printf("[DEBUG] NODE:%s sendMutation wait for 1s\n", rd->describeNode().c_str()); - wait(delay(1.0)); - } - - // Handle duplicat cmd - if ( rd->isCmdProcessed(req.cmdID) ) { - //printf("[DEBUG] NODE:%s skip duplicate cmd:%s\n", rd->describeNode().c_str(), req.cmdID.toString().c_str()); - //printf("[DEBUG] Skipped mutation:%s\n", req.mutation.toString().c_str()); - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); - return Void(); - } - rd->setInProgressFlag(RestoreCommandEnum::Loader_Send_Mutations_To_Applier); - - // Applier will cache the mutations at each version. Once receive all mutations, applier will apply them to DB - state uint64_t commitVersion = req.commitVersion; - VectorRef mutations(req.mutations); - printf("[DEBUG] Node:%s receive %d mutations at version:%ld\n", rd->describeNode().c_str(), mutations.size(), commitVersion); - if ( rd->kvOps.find(commitVersion) == rd->kvOps.end() ) { - rd->kvOps.insert(std::make_pair(commitVersion, VectorRef())); - } - state int mIndex = 0; - for (mIndex = 0; mIndex < mutations.size(); mIndex++) { - MutationRef mutation = mutations[mIndex]; - rd->kvOps[commitVersion].push_back_deep(rd->kvOps[commitVersion].arena(), mutation); - numMutations++; - if ( numMutations % 100000 == 1 ) { // Should be different value in simulation and in real mode - printf("[INFO][Applier] Node:%s Receives %d mutations. 
cur_mutation:%s\n", - rd->describeNode().c_str(), numMutations, mutation.toString().c_str()); - } - } - - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); - // Avoid race condition when this actor is called twice on the same command - rd->processedCmd[req.cmdID] = 1; - rd->clearInProgressFlag(RestoreCommandEnum::Loader_Send_Mutations_To_Applier); - - return Void(); -} - -ACTOR Future handleSendSampleMutationVectorRequest(RestoreSendMutationVectorRequest req, Reference rd, RestoreInterface interf) { - state int numMutations = 0; - rd->numSampledMutations = 0; - //wait( delay(1.0) ); - //ASSERT(req.cmd == (RestoreCommandEnum) req.cmdID.phase); - - // NOTE: We have insert operation to rd->kvOps. For the same worker, we should only allow one actor of this kind to run at any time! - // Otherwise, race condition may happen! - while (rd->isInProgress(RestoreCommandEnum::Loader_Send_Sample_Mutation_To_Applier)) { - printf("[DEBUG] NODE:%s handleSendSampleMutationVectorRequest wait for 1s\n", rd->describeNode().c_str()); - wait(delay(1.0)); - } - - // Handle duplicate message - if (rd->isCmdProcessed(req.cmdID)) { - printf("[DEBUG] NODE:%s skip duplicate cmd:%s\n", rd->describeNode().c_str(), req.cmdID.toString().c_str()); - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); - return Void(); - } - rd->setInProgressFlag(RestoreCommandEnum::Loader_Send_Sample_Mutation_To_Applier); - - // Applier will cache the mutations at each version. 
Once receive all mutations, applier will apply them to DB - state uint64_t commitVersion = req.commitVersion; - // TODO: Change the req.mutation to a vector of mutations - VectorRef mutations(req.mutations); - - state int mIndex = 0; - for (mIndex = 0; mIndex < mutations.size(); mIndex++) { - MutationRef mutation = mutations[mIndex]; - if ( rd->keyOpsCount.find(mutation.param1) == rd->keyOpsCount.end() ) { - rd->keyOpsCount.insert(std::make_pair(mutation.param1, 0)); - } - // NOTE: We may receive the same mutation more than once due to network package lost. - // Since sampling is just an estimation and the network should be stable enough, we do NOT handle the duplication for now - // In a very unreliable network, we may get many duplicate messages and get a bad key-range splits for appliers. But the restore should still work except for running slower. - rd->keyOpsCount[mutation.param1]++; - rd->numSampledMutations++; - - if ( debug_verbose && rd->numSampledMutations % 1000 == 1 ) { - printf("[Sampling][Applier] Node:%s Receives %d sampled mutations. 
cur_mutation:%s\n", - rd->describeNode().c_str(), rd->numSampledMutations, mutation.toString().c_str()); - } - } - - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); - rd->processedCmd[req.cmdID] = 1; - - rd->clearInProgressFlag(RestoreCommandEnum::Loader_Send_Sample_Mutation_To_Applier); - - return Void(); -} - - ACTOR Future handleApplyToDBRequest(RestoreSimpleRequest req, Reference rd, RestoreInterface interf, Database cx) { - state bool isPrint = false; //Debug message - state std::string typeStr = ""; - - // Wait in case the applyToDB request was delivered twice; - while (rd->inProgressApplyToDB) { - printf("[DEBUG] NODE:%s inProgressApplyToDB wait for 5s\n", rd->describeNode().c_str()); - wait(delay(5.0)); - } - - if ( rd->isCmdProcessed(req.cmdID) ) { - printf("[DEBUG] NODE:%s skip duplicate cmd:%s\n", rd->describeNode().c_str(), req.cmdID.toString().c_str()); - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); - return Void(); - } - - rd->inProgressApplyToDB = true; - - // Assume the process will not crash when it apply mutations to DB. The reply message can be lost though - if (rd->kvOps.empty()) { - printf("Node:%s kvOps is empty. 
No-op for apply to DB\n", rd->describeNode().c_str()); - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); - rd->processedCmd[req.cmdID] = 1; - rd->inProgressApplyToDB = false; - return Void(); - } - - sanityCheckMutationOps(rd); - - if ( debug_verbose ) { - TraceEvent("ApplyKVOPsToDB").detail("MapSize", rd->kvOps.size()); - printf("ApplyKVOPsToDB num_of_version:%ld\n", rd->kvOps.size()); - } - state std::map>>::iterator it = rd->kvOps.begin(); - state std::map>>::iterator prevIt = it; - state int index = 0; - state int prevIndex = index; - state int count = 0; - state Reference tr(new ReadYourWritesTransaction(cx)); - state int numVersion = 0; - state double transactionSize = 0; - loop { - try { - tr->reset(); - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - transactionSize = 0; - - for ( ; it != rd->kvOps.end(); ++it ) { - numVersion++; - if ( debug_verbose ) { - TraceEvent("ApplyKVOPsToDB\t").detail("Version", it->first).detail("OpNum", it->second.size()); - } - //printf("ApplyKVOPsToDB numVersion:%d Version:%08lx num_of_ops:%d, \n", numVersion, it->first, it->second.size()); - - state MutationRef m; - for ( ; index < it->second.size(); ++index ) { - m = it->second[index]; - if ( m.type >= MutationRef::Type::SetValue && m.type <= MutationRef::Type::MAX_ATOMIC_OP ) - typeStr = typeString[m.type]; - else { - printf("ApplyKVOPsToDB MutationType:%d is out of range\n", m.type); - } - - if ( debug_verbose && count % 1000 == 1 ) { - printf("ApplyKVOPsToDB Node:%s num_mutation:%d Version:%08lx num_of_ops:%d\n", - rd->describeNode().c_str(), count, it->first, it->second.size()); - } - - if ( debug_verbose ) { - printf("[VERBOSE_DEBUG] Node:%s apply mutation:%s\n", rd->describeNode().c_str(), m.toString().c_str()); - } - - if ( m.type == MutationRef::SetValue ) { - tr->set(m.param1, m.param2); - } else if ( m.type == MutationRef::ClearRange ) { - KeyRangeRef mutationRange(m.param1, m.param2); - 
tr->clear(mutationRange); - } else if ( isAtomicOp((MutationRef::Type) m.type) ) { - //// Now handle atomic operation from this if statement - // TODO: Have not de-duplicated the mutations for multiple network delivery - // ATOMIC_MASK = (1 << AddValue) | (1 << And) | (1 << Or) | (1 << Xor) | (1 << AppendIfFits) | (1 << Max) | (1 << Min) | (1 << SetVersionstampedKey) | (1 << SetVersionstampedValue) | (1 << ByteMin) | (1 << ByteMax) | (1 << MinV2) | (1 << AndV2), - //atomicOp( const KeyRef& key, const ValueRef& operand, uint32_t operationType ) - tr->atomicOp(m.param1, m.param2, m.type); - } else { - printf("[WARNING] mtype:%d (%s) unhandled\n", m.type, typeStr.c_str()); - } - ++count; - transactionSize += m.expectedSize(); - - if ( transactionSize >= transactionBatchSizeThreshold ) { // commit per 1000 mutations - wait(tr->commit()); - tr->reset(); - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - prevIt = it; - prevIndex = index; - transactionSize = 0; - } - - if ( isPrint ) { - printf("\tApplyKVOPsToDB Version:%016lx MType:%s K:%s, V:%s K_size:%d V_size:%d\n", it->first, typeStr.c_str(), - getHexString(m.param1).c_str(), getHexString(m.param2).c_str(), m.param1.size(), m.param2.size()); - - TraceEvent("ApplyKVOPsToDB\t\t").detail("Version", it->first) - .detail("MType", m.type).detail("MTypeStr", typeStr) - .detail("MKey", getHexString(m.param1)) - .detail("MValueSize", m.param2.size()) - .detail("MValue", getHexString(m.param2)); - } - } - index = 0; - } - // Last transaction - if (transactionSize > 0) { - wait(tr->commit()); - } - break; - } catch(Error &e) { - printf("ApplyKVOPsToDB transaction error:%s.\n", e.what()); - wait(tr->onError(e)); - it = prevIt; - index = prevIndex; - transactionSize = 0; - } - } - - rd->kvOps.clear(); - printf("Node:%s ApplyKVOPsToDB number of kv mutations:%d\n", rd->describeNode().c_str(), count); - - req.reply.send(RestoreCommonReply(interf.id(), req.cmdID)); - 
printf("rd->processedCmd size:%d req.cmdID:%s\n", rd->processedCmd.size(), req.cmdID.toString().c_str()); - rd->processedCmd[req.cmdID] = 1; - rd->inProgressApplyToDB = false; - - return Void(); -} - -ACTOR Future workerCore(Reference rd, RestoreInterface ri, Database cx) { - state ActorCollection actors(false); +ACTOR Future startRestoreWorker(Reference self, RestoreWorkerInterface interf, Database cx) { state double lastLoopTopTime; + state ActorCollection actors(false); // Collect the main actor for each role + loop { - double loopTopTime = now(); double elapsedTime = loopTopTime - lastLoopTopTime; if( elapsedTime > 0.050 ) { if (g_random->random01() < 0.01) - TraceEvent(SevWarn, "SlowRestoreLoaderLoopx100").detail("NodeDesc", rd->describeNode()).detail("Elapsed", elapsedTime); + TraceEvent(SevWarn, "SlowRestoreLoaderLoopx100").detail("NodeDesc", self->describeNode()).detail("Elapsed", elapsedTime); } lastLoopTopTime = loopTopTime; state std::string requestTypeStr = "[Init]"; try { choose { - when ( RestoreSimpleRequest req = waitNext(ri.heartbeat.getFuture()) ) { + when ( RestoreSimpleRequest req = waitNext(interf.heartbeat.getFuture()) ) { requestTypeStr = "heartbeat"; - wait(handleHeartbeat(req, rd, ri)); + actors.add( handleHeartbeat(req, interf.id()) ); } - when ( RestoreSetRoleRequest req = waitNext(ri.setRole.getFuture()) ) { - requestTypeStr = "setRole"; - wait(handleSetRoleRequest(req, rd, ri)); + when ( RestoreRecruitRoleRequest req = waitNext(interf.recruitRole.getFuture()) ) { + requestTypeStr = "recruitRole"; + actors.add( handleRecruitRoleRequest(req, self, &actors, cx) ); } - when ( RestoreLoadFileRequest req = waitNext(ri.sampleRangeFile.getFuture()) ) { - requestTypeStr = "sampleRangeFile"; - initBackupContainer(rd, req.param.url); - ASSERT(rd->getRole() == RestoreRole::Loader); - actors.add( handleSampleRangeFileRequest(req, rd, ri) ); - } - when ( RestoreLoadFileRequest req = waitNext(ri.sampleLogFile.getFuture()) ) { - initBackupContainer(rd, 
req.param.url); - requestTypeStr = "sampleLogFile"; - ASSERT(rd->getRole() == RestoreRole::Loader); - actors.add( handleSampleLogFileRequest(req, rd, ri) ); - } - when ( RestoreGetApplierKeyRangeRequest req = waitNext(ri.getApplierKeyRangeRequest.getFuture()) ) { - requestTypeStr = "getApplierKeyRangeRequest"; - wait(handleGetApplierKeyRangeRequest(req, rd, ri)); - } - when ( RestoreSetApplierKeyRangeRequest req = waitNext(ri.setApplierKeyRangeRequest.getFuture()) ) { - requestTypeStr = "setApplierKeyRangeRequest"; - wait(handleSetApplierKeyRangeRequest(req, rd, ri)); - } - when ( RestoreSetApplierKeyRangeVectorRequest req = waitNext(ri.setApplierKeyRangeVectorRequest.getFuture()) ) { - requestTypeStr = "setApplierKeyRangeVectorRequest"; - wait(handleSetApplierKeyRangeVectorRequest(req, rd, ri)); - } - when ( RestoreLoadFileRequest req = waitNext(ri.loadRangeFile.getFuture()) ) { - requestTypeStr = "loadRangeFile"; - ASSERT(rd->getRole() == RestoreRole::Loader); - initBackupContainer(rd, req.param.url); - actors.add( handleLoadRangeFileRequest(req, rd, ri) ); - } - when ( RestoreLoadFileRequest req = waitNext(ri.loadLogFile.getFuture()) ) { - requestTypeStr = "loadLogFile"; - ASSERT(rd->getRole() == RestoreRole::Loader); - initBackupContainer(rd, req.param.url); - actors.add( handleLoadLogFileRequest(req, rd, ri) ); - } - - when ( RestoreCalculateApplierKeyRangeRequest req = waitNext(ri.calculateApplierKeyRange.getFuture()) ) { - requestTypeStr = "calculateApplierKeyRange"; - ASSERT(rd->getRole() == RestoreRole::Applier); - wait(handleCalculateApplierKeyRangeRequest(req, rd, ri)); - } - when ( RestoreSendMutationVectorRequest req = waitNext(ri.sendSampleMutationVector.getFuture()) ) { - requestTypeStr = "sendSampleMutationVector"; - ASSERT(rd->getRole() == RestoreRole::Applier); - actors.add( handleSendSampleMutationVectorRequest(req, rd, ri)); - } - when ( RestoreSendMutationVectorRequest req = waitNext(ri.sendMutationVector.getFuture()) ) { - requestTypeStr = 
"sendMutationVector"; - ASSERT(rd->getRole() == RestoreRole::Applier); - actors.add( handleSendMutationVectorRequest(req, rd, ri) ); - } - when ( RestoreSimpleRequest req = waitNext(ri.applyToDB.getFuture()) ) { - requestTypeStr = "applyToDB"; - actors.add( handleApplyToDBRequest(req, rd, ri, cx) ); - } - - when ( RestoreVersionBatchRequest req = waitNext(ri.initVersionBatch.getFuture()) ) { - requestTypeStr = "initVersionBatch"; - wait(handleVersionBatchRequest(req, rd, ri)); - } - - when ( RestoreSimpleRequest req = waitNext(ri.setWorkerInterface.getFuture()) ) { - // Step: Find other worker's interfaces - // NOTE: This must be after wait(configureRolesHandler()) because we must ensure all workers have registered their interfaces into DB before we can read the interface. - // TODO: Wait until all workers have registered their interface. - wait( setWorkerInterface(req, rd, ri, cx) ); - } - - when ( RestoreSimpleRequest req = waitNext(ri.finishRestore.getFuture()) ) { + when ( RestoreSimpleRequest req = waitNext(interf.terminateWorker.getFuture()) ) { // Destroy the worker at the end of the restore // TODO: Cancel its own actors - wait( handleFinishRestoreReq(req, rd, ri, cx) ); + requestTypeStr = "terminateWorker"; + actors.add( handlerTerminateWorkerRequest(req, self, interf, cx) ); return Void(); } } } catch (Error &e) { - // TODO: Handle the command reply timeout error - if (e.code() != error_code_io_timeout) { - fprintf(stdout, "[ERROR] Loader handle received request:%s timeout\n", requestTypeStr.c_str()); - } else { - fprintf(stdout, "[ERROR] Loader handle received request:%s error. error code:%d, error message:%s\n", - requestTypeStr.c_str(), e.code(), e.what()); - } - + fprintf(stdout, "[ERROR] Loader handle received request:%s error. 
error code:%d, error message:%s\n", + requestTypeStr.c_str(), e.code(), e.what()); if ( requestTypeStr.find("[Init]") != std::string::npos ) { printf("Exit due to error at requestType:%s", requestTypeStr.c_str()); break; @@ -4094,62 +415,89 @@ ACTOR Future workerCore(Reference rd, RestoreInterface ri, Da return Void(); } -ACTOR Future masterCore(Reference rd, RestoreInterface interf, Database cx) { - //we are the leader - // We must wait for enough time to make sure all restore workers have registered their interfaces into the DB - printf("[INFO][Master] NodeID:%s Restore master waits for agents to register their workerKeys\n", - interf.id().toString().c_str()); - wait( delay(10.0) ); +ACTOR Future _restoreWorker(Database cx_input, LocalityData locality) { + state Database cx = cx_input; + state RestoreWorkerInterface workerInterf; + workerInterf.initEndpoints(); + state Optional leaderInterf; + //Global data for the worker + state Reference self = Reference(new RestoreWorkerData()); - rd->localNodeStatus.init(RestoreRole::Master); - rd->localNodeStatus.nodeID = interf.id(); - printf("[INFO][Master] NodeID:%s starts configuring roles for workers\n", interf.id().toString().c_str()); + self->workerID = workerInterf.id(); - wait( collectWorkerInterface(rd, cx, MIN_NUM_WORKERS) ); + initRestoreWorkerConfig(); //TODO: Change to a global struct to store the restore configuration - Future workersFailureMonitor = monitorWorkerLiveness(rd); - - // configureRoles must be after collectWorkerInterface - // Why do I need to put an extra wait() to make sure the above wait is executed after the below wwait? - wait( delay(1.0) ); - - wait( configureRoles(rd) ); - - wait( delay(1.0) ); - wait( notifyWorkersToSetWorkersInterface(rd) ); - - state int restoreId = 0; - state int checkNum = 0; + // Compete in registering its restoreInterface as the leader. 
+ state Transaction tr(cx); loop { - printf("Node:%s---Wait on restore requests...---\n", rd->describeNode().c_str()); - state Standalone> restoreRequests = wait( collectRestoreRequests(cx) ); - - printf("Node:%s ---Received restore requests as follows---\n", rd->describeNode().c_str()); - // Print out the requests info - for ( auto &it : restoreRequests ) { - printf("\t[INFO][Master]Node:%s RestoreRequest info:%s\n", rd->describeNode().c_str(), it.toString().c_str()); + try { + tr.reset(); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + Optional leader = wait(tr.get(restoreLeaderKey)); + if(leader.present()) { + leaderInterf = BinaryReader::fromStringRef(leader.get(), IncludeVersion()); + // NOTE: Handle the situation that the leader's commit of its key causes error(commit_unknown_result) + // In this situation, the leader will try to register its key again, which will never succeed. + // We should let leader escape from the infinite loop + if ( leaderInterf.get().id() == workerInterf.id() ) { + printf("[Worker] NodeID:%s is the leader and has registered its key in commit_unknown_result error. Let it set the key again\n", + leaderInterf.get().id().toString().c_str()); + tr.set(restoreLeaderKey, BinaryWriter::toValue(workerInterf, IncludeVersion())); + wait(tr.commit()); + // reset leaderInterf to invalid for the leader process + // because a process will not execute leader's logic unless leaderInterf is invalid + leaderInterf = Optional(); + break; + } + printf("[Worker] Leader key exists:%s. 
Worker registers its restore workerInterface id:%s\n", + leaderInterf.get().id().toString().c_str(), workerInterf.id().toString().c_str()); + tr.set(restoreWorkerKeyFor(workerInterf.id()), restoreWorkerInterfaceValue(workerInterf)); + wait(tr.commit()); + break; + } + printf("[Worker] NodeID:%s competes register its workerInterface as leader\n", workerInterf.id().toString().c_str()); + tr.set(restoreLeaderKey, BinaryWriter::toValue(workerInterf, IncludeVersion())); + wait(tr.commit()); + break; + } catch( Error &e ) { + // We may have error commit_unknown_result, the commit may or may not succeed! + // We must handle this error, otherwise, if the leader does not know its key has been registered, the leader will stuck here! + printf("[INFO] NodeID:%s restoreWorker select leader error, error code:%d error info:%s\n", + workerInterf.id().toString().c_str(), e.code(), e.what()); + wait( tr.onError(e) ); } + } - // Step: Perform the restore requests - for ( auto &it : restoreRequests ) { - TraceEvent("LeaderGotRestoreRequest").detail("RestoreRequestInfo", it.toString()); - printf("Node:%s Got RestoreRequestInfo:%s\n", rd->describeNode().c_str(), it.toString().c_str()); - Version ver = wait( processRestoreRequest(interf, rd, cx, it) ); - } + + if(leaderInterf.present()) { // Logic for restoer workers (restore loader and restore applier) + wait( startRestoreWorker(self, workerInterf, cx) ); + } else { // Logic for restore master + self->masterData = Reference(new RestoreMasterData()); + // We must wait for enough time to make sure all restore workers have registered their workerInterfaces into the DB + printf("[INFO][Master] NodeID:%s Restore master waits for agents to register their workerKeys\n", + workerInterf.id().toString().c_str()); + wait( delay(10.0) ); - // Step: Notify all restore requests have been handled by cleaning up the restore keys - wait( delay(5.0) ); - printf("Finish my restore now!\n"); - //wait( finishRestore(rd) ); - wait( finishRestore(rd, cx, 
restoreRequests) ); + printf("[INFO][Master] NodeID:%s starts configuring roles for workers\n", workerInterf.id().toString().c_str()); - printf("[INFO] MXRestoreEndHere RestoreID:%d\n", restoreId); - TraceEvent("MXRestoreEndHere").detail("RestoreID", restoreId++); - wait( delay(5.0) ); - //NOTE: we have to break the loop so that the tester.actor can receive the return of this test workload. - //Otherwise, this special workload never returns and tester will think the test workload is stuck and the tester will timesout - break; //TODO: this break will be removed later since we need the restore agent to run all the time! + wait( collectRestoreWorkerInterface(self, cx, MIN_NUM_WORKERS) ); + + state Future workersFailureMonitor = monitorWorkerLiveness(self); + + // configureRoles must be after collectWorkerInterface + // TODO: remove the delay() Why do I need to put an extra wait() to make sure the above wait is executed after the below wwait? + wait( delay(1.0) ); + wait( recruitRestoreRoles(self) ); + + wait( startRestoreMaster(self->masterData, cx) ); } + return Void(); +} + +ACTOR Future restoreWorker(Reference ccf, LocalityData locality) { + Database cx = Database::createDatabase(ccf->getFilename(), Database::API_VERSION_LATEST,locality); + wait(_restoreWorker(cx, locality)); return Void(); } \ No newline at end of file diff --git a/fdbserver/RestoreApplier.actor.cpp b/fdbserver/RestoreApplier.actor.cpp index e9019ea056..a6614d6661 100644 --- a/fdbserver/RestoreApplier.actor.cpp +++ b/fdbserver/RestoreApplier.actor.cpp @@ -17,3 +17,453 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + + +#include "fdbclient/NativeAPI.actor.h" +#include "fdbclient/SystemData.h" + +// Backup agent header +#include "fdbclient/BackupAgent.actor.h" +//#include "FileBackupAgent.h" +#include "fdbclient/ManagementAPI.actor.h" +#include "fdbclient/MutationList.h" +#include "fdbclient/BackupContainer.h" + +#include "fdbserver/RestoreCommon.actor.h" +#include "fdbserver/RestoreUtil.h" +#include "fdbserver/RestoreRoleCommon.actor.h" +#include "fdbserver/RestoreApplier.actor.h" + + +#include "flow/actorcompiler.h" // This must be the last #include. + +ACTOR Future handleGetApplierKeyRangeRequest(RestoreGetApplierKeyRangeRequest req, Reference self); +ACTOR Future handleSetApplierKeyRangeRequest(RestoreSetApplierKeyRangeRequest req, Reference self); +ACTOR Future handleCalculateApplierKeyRangeRequest(RestoreCalculateApplierKeyRangeRequest req, Reference self); +ACTOR Future handleSendSampleMutationVectorRequest(RestoreSendMutationVectorRequest req, Reference self); +ACTOR Future handleSendMutationVectorRequest(RestoreSendMutationVectorRequest req, Reference self); +ACTOR Future handleApplyToDBRequest(RestoreSimpleRequest req, Reference self, Database cx); + + +ACTOR Future restoreApplierCore(Reference self, RestoreApplierInterface applierInterf, Database cx) { + state ActorCollection actors(false); + state double lastLoopTopTime; + loop { + + double loopTopTime = now(); + double elapsedTime = loopTopTime - lastLoopTopTime; + if( elapsedTime > 0.050 ) { + if (g_random->random01() < 0.01) + TraceEvent(SevWarn, "SlowRestoreLoaderLoopx100").detail("NodeDesc", self->describeNode()).detail("Elapsed", elapsedTime); + } + lastLoopTopTime = loopTopTime; + state std::string requestTypeStr = "[Init]"; + + try { + choose { + when ( RestoreSimpleRequest req = waitNext(applierInterf.heartbeat.getFuture()) ) { + requestTypeStr = "heartbeat"; + wait(handleHeartbeat(req, applierInterf.id())); + } + when ( RestoreGetApplierKeyRangeRequest req = 
waitNext(applierInterf.getApplierKeyRangeRequest.getFuture()) ) { + requestTypeStr = "getApplierKeyRangeRequest"; + wait(handleGetApplierKeyRangeRequest(req, self)); + } + when ( RestoreSetApplierKeyRangeRequest req = waitNext(applierInterf.setApplierKeyRangeRequest.getFuture()) ) { + requestTypeStr = "setApplierKeyRangeRequest"; + wait(handleSetApplierKeyRangeRequest(req, self)); + } + + when ( RestoreCalculateApplierKeyRangeRequest req = waitNext(applierInterf.calculateApplierKeyRange.getFuture()) ) { + requestTypeStr = "calculateApplierKeyRange"; + wait(handleCalculateApplierKeyRangeRequest(req, self)); + } + when ( RestoreSendMutationVectorRequest req = waitNext(applierInterf.sendSampleMutationVector.getFuture()) ) { + requestTypeStr = "sendSampleMutationVector"; + actors.add( handleSendSampleMutationVectorRequest(req, self)); + } + when ( RestoreSendMutationVectorRequest req = waitNext(applierInterf.sendMutationVector.getFuture()) ) { + requestTypeStr = "sendMutationVector"; + actors.add( handleSendMutationVectorRequest(req, self) ); + } + when ( RestoreSimpleRequest req = waitNext(applierInterf.applyToDB.getFuture()) ) { + requestTypeStr = "applyToDB"; + actors.add( handleApplyToDBRequest(req, self, cx) ); + } + + when ( RestoreVersionBatchRequest req = waitNext(applierInterf.initVersionBatch.getFuture()) ) { + requestTypeStr = "initVersionBatch"; + wait(handleInitVersionBatchRequest(req, self)); + } + + // TODO: To modify the interface for the following 2 when condition + when ( RestoreSimpleRequest req = waitNext(applierInterf.collectRestoreRoleInterfaces.getFuture()) ) { + // Step: Find other worker's workerInterfaces + // NOTE: This must be after wait(configureRolesHandler()) because we must ensure all workers have registered their workerInterfaces into DB before we can read the workerInterface. + // TODO: Wait until all workers have registered their workerInterface. 
+ wait( handleCollectRestoreRoleInterfaceRequest(req, self, cx) ); + } + } + + } catch (Error &e) { + fprintf(stdout, "[ERROR] Loader handle received request:%s error. error code:%d, error message:%s\n", + requestTypeStr.c_str(), e.code(), e.what()); + + if ( requestTypeStr.find("[Init]") != std::string::npos ) { + printf("Exit due to error at requestType:%s", requestTypeStr.c_str()); + break; + } + } + } + + return Void(); +} + + + +ACTOR Future handleCalculateApplierKeyRangeRequest(RestoreCalculateApplierKeyRangeRequest req, Reference self) { + state int numMutations = 0; + state std::vector> keyRangeLowerBounds; + + while (self->isInProgress(RestoreCommandEnum::Calculate_Applier_KeyRange)) { + printf("[DEBUG] NODE:%s Calculate_Applier_KeyRange wait for 5s\n", self->describeNode().c_str()); + wait(delay(5.0)); + } + + wait( delay(1.0) ); + // Handle duplicate message + // We need to recalculate the value for duplicate message! Because the reply to duplicate message may arrive earlier! 
+ if (self->isCmdProcessed(req.cmdID) && !keyRangeLowerBounds.empty() ) { + printf("[DEBUG] Node:%s skip duplicate cmd:%s\n", self->describeNode().c_str(), req.cmdID.toString().c_str()); + req.reply.send(GetKeyRangeNumberReply(keyRangeLowerBounds.size())); + return Void(); + } + self->setInProgressFlag(RestoreCommandEnum::Calculate_Applier_KeyRange); + + // Applier will calculate applier key range + printf("[INFO][Applier] CMD:%s, Node:%s Calculate key ranges for %d appliers\n", + req.cmdID.toString().c_str(), self->describeNode().c_str(), req.numAppliers); + + //ASSERT(req.cmd == (RestoreCommandEnum) req.cmdID.phase); + if ( keyRangeLowerBounds.empty() ) { + keyRangeLowerBounds = self->calculateAppliersKeyRanges(req.numAppliers); // keyRangeIndex is the number of key ranges requested + self->keyRangeLowerBounds = keyRangeLowerBounds; + } + + printf("[INFO][Applier] CMD:%s, NodeID:%s: num of key ranges:%ld\n", + req.cmdID.toString().c_str(), self->describeNode().c_str(), keyRangeLowerBounds.size()); + req.reply.send(GetKeyRangeNumberReply(keyRangeLowerBounds.size())); + self->processedCmd[req.cmdID] = 1; // We should not skip this command in the following phase. Otherwise, the handler in other phases may return a wrong number of appliers + self->clearInProgressFlag(RestoreCommandEnum::Calculate_Applier_KeyRange); + + return Void(); +} + +ACTOR Future handleGetApplierKeyRangeRequest(RestoreGetApplierKeyRangeRequest req, Reference self) { + state int numMutations = 0; + //state std::vector> keyRangeLowerBounds = self->keyRangeLowerBounds; + + while (self->isInProgress(RestoreCommandEnum::Get_Applier_KeyRange)) { + printf("[DEBUG] NODE:%s Calculate_Applier_KeyRange wait for 5s\n", self->describeNode().c_str()); + wait(delay(5.0)); + } + + wait( delay(1.0) ); + //NOTE: Must reply a valid lowerBound and upperBound! Otherwise, the master will receive an invalid value! 
+ // if (self->isCmdProcessed(req.cmdID) ) { + // printf("[DEBUG] Node:%s skip duplicate cmd:%s\n", self->describeNode().c_str(), req.cmdID.toString().c_str()); + // req.reply.send(GetKeyRangeReply(workerInterf.id(), req.cmdID)); // Must wait until the previous command returns + // return Void(); + // } + self->setInProgressFlag(RestoreCommandEnum::Get_Applier_KeyRange); + + if ( req.applierIndex < 0 || req.applierIndex >= self->keyRangeLowerBounds.size() ) { + printf("[INFO][Applier] NodeID:%s Get_Applier_KeyRange keyRangeIndex is out of range. keyIndex:%d keyRagneSize:%ld\n", + self->describeNode().c_str(), req.applierIndex, self->keyRangeLowerBounds.size()); + } + //ASSERT(req.cmd == (RestoreCommandEnum) req.cmdID.phase); + + printf("[INFO][Applier] NodeID:%s replies Get_Applier_KeyRange. keyRangeIndex:%d lower_bound_of_keyRange:%s\n", + self->describeNode().c_str(), req.applierIndex, getHexString(self->keyRangeLowerBounds[req.applierIndex]).c_str()); + + KeyRef lowerBound = self->keyRangeLowerBounds[req.applierIndex]; + KeyRef upperBound = (req.applierIndex + 1) < self->keyRangeLowerBounds.size() ? self->keyRangeLowerBounds[req.applierIndex+1] : normalKeys.end; + + req.reply.send(GetKeyRangeReply(self->id(), req.cmdID, req.applierIndex, lowerBound, upperBound)); + self->clearInProgressFlag(RestoreCommandEnum::Get_Applier_KeyRange); + + return Void(); + +} + +// Assign key range to applier +ACTOR Future handleSetApplierKeyRangeRequest(RestoreSetApplierKeyRangeRequest req, Reference self) { + // Idempodent operation. 
OK to re-execute the duplicate cmd + // The applier should remember the key range it is responsible for + //ASSERT(req.cmd == (RestoreCommandEnum) req.cmdID.phase); + //self->applierStatus.keyRange = req.range; + while (self->isInProgress(RestoreCommandEnum::Assign_Applier_KeyRange)) { + printf("[DEBUG] NODE:%s handleSetApplierKeyRangeRequest wait for 1s\n", self->describeNode().c_str()); + wait(delay(1.0)); + } + if ( self->isCmdProcessed(req.cmdID) ) { + req.reply.send(RestoreCommonReply(self->id(),req.cmdID)); + return Void(); + } + self->setInProgressFlag(RestoreCommandEnum::Assign_Applier_KeyRange); + + self->range2Applier[req.range.begin] = req.applierID; + + self->processedCmd[req.cmdID] = 1; + self->clearInProgressFlag(RestoreCommandEnum::Assign_Applier_KeyRange); + + req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); + + return Void(); +} + + + +// Applier receive mutation from loader +ACTOR Future handleSendMutationVectorRequest(RestoreSendMutationVectorRequest req, Reference self) { + state int numMutations = 0; + + //wait( delay(1.0) ); //Q: Why adding this delay will cause segmentation fault? + if ( debug_verbose ) { + printf("[VERBOSE_DEBUG] Node:%s receive mutation number:%d\n", self->describeNode().c_str(), req.mutations.size()); + } + + // NOTE: We have insert operation to self->kvOps. For the same worker, we should only allow one actor of this kind to run at any time! + // Otherwise, race condition may happen! 
+ while (self->isInProgress(RestoreCommandEnum::Loader_Send_Mutations_To_Applier)) { + printf("[DEBUG] NODE:%s sendMutation wait for 1s\n", self->describeNode().c_str()); + wait(delay(1.0)); + } + + // Handle duplicate cmd + if ( self->isCmdProcessed(req.cmdID) ) { + //printf("[DEBUG] NODE:%s skip duplicate cmd:%s\n", self->describeNode().c_str(), req.cmdID.toString().c_str()); + //printf("[DEBUG] Skipped mutation:%s\n", req.mutation.toString().c_str()); + req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); + return Void(); + } + self->setInProgressFlag(RestoreCommandEnum::Loader_Send_Mutations_To_Applier); + + // Applier will cache the mutations at each version. Once receive all mutations, applier will apply them to DB + state uint64_t commitVersion = req.commitVersion; + VectorRef mutations(req.mutations); + printf("[DEBUG] Node:%s receive %d mutations at version:%ld\n", self->describeNode().c_str(), mutations.size(), commitVersion); + if ( self->kvOps.find(commitVersion) == self->kvOps.end() ) { + self->kvOps.insert(std::make_pair(commitVersion, VectorRef())); + } + state int mIndex = 0; + for (mIndex = 0; mIndex < mutations.size(); mIndex++) { + MutationRef mutation = mutations[mIndex]; + self->kvOps[commitVersion].push_back_deep(self->kvOps[commitVersion].arena(), mutation); + numMutations++; + if ( numMutations % 100000 == 1 ) { // Should be different value in simulation and in real mode + printf("[INFO][Applier] Node:%s Receives %d mutations. 
cur_mutation:%s\n", + self->describeNode().c_str(), numMutations, mutation.toString().c_str()); + } + } + + req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); + // Avoid race condition when this actor is called twice on the same command + self->processedCmd[req.cmdID] = 1; + self->clearInProgressFlag(RestoreCommandEnum::Loader_Send_Mutations_To_Applier); + + return Void(); +} + +ACTOR Future handleSendSampleMutationVectorRequest(RestoreSendMutationVectorRequest req, Reference self) { + state int numMutations = 0; + self->numSampledMutations = 0; + //wait( delay(1.0) ); + //ASSERT(req.cmd == (RestoreCommandEnum) req.cmdID.phase); + + // NOTE: We have insert operation to self->kvOps. For the same worker, we should only allow one actor of this kind to run at any time! + // Otherwise, race condition may happen! + while (self->isInProgress(RestoreCommandEnum::Loader_Send_Sample_Mutation_To_Applier)) { + printf("[DEBUG] NODE:%s handleSendSampleMutationVectorRequest wait for 1s\n", self->describeNode().c_str()); + wait(delay(1.0)); + } + + // Handle duplicate message + if (self->isCmdProcessed(req.cmdID)) { + printf("[DEBUG] NODE:%s skip duplicate cmd:%s\n", self->describeNode().c_str(), req.cmdID.toString().c_str()); + req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); + return Void(); + } + self->setInProgressFlag(RestoreCommandEnum::Loader_Send_Sample_Mutation_To_Applier); + + // Applier will cache the mutations at each version. 
Once receive all mutations, applier will apply them to DB + state uint64_t commitVersion = req.commitVersion; + // TODO: Change the req.mutation to a vector of mutations + VectorRef mutations(req.mutations); + + state int mIndex = 0; + for (mIndex = 0; mIndex < mutations.size(); mIndex++) { + MutationRef mutation = mutations[mIndex]; + if ( self->keyOpsCount.find(mutation.param1) == self->keyOpsCount.end() ) { + self->keyOpsCount.insert(std::make_pair(mutation.param1, 0)); + } + // NOTE: We may receive the same mutation more than once due to network package lost. + // Since sampling is just an estimation and the network should be stable enough, we do NOT handle the duplication for now + // In a very unreliable network, we may get many duplicate messages and get a bad key-range splits for appliers. But the restore should still work except for running slower. + self->keyOpsCount[mutation.param1]++; + self->numSampledMutations++; + + if ( debug_verbose && self->numSampledMutations % 1000 == 1 ) { + printf("[Sampling][Applier] Node:%s Receives %d sampled mutations. 
cur_mutation:%s\n", + self->describeNode().c_str(), self->numSampledMutations, mutation.toString().c_str()); + } + } + + req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); + self->processedCmd[req.cmdID] = 1; + + self->clearInProgressFlag(RestoreCommandEnum::Loader_Send_Sample_Mutation_To_Applier); + + return Void(); +} + + ACTOR Future handleApplyToDBRequest(RestoreSimpleRequest req, Reference self, Database cx) { + state bool isPrint = false; //Debug message + state std::string typeStr = ""; + + // Wait in case the applyToDB request was delivered twice; + while (self->inProgressApplyToDB) { + printf("[DEBUG] NODE:%s inProgressApplyToDB wait for 5s\n", self->describeNode().c_str()); + wait(delay(5.0)); + } + + if ( self->isCmdProcessed(req.cmdID) ) { + printf("[DEBUG] NODE:%s skip duplicate cmd:%s\n", self->describeNode().c_str(), req.cmdID.toString().c_str()); + req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); + return Void(); + } + + self->inProgressApplyToDB = true; + + // Assume the process will not crash when it apply mutations to DB. The reply message can be lost though + if (self->kvOps.empty()) { + printf("Node:%s kvOps is empty. 
No-op for apply to DB\n", self->describeNode().c_str()); + req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); + self->processedCmd[req.cmdID] = 1; + self->inProgressApplyToDB = false; + return Void(); + } + + self->sanityCheckMutationOps(); + + if ( debug_verbose ) { + TraceEvent("ApplyKVOPsToDB").detail("MapSize", self->kvOps.size()); + printf("ApplyKVOPsToDB num_of_version:%ld\n", self->kvOps.size()); + } + state std::map>>::iterator it = self->kvOps.begin(); + state std::map>>::iterator prevIt = it; + state int index = 0; + state int prevIndex = index; + state int count = 0; + state Reference tr(new ReadYourWritesTransaction(cx)); + state int numVersion = 0; + state double transactionSize = 0; + loop { + try { + tr->reset(); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + transactionSize = 0; + + for ( ; it != self->kvOps.end(); ++it ) { + numVersion++; + if ( debug_verbose ) { + TraceEvent("ApplyKVOPsToDB\t").detail("Version", it->first).detail("OpNum", it->second.size()); + } + //printf("ApplyKVOPsToDB numVersion:%d Version:%08lx num_of_ops:%d, \n", numVersion, it->first, it->second.size()); + + state MutationRef m; + for ( ; index < it->second.size(); ++index ) { + m = it->second[index]; + if ( m.type >= MutationRef::Type::SetValue && m.type <= MutationRef::Type::MAX_ATOMIC_OP ) + typeStr = typeString[m.type]; + else { + printf("ApplyKVOPsToDB MutationType:%d is out of range\n", m.type); + } + + if ( debug_verbose && count % 1000 == 1 ) { + printf("ApplyKVOPsToDB Node:%s num_mutation:%d Version:%08lx num_of_ops:%d\n", + self->describeNode().c_str(), count, it->first, it->second.size()); + } + + if ( debug_verbose ) { + printf("[VERBOSE_DEBUG] Node:%s apply mutation:%s\n", self->describeNode().c_str(), m.toString().c_str()); + } + + if ( m.type == MutationRef::SetValue ) { + tr->set(m.param1, m.param2); + } else if ( m.type == MutationRef::ClearRange ) { + KeyRangeRef 
mutationRange(m.param1, m.param2); + tr->clear(mutationRange); + } else if ( isAtomicOp((MutationRef::Type) m.type) ) { + //// Now handle atomic operation from this if statement + // TODO: Have not de-duplicated the mutations for multiple network delivery + // ATOMIC_MASK = (1 << AddValue) | (1 << And) | (1 << Or) | (1 << Xor) | (1 << AppendIfFits) | (1 << Max) | (1 << Min) | (1 << SetVersionstampedKey) | (1 << SetVersionstampedValue) | (1 << ByteMin) | (1 << ByteMax) | (1 << MinV2) | (1 << AndV2), + //atomicOp( const KeyRef& key, const ValueRef& operand, uint32_t operationType ) + tr->atomicOp(m.param1, m.param2, m.type); + } else { + printf("[WARNING] mtype:%d (%s) unhandled\n", m.type, typeStr.c_str()); + } + ++count; + transactionSize += m.expectedSize(); + + if ( transactionSize >= transactionBatchSizeThreshold ) { // commit per 1000 mutations + wait(tr->commit()); + tr->reset(); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + prevIt = it; + prevIndex = index; + transactionSize = 0; + } + + if ( isPrint ) { + printf("\tApplyKVOPsToDB Version:%016lx MType:%s K:%s, V:%s K_size:%d V_size:%d\n", it->first, typeStr.c_str(), + getHexString(m.param1).c_str(), getHexString(m.param2).c_str(), m.param1.size(), m.param2.size()); + + TraceEvent("ApplyKVOPsToDB\t\t").detail("Version", it->first) + .detail("MType", m.type).detail("MTypeStr", typeStr) + .detail("MKey", getHexString(m.param1)) + .detail("MValueSize", m.param2.size()) + .detail("MValue", getHexString(m.param2)); + } + } + index = 0; + } + // Last transaction + if (transactionSize > 0) { + wait(tr->commit()); + } + break; + } catch(Error &e) { + printf("ApplyKVOPsToDB transaction error:%s.\n", e.what()); + wait(tr->onError(e)); + it = prevIt; + index = prevIndex; + transactionSize = 0; + } + } + + self->kvOps.clear(); + printf("Node:%s ApplyKVOPsToDB number of kv mutations:%d\n", self->describeNode().c_str(), count); + + 
req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); + printf("self->processedCmd size:%d req.cmdID:%s\n", self->processedCmd.size(), req.cmdID.toString().c_str()); + self->processedCmd[req.cmdID] = 1; + self->inProgressApplyToDB = false; + + return Void(); +} + + + diff --git a/fdbserver/RestoreApplier.actor.h b/fdbserver/RestoreApplier.actor.h index 2295b6f9a6..2eddd58c99 100644 --- a/fdbserver/RestoreApplier.actor.h +++ b/fdbserver/RestoreApplier.actor.h @@ -21,7 +21,7 @@ // Declear RestoreApplier interface and actors #pragma once -#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_RestoreApplierInterface_H) +#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_RestoreApplierInterface_G_H) #define FDBSERVER_RestoreApplierInterface_G_H #include "fdbserver/RestoreApplier.actor.g.h" #elif !defined(FDBSERVER_RestoreApplierInterface_H) @@ -35,5 +35,150 @@ #include "fdbserver/CoordinationInterface.h" #include "fdbrpc/Locality.h" +#include "fdbserver/RestoreUtil.h" +#include "fdbserver/RestoreRoleCommon.actor.h" +#include "fdbserver/RestoreWorkerInterface.h" +#include "flow/actorcompiler.h" // has to be last include + +extern double transactionBatchSizeThreshold; + +struct RestoreApplierData : RestoreRoleData, public ReferenceCounted { + // range2Applier is in master and loader node. Loader node uses this to determine which applier a mutation should be sent + std::map, UID> range2Applier; // KeyRef is the inclusive lower bound of the key range the applier (UID) is responsible for + std::map, int> keyOpsCount; // The number of operations per key which is used to determine the key-range boundary for appliers + int numSampledMutations; // The total number of mutations received from sampled data. 
+ + // For master applier to hold the lower bound of key ranges for each applier + std::vector> keyRangeLowerBounds; + + // TODO: This block of variables may be moved to RestoreRoleData + bool inProgressApplyToDB = false; + + // Temporary data structure for parsing range and log files into (version, mutations) + std::map>> kvOps; + + void addref() { return ReferenceCounted::addref(); } + void delref() { return ReferenceCounted::delref(); } + + RestoreApplierData() { + nodeID = g_random->randomUniqueID(); + nodeIndex = 0; + } + + ~RestoreApplierData() {} + + std::string describeNode() { + std::stringstream ss; + ss << "NodeID:" << nodeID.toString() << " nodeIndex:" << nodeIndex; + return ss.str(); + } + + void resetPerVersionBatch() { + RestoreRoleData::resetPerVersionBatch(); + + inProgressApplyToDB = false; + kvOps.clear(); + } + + void sanityCheckMutationOps() { + if (kvOps.empty()) + return; + + if ( isKVOpsSorted() ) { + printf("[CORRECT] KVOps is sorted by version\n"); + } else { + printf("[ERROR]!!! KVOps is NOT sorted by version\n"); + } + + if ( allOpsAreKnown() ) { + printf("[CORRECT] KVOps all operations are known.\n"); + } else { + printf("[ERROR]!!! KVOps has unknown mutation op. 
Exit...\n"); + } + } + + bool isKVOpsSorted() { + bool ret = true; + auto prev = kvOps.begin(); + for ( auto it = kvOps.begin(); it != kvOps.end(); ++it ) { + if ( prev->first > it->first ) { + ret = false; + break; + } + prev = it; + } + return ret; + } + + bool allOpsAreKnown() { + bool ret = true; + for ( auto it = kvOps.begin(); it != kvOps.end(); ++it ) { + for ( auto m = it->second.begin(); m != it->second.end(); ++m ) { + if ( m->type == MutationRef::SetValue || m->type == MutationRef::ClearRange + || isAtomicOp((MutationRef::Type) m->type) ) + continue; + else { + printf("[ERROR] Unknown mutation type:%d\n", m->type); + ret = false; + } + } + + } + + return ret; + } + + + std::vector> calculateAppliersKeyRanges(int numAppliers) { + ASSERT(numAppliers > 0); + std::vector> lowerBounds; + int numSampledMutations = 0; + for (auto &count : keyOpsCount) { + numSampledMutations += count.second; + } + + //intervalLength = (numSampledMutations - remainder) / (numApplier - 1) + int intervalLength = std::max(numSampledMutations / numAppliers, 1); // minimal length is 1 + int curCount = 0; + int curInterval = 0; + + printf("[INFO] Node:%s calculateAppliersKeyRanges(): numSampledMutations:%d numAppliers:%d intervalLength:%d\n", + describeNode().c_str(), + numSampledMutations, numAppliers, intervalLength); + for (auto &count : keyOpsCount) { + if (curCount >= curInterval * intervalLength) { + printf("[INFO] Node:%s calculateAppliersKeyRanges(): Add a new key range [%d]:%s: curCount:%d\n", + describeNode().c_str(), curInterval, count.first.toString().c_str(), curCount); + lowerBounds.push_back(count.first); // The lower bound of the current key range + curInterval++; + } + curCount += count.second; + } + + if ( lowerBounds.size() != numAppliers ) { + printf("[WARNING] calculateAppliersKeyRanges() WE MAY NOT USE ALL APPLIERS efficiently! 
num_keyRanges:%ld numAppliers:%d\n", + lowerBounds.size(), numAppliers); + printLowerBounds(lowerBounds); + } + + //ASSERT(lowerBounds.size() <= numAppliers + 1); // We may have at most numAppliers + 1 key ranges + if ( lowerBounds.size() >= numAppliers ) { + printf("[WARNING] Key ranges number:%ld > numAppliers:%d. Merge the last ones\n", lowerBounds.size(), numAppliers); + } + + while ( lowerBounds.size() >= numAppliers ) { + printf("[WARNING] Key ranges number:%ld > numAppliers:%d. Merge the last ones\n", lowerBounds.size(), numAppliers); + lowerBounds.pop_back(); + } + + return lowerBounds; + } +}; + + +ACTOR Future restoreApplierCore(Reference self, RestoreApplierInterface applierInterf, Database cx); + + +#include "flow/unactorcompiler.h" #endif \ No newline at end of file diff --git a/fdbserver/RestoreCommon.actor.h b/fdbserver/RestoreCommon.actor.h index ef778fef54..834f3f51a1 100644 --- a/fdbserver/RestoreCommon.actor.h +++ b/fdbserver/RestoreCommon.actor.h @@ -32,7 +32,7 @@ #include "fdbrpc/IAsyncFile.h" #include "fdbclient/BackupAgent.actor.h" #include "flow/genericactors.actor.h" -#include "flow/actorcompiler.h" // has to be last include + // RestoreConfig copied from FileBackupAgent.actor.cpp // We copy RestoreConfig instead of using (and potentially changing) it in place to avoid conflict with the existing code diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index bc10f5226b..cfccddb442 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -17,3 +17,1132 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +#include "fdbclient/BackupContainer.h" +#include "fdbserver/RestoreLoader.actor.h" + +#include "flow/actorcompiler.h" // This must be the last #include. 
+ +ACTOR Future handleSampleRangeFileRequest(RestoreLoadFileRequest req, Reference self); +ACTOR Future handleSampleLogFileRequest(RestoreLoadFileRequest req, Reference self); +ACTOR Future handleSetApplierKeyRangeVectorRequest(RestoreSetApplierKeyRangeVectorRequest req, Reference self); +ACTOR Future handleLoadRangeFileRequest(RestoreLoadFileRequest req, Reference self); +ACTOR Future handleLoadLogFileRequest(RestoreLoadFileRequest req, Reference self); +ACTOR Future registerMutationsToMasterApplier(Reference self); + + ACTOR static Future _parseLogFileToMutationsOnLoader(Reference self, + Reference bc, Version version, + std::string fileName, int64_t readOffset, int64_t readLen, + KeyRange restoreRange, Key addPrefix, Key removePrefix, + Key mutationLogPrefix); +ACTOR static Future _parseRangeFileToMutationsOnLoader(Reference self, + Reference bc, Version version, + std::string fileName, int64_t readOffset_input, int64_t readLen_input, + KeyRange restoreRange, Key addPrefix, Key removePrefix); +ACTOR Future registerMutationsToApplier(Reference self); +void parseSerializedMutation(Reference self, bool isSampling); +bool isRangeMutation(MutationRef m); +void splitMutation(Reference self, MutationRef m, Arena& mvector_arena, VectorRef mvector, Arena& nodeIDs_arena, VectorRef nodeIDs) ; + + +ACTOR Future restoreLoaderCore(Reference self, RestoreLoaderInterface loaderInterf, Database cx) { + state ActorCollection actors(false); + state double lastLoopTopTime; + loop { + + double loopTopTime = now(); + double elapsedTime = loopTopTime - lastLoopTopTime; + if( elapsedTime > 0.050 ) { + if (g_random->random01() < 0.01) + TraceEvent(SevWarn, "SlowRestoreLoaderLoopx100").detail("NodeDesc", self->describeNode()).detail("Elapsed", elapsedTime); + } + lastLoopTopTime = loopTopTime; + state std::string requestTypeStr = "[Init]"; + + try { + choose { + when ( RestoreSimpleRequest req = waitNext(loaderInterf.heartbeat.getFuture()) ) { + requestTypeStr = "heartbeat"; + 
wait(handleHeartbeat(req, loaderInterf.id())); + } + when ( RestoreLoadFileRequest req = waitNext(loaderInterf.sampleRangeFile.getFuture()) ) { + requestTypeStr = "sampleRangeFile"; + self->initBackupContainer(req.param.url); + actors.add( handleSampleRangeFileRequest(req, self) ); + } + when ( RestoreLoadFileRequest req = waitNext(loaderInterf.sampleLogFile.getFuture()) ) { + self->initBackupContainer(req.param.url); + requestTypeStr = "sampleLogFile"; + actors.add( handleSampleLogFileRequest(req, self) ); + } + when ( RestoreSetApplierKeyRangeVectorRequest req = waitNext(loaderInterf.setApplierKeyRangeVectorRequest.getFuture()) ) { + requestTypeStr = "setApplierKeyRangeVectorRequest"; + wait(handleSetApplierKeyRangeVectorRequest(req, self)); + } + when ( RestoreLoadFileRequest req = waitNext(loaderInterf.loadRangeFile.getFuture()) ) { + requestTypeStr = "loadRangeFile"; + self->initBackupContainer(req.param.url); + actors.add( handleLoadRangeFileRequest(req, self) ); + } + when ( RestoreLoadFileRequest req = waitNext(loaderInterf.loadLogFile.getFuture()) ) { + requestTypeStr = "loadLogFile"; + self->initBackupContainer(req.param.url); + actors.add( handleLoadLogFileRequest(req, self) ); + } + + when ( RestoreVersionBatchRequest req = waitNext(loaderInterf.initVersionBatch.getFuture()) ) { + requestTypeStr = "initVersionBatch"; + wait(handleInitVersionBatchRequest(req, self)); + } + + // TODO: To modify the following when conditions + when ( RestoreSimpleRequest req = waitNext(loaderInterf.collectRestoreRoleInterfaces.getFuture()) ) { + // Step: Find other worker's workerInterfaces + // NOTE: This must be after wait(configureRolesHandler()) because we must ensure all workers have registered their workerInterfaces into DB before we can read the workerInterface. + // TODO: Wait until all workers have registered their workerInterface. 
+ wait( handleCollectRestoreRoleInterfaceRequest(req, self, cx) ); + } + } + + } catch (Error &e) { + fprintf(stdout, "[ERROR] Restore Loader handle received request:%s error. error code:%d, error message:%s\n", + requestTypeStr.c_str(), e.code(), e.what()); + + if ( requestTypeStr.find("[Init]") != std::string::npos ) { + printf("Exit due to error at requestType:%s", requestTypeStr.c_str()); + break; + } + } + } + + return Void(); +} + +// Restore Loader +ACTOR Future handleSetApplierKeyRangeVectorRequest(RestoreSetApplierKeyRangeVectorRequest req, Reference self) { + // Idempotent operation. OK to re-execute the duplicate cmd + // The applier should remember the key range it is responsible for + //ASSERT(req.cmd == (RestoreCommandEnum) req.cmdID.phase); + //self->applierStatus.keyRange = req.range; + while (self->isInProgress(RestoreCommandEnum::Notify_Loader_ApplierKeyRange)) { + printf("[DEBUG] NODE:%s handleSetApplierKeyRangeVectorRequest wait for 1s\n", self->describeNode().c_str()); + wait(delay(1.0)); + } + if ( self->isCmdProcessed(req.cmdID) ) { + req.reply.send(RestoreCommonReply(self->id(),req.cmdID)); + return Void(); + } + self->setInProgressFlag(RestoreCommandEnum::Notify_Loader_ApplierKeyRange); + + VectorRef appliers = req.applierIDs; + VectorRef ranges = req.ranges; + for ( int i = 0; i < appliers.size(); i++ ) { + self->range2Applier[ranges[i].begin] = appliers[i]; + } + + self->processedCmd[req.cmdID] = 1; + self->clearInProgressFlag(RestoreCommandEnum::Notify_Loader_ApplierKeyRange); + req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); + + return Void(); +} + +// TODO: Remove the RestoreLoaderInterface param., which is not needed in the handler functions +// Restore Loader +ACTOR Future handleSampleRangeFileRequest(RestoreLoadFileRequest req, Reference self) { + //printf("[INFO] Node:%s Got Restore Command: cmdID:%s.\n", self->describeNode().c_str(), req.cmdID.toString().c_str()); + + state LoadingParam param = req.param; + state int 
beginBlock = 0; + state int j = 0; + state int readLen = 0; + state int64_t readOffset = param.offset; + + while (self->isInProgress(RestoreCommandEnum::Sample_Range_File)) { + printf("[DEBUG] NODE:%s sampleRangeFile wait for 5s\n", self->describeNode().c_str()); + wait(delay(5.0)); + } + + // Handle duplicate, assuming cmdUID is always unique for the same workload + if ( self->isCmdProcessed(req.cmdID) ) { + printf("[DEBUG] NODE:%s skip duplicate cmd:%s\n", self->describeNode().c_str(), req.cmdID.toString().c_str()); + req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); + return Void(); + } + + self->setInProgressFlag(RestoreCommandEnum::Sample_Range_File); + printf("[Sample_Range_File][Loader] Node: %s, loading param:%s\n", + self->describeNode().c_str(), param.toString().c_str()); + + // TODO: This can be expensive + state Reference bc = self->bc; + printf("[INFO] node:%s open backup container for url:%s\n", + self->describeNode().c_str(), + param.url.toString().c_str()); + + + self->kvOps.clear(); //Clear kvOps so that kvOps only hold mutations for the current data block. We will send all mutations in kvOps to applier + self->mutationMap.clear(); + self->mutationPartMap.clear(); + + ASSERT( param.blockSize > 0 ); + //state std::vector> fileParserFutures; + if (param.offset % param.blockSize != 0) { + printf("[WARNING] Parse file not at block boundary! param.offset:%ld param.blocksize:%ld, remainder:%ld\n", + param.offset, param.blockSize, param.offset % param.blockSize); + } + + ASSERT( param.offset + param.blockSize >= param.length ); // We only sample one data block or less (at the end of the file) of a file. 
+ for (j = param.offset; j < param.length; j += param.blockSize) { + readOffset = j; + readLen = std::min(param.blockSize, param.length - j); + wait( _parseRangeFileToMutationsOnLoader(self, bc, param.version, param.filename, readOffset, readLen, param.restoreRange, param.addPrefix, param.removePrefix) ); + ++beginBlock; + } + + printf("[Sampling][Loader] Node:%s finishes sample Range file:%s\n", self->describeNode().c_str(), param.filename.c_str()); + // TODO: Send to applier to apply the mutations + printf("[Sampling][Loader] Node:%s will send sampled mutations to applier\n", self->describeNode().c_str()); + wait( registerMutationsToMasterApplier(self) ); // Send the parsed mutation to applier who will apply the mutation to DB + + //self->processedFiles.insert(std::make_pair(param.filename, 1)); + + //TODO: Send ack to master that loader has finished loading the data + req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); + self->processedCmd[req.cmdID] = 1; // Record the processed command to handle duplicate command + //self->kvOps.clear(); + + self->clearInProgressFlag(RestoreCommandEnum::Sample_Range_File); + + return Void(); +} + +ACTOR Future handleSampleLogFileRequest(RestoreLoadFileRequest req, Reference self) { + state LoadingParam param = req.param; + state int beginBlock = 0; + state int j = 0; + state int readLen = 0; + state int64_t readOffset = param.offset; + + while (self->isInProgress(RestoreCommandEnum::Sample_Log_File)) { + printf("[DEBUG] NODE:%s sampleLogFile wait for 5s\n", self->describeNode().c_str()); + wait(delay(5.0)); + } + + // Handle duplicate message + if ( self->isCmdProcessed(req.cmdID) ) { + printf("[DEBUG] NODE:%s skip duplicate cmd:%s\n", self->describeNode().c_str(), req.cmdID.toString().c_str()); + req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); + return Void(); + } + + self->setInProgressFlag(RestoreCommandEnum::Sample_Log_File); + printf("[Sample_Log_File][Loader] Node: %s, loading param:%s\n", 
self->describeNode().c_str(), param.toString().c_str()); + + // TODO: Expensive operation + state Reference bc = self->bc; + printf("[Sampling][Loader] Node:%s open backup container for url:%s\n", + self->describeNode().c_str(), + param.url.toString().c_str()); + printf("[Sampling][Loader] Node:%s filename:%s blockSize:%ld\n", + self->describeNode().c_str(), + param.filename.c_str(), param.blockSize); + + self->kvOps.clear(); //Clear kvOps so that kvOps only hold mutations for the current data block. We will send all mutations in kvOps to applier + self->mutationMap.clear(); + self->mutationPartMap.clear(); + + ASSERT( param.blockSize > 0 ); + //state std::vector> fileParserFutures; + if (param.offset % param.blockSize != 0) { + printf("[WARNING] Parse file not at block boundary! param.offset:%ld param.blocksize:%ld, remainder:%ld\n", + param.offset, param.blockSize, param.offset % param.blockSize); + } + ASSERT( param.offset + param.blockSize >= param.length ); // Assumption: Only sample one data block or less + for (j = param.offset; j < param.length; j += param.blockSize) { + readOffset = j; + readLen = std::min(param.blockSize, param.length - j); + // NOTE: Log file holds set of blocks of data. We need to parse the data block by block and get the kv pair(version, serialized_mutations) + // The set of mutations at the same version may be splitted into multiple kv pairs ACROSS multiple data blocks when the size of serialized_mutations is larger than 20000. 
+ wait( _parseLogFileToMutationsOnLoader(self, bc, param.version, param.filename, readOffset, readLen, param.restoreRange, param.addPrefix, param.removePrefix, param.mutationLogPrefix) ); + ++beginBlock; + } + printf("[Sampling][Loader] Node:%s finishes parsing the data block into kv pairs (version, serialized_mutations) for file:%s\n", self->describeNode().c_str(), param.filename.c_str()); + parseSerializedMutation(self, true); + + printf("[Sampling][Loader] Node:%s finishes process Log file:%s\n", self->describeNode().c_str(), param.filename.c_str()); + printf("[Sampling][Loader] Node:%s will send log mutations to applier\n", self->describeNode().c_str()); + wait( registerMutationsToMasterApplier(self) ); // Send the parsed mutation to applier who will apply the mutation to DB + + req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); // master node is waiting + self->processedFiles.insert(std::make_pair(param.filename, 1)); + self->processedCmd[req.cmdID] = 1; + + self->clearInProgressFlag(RestoreCommandEnum::Sample_Log_File); + + return Void(); +} + + +ACTOR Future handleLoadRangeFileRequest(RestoreLoadFileRequest req, Reference self) { + //printf("[INFO] Worker Node:%s starts handleLoadRangeFileRequest\n", self->describeNode().c_str()); + + state LoadingParam param; + state int64_t beginBlock = 0; + state int64_t j = 0; + state int64_t readLen = 0; + state int64_t readOffset = 0; + state Reference bc; + + param = req.param; + beginBlock = 0; + j = 0; + readLen = 0; + readOffset = 0; + readOffset = param.offset; + + while (self->isInProgress(RestoreCommandEnum::Assign_Loader_Range_File)) { + printf("[DEBUG] NODE:%s loadRangeFile wait for 5s\n", self->describeNode().c_str()); + wait(delay(5.0)); + } + + //Note: handle duplicate message delivery + if (self->processedFiles.find(param.filename) != self->processedFiles.end() || + self->isCmdProcessed(req.cmdID)) { + // printf("[WARNING]Node:%s, CMDUID:%s file:%s is delivered more than once! 
Reply directly without loading the file\n", + // self->describeNode().c_str(), req.cmdID.toString().c_str(), + // param.filename.c_str()); + req.reply.send(RestoreCommonReply(self->id(),req.cmdID)); + return Void(); + } + + self->setInProgressFlag(RestoreCommandEnum::Assign_Loader_Range_File); + + printf("[INFO][Loader] Node:%s, CMDUID:%s Execute: Assign_Loader_Range_File, loading param:%s\n", + self->describeNode().c_str(), req.cmdID.toString().c_str(), + param.toString().c_str()); + + bc = self->bc; + // printf("[INFO] Node:%s CMDUID:%s open backup container for url:%s\n", + // self->describeNode().c_str(), req.cmdID.toString().c_str(), + // param.url.toString().c_str()); + + + self->kvOps.clear(); //Clear kvOps so that kvOps only hold mutations for the current data block. We will send all mutations in kvOps to applier + self->mutationMap.clear(); + self->mutationPartMap.clear(); + + ASSERT( param.blockSize > 0 ); + //state std::vector> fileParserFutures; + if (param.offset % param.blockSize != 0) { + printf("[WARNING] Parse file not at block boundary! 
param.offset:%ld param.blocksize:%ld, remainder:%ld\n", + param.offset, param.blockSize, param.offset % param.blockSize); + } + for (j = param.offset; j < param.length; j += param.blockSize) { + readOffset = j; + readLen = std::min(param.blockSize, param.length - j); + printf("[DEBUG_TMP] _parseRangeFileToMutationsOnLoader starts\n"); + wait( _parseRangeFileToMutationsOnLoader(self, bc, param.version, param.filename, readOffset, readLen, param.restoreRange, param.addPrefix, param.removePrefix) ); + printf("[DEBUG_TMP] _parseRangeFileToMutationsOnLoader ends\n"); + ++beginBlock; + } + + printf("[INFO][Loader] Node:%s CMDUID:%s finishes process Range file:%s\n", + self->describeNode().c_str(), req.cmdID.toString().c_str(), + param.filename.c_str()); + // TODO: Send to applier to apply the mutations + // printf("[INFO][Loader] Node:%s CMDUID:%s will send range mutations to applier\n", + // self->describeNode().c_str(), self->cmdID.toString().c_str()); + wait( registerMutationsToApplier(self) ); // Send the parsed mutation to applier who will apply the mutation to DB + wait ( delay(1.0) ); + + self->processedFiles[param.filename] = 1; + self->processedCmd[req.cmdID] = 1; + + self->clearInProgressFlag(RestoreCommandEnum::Assign_Loader_Range_File); + printf("[INFO][Loader] Node:%s CMDUID:%s clear inProgressFlag :%lx for Assign_Loader_Range_File.\n", + self->describeNode().c_str(), req.cmdID.toString().c_str(), self->inProgressFlag); + + //Send ack to master that loader has finished loading the data + printf("[INFO][Loader] Node:%s CMDUID:%s send ack.\n", + self->describeNode().c_str(), self->cmdID.toString().c_str()); + req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); + + return Void(); + +} + + +ACTOR Future handleLoadLogFileRequest(RestoreLoadFileRequest req, Reference self) { + printf("[INFO] Worker Node:%s starts handleLoadLogFileRequest\n", self->describeNode().c_str()); + + state LoadingParam param; + state int64_t beginBlock = 0; + state int64_t j = 0; + 
state int64_t readLen = 0; + state int64_t readOffset = 0; + state Reference bc; + + param = req.param; + beginBlock = 0; + j = 0; + readLen = 0; + readOffset = 0; + readOffset = param.offset; + + while (self->isInProgress(RestoreCommandEnum::Assign_Loader_Log_File)) { + printf("[DEBUG] NODE:%s loadLogFile wait for 5s\n", self->describeNode().c_str()); + wait(delay(5.0)); + } + + //Note: handle duplicate message delivery + if (self->processedFiles.find(param.filename) != self->processedFiles.end() + || self->isCmdProcessed(req.cmdID)) { + printf("[WARNING] Node:%s CMDUID:%s file:%s is delivered more than once! Reply directly without loading the file\n", + self->describeNode().c_str(), req.cmdID.toString().c_str(), + param.filename.c_str()); + req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); + return Void(); + } + + self->setInProgressFlag(RestoreCommandEnum::Assign_Loader_Log_File); + + printf("[INFO][Loader] Node:%s CMDUID:%s Assign_Loader_Log_File loading param:%s\n", + self->describeNode().c_str(), req.cmdID.toString().c_str(), + param.toString().c_str()); + + bc = self->bc; + printf("[INFO][Loader] Node:%s CMDUID:%s open backup container for url:%s\n", + self->describeNode().c_str(), req.cmdID.toString().c_str(), + param.url.toString().c_str()); + printf("[INFO][Loader] Node:%s CMDUID:%s filename:%s blockSize:%ld\n", + self->describeNode().c_str(), req.cmdID.toString().c_str(), + param.filename.c_str(), param.blockSize); + + self->kvOps.clear(); //Clear kvOps so that kvOps only hold mutations for the current data block. We will send all mutations in kvOps to applier + self->mutationMap.clear(); + self->mutationPartMap.clear(); + + ASSERT( param.blockSize > 0 ); + //state std::vector> fileParserFutures; + if (param.offset % param.blockSize != 0) { + printf("[WARNING] Parse file not at block boundary! 
param.offset:%ld param.blocksize:%ld, remainder:%ld\n", + param.offset, param.blockSize, param.offset % param.blockSize); + } + for (j = param.offset; j < param.length; j += param.blockSize) { + readOffset = j; + readLen = std::min(param.blockSize, param.length - j); + // NOTE: Log file holds set of blocks of data. We need to parse the data block by block and get the kv pair(version, serialized_mutations) + // The set of mutations at the same version may be splitted into multiple kv pairs ACROSS multiple data blocks when the size of serialized_mutations is larger than 20000. + wait( _parseLogFileToMutationsOnLoader(self, bc, param.version, param.filename, readOffset, readLen, param.restoreRange, param.addPrefix, param.removePrefix, param.mutationLogPrefix) ); + ++beginBlock; + } + printf("[INFO][Loader] Node:%s CMDUID:%s finishes parsing the data block into kv pairs (version, serialized_mutations) for file:%s\n", + self->describeNode().c_str(), req.cmdID.toString().c_str(), + param.filename.c_str()); + parseSerializedMutation(self, false); + + printf("[INFO][Loader] Node:%s CMDUID:%s finishes process Log file:%s\n", + self->describeNode().c_str(), req.cmdID.toString().c_str(), + param.filename.c_str()); + printf("[INFO][Loader] Node:%s CMDUID:%s will send log mutations to applier\n", + self->describeNode().c_str(), req.cmdID.toString().c_str()); + wait( registerMutationsToApplier(self) ); // Send the parsed mutation to applier who will apply the mutation to DB + + req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); // master node is waiting + self->processedFiles[param.filename] = 1; + self->processedCmd[req.cmdID] = 1; + + self->clearInProgressFlag(RestoreCommandEnum::Assign_Loader_Log_File); + + return Void(); +} + + + +// Loader: Register sampled mutations +ACTOR Future registerMutationsToMasterApplier(Reference self) { + printf("[Sampling] Node:%s registerMutationsToMaster() self->masterApplierInterf:%s\n", + self->describeNode().c_str(), 
self->masterApplierInterf.toString().c_str()); + + state RestoreApplierInterface applierCmdInterf = self->masterApplierInterf; + state int packMutationNum = 0; + state int packMutationThreshold = 1; + state int kvCount = 0; + state std::vector> cmdReplies; + + state int splitMutationIndex = 0; + state std::map>>::iterator kvOp; + state int mIndex; + state uint64_t commitVersion; + state MutationRef kvm; + + state Standalone> mutationsBuffer; // The mutation vector to be sent to master applier + state double mutationsSize = 0; + //state double mutationVectorThreshold = 1; //1024 * 10; // Bytes + loop { + try { + cmdReplies.clear(); + mutationsBuffer.pop_front(mutationsBuffer.size()); + mutationsSize = 0; + packMutationNum = 0; + self->cmdID.initPhase(RestoreCommandEnum::Loader_Send_Sample_Mutation_To_Applier); + // TODO: Consider using a different EndPoint for loader and applier communication. + // Otherwise, applier may receive loader's message while applier is waiting for master to assign key-range + for ( kvOp = self->kvOps.begin(); kvOp != self->kvOps.end(); kvOp++) { + commitVersion = kvOp->first; + + for (mIndex = 0; mIndex < kvOp->second.size(); mIndex++) { + kvm = kvOp->second[mIndex]; + self->cmdID.nextCmd(); + if ( debug_verbose || true ) { // Debug deterministic bug + printf("[VERBOSE_DEBUG] send mutation to applier, mIndex:%d mutation:%s\n", mIndex, kvm.toString().c_str()); + } + mutationsBuffer.push_back(mutationsBuffer.arena(), kvm); + mutationsSize += kvm.expectedSize(); + if ( mutationsSize >= mutationVectorThreshold ) { + self->cmdID.nextCmd(); + cmdReplies.push_back(applierCmdInterf.sendSampleMutationVector.getReply( + RestoreSendMutationVectorRequest(self->cmdID, commitVersion, mutationsBuffer))); + mutationsBuffer.pop_front(mutationsBuffer.size()); + mutationsSize = 0; + if ( debug_verbose ) { + printf("[INFO][Loader] Waits for master applier to receive %ld mutations\n", mutationsBuffer.size()); + } + std::vector reps = wait( timeoutError( 
getAll(cmdReplies), FastRestore_Failure_Timeout ) ); + //std::vector reps = wait( getAll(cmdReplies) ); + cmdReplies.clear(); + } + + kvCount++; + } + } + + // The leftover mutationVector whose size is < mutationVectorThreshold + if ( mutationsSize > 0 ) { + self->cmdID.nextCmd(); + cmdReplies.push_back(applierCmdInterf.sendSampleMutationVector.getReply( + RestoreSendMutationVectorRequest(self->cmdID, commitVersion, mutationsBuffer))); + mutationsBuffer.pop_front(mutationsBuffer.size()); + mutationsSize = 0; + } + + + if (!cmdReplies.empty()) { + printf("[INFO][Loader] Last waits for master applier to receive %ld mutations\n", mutationsBuffer.size()); + //std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout) ); + std::vector reps = wait( getAll(cmdReplies) ); + cmdReplies.clear(); + } + + printf("[Sample Summary][Loader] Node:%s produces %d mutation operations\n", self->describeNode().c_str(), kvCount); + break; + } catch (Error &e) { + // TODO: Handle the command reply timeout error + if (e.code() != error_code_io_timeout) { + fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s timeout\n", self->describeNode().c_str(), self->cmdID.toString().c_str()); + } else { + fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. error code:%d, error message:%s\n", self->describeNode().c_str(), + self->cmdID.toString().c_str(), e.code(), e.what()); + } + printf("[WARNING] Node:%s timeout at waiting on replies of Loader_Send_Sample_Mutation_To_Applier. 
Retry...\n", self->describeNode().c_str()); + } + } + + return Void(); +} + + + +ACTOR Future registerMutationsToApplier(Reference self) { + printf("[INFO][Loader] Node:%s self->masterApplierInterf:%s, registerMutationsToApplier\n", + self->describeNode().c_str(), self->masterApplierInterf.toString().c_str()); + + state RestoreApplierInterface applierCmdInterf; + state int packMutationNum = 0; + state int packMutationThreshold = 10; + state int kvCount = 0; + state std::vector> cmdReplies; + + state int splitMutationIndex = 0; + + self->printAppliersKeyRange(); + + //state double mutationVectorThreshold = 1;//1024 * 10; // Bytes. + state std::map>> applierMutationsBuffer; // The mutation vector to be sent to each applier + state std::map applierMutationsSize; // buffered mutation vector size for each applier + // Initialize the above two maps + state std::vector applierIDs = self->getWorkingApplierIDs(); + loop { + try { + packMutationNum = 0; + splitMutationIndex = 0; + kvCount = 0; + state std::map>>::iterator kvOp; + self->cmdID.initPhase(RestoreCommandEnum::Loader_Send_Mutations_To_Applier); + // In case try-catch has error and loop back + applierMutationsBuffer.clear(); + applierMutationsSize.clear(); + for (auto &applierID : applierIDs) { + applierMutationsBuffer[applierID] = Standalone>(VectorRef()); + applierMutationsSize[applierID] = 0.0; + } + for ( kvOp = self->kvOps.begin(); kvOp != self->kvOps.end(); kvOp++) { + state uint64_t commitVersion = kvOp->first; + state int mIndex; + state MutationRef kvm; + for (mIndex = 0; mIndex < kvOp->second.size(); mIndex++) { + kvm = kvOp->second[mIndex]; + if ( debug_verbose ) { + printf("[VERBOSE_DEBUG] mutation to sent to applier, mutation:%s\n", kvm.toString().c_str()); + } + // Send the mutation to applier + if (isRangeMutation(kvm)) { + // Because using a vector of mutations causes overhead, and the range mutation should happen rarely; + // We handle the range mutation and key mutation differently for the benefit 
of avoiding memory copy + state Standalone> mvector; + state Standalone> nodeIDs; + // '' Bug may be here! The splitMutation() may be wrong! + splitMutation(self, kvm, mvector.arena(), mvector.contents(), nodeIDs.arena(), nodeIDs.contents()); + ASSERT(mvector.size() == nodeIDs.size()); + + for (splitMutationIndex = 0; splitMutationIndex < mvector.size(); splitMutationIndex++ ) { + MutationRef mutation = mvector[splitMutationIndex]; + UID applierID = nodeIDs[splitMutationIndex]; + applierCmdInterf = self->appliersInterf[applierID]; + applierMutationsBuffer[applierID].push_back(applierMutationsBuffer[applierID].arena(), mutation); // Q: Maybe push_back_deep()? + applierMutationsSize[applierID] += mutation.expectedSize(); + + kvCount++; + } + + for (auto &applierID : applierIDs) { + if ( applierMutationsSize[applierID] >= mutationVectorThreshold ) { + self->cmdID.nextCmd(); + cmdReplies.push_back(applierCmdInterf.sendMutationVector.getReply( + RestoreSendMutationVectorRequest(self->cmdID, commitVersion, applierMutationsBuffer[applierID]))); + applierMutationsBuffer[applierID].pop_front(applierMutationsBuffer[applierID].size()); + applierMutationsSize[applierID] = 0; + + printf("[INFO][Loader] Waits for applier to receive %ld range mutations\n", cmdReplies.size()); + std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout ) ); + cmdReplies.clear(); + } + } + } else { // mutation operates on a particular key + std::map, UID>::iterator itlow = self->range2Applier.lower_bound(kvm.param1); // lower_bound returns the iterator that is >= m.param1 + // make sure itlow->first <= m.param1 + if ( itlow == self->range2Applier.end() || itlow->first > kvm.param1 ) { + --itlow; + } + ASSERT( itlow->first <= kvm.param1 ); + MutationRef mutation = kvm; + UID applierID = itlow->second; + applierCmdInterf = self->appliersInterf[applierID]; + kvCount++; + + applierMutationsBuffer[applierID].push_back(applierMutationsBuffer[applierID].arena(), mutation); // 
Q: Maybe push_back_deep()? + applierMutationsSize[applierID] += mutation.expectedSize(); + if ( applierMutationsSize[applierID] >= mutationVectorThreshold ) { + self->cmdID.nextCmd(); + cmdReplies.push_back(applierCmdInterf.sendMutationVector.getReply( + RestoreSendMutationVectorRequest(self->cmdID, commitVersion, applierMutationsBuffer[applierID]))); + applierMutationsBuffer[applierID].pop_front(applierMutationsBuffer[applierID].size()); + applierMutationsSize[applierID] = 0; + + printf("[INFO][Loader] Waits for applier to receive %ld range mutations\n", cmdReplies.size()); + std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout ) ); + cmdReplies.clear(); + } + } + } + + } + + // In case the mutation vector is not larger than mutationVectorThreshold + printf("[DEBUG][Loader] sendMutationVector sends the remaining applierMutationsBuffer, applierIDs.size:%d\n", applierIDs.size()); + for (auto &applierID : applierIDs) { + if (applierMutationsBuffer[applierID].empty()) { //&& applierMutationsSize[applierID] >= 1 + continue; + } + printf("[DEBUG][Loader] sendMutationVector for applierID:%s\n", applierID.toString().c_str()); + self->cmdID.nextCmd(); + cmdReplies.push_back(applierCmdInterf.sendMutationVector.getReply( + RestoreSendMutationVectorRequest(self->cmdID, commitVersion, applierMutationsBuffer[applierID]))); + applierMutationsBuffer[applierID].pop_front(applierMutationsBuffer[applierID].size()); + applierMutationsSize[applierID] = 0; + printf("[INFO][Loader] Waits for applier to receive %ld range mutations\n", cmdReplies.size()); + std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout ) ); // Q: We need to wait for each reply, otherwise, correctness has error. Why? 
+ cmdReplies.clear(); + } + + if (!cmdReplies.empty()) { + printf("[INFO][Loader] Last Waits for applier to receive %ld range mutations\n", cmdReplies.size()); + std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout ) ); + //std::vector reps = wait( getAll(cmdReplies) ); + cmdReplies.clear(); + } + printf("[Summary][Loader] Node:%s Last CMDUID:%s produces %d mutation operations\n", + self->describeNode().c_str(), self->cmdID.toString().c_str(), kvCount); + + break; + + } catch (Error &e) { + // Handle the command reply timeout error + fprintf(stdout, "[ERROR] registerMutationsToApplier Node:%s, Commands before cmdID:%s error. error code:%d, error message:%s\n", self->describeNode().c_str(), + self->cmdID.toString().c_str(), e.code(), e.what()); + } + }; + + return Void(); +} + + + +void splitMutation(Reference self, MutationRef m, Arena& mvector_arena, VectorRef mvector, Arena& nodeIDs_arena, VectorRef nodeIDs) { + // mvector[i] should be mapped to nodeID[i] + ASSERT(mvector.empty()); + ASSERT(nodeIDs.empty()); + // key range [m->param1, m->param2) + //std::map, UID>; + std::map, UID>::iterator itlow, itup; //we will return [itlow, itup) + itlow = self->range2Applier.lower_bound(m.param1); // lower_bound returns the iterator that is >= m.param1 + if ( itlow != self->range2Applier.begin()) { // m.param1 is not the smallest key \00 + // (itlow-1) is the node whose key range includes m.param1 + --itlow; + } else { + if (m.param1 != LiteralStringRef("\00")) { + printf("[ERROR] splitMutation has bug on range mutation:%s\n", m.toString().c_str()); + } + } + + itup = self->range2Applier.upper_bound(m.param2); // upper_bound returns the iterator that is > m.param2; return rmap::end if no keys are considered to go after m.param2. 
+ ASSERT( itup == self->range2Applier.end() || itup->first >= m.param2 ); + // Now adjust for the case: example: mutation range is [a, d); we have applier's ranges' inclusive lower bound values are: a, b, c, d, e; upper_bound(d) returns itup to e, but we want itup to d. + --itup; + ASSERT( itup->first <= m.param2 ); + if ( itup->first < m.param2 ) { + ++itup; //make sure itup is >= m.param2, that is, itup is the next key range >= m.param2 + } + + while (itlow->first < itup->first) { + MutationRef curm; //current mutation + curm.type = m.type; + curm.param1 = itlow->first; + itlow++; + if (itlow == self->range2Applier.end()) { + curm.param2 = normalKeys.end; + } else { + curm.param2 = itlow->first; + } + mvector.push_back(mvector_arena, curm); + + nodeIDs.push_back(nodeIDs_arena, itlow->second); + } + + return; +} + + +//key_input format: [logRangeMutation.first][hash_value_of_commit_version:1B][bigEndian64(commitVersion)][bigEndian32(part)] +bool concatenateBackupMutationForLogFile(Reference self, Standalone val_input, Standalone key_input) { + std::string prefix = "||\t"; + std::stringstream ss; + const int version_size = 12; + const int header_size = 12; + StringRef val = val_input.contents(); + StringRefReaderMX reader(val, restore_corrupted_data()); + StringRefReaderMX readerKey(key_input, restore_corrupted_data()); //read key_input! + int logRangeMutationFirstLength = key_input.size() - 1 - 8 - 4; + bool concatenated = false; + + if ( logRangeMutationFirstLength < 0 ) { + printf("[ERROR]!!! 
logRangeMutationFirstLength:%ld < 0, key_input.size:%ld\n", logRangeMutationFirstLength, key_input.size()); + } + + if ( debug_verbose ) { + printf("[DEBUG] Process key_input:%s\n", getHexKey(key_input, logRangeMutationFirstLength).c_str()); + } + + //PARSE key + Standalone id_old = key_input.substr(0, key_input.size() - 4); //Used to sanity check the decoding of key is correct + Standalone partStr = key_input.substr(key_input.size() - 4, 4); //part + StringRefReaderMX readerPart(partStr, restore_corrupted_data()); + uint32_t part_direct = readerPart.consumeNetworkUInt32(); //Consume a bigEndian value + if ( debug_verbose ) { + printf("[DEBUG] Process prefix:%s and partStr:%s part_direct:%08x fromm key_input:%s, size:%ld\n", + getHexKey(id_old, logRangeMutationFirstLength).c_str(), + getHexString(partStr).c_str(), + part_direct, + getHexKey(key_input, logRangeMutationFirstLength).c_str(), + key_input.size()); + } + + StringRef longRangeMutationFirst; + + if ( logRangeMutationFirstLength > 0 ) { + printf("readerKey consumes %dB\n", logRangeMutationFirstLength); + longRangeMutationFirst = StringRef(readerKey.consume(logRangeMutationFirstLength), logRangeMutationFirstLength); + } + + uint8_t hashValue = readerKey.consume(); + uint64_t commitVersion = readerKey.consumeNetworkUInt64(); // Consume big Endian value encoded in log file, commitVersion is in littleEndian + uint64_t commitVersionBE = bigEndian64(commitVersion); + uint32_t part = readerKey.consumeNetworkUInt32(); //Consume big Endian value encoded in log file + uint32_t partBE = bigEndian32(part); + Standalone id2 = longRangeMutationFirst.withSuffix(StringRef(&hashValue,1)).withSuffix(StringRef((uint8_t*) &commitVersion, 8)); + + //Use commitVersion as id + Standalone id = StringRef((uint8_t*) &commitVersion, 8); + + if ( debug_verbose ) { + printf("[DEBUG] key_input_size:%d longRangeMutationFirst:%s hashValue:%02x commitVersion:%016lx (BigEndian:%016lx) part:%08x (BigEndian:%08x), part_direct:%08x 
mutationMap.size:%ld\n", + key_input.size(), longRangeMutationFirst.printable().c_str(), hashValue, + commitVersion, commitVersionBE, + part, partBE, + part_direct, self->mutationMap.size()); + } + + if ( self->mutationMap.find(id) == self->mutationMap.end() ) { + self->mutationMap.insert(std::make_pair(id, val_input)); + if ( part_direct != 0 ) { + printf("[ERROR]!!! part:%d != 0 for key_input:%s\n", part_direct, getHexString(key_input).c_str()); + } + self->mutationPartMap.insert(std::make_pair(id, part_direct)); + } else { // concatenate the val string +// printf("[INFO] Concatenate the log's val string at version:%ld\n", id.toString().c_str()); + self->mutationMap[id] = self->mutationMap[id].contents().withSuffix(val_input.contents()); //Assign the new Areana to the map's value + if ( part_direct != (self->mutationPartMap[id] + 1) ) { + printf("[ERROR]!!! current part id:%d new part_direct:%d is not the next integer of key_input:%s\n", self->mutationPartMap[id], part_direct, getHexString(key_input).c_str()); + printf("[HINT] Check if the same range or log file has been processed more than once!\n"); + } + if ( part_direct != part ) { + printf("part_direct:%08x != part:%08x\n", part_direct, part); + } + self->mutationPartMap[id] = part_direct; + concatenated = true; + } + + return concatenated; +} + +bool isRangeMutation(MutationRef m) { + if (m.type == MutationRef::Type::ClearRange) { + if (m.type == MutationRef::Type::DebugKeyRange) { + printf("[ERROR] DebugKeyRange mutation is in backup data unexpectedly. We still handle it as a range mutation; the suspicious mutation:%s\n", m.toString().c_str()); + } + return true; + } else { + if ( !(m.type == MutationRef::Type::SetValue || + isAtomicOp((MutationRef::Type) m.type)) ) { + printf("[ERROR] %s mutation is in backup data unexpectedly. 
We still handle it as a key mutation; the suspicious mutation:%s\n", typeString[m.type], m.toString().c_str()); + + } + return false; + } +} + + + // Parse the kv pair (version, serialized_mutation), which are the results parsed from log file. + void parseSerializedMutation(Reference self, bool isSampling) { + // Step: Parse the concatenated KV pairs into (version, ) pair + printf("[INFO] Parse the concatenated log data\n"); + std::string prefix = "||\t"; + std::stringstream ss; + const int version_size = 12; + const int header_size = 12; + int kvCount = 0; + + for ( auto& m : self->mutationMap ) { + StringRef k = m.first.contents(); + StringRefReaderMX readerVersion(k, restore_corrupted_data()); + uint64_t commitVersion = readerVersion.consume(); // Consume little Endian data + + + StringRef val = m.second.contents(); + StringRefReaderMX reader(val, restore_corrupted_data()); + + int count_size = 0; + // Get the include version in the batch commit, which is not the commitVersion. + // commitVersion is in the key + uint64_t includeVersion = reader.consume(); + count_size += 8; + uint32_t val_length_decode = reader.consume(); //Parse little endian value, confirmed it is correct! + count_size += 4; + + if ( self->kvOps.find(commitVersion) == self->kvOps.end() ) { + self->kvOps.insert(std::make_pair(commitVersion, VectorRef())); + } + + if ( debug_verbose ) { + printf("----------------------------------------------------------Register Backup Mutation into KVOPs version:%08lx\n", commitVersion); + printf("To decode value:%s\n", getHexString(val).c_str()); + } + // In sampling, the last mutation vector may be not complete, we do not concatenate for performance benefit + if ( val_length_decode != (val.size() - 12) ) { + //IF we see val.size() == 10000, It means val should be concatenated! The concatenation may fail to copy the data + if (isSampling) { + printf("[PARSE WARNING]!!! 
val_length_decode:%d != val.size:%d version:%ld(0x%lx)\n", val_length_decode, val.size(), + commitVersion, commitVersion); + printf("[PARSE WARNING] Skipped the mutation! OK for sampling workload but WRONG for restoring the workload\n"); + continue; + } else { + printf("[PARSE ERROR]!!! val_length_decode:%d != val.size:%d version:%ld(0x%lx)\n", val_length_decode, val.size(), + commitVersion, commitVersion); + } + } else { + if ( debug_verbose ) { + printf("[PARSE SUCCESS] val_length_decode:%d == (val.size:%d - 12)\n", val_length_decode, val.size()); + } + } + + // Get the mutation header + while (1) { + // stop when reach the end of the string + if(reader.eof() ) { //|| *reader.rptr == 0xFF + //printf("Finish decode the value\n"); + break; + } + + + uint32_t type = reader.consume();//reader.consumeNetworkUInt32(); + uint32_t kLen = reader.consume();//reader.consumeNetworkUInkvOps[t32(); + uint32_t vLen = reader.consume();//reader.consumeNetworkUInt32(); + const uint8_t *k = reader.consume(kLen); + const uint8_t *v = reader.consume(vLen); + count_size += 4 * 3 + kLen + vLen; + + MutationRef mutation((MutationRef::Type) type, KeyRef(k, kLen), KeyRef(v, vLen)); + self->kvOps[commitVersion].push_back_deep(self->kvOps[commitVersion].arena(), mutation); + kvCount++; + + if ( kLen < 0 || kLen > val.size() || vLen < 0 || vLen > val.size() ) { + printf("%s[PARSE ERROR]!!!! kLen:%d(0x%04x) vLen:%d(0x%04x)\n", prefix.c_str(), kLen, kLen, vLen, vLen); + } + + if ( debug_verbose ) { + printf("%s---LogFile parsed mutations. 
Prefix:[%d]: Version:%016lx Type:%d K:%s V:%s k_size:%d v_size:%d\n", prefix.c_str(), + kvCount, + commitVersion, type, getHexString(KeyRef(k, kLen)).c_str(), getHexString(KeyRef(v, vLen)).c_str(), kLen, vLen); + } + + } + // printf("----------------------------------------------------------\n"); + } + + printf("[INFO] Produces %d mutation operations from concatenated kv pairs that are parsed from log\n", kvCount); + +} + + +ACTOR static Future _parseRangeFileToMutationsOnLoader(Reference self, + Reference bc, Version version, + std::string fileName, int64_t readOffset_input, int64_t readLen_input, + KeyRange restoreRange, Key addPrefix, Key removePrefix) { + + state int64_t readOffset = readOffset_input; + state int64_t readLen = readLen_input; + + if ( debug_verbose ) { + printf("[VERBOSE_DEBUG] Parse range file and get mutations 1, bc:%lx\n", bc.getPtr()); + } + // The set of key value version is rangeFile.version. the key-value set in the same range file has the same version + Reference inFile = wait(bc->readFile(fileName)); + + if ( debug_verbose ) { + printf("[VERBOSE_DEBUG] Parse range file and get mutations 2\n"); + } + state Standalone> blockData = wait(parallelFileRestore::decodeRangeFileBlock(inFile, readOffset, readLen)); + + if ( debug_verbose ) { + printf("[VERBOSE_DEBUG] Parse range file and get mutations 3\n"); + int tmpi = 0; + for (tmpi = 0; tmpi < blockData.size(); tmpi++) { + printf("\t[VERBOSE_DEBUG] mutation: key:%s value:%s\n", blockData[tmpi].key.toString().c_str(), blockData[tmpi].value.toString().c_str()); + } + } + + // First and last key are the range for this file + state KeyRange fileRange = KeyRangeRef(blockData.front().key, blockData.back().key); + printf("[INFO] RangeFile:%s KeyRange:%s, restoreRange:%s\n", + fileName.c_str(), fileRange.toString().c_str(), restoreRange.toString().c_str()); + + // If fileRange doesn't intersect restore range then we're done. 
+ if(!fileRange.intersects(restoreRange)) { + TraceEvent("ExtractApplyRangeFileToDB_MX").detail("NoIntersectRestoreRange", "FinishAndReturn"); + return Void(); + } + + // We know the file range intersects the restore range but there could still be keys outside the restore range. + // Find the subvector of kv pairs that intersect the restore range. Note that the first and last keys are just the range endpoints for this file + // The blockData's first and last entries are metadata, not the real data + int rangeStart = 1; //1 + int rangeEnd = blockData.size() -1; //blockData.size() - 1 // Q: the rangeStart and rangeEnd is [,)? + if ( debug_verbose ) { + printf("[VERBOSE_DEBUG] Range file decoded blockData\n"); + for (auto& data : blockData ) { + printf("\t[VERBOSE_DEBUG] data key:%s val:%s\n", data.key.toString().c_str(), data.value.toString().c_str()); + } + } + + // Slide start forward, stop if something in range is found + // Move rangeStart and rangeEnd until they are within restoreRange + while(rangeStart < rangeEnd && !restoreRange.contains(blockData[rangeStart].key)) { + if ( debug_verbose ) { + printf("[VERBOSE_DEBUG] rangeStart:%d key:%s is not in the range:%s\n", rangeStart, blockData[rangeStart].key.toString().c_str(), restoreRange.toString().c_str()); + } + ++rangeStart; + } + // Slide end backward, stop if something in range is found + while(rangeEnd > rangeStart && !restoreRange.contains(blockData[rangeEnd - 1].key)) { + if ( debug_verbose ) { + printf("[VERBOSE_DEBUG] (rangeEnd:%d - 1) key:%s is not in the range:%s\n", rangeEnd, blockData[rangeStart].key.toString().c_str(), restoreRange.toString().c_str()); + } + --rangeEnd; + } + + // MX: now data only contains the kv mutation within restoreRange + state VectorRef data = blockData.slice(rangeStart, rangeEnd); + printf("[INFO] RangeFile:%s blockData entry size:%d recovered data size:%d\n", fileName.c_str(), blockData.size(), data.size()); + + // Shrink file range to be entirely within restoreRange and 
translate it to the new prefix + // First, use the untranslated file range to create the shrunk original file range which must be used in the kv range version map for applying mutations + state KeyRange originalFileRange = KeyRangeRef(std::max(fileRange.begin, restoreRange.begin), std::min(fileRange.end, restoreRange.end)); + + // Now shrink and translate fileRange + Key fileEnd = std::min(fileRange.end, restoreRange.end); + if(fileEnd == (removePrefix == StringRef() ? normalKeys.end : strinc(removePrefix)) ) { + fileEnd = addPrefix == StringRef() ? normalKeys.end : strinc(addPrefix); + } else { + fileEnd = fileEnd.removePrefix(removePrefix).withPrefix(addPrefix); + } + fileRange = KeyRangeRef(std::max(fileRange.begin, restoreRange.begin).removePrefix(removePrefix).withPrefix(addPrefix),fileEnd); + + state int start = 0; + state int end = data.size(); + //state int dataSizeLimit = BUGGIFY ? g_random->randomInt(256 * 1024, 10e6) : CLIENT_KNOBS->RESTORE_WRITE_TX_SIZE; + state int dataSizeLimit = CLIENT_KNOBS->RESTORE_WRITE_TX_SIZE; + state int kvCount = 0; + + //MX: This is where the key-value pair in range file is applied into DB + loop { + + state int i = start; + state int txBytes = 0; + state int iend = start; + + // find iend that results in the desired transaction size + for(; iend < end && txBytes < dataSizeLimit; ++iend) { + txBytes += data[iend].key.expectedSize(); + txBytes += data[iend].value.expectedSize(); + } + + + for(; i < iend; ++i) { + //MXX: print out the key value version, and operations. 
+ if ( debug_verbose ) { + printf("RangeFile [key:%s, value:%s, version:%ld, op:set]\n", data[i].key.printable().c_str(), data[i].value.printable().c_str(), version); + } +// TraceEvent("PrintRangeFile_MX").detail("Key", data[i].key.printable()).detail("Value", data[i].value.printable()) +// .detail("Version", rangeFile.version).detail("Op", "set"); +//// printf("PrintRangeFile_MX: mType:set param1:%s param2:%s param1_size:%d, param2_size:%d\n", +//// getHexString(data[i].key.c_str(), getHexString(data[i].value).c_str(), data[i].key.size(), data[i].value.size()); + + //NOTE: Should NOT removePrefix and addPrefix for the backup data! + // In other words, the following operation is wrong: data[i].key.removePrefix(removePrefix).withPrefix(addPrefix) + MutationRef m(MutationRef::Type::SetValue, data[i].key, data[i].value); //ASSUME: all operation in range file is set. + ++kvCount; + + // TODO: we can commit the kv operation into DB. + // Right now, we cache all kv operations into kvOps, and apply all kv operations later in one place + if ( self->kvOps.find(version) == self->kvOps.end() ) { // Create the map's key if mutation m is the first one to be inserted + //kvOps.insert(std::make_pair(rangeFile.version, Standalone>(VectorRef()))); + self->kvOps.insert(std::make_pair(version, VectorRef())); + } + + ASSERT(self->kvOps.find(version) != self->kvOps.end()); + self->kvOps[version].push_back_deep(self->kvOps[version].arena(), m); + + } + + // Commit succeeded, so advance starting point + start = i; + + if(start == end) { + //TraceEvent("ExtraApplyRangeFileToDB_MX").detail("Progress", "DoneApplyKVToDB"); + printf("[INFO][Loader] NodeID:%s Parse RangeFile:%s: the number of kv operations = %d\n", + self->describeNode().c_str(), fileName.c_str(), kvCount); + return Void(); + } + } + + } + + ACTOR static Future _parseLogFileToMutationsOnLoader(Reference self, + Reference bc, Version version, + std::string fileName, int64_t readOffset, int64_t readLen, + KeyRange 
restoreRange, Key addPrefix, Key removePrefix, + Key mutationLogPrefix) { + + // Step: concatenate the backuped param1 and param2 (KV) at the same version. + //state Key mutationLogPrefix = mutationLogPrefix; + //TraceEvent("ReadLogFileStart").detail("LogFileName", fileName); + state Reference inFile = wait(bc->readFile(fileName)); + //TraceEvent("ReadLogFileFinish").detail("LogFileName", fileName); + + printf("Parse log file:%s readOffset:%d readLen:%ld\n", fileName.c_str(), readOffset, readLen); + //TODO: NOTE: decodeLogFileBlock() should read block by block! based on my serial version. This applies to decode range file as well + state Standalone> data = wait(parallelFileRestore::decodeLogFileBlock(inFile, readOffset, readLen)); + //state Standalone> data = wait(fileBackup::decodeLogFileBlock_MX(inFile, readOffset, readLen)); //Decode log file + TraceEvent("ReadLogFileFinish").detail("LogFileName", fileName).detail("DecodedDataSize", data.contents().size()); + printf("ReadLogFile, raw data size:%d\n", data.size()); + + state int start = 0; + state int end = data.size(); + //state int dataSizeLimit = BUGGIFY ? g_random->randomInt(256 * 1024, 10e6) : CLIENT_KNOBS->RESTORE_WRITE_TX_SIZE; + state int dataSizeLimit = CLIENT_KNOBS->RESTORE_WRITE_TX_SIZE; + state int kvCount = 0; + state int numConcatenated = 0; + loop { + try { +// printf("Process start:%d where end=%d\n", start, end); + if(start == end) { + printf("ReadLogFile: finish reading the raw data and concatenating the mutation at the same version\n"); + break; + } + + state int i = start; + state int txBytes = 0; + for(; i < end && txBytes < dataSizeLimit; ++i) { + Key k = data[i].key.withPrefix(mutationLogPrefix); + ValueRef v = data[i].value; + txBytes += k.expectedSize(); + txBytes += v.expectedSize(); + //MXX: print out the key value version, and operations. 
+ //printf("LogFile [key:%s, value:%s, version:%ld, op:NoOp]\n", k.printable().c_str(), v.printable().c_str(), logFile.version); + // printf("LogFile [KEY:%s, VALUE:%s, VERSION:%ld, op:NoOp]\n", getHexString(k).c_str(), getHexString(v).c_str(), logFile.version); + // printBackupMutationRefValueHex(v, " |\t"); + // printf("[DEBUG]||Concatenate backup mutation:fileInfo:%s, data:%d\n", logFile.toString().c_str(), i); + bool concatenated = concatenateBackupMutationForLogFile(self, data[i].value, data[i].key); + numConcatenated += ( concatenated ? 1 : 0); + // //TODO: Decode the value to get the mutation type. Use NoOp to distinguish from range kv for now. + // MutationRef m(MutationRef::Type::NoOp, data[i].key, data[i].value); //ASSUME: all operation in log file is NoOp. + // if ( self->kvOps.find(logFile.version) == self->kvOps.end() ) { + // self->kvOps.insert(std::make_pair(logFile.version, std::vector())); + // } else { + // self->kvOps[logFile.version].push_back(m); + // } + } + + start = i; + + } catch(Error &e) { + if(e.code() == error_code_transaction_too_large) + dataSizeLimit /= 2; + } + } + + printf("[INFO] raw kv number:%d parsed from log file, concatenated:%d kv, num_log_versions:%d\n", data.size(), numConcatenated, self->mutationMap.size()); + + return Void(); + } diff --git a/fdbserver/RestoreLoader.actor.h b/fdbserver/RestoreLoader.actor.h index c86e6442e2..36150b4fc2 100644 --- a/fdbserver/RestoreLoader.actor.h +++ b/fdbserver/RestoreLoader.actor.h @@ -21,7 +21,7 @@ // Declear RestoreLoader interface and actors #pragma once -#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_RestoreLoaderInterface_H) +#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_RestoreLoaderInterface_G_H) #define FDBSERVER_RestoreLoaderInterface_G_H #include "fdbserver/RestoreLoader.actor.g.h" #elif !defined(FDBSERVER_RestoreLoaderInterface_H) @@ -35,5 +35,111 @@ #include "fdbserver/CoordinationInterface.h" #include "fdbrpc/Locality.h" +#include "fdbserver/RestoreUtil.h" 
+#include "fdbserver/RestoreCommon.actor.h" +#include "fdbserver/RestoreRoleCommon.actor.h" +#include "fdbserver/RestoreWorkerInterface.h" +#include "fdbclient/BackupContainer.h" +#include "flow/actorcompiler.h" // has to be last include + +struct RestoreLoaderData : RestoreRoleData, public ReferenceCounted { +public: + // range2Applier is in master and loader node. Loader node uses this to determine which applier a mutation should be sent + std::map, UID> range2Applier; // KeyRef is the inclusive lower bound of the key range the applier (UID) is responsible for + std::map, int> keyOpsCount; // The number of operations per key which is used to determine the key-range boundary for appliers + int numSampledMutations; // The total number of mutations received from sampled data. + + // Loader's state to handle the duplicate delivery of loading commands + std::map processedFiles; //first is filename of processed file, second is not used + + // Temporary data structure for parsing range and log files into (version, ) + std::map>> kvOps; + // Must use StandAlone to save mutations, otherwise, the mutationref memory will be corrupted + std::map, Standalone> mutationMap; // Key is the unique identifier for a batch of mutation logs at the same version + std::map, uint32_t> mutationPartMap; // Recoself the most recent + + + Reference bc; // Backup container is used to read backup files + Key bcUrl; // The url used to get the bc + + CMDUID cmdID; + + // Performance statistics + double curWorkloadSize; + + void addref() { return ReferenceCounted::addref(); } + void delref() { return ReferenceCounted::delref(); } + + RestoreLoaderData() { + nodeID = g_random->randomUniqueID(); + nodeIndex = 0; + } + + ~RestoreLoaderData() {} + + std::string describeNode() { + std::stringstream ss; + ss << "[Role: Loader] [NodeID:" << nodeID.toString().c_str() + << "] [NodeIndex:" << std::to_string(nodeIndex) << "]"; + return ss.str(); + } + + void resetPerVersionBatch() { + printf("[INFO]Node:%s 
resetPerVersionBatch\n", nodeID.toString().c_str()); + RestoreRoleData::resetPerVersionBatch(); + + range2Applier.clear(); + keyOpsCount.clear(); + numSampledMutations = 0; + + processedFiles.clear(); + + kvOps.clear(); + mutationMap.clear(); + mutationPartMap.clear(); + + curWorkloadSize = 0; + } + + vector getBusyAppliers() { + vector busyAppliers; + for (auto &app : range2Applier) { + busyAppliers.push_back(app.second); + } + return busyAppliers; + } + + std::vector getWorkingApplierIDs() { + std::vector applierIDs; + for ( auto &applier : range2Applier ) { + applierIDs.push_back(applier.second); + } + + ASSERT( !applierIDs.empty() ); + return applierIDs; + } + + void initBackupContainer(Key url) { + if ( bcUrl == url && bc.isValid() ) { + return; + } + printf("initBackupContainer, url:%s\n", url.toString().c_str()); + bcUrl = url; + bc = IBackupContainer::openContainer(url.toString()); + } + + void printAppliersKeyRange() { + printf("[INFO] The mapping of KeyRange_start --> Applier ID\n"); + // applier type: std::map, UID> + for (auto &applier : range2Applier) { + printf("\t[INFO]%s -> %s\n", getHexString(applier.first).c_str(), applier.second.toString().c_str()); + } + } +}; + + +ACTOR Future restoreLoaderCore(Reference self, RestoreLoaderInterface loaderInterf, Database cx); + +#include "flow/unactorcompiler.h" #endif \ No newline at end of file diff --git a/fdbserver/RestoreMaster.actor.cpp b/fdbserver/RestoreMaster.actor.cpp new file mode 100644 index 0000000000..c414a24f1c --- /dev/null +++ b/fdbserver/RestoreMaster.actor.cpp @@ -0,0 +1,1326 @@ +/* + * RestoreMaster.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include "fdbclient/NativeAPI.actor.h" +#include "fdbclient/SystemData.h" + +// Backup agent header +#include "fdbclient/BackupAgent.actor.h" +//#include "FileBackupAgent.h" +#include "fdbclient/ManagementAPI.actor.h" +#include "fdbclient/MutationList.h" +#include "fdbclient/BackupContainer.h" + +#include "fdbserver/RestoreCommon.actor.h" +#include "fdbserver/RestoreRoleCommon.actor.h" +#include "fdbserver/RestoreMaster.actor.h" +#include "fdbserver/RestoreApplier.actor.h" +#include "fdbserver/RestoreLoader.actor.h" + +#include "flow/actorcompiler.h" // This must be the last #include. 
+ +ACTOR Future askLoadersToCollectRestoreAppliersInterfaces(Reference self); +ACTOR Future>> collectRestoreRequests(Database cx); +ACTOR static Future processRestoreRequest(RestoreRequest request, Reference self, Database cx); +ACTOR static Future finishRestore(Reference self, Database cx, Standalone> restoreRequests); + +ACTOR static Future _collectBackupFiles(Reference self, Database cx, RestoreRequest request); +ACTOR Future initializeVersionBatch(Reference self); +ACTOR static Future distributeWorkloadPerVersionBatch(Reference self, Database cx, RestoreRequest request, Reference restoreConfig); +ACTOR static Future unlockDB(Database cx, UID uid); +ACTOR static Future _clearDB(Reference tr); +ACTOR static Future _lockDB(Database cx, UID uid, bool lockDB); +ACTOR static Future registerStatus(Database cx, struct FastRestoreStatus status); +ACTOR static Future sampleWorkload(Reference self, RestoreRequest request, Reference restoreConfig, int64_t sampleMB_input); +ACTOR Future notifyAppliersKeyRangeToLoader(Reference self, Database cx); +ACTOR Future assignKeyRangeToAppliers(Reference self, Database cx); +ACTOR Future notifyApplierToApplyMutations(Reference self); + + +ACTOR Future startRestoreMaster(Reference self, Database cx) { + try { + wait( delay(1.0) ); + wait( _collectRestoreRoleInterfaces(self, cx) ); + + wait( delay(1.0) ); + wait( askLoadersToCollectRestoreAppliersInterfaces(self) ); + + state int restoreId = 0; + state int checkNum = 0; + loop { + printf("Node:%s---Wait on restore requests...---\n", self->describeNode().c_str()); + state Standalone> restoreRequests = wait( collectRestoreRequests(cx) ); + + printf("Node:%s ---Received restore requests as follows---\n", self->describeNode().c_str()); + // Print out the requests info + for ( auto &it : restoreRequests ) { + printf("\t[INFO][Master]Node:%s RestoreRequest info:%s\n", self->describeNode().c_str(), it.toString().c_str()); + } + + // Step: Perform the restore requests + for ( auto &it : 
restoreRequests ) { + TraceEvent("LeaderGotRestoreRequest").detail("RestoreRequestInfo", it.toString()); + printf("Node:%s Got RestoreRequestInfo:%s\n", self->describeNode().c_str(), it.toString().c_str()); + Version ver = wait( processRestoreRequest(it, self, cx) ); + } + + // Step: Notify all restore requests have been handled by cleaning up the restore keys + wait( delay(5.0) ); + printf("Finish my restore now!\n"); + //wait( finishRestore(self) ); + wait( finishRestore(self, cx, restoreRequests) ); + + printf("[INFO] MXRestoreEndHere RestoreID:%d\n", restoreId); + TraceEvent("MXRestoreEndHere").detail("RestoreID", restoreId++); + wait( delay(5.0) ); + //NOTE: we have to break the loop so that the tester.actor can receive the return of this test workload. + //Otherwise, this special workload never returns and tester will think the test workload is stuck and the tester will timesout + break; //TODO: this break will be removed later since we need the restore agent to run all the time! + } + + return Void(); + + } catch (Error &e) { + fprintf(stdout, "[ERROR] Restoer Master encounters error. 
error code:%d, error message:%s\n", + e.code(), e.what()); + } + + return Void(); +} + + + +ACTOR static Future processRestoreRequest(RestoreRequest request, Reference self, Database cx) { + state Key tagName = request.tagName; + state Key url = request.url; + state bool waitForComplete = request.waitForComplete; + state Version targetVersion = request.targetVersion; + state bool verbose = request.verbose; + state KeyRange range = request.range; + state Key addPrefix = request.addPrefix; + state Key removePrefix = request.removePrefix; + state bool lockDB = request.lockDB; + state UID randomUid = request.randomUid; + + //MX: Lock DB if it is not locked + printf("RestoreRequest lockDB:%d\n", lockDB); + if ( lockDB == false ) { + printf("[WARNING] RestoreRequest lockDB:%d; we will overwrite request.lockDB to true and forcely lock db\n", lockDB); + lockDB = true; + request.lockDB = true; + } + + state long curBackupFilesBeginIndex = 0; + state long curBackupFilesEndIndex = 0; + + state double totalWorkloadSize = 0; + state double totalRunningTime = 0; // seconds + state double curRunningTime = 0; // seconds + state double curStartTime = 0; + state double curEndTime = 0; + state double curWorkloadSize = 0; //Bytes + + + state Reference tr(new ReadYourWritesTransaction(cx)); + state Reference restoreConfig(new RestoreConfig(randomUid)); + + // lock DB for restore + wait( _lockDB(cx, randomUid, lockDB) ); + wait( _clearDB(tr) ); + + // Step: Collect all backup files + printf("===========Restore request start!===========\n"); + state double startTime = now(); + wait( _collectBackupFiles(self, cx, request) ); + printf("[Perf] Node:%s collectBackupFiles takes %.2f seconds\n", self->describeNode().c_str(), now() - startTime); + self->constructFilesWithVersionRange(); + self->files.clear(); // Ensure no mistakely use self->files + + // Sort the backup files based on end version. 
+ sort(self->allFiles.begin(), self->allFiles.end()); + self->printAllBackupFilesInfo(); + + self->buildForbiddenVersionRange(); + self->printForbiddenVersionRange(); + if ( self->isForbiddenVersionRangeOverlapped() ) { + fprintf(stderr, "[ERROR] forbidden version ranges are overlapped! Check out the forbidden version range above\n"); + } + + self->batchIndex = 0; + state int prevBatchIndex = 0; + state long prevCurBackupFilesBeginIndex = 0; + state long prevCurBackupFilesEndIndex = 0; + state double prevCurWorkloadSize = 0; + state double prevtotalWorkloadSize = 0; + + loop { + try { + curStartTime = now(); + self->files.clear(); + self->resetPerVersionBatch(); + self->cmdID.setBatch(self->batchIndex); + // Checkpoint the progress of the previous version batch + prevBatchIndex = self->batchIndex; + prevCurBackupFilesBeginIndex = self->curBackupFilesBeginIndex; + prevCurBackupFilesEndIndex = self->curBackupFilesEndIndex; + prevCurWorkloadSize = self->curWorkloadSize; + prevtotalWorkloadSize = self->totalWorkloadSize; + + bool hasBackupFilesToProcess = self->collectFilesForOneVersionBatch(); + if ( !hasBackupFilesToProcess ) { // No more backup files to restore + printf("No backup files to process any more\n"); + break; + } + + printf("[Progress][Start version batch] Node:%s, restoreBatchIndex:%d, curWorkloadSize:%.2f------\n", self->describeNode().c_str(), self->batchIndex, self->curWorkloadSize); + + wait( initializeVersionBatch(self) ); + + wait( delay(1.0) ); + + wait( distributeWorkloadPerVersionBatch(self, cx, request, restoreConfig) ); + + curEndTime = now(); + curRunningTime = curEndTime - curStartTime; + ASSERT(curRunningTime >= 0); + totalRunningTime += curRunningTime; + + struct FastRestoreStatus status; + status.curRunningTime = curRunningTime; + status.curWorkloadSize = self->curWorkloadSize; + status.curSpeed = self->curWorkloadSize / curRunningTime; + status.totalRunningTime = totalRunningTime; + status.totalWorkloadSize = self->totalWorkloadSize; + 
status.totalSpeed = self->totalWorkloadSize / totalRunningTime; + + printf("[Progress][Finish version batch] restoreBatchIndex:%d, curWorkloadSize:%.2f B, curWorkload:%.2f B curRunningtime:%.2f s curSpeed:%.2f B/s totalWorkload:%.2f B totalRunningTime:%.2f s totalSpeed:%.2f B/s\n", + self->batchIndex, self->curWorkloadSize, + status.curWorkloadSize, status.curRunningTime, status.curSpeed, status.totalWorkloadSize, status.totalRunningTime, status.totalSpeed); + + wait( registerStatus(cx, status) ); + printf("[Progress] Finish 1 version batch. curBackupFilesBeginIndex:%ld curBackupFilesEndIndex:%ld allFiles.size():%ld", + self->curBackupFilesBeginIndex, self->curBackupFilesEndIndex, self->allFiles.size()); + + self->curBackupFilesBeginIndex = self->curBackupFilesEndIndex + 1; + self->curBackupFilesEndIndex++; + self->curWorkloadSize = 0; + self->batchIndex++; + + } catch(Error &e) { + fprintf(stdout, "!!![MAY HAVE BUG] Reset the version batch state to the start of the current version batch, due to error:%s\n", e.what()); + if(e.code() != error_code_restore_duplicate_tag) { + wait(tr->onError(e)); + } + self->batchIndex = prevBatchIndex; + self->curBackupFilesBeginIndex = prevCurBackupFilesBeginIndex; + self->curBackupFilesEndIndex = prevCurBackupFilesEndIndex; + self->curWorkloadSize = prevCurWorkloadSize; + self->totalWorkloadSize = prevtotalWorkloadSize; + } + } + + // Unlock DB at the end of handling the restore request + wait( unlockDB(cx, randomUid) ); + printf("Finish restore uid:%s \n", randomUid.toString().c_str()); + + return targetVersion; +} + +// Distribution workload per version batch +ACTOR static Future distributeWorkloadPerVersionBatch(Reference self, Database cx, RestoreRequest request, Reference restoreConfig) { + state Key tagName = request.tagName; + state Key url = request.url; + state bool waitForComplete = request.waitForComplete; + state Version targetVersion = request.targetVersion; + state bool verbose = request.verbose; + state KeyRange 
restoreRange = request.range; + state Key addPrefix = request.addPrefix; + state Key removePrefix = request.removePrefix; + state bool lockDB = request.lockDB; + state UID randomUid = request.randomUid; + state Key mutationLogPrefix = restoreConfig->mutationLogPrefix(); + + if ( self->isBackupEmpty() ) { + printf("[WARNING] Node:%s distributeWorkloadPerVersionBatch() load an empty batch of backup. Print out the empty backup files info.\n", self->describeNode().c_str()); + self->printBackupFilesInfo(); + return Void(); + } + + printf("[INFO] Node:%s mutationLogPrefix:%s (hex value:%s)\n", self->describeNode().c_str(), mutationLogPrefix.toString().c_str(), getHexString(mutationLogPrefix).c_str()); + + // Determine the key range each applier is responsible for + int numLoaders = self->loadersInterf.size(); + int numAppliers = self->appliersInterf.size(); + ASSERT( numLoaders > 0 ); + ASSERT( numAppliers > 0 ); + + state int loadingSizeMB = 0; //numLoaders * 1000; //NOTE: We want to load the entire file in the first version, so we want to make this as large as possible + int64_t sampleSizeMB = 0; //loadingSizeMB / 100; // Will be overwritten. 
The sampleSizeMB will be calculated based on the batch size + + state double startTime = now(); + state double startTimeBeforeSampling = now(); + + wait( sampleWorkload(self, request, restoreConfig, sampleSizeMB) ); + wait( delay(1.0) ); + + printf("[Progress] distributeWorkloadPerVersionBatch sampling time:%.2f seconds\n", now() - startTime); + state double startTimeAfterSampling = now(); + + // Notify each applier about the key range it is responsible for, and notify appliers to be ready to receive data + startTime = now(); + wait( assignKeyRangeToAppliers(self, cx) ); + wait( delay(1.0) ); + printf("[Progress] distributeWorkloadPerVersionBatch assignKeyRangeToAppliers time:%.2f seconds\n", now() - startTime); + + startTime = now(); + wait( notifyAppliersKeyRangeToLoader(self, cx) ); + wait( delay(1.0) ); + printf("[Progress] distributeWorkloadPerVersionBatch notifyAppliersKeyRangeToLoader time:%.2f seconds\n", now() - startTime); + + // Determine which backup data block (filename, offset, and length) each loader is responsible for and + // Notify the loader about the data block and send the cmd to the loader to start loading the data + // Wait for the ack from loader and repeats + + // Prepare the file's loading status + for (int i = 0; i < self->files.size(); ++i) { + self->files[i].cursor = 0; + } + + // Send loading cmd to available loaders whenever loaders become available + // NOTE: We must split the workload in the correct boundary: + // For range file, it's the block boundary; + // For log file, it is the version boundary. + // This is because + // (1) The set of mutations at a version may be encoded in multiple KV pairs in log files. + // We need to concatenate the related KVs to a big KV before we can parse the value into a vector of mutations at that version + // (2) The backuped KV are arranged in blocks in range file. + // For simplicity, we distribute at the granularity of files for now. 
+ + state int loadSizeB = loadingSizeMB * 1024 * 1024; + state int loadingCmdIndex = 0; + + state int checkpointCurFileIndex = 0; + state long checkpointCurOffset = 0; + + startTime = now(); + // We should load log file before we do range file + state RestoreCommandEnum phaseType = RestoreCommandEnum::Assign_Loader_Log_File; + state std::vector> cmdReplies; + loop { + state int curFileIndex = 0; // The smallest index of the files that has not been FULLY loaded + state long curOffset = 0; + state bool allLoadReqsSent = false; + loop { + try { + if ( allLoadReqsSent ) { + break; // All load requests have been handled + } + wait(delay(1.0)); + + cmdReplies.clear(); + printf("[INFO] Number of backup files:%ld\n", self->files.size()); + self->cmdID.initPhase(phaseType); + for (auto &loader : self->loadersInterf) { + UID loaderID = loader.first; + RestoreLoaderInterface loaderInterf = loader.second; + + while ( curFileIndex < self->files.size() && self->files[curFileIndex].fileSize == 0 ) { + // NOTE: && self->files[curFileIndex].cursor >= self->files[curFileIndex].fileSize + printf("[INFO] File %ld:%s filesize:%ld skip the file\n", curFileIndex, + self->files[curFileIndex].fileName.c_str(), self->files[curFileIndex].fileSize); + curFileIndex++; + curOffset = 0; + } + if ( curFileIndex >= self->files.size() ) { + allLoadReqsSent = true; + break; + } + LoadingParam param; + //self->files[curFileIndex].cursor = 0; // This is a hacky way to make sure cursor is correct in current version when we load 1 file at a time + param.url = request.url; + param.version = self->files[curFileIndex].version; + param.filename = self->files[curFileIndex].fileName; + param.offset = curOffset; //self->files[curFileIndex].cursor; + param.length = std::min(self->files[curFileIndex].fileSize - curOffset, self->files[curFileIndex].blockSize); + //param.length = self->files[curFileIndex].fileSize; + loadSizeB = param.length; + param.blockSize = self->files[curFileIndex].blockSize; + 
param.restoreRange = restoreRange; + param.addPrefix = addPrefix; + param.removePrefix = removePrefix; + param.mutationLogPrefix = mutationLogPrefix; + if ( !(param.length > 0 && param.offset >= 0 && param.offset < self->files[curFileIndex].fileSize) ) { + printf("[ERROR] param: length:%ld offset:%ld fileSize:%ld for %ldth filename:%s\n", + param.length, param.offset, self->files[curFileIndex].fileSize, curFileIndex, + self->files[curFileIndex].fileName.c_str()); + } + ASSERT( param.length > 0 ); + ASSERT( param.offset >= 0 ); + ASSERT( param.offset < self->files[curFileIndex].fileSize ); + self->files[curFileIndex].cursor = self->files[curFileIndex].cursor + param.length; + + RestoreCommandEnum cmdType = RestoreCommandEnum::Assign_Loader_Range_File; + if (self->files[curFileIndex].isRange) { + cmdType = RestoreCommandEnum::Assign_Loader_Range_File; + self->cmdID.setPhase(RestoreCommandEnum::Assign_Loader_Range_File); + } else { + cmdType = RestoreCommandEnum::Assign_Loader_Log_File; + self->cmdID.setPhase(RestoreCommandEnum::Assign_Loader_Log_File); + } + + if ( (phaseType == RestoreCommandEnum::Assign_Loader_Log_File && self->files[curFileIndex].isRange) + || (phaseType == RestoreCommandEnum::Assign_Loader_Range_File && !self->files[curFileIndex].isRange) ) { + self->files[curFileIndex].cursor = 0; + curFileIndex++; + curOffset = 0; + } else { // load the type of file in the phaseType + self->cmdID.nextCmd(); + printf("[CMD] Loading fileIndex:%ld fileInfo:%s loadingParam:%s on node %s\n", + curFileIndex, self->files[curFileIndex].toString().c_str(), + param.toString().c_str(), loaderID.toString().c_str()); // VERY USEFUL INFO + printf("[INFO] Node:%s CMDUID:%s cmdType:%d isRange:%d loaderNode:%s\n", self->describeNode().c_str(), self->cmdID.toString().c_str(), + (int) cmdType, (int) self->files[curFileIndex].isRange, loaderID.toString().c_str()); + if (self->files[curFileIndex].isRange) { + cmdReplies.push_back( 
loaderInterf.loadRangeFile.getReply(RestoreLoadFileRequest(self->cmdID, param)) ); + } else { + cmdReplies.push_back( loaderInterf.loadLogFile.getReply(RestoreLoadFileRequest(self->cmdID, param)) ); + } + curOffset += param.length; + + // Reach the end of the file + if ( param.length + param.offset >= self->files[curFileIndex].fileSize ) { + curFileIndex++; + curOffset = 0; + } + + // if (param.length <= loadSizeB) { // Reach the end of the file + // ASSERT( self->files[curFileIndex].cursor == self->files[curFileIndex].fileSize ); + // curFileIndex++; + // } + } + + if ( curFileIndex >= self->files.size() ) { + allLoadReqsSent = true; + break; + } + //++loadingCmdIndex; // Replaced by cmdUID + } + + printf("[INFO] Wait for %ld loaders to accept the cmd Assign_Loader_File\n", cmdReplies.size()); + + // Question: How to set reps to different value based on cmdReplies.empty()? + if ( !cmdReplies.empty() ) { + std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout ) ); //TODO: change to getAny. NOTE: need to keep the still-waiting replies + //std::vector reps = wait( getAll(cmdReplies) ); + + cmdReplies.clear(); + for (int i = 0; i < reps.size(); ++i) { + printf("[INFO] Get Ack reply:%s for Assign_Loader_File\n", + reps[i].toString().c_str()); + } + checkpointCurFileIndex = curFileIndex; // Save the previous success point + checkpointCurOffset = curOffset; + } + + // TODO: Let master print all nodes status. Note: We need a function to print out all nodes status + + if (allLoadReqsSent) { + printf("[INFO] allLoadReqsSent has finished.\n"); + break; // NOTE: need to change when change to wait on any cmdReplies + } + + } catch (Error &e) { + // TODO: Handle the command reply timeout error + fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. 
error code:%d, error message:%s\n", self->describeNode().c_str(), + self->cmdID.toString().c_str(), e.code(), e.what()); + curFileIndex = checkpointCurFileIndex; + curOffset = checkpointCurOffset; + } + } + + if (phaseType == RestoreCommandEnum::Assign_Loader_Log_File) { + phaseType = RestoreCommandEnum::Assign_Loader_Range_File; + } else if (phaseType == RestoreCommandEnum::Assign_Loader_Range_File) { + break; + } + } + + wait( delay(1.0) ); + printf("[Progress] distributeWorkloadPerVersionBatch loadFiles time:%.2f seconds\n", now() - startTime); + + ASSERT( cmdReplies.empty() ); + + wait( delay(5.0) ); + // Notify the applier to applly mutation to DB + + startTime = now(); + wait( notifyApplierToApplyMutations(self) ); + printf("[Progress] distributeWorkloadPerVersionBatch applyToDB time:%.2f seconds\n", now() - startTime); + + state double endTime = now(); + + double runningTime = endTime - startTimeBeforeSampling; + printf("[Progress] Node:%s distributeWorkloadPerVersionBatch runningTime without sampling time:%.2f seconds, with sampling time:%.2f seconds\n", + self->describeNode().c_str(), + runningTime, endTime - startTimeAfterSampling); + + return Void(); + +} + + +// RestoreMaster: Ask loaders to sample data and send mutations to master applier. 
Ask master applier to calculate the range for each applier +ACTOR static Future sampleWorkload(Reference self, RestoreRequest request, Reference restoreConfig, int64_t sampleMB_input) { + state Key tagName = request.tagName; + state Key url = request.url; + state bool waitForComplete = request.waitForComplete; + state Version targetVersion = request.targetVersion; + state bool verbose = request.verbose; + state KeyRange restoreRange = request.range; + state Key addPrefix = request.addPrefix; + state Key removePrefix = request.removePrefix; + state bool lockDB = request.lockDB; + state UID randomUid = request.randomUid; + state Key mutationLogPrefix = restoreConfig->mutationLogPrefix(); + + state bool allLoadReqsSent = false; + state int64_t sampleMB = sampleMB_input; //100; + state int64_t sampleB = sampleMB * 1024 * 1024; // Sample a block for every sampleB bytes. // Should adjust this value differently for simulation mode and real mode + state int64_t curFileIndex = 0; + state int64_t curFileOffset = 0; + state int64_t loadSizeB = 0; + state int64_t loadingCmdIndex = 0; + state int64_t sampleIndex = 0; + state double totalBackupSizeB = 0; + state double samplePercent = 0.05; // sample 1 data block per samplePercent (0.01) of data. 
num_sample = 1 / samplePercent + + // We should sample 1% data + for (int i = 0; i < self->files.size(); i++) { + totalBackupSizeB += self->files[i].fileSize; + } + sampleB = std::max((int) (samplePercent * totalBackupSizeB), 10 * 1024 * 1024); // The minimal sample size is 10MB + printf("Node:%s totalBackupSizeB:%.1fB (%.1fMB) samplePercent:%.2f, sampleB:%ld\n", self->describeNode().c_str(), + totalBackupSizeB, totalBackupSizeB / 1024 / 1024, samplePercent, sampleB); + + // Step: Distribute sampled file blocks to loaders to sample the mutations + self->cmdID.initPhase(RestoreCommandEnum::Sample_Range_File); + curFileIndex = 0; + state CMDUID checkpointCMDUID = self->cmdID; + state int checkpointCurFileIndex = curFileIndex; + state int64_t checkpointCurFileOffset = 0; + state std::vector> cmdReplies; + state RestoreCommandEnum cmdType; + loop { // For retry on timeout + try { + if ( allLoadReqsSent ) { + break; // All load requests have been handled + } + wait(delay(1.0)); + + cmdReplies.clear(); + + printf("[Sampling] Node:%s We will sample the workload among %ld backup files.\n", self->describeNode().c_str(), self->files.size()); + printf("[Sampling] Node:%s totalBackupSizeB:%.1fB (%.1fMB) samplePercent:%.2f, sampleB:%ld, loadSize:%dB sampleIndex:%ld\n", self->describeNode().c_str(), + totalBackupSizeB, totalBackupSizeB / 1024 / 1024, samplePercent, sampleB, loadSizeB, sampleIndex); + for (auto &loader : self->loadersInterf) { + const UID &loaderID = loader.first; + RestoreLoaderInterface &loaderInterf= loader.second; + + // Find the sample file + while ( curFileIndex < self->files.size() && self->files[curFileIndex].fileSize == 0 ) { + // NOTE: && self->files[curFileIndex].cursor >= self->files[curFileIndex].fileSize + printf("[Sampling] File %ld:%s filesize:%ld skip the file\n", curFileIndex, + self->files[curFileIndex].fileName.c_str(), self->files[curFileIndex].fileSize); + curFileOffset = 0; + curFileIndex++; + } + // Find the next sample point + while ( 
loadSizeB / sampleB < sampleIndex && curFileIndex < self->files.size() ) { + if (self->files[curFileIndex].fileSize == 0) { + // NOTE: && self->files[curFileIndex].cursor >= self->files[curFileIndex].fileSize + printf("[Sampling] File %ld:%s filesize:%ld skip the file\n", curFileIndex, + self->files[curFileIndex].fileName.c_str(), self->files[curFileIndex].fileSize); + curFileIndex++; + curFileOffset = 0; + continue; + } + if ( loadSizeB / sampleB >= sampleIndex ) { + break; + } + if (curFileIndex >= self->files.size()) { + break; + } + loadSizeB += std::min( self->files[curFileIndex].blockSize, std::max(self->files[curFileIndex].fileSize - curFileOffset * self->files[curFileIndex].blockSize, (int64_t) 0) ); + curFileOffset++; + if ( self->files[curFileIndex].blockSize == 0 || curFileOffset >= self->files[curFileIndex].fileSize / self->files[curFileIndex].blockSize ) { + curFileOffset = 0; + curFileIndex++; + } + } + if ( curFileIndex >= self->files.size() ) { + allLoadReqsSent = true; + break; + } + + //sampleIndex++; + + // Notify loader to sample the file + LoadingParam param; + param.url = request.url; + param.version = self->files[curFileIndex].version; + param.filename = self->files[curFileIndex].fileName; + param.offset = curFileOffset * self->files[curFileIndex].blockSize; // The file offset in bytes + //param.length = std::min(self->files[curFileIndex].fileSize - self->files[curFileIndex].cursor, loadSizeB); + param.length = std::min(self->files[curFileIndex].blockSize, std::max((int64_t)0, self->files[curFileIndex].fileSize - param.offset)); + loadSizeB += param.length; + sampleIndex = std::ceil(loadSizeB / sampleB); + curFileOffset++; + + //loadSizeB = param.length; + param.blockSize = self->files[curFileIndex].blockSize; + param.restoreRange = restoreRange; + param.addPrefix = addPrefix; + param.removePrefix = removePrefix; + param.mutationLogPrefix = mutationLogPrefix; + if ( !(param.length > 0 && param.offset >= 0 && param.offset < 
self->files[curFileIndex].fileSize) ) { + printf("[ERROR] param: length:%ld offset:%ld fileSize:%ld for %ldth file:%s\n", + param.length, param.offset, self->files[curFileIndex].fileSize, curFileIndex, + self->files[curFileIndex].toString().c_str()); + } + + + printf("[Sampling][File:%ld] filename:%s offset:%ld blockSize:%ld filesize:%ld loadSize:%ldB sampleIndex:%ld\n", + curFileIndex, self->files[curFileIndex].fileName.c_str(), curFileOffset, + self->files[curFileIndex].blockSize, self->files[curFileIndex].fileSize, + loadSizeB, sampleIndex); + + + ASSERT( param.length > 0 ); + ASSERT( param.offset >= 0 ); + ASSERT( param.offset <= self->files[curFileIndex].fileSize ); + + printf("[Sampling][CMD] Node:%s Loading %s on node %s\n", + self->describeNode().c_str(), param.toString().c_str(), loaderID.toString().c_str()); + + self->cmdID.nextCmd(); // The cmd index is the i^th file (range or log file) to be processed + if (!self->files[curFileIndex].isRange) { + cmdType = RestoreCommandEnum::Sample_Log_File; + self->cmdID.setPhase(RestoreCommandEnum::Sample_Log_File); + cmdReplies.push_back( loaderInterf.sampleLogFile.getReply(RestoreLoadFileRequest(self->cmdID, param)) ); + } else { + cmdType = RestoreCommandEnum::Sample_Range_File; + self->cmdID.setPhase(RestoreCommandEnum::Sample_Range_File); + cmdReplies.push_back( loaderInterf.sampleRangeFile.getReply(RestoreLoadFileRequest(self->cmdID, param)) ); + } + + printf("[Sampling] Master cmdType:%d cmdUID:%s isRange:%d destinationNode:%s\n", + (int) cmdType, self->cmdID.toString().c_str(), (int) self->files[curFileIndex].isRange, + loaderID.toString().c_str()); + + if (param.offset + param.length >= self->files[curFileIndex].fileSize) { // Reach the end of the file + curFileIndex++; + curFileOffset = 0; + } + if ( curFileIndex >= self->files.size() ) { + allLoadReqsSent = true; + break; + } + ++loadingCmdIndex; + } + + printf("[Sampling] Wait for %ld loaders to accept the cmd Sample_Range_File or Sample_Log_File\n", 
cmdReplies.size()); + + if ( !cmdReplies.empty() ) { + //TODO: change to getAny. NOTE: need to keep the still-waiting replies + std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout ) ); + //std::vector reps = wait( getAll(cmdReplies) ); + + for (int i = 0; i < reps.size(); ++i) { + printf("[Sampling][%d out of %d] Get reply:%s for Sample_Range_File or Sample_Log_File\n", + i, reps.size(), reps[i].toString().c_str()); + } + checkpointCMDUID = self->cmdID; + checkpointCurFileIndex = curFileIndex; + checkpointCurFileOffset = curFileOffset; + } + + if (allLoadReqsSent) { + printf("[Sampling] allLoadReqsSent, sampling finished\n"); + break; // NOTE: need to change when change to wait on any cmdReplies + } + + } catch (Error &e) { + // Handle the command reply timeout error + fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. error code:%d, error message:%s\n", self->describeNode().c_str(), + self->cmdID.toString().c_str(), e.code(), e.what()); + self->cmdID = checkpointCMDUID; + curFileIndex = checkpointCurFileIndex; + curFileOffset = checkpointCurFileOffset; + allLoadReqsSent = false; + printf("[Sampling][Waring] Retry at CMDID:%s curFileIndex:%ld\n", self->cmdID.toString().c_str(), curFileIndex); + } + } + + wait(delay(1.0)); + + // Ask master applier to calculate the key ranges for appliers + state int numKeyRanges = 0; + loop { + try { + printf("[Sampling][CMD] Ask master applier %s for the key ranges for appliers\n", self->masterApplierInterf.toString().c_str()); + + ASSERT(self->appliersInterf.size() > 0); + self->cmdID.initPhase(RestoreCommandEnum::Calculate_Applier_KeyRange); + self->cmdID.nextCmd(); + GetKeyRangeNumberReply rep = wait( timeoutError( + self->masterApplierInterf.calculateApplierKeyRange.getReply(RestoreCalculateApplierKeyRangeRequest(self->cmdID, self->appliersInterf.size())), FastRestore_Failure_Timeout) ); + printf("[Sampling][CMDRep] number of key ranges calculated by master applier:%d\n", 
rep.keyRangeNum); + numKeyRanges = rep.keyRangeNum; + + if (numKeyRanges <= 0 || numKeyRanges >= self->appliersInterf.size() ) { + printf("[WARNING] Calculate_Applier_KeyRange receives wrong reply (numKeyRanges:%ld) from other phases. appliersInterf.size:%d Retry Calculate_Applier_KeyRange\n", numKeyRanges, self->appliersInterf.size()); + continue; + } + + if ( numKeyRanges < self->appliersInterf.size() ) { + printf("[WARNING][Sampling] numKeyRanges:%d < appliers number:%ld. %ld appliers will not be used!\n", + numKeyRanges, self->appliersInterf.size(), self->appliersInterf.size() - numKeyRanges); + } + + break; + } catch (Error &e) { + // Handle the command reply timeout error + fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. error code:%d, error message:%s\n", self->describeNode().c_str(), + self->cmdID.toString().c_str(), e.code(), e.what()); + printf("[Sampling] [Warning] Retry on Calculate_Applier_KeyRange\n"); + } + } + + wait(delay(1.0)); + + // Ask master applier to return the key range for appliers + state std::vector> keyRangeReplies; + state std::map::iterator applier; + loop { + try { + self->range2Applier.clear(); + keyRangeReplies.clear(); // In case error happens in try loop + self->cmdID.initPhase(RestoreCommandEnum::Get_Applier_KeyRange); + //self->cmdID.nextCmd(); + state int applierindex = 0; + for ( applier = self->appliersInterf.begin(); applier != self->appliersInterf.end(); applier++, applierindex++) { + self->cmdID.nextCmd(); + printf("[Sampling][Master] Node:%s, CMDID:%s Ask masterApplierInterf:%s for the lower boundary of the key range for applier:%s\n", + self->describeNode().c_str(), self->cmdID.toString().c_str(), + self->masterApplierInterf.toString().c_str(), applier->first.toString().c_str()); + keyRangeReplies.push_back( self->masterApplierInterf.getApplierKeyRangeRequest.getReply( + RestoreGetApplierKeyRangeRequest(self->cmdID, applierindex)) ); + } + std::vector reps = wait( timeoutError( getAll(keyRangeReplies), 
FastRestore_Failure_Timeout) ); + + ASSERT( reps.size() <= self->appliersInterf.size() ); + + // TODO: Directly use the replied lowerBound and upperBound + applier = self->appliersInterf.begin(); + for (int i = 0; i < reps.size() && i < numKeyRanges; ++i) { + UID applierID = applier->first; + Standalone lowerBound = reps[i].lowerBound; + // if (i < numKeyRanges) { + // lowerBound = reps[i].lowerBound; + // } else { + // lowerBound = normalKeys.end; + // } + + if (i == 0) { + lowerBound = LiteralStringRef("\x00"); // The first interval must starts with the smallest possible key + } + printf("[INFO] Node:%s Assign key-to-applier map: Key:%s -> applierID:%s\n", self->describeNode().c_str(), + getHexString(lowerBound).c_str(), applierID.toString().c_str()); + self->range2Applier.insert(std::make_pair(lowerBound, applierID)); + applier++; + } + + break; + } catch (Error &e) { + // TODO: Handle the command reply timeout error + fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. error code:%d, error message:%s\n", self->describeNode().c_str(), + self->cmdID.toString().c_str(), e.code(), e.what()); + printf("[Sampling] [Warning] Retry on Get_Applier_KeyRange\n"); + } + } + printf("[Sampling] self->range2Applier has been set. 
Its size is:%d\n", self->range2Applier.size()); + self->printAppliersKeyRange(); + + wait(delay(1.0)); + + return Void(); + +} + +// Restore Master: Ask each restore loader to collect all appliers' interfaces +ACTOR Future askLoadersToCollectRestoreAppliersInterfaces(Reference self) { + state int index = 0; + loop { + try { + wait(delay(1.0)); + index = 0; + std::vector> cmdReplies; + for(auto& loaderInterf : self->loadersInterf) { + self->cmdID.nextCmd(); + printf("[CMD:%s] Node:%s askLoadersToCollectRestoreAppliersInterfaces for node (index=%d uid=%s)\n", + self->cmdID.toString().c_str(), self->describeNode().c_str(), + index, loaderInterf.first.toString().c_str()); + cmdReplies.push_back( loaderInterf.second.collectRestoreRoleInterfaces.getReply(RestoreSimpleRequest(self->cmdID)) ); + index++; + } + std::vector reps = wait( timeoutError(getAll(cmdReplies), FastRestore_Failure_Timeout) ); + printf("[setWorkerInterface] Finished\n"); + break; + } catch (Error &e) { + fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. error code:%d, error message:%s\n", self->describeNode().c_str(), + self->cmdID.toString().c_str(), e.code(), e.what()); + printf("Node:%s waits on replies time out. Current phase: setWorkerInterface, Retry all commands.\n", self->describeNode().c_str()); + } + } + + return Void(); +} + + + +// TODO: Revise the way to collect the restore request. 
We may make it into 1 transaction +ACTOR Future>> collectRestoreRequests(Database cx) { + state int restoreId = 0; + state int checkNum = 0; + state Standalone> restoreRequests; + state Future watch4RestoreRequest; + + //wait for the restoreRequestTriggerKey to be set by the client/test workload + state ReadYourWritesTransaction tr(cx); + + loop { + try { + tr.reset(); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + // Assumption: restoreRequestTriggerKey has not been set + // Question: What if restoreRequestTriggerKey has been set? we will stuck here? + // Question: Can the following code handle the situation? + // Note: restoreRequestTriggerKey may be set before the watch is set or may have a conflict when the client sets the same key + // when it happens, will we stuck at wait on the watch? + + watch4RestoreRequest = tr.watch(restoreRequestTriggerKey); + wait(tr.commit()); + printf("[INFO][Master] Finish setting up watch for restoreRequestTriggerKey\n"); + break; + } catch(Error &e) { + printf("[WARNING] Transaction for restore request in watch restoreRequestTriggerKey. Error:%s\n", e.name()); + wait(tr.onError(e)); + } + }; + + + loop { + try { + tr.reset(); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + // Assumption: restoreRequestTriggerKey has not been set + // Before we wait on the watch, we must make sure the key is not there yet! + //printf("[INFO][Master] Make sure restoreRequestTriggerKey does not exist before we wait on the key\n"); + Optional triggerKey = wait( tr.get(restoreRequestTriggerKey) ); + if ( triggerKey.present() ) { + printf("!!! restoreRequestTriggerKey (and restore requests) is set before restore agent waits on the request. 
Restore agent can immediately proceed\n"); + break; + } + wait(watch4RestoreRequest); + printf("[INFO][Master] restoreRequestTriggerKey watch is triggered\n"); + break; + } catch(Error &e) { + printf("[WARNING] Transaction for restore request at wait on watch restoreRequestTriggerKey. Error:%s\n", e.name()); + wait(tr.onError(e)); + } + }; + + loop { + try { + tr.reset(); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + + state Optional numRequests = wait(tr.get(restoreRequestTriggerKey)); + int num = decodeRestoreRequestTriggerValue(numRequests.get()); + //TraceEvent("RestoreRequestKey").detail("NumRequests", num); + printf("[INFO] RestoreRequestNum:%d\n", num); + + state Standalone restoreRequestValues = wait(tr.getRange(restoreRequestKeys, CLIENT_KNOBS->TOO_MANY)); + printf("Restore worker get restoreRequest: %s\n", restoreRequestValues.toString().c_str()); + + ASSERT(!restoreRequestValues.more); + + if(restoreRequestValues.size()) { + for ( auto &it : restoreRequestValues ) { + printf("Now decode restore request value...\n"); + restoreRequests.push_back(restoreRequests.arena(), decodeRestoreRequestValue(it.value)); + } + } + break; + } catch(Error &e) { + printf("[WARNING] Transaction error: collect restore requests. 
Error:%s\n", e.name()); + wait(tr.onError(e)); + } + }; + + return restoreRequests; +} + +// NOTE: This function can now get the backup file descriptors +ACTOR static Future _collectBackupFiles(Reference self, Database cx, RestoreRequest request) { + state Key tagName = request.tagName; + state Key url = request.url; + state bool waitForComplete = request.waitForComplete; + state Version targetVersion = request.targetVersion; + state bool verbose = request.verbose; + state KeyRange range = request.range; + state Key addPrefix = request.addPrefix; + state Key removePrefix = request.removePrefix; + state bool lockDB = request.lockDB; + state UID randomUid = request.randomUid; + + ASSERT( lockDB == true ); + + self->initBackupContainer(url); + + state Reference bc = self->bc; + state BackupDescription desc = wait(bc->describeBackup()); + + wait(desc.resolveVersionTimes(cx)); + + printf("[INFO] Backup Description\n%s", desc.toString().c_str()); + printf("[INFO] Restore for url:%s, lockDB:%d\n", url.toString().c_str(), lockDB); + if(targetVersion == invalidVersion && desc.maxRestorableVersion.present()) + targetVersion = desc.maxRestorableVersion.get(); + + printf("[INFO] collectBackupFiles: now getting backup files for restore request: %s\n", request.toString().c_str()); + Optional restorable = wait(bc->getRestoreSet(targetVersion)); + + if(!restorable.present()) { + printf("[WARNING] restoreVersion:%ld (%lx) is not restorable!\n", targetVersion, targetVersion); + throw restore_missing_data(); + } + + if (!self->files.empty()) { + printf("[WARNING] global files are not empty! files.size() is %ld. 
We forcely clear files\n", self->files.size()); + self->files.clear(); + } + + printf("[INFO] Found backup files: num of files:%ld\n", self->files.size()); + for(const RangeFile &f : restorable.get().ranges) { + TraceEvent("FoundRangeFileMX").detail("FileInfo", f.toString()); + printf("[INFO] FoundRangeFile, fileInfo:%s\n", f.toString().c_str()); + RestoreFileFR file(f.version, f.fileName, true, f.blockSize, f.fileSize, f.version, f.version); + self->files.push_back(file); + } + for(const LogFile &f : restorable.get().logs) { + TraceEvent("FoundLogFileMX").detail("FileInfo", f.toString()); + printf("[INFO] FoundLogFile, fileInfo:%s\n", f.toString().c_str()); + RestoreFileFR file(f.beginVersion, f.fileName, false, f.blockSize, f.fileSize, f.endVersion, f.beginVersion); + self->files.push_back(file); + } + + printf("[INFO] Restoring backup to version: %lld\n", (long long) targetVersion); + + return Void(); +} + + +ACTOR static Future _lockDB(Database cx, UID uid, bool lockDB) { + printf("[Lock] DB will be locked, uid:%s, lockDB:%d\n", uid.toString().c_str(), lockDB); + + ASSERT( lockDB ); + + loop { + try { + wait(lockDatabase(cx, uid)); + break; + } catch( Error &e ) { + printf("Transaction Error when we lockDB. Error:%s\n", e.what()); + wait(tr->onError(e)); + } + } + + state Reference tr(new ReadYourWritesTransaction(cx)); + loop { + try { + tr->reset(); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + wait(checkDatabaseLock(tr, uid)); + + tr->commit(); + break; + } catch( Error &e ) { + printf("Transaction Error when we lockDB. 
Error:%s\n", e.what()); + wait(tr->onError(e)); + } + } + + + return Void(); +} + +ACTOR static Future _clearDB(Reference tr) { + loop { + try { + tr->reset(); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + tr->clear(normalKeys); + tr->commit(); + break; + } catch(Error &e) { + printf("Retry at clean up DB before restore. error code:%d message:%s. Retry...\n", e.code(), e.what()); + if(e.code() != error_code_restore_duplicate_tag) { + wait(tr->onError(e)); + } + } + } + + return Void(); +} + + + +ACTOR Future initializeVersionBatch(Reference self) { + loop { + try { + wait(delay(1.0)); + std::vector> cmdReplies; + self->cmdID.initPhase(RestoreCommandEnum::Reset_VersionBatch); + for (auto &loader : self->loadersInterf) { + cmdReplies.push_back( loader.second.initVersionBatch.getReply(RestoreVersionBatchRequest(self->cmdID, self->batchIndex)) ); + } + for (auto &applier : self->appliersInterf) { + cmdReplies.push_back( applier.second.initVersionBatch.getReply(RestoreVersionBatchRequest(self->cmdID, self->batchIndex)) ); + } + + std::vector reps = wait( timeoutError(getAll(cmdReplies), FastRestore_Failure_Timeout) ); + printf("Initilaize Version Batch done\n"); + break; + } catch (Error &e) { + fprintf(stdout, "[ERROR] Node:%s, Current phase: initializeVersionBatch, Commands before cmdID:%s error. 
error code:%d, error message:%s\n", self->describeNode().c_str(), + self->cmdID.toString().c_str(), e.code(), e.what()); + } + } + + return Void(); +} + + +ACTOR Future notifyApplierToApplyMutations(Reference self) { + state std::vector> cmdReplies; + loop { + try { + self->cmdID.initPhase( RestoreCommandEnum::Apply_Mutation_To_DB ); + for (auto& applier : self->appliersInterf) { + RestoreApplierInterface &applierInterf = applier.second; + + printf("[CMD] Node:%s Notify node:%s to apply mutations to DB\n", self->describeNode().c_str(), applier.first.toString().c_str()); + cmdReplies.push_back( applier.second.applyToDB.getReply(RestoreSimpleRequest(self->cmdID)) ); + } + printf("[INFO] Wait for %ld appliers to apply mutations to DB\n", self->appliersInterf.size()); + std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout ) ); + //std::vector reps = wait( getAll(cmdReplies) ); + printf("[INFO] %ld appliers finished applying mutations to DB\n", self->appliersInterf.size()); + + cmdReplies.clear(); + + wait(delay(5.0)); //TODO: Delete this wait and see if it can pass correctness + + break; + } catch (Error &e) { + fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. error code:%d, error message:%s\n", self->describeNode().c_str(), + self->cmdID.toString().c_str(), e.code(), e.what()); + } + } + + return Void(); +} + + + +ACTOR Future assignKeyRangeToAppliers(Reference self, Database cx) { //, VectorRef ret_agents + //construct the key range for each applier + std::vector lowerBounds; + std::vector> keyRanges; + std::vector applierIDs; + + // printf("[INFO] Node:%s, Assign key range to appliers. 
num_appliers:%ld\n", self->describeNode().c_str(), self->range2Applier.size()); + for (auto& applier : self->range2Applier) { + lowerBounds.push_back(applier.first); + applierIDs.push_back(applier.second); + // printf("\t[INFO] ApplierID:%s lowerBound:%s\n", + // applierIDs.back().toString().c_str(), + // lowerBounds.back().toString().c_str()); + } + for (int i = 0; i < lowerBounds.size(); ++i) { + KeyRef startKey = lowerBounds[i]; + KeyRef endKey; + if ( i < lowerBounds.size() - 1) { + endKey = lowerBounds[i+1]; + } else { + endKey = normalKeys.end; + } + + if (startKey > endKey) { + fprintf(stderr, "ERROR at assignKeyRangeToAppliers, startKey:%s > endKey:%s\n", startKey.toString().c_str(), endKey.toString().c_str()); + } + + keyRanges.push_back(KeyRangeRef(startKey, endKey)); + } + + ASSERT( applierIDs.size() == keyRanges.size() ); + state std::map> appliers; + appliers.clear(); // If this function is called more than once in multiple version batches, appliers may carry over the data from earlier version batch + for (int i = 0; i < applierIDs.size(); ++i) { + if (appliers.find(applierIDs[i]) != appliers.end()) { + printf("[ERROR] ApplierID appear more than once. 
appliers size:%ld applierID: %s\n", + appliers.size(), applierIDs[i].toString().c_str()); + printApplierKeyRangeInfo(appliers); + } + ASSERT( appliers.find(applierIDs[i]) == appliers.end() ); // we should not have a duplicate applierID respoinsbile for multiple key ranges + appliers.insert(std::make_pair(applierIDs[i], keyRanges[i])); + } + + state std::vector> cmdReplies; + loop { + try { + cmdReplies.clear(); + self->cmdID.initPhase(RestoreCommandEnum::Assign_Applier_KeyRange); + for (auto& applier : appliers) { + KeyRangeRef keyRange = applier.second; + UID applierID = applier.first; + printf("[CMD] Node:%s, Assign KeyRange:%s [begin:%s end:%s] to applier ID:%s\n", self->describeNode().c_str(), + keyRange.toString().c_str(), + getHexString(keyRange.begin).c_str(), getHexString(keyRange.end).c_str(), + applierID.toString().c_str()); + + ASSERT( self->appliersInterf.find(applierID) != self->appliersInterf.end() ); + RestoreApplierInterface applierInterf = self->appliersInterf[applierID]; + self->cmdID.nextCmd(); + cmdReplies.push_back( applierInterf.setApplierKeyRangeRequest.getReply(RestoreSetApplierKeyRangeRequest(self->cmdID, applier.first, keyRange)) ); + + } + printf("[INFO] Wait for %ld applier to accept the cmd Assign_Applier_KeyRange\n", appliers.size()); + std::vector reps = wait( timeoutError(getAll(cmdReplies), FastRestore_Failure_Timeout) ); + printf("All appliers have been assigned for ranges"); + + break; + } catch (Error &e) { + fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s error. 
error code:%d, error message:%s\n", self->describeNode().c_str(), + self->cmdID.toString().c_str(), e.code(), e.what()); + } + } + + return Void(); +} + +// Restore Master: Notify loader about appliers' responsible key range +ACTOR Future notifyAppliersKeyRangeToLoader(Reference self, Database cx) { + state std::vector loaders = self->getLoaderIDs(); + state std::vector> cmdReplies; + state Standalone> appliers; + state Standalone> ranges; + + state std::map, UID>::iterator applierRange; + for (applierRange = self->range2Applier.begin(); applierRange != self->range2Applier.end(); applierRange++) { + KeyRef beginRange = applierRange->first; + KeyRange range(KeyRangeRef(beginRange, beginRange)); // TODO: Use the end of key range + appliers.push_back(appliers.arena(), applierRange->second); + ranges.push_back(ranges.arena(), range); + } + + printf("Notify_Loader_ApplierKeyRange: number of appliers:%d\n", appliers.size()); + ASSERT( appliers.size() == ranges.size() && appliers.size() != 0 ); + + self->cmdID.initPhase( RestoreCommandEnum::Notify_Loader_ApplierKeyRange ); + state std::map::iterator loader; + for (loader = self->loadersInterf.begin(); loader != self->loadersInterf.begin(); loader++) { + self->cmdID.nextCmd(); + loop { + try { + cmdReplies.clear(); + printf("[CMD] Node:%s Notify node:%s about appliers key range\n", self->describeNode().c_str(), loader->first.toString().c_str()); + cmdReplies.push_back( loader->second.setApplierKeyRangeVectorRequest.getReply(RestoreSetApplierKeyRangeVectorRequest(self->cmdID, appliers, ranges)) ); + printf("[INFO] Wait for node:%s to accept the cmd Notify_Loader_ApplierKeyRange\n", loader->first.toString().c_str()); + std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout ) ); + printf("Finished Notify_Loader_ApplierKeyRange: number of appliers:%d\n", appliers.size()); + cmdReplies.clear(); + break; + } catch (Error &e) { + fprintf(stdout, "[ERROR] Node:%s, Commands before cmdID:%s 
timeout\n", self->describeNode().c_str(), self->cmdID.toString().c_str()); + } + } + } + + return Void(); +} + + +ACTOR static Future finishRestore(Reference self, Database cx, Standalone> restoreRequests) { + // Make restore workers quit + state std::vector> cmdReplies; + state std::map::iterator loader; + state std::map::iterator applier; + loop { + try { + cmdReplies.clear(); + self->cmdID.initPhase(RestoreCommandEnum::Finish_Restore); + + for ( loader = self->loadersInterf.begin(); loader != self->loadersInterf.end(); loader++ ) { + self->cmdID.nextCmd(); + cmdReplies.push_back(loader->second.finishRestore.getReply(RestoreSimpleRequest(self->cmdID))); + } + for ( applier = self->appliersInterf.begin(); applier != self->appliersInterf.end(); applier++ ) { + self->cmdID.nextCmd(); + cmdReplies.push_back(applier->second.finishRestore.getReply(RestoreSimpleRequest(self->cmdID))); + } + + if (!cmdReplies.empty()) { + std::vector reps = wait( timeoutError( getAll(cmdReplies), FastRestore_Failure_Timeout / 100 ) ); + //std::vector reps = wait( getAll(cmdReplies) ); + cmdReplies.clear(); + } + printf("All restore workers have quited\n"); + + break; + } catch(Error &e) { + printf("[ERROR] At sending finishRestore request. error code:%d message:%s. 
Retry...\n", e.code(), e.what()); + self->loadersInterf.clear(); + self->appliersInterf.clear(); + cmdReplies.clear(); + wait( _collectRestoreRoleInterfaces(self, cx) ); + } + } + + // Notify tester that the restore has finished + state ReadYourWritesTransaction tr3(cx); + loop { + try { + tr3.reset(); + tr3.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr3.setOption(FDBTransactionOptions::LOCK_AWARE); + tr3.clear(restoreRequestTriggerKey); + tr3.clear(restoreRequestKeys); + tr3.set(restoreRequestDoneKey, restoreRequestDoneValue(restoreRequests.size())); + wait(tr3.commit()); + TraceEvent("LeaderFinishRestoreRequest"); + printf("[INFO] RestoreLeader write restoreRequestDoneKey\n"); + + break; + } catch( Error &e ) { + TraceEvent("RestoreAgentLeaderErrorTr3").detail("ErrorCode", e.code()).detail("ErrorName", e.name()); + printf("[Error] RestoreLead operation on restoreRequestDoneKey, error:%s\n", e.what()); + wait( tr3.onError(e) ); + } + }; + + + // TODO: Validate that the range version map has exactly the restored ranges in it. This means that for any restore operation + // the ranges to restore must be within the backed up ranges, otherwise from the restore perspective it will appear that some + // key ranges were missing and so the backup set is incomplete and the restore has failed. + // This validation cannot be done currently because Restore only supports a single restore range but backups can have many ranges. + + // Clear the applyMutations stuff, including any unapplied mutations from versions beyond the restored version. 
+ // restore.clearApplyMutationsKeys(tr); + + printf("[INFO] Notify the end of the restore\n"); + TraceEvent("NotifyRestoreFinished"); + + return Void(); +} + + + +ACTOR static Future unlockDB(Database cx, UID uid) { + state Reference tr(new ReadYourWritesTransaction(cx)); + loop { + try { + tr->reset(); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + printf("CheckDBlock:%s START\n", uid.toString().c_str()); + wait(checkDatabaseLock(tr, uid)); + printf("CheckDBlock:%s DONE\n", uid.toString().c_str()); + + printf("UnlockDB now. Start.\n"); + wait(unlockDatabase(tr, uid)); //NOTE: unlockDatabase didn't commit inside the function! + + printf("CheckDBlock:%s START\n", uid.toString().c_str()); + wait(checkDatabaseLock(tr, uid)); + printf("CheckDBlock:%s DONE\n", uid.toString().c_str()); + + printf("UnlockDB now. Commit.\n"); + wait( tr->commit() ); + + printf("UnlockDB now. Done.\n"); + break; + } catch( Error &e ) { + printf("Error when we unlockDB. 
Error:%s\n", e.what()); + wait(tr->onError(e)); + } + }; + + return Void(); + } + +ACTOR static Future registerStatus(Database cx, struct FastRestoreStatus status) { + state Reference tr(new ReadYourWritesTransaction(cx)); + loop { + try { + printf("[Restore_Status][%d] curWorkload:%.2f curRunningtime:%.2f curSpeed:%.2f totalWorkload:%.2f totalRunningTime:%.2f totalSpeed:%.2f\n", + restoreStatusIndex, status.curWorkloadSize, status.curRunningTime, status.curSpeed, status.totalWorkloadSize, status.totalRunningTime, status.totalSpeed); + + tr->reset(); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + tr->set(restoreStatusKeyFor(StringRef(std::string("curWorkload") + std::to_string(restoreStatusIndex))), restoreStatusValue(status.curWorkloadSize)); + tr->set(restoreStatusKeyFor(StringRef(std::string("curRunningTime") + std::to_string(restoreStatusIndex))), restoreStatusValue(status.curRunningTime)); + tr->set(restoreStatusKeyFor(StringRef(std::string("curSpeed") + std::to_string(restoreStatusIndex))), restoreStatusValue(status.curSpeed)); + + tr->set(restoreStatusKeyFor(StringRef(std::string("totalWorkload"))), restoreStatusValue(status.totalWorkloadSize)); + tr->set(restoreStatusKeyFor(StringRef(std::string("totalRunningTime"))), restoreStatusValue(status.totalRunningTime)); + tr->set(restoreStatusKeyFor(StringRef(std::string("totalSpeed"))), restoreStatusValue(status.totalSpeed)); + + wait( tr->commit() ); + restoreStatusIndex++; + + break; + } catch( Error &e ) { + printf("Transaction Error when we registerStatus. 
Error:%s\n", e.what()); + wait(tr->onError(e)); + } + }; + + return Void(); +} \ No newline at end of file diff --git a/fdbserver/RestoreMaster.actor.h b/fdbserver/RestoreMaster.actor.h new file mode 100644 index 0000000000..b6d29dfb7a --- /dev/null +++ b/fdbserver/RestoreMaster.actor.h @@ -0,0 +1,264 @@ +/* + * RestoreMasterInterface.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Declear RestoreMaster interface and actors + +#pragma once +#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_RestoreMasterInterface_G_H) + #define FDBSERVER_RestoreMasterInterface_G_H + #include "fdbserver/RestoreMaster.actor.g.h" +#elif !defined(FDBSERVER_RestoreMasterInterface_H) + #define FDBSERVER_RestoreMasterInterface_H + +#include +#include "flow/Stats.h" +#include "fdbclient/FDBTypes.h" +#include "fdbclient/CommitTransaction.h" +#include "fdbrpc/fdbrpc.h" +#include "fdbserver/CoordinationInterface.h" +#include "fdbrpc/Locality.h" + +#include "fdbserver/RestoreUtil.h" +#include "fdbserver/RestoreRoleCommon.actor.h" + +#include "flow/actorcompiler.h" // has to be last include + +extern double loadBatchSizeThresholdB; +extern int restoreStatusIndex; + +struct RestoreMasterData : RestoreRoleData, public ReferenceCounted { + // range2Applier is in master and loader node. 
Loader node uses this to determine which applier a mutation should be sent + std::map, UID> range2Applier; // KeyRef is the inclusive lower bound of the key range the applier (UID) is responsible for + + CMDUID cmdID; // Command id to recoself the progress + + // Temporary variables to hold files and data to restore + std::vector allFiles; // All backup files to be processed in all version batches + std::vector files; // Backup files to be parsed and applied: range and log files in 1 version batch + std::map forbiddenVersions; // forbidden version range [first, second) + + // In each version batch, we process the files in [curBackupFilesBeginIndex, curBackupFilesEndIndex] in RestoreMasterData.allFiles. + long curBackupFilesBeginIndex; + long curBackupFilesEndIndex; + double totalWorkloadSize; + double curWorkloadSize; + int batchIndex; + + Reference bc; // Backup container is used to read backup files + Key bcUrl; // The url used to get the bc + + void addref() { return ReferenceCounted::addref(); } + void delref() { return ReferenceCounted::delref(); } + + void printAllBackupFilesInfo() { + printf("[INFO] All backup files: num:%ld\n", allFiles.size()); + for (int i = 0; i < allFiles.size(); ++i) { + printf("\t[INFO][File %d] %s\n", i, allFiles[i].toString().c_str()); + } + } + + std::string describeNode() { + std::stringstream ss; + ss << "Master versionBatch:" << batchIndex; + return ss.str(); + } + + void constructFilesWithVersionRange() { + printf("[INFO] constructFilesWithVersionRange for num_files:%ld\n", files.size()); + allFiles.clear(); + for (int i = 0; i < files.size(); i++) { + printf("\t[File:%d] Start %s\n", i, files[i].toString().c_str()); + Version beginVersion = 0; + Version endVersion = 0; + if ( files[i].isRange) { + // No need to parse range filename to get endVersion + beginVersion = files[i].version; + endVersion = beginVersion; + } else { // Log file + //Refer to pathToLogFile() in BackupContainer.actor.cpp + long blockSize, len; + int pos = 
files[i].fileName.find_last_of("/"); + std::string fileName = files[i].fileName.substr(pos); + printf("\t[File:%d] Log filename:%s, pos:%d\n", i, fileName.c_str(), pos); + sscanf(fileName.c_str(), "/log,%ld,%ld,%*[^,],%lu%ln", &beginVersion, &endVersion, &blockSize, &len); + printf("\t[File:%d] Log filename:%s produces beginVersion:%ld endVersion:%ld\n",i, fileName.c_str(), beginVersion, endVersion); + } + files[i].beginVersion = beginVersion; + files[i].endVersion = endVersion; + printf("\t[File:%d] End %s\n", i, files[i].toString().c_str()); + ASSERT(beginVersion <= endVersion); + allFiles.push_back( files[i]); + } + } + + void printBackupFilesInfo() { + printf("[INFO] The backup files for current batch to load and apply: num:%ld\n", files.size()); + for (int i = 0; i < files.size(); ++i) { + printf("\t[INFO][File %d] %s\n", i, files[i].toString().c_str()); + } + } + + void buildForbiddenVersionRange() { + printf("[INFO] Build forbidden version ranges for all backup files: num:%ld\n", allFiles.size()); + for (int i = 0; i < allFiles.size(); ++i) { + if (!allFiles[i].isRange) { + forbiddenVersions.insert(std::make_pair(allFiles[i].beginVersion, allFiles[i].endVersion)); + } + } + } + + bool isForbiddenVersionRangeOverlapped() { + printf("[INFO] Check if forbidden version ranges is overlapped: num of ranges:%ld\n", forbiddenVersions.size()); + if (forbiddenVersions.empty()) { + return false; + } + + std::map::iterator prevRange = forbiddenVersions.begin(); + std::map::iterator curRange = forbiddenVersions.begin(); + curRange++; // Assume forbiddenVersions has at least one element! 
+ + while ( curRange != forbiddenVersions.end() ) { + if ( curRange->first < prevRange->second ) { + return true; // overlapped + } + /* BUG FIX: prevRange must trail curRange; previously it was never advanced, so every range was only compared against the FIRST range and overlaps between later adjacent ranges were missed. */ + prevRange++; + curRange++; + } + + return false; //not overlapped + } + + + void printForbiddenVersionRange() { + printf("[INFO] Number of forbidden version ranges:%ld\n", forbiddenVersions.size()); + int i = 0; + for (auto &range : forbiddenVersions) { + printf("\t[INFO][Range%d] [%ld, %ld)\n", i, range.first, range.second); + ++i; + } + } + + // endVersion is begin version for range file, because range file takes snapshot at the same version + // endVersion is the end version (excluded) for mutations recoselfed in log file + bool isVersionInForbiddenRange(Version endVersion, bool isRange) { + bool isForbidden = false; + for (auto &range : forbiddenVersions) { + if ( isRange ) { //the range file includes mutations at the endVersion + if (endVersion >= range.first && endVersion < range.second) { + isForbidden = true; + break; + } + } else { // the log file does NOT include mutations at the endVersion + continue; // Log file's endVersion is always a valid version batch boundary as long as the forbidden version ranges do not overlap + } + } + + return isForbidden; + } + + + void printAppliersKeyRange() { + printf("[INFO] The mapping of KeyRange_start --> Applier ID\n"); + // applier type: std::map, UID> + for (auto &applier : range2Applier) { + printf("\t[INFO]%s -> %s\n", getHexString(applier.first).c_str(), applier.second.toString().c_str()); + } + } + + bool isBackupEmpty() { + for (int i = 0; i < files.size(); ++i) { + if (files[i].fileSize > 0) { + return false; + } + } + return true; + } + + + void initBackupContainer(Key url) { + if ( bcUrl == url && bc.isValid() ) { + return; + } + printf("initBackupContainer, url:%s\n", url.toString().c_str()); + bcUrl = url; + bc = IBackupContainer::openContainer(url.toString()); + //state BackupDescription desc = wait(self->bc->describeBackup()); + //return Void(); + } + + // Collect the set of backup
files to be used for a version batch + // Return true if there is still files to be restored; false otherwise. + // This function will change the process' RestoreMasterData + bool collectFilesForOneVersionBatch() { + files.clear(); + curWorkloadSize = 0; + Version endVersion = -1; + bool isRange = false; + bool validVersion = false; + // Step: Find backup files in each version batch and restore them. + while ( curBackupFilesBeginIndex < allFiles.size() ) { + // Find the curBackupFilesEndIndex, such that the to-be-loaded files size (curWorkloadSize) is as close to loadBatchSizeThresholdB as possible, + // and curBackupFilesEndIndex must not belong to the forbidden version range! + if ( curBackupFilesEndIndex < allFiles.size() ) { + endVersion = allFiles[curBackupFilesEndIndex].endVersion; + isRange = allFiles[curBackupFilesEndIndex].isRange; + validVersion = !isVersionInForbiddenRange(endVersion, isRange); + curWorkloadSize += allFiles[curBackupFilesEndIndex].fileSize; + printf("[DEBUG][Batch:%d] Calculate backup files for a version batch: endVersion:%lld isRange:%d validVersion:%d curWorkloadSize:%.2fB curBackupFilesBeginIndex:%ld curBackupFilesEndIndex:%ld, files.size:%ld\n", + batchIndex, (long long) endVersion, isRange, validVersion, curWorkloadSize , curBackupFilesBeginIndex, curBackupFilesEndIndex, allFiles.size()); + } + if ( (validVersion && curWorkloadSize >= loadBatchSizeThresholdB) || curBackupFilesEndIndex >= allFiles.size() ) { + if ( curBackupFilesEndIndex >= allFiles.size() && curWorkloadSize <= 0 ) { + printf("Restore finishes: curBackupFilesEndIndex:%ld, allFiles.size:%ld, curWorkloadSize:%.2f\n", + curBackupFilesEndIndex, allFiles.size(), curWorkloadSize ); + //break; // return result + } + // Construct the files [curBackupFilesBeginIndex, curBackupFilesEndIndex] + //resetPerVersionBatch(); + //cmdID.setBatch(batchIndex); + if ( curBackupFilesBeginIndex < allFiles.size()) { + for (int fileIndex = curBackupFilesBeginIndex; fileIndex <= 
curBackupFilesEndIndex && fileIndex < allFiles.size(); fileIndex++) { + files.push_back(allFiles[fileIndex]); + } + } + printBackupFilesInfo(); + totalWorkloadSize += curWorkloadSize; + break; + } else if (validVersion && curWorkloadSize < loadBatchSizeThresholdB) { + curBackupFilesEndIndex++; + } else if (!validVersion && curWorkloadSize < loadBatchSizeThresholdB) { + curBackupFilesEndIndex++; + } else if (!validVersion && curWorkloadSize >= loadBatchSizeThresholdB) { + // Now: just move to the next file. We will eventually find a valid version but load more than loadBatchSizeThresholdB + printf("[WARNING] The loading batch size will be larger than expected! curBatchSize:%.2fB, expectedBatchSize:%2.fB, endVersion:%ld\n", + curWorkloadSize, loadBatchSizeThresholdB, endVersion); + curBackupFilesEndIndex++; + // TODO: Roll back to find a valid version + } + } + + return (files.size() > 0); + } +}; + + +ACTOR Future startRestoreMaster(Reference self, Database cx); + +#include "flow/unactorcompiler.h" +#endif \ No newline at end of file diff --git a/fdbserver/RestoreRoleCommon.actor.cpp b/fdbserver/RestoreRoleCommon.actor.cpp new file mode 100644 index 0000000000..80a8d941db --- /dev/null +++ b/fdbserver/RestoreRoleCommon.actor.cpp @@ -0,0 +1,324 @@ +/* + * RestoreRoleCommon.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/NativeAPI.actor.h" +#include "fdbclient/MutationList.h" + +#include "fdbserver/RestoreUtil.h" +#include "fdbserver/RestoreRoleCommon.actor.h" +#include "fdbserver/RestoreLoader.actor.h" +#include "fdbserver/RestoreApplier.actor.h" +#include "fdbserver/RestoreMaster.actor.h" + +#include "flow/actorcompiler.h" // This must be the last #include. + +class Database; +struct RestoreWorkerData; + +// id is the id of the worker to be monitored +// This actor is used for both restore loader and restore applier +ACTOR Future handleHeartbeat(RestoreSimpleRequest req, UID id) { + wait( delay(0.1) ); // To avoid warning + req.reply.send(RestoreCommonReply(id, req.cmdID)); + + return Void(); +} + +// Restore Worker: collect restore role interfaces locally by reading the specific system keys +ACTOR Future _collectRestoreRoleInterfaces(Reference self, Database cx) { + state Transaction tr(cx); + //state Standalone loaderAgentValues; + //state Standalone applierAgentValues; + printf("[INFO][Worker] Node:%s Get the handleCollectRestoreRoleInterfaceRequest for all workers\n", self->describeNode().c_str()); + loop { + try { + self->clearInterfaces(); + tr.reset(); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + state Standalone loaderAgentValues = wait( tr.getRange(restoreLoaderKeys, CLIENT_KNOBS->TOO_MANY) ); + state Standalone applierAgentValues = wait( tr.getRange(restoreApplierKeys, CLIENT_KNOBS->TOO_MANY) ); + ASSERT(!loaderAgentValues.more); + ASSERT(!applierAgentValues.more); + // Save the loader and applier interfaces for the later operations + if (loaderAgentValues.size()) { + for(auto& it : loaderAgentValues) { + RestoreLoaderInterface loaderInterf = BinaryReader::fromStringRef(it.value, IncludeVersion()); + self->loadersInterf[loaderInterf.id()] = loaderInterf; + } + } + if 
(applierAgentValues.size()) { + for(auto& it : applierAgentValues) { + RestoreApplierInterface applierInterf = BinaryReader::fromStringRef(it.value, IncludeVersion()); + self->appliersInterf[applierInterf.id()] = applierInterf; + self->masterApplierInterf = applierInterf; // TODO: Set masterApplier in a more deterministic way + } + } + //wait(tr.commit()); + break; + } catch( Error &e ) { + printf("[WARNING] Node:%s handleCollectRestoreRoleInterfaceRequest() transaction error:%s\n", self->describeNode().c_str(), e.what()); + wait( tr.onError(e) ); + } + printf("[WARNING] Node:%s handleCollectRestoreRoleInterfaceRequest should always succeed in the first loop! Something goes wrong!\n", self->describeNode().c_str()); + }; + + return Void(); +} + +// Restore worker +// RestoreRoleData will be casted to RestoreLoaderData or RestoreApplierData based on its type +ACTOR Future handleCollectRestoreRoleInterfaceRequest(RestoreSimpleRequest req, Reference self, Database cx) { + + while (self->isInProgress(RestoreCommandEnum::Collect_RestoreRoleInterface)) { + printf("[DEBUG] NODE:%s handleCollectRestoreRoleInterfaceRequest wait for 5s\n", self->describeNode().c_str()); + wait(delay(5.0)); + } + // Handle duplicate, assuming cmdUID is always unique for the same workload + if ( self->isCmdProcessed(req.cmdID) ) { + printf("[DEBUG] NODE:%s skip duplicate cmd:%s\n", self->describeNode().c_str(), req.cmdID.toString().c_str()); + req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); + return Void(); + } + + self->setInProgressFlag(RestoreCommandEnum::Collect_RestoreRoleInterface); + + wait( _collectRestoreRoleInterfaces(self, cx) ); + + req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); + self->processedCmd[req.cmdID] = 1; + self->clearInProgressFlag(RestoreCommandEnum::Collect_RestoreRoleInterface); + + return Void(); + } + + + +ACTOR Future handleInitVersionBatchRequest(RestoreVersionBatchRequest req, Reference self) { + // wait( delay(1.0) ); + printf("[Batch:%d] 
Node:%s Start...\n", req.batchID, self->describeNode().c_str()); + while (self->isInProgress(RestoreCommandEnum::Reset_VersionBatch)) { + printf("[DEBUG] NODE:%s handleVersionBatchRequest wait for 5s\n", self->describeNode().c_str()); + wait(delay(5.0)); + } + + // Handle duplicate, assuming cmdUID is always unique for the same workload + if ( self->isCmdProcessed(req.cmdID) ) { + printf("[DEBUG] NODE:%s skip duplicate cmd:%s\n", self->describeNode().c_str(), req.cmdID.toString().c_str()); + req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); + return Void(); + } + + self->setInProgressFlag(RestoreCommandEnum::Reset_VersionBatch); + + self->resetPerVersionBatch(); + req.reply.send(RestoreCommonReply(self->id(), req.cmdID)); + + self->processedCmd[req.cmdID] = 1; + self->clearInProgressFlag(RestoreCommandEnum::Reset_VersionBatch); + + // This actor never returns. You may cancel it in master + return Void(); +} + + +//-------Helper functions +std::string getHexString(StringRef input) { + std::stringstream ss; + for (int i = 0; itype, + getHexString(iter->param1).c_str(), getHexString(iter->param2).c_str(), iter->param1.size(), iter->param2.size()); + } + return; +} + +//TODO: Print out the backup mutation log value. The backup log value (i.e., the value in the kv pair) has the following format +//version(12B)|mutationRef|MutationRef|.... +//A mutationRef has the format: |type_4B|param1_size_4B|param2_size_4B|param1|param2. +//Note: The data is stored in little endian! You need to convert it to BigEndian so that you know how long the param1 and param2 is and how to format them! 
+void printBackupMutationRefValueHex(Standalone val_input, std::string prefix) { + std::stringstream ss; + const int version_size = 12; + const int header_size = 12; + StringRef val = val_input.contents(); + StringRefReaderMX reader(val, restore_corrupted_data()); + + int count_size = 0; + // Get the version + uint64_t version = reader.consume(); + count_size += 8; + uint32_t val_length_decode = reader.consume(); + count_size += 4; + + printf("----------------------------------------------------------\n"); + printf("To decode value:%s\n", getHexString(val).c_str()); + if ( val_length_decode != (val.size() - 12) ) { + fprintf(stderr, "%s[PARSE ERROR]!!! val_length_decode:%d != val.size:%d\n", prefix.c_str(), val_length_decode, val.size()); + } else { + if ( debug_verbose ) { + printf("%s[PARSE SUCCESS] val_length_decode:%d == (val.size:%d - 12)\n", prefix.c_str(), val_length_decode, val.size()); + } + } + + // Get the mutation header + while (1) { + // stop when reach the end of the string + if(reader.eof() ) { //|| *reader.rptr == 0xFFCheckRestoreRequestDoneErrorMX + //printf("Finish decode the value\n"); + break; + } + + + uint32_t type = reader.consume();//reader.consumeNetworkUInt32(); + uint32_t kLen = reader.consume();//reader.consumeNetworkUInt32(); + uint32_t vLen = reader.consume();//reader.consumeNetworkUInt32(); + const uint8_t *k = reader.consume(kLen); + const uint8_t *v = reader.consume(vLen); + count_size += 4 * 3 + kLen + vLen; + + if ( kLen < 0 || kLen > val.size() || vLen < 0 || vLen > val.size() ) { + fprintf(stderr, "%s[PARSE ERROR]!!!! 
kLen:%d(0x%04x) vLen:%d(0x%04x)\n", prefix.c_str(), kLen, kLen, vLen, vLen); + } + + if ( debug_verbose ) { + printf("%s---DedodeBackupMutation: Type:%d K:%s V:%s k_size:%d v_size:%d\n", prefix.c_str(), + type, getHexString(KeyRef(k, kLen)).c_str(), getHexString(KeyRef(v, vLen)).c_str(), kLen, vLen); + } + + } + if ( debug_verbose ) { + printf("----------------------------------------------------------\n"); + } +} + +void printBackupLogKeyHex(Standalone key_input, std::string prefix) { + std::stringstream ss; + const int version_size = 12; + const int header_size = 12; + StringRef val = key_input.contents(); + StringRefReaderMX reader(val, restore_corrupted_data()); + + int count_size = 0; + // Get the version + uint64_t version = reader.consume(); + count_size += 8; + uint32_t val_length_decode = reader.consume(); + count_size += 4; + + printf("----------------------------------------------------------\n"); + printf("To decode value:%s\n", getHexString(val).c_str()); + if ( val_length_decode != (val.size() - 12) ) { + fprintf(stderr, "%s[PARSE ERROR]!!! val_length_decode:%d != val.size:%d\n", prefix.c_str(), val_length_decode, val.size()); + } else { + printf("%s[PARSE SUCCESS] val_length_decode:%d == (val.size:%d - 12)\n", prefix.c_str(), val_length_decode, val.size()); + } + + // Get the mutation header + while (1) { + // stop when reach the end of the string + if(reader.eof() ) { //|| *reader.rptr == 0xFF + //printf("Finish decode the value\n"); + break; + } + + + uint32_t type = reader.consume();//reader.consumeNetworkUInt32(); + uint32_t kLen = reader.consume();//reader.consumeNetworkUInt32(); + uint32_t vLen = reader.consume();//reader.consumeNetworkUInt32(); + const uint8_t *k = reader.consume(kLen); + const uint8_t *v = reader.consume(vLen); + count_size += 4 * 3 + kLen + vLen; + + if ( kLen < 0 || kLen > val.size() || vLen < 0 || vLen > val.size() ) { + printf("%s[PARSE ERROR]!!!! 
kLen:%d(0x%04x) vLen:%d(0x%04x)\n", prefix.c_str(), kLen, kLen, vLen, vLen); + } + + printf("%s---DedoceBackupMutation: Type:%d K:%s V:%s k_size:%d v_size:%d\n", prefix.c_str(), + type, getHexString(KeyRef(k, kLen)).c_str(), getHexString(KeyRef(v, vLen)).c_str(), kLen, vLen); + + } + printf("----------------------------------------------------------\n"); +} + +void printLowerBounds(std::vector> lowerBounds) { + if ( debug_verbose == false ) + return; + + printf("[INFO] Print out %ld keys in the lowerbounds\n", lowerBounds.size()); + for (int i = 0; i < lowerBounds.size(); i++) { + printf("\t[INFO][%d] %s\n", i, getHexString(lowerBounds[i]).c_str()); + } +} + + +void printApplierKeyRangeInfo(std::map> appliers) { + printf("[INFO] appliers num:%ld\n", appliers.size()); + int index = 0; + for(auto &applier : appliers) { + printf("\t[INFO][Applier:%d] ID:%s --> KeyRange:%s\n", index, applier.first.toString().c_str(), applier.second.toString().c_str()); + } +} diff --git a/fdbserver/RestoreRoleCommon.actor.h b/fdbserver/RestoreRoleCommon.actor.h new file mode 100644 index 0000000000..073f02fad7 --- /dev/null +++ b/fdbserver/RestoreRoleCommon.actor.h @@ -0,0 +1,200 @@ +/* + * RestoreRoleCommon.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Delcare commone struct and functions used in fast restore + +#pragma once +#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_RestoreRoleCommon_G_H) + #define FDBSERVER_RestoreRoleCommon_G_H + #include "fdbserver/RestoreRoleCommon.actor.g.h" +#elif !defined(FDBSERVER_RestoreRoleCommon_H) + #define FDBSERVER_RestoreRoleCommon_H + +#include +#include "flow/Stats.h" +#include "fdbclient/FDBTypes.h" +#include "fdbclient/CommitTransaction.h" +#include "fdbrpc/fdbrpc.h" +#include "fdbserver/CoordinationInterface.h" +#include "fdbrpc/Locality.h" + +#include "fdbserver/RestoreUtil.h" +#include "fdbserver/RestoreWorkerInterface.h" + +extern bool debug_verbose; +extern double mutationVectorThreshold; + +struct RestoreRoleInterface; +struct RestoreLoaderInterface; +struct RestoreApplierInterface; + +struct RestoreRoleData; +struct RestoreMasterData; + +struct RestoreSimpleRequest; + +ACTOR Future handleHeartbeat(RestoreSimpleRequest req, UID id); +ACTOR Future handleCollectRestoreRoleInterfaceRequest(RestoreSimpleRequest req, Reference self, Database cx); +ACTOR Future handleInitVersionBatchRequest(RestoreVersionBatchRequest req, Reference self); + +ACTOR Future _collectRestoreRoleInterfaces(Reference self, Database cx); + +// Helper class for reading restore data from a buffer and throwing the right errors. +// This struct is mostly copied from StringRefReader. We add a sanity check in this struct. +// TODO: Merge this struct with StringRefReader. +struct StringRefReaderMX { + StringRefReaderMX(StringRef s = StringRef(), Error e = Error()) : rptr(s.begin()), end(s.end()), failure_error(e), str_size(s.size()) {} + + // Return remainder of data as a StringRef + StringRef remainder() { + return StringRef(rptr, end - rptr); + } + + // Return a pointer to len bytes at the current read position and advance read pos + //Consume a little-Endian data. 
Since we only run on little-Endian machine, the data on storage is little Endian + const uint8_t * consume(unsigned int len) { + if(rptr == end && len != 0) + throw end_of_stream(); + const uint8_t *p = rptr; + rptr += len; + if(rptr > end) { + printf("[ERROR] StringRefReaderMX throw error! string length:%d\n", str_size); + printf("!!!!!!!!!!!![ERROR]!!!!!!!!!!!!!! Worker may die due to the error. Master will stuck when a worker die\n"); + throw failure_error; + } + return p; + } + + // Return a T from the current read position and advance read pos + template const T consume() { + return *(const T *)consume(sizeof(T)); + } + + // Functions for consuming big endian (network byte oselfer) integers. + // Consumes a big endian number, swaps it to little endian, and returns it. + const int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume< int32_t>());} + const uint32_t consumeNetworkUInt32() { return bigEndian32( consume());} + + const int64_t consumeNetworkInt64() { return (int64_t)bigEndian64((uint32_t)consume< int64_t>());} + const uint64_t consumeNetworkUInt64() { return bigEndian64( consume());} + + bool eof() { return rptr == end; } + + const uint8_t *rptr, *end; + const int str_size; + Error failure_error; +}; + +struct RestoreRoleData : NonCopyable, public ReferenceCounted { +public: + RestoreRole role; + UID nodeID; // RestoreLoader role ID + int nodeIndex; // RestoreLoader role index, which is continuous and easy for debuggging + + std::map loadersInterf; + std::map appliersInterf; + RestoreApplierInterface masterApplierInterf; + + std::map processedCmd; + uint32_t inProgressFlag = 0; + + RestoreRoleData() : role(RestoreRole::Invalid) {}; + + ~RestoreRoleData() {}; + + UID id() const { return nodeID; } + + bool isCmdProcessed(CMDUID const &cmdID) { + return processedCmd.find(cmdID) != processedCmd.end(); + } + + // Helper functions to set/clear the flag when a worker is in the middle of processing an actor. 
+ void setInProgressFlag(RestoreCommandEnum phaseEnum) { + int phase = (int) phaseEnum; + ASSERT(phase < 32); + inProgressFlag |= (1UL << phase); + } + + void clearInProgressFlag(RestoreCommandEnum phaseEnum) { + int phase = (int) phaseEnum; + ASSERT(phase < 32); + inProgressFlag &= ~(1UL << phase); + } + + bool isInProgress(RestoreCommandEnum phaseEnum) { + int phase = (int) phaseEnum; + ASSERT(phase < 32); + return (inProgressFlag & (1UL << phase)); + } + + void resetPerVersionBatch() { + processedCmd.clear(); + inProgressFlag = 0; + } + + void clearInterfaces() { + loadersInterf.clear(); + appliersInterf.clear(); + } + + std::string describeNode() { + std::stringstream ss; + ss << "RestoreRoleData role:" << getRoleStr(role); + return ss.str(); + } + + // TODO: To remove this function + std::vector getApplierIDs() { + std::vector applierIDs; + for (auto &applier : appliersInterf) { + applierIDs.push_back(applier.first); + } + return applierIDs; + } + + // TODO: To remove this function + std::vector getLoaderIDs() { + std::vector loaderIDs; + for (auto &loader : loadersInterf) { + loaderIDs.push_back(loader.first); + } + + return loaderIDs; + } + + // TODO: To remove this function + std::vector getWorkerIDs() { + std::vector workerIDs; + for (auto &loader : loadersInterf) { + workerIDs.push_back(loader.first); + } + for (auto &applier : appliersInterf) { + workerIDs.push_back(applier.first); + } + + return workerIDs; + } + +}; + +void printLowerBounds(std::vector> lowerBounds); +void printApplierKeyRangeInfo(std::map> appliers); + +#endif \ No newline at end of file diff --git a/fdbserver/RestoreUtil.actor.cpp b/fdbserver/RestoreUtil.actor.cpp new file mode 100644 index 0000000000..ed54d2ef6b --- /dev/null +++ b/fdbserver/RestoreUtil.actor.cpp @@ -0,0 +1,70 @@ +/* + * RestoreUtil.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbserver/RestoreUtil.h" + +#include "flow/actorcompiler.h" // This must be the last #include. + +std::vector RestoreRoleStr = {"Invalid", "Master", "Loader", "Applier"}; +int numRoles = RestoreRoleStr.size(); + +std::string getRoleStr(RestoreRole role) { + if ( (int) role >= numRoles || (int) role < 0) { + printf("[ERROR] role:%d is out of scope\n", (int) role); + return "[Unset]"; + } + return RestoreRoleStr[(int)role]; +} + +// CMDUID implementation +void CMDUID::initPhase(RestoreCommandEnum newPhase) { + printf("CMDID, current phase:%d, new phase:%d\n", phase, newPhase); + phase = (uint16_t) newPhase; + cmdID = 0; +} + +void CMDUID::nextPhase() { + phase++; + cmdID = 0; +} + +void CMDUID::nextCmd() { + cmdID++; +} + +RestoreCommandEnum CMDUID::getPhase() { + return (RestoreCommandEnum) phase; +} + +void CMDUID::setPhase(RestoreCommandEnum newPhase) { + phase = (uint16_t) newPhase; +} + +void CMDUID::setBatch(int newBatchIndex) { + batch = newBatchIndex; +} + +uint64_t CMDUID::getIndex() { + return cmdID; +} + +std::string CMDUID::toString() const { + return format("%04ld|%04ld|%016lld", batch, phase, cmdID); +} diff --git a/fdbserver/RestoreUtil.h b/fdbserver/RestoreUtil.h new file mode 100644 index 0000000000..4e9ceed149 --- /dev/null +++ b/fdbserver/RestoreUtil.h @@ -0,0 +1,146 @@ +/* + * RestoreUtil.h + * + * This source file is part of the 
FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This file defines the commonly used data structure and functions +// that are used by both RestoreWorker and RestoreRoles(Master, Loader, and Applier) + +#ifndef FDBSERVER_RESTOREUTIL_H +#define FDBSERVER_RESTOREUTIL_H +#pragma once + +#include "fdbclient/Tuple.h" +#include "flow/flow.h" +#include "flow/Stats.h" +#include "fdbrpc/fdbrpc.h" +#include "fdbrpc/IAsyncFile.h" + + +// RestoreCommandEnum is also used as the phase ID for CMDUID +enum class RestoreCommandEnum {Init = 0, + Set_Role, Set_Role_Done, + Sample_Range_File, Sample_Log_File, Sample_File_Done, + Loader_Send_Sample_Mutation_To_Applier, Loader_Send_Sample_Mutation_To_Applier_Done, //7 + Calculate_Applier_KeyRange, Get_Applier_KeyRange, Get_Applier_KeyRange_Done, //10 + Assign_Applier_KeyRange, Assign_Applier_KeyRange_Done, //12 + Assign_Loader_Range_File, Assign_Loader_Log_File, Assign_Loader_File_Done,//15 + Loader_Send_Mutations_To_Applier, Loader_Send_Mutations_To_Applier_Done,//17 + Apply_Mutation_To_DB, Apply_Mutation_To_DB_Skip, //19 + Loader_Notify_Appler_To_Apply_Mutation, + Notify_Loader_ApplierKeyRange, Notify_Loader_ApplierKeyRange_Done, //22 + Finish_Restore, Reset_VersionBatch, Set_WorkerInterface, Collect_RestoreRoleInterface, + Heart_Beat}; //23 +BINARY_SERIALIZABLE(RestoreCommandEnum); + +enum class RestoreRole 
{Invalid = 0, Master = 1, Loader, Applier}; +BINARY_SERIALIZABLE( RestoreRole ); + +extern std::vector RestoreRoleStr; +extern int numRoles; + +std::string getRoleStr(RestoreRole role); + +// Restore command's UID. uint64_t part[2]; +// part[0] is the phase id, part[1] is the command index in the phase. +// TODO: Add another field to indicate version-batch round +class CMDUID { +public: + uint16_t batch; + uint16_t phase; + uint64_t cmdID; + CMDUID() : batch(0), phase(0), cmdID(0) { } + CMDUID( uint16_t a, uint64_t b ) { batch = 0; phase=a; cmdID=b; } + CMDUID(const CMDUID &cmd) { batch = cmd.batch; phase = cmd.phase; cmdID = cmd.cmdID; } + + void initPhase(RestoreCommandEnum phase); + + void nextPhase(); // Set to the next phase. + + void nextCmd(); // Increase the command index at the same phase + + RestoreCommandEnum getPhase(); + void setPhase(RestoreCommandEnum newPhase); + void setBatch(int newBatchIndex); + + uint64_t getIndex(); + + std::string toString() const; + + bool operator == ( const CMDUID& r ) const { return batch == r.batch && phase == r.phase && cmdID == r.cmdID; } + bool operator != ( const CMDUID& r ) const { return batch != r.batch || phase != r.phase || cmdID != r.cmdID; } + bool operator < ( const CMDUID& r ) const { return batch < r.batch || (batch == r.batch && phase < r.phase) || (batch == r.batch && phase == r.phase && cmdID < r.cmdID); } + + //uint64_t hash() const { return first(); } + //uint64_t first() const { return part[0]; } + //uint64_t second() const { return part[1]; } + + template + void serialize_unversioned(Ar& ar) { // Changing this serialization format will affect key definitions, so can't simply be versioned! 
+ serializer(ar, batch, phase, cmdID); + } +}; +template void load( Ar& ar, CMDUID& uid ) { uid.serialize_unversioned(ar); } +template void save( Ar& ar, CMDUID const& uid ) { const_cast(uid).serialize_unversioned(ar); } + + struct FastRestoreStatus { + double curWorkloadSize; + double curRunningTime; + double curSpeed; + + double totalWorkloadSize; + double totalRunningTime; + double totalSpeed; +}; + +// Common restore request/response interface +// Reply type +struct RestoreCommonReply { + UID id; // unique ID of the server who sends the reply + CMDUID cmdID; // The restore command for the reply + + RestoreCommonReply() : id(UID()), cmdID(CMDUID()) {} + explicit RestoreCommonReply(UID id, CMDUID cmdID) : id(id), cmdID(cmdID) {} + + std::string toString() const { + std::stringstream ss; + ss << "ServerNodeID:" << id.toString() << " CMDID:" << cmdID.toString(); + return ss.str(); + } + + template + void serialize(Ar& ar) { + serializer(ar, id, cmdID); + } +}; + +struct RestoreSimpleRequest : TimedRequest { + CMDUID cmdID; + + ReplyPromise reply; + + RestoreSimpleRequest() : cmdID(CMDUID()) {} + explicit RestoreSimpleRequest(CMDUID cmdID) : cmdID(cmdID) {} + + template + void serialize( Ar& ar ) { + serializer(ar, cmdID, reply); + } +}; + +#endif //FDBSERVER_RESTOREUTIL_ACTOR_H \ No newline at end of file diff --git a/fdbserver/RestoreWorkerInterface.h b/fdbserver/RestoreWorkerInterface.h index 35d4cdd255..cd1abd44f7 100644 --- a/fdbserver/RestoreWorkerInterface.h +++ b/fdbserver/RestoreWorkerInterface.h @@ -18,8 +18,10 @@ * limitations under the License. 
*/ -#ifndef FDBSERVER_RestoreWorkerInterface_H -#define FDBSERVER_RestoreWorkerInterface_H +// Declare and define the interface for restore worker/loader/applier + +#ifndef FDBSERVER_RESTORE_WORKER_INTERFACE_H +#define FDBSERVER_RESTORE_WORKER_INTERFACE_H #pragma once #include @@ -30,11 +32,12 @@ #include "fdbserver/CoordinationInterface.h" #include "fdbrpc/Locality.h" +#include "fdbserver/RestoreUtil.h" +//#include "fdbserver/RestoreRoleCommon.actor.h" + +#include "flow/actorcompiler.h" // has to be last include class RestoreConfig; -enum class RestoreRole {Invalid = 0, Master = 1, Loader, Applier}; -extern std::vector RestoreRoleStr; -BINARY_SERIALIZABLE( RestoreRole ); // Timeout threshold in seconds for restore commands @@ -43,8 +46,7 @@ extern int FastRestore_Failure_Timeout; struct RestoreCommonReply; struct GetKeyRangeReply; struct GetKeyRangeReply; -struct RestoreSetRoleRequest; -struct RestoreSimpleRequest; +struct RestoreRecruitRoleRequest; struct RestoreLoadFileRequest; struct RestoreGetApplierKeyRangeRequest; struct RestoreSetApplierKeyRangeRequest; @@ -54,124 +56,87 @@ struct RestoreCalculateApplierKeyRangeRequest; struct RestoreSendMutationVectorRequest; struct RestoreSetApplierKeyRangeVectorRequest; -// RestoreCommandEnum is also used as the phase ID for CMDUID -enum class RestoreCommandEnum {Init = 0, - Set_Role, Set_Role_Done, - Sample_Range_File, Sample_Log_File, Sample_File_Done, - Loader_Send_Sample_Mutation_To_Applier, Loader_Send_Sample_Mutation_To_Applier_Done, //7 - Calculate_Applier_KeyRange, Get_Applier_KeyRange, Get_Applier_KeyRange_Done, //10 - Assign_Applier_KeyRange, Assign_Applier_KeyRange_Done, //12 - Assign_Loader_Range_File, Assign_Loader_Log_File, Assign_Loader_File_Done,//15 - Loader_Send_Mutations_To_Applier, Loader_Send_Mutations_To_Applier_Done,//17 - Apply_Mutation_To_DB, Apply_Mutation_To_DB_Skip, //19 - Loader_Notify_Appler_To_Apply_Mutation, - Notify_Loader_ApplierKeyRange, Notify_Loader_ApplierKeyRange_Done, //22 - 
Finish_Restore, RESET_VersionBatch, Set_WorkerInterface}; //23 -BINARY_SERIALIZABLE(RestoreCommandEnum); -// Restore command's UID. uint64_t part[2]; -// part[0] is the phase id, part[1] is the command index in the phase. -// TODO: Add another field to indicate version-batch round -class CMDUID { -public: - uint16_t batch; - uint16_t phase; - uint64_t cmdID; - CMDUID() : batch(0), phase(0), cmdID(0) { } - CMDUID( uint16_t a, uint64_t b ) { batch = 0; phase=a; cmdID=b; } - CMDUID(const CMDUID &cmd) { batch = cmd.batch; phase = cmd.phase; cmdID = cmd.cmdID; } +struct RestoreWorkerInterface { + UID interfID; - void initPhase(RestoreCommandEnum phase); + RequestStream heartbeat; + RequestStream recruitRole; + RequestStream terminateWorker; - void nextPhase(); // Set to the next phase. + bool operator == (RestoreWorkerInterface const& r) const { return id() == r.id(); } + bool operator != (RestoreWorkerInterface const& r) const { return id() != r.id(); } - void nextCmd(); // Increase the command index at the same phase + UID id() const { return interfID; } //cmd.getEndpoint().token; - RestoreCommandEnum getPhase(); - void setPhase(RestoreCommandEnum newPhase); - void setBatch(int newBatchIndex); + NetworkAddress address() const { return recruitRole.getEndpoint().addresses.address; } - uint64_t getIndex(); + void initEndpoints() { + heartbeat.getEndpoint( TaskClusterController ); + recruitRole.getEndpoint( TaskClusterController );// Q: Why do we need this? 
+ terminateWorker.getEndpoint( TaskClusterController ); - std::string toString() const; - - bool operator == ( const CMDUID& r ) const { return batch == r.batch && phase == r.phase && cmdID == r.cmdID; } - bool operator != ( const CMDUID& r ) const { return batch != r.batch || phase != r.phase || cmdID != r.cmdID; } - bool operator < ( const CMDUID& r ) const { return batch < r.batch || (batch == r.batch && phase < r.phase) || (batch == r.batch && phase == r.phase && cmdID < r.cmdID); } - - //uint64_t hash() const { return first(); } - //uint64_t first() const { return part[0]; } - //uint64_t second() const { return part[1]; } + interfID = g_random->randomUniqueID(); + } template - void serialize_unversioned(Ar& ar) { // Changing this serialization format will affect key definitions, so can't simply be versioned! - serializer(ar, batch, phase, cmdID); + void serialize( Ar& ar ) { + serializer(ar, interfID, heartbeat, recruitRole, terminateWorker); } }; -template void load( Ar& ar, CMDUID& uid ) { uid.serialize_unversioned(ar); } -template void save( Ar& ar, CMDUID const& uid ) { const_cast(uid).serialize_unversioned(ar); } +struct RestoreRoleInterface { +public: + RestoreRole role; -// NOTE: is cmd's Endpoint token the same with the request's token for the same node? 
-struct RestoreInterface { + RestoreRoleInterface() { + role = RestoreRole::Invalid; + } +}; + +struct RestoreLoaderInterface : RestoreRoleInterface { +public: UID nodeID; RequestStream heartbeat; - RequestStream setRole; RequestStream sampleRangeFile; RequestStream sampleLogFile; - RequestStream sendSampleMutationVector; - RequestStream calculateApplierKeyRange; - RequestStream getApplierKeyRangeRequest; - RequestStream setApplierKeyRangeRequest; // To delete RequestStream setApplierKeyRangeVectorRequest; RequestStream loadRangeFile; RequestStream loadLogFile; - RequestStream sendMutationVector; - RequestStream applyToDB; RequestStream initVersionBatch; - RequestStream setWorkerInterface; + RequestStream collectRestoreRoleInterfaces; // TODO: Change to collectRestoreRoleInterfaces RequestStream finishRestore; - // ToDelete -// RequestStream< struct RestoreCommand > cmd; // Restore commands from master to loader and applier -// RequestStream< struct RestoreRequest > request; // Restore requests used by loader and applier + bool operator == (RestoreWorkerInterface const& r) const { return id() == r.id(); } + bool operator != (RestoreWorkerInterface const& r) const { return id() != r.id(); } - bool operator == (RestoreInterface const& r) const { return id() == r.id(); } - bool operator != (RestoreInterface const& r) const { return id() != r.id(); } + UID id() const { return nodeID; } - UID id() const { return nodeID; } //cmd.getEndpoint().token; - - NetworkAddress address() const { return setRole.getEndpoint().addresses.address; } + NetworkAddress address() const { return heartbeat.getEndpoint().addresses.address; } void initEndpoints() { heartbeat.getEndpoint( TaskClusterController ); - setRole.getEndpoint( TaskClusterController );// Q: Why do we need this? 
sampleRangeFile.getEndpoint( TaskClusterController ); sampleLogFile.getEndpoint( TaskClusterController ); - sendSampleMutationVector.getEndpoint( TaskClusterController ); - calculateApplierKeyRange.getEndpoint( TaskClusterController ); - getApplierKeyRangeRequest.getEndpoint( TaskClusterController ); - setApplierKeyRangeRequest.getEndpoint( TaskClusterController ); setApplierKeyRangeVectorRequest.getEndpoint( TaskClusterController ); loadRangeFile.getEndpoint( TaskClusterController ); loadLogFile.getEndpoint( TaskClusterController ); - sendMutationVector.getEndpoint( TaskClusterController ); - applyToDB.getEndpoint( TaskClusterController ); initVersionBatch.getEndpoint( TaskClusterController ); - setWorkerInterface.getEndpoint( TaskClusterController ); + collectRestoreRoleInterfaces.getEndpoint( TaskClusterController ); + finishRestore.getEndpoint( TaskClusterController ); nodeID = g_random->randomUniqueID(); @@ -179,10 +144,73 @@ struct RestoreInterface { template void serialize( Ar& ar ) { - serializer(ar, nodeID, heartbeat, setRole, sampleRangeFile, sampleLogFile, sendSampleMutationVector, - calculateApplierKeyRange, getApplierKeyRangeRequest, setApplierKeyRangeRequest, setApplierKeyRangeVectorRequest, - loadRangeFile, loadLogFile, sendMutationVector, applyToDB, initVersionBatch, setWorkerInterface, - finishRestore); + serializer(ar, nodeID, heartbeat, sampleRangeFile, sampleLogFile, + setApplierKeyRangeVectorRequest, loadRangeFile, loadLogFile, + initVersionBatch, collectRestoreRoleInterfaces, finishRestore); + } +}; + + +struct RestoreApplierInterface : RestoreRoleInterface { +public: + UID nodeID; + + RequestStream heartbeat; + + RequestStream calculateApplierKeyRange; + RequestStream getApplierKeyRangeRequest; + RequestStream setApplierKeyRangeRequest; + + RequestStream sendSampleMutationVector; + RequestStream sendMutationVector; + + RequestStream applyToDB; + + RequestStream initVersionBatch; + + RequestStream collectRestoreRoleInterfaces; + + 
RequestStream finishRestore; + + + bool operator == (RestoreWorkerInterface const& r) const { return id() == r.id(); } + bool operator != (RestoreWorkerInterface const& r) const { return id() != r.id(); } + + UID id() const { return nodeID; } + + NetworkAddress address() const { return heartbeat.getEndpoint().addresses.address; } + + void initEndpoints() { + heartbeat.getEndpoint( TaskClusterController ); + + calculateApplierKeyRange.getEndpoint( TaskClusterController ); + getApplierKeyRangeRequest.getEndpoint( TaskClusterController ); + setApplierKeyRangeRequest.getEndpoint( TaskClusterController ); + + sendSampleMutationVector.getEndpoint( TaskClusterController ); + sendMutationVector.getEndpoint( TaskClusterController ); + + applyToDB.getEndpoint( TaskClusterController ); + + initVersionBatch.getEndpoint( TaskClusterController ); + + collectRestoreRoleInterfaces.getEndpoint( TaskClusterController ); + + finishRestore.getEndpoint( TaskClusterController ); + + nodeID = g_random->randomUniqueID(); + } + + template + void serialize( Ar& ar ) { + serializer(ar, nodeID, heartbeat, calculateApplierKeyRange, + getApplierKeyRangeRequest, setApplierKeyRangeRequest, + sendSampleMutationVector, sendMutationVector, + applyToDB, initVersionBatch, collectRestoreRoleInterfaces, finishRestore); + } + + std::string toString() { + return nodeID.toString(); } }; @@ -215,21 +243,26 @@ struct LoadingParam { }; -struct RestoreSetRoleRequest : TimedRequest { +struct RestoreRecruitRoleRequest : TimedRequest { CMDUID cmdID; RestoreRole role; - int nodeIndex; - UID masterApplierID; + int nodeIndex; // Each role is a node ReplyPromise reply; - RestoreSetRoleRequest() : cmdID(CMDUID()), role(RestoreRole::Invalid) {} - explicit RestoreSetRoleRequest(CMDUID cmdID, RestoreRole role, int nodeIndex, UID masterApplierID) : - cmdID(cmdID), role(role), nodeIndex(nodeIndex), masterApplierID(masterApplierID) {} + RestoreRecruitRoleRequest() : cmdID(CMDUID()), role(RestoreRole::Invalid) {} + explicit 
RestoreRecruitRoleRequest(CMDUID cmdID, RestoreRole role, int nodeIndex) : + cmdID(cmdID), role(role), nodeIndex(nodeIndex){} template void serialize( Ar& ar ) { - serializer(ar, cmdID, role, nodeIndex, masterApplierID, reply); + serializer(ar, cmdID, role, nodeIndex, reply); + } + + std::string printable() { + std::stringstream ss; + ss << "CMDID:" << cmdID.toString() << " Role:" << getRoleStr(role) << " NodeIndex:" << nodeIndex; + return ss.str(); } }; @@ -265,20 +298,6 @@ struct RestoreSendMutationVectorRequest : TimedRequest { } }; -// CalculateApplierKeyRange, applyToDB -struct RestoreSimpleRequest : TimedRequest { - CMDUID cmdID; - - ReplyPromise reply; - - RestoreSimpleRequest() : cmdID(CMDUID()) {} - explicit RestoreSimpleRequest(CMDUID cmdID) : cmdID(cmdID) {} - - template - void serialize( Ar& ar ) { - serializer(ar, cmdID, reply); - } -}; struct RestoreCalculateApplierKeyRangeRequest : TimedRequest { CMDUID cmdID; @@ -358,28 +377,6 @@ struct RestoreSetApplierKeyRangeVectorRequest : TimedRequest { } }; - - -// Reply type -struct RestoreCommonReply { - UID id; // unique ID of the server who sends the reply - CMDUID cmdID; // The restore command for the reply - - RestoreCommonReply() : id(UID()), cmdID(CMDUID()) {} - explicit RestoreCommonReply(UID id, CMDUID cmdID) : id(id), cmdID(cmdID) {} - - std::string toString() const { - std::stringstream ss; - ss << "ServerNodeID:" << id.toString() << " CMDID:" << cmdID.toString(); - return ss.str(); - } - - template - void serialize(Ar& ar) { - serializer(ar, id, cmdID); - } -}; - struct GetKeyRangeReply : RestoreCommonReply { int index; Standalone lowerBound; // inclusive diff --git a/fdbserver/fdbserver.vcxproj b/fdbserver/fdbserver.vcxproj index d58d7fa156..0441e11575 100644 --- a/fdbserver/fdbserver.vcxproj +++ b/fdbserver/fdbserver.vcxproj @@ -53,7 +53,10 @@ + + + @@ -199,7 +202,13 @@ - + + + false + + + false + false @@ -209,6 +218,7 @@ false +