From 1025a0da37f3517da99e540fea43817a8fd0ad06 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Wed, 3 Mar 2021 21:45:33 -0800 Subject: [PATCH 1/7] File-based backup now uses async rename to prevent blocking the network thread in the event of slow filesystem metadata operations. Backup now opens its write-only files without a block cache so that writes to disk will be larger (1MB by default instead of 4k). --- fdbclient/BackupContainer.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbclient/BackupContainer.actor.cpp b/fdbclient/BackupContainer.actor.cpp index a886fd2444..6695dad269 100644 --- a/fdbclient/BackupContainer.actor.cpp +++ b/fdbclient/BackupContainer.actor.cpp @@ -1314,7 +1314,7 @@ public: wait(f->m_file->sync()); std::string name = f->m_file->getFilename(); f->m_file.clear(); - renameFile(name, f->m_finalFullPath); + wait(IAsyncFileSystem::filesystem()->renameFile(name, f->m_finalFullPath)); return Void(); } @@ -1337,7 +1337,7 @@ public: }; Future> writeFile(std::string path) { - int flags = IAsyncFile::OPEN_NO_AIO | IAsyncFile::OPEN_CREATE | IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE | IAsyncFile::OPEN_READWRITE; + int flags = IAsyncFile::OPEN_NO_AIO | IAsyncFile::OPEN_UNCACHED | IAsyncFile::OPEN_CREATE | IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE | IAsyncFile::OPEN_READWRITE; std::string fullPath = joinPath(m_path, path); platform::createDirectory(parentDirectory(fullPath)); std::string temp = fullPath + "." + deterministicRandom()->randomUniqueID().toString() + ".temp"; From b2f313774f5b5780d8c15060d24ad14c15b81e30 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Wed, 3 Mar 2021 22:24:00 -0800 Subject: [PATCH 2/7] Added release notes. --- .../sphinx/source/release-notes/release-notes-620.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/documentation/sphinx/source/release-notes/release-notes-620.rst b/documentation/sphinx/source/release-notes/release-notes-620.rst index 35094d3497..50098b9ec5 100644 --- a/documentation/sphinx/source/release-notes/release-notes-620.rst +++ b/documentation/sphinx/source/release-notes/release-notes-620.rst @@ -4,6 +4,11 @@ Release Notes ############# +6.2.33 +====== +* Fix backup agent stall when writing to local filesystem with slow metadata operations. `(PR #4428) `_ +* Backup agent writes to local filesystems without 4k caching layer, resulting in larger write operations. `(PR #4428) `_ + 6.2.32 ====== * Fix an issue where symbolic links in cmake-built RPMs are broken if you unpack the RPM to a custom directory. `(PR #4380) `_ From cd5341ceff4a86af410cc47abc8145f8ecc149c3 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Wed, 3 Mar 2021 22:26:43 -0800 Subject: [PATCH 3/7] Make release note more clear. --- documentation/sphinx/source/release-notes/release-notes-620.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/sphinx/source/release-notes/release-notes-620.rst b/documentation/sphinx/source/release-notes/release-notes-620.rst index 50098b9ec5..f8df76b7f8 100644 --- a/documentation/sphinx/source/release-notes/release-notes-620.rst +++ b/documentation/sphinx/source/release-notes/release-notes-620.rst @@ -7,7 +7,7 @@ Release Notes 6.2.33 ====== * Fix backup agent stall when writing to local filesystem with slow metadata operations. `(PR #4428) `_ -* Backup agent writes to local filesystems without 4k caching layer, resulting in larger write operations. 
`(PR #4428) `_ +* Backup agent no longer uses 4k block caching layer on local output files so that write operations are larger. `(PR #4428) `_ 6.2.32 ====== From 1d00c413857b3c282deb1f349f869cc7f06f9e2c Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Fri, 5 Mar 2021 03:35:10 -0800 Subject: [PATCH 4/7] Bug fixes: Avoid empty writes, avoid 0 length truncation, and make AsyncFileNonDurable support writes at arbitrary offsets and lengths. Increase randomness of IBackupFile appends in backup container unit test. --- fdbclient/BackupContainer.actor.cpp | 32 +++++++++++++++++++++-------- fdbclient/Knobs.cpp | 2 +- fdbrpc/AsyncFileNonDurable.actor.h | 23 ++++++++++++++------- 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/fdbclient/BackupContainer.actor.cpp b/fdbclient/BackupContainer.actor.cpp index 6695dad269..4d1331a674 100644 --- a/fdbclient/BackupContainer.actor.cpp +++ b/fdbclient/BackupContainer.actor.cpp @@ -1294,6 +1294,11 @@ public: } Future flush(int size) { + // Avoid empty write + if(size == 0) { + return Void(); + } + ASSERT(size <= m_buffer.size()); // Keep a reference to the old buffer @@ -1310,7 +1315,10 @@ public: ACTOR static Future finish_impl(Reference f) { wait(f->flush(f->m_buffer.size())); - wait(f->m_file->truncate(f->size())); // Some IAsyncFile implementations extend in whole block sizes. + // Avoid truncation from 0 to 0 because simulation won't allow it since KAIO would not. + if(f->size() != 0) { + wait(f->m_file->truncate(f->size())); // Some IAsyncFile implementations extend in whole block sizes. + } wait(f->m_file->sync()); std::string name = f->m_file->getFilename(); f->m_file.clear(); @@ -1776,21 +1784,27 @@ int chooseFileSize(std::vector &sizes) { } ACTOR Future writeAndVerifyFile(Reference c, Reference f, int size) { - state Standalone content; - if(size > 0) { - content = makeString(size); - for(int i = 0; i < content.size(); ++i) - mutateString(content)[i] = (uint8_t)deterministicRandom()->randomInt(0, 256); + state Standalone> content; + content.resize(content.arena(), size); + for(int i = 0; i < content.size(); ++i) { + content[i] = (uint8_t)deterministicRandom()->randomInt(0, 256); + } - wait(f->append(content.begin(), content.size())); + state VectorRef sendBuf = content; + while(sendBuf.size() > 0) { + state int n = std::min(sendBuf.size(), deterministicRandom()->randomInt(1, 16384)); + wait(f->append(sendBuf.begin(), n)); + sendBuf.pop_front(n); } wait(f->finish()); + state Reference inputFile = wait(c->readFile(f->getFileName())); int64_t fileSize = wait(inputFile->size()); ASSERT(size == fileSize); if(size > 0) { - state Standalone buf = makeString(size); - int b = wait(inputFile->read(mutateString(buf), buf.size(), 0)); + state Standalone> buf; + buf.resize(buf.arena(), fileSize); + int b = wait(inputFile->read(buf.begin(), buf.size(), 0)); ASSERT(b == buf.size()); ASSERT(buf == content); } diff --git a/fdbclient/Knobs.cpp b/fdbclient/Knobs.cpp index 285c966b54..b19ec67623 100644 --- a/fdbclient/Knobs.cpp +++ b/fdbclient/Knobs.cpp @@ -115,7 +115,7 @@ ClientKnobs::ClientKnobs(bool randomize) { init( TASKBUCKET_MAX_TASK_KEYS, 1000 ); if( randomize && BUGGIFY ) TASKBUCKET_MAX_TASK_KEYS = 20; //Backup - init( BACKUP_LOCAL_FILE_WRITE_BLOCK, 1024*1024 ); if( randomize && BUGGIFY ) BACKUP_LOCAL_FILE_WRITE_BLOCK = 100; + init( BACKUP_LOCAL_FILE_WRITE_BLOCK, 1024*1024 ); if( randomize && BUGGIFY ) BACKUP_LOCAL_FILE_WRITE_BLOCK = deterministicRandom()->randomInt(100, 10000); init( BACKUP_CONCURRENT_DELETES, 100 ); init( 
BACKUP_SIMULATED_LIMIT_BYTES, 1e6 ); if( randomize && BUGGIFY ) BACKUP_SIMULATED_LIMIT_BYTES = 1000; init( BACKUP_GET_RANGE_LIMIT_BYTES, 1e6 ); diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 7e8e551b3e..652e9774c2 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -455,20 +455,25 @@ private: debugFileCheck("AsyncFileNonDurableWriteAfterWait", self->filename, dataCopy.begin(), offset, length); - //Only page-aligned writes are supported - ASSERT(offset % 4096 == 0 && length % 4096 == 0); - //Non-durable writes should introduce errors at the page level and corrupt at the sector level //Otherwise, we can perform the entire write at once - int pageLength = saveDurable ? length : 4096; - int sectorLength = saveDurable ? length : 512; + int diskPageLength = saveDurable ? length : 4096; + int diskSectorLength = saveDurable ? length : 512; vector> writeFutures; - for(int writeOffset = 0; writeOffset < length; writeOffset += pageLength) { + for(int writeOffset = 0; writeOffset < length; ) { + // Number of bytes to the next diskPageLength offset within the write or the end of the write. + // First and last pages can be short. + int pageLength = std::min((int64_t)length - writeOffset, diskPageLength - ((offset + writeOffset) % diskPageLength)); + //choose a random action to perform on this page write (write correctly, corrupt, or don't write) KillMode pageKillMode = (KillMode)deterministicRandom()->randomInt(0, self->killMode + 1); - for(int pageOffset = 0; pageOffset < pageLength; pageOffset += sectorLength) { + for(int pageOffset = 0; pageOffset < pageLength; ) { + // Number of bytes to the next diskSectorLength offset within the write or the end of the write. + // First and last pages can be short. + int sectorLength = std::min((int64_t)length - (writeOffset + pageOffset), diskSectorLength - ((offset + writeOffset + pageOffset) % diskSectorLength)); + //If saving durable, then perform the write correctly. Otherwise, perform the write correcly with a probability of 1/3. //If corrupting the write, then this sector will be written correctly with a 1/4 chance if(saveDurable || pageKillMode == NO_CORRUPTION || (pageKillMode == FULL_CORRUPTION && deterministicRandom()->random01() < 0.25)) { @@ -523,7 +528,11 @@ private: TraceEvent("AsyncFileNonDurable_DroppedWrite", self->id).detail("Offset", offset + writeOffset + pageOffset).detail("Length", sectorLength).detail("Filename", self->filename); TEST(true); //AsyncFileNonDurable dropped write } + + pageOffset += sectorLength; } + + writeOffset += pageLength; } wait(waitForAll(writeFutures)); From 8a4aca3f4719710612df8eda13b13535a1702fc8 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sun, 7 Mar 2021 22:19:04 -0800 Subject: [PATCH 5/7] Bug fix: In simulation only, RangeFileWriter was not waiting on a write future in one location. 
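padEnd() returns a Flow future, and the BUGGIFY branch was discarding it rather than wait()ing on it, so in simulation the padding write could still be outstanding when the range file was finalized. A minimal sketch of the same dropped-future hazard in standard C++, assuming a hypothetical padEndAsync() standing in for Flow's RangeFileWriter::padEnd() and std::future standing in for Flow's Future<Void>:

    #include <chrono>
    #include <future>
    #include <iostream>
    #include <thread>

    // Hypothetical stand-in for an asynchronous pad-to-block-boundary write:
    // it starts background work and returns a future that completes when the
    // work is done.
    std::future<void> padEndAsync() {
        std::promise<void> done;
        std::future<void> result = done.get_future();
        std::thread([p = std::move(done)]() mutable {
            std::this_thread::sleep_for(std::chrono::milliseconds(50)); // simulated slow write
            p.set_value();
        }).detach();
        return result;
    }

    int main() {
        // Buggy pattern: the returned future is discarded, so the padding write
        // may still be in flight when the caller goes on to finalize the file.
        padEndAsync();

        // Fixed pattern, analogous to wait(rangeFile.padEnd()) in this patch:
        // block until the padding write has actually completed.
        padEndAsync().wait();

        std::cout << "padding completed before the file was finished" << std::endl;

        // Give the leaked write from the buggy call time to finish before exit.
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
        return 0;
    }
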
--- fdbclient/FileBackupAgent.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp index 960988b48a..0e75449267 100644 --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -1147,7 +1147,7 @@ namespace fileBackup { wait(rangeFile.writeKey(nextKey)); if(BUGGIFY) { - rangeFile.padEnd(); + wait(rangeFile.padEnd()); } bool usedFile = wait(finishRangeFile(outFile, cx, task, taskBucket, KeyRangeRef(beginKey, nextKey), outVersion)); From a397d8625fe7e23d6c65d77adf1a9cf31f34f813 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sun, 7 Mar 2021 22:21:10 -0800 Subject: [PATCH 6/7] AsyncFileNonDurable now has a flag for whether or not to emulate AIO mode, so the previous behavior limits are preserved when this flag is set. AsyncFileDurable writes which are selected to be durable are again done in a single write, and the logic to handle non-aligned writes for non-AIO mode is more clear and commented. In simulation, backup files on local filesystems are written with random buffer sizes. Backup container unit test now uses more random file sizes and limits memory consumption for all files being written in parallel. --- fdbclient/BackupContainer.actor.cpp | 43 +++++++++++++++++++---------- fdbclient/Knobs.cpp | 2 +- fdbrpc/AsyncFileNonDurable.actor.h | 33 +++++++++++++++------- fdbrpc/sim2.actor.cpp | 6 ++-- 4 files changed, 55 insertions(+), 29 deletions(-) diff --git a/fdbclient/BackupContainer.actor.cpp b/fdbclient/BackupContainer.actor.cpp index 4d1331a674..2eb2c0d6b9 100644 --- a/fdbclient/BackupContainer.actor.cpp +++ b/fdbclient/BackupContainer.actor.cpp @@ -1278,16 +1278,19 @@ public: class BackupFile : public IBackupFile, ReferenceCounted { public: BackupFile(std::string fileName, Reference file, std::string finalFullPath) - : IBackupFile(fileName), m_file(file), m_finalFullPath(finalFullPath), m_writeOffset(0) + : IBackupFile(fileName), m_file(file), m_finalFullPath(finalFullPath), m_writeOffset(0), m_blockSize(CLIENT_KNOBS->BACKUP_LOCAL_FILE_WRITE_BLOCK) { - m_buffer.reserve(m_buffer.arena(), CLIENT_KNOBS->BACKUP_LOCAL_FILE_WRITE_BLOCK); + if(BUGGIFY) { + m_blockSize = deterministicRandom()->randomInt(100, 20000); + } + m_buffer.reserve(m_buffer.arena(), m_blockSize); } Future append(const void *data, int len) { m_buffer.append(m_buffer.arena(), (const uint8_t *)data, len); - if(m_buffer.size() >= CLIENT_KNOBS->BACKUP_LOCAL_FILE_WRITE_BLOCK) { - return flush(CLIENT_KNOBS->BACKUP_LOCAL_FILE_WRITE_BLOCK); + if(m_buffer.size() >= m_blockSize) { + return flush(m_blockSize); } return Void(); @@ -1315,10 +1318,7 @@ public: ACTOR static Future finish_impl(Reference f) { wait(f->flush(f->m_buffer.size())); - // Avoid truncation from 0 to 0 because simulation won't allow it since KAIO would not. - if(f->size() != 0) { - wait(f->m_file->truncate(f->size())); // Some IAsyncFile implementations extend in whole block sizes. - } + wait(f->m_file->truncate(f->size())); // Some IAsyncFile implementations extend in whole block sizes. 
wait(f->m_file->sync()); std::string name = f->m_file->getFilename(); f->m_file.clear(); @@ -1342,6 +1342,7 @@ public: Standalone> m_buffer; int64_t m_writeOffset; std::string m_finalFullPath; + int m_blockSize; }; Future> writeFile(std::string path) { @@ -1775,16 +1776,22 @@ ACTOR Future> timeKeeperEpochsFromVersion(Version v, Reference } int chooseFileSize(std::vector &sizes) { - int size = 1000; if(!sizes.empty()) { - size = sizes.back(); + int size = sizes.back(); sizes.pop_back(); + return size; } - return size; + return deterministicRandom()->randomInt(0, 2e6); } -ACTOR Future writeAndVerifyFile(Reference c, Reference f, int size) { +ACTOR Future writeAndVerifyFile(Reference c, Reference f, int size, FlowLock *lock) { state Standalone> content; + + wait(lock->take(TaskPriority::DefaultYield, size)); + state FlowLock::Releaser releaser(*lock, size); + + printf("writeAndVerify size=%d file=%s\n", size, f->getFileName().c_str()); + content.resize(content.arena(), size); for(int i = 0; i < content.size(); ++i) { content[i] = (uint8_t)deterministicRandom()->randomInt(0, 256); @@ -1818,6 +1825,8 @@ Version nextVersion(Version v) { } ACTOR Future testBackupContainer(std::string url) { + state FlowLock lock(100e6); + printf("BackupContainerTest URL %s\n", url.c_str()); state Reference c = IBackupContainer::openContainer(url); @@ -1840,7 +1849,11 @@ ACTOR Future testBackupContainer(std::string url) { state Version v = deterministicRandom()->randomInt64(0, std::numeric_limits::max() / 2); // List of sizes to use to test edge cases on underlying file implementations - state std::vector fileSizes = {0, 10000000, 5000005}; + state std::vector fileSizes = {0}; + if(StringRef(url).startsWith(LiteralStringRef("blob"))) { + fileSizes.push_back(CLIENT_KNOBS->BLOBSTORE_MULTIPART_MIN_PART_SIZE); + fileSizes.push_back(CLIENT_KNOBS->BLOBSTORE_MULTIPART_MIN_PART_SIZE + 10); + } loop { state Version logStart = v; @@ -1861,7 +1874,7 @@ ACTOR Future testBackupContainer(std::string url) { int size = chooseFileSize(fileSizes); snapshotSizes.rbegin()->second += size; - writes.push_back(writeAndVerifyFile(c, range, size)); + writes.push_back(writeAndVerifyFile(c, range, size, &lock)); if(deterministicRandom()->random01() < .2) { writes.push_back(c->writeKeyspaceSnapshotFile(snapshots.rbegin()->second, snapshotSizes.rbegin()->second)); @@ -1879,7 +1892,7 @@ ACTOR Future testBackupContainer(std::string url) { state Reference log = wait(c->writeLogFile(logStart, v, 10)); logs[logStart] = log->getFileName(); int size = chooseFileSize(fileSizes); - writes.push_back(writeAndVerifyFile(c, log, size)); + writes.push_back(writeAndVerifyFile(c, log, size, &lock)); // Randomly stop after a snapshot has finished and all manually seeded file sizes have been used. 
if(fileSizes.empty() && !snapshots.empty() && snapshots.rbegin()->second.empty() && deterministicRandom()->random01() < .2) { diff --git a/fdbclient/Knobs.cpp b/fdbclient/Knobs.cpp index b19ec67623..9b4156cd72 100644 --- a/fdbclient/Knobs.cpp +++ b/fdbclient/Knobs.cpp @@ -115,7 +115,7 @@ ClientKnobs::ClientKnobs(bool randomize) { init( TASKBUCKET_MAX_TASK_KEYS, 1000 ); if( randomize && BUGGIFY ) TASKBUCKET_MAX_TASK_KEYS = 20; //Backup - init( BACKUP_LOCAL_FILE_WRITE_BLOCK, 1024*1024 ); if( randomize && BUGGIFY ) BACKUP_LOCAL_FILE_WRITE_BLOCK = deterministicRandom()->randomInt(100, 10000); + init( BACKUP_LOCAL_FILE_WRITE_BLOCK, 1024*1024 ); init( BACKUP_CONCURRENT_DELETES, 100 ); init( BACKUP_SIMULATED_LIMIT_BYTES, 1e6 ); if( randomize && BUGGIFY ) BACKUP_SIMULATED_LIMIT_BYTES = 1000; init( BACKUP_GET_RANGE_LIMIT_BYTES, 1e6 ); diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 652e9774c2..0c62c09fe9 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -142,6 +142,8 @@ public: //The address of the machine that opened the file NetworkAddress openedAddress; + bool aio; + private: //The wrapped IAsyncFile Reference file; @@ -174,8 +176,8 @@ private: ActorCollection reponses; //cannot call getResult on this actor collection, since the actors will be on different processes - AsyncFileNonDurable(const std::string& filename, Reference file, Reference diskParameters, NetworkAddress openedAddress) - : openedAddress(openedAddress), pendingModifications(uint64_t(-1)), approximateSize(0), reponses(false) { + AsyncFileNonDurable(const std::string& filename, Reference file, Reference diskParameters, NetworkAddress openedAddress, bool aio) + : openedAddress(openedAddress), pendingModifications(uint64_t(-1)), approximateSize(0), reponses(false), aio(aio) { //This is only designed to work in simulation ASSERT(g_network->isSimulated()); @@ -196,7 +198,7 @@ public: static std::map> filesBeingDeleted; //Creates a new AsyncFileNonDurable which wraps the provided IAsyncFile - ACTOR static Future> open(std::string filename, std::string actualFilename, Future> wrappedFile, Reference diskParameters) { + ACTOR static Future> open(std::string filename, std::string actualFilename, Future> wrappedFile, Reference diskParameters, bool aio) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); state TaskPriority currentTaskID = g_network->getCurrentTask(); state Future shutdown = success(currentProcess->shutdownSignal.getFuture()); @@ -221,7 +223,7 @@ public: throw io_error().asInjectedFault(); } - state Reference nonDurableFile( new AsyncFileNonDurable(filename, file, diskParameters, currentProcess->address) ); + state Reference nonDurableFile( new AsyncFileNonDurable(filename, file, diskParameters, currentProcess->address, aio) ); //Causes the approximateSize member to be set state Future sizeFuture = nonDurableFile->size(); @@ -455,6 +457,9 @@ private: debugFileCheck("AsyncFileNonDurableWriteAfterWait", self->filename, dataCopy.begin(), offset, length); + // In AIO mode, only page-aligned writes are supported + ASSERT(!self->aio || (offset % 4096 == 0 && length % 4096 == 0)); + //Non-durable writes should introduce errors at the page level and corrupt at the sector level //Otherwise, we can perform the entire write at once int diskPageLength = saveDurable ? 
length : 4096; @@ -462,17 +467,25 @@ private: vector> writeFutures; for(int writeOffset = 0; writeOffset < length; ) { - // Number of bytes to the next diskPageLength offset within the write or the end of the write. - // First and last pages can be short. - int pageLength = std::min((int64_t)length - writeOffset, diskPageLength - ((offset + writeOffset) % diskPageLength)); + // Number of bytes until the next diskPageLength file offset within the write or the end of the write. + int pageLength = diskPageLength; + if(!self->aio && !saveDurable) { + // If not in AIO mode, and the save is not durable, then we can't perform the entire write all at once + // and the first and last pages touched by the write could be partial. + pageLength = std::min((int64_t)length - writeOffset, diskPageLength - ((offset + writeOffset) % diskPageLength)); + } //choose a random action to perform on this page write (write correctly, corrupt, or don't write) KillMode pageKillMode = (KillMode)deterministicRandom()->randomInt(0, self->killMode + 1); for(int pageOffset = 0; pageOffset < pageLength; ) { - // Number of bytes to the next diskSectorLength offset within the write or the end of the write. - // First and last pages can be short. - int sectorLength = std::min((int64_t)length - (writeOffset + pageOffset), diskSectorLength - ((offset + writeOffset + pageOffset) % diskSectorLength)); + // Number of bytes until the next diskSectorLength file offset within the write or the end of the write. + int sectorLength = diskSectorLength; + if(!self->aio && !saveDurable) { + // If not in AIO mode, and the save is not durable, then we can't perform the entire write all at once + // and the first and last sectors touched by the write could be partial. + sectorLength = std::min((int64_t)length - (writeOffset + pageOffset), diskSectorLength - ((offset + writeOffset + pageOffset) % diskSectorLength)); + } //If saving durable, then perform the write correctly. Otherwise, perform the write correcly with a probability of 1/3. //If corrupting the write, then this sector will be written correctly with a 1/4 chance diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 35521affb9..7365e0dcb0 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -622,8 +622,8 @@ private: if (randLog) fprintf( randLog, "SFT1 %s %s %s %" PRId64 "\n", self->dbgId.shortString().c_str(), self->filename.c_str(), opId.shortString().c_str(), size ); - if (size == 0) { - // KAIO will return EINVAL, as len==0 is an error. + // KAIO will return EINVAL, as len==0 is an error. + if( (self->flags & IAsyncFile::OPEN_NO_AIO) == 0 && size == 0) { throw io_error(); } @@ -1832,7 +1832,7 @@ Future< Reference > Sim2FileSystem::open( std::string filename } //Simulated disk parameters are shared by the AsyncFileNonDurable and the underlying SimpleFile. 
This way, they can both keep up with the time to start the next operation Reference diskParameters(new DiskParameters(FLOW_KNOBS->SIM_DISK_IOPS, FLOW_KNOBS->SIM_DISK_BANDWIDTH)); - machineCache[actualFilename] = AsyncFileNonDurable::open(filename, actualFilename, SimpleFile::open(filename, flags, mode, diskParameters, false), diskParameters); + machineCache[actualFilename] = AsyncFileNonDurable::open(filename, actualFilename, SimpleFile::open(filename, flags, mode, diskParameters, false), diskParameters, (flags & IAsyncFile::OPEN_NO_AIO) == 0); } Future> f = AsyncFileDetachable::open( machineCache[actualFilename] ); if(FLOW_KNOBS->PAGE_WRITE_CHECKSUM_HISTORY > 0) From 318b862aa3408840f407453839a8bf129fc1b763 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 8 Mar 2021 01:49:29 -0800 Subject: [PATCH 7/7] Apply clang-format to backup changes. --- .clang-format | 8 +- fdbclient/BackupContainer.actor.cpp | 1047 ++-- fdbclient/FileBackupAgent.actor.cpp | 7007 +++++++++++++++------------ fdbclient/Knobs.cpp | 349 +- fdbrpc/AsyncFileNonDurable.actor.h | 607 +-- fdbrpc/sim2.actor.cpp | 1765 ++++--- 6 files changed, 5999 insertions(+), 4784 deletions(-) diff --git a/.clang-format b/.clang-format index c7faa9e0e6..60b24f6172 100644 --- a/.clang-format +++ b/.clang-format @@ -11,14 +11,14 @@ AllowAllParametersOfDeclarationOnNextLine: false AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: Inline -AllowShortIfStatementsOnASingleLine: true -AllowShortLoopsOnASingleLine: true +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: true -BinPackArguments: true -BinPackParameters: true +BinPackArguments: false +BinPackParameters: false BreakBeforeBinaryOperators: None BreakBeforeBraces: Attach ColumnLimit: 120 diff --git a/fdbclient/BackupContainer.actor.cpp b/fdbclient/BackupContainer.actor.cpp index 2eb2c0d6b9..d8aa6b7e3f 100644 --- a/fdbclient/BackupContainer.actor.cpp +++ b/fdbclient/BackupContainer.actor.cpp @@ -39,13 +39,13 @@ namespace IBackupFile_impl { - ACTOR Future appendStringRefWithLen(Reference file, Standalone s) { - state uint32_t lenBuf = bigEndian32((uint32_t)s.size()); - wait(file->append(&lenBuf, sizeof(lenBuf))); - wait(file->append(s.begin(), s.size())); - return Void(); - } +ACTOR Future appendStringRefWithLen(Reference file, Standalone s) { + state uint32_t lenBuf = bigEndian32((uint32_t)s.size()); + wait(file->append(&lenBuf, sizeof(lenBuf))); + wait(file->append(s.begin(), s.size())); + return Void(); } +} // namespace IBackupFile_impl Future IBackupFile::appendStringRefWithLen(Standalone s) { return IBackupFile_impl::appendStringRefWithLen(Reference::addRef(this), s); @@ -53,31 +53,31 @@ Future IBackupFile::appendStringRefWithLen(Standalone s) { std::string IBackupContainer::ExpireProgress::toString() const { std::string s = step + "..."; - if(total > 0) { + if (total > 0) { s += format("%d/%d (%.2f%%)", done, total, double(done) / total * 100); } return s; } -void BackupFileList::toStream(FILE *fout) const { - for(const RangeFile &f : ranges) { +void BackupFileList::toStream(FILE* fout) const { + for (const RangeFile& f : ranges) { fprintf(fout, "range %" PRId64 " %s\n", f.fileSize, f.fileName.c_str()); } - for(const LogFile &f : logs) { + for (const LogFile& f : logs) { fprintf(fout, "log %" PRId64 " %s\n", f.fileSize, 
f.fileName.c_str()); } - for(const KeyspaceSnapshotFile &f : snapshots) { + for (const KeyspaceSnapshotFile& f : snapshots) { fprintf(fout, "snapshotManifest %" PRId64 " %s\n", f.totalSize, f.fileName.c_str()); } } -Future fetchTimes(Reference tr, std::map *pVersionTimeMap) { +Future fetchTimes(Reference tr, std::map* pVersionTimeMap) { std::vector> futures; // Resolve each version in the map, - for(auto &p : *pVersionTimeMap) { + for (auto& p : *pVersionTimeMap) { futures.push_back(map(timeKeeperEpochsFromVersion(p.first, tr), [=](Optional t) { - if(t.present()) + if (t.present()) pVersionTimeMap->at(p.first) = t.get(); else pVersionTimeMap->erase(p.first); @@ -92,22 +92,23 @@ Future BackupDescription::resolveVersionTimes(Database cx) { // Populate map with versions needed versionTimeMap.clear(); - for(const KeyspaceSnapshotFile &m : snapshots) { + for (const KeyspaceSnapshotFile& m : snapshots) { versionTimeMap[m.beginVersion]; versionTimeMap[m.endVersion]; } - if(minLogBegin.present()) + if (minLogBegin.present()) versionTimeMap[minLogBegin.get()]; - if(maxLogEnd.present()) + if (maxLogEnd.present()) versionTimeMap[maxLogEnd.get()]; - if(contiguousLogEnd.present()) + if (contiguousLogEnd.present()) versionTimeMap[contiguousLogEnd.get()]; - if(minRestorableVersion.present()) + if (minRestorableVersion.present()) versionTimeMap[minRestorableVersion.get()]; - if(maxRestorableVersion.present()) + if (maxRestorableVersion.present()) versionTimeMap[maxRestorableVersion.get()]; - return runRYWTransaction(cx, [=](Reference tr) { return fetchTimes(tr, &versionTimeMap); }); + return runRYWTransaction(cx, + [=](Reference tr) { return fetchTimes(tr, &versionTimeMap); }); }; std::string BackupDescription::toString() const { @@ -118,46 +119,49 @@ std::string BackupDescription::toString() const { auto formatVersion = [&](Version v) { std::string s; - if(!versionTimeMap.empty()) { + if (!versionTimeMap.empty()) { auto i = versionTimeMap.find(v); - if(i != versionTimeMap.end()) + if (i != versionTimeMap.end()) s = format("%lld (%s)", v, BackupAgentBase::formatTime(i->second).c_str()); else s = format("%lld (unknown)", v); - } - else if(maxLogEnd.present()) { + } else if (maxLogEnd.present()) { double days = double(maxLogEnd.get() - v) / (CLIENT_KNOBS->CORE_VERSIONSPERSECOND * 24 * 60 * 60); s = format("%lld (maxLogEnd %s%.2f days)", v, days < 0 ? "+" : "-", days); - } - else { + } else { s = format("%lld", v); } return s; }; - for(const KeyspaceSnapshotFile &m : snapshots) { - info.append(format("Snapshot: startVersion=%s endVersion=%s totalBytes=%lld restorable=%s expiredPct=%.2f\n", - formatVersion(m.beginVersion).c_str(), formatVersion(m.endVersion).c_str(), m.totalSize, m.restorable.orDefault(false) ? "true" : "false", m.expiredPct(expiredEndVersion))); + for (const KeyspaceSnapshotFile& m : snapshots) { + info.append( + format("Snapshot: startVersion=%s endVersion=%s totalBytes=%lld restorable=%s expiredPct=%.2f\n", + formatVersion(m.beginVersion).c_str(), + formatVersion(m.endVersion).c_str(), + m.totalSize, + m.restorable.orDefault(false) ? 
"true" : "false", + m.expiredPct(expiredEndVersion))); } info.append(format("SnapshotBytes: %lld\n", snapshotBytes)); - if(expiredEndVersion.present()) + if (expiredEndVersion.present()) info.append(format("ExpiredEndVersion: %s\n", formatVersion(expiredEndVersion.get()).c_str())); - if(unreliableEndVersion.present()) + if (unreliableEndVersion.present()) info.append(format("UnreliableEndVersion: %s\n", formatVersion(unreliableEndVersion.get()).c_str())); - if(minLogBegin.present()) + if (minLogBegin.present()) info.append(format("MinLogBeginVersion: %s\n", formatVersion(minLogBegin.get()).c_str())); - if(contiguousLogEnd.present()) + if (contiguousLogEnd.present()) info.append(format("ContiguousLogEndVersion: %s\n", formatVersion(contiguousLogEnd.get()).c_str())); - if(maxLogEnd.present()) + if (maxLogEnd.present()) info.append(format("MaxLogEndVersion: %s\n", formatVersion(maxLogEnd.get()).c_str())); - if(minRestorableVersion.present()) + if (minRestorableVersion.present()) info.append(format("MinRestorableVersion: %s\n", formatVersion(minRestorableVersion.get()).c_str())); - if(maxRestorableVersion.present()) + if (maxRestorableVersion.present()) info.append(format("MaxRestorableVersion: %s\n", formatVersion(maxRestorableVersion.get()).c_str())); - if(!extendedDetail.empty()) + if (!extendedDetail.empty()) info.append("ExtendedDetail: ").append(extendedDetail); return info; @@ -173,14 +177,13 @@ std::string BackupDescription::toJSON() const { auto formatVersion = [&](Version v) { JsonBuilderObject doc; doc.setKey("Version", v); - if(!versionTimeMap.empty()) { + if (!versionTimeMap.empty()) { auto i = versionTimeMap.find(v); - if(i != versionTimeMap.end()) { + if (i != versionTimeMap.end()) { doc.setKey("Timestamp", BackupAgentBase::formatTime(i->second)); doc.setKey("EpochSeconds", i->second); } - } - else if(maxLogEnd.present()) { + } else if (maxLogEnd.present()) { double days = double(v - maxLogEnd.get()) / (CLIENT_KNOBS->CORE_VERSIONSPERSECOND * 24 * 60 * 60); doc.setKey("RelativeDays", days); } @@ -188,7 +191,7 @@ std::string BackupDescription::toJSON() const { }; JsonBuilderArray snapshotsArray; - for(const KeyspaceSnapshotFile &m : snapshots) { + for (const KeyspaceSnapshotFile& m : snapshots) { JsonBuilderObject snapshotDoc; snapshotDoc.setKey("Start", formatVersion(m.beginVersion)); snapshotDoc.setKey("End", formatVersion(m.endVersion)); @@ -201,22 +204,22 @@ std::string BackupDescription::toJSON() const { doc.setKey("TotalSnapshotBytes", snapshotBytes); - if(expiredEndVersion.present()) + if (expiredEndVersion.present()) doc.setKey("ExpiredEnd", formatVersion(expiredEndVersion.get())); - if(unreliableEndVersion.present()) + if (unreliableEndVersion.present()) doc.setKey("UnreliableEnd", formatVersion(unreliableEndVersion.get())); - if(minLogBegin.present()) + if (minLogBegin.present()) doc.setKey("MinLogBegin", formatVersion(minLogBegin.get())); - if(contiguousLogEnd.present()) + if (contiguousLogEnd.present()) doc.setKey("ContiguousLogEnd", formatVersion(contiguousLogEnd.get())); - if(maxLogEnd.present()) + if (maxLogEnd.present()) doc.setKey("MaxLogEnd", formatVersion(maxLogEnd.get())); - if(minRestorableVersion.present()) + if (minRestorableVersion.present()) doc.setKey("MinRestorablePoint", formatVersion(minRestorableVersion.get())); - if(maxRestorableVersion.present()) + if (maxRestorableVersion.present()) doc.setKey("MaxRestorablePoint", formatVersion(maxRestorableVersion.get())); - if(!extendedDetail.empty()) + if (!extendedDetail.empty()) doc.setKey("ExtendedDetail", 
extendedDetail); return doc.getJson(); @@ -228,17 +231,17 @@ std::string BackupDescription::toJSON() const { * Snapshot manifests (a complete set of files constituting a database snapshot for the backup's target ranges) * are stored as JSON files at paths like * /snapshots/snapshot,minVersion,maxVersion,totalBytes - * + * * Key range files for snapshots are stored at paths like * /kvranges/snapshot,startVersion/N/range,version,uid,blockSize * where startVersion is the version at which the backup snapshot execution began and N is a number - * that is increased as key range files are generated over time (at varying rates) such that there + * that is increased as key range files are generated over time (at varying rates) such that there * are around 5,000 key range files in each folder. * - * Note that startVersion will NOT correspond to the minVersion of a snapshot manifest because + * Note that startVersion will NOT correspond to the minVersion of a snapshot manifest because * snapshot manifest min/max versions are based on the actual contained data and the first data * file written will be after the start version of the snapshot's execution. - * + * * Log files are at file paths like * /logs/.../log,startVersion,endVersion,blockSize * where ... is a multi level path which sorts lexically into version order and results in approximately 1 @@ -248,8 +251,8 @@ std::string BackupDescription::toJSON() const { * * Prior to FDB version 6.0.16, key range files were stored using a different folder scheme. Newer versions * still support this scheme for all restore and backup management operations but key range files generated - * by backup using version 6.0.16 or later use the scheme describe above. - * + * by backup using version 6.0.16 or later use the scheme describe above. + * * The old format stored key range files at paths like * /ranges/.../range,version,uid,blockSize * where ... is a multi level path with sorts lexically into version order and results in up to approximately @@ -272,7 +275,8 @@ public: // Although not required, an implementation can avoid traversing unwanted subfolders // by calling folderPathFilter(absoluteFolderPath) and checking for a false return value. typedef std::vector> FilesAndSizesT; - virtual Future listFiles(std::string path = "", std::function folderPathFilter = nullptr) = 0; + virtual Future listFiles(std::string path = "", + std::function folderPathFilter = nullptr) = 0; // Open a file for read by fileName virtual Future> readFile(std::string fileName) = 0; @@ -285,7 +289,7 @@ public: // Delete entire container. During the process, if pNumDeleted is not null it will be // updated with the count of deleted files so that progress can be seen. - virtual Future deleteContainer(int *pNumDeleted) = 0; + virtual Future deleteContainer(int* pNumDeleted) = 0; // Creates a 2-level path (x/y) where v should go such that x/y/* contains (10^smallestBucket) possible versions static std::string versionFolderString(Version v, int smallestBucket) { @@ -309,108 +313,140 @@ public: return f; } - // The innermost folder covers 100 seconds (1e8 versions) During a full speed backup it is possible though very unlikely write about 10,000 snapshot range files during that time. + // The innermost folder covers 100 seconds (1e8 versions) During a full speed backup it is possible though very + // unlikely write about 10,000 snapshot range files during that time. 
static std::string old_rangeVersionFolderString(Version v) { return format("ranges/%s/", versionFolderString(v, 8).c_str()); } // Get the root folder for a snapshot's data based on its begin version static std::string snapshotFolderString(Version snapshotBeginVersion) { - return format("kvranges/snapshot.%018" PRId64 , snapshotBeginVersion); + return format("kvranges/snapshot.%018" PRId64, snapshotBeginVersion); } // Extract the snapshot begin version from a path static Version extractSnapshotBeginVersion(std::string path) { Version snapshotBeginVersion; - if(sscanf(path.c_str(), "kvranges/snapshot.%018" SCNd64, &snapshotBeginVersion) == 1) { + if (sscanf(path.c_str(), "kvranges/snapshot.%018" SCNd64, &snapshotBeginVersion) == 1) { return snapshotBeginVersion; } return invalidVersion; } - // The innermost folder covers 100,000 seconds (1e11 versions) which is 5,000 mutation log files at current settings. + // The innermost folder covers 100,000 seconds (1e11 versions) which is 5,000 mutation log files at current + // settings. static std::string logVersionFolderString(Version v) { return format("logs/%s/", versionFolderString(v, 11).c_str()); } Future> writeLogFile(Version beginVersion, Version endVersion, int blockSize) { - return writeFile(logVersionFolderString(beginVersion) + format("log,%lld,%lld,%s,%d", beginVersion, endVersion, deterministicRandom()->randomUniqueID().toString().c_str(), blockSize)); + return writeFile(logVersionFolderString(beginVersion) + + format("log,%lld,%lld,%s,%d", + beginVersion, + endVersion, + deterministicRandom()->randomUniqueID().toString().c_str(), + blockSize)); } - Future> writeRangeFile(Version snapshotBeginVersion, int snapshotFileCount, Version fileVersion, int blockSize) { - std::string fileName = format("range,%" PRId64 ",%s,%d", fileVersion, deterministicRandom()->randomUniqueID().toString().c_str(), blockSize); + Future> writeRangeFile(Version snapshotBeginVersion, + int snapshotFileCount, + Version fileVersion, + int blockSize) { + std::string fileName = format("range,%" PRId64 ",%s,%d", + fileVersion, + deterministicRandom()->randomUniqueID().toString().c_str(), + blockSize); // In order to test backward compatibility in simulation, sometimes write to the old path format - if(g_network->isSimulated() && deterministicRandom()->coinflip()) { + if (g_network->isSimulated() && deterministicRandom()->coinflip()) { return writeFile(old_rangeVersionFolderString(fileVersion) + fileName); } - return writeFile(snapshotFolderString(snapshotBeginVersion) + format("/%d/", snapshotFileCount / (BUGGIFY ? 1 : 5000)) + fileName); + return writeFile(snapshotFolderString(snapshotBeginVersion) + + format("/%d/", snapshotFileCount / (BUGGIFY ? 1 : 5000)) + fileName); } - // Find what should be the filename of a path by finding whatever is after the last forward or backward slash, or failing to find those, the whole string. + // Find what should be the filename of a path by finding whatever is after the last forward or backward slash, or + // failing to find those, the whole string. 
static std::string fileNameOnly(std::string path) { // Find the last forward slash position, defaulting to 0 if not found int pos = path.find_last_of('/'); - if(pos == std::string::npos) { + if (pos == std::string::npos) { pos = 0; } // Find the last backward slash position after pos, and update pos if found int b = path.find_last_of('\\', pos); - if(b != std::string::npos) { + if (b != std::string::npos) { pos = b; } return path.substr(pos + 1); } - static bool pathToRangeFile(RangeFile &out, std::string path, int64_t size) { + static bool pathToRangeFile(RangeFile& out, std::string path, int64_t size) { std::string name = fileNameOnly(path); RangeFile f; f.fileName = path; f.fileSize = size; int len; - if(sscanf(name.c_str(), "range,%" SCNd64 ",%*[^,],%u%n", &f.version, &f.blockSize, &len) == 2 && len == name.size()) { + if (sscanf(name.c_str(), "range,%" SCNd64 ",%*[^,],%u%n", &f.version, &f.blockSize, &len) == 2 && + len == name.size()) { out = f; return true; } return false; } - static bool pathToLogFile(LogFile &out, std::string path, int64_t size) { + static bool pathToLogFile(LogFile& out, std::string path, int64_t size) { std::string name = fileNameOnly(path); LogFile f; f.fileName = path; f.fileSize = size; int len; - if(sscanf(name.c_str(), "log,%" SCNd64 ",%" SCNd64 ",%*[^,],%u%n", &f.beginVersion, &f.endVersion, &f.blockSize, &len) == 3 && len == name.size()) { + if (sscanf(name.c_str(), + "log,%" SCNd64 ",%" SCNd64 ",%*[^,],%u%n", + &f.beginVersion, + &f.endVersion, + &f.blockSize, + &len) == 3 && + len == name.size()) { out = f; return true; } return false; } - static bool pathToKeyspaceSnapshotFile(KeyspaceSnapshotFile &out, std::string path) { + static bool pathToKeyspaceSnapshotFile(KeyspaceSnapshotFile& out, std::string path) { std::string name = fileNameOnly(path); KeyspaceSnapshotFile f; f.fileName = path; int len; - if(sscanf(name.c_str(), "snapshot,%" SCNd64 ",%" SCNd64 ",%" SCNd64 "%n", &f.beginVersion, &f.endVersion, &f.totalSize, &len) == 3 && len == name.size()) { + if (sscanf(name.c_str(), + "snapshot,%" SCNd64 ",%" SCNd64 ",%" SCNd64 "%n", + &f.beginVersion, + &f.endVersion, + &f.totalSize, + &len) == 3 && + len == name.size()) { out = f; return true; } return false; } - // TODO: Do this more efficiently, as the range file list for a snapshot could potentially be hundreds of megabytes. - ACTOR static Future> readKeyspaceSnapshot_impl(Reference bc, KeyspaceSnapshotFile snapshot) { + // TODO: Do this more efficiently, as the range file list for a snapshot could potentially be hundreds of + // megabytes. + ACTOR static Future> readKeyspaceSnapshot_impl(Reference bc, + KeyspaceSnapshotFile snapshot) { // Read the range file list for the specified version range, and then index them by fileName. - // This is so we can verify that each of the files listed in the manifest file are also in the container at this time. + // This is so we can verify that each of the files listed in the manifest file are also in the container at this + // time. std::vector files = wait(bc->listRangeFiles(snapshot.beginVersion, snapshot.endVersion)); state std::map rangeIndex; - for(auto &f : files) + for (auto& f : files) rangeIndex[f.fileName] = std::move(f); - // Read the snapshot file, verify the version range, then find each of the range files by name in the index and return them. + // Read the snapshot file, verify the version range, then find each of the range files by name in the index and + // return them. 
state Reference f = wait(bc->readFile(snapshot.fileName)); int64_t size = wait(f->size()); state Standalone buf = makeString(size); @@ -420,42 +456,42 @@ public: JSONDoc doc(json); Version v; - if(!doc.tryGet("beginVersion", v) || v != snapshot.beginVersion) + if (!doc.tryGet("beginVersion", v) || v != snapshot.beginVersion) throw restore_corrupted_data(); - if(!doc.tryGet("endVersion", v) || v != snapshot.endVersion) + if (!doc.tryGet("endVersion", v) || v != snapshot.endVersion) throw restore_corrupted_data(); - json_spirit::mValue &filesArray = doc.create("files"); - if(filesArray.type() != json_spirit::array_type) + json_spirit::mValue& filesArray = doc.create("files"); + if (filesArray.type() != json_spirit::array_type) throw restore_corrupted_data(); std::vector results; int missing = 0; - for(auto const &fileValue : filesArray.get_array()) { - if(fileValue.type() != json_spirit::str_type) + for (auto const& fileValue : filesArray.get_array()) { + if (fileValue.type() != json_spirit::str_type) throw restore_corrupted_data(); // If the file is not in the index then log the error but don't throw yet, keep checking the whole list. auto i = rangeIndex.find(fileValue.get_str()); - if(i == rangeIndex.end()) { + if (i == rangeIndex.end()) { TraceEvent(SevError, "FileRestoreMissingRangeFile") - .detail("URL", bc->getURL()) - .detail("File", fileValue.get_str()); + .detail("URL", bc->getURL()) + .detail("File", fileValue.get_str()); ++missing; } // No point in using more memory once data is missing since an error will be thrown instead. - if(missing == 0) { + if (missing == 0) { results.push_back(i->second); } } - if(missing > 0) { + if (missing > 0) { TraceEvent(SevError, "FileRestoreMissingRangeFileSummary") - .detail("URL", bc->getURL()) - .detail("Count", missing); + .detail("URL", bc->getURL()) + .detail("Count", missing); throw restore_missing_data(); } @@ -467,10 +503,11 @@ public: return readKeyspaceSnapshot_impl(Reference::addRef(this), snapshot); } - ACTOR static Future writeKeyspaceSnapshotFile_impl(Reference bc, std::vector fileNames, int64_t totalBytes) { + ACTOR static Future writeKeyspaceSnapshotFile_impl(Reference bc, + std::vector fileNames, + int64_t totalBytes) { ASSERT(!fileNames.empty()); - state Version minVer = std::numeric_limits::max(); state Version maxVer = 0; state RangeFile rf; @@ -478,16 +515,15 @@ public: state int i; // Validate each filename, update version range - for(i = 0; i < fileNames.size(); ++i) { - auto const &f = fileNames[i]; - if(pathToRangeFile(rf, f, 0)) { + for (i = 0; i < fileNames.size(); ++i) { + auto const& f = fileNames[i]; + if (pathToRangeFile(rf, f, 0)) { fileArray.push_back(f); - if(rf.version < minVer) + if (rf.version < minVer) minVer = rf.version; - if(rf.version > maxVer) + if (rf.version > maxVer) maxVer = rf.version; - } - else + } else throw restore_unknown_file_type(); wait(yield()); } @@ -503,7 +539,8 @@ public: wait(yield()); state std::string docString = json_spirit::write_string(json); - state Reference f = wait(bc->writeFile(format("snapshots/snapshot,%lld,%lld,%lld", minVer, maxVer, totalBytes))); + state Reference f = + wait(bc->writeFile(format("snapshots/snapshot,%lld,%lld,%lld", minVer, maxVer, totalBytes))); wait(f->append(docString.data(), docString.size())); wait(f->finish()); @@ -511,33 +548,37 @@ public: } Future writeKeyspaceSnapshotFile(std::vector fileNames, int64_t totalBytes) { - return writeKeyspaceSnapshotFile_impl(Reference::addRef(this), fileNames, totalBytes); + return writeKeyspaceSnapshotFile_impl( + 
Reference::addRef(this), fileNames, totalBytes); }; // List log files, unsorted, which contain data at any version >= beginVersion and <= targetVersion - Future> listLogFiles(Version beginVersion = 0, Version targetVersion = std::numeric_limits::max()) { - // The first relevant log file could have a begin version less than beginVersion based on the knobs which determine log file range size, - // so start at an earlier version adjusted by how many versions a file could contain. + Future> listLogFiles(Version beginVersion = 0, + Version targetVersion = std::numeric_limits::max()) { + // The first relevant log file could have a begin version less than beginVersion based on the knobs which + // determine log file range size, so start at an earlier version adjusted by how many versions a file could + // contain. // // Get the cleaned (without slashes) first and last folders that could contain relevant results. - std::string firstPath = cleanFolderString(logVersionFolderString( - std::max(0, beginVersion - CLIENT_KNOBS->BACKUP_MAX_LOG_RANGES * CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE) - )); - std::string lastPath = cleanFolderString(logVersionFolderString(targetVersion)); + std::string firstPath = cleanFolderString(logVersionFolderString(std::max( + 0, beginVersion - CLIENT_KNOBS->BACKUP_MAX_LOG_RANGES * CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE))); + std::string lastPath = cleanFolderString(logVersionFolderString(targetVersion)); - std::function pathFilter = [=](const std::string &folderPath) { - // Remove slashes in the given folder path so that the '/' positions in the version folder string do not matter + std::function pathFilter = [=](const std::string& folderPath) { + // Remove slashes in the given folder path so that the '/' positions in the version folder string do not + // matter std::string cleaned = cleanFolderString(folderPath); - return StringRef(firstPath).startsWith(cleaned) || StringRef(lastPath).startsWith(cleaned) - || (cleaned > firstPath && cleaned < lastPath); + return StringRef(firstPath).startsWith(cleaned) || StringRef(lastPath).startsWith(cleaned) || + (cleaned > firstPath && cleaned < lastPath); }; - return map(listFiles("logs/", pathFilter), [=](const FilesAndSizesT &files) { + return map(listFiles("logs/", pathFilter), [=](const FilesAndSizesT& files) { std::vector results; LogFile lf; - for(auto &f : files) { - if(pathToLogFile(lf, f.first, f.second) && lf.endVersion > beginVersion && lf.beginVersion <= targetVersion) + for (auto& f : files) { + if (pathToLogFile(lf, f.first, f.second) && lf.endVersion > beginVersion && + lf.beginVersion <= targetVersion) results.push_back(lf); } return results; @@ -545,25 +586,27 @@ public: } // List range files, unsorted, which contain data at or between beginVersion and endVersion - // NOTE: This reads the range file folder schema from FDB 6.0.15 and earlier and is provided for backward compatibility + // NOTE: This reads the range file folder schema from FDB 6.0.15 and earlier and is provided for backward + // compatibility Future> old_listRangeFiles(Version beginVersion, Version endVersion) { // Get the cleaned (without slashes) first and last folders that could contain relevant results. 
std::string firstPath = cleanFolderString(old_rangeVersionFolderString(beginVersion)); - std::string lastPath = cleanFolderString(old_rangeVersionFolderString(endVersion)); + std::string lastPath = cleanFolderString(old_rangeVersionFolderString(endVersion)); - std::function pathFilter = [=](const std::string &folderPath) { - // Remove slashes in the given folder path so that the '/' positions in the version folder string do not matter + std::function pathFilter = [=](const std::string& folderPath) { + // Remove slashes in the given folder path so that the '/' positions in the version folder string do not + // matter std::string cleaned = cleanFolderString(folderPath); - return StringRef(firstPath).startsWith(cleaned) || StringRef(lastPath).startsWith(cleaned) - || (cleaned > firstPath && cleaned < lastPath); + return StringRef(firstPath).startsWith(cleaned) || StringRef(lastPath).startsWith(cleaned) || + (cleaned > firstPath && cleaned < lastPath); }; - return map(listFiles("ranges/", pathFilter), [=](const FilesAndSizesT &files) { + return map(listFiles("ranges/", pathFilter), [=](const FilesAndSizesT& files) { std::vector results; RangeFile rf; - for(auto &f : files) { - if(pathToRangeFile(rf, f.first, f.second) && rf.version >= beginVersion && rf.version <= endVersion) + for (auto& f : files) { + if (pathToRangeFile(rf, f.first, f.second) && rf.version >= beginVersion && rf.version <= endVersion) results.push_back(rf); } return results; @@ -581,35 +624,40 @@ public: // Define filter function (for listFiles() implementations that use it) to reject any folder // starting after endVersion - std::function pathFilter = [=](std::string const &path) { + std::function pathFilter = [=](std::string const& path) { return extractSnapshotBeginVersion(path) <= endVersion; }; - Future> newFiles = map(listFiles("kvranges/", pathFilter), [=](const FilesAndSizesT &files) { - std::vector results; - RangeFile rf; - for(auto &f : files) { - if(pathToRangeFile(rf, f.first, f.second) && rf.version >= beginVersion && rf.version <= endVersion) - results.push_back(rf); - } - return results; - }); + Future> newFiles = + map(listFiles("kvranges/", pathFilter), [=](const FilesAndSizesT& files) { + std::vector results; + RangeFile rf; + for (auto& f : files) { + if (pathToRangeFile(rf, f.first, f.second) && rf.version >= beginVersion && + rf.version <= endVersion) + results.push_back(rf); + } + return results; + }); return map(success(oldFiles) && success(newFiles), [=](Void _) { std::vector results = std::move(newFiles.get()); std::vector oldResults = std::move(oldFiles.get()); - results.insert(results.end(), std::make_move_iterator(oldResults.begin()), std::make_move_iterator(oldResults.end())); + results.insert( + results.end(), std::make_move_iterator(oldResults.begin()), std::make_move_iterator(oldResults.end())); return results; }); } - // List snapshots which have been fully written, in sorted beginVersion order, which start before end and finish on or after begin - Future> listKeyspaceSnapshots(Version begin = 0, Version end = std::numeric_limits::max()) { - return map(listFiles("snapshots/"), [=](const FilesAndSizesT &files) { + // List snapshots which have been fully written, in sorted beginVersion order, which start before end and finish on + // or after begin + Future> listKeyspaceSnapshots(Version begin = 0, + Version end = std::numeric_limits::max()) { + return map(listFiles("snapshots/"), [=](const FilesAndSizesT& files) { std::vector results; KeyspaceSnapshotFile sf; - for(auto &f : files) { - 
if(pathToKeyspaceSnapshotFile(sf, f.first) && sf.beginVersion < end && sf.endVersion >= begin) + for (auto& f : files) { + if (pathToKeyspaceSnapshotFile(sf, f.first) && sf.beginVersion < end && sf.endVersion >= begin) results.push_back(sf); } std::sort(results.begin(), results.end()); @@ -617,27 +665,29 @@ public: }); } - ACTOR static Future dumpFileList_impl(Reference bc, Version begin, Version end) { + ACTOR static Future dumpFileList_impl(Reference bc, + Version begin, + Version end) { state Future> fRanges = bc->listRangeFiles(begin, end); state Future> fSnapshots = bc->listKeyspaceSnapshots(begin, end); state Future> fLogs = bc->listLogFiles(begin, end); wait(success(fRanges) && success(fSnapshots) && success(fLogs)); - return BackupFileList({fRanges.get(), fLogs.get(), fSnapshots.get()}); + return BackupFileList({ fRanges.get(), fLogs.get(), fSnapshots.get() }); } Future dumpFileList(Version begin, Version end) { return dumpFileList_impl(Reference::addRef(this), begin, end); } - static Version resolveRelativeVersion(Optional max, Version v, const char *name, Error e) { - if(v == invalidVersion) { + static Version resolveRelativeVersion(Optional max, Version v, const char* name, Error e) { + if (v == invalidVersion) { TraceEvent(SevError, "BackupExpireInvalidVersion").detail(name, v); throw e; } - if(v < 0) { - if(!max.present()) { + if (v < 0) { + if (!max.present()) { TraceEvent(SevError, "BackupExpireCannotResolveRelativeVersion").detail(name, v); throw e; } @@ -646,16 +696,18 @@ public: return v; } - ACTOR static Future describeBackup_impl(Reference bc, bool deepScan, Version logStartVersionOverride) { + ACTOR static Future describeBackup_impl(Reference bc, + bool deepScan, + Version logStartVersionOverride) { state BackupDescription desc; desc.url = bc->getURL(); TraceEvent("BackupContainerDescribe1") - .detail("URL", bc->getURL()) - .detail("LogStartVersionOverride", logStartVersionOverride); + .detail("URL", bc->getURL()) + .detail("LogStartVersionOverride", logStartVersionOverride); bool e = wait(bc->exists()); - if(!e) { + if (!e) { TraceEvent(SevWarnAlways, "BackupContainerDoesNotExist").detail("URL", bc->getURL()); throw backup_does_not_exist(); } @@ -663,9 +715,10 @@ public: // If logStartVersion is relative, then first do a recursive call without it to find the max log version // from which to resolve the relative version. // This could be handled more efficiently without recursion but it's tricky, this will do for now. - if(logStartVersionOverride != invalidVersion && logStartVersionOverride < 0) { + if (logStartVersionOverride != invalidVersion && logStartVersionOverride < 0) { BackupDescription tmp = wait(bc->describeBackup(false, invalidVersion)); - logStartVersionOverride = resolveRelativeVersion(tmp.maxLogEnd, logStartVersionOverride, "LogStartVersionOverride", invalid_option_value()); + logStartVersionOverride = resolveRelativeVersion( + tmp.maxLogEnd, logStartVersionOverride, "LogStartVersionOverride", invalid_option_value()); } // Get metadata versions @@ -679,7 +732,7 @@ public: metaReads.push_back(store(metaUnreliableEnd, bc->unreliableEndVersion().get())); // Only read log begin/end versions if not doing a deep scan, otherwise scan files and recalculate them. 
- if(!deepScan) { + if (!deepScan) { metaReads.push_back(store(metaLogBegin, bc->logBeginVersion().get())); metaReads.push_back(store(metaLogEnd, bc->logEndVersion().get())); } @@ -687,15 +740,17 @@ public: wait(waitForAll(metaReads)); TraceEvent("BackupContainerDescribe2") - .detail("URL", bc->getURL()) - .detail("LogStartVersionOverride", logStartVersionOverride) - .detail("ExpiredEndVersion", metaExpiredEnd.orDefault(invalidVersion)) - .detail("UnreliableEndVersion", metaUnreliableEnd.orDefault(invalidVersion)) - .detail("LogBeginVersion", metaLogBegin.orDefault(invalidVersion)) - .detail("LogEndVersion", metaLogEnd.orDefault(invalidVersion)); + .detail("URL", bc->getURL()) + .detail("LogStartVersionOverride", logStartVersionOverride) + .detail("ExpiredEndVersion", metaExpiredEnd.orDefault(invalidVersion)) + .detail("UnreliableEndVersion", metaUnreliableEnd.orDefault(invalidVersion)) + .detail("LogBeginVersion", metaLogBegin.orDefault(invalidVersion)) + .detail("LogEndVersion", metaLogEnd.orDefault(invalidVersion)); - // If the logStartVersionOverride is positive (not relative) then ensure that unreliableEndVersion is equal or greater - if(logStartVersionOverride != invalidVersion && metaUnreliableEnd.orDefault(invalidVersion) < logStartVersionOverride) { + // If the logStartVersionOverride is positive (not relative) then ensure that unreliableEndVersion is equal or + // greater + if (logStartVersionOverride != invalidVersion && + metaUnreliableEnd.orDefault(invalidVersion) < logStartVersionOverride) { metaUnreliableEnd = logStartVersionOverride; } @@ -703,28 +758,26 @@ public: // thing to do is rescan to verify log continuity and get exact begin/end versions // - either are missing // - metaLogEnd <= metaLogBegin (invalid range) - // - metaLogEnd < metaExpiredEnd (log continuity exists in missing data range) + // - metaLogEnd < metaExpiredEnd (log continuity exists in missing data range) // - metaLogEnd < metaUnreliableEnd (log continuity exists in incomplete data range) - if(!metaLogBegin.present() || !metaLogEnd.present() - || metaLogEnd.get() <= metaLogBegin.get() - || metaLogEnd.get() < metaExpiredEnd.orDefault(invalidVersion) - || metaLogEnd.get() < metaUnreliableEnd.orDefault(invalidVersion) - ) { + if (!metaLogBegin.present() || !metaLogEnd.present() || metaLogEnd.get() <= metaLogBegin.get() || + metaLogEnd.get() < metaExpiredEnd.orDefault(invalidVersion) || + metaLogEnd.get() < metaUnreliableEnd.orDefault(invalidVersion)) { TraceEvent(SevWarnAlways, "BackupContainerMetadataInvalid") - .detail("URL", bc->getURL()) - .detail("ExpiredEndVersion", metaExpiredEnd.orDefault(invalidVersion)) - .detail("UnreliableEndVersion", metaUnreliableEnd.orDefault(invalidVersion)) - .detail("LogBeginVersion", metaLogBegin.orDefault(invalidVersion)) - .detail("LogEndVersion", metaLogEnd.orDefault(invalidVersion)); + .detail("URL", bc->getURL()) + .detail("ExpiredEndVersion", metaExpiredEnd.orDefault(invalidVersion)) + .detail("UnreliableEndVersion", metaUnreliableEnd.orDefault(invalidVersion)) + .detail("LogBeginVersion", metaLogBegin.orDefault(invalidVersion)) + .detail("LogEndVersion", metaLogEnd.orDefault(invalidVersion)); metaLogBegin = Optional(); metaLogEnd = Optional(); } // If the unreliable end version is not set or is < expiredEndVersion then increase it to expiredEndVersion. 
- // Describe does not update unreliableEnd in the backup metadata for safety reasons as there is no + // Describe does not update unreliableEnd in the backup metadata for safety reasons as there is no // compare-and-set operation to atomically change it and an expire process could be advancing it simultaneously. - if(!metaUnreliableEnd.present() || metaUnreliableEnd.get() < metaExpiredEnd.orDefault(0)) + if (!metaUnreliableEnd.present() || metaUnreliableEnd.get() < metaExpiredEnd.orDefault(0)) metaUnreliableEnd = metaExpiredEnd; desc.unreliableEndVersion = metaUnreliableEnd; @@ -737,12 +790,13 @@ public: // Use the known log range if present // Logs are assumed to be contiguious between metaLogBegin and metaLogEnd, so initalize desc accordingly - if(metaLogBegin.present() && metaLogEnd.present()) { + if (metaLogBegin.present() && metaLogEnd.present()) { // minLogBegin is the greater of the log begin metadata OR the unreliable end version since we can't count // on log file presence before that version. desc.minLogBegin = std::max(metaLogBegin.get(), desc.unreliableEndVersion.orDefault(0)); - // Set the maximum known end version of a log file, so far, which is also the assumed contiguous log file end version + // Set the maximum known end version of a log file, so far, which is also the assumed contiguous log file + // end version desc.maxLogEnd = metaLogEnd.get(); desc.contiguousLogEnd = desc.maxLogEnd; @@ -756,24 +810,24 @@ public: // List logs in version order so log continuity can be analyzed std::sort(logs.begin(), logs.end()); - if(!logs.empty()) { + if (!logs.empty()) { desc.maxLogEnd = logs.rbegin()->endVersion; auto i = logs.begin(); // If we didn't get log versions above then seed them using the first log file - if(!desc.contiguousLogEnd.present()) { + if (!desc.contiguousLogEnd.present()) { desc.minLogBegin = i->beginVersion; desc.contiguousLogEnd = i->endVersion; ++i; } - auto &end = desc.contiguousLogEnd.get(); // For convenience to make loop cleaner + auto& end = desc.contiguousLogEnd.get(); // For convenience to make loop cleaner // Advance until continuity is broken - while(i != logs.end()) { - if(i->beginVersion > end) + while (i != logs.end()) { + if (i->beginVersion > end) break; // If the next link in the log chain is found, update the end - if(i->beginVersion == end) + if (i->beginVersion == end) end = i->endVersion; ++i; } @@ -781,40 +835,38 @@ public: // Only update stored contiguous log begin and end versions if we did NOT use a log start override. // Otherwise, a series of describe operations can result in a version range which is actually missing data. - if(logStartVersionOverride == invalidVersion) { + if (logStartVersionOverride == invalidVersion) { // If the log metadata begin/end versions are missing (or treated as missing due to invalidity) or // differ from the newly calculated values for minLogBegin and contiguousLogEnd, respectively, - // then attempt to update the metadata in the backup container but ignore errors in case the + // then attempt to update the metadata in the backup container but ignore errors in case the // container is not writeable. 
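// The continuity rule applied in the loop above, restated as a self-contained sketch (a
// hypothetical helper for illustration only, assuming the LogFile type from this file): starting
// from a seed version, the contiguous end advances only when a log file begins exactly at the
// current end; a gap breaks the chain, and overlapping files are skipped without extending it.
//
//     Version contiguousEnd(const std::vector<LogFile>& sortedLogs, Version begin) {
//         Version end = begin;
//         for (const LogFile& f : sortedLogs) {
//             if (f.beginVersion > end)
//                 break; // gap: continuity broken
//             if (f.beginVersion == end)
//                 end = f.endVersion; // next link in the chain found
//         }
//         return end;
//     }
//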
try { state Future updates = Void(); - if(desc.minLogBegin.present() && metaLogBegin != desc.minLogBegin) { + if (desc.minLogBegin.present() && metaLogBegin != desc.minLogBegin) { updates = updates && bc->logBeginVersion().set(desc.minLogBegin.get()); } - if(desc.contiguousLogEnd.present() && metaLogEnd != desc.contiguousLogEnd) { + if (desc.contiguousLogEnd.present() && metaLogEnd != desc.contiguousLogEnd) { updates = updates && bc->logEndVersion().set(desc.contiguousLogEnd.get()); } wait(updates); - } catch(Error &e) { - if(e.code() == error_code_actor_cancelled) + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) throw; - TraceEvent(SevWarn, "BackupContainerMetadataUpdateFailure") - .error(e) - .detail("URL", bc->getURL()); + TraceEvent(SevWarn, "BackupContainerMetadataUpdateFailure").error(e).detail("URL", bc->getURL()); } } - for(auto &s : desc.snapshots) { + for (auto& s : desc.snapshots) { // Calculate restorability of each snapshot. Assume true, then try to prove false s.restorable = true; // If this is not a single-version snapshot then see if the available contiguous logs cover its range - if(s.beginVersion != s.endVersion) { - if(!desc.minLogBegin.present() || desc.minLogBegin.get() > s.beginVersion) + if (s.beginVersion != s.endVersion) { + if (!desc.minLogBegin.present() || desc.minLogBegin.get() > s.beginVersion) s.restorable = false; - if(!desc.contiguousLogEnd.present() || desc.contiguousLogEnd.get() <= s.endVersion) + if (!desc.contiguousLogEnd.present() || desc.contiguousLogEnd.get() <= s.endVersion) s.restorable = false; } @@ -822,20 +874,22 @@ public: // If the snapshot is at a single version then it requires no logs. Update min and max restorable. // TODO: Somehow check / report if the restorable range is not or may not be contiguous. - if(s.beginVersion == s.endVersion) { - if(!desc.minRestorableVersion.present() || s.endVersion < desc.minRestorableVersion.get()) + if (s.beginVersion == s.endVersion) { + if (!desc.minRestorableVersion.present() || s.endVersion < desc.minRestorableVersion.get()) desc.minRestorableVersion = s.endVersion; - if(!desc.maxRestorableVersion.present() || s.endVersion > desc.maxRestorableVersion.get()) + if (!desc.maxRestorableVersion.present() || s.endVersion > desc.maxRestorableVersion.get()) desc.maxRestorableVersion = s.endVersion; } // If the snapshot is covered by the contiguous log chain then update min/max restorable. 
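// Restorability restated as a sketch (hypothetical expression, using the fields shown above): a
// multi-version snapshot is restorable iff the contiguous log chain starts at or before the
// snapshot's begin version and extends strictly past its end version; a single-version snapshot
// needs no logs at all.
//
//     bool restorable = (s.beginVersion == s.endVersion) ||
//                       (desc.minLogBegin.present() && desc.minLogBegin.get() <= s.beginVersion &&
//                        desc.contiguousLogEnd.present() && desc.contiguousLogEnd.get() > s.endVersion);
//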
- if(desc.minLogBegin.present() && s.beginVersion >= desc.minLogBegin.get() && s.endVersion < desc.contiguousLogEnd.get()) { - if(!desc.minRestorableVersion.present() || s.endVersion < desc.minRestorableVersion.get()) + if (desc.minLogBegin.present() && s.beginVersion >= desc.minLogBegin.get() && + s.endVersion < desc.contiguousLogEnd.get()) { + if (!desc.minRestorableVersion.present() || s.endVersion < desc.minRestorableVersion.get()) desc.minRestorableVersion = s.endVersion; - if(!desc.maxRestorableVersion.present() || (desc.contiguousLogEnd.get() - 1) > desc.maxRestorableVersion.get()) + if (!desc.maxRestorableVersion.present() || + (desc.contiguousLogEnd.get() - 1) > desc.maxRestorableVersion.get()) desc.maxRestorableVersion = desc.contiguousLogEnd.get() - 1; } } @@ -845,34 +899,41 @@ public: // Uses the virtual methods to describe the backup contents Future describeBackup(bool deepScan, Version logStartVersionOverride) { - return describeBackup_impl(Reference::addRef(this), deepScan, logStartVersionOverride); + return describeBackup_impl( + Reference::addRef(this), deepScan, logStartVersionOverride); } - ACTOR static Future expireData_impl(Reference bc, Version expireEndVersion, bool force, ExpireProgress *progress, Version restorableBeginVersion) { - if(progress != nullptr) { + ACTOR static Future expireData_impl(Reference bc, + Version expireEndVersion, + bool force, + ExpireProgress* progress, + Version restorableBeginVersion) { + if (progress != nullptr) { progress->step = "Describing backup"; progress->total = 0; } TraceEvent("BackupContainerFileSystemExpire1") - .detail("URL", bc->getURL()) - .detail("ExpireEndVersion", expireEndVersion) - .detail("RestorableBeginVersion", restorableBeginVersion); + .detail("URL", bc->getURL()) + .detail("ExpireEndVersion", expireEndVersion) + .detail("RestorableBeginVersion", restorableBeginVersion); // Get the backup description. 
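// Usage sketch (hypothetical caller; `cutoff` and `keepFrom` are illustrative variables): delete
// all data before `cutoff` while guaranteeing the backup stays restorable at or after `keepFrom`.
// Negative versions are accepted and resolved below against the backup's maximum log end version,
// and with force = false the call throws backup_cannot_expire if the guarantee cannot be met.
//
//     state ExpireProgress progress;
//     wait(bc->expireData(cutoff, false, &progress, keepFrom));
//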
state BackupDescription desc = wait(bc->describeBackup(false, expireEndVersion)); // Resolve relative versions using max log version - expireEndVersion = resolveRelativeVersion(desc.maxLogEnd, expireEndVersion, "ExpireEndVersion", invalid_option_value()); - restorableBeginVersion = resolveRelativeVersion(desc.maxLogEnd, restorableBeginVersion, "RestorableBeginVersion", invalid_option_value()); + expireEndVersion = + resolveRelativeVersion(desc.maxLogEnd, expireEndVersion, "ExpireEndVersion", invalid_option_value()); + restorableBeginVersion = resolveRelativeVersion( + desc.maxLogEnd, restorableBeginVersion, "RestorableBeginVersion", invalid_option_value()); // It would be impossible to have restorability to any version < expireEndVersion after expiring to that version - if(restorableBeginVersion < expireEndVersion) + if (restorableBeginVersion < expireEndVersion) throw backup_cannot_expire(); // If the expire request is to a version at or before the previous version to which data was already deleted // then do nothing and just return - if(expireEndVersion <= desc.expiredEndVersion.orDefault(invalidVersion)) { + if (expireEndVersion <= desc.expiredEndVersion.orDefault(invalidVersion)) { return Void(); } @@ -881,8 +942,9 @@ public: // - begins at or after expireEndVersion // - ends at or before restorableBeginVersion state bool forceNeeded = true; - for(KeyspaceSnapshotFile &s : desc.snapshots) { - if(s.restorable.orDefault(false) && s.beginVersion >= expireEndVersion && s.endVersion <= restorableBeginVersion) { + for (KeyspaceSnapshotFile& s : desc.snapshots) { + if (s.restorable.orDefault(false) && s.beginVersion >= expireEndVersion && + s.endVersion <= restorableBeginVersion) { forceNeeded = false; break; } @@ -892,41 +954,41 @@ public: // Note that it is possible for there to be no actual files in the backup prior to expireEndVersion, // if they were externally deleted or an expire operation deleted them but was terminated before // updating expireEndVersion - if(forceNeeded && !force) + if (forceNeeded && !force) throw backup_cannot_expire(); // Start scan for files to delete at the last completed expire operation's end or 0. 
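// The force rule above, restated as a sketch (hypothetical predicate): expiring without force is
// allowed only when some already-restorable snapshot lies entirely within
// [expireEndVersion, restorableBeginVersion], so at least one restore point survives the deletion.
//
//     bool needsForce = std::none_of(desc.snapshots.begin(), desc.snapshots.end(),
//                                    [&](const KeyspaceSnapshotFile& s) {
//                                        return s.restorable.orDefault(false) &&
//                                               s.beginVersion >= expireEndVersion &&
//                                               s.endVersion <= restorableBeginVersion;
//                                    });
//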
state Version scanBegin = desc.expiredEndVersion.orDefault(0); TraceEvent("BackupContainerFileSystemExpire2") - .detail("URL", bc->getURL()) - .detail("ExpireEndVersion", expireEndVersion) - .detail("RestorableBeginVersion", restorableBeginVersion) - .detail("ScanBeginVersion", scanBegin); + .detail("URL", bc->getURL()) + .detail("ExpireEndVersion", expireEndVersion) + .detail("RestorableBeginVersion", restorableBeginVersion) + .detail("ScanBeginVersion", scanBegin); state std::vector logs; state std::vector ranges; - if(progress != nullptr) { + if (progress != nullptr) { progress->step = "Listing files"; } // Get log files or range files that contain any data at or before expireEndVersion - wait(store(logs, bc->listLogFiles(scanBegin, expireEndVersion - 1)) && store(ranges, bc->listRangeFiles(scanBegin, expireEndVersion - 1))); + wait(store(logs, bc->listLogFiles(scanBegin, expireEndVersion - 1)) && + store(ranges, bc->listRangeFiles(scanBegin, expireEndVersion - 1))); // The new logBeginVersion will be taken from the last log file, if there is one state Optional newLogBeginVersion; - if(!logs.empty()) { + if (!logs.empty()) { // Linear scan the unsorted logs to find the latest one in sorted order - LogFile &last = *std::max_element(logs.begin(), logs.end()); + LogFile& last = *std::max_element(logs.begin(), logs.end()); // If the last log ends at expireEndVersion then that will be the next log begin - if(last.endVersion == expireEndVersion) { + if (last.endVersion == expireEndVersion) { newLogBeginVersion = expireEndVersion; - } - else { + } else { // If the last log overlaps the expiredEnd then use the log's begin version and move the expiredEnd // back to match it and keep the last log file - if(last.endVersion > expireEndVersion) { + if (last.endVersion > expireEndVersion) { newLogBeginVersion = last.beginVersion; // Instead of modifying this potentially very large vector, just clear LogFile @@ -941,27 +1003,28 @@ public: state std::vector toDelete; // Move filenames out of vector then destroy it to save memory - for(auto const &f : logs) { + for (auto const& f : logs) { // We may have cleared the last log file earlier so skip any empty filenames - if(!f.fileName.empty()) { + if (!f.fileName.empty()) { toDelete.push_back(std::move(f.fileName)); } } logs.clear(); // Move filenames out of vector then destroy it to save memory - for(auto const &f : ranges) { - // The file version must be checked here again because it is likely that expireEndVersion is in the middle of a log file, in which case - // after the log and range file listings are done (using the original expireEndVersion) the expireEndVersion will be moved back slightly - // to the begin version of the last log file found (which is also the first log to not be deleted) - if(f.version < expireEndVersion) { + for (auto const& f : ranges) { + // The file version must be checked here again because it is likely that expireEndVersion is in the middle + // of a log file, in which case after the log and range file listings are done (using the original + // expireEndVersion) the expireEndVersion will be moved back slightly to the begin version of the last log + // file found (which is also the first log to not be deleted) + if (f.version < expireEndVersion) { toDelete.push_back(std::move(f.fileName)); } } ranges.clear(); - for(auto const &f : desc.snapshots) { - if(f.endVersion < expireEndVersion) + for (auto const& f : desc.snapshots) { + if (f.endVersion < expireEndVersion) toDelete.push_back(std::move(f.fileName)); } desc = 
BackupDescription(); @@ -969,15 +1032,15 @@ public: // We are about to start deleting files, at which point all data prior to expireEndVersion is considered // 'unreliable' as some or all of it will be missing. So before deleting anything, read unreliableEndVersion // (don't use cached value in desc) and update its value if it is missing or < expireEndVersion - if(progress != nullptr) { + if (progress != nullptr) { progress->step = "Initial metadata update"; } Optional metaUnreliableEnd = wait(bc->unreliableEndVersion().get()); - if(metaUnreliableEnd.orDefault(0) < expireEndVersion) { + if (metaUnreliableEnd.orDefault(0) < expireEndVersion) { wait(bc->unreliableEndVersion().set(expireEndVersion)); } - if(progress != nullptr) { + if (progress != nullptr) { progress->step = "Deleting files"; progress->total = toDelete.size(); progress->done = 0; @@ -987,10 +1050,10 @@ public: // delete actor states would use even more if they all existed at the same time. state std::list> deleteFutures; - while(!toDelete.empty() || !deleteFutures.empty()) { + while (!toDelete.empty() || !deleteFutures.empty()) { // While there are files to delete and budget in the deleteFutures list, start a delete - while(!toDelete.empty() && deleteFutures.size() < CLIENT_KNOBS->BACKUP_CONCURRENT_DELETES) { + while (!toDelete.empty() && deleteFutures.size() < CLIENT_KNOBS->BACKUP_CONCURRENT_DELETES) { deleteFutures.push_back(bc->deleteFile(toDelete.back())); toDelete.pop_back(); } @@ -1000,23 +1063,23 @@ public: // than the delete concurrency limit. state int targetFuturesSize = toDelete.empty() ? 0 : (CLIENT_KNOBS->BACKUP_CONCURRENT_DELETES - 1); - while(deleteFutures.size() > targetFuturesSize) { + while (deleteFutures.size() > targetFuturesSize) { wait(deleteFutures.front()); - if(progress != nullptr) { + if (progress != nullptr) { ++progress->done; } deleteFutures.pop_front(); } } - if(progress != nullptr) { + if (progress != nullptr) { progress->step = "Final metadata update"; progress->total = 0; } // Update the expiredEndVersion metadata to indicate that everything prior to that version has been // successfully deleted if the current version is lower or missing Optional metaExpiredEnd = wait(bc->expiredEndVersion().get()); - if(metaExpiredEnd.orDefault(0) < expireEndVersion) { + if (metaExpiredEnd.orDefault(0) < expireEndVersion) { wait(bc->expiredEndVersion().set(expireEndVersion)); } @@ -1024,20 +1087,28 @@ public: } // Delete all data up to (but not including endVersion) - Future expireData(Version expireEndVersion, bool force, ExpireProgress *progress, Version restorableBeginVersion) { - return expireData_impl(Reference::addRef(this), expireEndVersion, force, progress, restorableBeginVersion); + Future expireData(Version expireEndVersion, + bool force, + ExpireProgress* progress, + Version restorableBeginVersion) { + return expireData_impl(Reference::addRef(this), + expireEndVersion, + force, + progress, + restorableBeginVersion); } - ACTOR static Future> getRestoreSet_impl(Reference bc, Version targetVersion) { + ACTOR static Future> getRestoreSet_impl(Reference bc, + Version targetVersion) { // Find the most recent keyrange snapshot to end at or before targetVersion state Optional snapshot; std::vector snapshots = wait(bc->listKeyspaceSnapshots()); - for(auto const &s : snapshots) { - if(s.endVersion <= targetVersion) + for (auto const& s : snapshots) { + if (s.endVersion <= targetVersion) snapshot = s; } - if(snapshot.present()) { + if (snapshot.present()) { state RestorableFileSet restorable; 
restorable.snapshot = snapshot.get(); restorable.targetVersion = targetVersion; @@ -1046,7 +1117,7 @@ public: restorable.ranges = ranges; // No logs needed if there is a complete key space snapshot at the target version. - if(snapshot.get().beginVersion == snapshot.get().endVersion && snapshot.get().endVersion == targetVersion) + if (snapshot.get().beginVersion == snapshot.get().endVersion && snapshot.get().endVersion == targetVersion) return Optional(restorable); state std::vector logs = wait(bc->listLogFiles(snapshot.get().beginVersion, targetVersion)); @@ -1055,23 +1126,23 @@ public: std::sort(logs.begin(), logs.end()); // If there are logs and the first one starts at or before the snapshot begin version then proceed - if(!logs.empty() && logs.front().beginVersion <= snapshot.get().beginVersion) { + if (!logs.empty() && logs.front().beginVersion <= snapshot.get().beginVersion) { auto i = logs.begin(); Version end = i->endVersion; restorable.logs.push_back(*i); // Add logs to restorable logs set until continuity is broken OR we reach targetVersion - while(++i != logs.end()) { - if(i->beginVersion > end || i->beginVersion > targetVersion) + while (++i != logs.end()) { + if (i->beginVersion > end || i->beginVersion > targetVersion) break; // If the next link in the log chain is found, update the end - if(i->beginVersion == end) { + if (i->beginVersion == end) { restorable.logs.push_back(*i); end = i->endVersion; } } - if(end >= targetVersion) { + if (end >= targetVersion) { return Optional(restorable); } } @@ -1080,90 +1151,95 @@ public: return Optional(); } - Future> getRestoreSet(Version targetVersion){ + Future> getRestoreSet(Version targetVersion) { return getRestoreSet_impl(Reference::addRef(this), targetVersion); } private: struct VersionProperty { - VersionProperty(Reference bc, std::string name) : bc(bc), path("properties/" + name) {} + VersionProperty(Reference bc, std::string name) + : bc(bc), path("properties/" + name) {} Reference bc; std::string path; - Future> get() { - return readVersionProperty(bc, path); - } - Future set(Version v) { - return writeVersionProperty(bc, path, v); - } - Future clear() { - return bc->deleteFile(path); - } + Future> get() { return readVersionProperty(bc, path); } + Future set(Version v) { return writeVersionProperty(bc, path, v); } + Future clear() { return bc->deleteFile(path); } }; public: - // To avoid the need to scan the underyling filesystem in many cases, some important version boundaries are stored in named files. - // These versions also indicate what version ranges are known to be deleted or partially deleted. + // To avoid the need to scan the underyling filesystem in many cases, some important version boundaries are stored + // in named files. These versions also indicate what version ranges are known to be deleted or partially deleted. // // The values below describe version ranges as follows: // 0 - expiredEndVersion All files in this range have been deleted // expiredEndVersion - unreliableEndVersion Some files in this range may have been deleted. 
// - // logBeginVersion - logEnd Log files are contiguous in this range and have NOT been deleted by fdbbackup - // logEnd - infinity Files in this range may or may not exist yet + // logBeginVersion - logEnd Log files are contiguous in this range and have NOT been deleted by + // fdbbackup logEnd - infinity Files in this range may or may not exist yet // - VersionProperty logBeginVersion() { return {Reference::addRef(this), "log_begin_version"}; } - VersionProperty logEndVersion() { return {Reference::addRef(this), "log_end_version"}; } - VersionProperty expiredEndVersion() { return {Reference::addRef(this), "expired_end_version"}; } - VersionProperty unreliableEndVersion() { return {Reference::addRef(this), "unreliable_end_version"}; } + VersionProperty logBeginVersion() { + return { Reference::addRef(this), "log_begin_version" }; + } + VersionProperty logEndVersion() { + return { Reference::addRef(this), "log_end_version" }; + } + VersionProperty expiredEndVersion() { + return { Reference::addRef(this), "expired_end_version" }; + } + VersionProperty unreliableEndVersion() { + return { Reference::addRef(this), "unreliable_end_version" }; + } - ACTOR static Future writeVersionProperty(Reference bc, std::string path, Version v) { + ACTOR static Future writeVersionProperty(Reference bc, + std::string path, + Version v) { try { state Reference f = wait(bc->writeFile(path)); std::string s = format("%lld", v); wait(f->append(s.data(), s.size())); wait(f->finish()); return Void(); - } catch(Error &e) { + } catch (Error& e) { TraceEvent(SevWarn, "BackupContainerWritePropertyFailed") - .error(e) - .detail("URL", bc->getURL()) - .detail("Path", path); + .error(e) + .detail("URL", bc->getURL()) + .detail("Path", path); throw; } } - ACTOR static Future> readVersionProperty(Reference bc, std::string path) { + ACTOR static Future> readVersionProperty(Reference bc, + std::string path) { try { state Reference f = wait(bc->readFile(path)); state int64_t size = wait(f->size()); state std::string s; s.resize(size); - int rs = wait(f->read((uint8_t *)s.data(), size, 0)); + int rs = wait(f->read((uint8_t*)s.data(), size, 0)); Version v; int len; - if(rs == size && sscanf(s.c_str(), "%" SCNd64 "%n", &v, &len) == 1 && len == size) + if (rs == size && sscanf(s.c_str(), "%" SCNd64 "%n", &v, &len) == 1 && len == size) return v; - TraceEvent(SevWarn, "BackupContainerInvalidProperty") - .detail("URL", bc->getURL()) - .detail("Path", path); + TraceEvent(SevWarn, "BackupContainerInvalidProperty").detail("URL", bc->getURL()).detail("Path", path); throw backup_invalid_info(); - } catch(Error &e) { - if(e.code() == error_code_file_not_found) + } catch (Error& e) { + if (e.code() == error_code_file_not_found) return Optional(); TraceEvent(SevWarn, "BackupContainerReadPropertyFailed") - .error(e) - .detail("URL", bc->getURL()) - .detail("Path", path); + .error(e) + .detail("URL", bc->getURL()) + .detail("Path", path); throw; } } }; -class BackupContainerLocalDirectory : public BackupContainerFileSystem, ReferenceCounted { +class BackupContainerLocalDirectory : public BackupContainerFileSystem, + ReferenceCounted { public: void addref() { return ReferenceCounted::addref(); } void delref() { return ReferenceCounted::delref(); } @@ -1172,16 +1248,21 @@ public: BackupContainerLocalDirectory(std::string url) { std::string path; - if(url.find("file://") != 0) { - TraceEvent(SevWarn, "BackupContainerLocalDirectory").detail("Description", "Invalid URL for BackupContainerLocalDirectory").detail("URL", url); + if 
(url.find("file://") != 0) { + TraceEvent(SevWarn, "BackupContainerLocalDirectory") + .detail("Description", "Invalid URL for BackupContainerLocalDirectory") + .detail("URL", url); } path = url.substr(7); // Remove trailing slashes on path path.erase(path.find_last_not_of("\\/") + 1); - if(!g_network->isSimulated() && path != abspath(path)) { - TraceEvent(SevWarn, "BackupContainerLocalDirectory").detail("Description", "Backup path must be absolute (e.g. file:///some/path)").detail("URL", url).detail("Path", path); + if (!g_network->isSimulated() && path != abspath(path)) { + TraceEvent(SevWarn, "BackupContainerLocalDirectory") + .detail("Description", "Backup path must be absolute (e.g. file:///some/path)") + .detail("URL", url) + .detail("Path", path); throw io_error(); } @@ -1191,23 +1272,28 @@ public: static Future> listURLs(std::string url) { std::string path; - if(url.find("file://") != 0) { - TraceEvent(SevWarn, "BackupContainerLocalDirectory").detail("Description", "Invalid URL for BackupContainerLocalDirectory").detail("URL", url); + if (url.find("file://") != 0) { + TraceEvent(SevWarn, "BackupContainerLocalDirectory") + .detail("Description", "Invalid URL for BackupContainerLocalDirectory") + .detail("URL", url); } path = url.substr(7); // Remove trailing slashes on path path.erase(path.find_last_not_of("\\/") + 1); - if(!g_network->isSimulated() && path != abspath(path)) { - TraceEvent(SevWarn, "BackupContainerLocalDirectory").detail("Description", "Backup path must be absolute (e.g. file:///some/path)").detail("URL", url).detail("Path", path); + if (!g_network->isSimulated() && path != abspath(path)) { + TraceEvent(SevWarn, "BackupContainerLocalDirectory") + .detail("Description", "Backup path must be absolute (e.g. file:///some/path)") + .detail("URL", url) + .detail("Path", path); throw io_error(); } std::vector dirs = platform::listDirectories(path); std::vector results; - for(auto &r : dirs) { - if(r == "." || r == "..") + for (auto& r : dirs) { + if (r == "." || r == "..") continue; results.push_back(std::string("file://") + joinPath(path, r)); } @@ -1216,18 +1302,16 @@ public: } Future create() { - // Nothing should be done here because create() can be called by any process working with the container URL, such as fdbbackup. - // Since "local directory" containers are by definition local to the machine they are accessed from, - // the container's creation (in this case the creation of a directory) must be ensured prior to every file creation, - // which is done in openFile(). - // Creating the directory here will result in unnecessary directories being created on machines that run fdbbackup but not agents. + // Nothing should be done here because create() can be called by any process working with the container URL, + // such as fdbbackup. Since "local directory" containers are by definition local to the machine they are + // accessed from, the container's creation (in this case the creation of a directory) must be ensured prior to + // every file creation, which is done in openFile(). Creating the directory here will result in unnecessary + // directories being created on machines that run fdbbackup but not agents. 
return Void(); } // The container exists if the folder it resides in exists - Future exists() { - return directoryExists(m_path); - } + Future exists() { return directoryExists(m_path); } Future> readFile(std::string path) { int flags = IAsyncFile::OPEN_NO_AIO | IAsyncFile::OPEN_READONLY | IAsyncFile::OPEN_UNCACHED; @@ -1235,32 +1319,32 @@ public: // so create a symbolic link to make each file opening appear to be unique. This could also work in production // but only if the source directory is writeable which shouldn't be required for a restore. std::string fullPath = joinPath(m_path, path); - #ifndef _WIN32 - if(g_network->isSimulated()) { - if(!fileExists(fullPath)) +#ifndef _WIN32 + if (g_network->isSimulated()) { + if (!fileExists(fullPath)) throw file_not_found(); std::string uniquePath = fullPath + "." + deterministicRandom()->randomUniqueID().toString() + ".lnk"; unlink(uniquePath.c_str()); ASSERT(symlink(basename(path).c_str(), uniquePath.c_str()) == 0); fullPath = uniquePath; } - // Opening cached mode forces read/write mode at a lower level, overriding the readonly request. So cached mode - // can't be used because backup files are read-only. Cached mode can only help during restore task retries handled - // by the same process that failed the first task execution anyway, which is a very rare case. - #endif +// Opening cached mode forces read/write mode at a lower level, overriding the readonly request. So cached mode +// can't be used because backup files are read-only. Cached mode can only help during restore task retries handled +// by the same process that failed the first task execution anyway, which is a very rare case. +#endif Future> f = IAsyncFileSystem::filesystem()->open(fullPath, flags, 0644); - if(g_network->isSimulated()) { + if (g_network->isSimulated()) { int blockSize = 0; // Extract block size from the filename, if present size_t lastComma = path.find_last_of(','); - if(lastComma != path.npos) { + if (lastComma != path.npos) { blockSize = atoi(path.substr(lastComma + 1).c_str()); } - if(blockSize <= 0) { + if (blockSize <= 0) { blockSize = deterministicRandom()->randomInt(1e4, 1e6); } - if(deterministicRandom()->random01() < .01) { + if (deterministicRandom()->random01() < .01) { blockSize /= deterministicRandom()->randomInt(1, 3); } @@ -1278,18 +1362,18 @@ public: class BackupFile : public IBackupFile, ReferenceCounted { public: BackupFile(std::string fileName, Reference file, std::string finalFullPath) - : IBackupFile(fileName), m_file(file), m_finalFullPath(finalFullPath), m_writeOffset(0), m_blockSize(CLIENT_KNOBS->BACKUP_LOCAL_FILE_WRITE_BLOCK) - { - if(BUGGIFY) { + : IBackupFile(fileName), m_file(file), m_finalFullPath(finalFullPath), m_writeOffset(0), + m_blockSize(CLIENT_KNOBS->BACKUP_LOCAL_FILE_WRITE_BLOCK) { + if (BUGGIFY) { m_blockSize = deterministicRandom()->randomInt(100, 20000); } m_buffer.reserve(m_buffer.arena(), m_blockSize); } - Future append(const void *data, int len) { - m_buffer.append(m_buffer.arena(), (const uint8_t *)data, len); + Future append(const void* data, int len) { + m_buffer.append(m_buffer.arena(), (const uint8_t*)data, len); - if(m_buffer.size() >= m_blockSize) { + if (m_buffer.size() >= m_blockSize) { return flush(m_blockSize); } @@ -1298,7 +1382,7 @@ public: Future flush(int size) { // Avoid empty write - if(size == 0) { + if (size == 0) { return Void(); } @@ -1318,7 +1402,7 @@ public: ACTOR static Future finish_impl(Reference f) { wait(f->flush(f->m_buffer.size())); - wait(f->m_file->truncate(f->size())); // Some 
IAsyncFile implementations extend in whole block sizes. + wait(f->m_file->truncate(f->size())); // Some IAsyncFile implementations extend in whole block sizes. wait(f->m_file->sync()); std::string name = f->m_file->getFilename(); f->m_file.clear(); @@ -1326,13 +1410,9 @@ public: return Void(); } - int64_t size() const { - return m_buffer.size() + m_writeOffset; - } + int64_t size() const { return m_buffer.size() + m_writeOffset; } - Future finish() { - return finish_impl(Reference::addRef(this)); - } + Future finish() { return finish_impl(Reference::addRef(this)); } void addref() { return ReferenceCounted::addref(); } void delref() { return ReferenceCounted::delref(); } @@ -1346,14 +1426,14 @@ public: }; Future> writeFile(std::string path) { - int flags = IAsyncFile::OPEN_NO_AIO | IAsyncFile::OPEN_UNCACHED | IAsyncFile::OPEN_CREATE | IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE | IAsyncFile::OPEN_READWRITE; + int flags = IAsyncFile::OPEN_NO_AIO | IAsyncFile::OPEN_UNCACHED | IAsyncFile::OPEN_CREATE | + IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE | IAsyncFile::OPEN_READWRITE; std::string fullPath = joinPath(m_path, path); platform::createDirectory(parentDirectory(fullPath)); - std::string temp = fullPath + "." + deterministicRandom()->randomUniqueID().toString() + ".temp"; + std::string temp = fullPath + "." + deterministicRandom()->randomUniqueID().toString() + ".temp"; Future> f = IAsyncFileSystem::filesystem()->open(temp, flags, 0644); - return map(f, [=](Reference f) { - return Reference(new BackupFile(path, f, fullPath)); - }); + return map(f, + [=](Reference f) { return Reference(new BackupFile(path, f, fullPath)); }); } Future deleteFile(std::string path) { @@ -1361,36 +1441,41 @@ public: return Void(); } - Future listFiles(std::string path, std::function) { + Future listFiles(std::string path, std::function) { FilesAndSizesT results; std::vector files; platform::findFilesRecursively(joinPath(m_path, path), files); - // Remove .lnk files from results, they are a side effect of a backup that was *read* during simulation. See openFile() above for more info on why they are created. - if(g_network->isSimulated()) - files.erase(std::remove_if(files.begin(), files.end(), [](std::string const &f) { return StringRef(f).endsWith(LiteralStringRef(".lnk")); }), files.end()); + // Remove .lnk files from results, they are a side effect of a backup that was *read* during simulation. See + // openFile() above for more info on why they are created. + if (g_network->isSimulated()) + files.erase( + std::remove_if(files.begin(), + files.end(), + [](std::string const& f) { return StringRef(f).endsWith(LiteralStringRef(".lnk")); }), + files.end()); - for(auto &f : files) { + for (auto& f : files) { // Hide .part or .temp files. StringRef s(f); - if(!s.endsWith(LiteralStringRef(".part")) && !s.endsWith(LiteralStringRef(".temp"))) - results.push_back({f.substr(m_path.size() + 1), ::fileSize(f)}); + if (!s.endsWith(LiteralStringRef(".part")) && !s.endsWith(LiteralStringRef(".temp"))) + results.push_back({ f.substr(m_path.size() + 1), ::fileSize(f) }); } return results; } - Future deleteContainer(int *pNumDeleted) { + Future deleteContainer(int* pNumDeleted) { // In order to avoid deleting some random directory due to user error, first describe the backup // and make sure it has something in it. 
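// Usage sketch (hypothetical caller holding a container reference `c`): because of the check
// described above and implemented just below, deleteContainer() refuses to erase a directory that
// does not look like a backup; a non-null pointer receives the count returned by
// platform::eraseDirectoryRecursive().
//
//     state int numDeleted = 0;
//     wait(c->deleteContainer(&numDeleted));
//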
- return map(describeBackup(false, invalidVersion), [=](BackupDescription const &desc) { + return map(describeBackup(false, invalidVersion), [=](BackupDescription const& desc) { // If the backup has no snapshots and no logs then it's probably not a valid backup - if(desc.snapshots.size() == 0 && !desc.minLogBegin.present()) + if (desc.snapshots.size() == 0 && !desc.minLogBegin.present()) throw backup_invalid_url(); int count = platform::eraseDirectoryRecursive(m_path); - if(pNumDeleted != nullptr) + if (pNumDeleted != nullptr) *pNumDeleted = count; return Void(); @@ -1416,26 +1501,26 @@ private: // All backup data goes into a single bucket std::string m_bucket; - std::string dataPath(const std::string path) { - return DATAFOLDER + "/" + m_name + "/" + path; - } + std::string dataPath(const std::string path) { return DATAFOLDER + "/" + m_name + "/" + path; } // Get the path of the backups's index entry - std::string indexEntry() { - return INDEXFOLDER + "/" + m_name; - } + std::string indexEntry() { return INDEXFOLDER + "/" + m_name; } public: - BackupContainerBlobStore(Reference bstore, std::string name, const BlobStoreEndpoint::ParametersT ¶ms) + BackupContainerBlobStore(Reference bstore, + std::string name, + const BlobStoreEndpoint::ParametersT& params) : m_bstore(bstore), m_name(name), m_bucket("FDB_BACKUPS_V2") { // Currently only one parameter is supported, "bucket" - for(auto &kv : params) { - if(kv.first == "bucket") { + for (auto& kv : params) { + if (kv.first == "bucket") { m_bucket = kv.second; continue; } - TraceEvent(SevWarn, "BackupContainerBlobStoreInvalidParameter").detail("Name", kv.first).detail("Value", kv.second); + TraceEvent(SevWarn, "BackupContainerBlobStoreInvalidParameter") + .detail("Name", kv.first) + .detail("Value", kv.second); IBackupContainer::lastOpenError = format("Unknown URL parameter: '%s'", kv.first.c_str()); throw backup_invalid_url(); } @@ -1451,32 +1536,31 @@ public: virtual ~BackupContainerBlobStore() {} Future> readFile(std::string path) { - return Reference( - new AsyncFileReadAheadCache( - Reference(new AsyncFileBlobStoreRead(m_bstore, m_bucket, dataPath(path))), - m_bstore->knobs.read_block_size, - m_bstore->knobs.read_ahead_blocks, - m_bstore->knobs.concurrent_reads_per_file, - m_bstore->knobs.read_cache_blocks_per_file - ) - ); + return Reference(new AsyncFileReadAheadCache( + Reference(new AsyncFileBlobStoreRead(m_bstore, m_bucket, dataPath(path))), + m_bstore->knobs.read_block_size, + m_bstore->knobs.read_ahead_blocks, + m_bstore->knobs.concurrent_reads_per_file, + m_bstore->knobs.read_cache_blocks_per_file)); } ACTOR static Future> listURLs(Reference bstore, std::string bucket) { state std::string basePath = INDEXFOLDER + '/'; BlobStoreEndpoint::ListResult contents = wait(bstore->listBucket(bucket, basePath)); std::vector results; - for(auto &f : contents.objects) { - results.push_back(bstore->getResourceURL(f.name.substr(basePath.size()), format("bucket=%s", bucket.c_str()))); + for (auto& f : contents.objects) { + results.push_back( + bstore->getResourceURL(f.name.substr(basePath.size()), format("bucket=%s", bucket.c_str()))); } return results; } class BackupFile : public IBackupFile, ReferenceCounted { public: - BackupFile(std::string fileName, Reference file) : IBackupFile(fileName), m_file(file), m_offset(0) {} + BackupFile(std::string fileName, Reference file) + : IBackupFile(fileName), m_file(file), m_offset(0) {} - Future append(const void *data, int len) { + Future append(const void* data, int len) { Future r = m_file->write(data, 
len, m_offset); m_offset += len; return r; @@ -1484,47 +1568,51 @@ public: Future finish() { Reference self = Reference::addRef(this); - return map(m_file->sync(), [=](Void _) { self->m_file.clear(); return Void(); }); + return map(m_file->sync(), [=](Void _) { + self->m_file.clear(); + return Void(); + }); } - int64_t size() const { - return m_offset; - } + int64_t size() const { return m_offset; } void addref() { return ReferenceCounted::addref(); } void delref() { return ReferenceCounted::delref(); } + private: Reference m_file; int64_t m_offset; }; Future> writeFile(std::string path) { - return Reference(new BackupFile(path, Reference(new AsyncFileBlobStoreWrite(m_bstore, m_bucket, dataPath(path))))); + return Reference(new BackupFile( + path, Reference(new AsyncFileBlobStoreWrite(m_bstore, m_bucket, dataPath(path))))); } - Future deleteFile(std::string path) { - return m_bstore->deleteObject(m_bucket, dataPath(path)); - } + Future deleteFile(std::string path) { return m_bstore->deleteObject(m_bucket, dataPath(path)); } - ACTOR static Future listFiles_impl(Reference bc, std::string path, std::function pathFilter) { + ACTOR static Future listFiles_impl(Reference bc, + std::string path, + std::function pathFilter) { // pathFilter expects container based paths, so create a wrapper which converts a raw path // to a container path by removing the known backup name prefix. state int prefixTrim = bc->dataPath("").size(); - std::function rawPathFilter = [=](const std::string &folderPath) { + std::function rawPathFilter = [=](const std::string& folderPath) { ASSERT(folderPath.size() >= prefixTrim); return pathFilter(folderPath.substr(prefixTrim)); }; - state BlobStoreEndpoint::ListResult result = wait(bc->m_bstore->listBucket(bc->m_bucket, bc->dataPath(path), '/', std::numeric_limits::max(), rawPathFilter)); + state BlobStoreEndpoint::ListResult result = wait(bc->m_bstore->listBucket( + bc->m_bucket, bc->dataPath(path), '/', std::numeric_limits::max(), rawPathFilter)); FilesAndSizesT files; - for(auto &o : result.objects) { + for (auto& o : result.objects) { ASSERT(o.name.size() >= prefixTrim); - files.push_back({o.name.substr(prefixTrim), o.size}); + files.push_back({ o.name.substr(prefixTrim), o.size }); } return files; } - Future listFiles(std::string path, std::function pathFilter) { + Future listFiles(std::string path, std::function pathFilter) { return listFiles_impl(Reference::addRef(this), path, pathFilter); } @@ -1533,25 +1621,21 @@ public: // Check/create the index entry bool exists = wait(bc->m_bstore->objectExists(bc->m_bucket, bc->indexEntry())); - if(!exists) { + if (!exists) { wait(bc->m_bstore->writeEntireFile(bc->m_bucket, bc->indexEntry(), "")); } return Void(); } - Future create() { - return create_impl(Reference::addRef(this)); - } + Future create() { return create_impl(Reference::addRef(this)); } // The container exists if the index entry in the blob bucket exists - Future exists() { - return m_bstore->objectExists(m_bucket, indexEntry()); - } + Future exists() { return m_bstore->objectExists(m_bucket, indexEntry()); } - ACTOR static Future deleteContainer_impl(Reference bc, int *pNumDeleted) { + ACTOR static Future deleteContainer_impl(Reference bc, int* pNumDeleted) { bool e = wait(bc->exists()); - if(!e) { + if (!e) { TraceEvent(SevWarnAlways, "BackupContainerDoesNotExist").detail("URL", bc->getURL()); throw backup_does_not_exist(); } @@ -1565,13 +1649,11 @@ public: return Void(); } - Future deleteContainer(int *pNumDeleted) { + Future deleteContainer(int* pNumDeleted) { 
return deleteContainer_impl(Reference::addRef(this), pNumDeleted); } - std::string getBucket() const { - return m_bucket; - } + std::string getBucket() const { return m_bucket; } }; const std::string BackupContainerBlobStore::DATAFOLDER = "data"; @@ -1587,48 +1669,47 @@ std::vector IBackupContainer::getURLFormats() { } // Get an IBackupContainer based on a container URL string -Reference IBackupContainer::openContainer(std::string url) -{ +Reference IBackupContainer::openContainer(std::string url) { static std::map> m_cache; - Reference &r = m_cache[url]; - if(r) + Reference& r = m_cache[url]; + if (r) return r; try { StringRef u(url); - if(u.startsWith(LiteralStringRef("file://"))) + if (u.startsWith(LiteralStringRef("file://"))) r = Reference(new BackupContainerLocalDirectory(url)); - else if(u.startsWith(LiteralStringRef("blobstore://"))) { + else if (u.startsWith(LiteralStringRef("blobstore://"))) { std::string resource; // The URL parameters contain blobstore endpoint tunables as well as possible backup-specific options. BlobStoreEndpoint::ParametersT backupParams; - Reference bstore = BlobStoreEndpoint::fromString(url, &resource, &lastOpenError, &backupParams); + Reference bstore = + BlobStoreEndpoint::fromString(url, &resource, &lastOpenError, &backupParams); - if(resource.empty()) + if (resource.empty()) throw backup_invalid_url(); - for(auto c : resource) - if(!isalnum(c) && c != '_' && c != '-' && c != '.' && c != '/') + for (auto c : resource) + if (!isalnum(c) && c != '_' && c != '-' && c != '.' && c != '/') throw backup_invalid_url(); r = Reference(new BackupContainerBlobStore(bstore, resource, backupParams)); - } - else { + } else { lastOpenError = "invalid URL prefix"; throw backup_invalid_url(); } r->URL = url; return r; - } catch(Error &e) { - if(e.code() == error_code_actor_cancelled) + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) throw; TraceEvent m(SevWarn, "BackupContainer"); m.detail("Description", "Invalid container specification. 
See help."); m.detail("URL", url); m.error(e); - if(e.code() == error_code_backup_invalid_url) + if (e.code() == error_code_backup_invalid_url) m.detail("LastOpenError", lastOpenError); throw; @@ -1640,18 +1721,20 @@ Reference IBackupContainer::openContainer(std::string url) ACTOR Future> listContainers_impl(std::string baseURL) { try { StringRef u(baseURL); - if(u.startsWith(LiteralStringRef("file://"))) { + if (u.startsWith(LiteralStringRef("file://"))) { std::vector results = wait(BackupContainerLocalDirectory::listURLs(baseURL)); return results; - } - else if(u.startsWith(LiteralStringRef("blobstore://"))) { + } else if (u.startsWith(LiteralStringRef("blobstore://"))) { std::string resource; BlobStoreEndpoint::ParametersT backupParams; - Reference bstore = BlobStoreEndpoint::fromString(baseURL, &resource, &IBackupContainer::lastOpenError, &backupParams); + Reference bstore = + BlobStoreEndpoint::fromString(baseURL, &resource, &IBackupContainer::lastOpenError, &backupParams); - if(!resource.empty()) { - TraceEvent(SevWarn, "BackupContainer").detail("Description", "Invalid backup container base URL, resource aka path should be blank.").detail("URL", baseURL); + if (!resource.empty()) { + TraceEvent(SevWarn, "BackupContainer") + .detail("Description", "Invalid backup container base URL, resource aka path should be blank.") + .detail("URL", baseURL); throw backup_invalid_url(); } @@ -1660,22 +1743,21 @@ ACTOR Future> listContainers_impl(std::string baseURL) std::vector results = wait(BackupContainerBlobStore::listURLs(bstore, dummy.getBucket())); return results; - } - else { + } else { IBackupContainer::lastOpenError = "invalid URL prefix"; throw backup_invalid_url(); } - } catch(Error &e) { - if(e.code() == error_code_actor_cancelled) + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) throw; TraceEvent m(SevWarn, "BackupContainer"); - + m.detail("Description", "Invalid backup container URL prefix. See help."); m.detail("URL", baseURL); m.error(e); - if(e.code() == error_code_backup_invalid_url) + if (e.code() == error_code_backup_invalid_url) m.detail("LastOpenError", IBackupContainer::lastOpenError); throw; @@ -1688,11 +1770,13 @@ Future> IBackupContainer::listContainers(std::string ba ACTOR Future timeKeeperVersionFromDatetime(std::string datetime, Database db) { state KeyBackedMap versionMap(timeKeeperPrefixRange.begin); - state Reference tr = Reference(new ReadYourWritesTransaction(db)); + state Reference tr = + Reference(new ReadYourWritesTransaction(db)); state int64_t time = BackupAgentBase::parseTime(datetime); - if(time < 0) { - fprintf(stderr, "ERROR: Incorrect date/time or format. Format is %s.\n", BackupAgentBase::timeFormat().c_str()); + if (time < 0) { + fprintf( + stderr, "ERROR: Incorrect date/time or format. Format is %s.\n", BackupAgentBase::timeFormat().c_str()); throw backup_error(); } @@ -1700,20 +1784,21 @@ ACTOR Future timeKeeperVersionFromDatetime(std::string datetime, Databa try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); - state std::vector> results = wait( versionMap.getRange(tr, 0, time, 1, false, true) ); + state std::vector> results = + wait(versionMap.getRange(tr, 0, time, 1, false, true)); if (results.size() != 1) { // No key less than time was found in the database // Look for a key >= time. 
- wait( store( results, versionMap.getRange(tr, time, std::numeric_limits::max(), 1) ) ); + wait(store(results, versionMap.getRange(tr, time, std::numeric_limits::max(), 1))); - if(results.size() != 1) { + if (results.size() != 1) { fprintf(stderr, "ERROR: Unable to calculate a version for given date/time.\n"); throw backup_error(); } } // Adjust version found by the delta between time and the time found and min with 0. - auto &result = results[0]; + auto& result = results[0]; return std::max(0, result.second + (time - result.first) * CLIENT_KNOBS->CORE_VERSIONSPERSECOND); } catch (Error& e) { @@ -1735,13 +1820,14 @@ ACTOR Future> timeKeeperEpochsFromVersion(Version v, Reference tr->setOption(FDBTransactionOptions::LOCK_AWARE); loop { - mid = (min + max + 1) / 2; // ceiling + mid = (min + max + 1) / 2; // ceiling // Find the highest time < mid - state std::vector> results = wait( versionMap.getRange(tr, min, mid, 1, false, true) ); + state std::vector> results = + wait(versionMap.getRange(tr, min, mid, 1, false, true)); if (results.size() != 1) { - if(mid == min) { + if (mid == min) { // There aren't any records having a version < v, so just look for any record having a time < now // and base a result on it wait(store(results, versionMap.getRange(tr, 0, (int64_t)now(), 1))); @@ -1761,11 +1847,10 @@ ACTOR Future> timeKeeperEpochsFromVersion(Version v, Reference found = results[0]; - if(v < found.second) { + if (v < found.second) { max = found.first; - } - else { - if(found.first == min) { + } else { + if (found.first == min) { break; } min = found.first; @@ -1775,8 +1860,8 @@ ACTOR Future> timeKeeperEpochsFromVersion(Version v, Reference return found.first + (v - found.second) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND; } -int chooseFileSize(std::vector &sizes) { - if(!sizes.empty()) { +int chooseFileSize(std::vector& sizes) { + if (!sizes.empty()) { int size = sizes.back(); sizes.pop_back(); return size; @@ -1784,7 +1869,10 @@ int chooseFileSize(std::vector &sizes) { return deterministicRandom()->randomInt(0, 2e6); } -ACTOR Future writeAndVerifyFile(Reference c, Reference f, int size, FlowLock *lock) { +ACTOR Future writeAndVerifyFile(Reference c, + Reference f, + int size, + FlowLock* lock) { state Standalone> content; wait(lock->take(TaskPriority::DefaultYield, size)); @@ -1793,12 +1881,12 @@ ACTOR Future writeAndVerifyFile(Reference c, ReferencegetFileName().c_str()); content.resize(content.arena(), size); - for(int i = 0; i < content.size(); ++i) { + for (int i = 0; i < content.size(); ++i) { content[i] = (uint8_t)deterministicRandom()->randomInt(0, 256); } state VectorRef sendBuf = content; - while(sendBuf.size() > 0) { + while (sendBuf.size() > 0) { state int n = std::min(sendBuf.size(), deterministicRandom()->randomInt(1, 16384)); wait(f->append(sendBuf.begin(), n)); sendBuf.pop_front(n); @@ -1808,7 +1896,7 @@ ACTOR Future writeAndVerifyFile(Reference c, Reference inputFile = wait(c->readFile(f->getFileName())); int64_t fileSize = wait(inputFile->size()); ASSERT(size == fileSize); - if(size > 0) { + if (size > 0) { state Standalone> buf; buf.resize(buf.arena(), fileSize); int b = wait(inputFile->read(buf.begin(), buf.size(), 0)); @@ -1834,8 +1922,8 @@ ACTOR Future testBackupContainer(std::string url) { // Make sure container doesn't exist, then create it. 
try { wait(c->deleteContainer()); - } catch(Error &e) { - if(e.code() != error_code_backup_invalid_url && e.code() != error_code_backup_does_not_exist) + } catch (Error& e) { + if (e.code() != error_code_backup_invalid_url && e.code() != error_code_backup_does_not_exist) throw; } @@ -1849,8 +1937,8 @@ ACTOR Future testBackupContainer(std::string url) { state Version v = deterministicRandom()->randomInt64(0, std::numeric_limits::max() / 2); // List of sizes to use to test edge cases on underlying file implementations - state std::vector fileSizes = {0}; - if(StringRef(url).startsWith(LiteralStringRef("blob"))) { + state std::vector fileSizes = { 0 }; + if (StringRef(url).startsWith(LiteralStringRef("blob"))) { fileSizes.push_back(CLIENT_KNOBS->BLOBSTORE_MULTIPART_MIN_PART_SIZE); fileSizes.push_back(CLIENT_KNOBS->BLOBSTORE_MULTIPART_MIN_PART_SIZE + 10); } @@ -1859,11 +1947,11 @@ ACTOR Future testBackupContainer(std::string url) { state Version logStart = v; state int kvfiles = deterministicRandom()->randomInt(0, 3); - while(kvfiles > 0) { - if(snapshots.empty()) { + while (kvfiles > 0) { + if (snapshots.empty()) { snapshots[v] = {}; snapshotSizes[v] = 0; - if(deterministicRandom()->coinflip()) { + if (deterministicRandom()->coinflip()) { v = nextVersion(v); } } @@ -1876,8 +1964,9 @@ ACTOR Future testBackupContainer(std::string url) { snapshotSizes.rbegin()->second += size; writes.push_back(writeAndVerifyFile(c, range, size, &lock)); - if(deterministicRandom()->random01() < .2) { - writes.push_back(c->writeKeyspaceSnapshotFile(snapshots.rbegin()->second, snapshotSizes.rbegin()->second)); + if (deterministicRandom()->random01() < .2) { + writes.push_back( + c->writeKeyspaceSnapshotFile(snapshots.rbegin()->second, snapshotSizes.rbegin()->second)); snapshots[v] = {}; snapshotSizes[v] = 0; break; @@ -1886,7 +1975,7 @@ ACTOR Future testBackupContainer(std::string url) { --kvfiles; } - if(logStart == v || deterministicRandom()->coinflip()) { + if (logStart == v || deterministicRandom()->coinflip()) { v = nextVersion(v); } state Reference log = wait(c->writeLogFile(logStart, v, 10)); @@ -1895,7 +1984,8 @@ ACTOR Future testBackupContainer(std::string url) { writes.push_back(writeAndVerifyFile(c, log, size, &lock)); // Randomly stop after a snapshot has finished and all manually seeded file sizes have been used. 
- if(fileSizes.empty() && !snapshots.empty() && snapshots.rbegin()->second.empty() && deterministicRandom()->random01() < .2) { + if (fileSizes.empty() && !snapshots.empty() && snapshots.rbegin()->second.empty() && + deterministicRandom()->random01() < .2) { snapshots.erase(snapshots.rbegin()->first); break; } @@ -1913,7 +2003,7 @@ ACTOR Future testBackupContainer(std::string url) { // Do a series of expirations and verify resulting state state int i = 0; - for(; i < listing.snapshots.size(); ++i) { + for (; i < listing.snapshots.size(); ++i) { { // Ensure we can still restore to the latest version Optional rest = wait(c->getRestoreSet(desc.maxRestorableVersion.get())); @@ -1935,7 +2025,7 @@ ACTOR Future testBackupContainer(std::string url) { wait(ready(f)); // If there is an error, it must be backup_cannot_expire and we have to be on the last snapshot - if(f.isError()) { + if (f.isError()) { ASSERT(f.getError().code() == error_code_backup_cannot_expire); ASSERT(i == listing.snapshots.size() - 1); wait(c->expireData(expireVersion, true)); @@ -1963,7 +2053,7 @@ ACTOR Future testBackupContainer(std::string url) { } TEST_CASE("/backup/containers/localdir") { - if(g_network->isSimulated()) + if (g_network->isSimulated()) wait(testBackupContainer(format("file://simfdb/backups/%llx", timer_int()))); else wait(testBackupContainer(format("file:///private/tmp/fdb_backups/%llx", timer_int()))); @@ -1972,7 +2062,7 @@ TEST_CASE("/backup/containers/localdir") { TEST_CASE("/backup/containers/url") { if (!g_network->isSimulated()) { - const char *url = getenv("FDB_TEST_BACKUP_URL"); + const char* url = getenv("FDB_TEST_BACKUP_URL"); ASSERT(url != nullptr); wait(testBackupContainer(url)); } @@ -1981,11 +2071,11 @@ TEST_CASE("/backup/containers/url") { TEST_CASE("/backup/containers_list") { if (!g_network->isSimulated()) { - state const char *url = getenv("FDB_TEST_BACKUP_URL"); + state const char* url = getenv("FDB_TEST_BACKUP_URL"); ASSERT(url != nullptr); printf("Listing %s\n", url); std::vector urls = wait(IBackupContainer::listContainers(url)); - for(auto &u : urls) { + for (auto& u : urls) { printf("%s\n", u.c_str()); } } @@ -1994,16 +2084,21 @@ TEST_CASE("/backup/containers_list") { TEST_CASE("/backup/time") { // test formatTime() - for(int i = 0; i < 1000; ++i) { + for (int i = 0; i < 1000; ++i) { int64_t ts = deterministicRandom()->randomInt64(0, std::numeric_limits::max()); ASSERT(BackupAgentBase::parseTime(BackupAgentBase::formatTime(ts)) == ts); } - ASSERT(BackupAgentBase::parseTime("2019/03/18.17:51:11-0600") == BackupAgentBase::parseTime("2019/03/18.16:51:11-0700")); - ASSERT(BackupAgentBase::parseTime("2019/03/31.22:45:07-0700") == BackupAgentBase::parseTime("2019/04/01.03:45:07-0200")); - ASSERT(BackupAgentBase::parseTime("2019/03/31.22:45:07+0000") == BackupAgentBase::parseTime("2019/04/01.03:45:07+0500")); - ASSERT(BackupAgentBase::parseTime("2019/03/31.22:45:07+0030") == BackupAgentBase::parseTime("2019/04/01.03:45:07+0530")); - ASSERT(BackupAgentBase::parseTime("2019/03/31.22:45:07+0030") == BackupAgentBase::parseTime("2019/04/01.04:00:07+0545")); + ASSERT(BackupAgentBase::parseTime("2019/03/18.17:51:11-0600") == + BackupAgentBase::parseTime("2019/03/18.16:51:11-0700")); + ASSERT(BackupAgentBase::parseTime("2019/03/31.22:45:07-0700") == + BackupAgentBase::parseTime("2019/04/01.03:45:07-0200")); + ASSERT(BackupAgentBase::parseTime("2019/03/31.22:45:07+0000") == + BackupAgentBase::parseTime("2019/04/01.03:45:07+0500")); + ASSERT(BackupAgentBase::parseTime("2019/03/31.22:45:07+0030") == + 
BackupAgentBase::parseTime("2019/04/01.03:45:07+0530")); + ASSERT(BackupAgentBase::parseTime("2019/03/31.22:45:07+0030") == + BackupAgentBase::parseTime("2019/04/01.04:00:07+0545")); return Void(); } diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp index 0e75449267..5d13002acd 100644 --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -37,9 +37,11 @@ #include #include -#include "flow/actorcompiler.h" // This must be the last #include. +#include "flow/actorcompiler.h" // This must be the last #include. -static std::string boolToYesOrNo(bool val) { return val ? std::string("Yes") : std::string("No"); } +static std::string boolToYesOrNo(bool val) { + return val ? std::string("Yes") : std::string("No"); +} static std::string versionToString(Optional version) { if (version.present()) @@ -54,7 +56,8 @@ static std::string timeStampToString(Optional epochs) { return BackupAgentBase::formatTime(epochs.get()); } -static Future> getTimestampFromVersion(Optional ver, Reference tr) { +static Future> getTimestampFromVersion(Optional ver, + Reference tr) { if (!ver.present()) return Optional(); @@ -83,97 +86,91 @@ const Key FileBackupAgent::keyLastRestorable = LiteralStringRef("last_restorable typedef FileBackupAgent::ERestoreState ERestoreState; StringRef FileBackupAgent::restoreStateText(ERestoreState id) { - switch(id) { - case ERestoreState::UNITIALIZED: return LiteralStringRef("unitialized"); - case ERestoreState::QUEUED: return LiteralStringRef("queued"); - case ERestoreState::STARTING: return LiteralStringRef("starting"); - case ERestoreState::RUNNING: return LiteralStringRef("running"); - case ERestoreState::COMPLETED: return LiteralStringRef("completed"); - case ERestoreState::ABORTED: return LiteralStringRef("aborted"); - default: return LiteralStringRef("Unknown"); + switch (id) { + case ERestoreState::UNITIALIZED: + return LiteralStringRef("unitialized"); + case ERestoreState::QUEUED: + return LiteralStringRef("queued"); + case ERestoreState::STARTING: + return LiteralStringRef("starting"); + case ERestoreState::RUNNING: + return LiteralStringRef("running"); + case ERestoreState::COMPLETED: + return LiteralStringRef("completed"); + case ERestoreState::ABORTED: + return LiteralStringRef("aborted"); + default: + return LiteralStringRef("Unknown"); } } -template<> Tuple Codec::pack(ERestoreState const &val) { return Tuple().append(val); } -template<> ERestoreState Codec::unpack(Tuple const &val) { return (ERestoreState)val.getInt(0); } +template <> +Tuple Codec::pack(ERestoreState const& val) { + return Tuple().append(val); +} +template <> +ERestoreState Codec::unpack(Tuple const& val) { + return (ERestoreState)val.getInt(0); +} -ACTOR Future> TagUidMap::getAll_impl(TagUidMap *tagsMap, Reference tr, bool snapshot) { +ACTOR Future> TagUidMap::getAll_impl(TagUidMap* tagsMap, + Reference tr, + bool snapshot) { state Key prefix = tagsMap->prefix; // Copying it here as tagsMap lifetime is not tied to this actor TagMap::PairsType tagPairs = wait(tagsMap->getRange(tr, std::string(), {}, 1e6, snapshot)); std::vector results; - for(auto &p : tagPairs) + for (auto& p : tagPairs) results.push_back(KeyBackedTag(p.first, prefix)); return results; } KeyBackedTag::KeyBackedTag(std::string tagName, StringRef tagMapPrefix) - : KeyBackedProperty(TagUidMap(tagMapPrefix).getProperty(tagName)), tagName(tagName), tagMapPrefix(tagMapPrefix) {} + : KeyBackedProperty(TagUidMap(tagMapPrefix).getProperty(tagName)), tagName(tagName), + 
tagMapPrefix(tagMapPrefix) {} class RestoreConfig : public KeyBackedConfig { public: RestoreConfig(UID uid = UID()) : KeyBackedConfig(fileRestorePrefixRange.begin, uid) {} RestoreConfig(Reference task) : KeyBackedConfig(fileRestorePrefixRange.begin, task) {} - KeyBackedProperty stateEnum() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedProperty stateEnum() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } Future stateText(Reference tr) { - return map(stateEnum().getD(tr), [](ERestoreState s) -> StringRef { return FileBackupAgent::restoreStateText(s); }); - } - KeyBackedProperty addPrefix() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } - KeyBackedProperty removePrefix() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); + return map(stateEnum().getD(tr), + [](ERestoreState s) -> StringRef { return FileBackupAgent::restoreStateText(s); }); } + KeyBackedProperty addPrefix() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty removePrefix() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } // XXX: Remove restoreRange() once it is safe to remove. It has been changed to restoreRanges - KeyBackedProperty restoreRange() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedProperty restoreRange() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } KeyBackedProperty> restoreRanges() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } - KeyBackedProperty batchFuture() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } - KeyBackedProperty restoreVersion() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedProperty batchFuture() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } + KeyBackedProperty restoreVersion() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } KeyBackedProperty> sourceContainer() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } // Get the source container as a bare URL, without creating a container instance - KeyBackedProperty sourceContainerURL() { - return configSpace.pack(LiteralStringRef("sourceContainer")); - } + KeyBackedProperty sourceContainerURL() { return configSpace.pack(LiteralStringRef("sourceContainer")); } // Total bytes written by all log and range restore tasks. 
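// For illustration: every accessor above builds its key with configSpace.pack(LiteralStringRef(__FUNCTION__)),
// so the stored key name is derived from the accessor's own name. A minimal standalone sketch of that idea,
// using a plain std::map and a hypothetical MiniConfigSpace instead of FDB's KeyBackedProperty/Tuple types:
#include <cstdio>
#include <map>
#include <string>

struct MiniConfigSpace {
    std::string prefix;                    // stand-in for the uid-scoped config subspace
    std::map<std::string, std::string> kv; // stand-in for the database

    std::string keyFor(const char* propertyName) const { return prefix + "/" + propertyName; }
};

// __FUNCTION__ makes the stored key name track the accessor name automatically.
static std::string bytesWrittenKey(const MiniConfigSpace& cs) { return cs.keyFor(__FUNCTION__); }
static std::string fileCountKey(const MiniConfigSpace& cs) { return cs.keyFor(__FUNCTION__); }

int main() {
    MiniConfigSpace cs{ "restore/1234abcd", {} };
    cs.kv[bytesWrittenKey(cs)] = "1048576";
    cs.kv[fileCountKey(cs)] = "12";
    for (auto& p : cs.kv)
        std::printf("%s = %s\n", p.first.c_str(), p.second.c_str()); // e.g. restore/1234abcd/bytesWrittenKey = 1048576
    return 0;
}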
- KeyBackedBinaryValue bytesWritten() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedBinaryValue bytesWritten() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } // File blocks that have had tasks created for them by the Dispatch task - KeyBackedBinaryValue filesBlocksDispatched() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedBinaryValue filesBlocksDispatched() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } // File blocks whose tasks have finished - KeyBackedBinaryValue fileBlocksFinished() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedBinaryValue fileBlocksFinished() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } // Total number of files in the fileMap - KeyBackedBinaryValue fileCount() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedBinaryValue fileCount() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } // Total number of file blocks in the fileMap - KeyBackedBinaryValue fileBlockCount() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + KeyBackedBinaryValue fileBlockCount() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } Future> getRestoreRangesOrDefault(Reference tr) { return getRestoreRangesOrDefault_impl(this, tr); } - ACTOR static Future> getRestoreRangesOrDefault_impl(RestoreConfig *self, Reference tr) { + ACTOR static Future> getRestoreRangesOrDefault_impl(RestoreConfig* self, + Reference tr) { state std::vector ranges = wait(self->restoreRanges().getD(tr)); if (ranges.empty()) { state KeyRange range = wait(self->restoreRange().getD(tr)); @@ -187,21 +184,21 @@ public: struct RestoreFile { Version version; std::string fileName; - bool isRange; // false for log file + bool isRange; // false for log file int64_t blockSize; int64_t fileSize; - Version endVersion; // not meaningful for range files + Version endVersion; // not meaningful for range files Tuple pack() const { return Tuple() - .append(version) - .append(StringRef(fileName)) - .append(isRange) - .append(fileSize) - .append(blockSize) - .append(endVersion); + .append(version) + .append(StringRef(fileName)) + .append(isRange) + .append(fileSize) + .append(blockSize) + .append(endVersion); } - static RestoreFile unpack(Tuple const &t) { + static RestoreFile unpack(Tuple const& t) { RestoreFile r; int i = 0; r.version = t.getInt(i++); @@ -215,38 +212,34 @@ public: }; typedef KeyBackedSet FileSetT; - FileSetT fileSet() { - return configSpace.pack(LiteralStringRef(__FUNCTION__)); - } + FileSetT fileSet() { return configSpace.pack(LiteralStringRef(__FUNCTION__)); } Future isRunnable(Reference tr) { - return map(stateEnum().getD(tr), [](ERestoreState s) -> bool { return s != ERestoreState::ABORTED - && s != ERestoreState::COMPLETED - && s != ERestoreState::UNITIALIZED; + return map(stateEnum().getD(tr), [](ERestoreState s) -> bool { + return s != ERestoreState::ABORTED && s != ERestoreState::COMPLETED && s != ERestoreState::UNITIALIZED; }); } - Future logError(Database cx, Error e, std::string const &details, void *taskInstance = nullptr) { - if(!uid.isValid()) { + Future logError(Database cx, Error e, std::string const& details, void* taskInstance = nullptr) { + if (!uid.isValid()) { TraceEvent(SevError, "FileRestoreErrorNoUID").error(e).detail("Description", details); return Void(); } TraceEvent t(SevWarn, "FileRestoreError"); - t.error(e).detail("RestoreUID", uid).detail("Description", details).detail("TaskInstance", 
(uint64_t)taskInstance); + t.error(e) + .detail("RestoreUID", uid) + .detail("Description", details) + .detail("TaskInstance", (uint64_t)taskInstance); // These should not happen - if(e.code() == error_code_key_not_found) + if (e.code() == error_code_key_not_found) t.backtrace(); return updateErrorInfo(cx, e, details); } - Key mutationLogPrefix() { - return uidPrefixKey(applyLogKeys.begin, uid); - } + Key mutationLogPrefix() { return uidPrefixKey(applyLogKeys.begin, uid); } - Key applyMutationsMapPrefix() { - return uidPrefixKey(applyMutationsKeyVersionMapRange.begin, uid); - } + Key applyMutationsMapPrefix() { return uidPrefixKey(applyMutationsKeyVersionMapRange.begin, uid); } ACTOR static Future getApplyVersionLag_impl(Reference tr, UID uid) { // Both of these are snapshot reads @@ -254,7 +247,7 @@ public: state Future> endVal = tr->get(uidPrefixKey(applyMutationsEndRange.begin, uid), true); wait(success(beginVal) && success(endVal)); - if(!beginVal.get().present() || !endVal.get().present()) + if (!beginVal.get().present() || !endVal.get().present()) return 0; Version beginVersion = BinaryReader::fromStringRef(beginVal.get().get(), Unversioned()); @@ -284,7 +277,7 @@ public: void clearApplyMutationsKeys(Reference tr) { tr->setOption(FDBTransactionOptions::COMMIT_ON_FIRST_PROXY); - + // Clear add/remove prefix keys tr->clear(uidPrefixKey(applyMutationsAddPrefixRange.begin, uid)); tr->clear(uidPrefixKey(applyMutationsRemovePrefixRange.begin, uid)); @@ -312,9 +305,10 @@ public: } Future getApplyEndVersion(Reference tr) { - return map(tr->get(uidPrefixKey(applyMutationsEndRange.begin, uid)), [=](Optional const &value) -> Version { - return value.present() ? BinaryReader::fromStringRef(value.get(), Unversioned()) : 0; - }); + return map(tr->get(uidPrefixKey(applyMutationsEndRange.begin, uid)), + [=](Optional const& value) -> Version { + return value.present() ? BinaryReader::fromStringRef(value.get(), Unversioned()) : 0; + }); } ACTOR static Future getProgress_impl(RestoreConfig restore, Reference tr); @@ -326,7 +320,8 @@ public: typedef RestoreConfig::RestoreFile RestoreFile; -ACTOR Future RestoreConfig::getProgress_impl(RestoreConfig restore, Reference tr) { +ACTOR Future RestoreConfig::getProgress_impl(RestoreConfig restore, + Reference tr) { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); @@ -342,40 +337,44 @@ ACTOR Future RestoreConfig::getProgress_impl(RestoreConfig restore, // restore might no longer be valid after the first wait so make sure it is not needed anymore. 
state UID uid = restore.getUid(); - wait(success(fileCount) && success(fileBlockCount) && success(fileBlocksDispatched) && success(fileBlocksFinished) && success(bytesWritten) && success(status) && success(lag) && success(tag) && success(lastError)); + wait(success(fileCount) && success(fileBlockCount) && success(fileBlocksDispatched) && + success(fileBlocksFinished) && success(bytesWritten) && success(status) && success(lag) && success(tag) && + success(lastError)); std::string errstr = "None"; - if(lastError.get().second != 0) - errstr = format("'%s' %" PRId64 "s ago.\n", lastError.get().first.c_str(), (tr->getReadVersion().get() - lastError.get().second) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND ); + if (lastError.get().second != 0) + errstr = format("'%s' %" PRId64 "s ago.\n", + lastError.get().first.c_str(), + (tr->getReadVersion().get() - lastError.get().second) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND); TraceEvent("FileRestoreProgress") - .detail("RestoreUID", uid) - .detail("Tag", tag.get()) - .detail("State", status.get().toString()) - .detail("FileCount", fileCount.get()) - .detail("FileBlocksFinished", fileBlocksFinished.get()) - .detail("FileBlocksTotal", fileBlockCount.get()) - .detail("FileBlocksInProgress", fileBlocksDispatched.get() - fileBlocksFinished.get()) - .detail("BytesWritten", bytesWritten.get()) - .detail("ApplyLag", lag.get()) - .detail("TaskInstance", THIS_ADDR); + .detail("RestoreUID", uid) + .detail("Tag", tag.get()) + .detail("State", status.get().toString()) + .detail("FileCount", fileCount.get()) + .detail("FileBlocksFinished", fileBlocksFinished.get()) + .detail("FileBlocksTotal", fileBlockCount.get()) + .detail("FileBlocksInProgress", fileBlocksDispatched.get() - fileBlocksFinished.get()) + .detail("BytesWritten", bytesWritten.get()) + .detail("ApplyLag", lag.get()) + .detail("TaskInstance", THIS_ADDR); - - return format("Tag: %s UID: %s State: %s Blocks: %lld/%lld BlocksInProgress: %lld Files: %lld BytesWritten: %lld ApplyVersionLag: %lld LastError: %s", - tag.get().c_str(), - uid.toString().c_str(), - status.get().toString().c_str(), - fileBlocksFinished.get(), - fileBlockCount.get(), - fileBlocksDispatched.get() - fileBlocksFinished.get(), - fileCount.get(), - bytesWritten.get(), - lag.get(), - errstr.c_str() - ); + return format("Tag: %s UID: %s State: %s Blocks: %lld/%lld BlocksInProgress: %lld Files: %lld BytesWritten: " + "%lld ApplyVersionLag: %lld LastError: %s", + tag.get().c_str(), + uid.toString().c_str(), + status.get().toString().c_str(), + fileBlocksFinished.get(), + fileBlockCount.get(), + fileBlocksDispatched.get() - fileBlocksFinished.get(), + fileCount.get(), + bytesWritten.get(), + lag.get(), + errstr.c_str()); } -ACTOR Future RestoreConfig::getFullStatus_impl(RestoreConfig restore, Reference tr) { +ACTOR Future RestoreConfig::getFullStatus_impl(RestoreConfig restore, + Reference tr) { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); @@ -387,3107 +386,3540 @@ ACTOR Future RestoreConfig::getFullStatus_impl(RestoreConfig restor state Future progress = restore.getProgress(tr); // restore might no longer be valid after the first wait so make sure it is not needed anymore. 
- wait(success(ranges) && success(addPrefix) && success(removePrefix) && success(url) && success(restoreVersion) && success(progress)); + wait(success(ranges) && success(addPrefix) && success(removePrefix) && success(url) && success(restoreVersion) && + success(progress)); std::string returnStr; returnStr = format("%s URL: %s", progress.get().c_str(), url.get().toString().c_str()); - for (auto &range : ranges.get()) { + for (auto& range : ranges.get()) { returnStr += format(" Range: '%s'-'%s'", printable(range.begin).c_str(), printable(range.end).c_str()); } returnStr += format(" AddPrefix: '%s' RemovePrefix: '%s' Version: %lld", - printable(addPrefix.get()).c_str(), - printable(removePrefix.get()).c_str(), - restoreVersion.get() - ); + printable(addPrefix.get()).c_str(), + printable(removePrefix.get()).c_str(), + restoreVersion.get()); return returnStr; } - FileBackupAgent::FileBackupAgent() - : subspace(Subspace(fileBackupPrefixRange.begin)) - // The other subspaces have logUID -> value - , config(subspace.get(BackupAgentBase::keyConfig)) - , lastRestorable(subspace.get(FileBackupAgent::keyLastRestorable)) - , taskBucket(new TaskBucket(subspace.get(BackupAgentBase::keyTasks), true, false, true)) - , futureBucket(new FutureBucket(subspace.get(BackupAgentBase::keyFutures), true, true)) -{ -} + : subspace(Subspace(fileBackupPrefixRange.begin)) + // The other subspaces have logUID -> value + , + config(subspace.get(BackupAgentBase::keyConfig)), lastRestorable(subspace.get(FileBackupAgent::keyLastRestorable)), + taskBucket(new TaskBucket(subspace.get(BackupAgentBase::keyTasks), true, false, true)), + futureBucket(new FutureBucket(subspace.get(BackupAgentBase::keyFutures), true, true)) {} namespace fileBackup { - // Return a block of contiguous padding bytes, growing if needed. - Value makePadding(int size) { - static Value pad; - if(pad.size() < size) { - pad = makeString(size); - memset(mutateString(pad), '\xff', pad.size()); - } - - return pad.substr(0, size); +// Return a block of contiguous padding bytes, growing if needed. +Value makePadding(int size) { + static Value pad; + if (pad.size() < size) { + pad = makeString(size); + memset(mutateString(pad), '\xff', pad.size()); } - // File Format handlers. - // Both Range and Log formats are designed to be readable starting at any 1MB boundary - // so they can be read in parallel. - // - // Writer instances must be kept alive while any member actors are in progress. - // - // RangeFileWriter must be used as follows: - // 1 - writeKey(key) the queried key range begin - // 2 - writeKV(k, v) each kv pair to restore - // 3 - writeKey(key) the queried key range end - // - // RangeFileWriter will insert the required padding, header, and extra - // end/begin keys around the 1MB boundaries as needed. - // - // Example: - // The range a-z is queries and returns c-j which covers 3 blocks. - // The client code writes keys in this sequence: - // a c d e f g h i j z - // - // H = header P = padding a...z = keys v = value | = block boundary - // - // Encoded file: H a cv dv ev P | H e ev fv gv hv P | H h hv iv jv z - // Decoded in blocks yields: - // Block 1: range [a, e) with kv pairs cv, dv - // Block 2: range [e, h) with kv pairs ev, fv, gv - // Block 3: range [h, z) with kv pairs hv, iv, jv - // - // NOTE: All blocks except for the final block will have one last - // value which will not be used. 
This isn't actually a waste since - // if the next KV pair wouldn't fit within the block after the value - // then the space after the final key to the next 1MB boundary would - // just be padding anyway. - struct RangeFileWriter { - RangeFileWriter(Reference file = Reference(), int blockSize = 0) : file(file), blockSize(blockSize), blockEnd(0), fileVersion(1001) {} + return pad.substr(0, size); +} - // Handles the first block and internal blocks. Ends current block if needed. - // The final flag is used in simulation to pad the file's final block to a whole block size - ACTOR static Future newBlock(RangeFileWriter *self, int bytesNeeded, bool final = false) { - // Write padding to finish current block if needed - int bytesLeft = self->blockEnd - self->file->size(); - if(bytesLeft > 0) { - state Value paddingFFs = makePadding(bytesLeft); - wait(self->file->append(paddingFFs.begin(), bytesLeft)); +// File Format handlers. +// Both Range and Log formats are designed to be readable starting at any 1MB boundary +// so they can be read in parallel. +// +// Writer instances must be kept alive while any member actors are in progress. +// +// RangeFileWriter must be used as follows: +// 1 - writeKey(key) the queried key range begin +// 2 - writeKV(k, v) each kv pair to restore +// 3 - writeKey(key) the queried key range end +// +// RangeFileWriter will insert the required padding, header, and extra +// end/begin keys around the 1MB boundaries as needed. +// +// Example: +// The range a-z is queries and returns c-j which covers 3 blocks. +// The client code writes keys in this sequence: +// a c d e f g h i j z +// +// H = header P = padding a...z = keys v = value | = block boundary +// +// Encoded file: H a cv dv ev P | H e ev fv gv hv P | H h hv iv jv z +// Decoded in blocks yields: +// Block 1: range [a, e) with kv pairs cv, dv +// Block 2: range [e, h) with kv pairs ev, fv, gv +// Block 3: range [h, z) with kv pairs hv, iv, jv +// +// NOTE: All blocks except for the final block will have one last +// value which will not be used. This isn't actually a waste since +// if the next KV pair wouldn't fit within the block after the value +// then the space after the final key to the next 1MB boundary would +// just be padding anyway. +struct RangeFileWriter { + RangeFileWriter(Reference file = Reference(), int blockSize = 0) + : file(file), blockSize(blockSize), blockEnd(0), fileVersion(1001) {} + + // Handles the first block and internal blocks. Ends current block if needed. 
+ // The final flag is used in simulation to pad the file's final block to a whole block size + ACTOR static Future newBlock(RangeFileWriter* self, int bytesNeeded, bool final = false) { + // Write padding to finish current block if needed + int bytesLeft = self->blockEnd - self->file->size(); + if (bytesLeft > 0) { + state Value paddingFFs = makePadding(bytesLeft); + wait(self->file->append(paddingFFs.begin(), bytesLeft)); + } + + if (final) { + ASSERT(g_network->isSimulated()); + return Void(); + } + + // Set new blockEnd + self->blockEnd += self->blockSize; + + // write Header + wait(self->file->append((uint8_t*)&self->fileVersion, sizeof(self->fileVersion))); + + // If this is NOT the first block then write duplicate stuff needed from last block + if (self->blockEnd > self->blockSize) { + wait(self->file->appendStringRefWithLen(self->lastKey)); + wait(self->file->appendStringRefWithLen(self->lastKey)); + wait(self->file->appendStringRefWithLen(self->lastValue)); + } + + // There must now be room in the current block for bytesNeeded or the block size is too small + if (self->file->size() + bytesNeeded > self->blockEnd) + throw backup_bad_block_size(); + + return Void(); + } + + // Used in simulation only to create backup file sizes which are an integer multiple of the block size + Future padEnd() { + ASSERT(g_network->isSimulated()); + if (file->size() > 0) { + return newBlock(this, 0, true); + } + return Void(); + } + + // Ends the current block if necessary based on bytesNeeded. + Future newBlockIfNeeded(int bytesNeeded) { + if (file->size() + bytesNeeded > blockEnd) + return newBlock(this, bytesNeeded); + return Void(); + } + + // Start a new block if needed, then write the key and value + ACTOR static Future writeKV_impl(RangeFileWriter* self, Key k, Value v) { + int toWrite = sizeof(int32_t) + k.size() + sizeof(int32_t) + v.size(); + wait(self->newBlockIfNeeded(toWrite)); + wait(self->file->appendStringRefWithLen(k)); + wait(self->file->appendStringRefWithLen(v)); + self->lastKey = k; + self->lastValue = v; + return Void(); + } + + Future writeKV(Key k, Value v) { return writeKV_impl(this, k, v); } + + // Write begin key or end key. + ACTOR static Future writeKey_impl(RangeFileWriter* self, Key k) { + int toWrite = sizeof(uint32_t) + k.size(); + wait(self->newBlockIfNeeded(toWrite)); + wait(self->file->appendStringRefWithLen(k)); + return Void(); + } + + Future writeKey(Key k) { return writeKey_impl(this, k); } + + Reference file; + int blockSize; + +private: + int64_t blockEnd; + uint32_t fileVersion; + Key lastKey; + Key lastValue; +}; + +// Helper class for reading restore data from a buffer and throwing the right errors. +struct StringRefReader { + StringRefReader(StringRef s = StringRef(), Error e = Error()) : rptr(s.begin()), end(s.end()), failure_error(e) {} + + // Return remainder of data as a StringRef + StringRef remainder() { return StringRef(rptr, end - rptr); } + + // Return a pointer to len bytes at the current read position and advance read pos + const uint8_t* consume(unsigned int len) { + if (rptr == end && len != 0) + throw end_of_stream(); + const uint8_t* p = rptr; + rptr += len; + if (rptr > end) + throw failure_error; + return p; + } + + // Return a T from the current read position and advance read pos + template + const T consume() { + return *(const T*)consume(sizeof(T)); + } + + // Functions for consuming big endian (network byte order) integers. + // Consumes a big endian number, swaps it to little endian, and returns it. 
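// A minimal standalone illustration of the network-byte-order handling described above (on a
// little-endian host, bigEndian32() amounts to a byte swap). Plain C++ with hypothetical data,
// not FDB's StringRefReader:
#include <cstdint>
#include <cstdio>

// Assemble a host-order uint32_t from 4 bytes stored in network (big endian) order.
static uint32_t readNetworkUInt32(const uint8_t* p) {
    return (uint32_t(p[0]) << 24) | (uint32_t(p[1]) << 16) | (uint32_t(p[2]) << 8) | uint32_t(p[3]);
}

int main() {
    const uint8_t wire[4] = { 0x00, 0x00, 0x01, 0x2C }; // e.g. a key length of 300, big endian on disk
    std::printf("%u\n", readNetworkUInt32(wire));       // prints 300
    return 0;
}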
+ int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume()); } + uint32_t consumeNetworkUInt32() { return bigEndian32(consume()); } + + bool eof() { return rptr == end; } + + const uint8_t *rptr, *end; + Error failure_error; +}; + +ACTOR Future>> decodeRangeFileBlock(Reference file, + int64_t offset, + int len) { + state Standalone buf = makeString(len); + int rLen = wait(file->read(mutateString(buf), len, offset)); + if (rLen != len) + throw restore_bad_read(); + + Standalone> results({}, buf.arena()); + state StringRefReader reader(buf, restore_corrupted_data()); + + try { + // Read header, currently only decoding version 1001 + if (reader.consume() != 1001) + throw restore_unsupported_file_version(); + + // Read begin key, if this fails then block was invalid. + uint32_t kLen = reader.consumeNetworkUInt32(); + const uint8_t* k = reader.consume(kLen); + results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef())); + + // Read kv pairs and end key + while (1) { + // Read a key. + kLen = reader.consumeNetworkUInt32(); + k = reader.consume(kLen); + + // If eof reached or first value len byte is 0xFF then a valid block end was reached. + if (reader.eof() || *reader.rptr == 0xFF) { + results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef())); + break; } - if(final) { - ASSERT(g_network->isSimulated()); - return Void(); + // Read a value, which must exist or the block is invalid + uint32_t vLen = reader.consumeNetworkUInt32(); + const uint8_t* v = reader.consume(vLen); + results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen))); + + // If eof reached or first byte of next key len is 0xFF then a valid block end was reached. + if (reader.eof() || *reader.rptr == 0xFF) + break; + } + + // Make sure any remaining bytes in the block are 0xFF + for (auto b : reader.remainder()) + if (b != 0xFF) + throw restore_corrupted_data_padding(); + + return results; + + } catch (Error& e) { + TraceEvent(SevWarn, "FileRestoreCorruptRangeFileBlock") + .error(e) + .detail("Filename", file->getFilename()) + .detail("BlockOffset", offset) + .detail("BlockLen", len) + .detail("ErrorRelativeOffset", reader.rptr - buf.begin()) + .detail("ErrorAbsoluteOffset", reader.rptr - buf.begin() + offset); + throw; + } +} + +// Very simple format compared to KeyRange files. +// Header, [Key, Value]... 
Key len +struct LogFileWriter { + static const std::string& FFs; + + LogFileWriter(Reference file = Reference(), int blockSize = 0) + : file(file), blockSize(blockSize), blockEnd(0), fileVersion(2001) {} + + // Start a new block if needed, then write the key and value + ACTOR static Future writeKV_impl(LogFileWriter* self, Key k, Value v) { + // If key and value do not fit in this block, end it and start a new one + int toWrite = sizeof(int32_t) + k.size() + sizeof(int32_t) + v.size(); + if (self->file->size() + toWrite > self->blockEnd) { + // Write padding if needed + int bytesLeft = self->blockEnd - self->file->size(); + if (bytesLeft > 0) { + state Value paddingFFs = makePadding(bytesLeft); + wait(self->file->append(paddingFFs.begin(), bytesLeft)); } // Set new blockEnd self->blockEnd += self->blockSize; // write Header - wait(self->file->append((uint8_t *)&self->fileVersion, sizeof(self->fileVersion))); - - // If this is NOT the first block then write duplicate stuff needed from last block - if(self->blockEnd > self->blockSize) { - wait(self->file->appendStringRefWithLen(self->lastKey)); - wait(self->file->appendStringRefWithLen(self->lastKey)); - wait(self->file->appendStringRefWithLen(self->lastValue)); - } - - // There must now be room in the current block for bytesNeeded or the block size is too small - if(self->file->size() + bytesNeeded > self->blockEnd) - throw backup_bad_block_size(); - - return Void(); + wait(self->file->append((uint8_t*)&self->fileVersion, sizeof(self->fileVersion))); } - // Used in simulation only to create backup file sizes which are an integer multiple of the block size - Future padEnd() { - ASSERT(g_network->isSimulated()); - if(file->size() > 0) { - return newBlock(this, 0, true); - } - return Void(); - } + wait(self->file->appendStringRefWithLen(k)); + wait(self->file->appendStringRefWithLen(v)); - // Ends the current block if necessary based on bytesNeeded. - Future newBlockIfNeeded(int bytesNeeded) { - if(file->size() + bytesNeeded > blockEnd) - return newBlock(this, bytesNeeded); - return Void(); - } + // At this point we should be in whatever the current block is or the block size is too small + if (self->file->size() > self->blockEnd) + throw backup_bad_block_size(); - // Start a new block if needed, then write the key and value - ACTOR static Future writeKV_impl(RangeFileWriter *self, Key k, Value v) { - int toWrite = sizeof(int32_t) + k.size() + sizeof(int32_t) + v.size(); - wait(self->newBlockIfNeeded(toWrite)); - wait(self->file->appendStringRefWithLen(k)); - wait(self->file->appendStringRefWithLen(v)); - self->lastKey = k; - self->lastValue = v; - return Void(); - } + return Void(); + } - Future writeKV(Key k, Value v) { return writeKV_impl(this, k, v); } + Future writeKV(Key k, Value v) { return writeKV_impl(this, k, v); } - // Write begin key or end key. 
- ACTOR static Future writeKey_impl(RangeFileWriter *self, Key k) { - int toWrite = sizeof(uint32_t) + k.size(); - wait(self->newBlockIfNeeded(toWrite)); - wait(self->file->appendStringRefWithLen(k)); - return Void(); - } + Reference file; + int blockSize; - Future writeKey(Key k) { return writeKey_impl(this, k); } +private: + int64_t blockEnd; + uint32_t fileVersion; +}; - Reference file; - int blockSize; +ACTOR Future>> decodeLogFileBlock(Reference file, + int64_t offset, + int len) { + state Standalone buf = makeString(len); + int rLen = wait(file->read(mutateString(buf), len, offset)); + if (rLen != len) + throw restore_bad_read(); - private: - int64_t blockEnd; - uint32_t fileVersion; - Key lastKey; - Key lastValue; - }; + Standalone> results({}, buf.arena()); + state StringRefReader reader(buf, restore_corrupted_data()); - // Helper class for reading restore data from a buffer and throwing the right errors. - struct StringRefReader { - StringRefReader(StringRef s = StringRef(), Error e = Error()) : rptr(s.begin()), end(s.end()), failure_error(e) {} + try { + // Read header, currently only decoding version 2001 + if (reader.consume() != 2001) + throw restore_unsupported_file_version(); - // Return remainder of data as a StringRef - StringRef remainder() { - return StringRef(rptr, end - rptr); - } + // Read k/v pairs. Block ends either at end of last value exactly or with 0xFF as first key len byte. + while (1) { + // If eof reached or first key len bytes is 0xFF then end of block was reached. + if (reader.eof() || *reader.rptr == 0xFF) + break; - // Return a pointer to len bytes at the current read position and advance read pos - const uint8_t * consume(unsigned int len) { - if(rptr == end && len != 0) - throw end_of_stream(); - const uint8_t *p = rptr; - rptr += len; - if(rptr > end) - throw failure_error; - return p; - } - - // Return a T from the current read position and advance read pos - template const T consume() { - return *(const T *)consume(sizeof(T)); - } - - // Functions for consuming big endian (network byte order) integers. - // Consumes a big endian number, swaps it to little endian, and returns it. - int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume< int32_t>());} - uint32_t consumeNetworkUInt32() { return bigEndian32( consume());} - - bool eof() { return rptr == end; } - - const uint8_t *rptr, *end; - Error failure_error; - }; - - ACTOR Future>> decodeRangeFileBlock(Reference file, int64_t offset, int len) { - state Standalone buf = makeString(len); - int rLen = wait(file->read(mutateString(buf), len, offset)); - if(rLen != len) - throw restore_bad_read(); - - Standalone> results({}, buf.arena()); - state StringRefReader reader(buf, restore_corrupted_data()); - - try { - // Read header, currently only decoding version 1001 - if(reader.consume() != 1001) - throw restore_unsupported_file_version(); - - // Read begin key, if this fails then block was invalid. + // Read key and value. If anything throws then there is a problem. uint32_t kLen = reader.consumeNetworkUInt32(); - const uint8_t *k = reader.consume(kLen); - results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef())); + const uint8_t* k = reader.consume(kLen); + uint32_t vLen = reader.consumeNetworkUInt32(); + const uint8_t* v = reader.consume(vLen); - // Read kv pairs and end key - while(1) { - // Read a key. - kLen = reader.consumeNetworkUInt32(); - k = reader.consume(kLen); - - // If eof reached or first value len byte is 0xFF then a valid block end was reached. 
- if(reader.eof() || *reader.rptr == 0xFF) { - results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef())); - break; - } - - // Read a value, which must exist or the block is invalid - uint32_t vLen = reader.consumeNetworkUInt32(); - const uint8_t *v = reader.consume(vLen); - results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen))); - - // If eof reached or first byte of next key len is 0xFF then a valid block end was reached. - if(reader.eof() || *reader.rptr == 0xFF) - break; - } - - // Make sure any remaining bytes in the block are 0xFF - for(auto b : reader.remainder()) - if(b != 0xFF) - throw restore_corrupted_data_padding(); - - return results; - - } catch(Error &e) { - TraceEvent(SevWarn, "FileRestoreCorruptRangeFileBlock") - .error(e) - .detail("Filename", file->getFilename()) - .detail("BlockOffset", offset) - .detail("BlockLen", len) - .detail("ErrorRelativeOffset", reader.rptr - buf.begin()) - .detail("ErrorAbsoluteOffset", reader.rptr - buf.begin() + offset); - throw; + results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen))); } + + // Make sure any remaining bytes in the block are 0xFF + for (auto b : reader.remainder()) + if (b != 0xFF) + throw restore_corrupted_data_padding(); + + return results; + + } catch (Error& e) { + TraceEvent(SevWarn, "FileRestoreCorruptLogFileBlock") + .error(e) + .detail("Filename", file->getFilename()) + .detail("BlockOffset", offset) + .detail("BlockLen", len) + .detail("ErrorRelativeOffset", reader.rptr - buf.begin()) + .detail("ErrorAbsoluteOffset", reader.rptr - buf.begin() + offset); + throw; + } +} + +ACTOR Future checkTaskVersion(Database cx, Reference task, StringRef name, uint32_t version) { + uint32_t taskVersion = task->getVersion(); + if (taskVersion > version) { + state Error err = task_invalid_version(); + + TraceEvent(SevWarn, "BA_BackupRangeTaskFuncExecute") + .detail("TaskVersion", taskVersion) + .detail("Name", name) + .detail("Version", version); + if (KeyBackedConfig::TaskParams.uid().exists(task)) { + std::string msg = format("%s task version `%lu' is greater than supported version `%lu'", + task->params[Task::reservedTaskParamKeyType].toString().c_str(), + (unsigned long)taskVersion, + (unsigned long)version); + wait(BackupConfig(task).logError(cx, err, msg)); + } + + throw err; } + return Void(); +} - // Very simple format compared to KeyRange files. - // Header, [Key, Value]... 
Key len - struct LogFileWriter { - static const std::string &FFs; +ACTOR static Future abortFiveZeroBackup(FileBackupAgent* backupAgent, + Reference tr, + std::string tagName) { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); - LogFileWriter(Reference file = Reference(), int blockSize = 0) : file(file), blockSize(blockSize), blockEnd(0), fileVersion(2001) {} + state Subspace tagNames = backupAgent->subspace.get(BackupAgentBase::keyTagName); + Optional uidStr = wait(tr->get(tagNames.pack(Key(tagName)))); + if (!uidStr.present()) { + TraceEvent(SevWarn, "FileBackupAbortIncompatibleBackup_TagNotFound").detail("TagName", tagName.c_str()); + return Void(); + } + state UID uid = BinaryReader::fromStringRef(uidStr.get(), Unversioned()); - // Start a new block if needed, then write the key and value - ACTOR static Future writeKV_impl(LogFileWriter *self, Key k, Value v) { - // If key and value do not fit in this block, end it and start a new one - int toWrite = sizeof(int32_t) + k.size() + sizeof(int32_t) + v.size(); - if(self->file->size() + toWrite > self->blockEnd) { - // Write padding if needed - int bytesLeft = self->blockEnd - self->file->size(); - if(bytesLeft > 0) { - state Value paddingFFs = makePadding(bytesLeft); - wait(self->file->append(paddingFFs.begin(), bytesLeft)); + state Subspace statusSpace = backupAgent->subspace.get(BackupAgentBase::keyStates).get(uid.toString()); + state Subspace globalConfig = backupAgent->subspace.get(BackupAgentBase::keyConfig).get(uid.toString()); + state Subspace newConfigSpace = + uidPrefixKey(LiteralStringRef("uid->config/").withPrefix(fileBackupPrefixRange.begin), uid); + + Optional statusStr = wait(tr->get(statusSpace.pack(FileBackupAgent::keyStateStatus))); + state EBackupState status = + !statusStr.present() ? 
FileBackupAgent::STATE_NEVERRAN : BackupAgentBase::getState(statusStr.get().toString()); + + TraceEvent(SevInfo, "FileBackupAbortIncompatibleBackup") + .detail("TagName", tagName.c_str()) + .detail("Status", BackupAgentBase::getStateText(status)); + + // Clear the folder id to prevent future tasks from executing at all + tr->clear(singleKeyRange(StringRef(globalConfig.pack(FileBackupAgent::keyFolderId)))); + + // Clear the mutations logging config and data + Key configPath = uidPrefixKey(logRangesRange.begin, uid); + Key logsPath = uidPrefixKey(backupLogKeys.begin, uid); + tr->clear(KeyRangeRef(configPath, strinc(configPath))); + tr->clear(KeyRangeRef(logsPath, strinc(logsPath))); + + // Clear the new-style config space + tr->clear(newConfigSpace.range()); + + Key statusKey = StringRef(statusSpace.pack(FileBackupAgent::keyStateStatus)); + + // Set old style state key to Aborted if it was Runnable + if (backupAgent->isRunnable(status)) + tr->set(statusKey, StringRef(FileBackupAgent::getStateText(BackupAgentBase::STATE_ABORTED))); + + return Void(); +} + +struct AbortFiveZeroBackupTask : TaskFuncBase { + static StringRef name; + ACTOR static Future _finish(Reference tr, + Reference taskBucket, + Reference futureBucket, + Reference task) { + state FileBackupAgent backupAgent; + state std::string tagName = task->params[BackupAgentBase::keyConfigBackupTag].toString(); + + TEST(true); // Canceling old backup task + + TraceEvent(SevInfo, "FileBackupCancelOldTask") + .detail("Task", task->params[Task::reservedTaskParamKeyType]) + .detail("TagName", tagName); + wait(abortFiveZeroBackup(&backupAgent, tr, tagName)); + + wait(taskBucket->finish(tr, task)); + return Void(); + } + + virtual StringRef getName() const { + TraceEvent(SevError, "FileBackupError") + .detail("Cause", "AbortFiveZeroBackupTaskFunc::name() should never be called"); + ASSERT(false); + return StringRef(); + } + + Future execute(Database cx, Reference tb, Reference fb, Reference task) { + return Future(Void()); + }; + Future finish(Reference tr, + Reference tb, + Reference fb, + Reference task) { + return _finish(tr, tb, fb, task); + }; +}; +StringRef AbortFiveZeroBackupTask::name = LiteralStringRef("abort_legacy_backup"); +REGISTER_TASKFUNC(AbortFiveZeroBackupTask); +REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_diff_logs); +REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_log_range); +REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_logs); +REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_range); +REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_restorable); +REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_finish_full_backup); +REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_finished_full_backup); +REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_start_full_backup); + +ACTOR static Future abortFiveOneBackup(FileBackupAgent* backupAgent, + Reference tr, + std::string tagName) { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + state KeyBackedTag tag = makeBackupTag(tagName); + state UidAndAbortedFlagT current = wait(tag.getOrThrow(tr, false, backup_unneeded())); + + state BackupConfig config(current.first); + EBackupState status = wait(config.stateEnum().getD(tr, false, EBackupState::STATE_NEVERRAN)); + + if (!backupAgent->isRunnable((BackupAgentBase::enumState)status)) { + throw backup_unneeded(); + } + + TraceEvent(SevInfo, "FBA_AbortFileOneBackup") + .detail("TagName", 
tagName.c_str()) + .detail("Status", BackupAgentBase::getStateText(status)); + + // Cancel backup task through tag + wait(tag.cancel(tr)); + + Key configPath = uidPrefixKey(logRangesRange.begin, config.getUid()); + Key logsPath = uidPrefixKey(backupLogKeys.begin, config.getUid()); + + tr->clear(KeyRangeRef(configPath, strinc(configPath))); + tr->clear(KeyRangeRef(logsPath, strinc(logsPath))); + + config.stateEnum().set(tr, EBackupState::STATE_ABORTED); + + return Void(); +} + +struct AbortFiveOneBackupTask : TaskFuncBase { + static StringRef name; + ACTOR static Future _finish(Reference tr, + Reference taskBucket, + Reference futureBucket, + Reference task) { + state FileBackupAgent backupAgent; + state BackupConfig config(task); + state std::string tagName = wait(config.tag().getOrThrow(tr)); + + TEST(true); // Canceling 5.1 backup task + + TraceEvent(SevInfo, "FileBackupCancelFiveOneTask") + .detail("Task", task->params[Task::reservedTaskParamKeyType]) + .detail("TagName", tagName); + wait(abortFiveOneBackup(&backupAgent, tr, tagName)); + + wait(taskBucket->finish(tr, task)); + return Void(); + } + + virtual StringRef getName() const { + TraceEvent(SevError, "FileBackupError") + .detail("Cause", "AbortFiveOneBackupTaskFunc::name() should never be called"); + ASSERT(false); + return StringRef(); + } + + Future execute(Database cx, Reference tb, Reference fb, Reference task) { + return Future(Void()); + }; + Future finish(Reference tr, + Reference tb, + Reference fb, + Reference task) { + return _finish(tr, tb, fb, task); + }; +}; +StringRef AbortFiveOneBackupTask::name = LiteralStringRef("abort_legacy_backup_5.2"); +REGISTER_TASKFUNC(AbortFiveOneBackupTask); +REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_write_range); +REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_dispatch_ranges); +REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_write_logs); +REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_erase_logs); +REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_dispatch_logs); +REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_finished); +REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_write_snapshot_manifest); +REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_start); + +std::function)> NOP_SETUP_TASK_FN = [](Reference task) { /* NOP */ }; +ACTOR static Future addBackupTask(StringRef name, + uint32_t version, + Reference tr, + Reference taskBucket, + TaskCompletionKey completionKey, + BackupConfig config, + Reference waitFor = Reference(), + std::function)> setupTaskFn = NOP_SETUP_TASK_FN, + int priority = 0, + bool setValidation = true) { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + Key doneKey = wait(completionKey.get(tr, taskBucket)); + state Reference task(new Task(name, version, doneKey, priority)); + + // Bind backup config to new task + wait(config.toTask(tr, task, setValidation)); + + // Set task specific params + setupTaskFn(task); + + if (!waitFor) { + return taskBucket->addTask(tr, task); + } + wait(waitFor->onSetAddTask(tr, taskBucket, task)); + + return LiteralStringRef("OnSetAddTask"); +} + +// Backup and Restore taskFunc definitions will inherit from one of the following classes which +// servers to catch and log to the appropriate config any error that execute/finish didn't catch and log. 
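// A minimal standalone sketch of the pattern described above: a task base class whose handleError()
// routes any error the runner caught to the owning (restore or backup) configuration's log. Plain C++
// stand-ins, not FDB's TaskFuncBase/RestoreConfig/BackupConfig:
#include <cstdio>
#include <stdexcept>

struct MiniTaskFunc {
    virtual ~MiniTaskFunc() = default;
    virtual void execute() = 0;
    virtual void handleError(const std::exception& e) = 0; // subclasses decide where errors are recorded
};

struct MiniRestoreTask : MiniTaskFunc {
    void execute() override { throw std::runtime_error("restore_bad_read"); }
    void handleError(const std::exception& e) override {
        std::printf("logged to restore config: '%s'\n", e.what());
    }
};

int main() {
    MiniRestoreTask t;
    try {
        t.execute();
    } catch (const std::exception& e) {
        t.handleError(e); // the task runner catches what execute()/finish() did not and delegates
    }
    return 0;
}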
+struct RestoreTaskFuncBase : TaskFuncBase { + virtual Future handleError(Database cx, Reference task, Error const& error) { + return RestoreConfig(task).logError( + cx, + error, + format("'%s' on '%s'", error.what(), task->params[Task::reservedTaskParamKeyType].printable().c_str())); + } + virtual std::string toString(Reference task) { return ""; } +}; + +struct BackupTaskFuncBase : TaskFuncBase { + virtual Future handleError(Database cx, Reference task, Error const& error) { + return BackupConfig(task).logError( + cx, + error, + format("'%s' on '%s'", error.what(), task->params[Task::reservedTaskParamKeyType].printable().c_str())); + } + virtual std::string toString(Reference task) { return ""; } +}; + +ACTOR static Future>> getBlockOfShards(Reference tr, + Key beginKey, + Key endKey, + int limit) { + + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + state Standalone> results; + Standalone values = wait(tr->getRange( + KeyRangeRef(keyAfter(beginKey.withPrefix(keyServersPrefix)), endKey.withPrefix(keyServersPrefix)), limit)); + + for (auto& s : values) { + KeyRef k = s.key.removePrefix(keyServersPrefix); + results.push_back_deep(results.arena(), k); + } + + return results; +} + +struct BackupRangeTaskFunc : BackupTaskFuncBase { + static StringRef name; + static const uint32_t version; + + static struct { + static TaskParam beginKey() { return LiteralStringRef(__FUNCTION__); } + static TaskParam endKey() { return LiteralStringRef(__FUNCTION__); } + static TaskParam addBackupRangeTasks() { return LiteralStringRef(__FUNCTION__); } + } Params; + + std::string toString(Reference task) { + return format("beginKey '%s' endKey '%s' addTasks %d", + Params.beginKey().get(task).printable().c_str(), + Params.endKey().get(task).printable().c_str(), + Params.addBackupRangeTasks().get(task)); + } + + StringRef getName() const { return name; }; + + Future execute(Database cx, Reference tb, Reference fb, Reference task) { + return _execute(cx, tb, fb, task); + }; + Future finish(Reference tr, + Reference tb, + Reference fb, + Reference task) { + return _finish(tr, tb, fb, task); + }; + + // Finish (which flushes/syncs) the file, and then in a single transaction, make some range backup progress durable. + // This means: + // - increment the backup config's range bytes written + // - update the range file map + // - update the task begin key + // - save/extend the task with the new params + // Returns whether or not the caller should continue executing the task. + ACTOR static Future finishRangeFile(Reference file, + Database cx, + Reference task, + Reference taskBucket, + KeyRange range, + Version version) { + wait(file->finish()); + + // Ignore empty ranges. + if (range.empty()) + return false; + + state Reference tr(new ReadYourWritesTransaction(cx)); + state BackupConfig backup(task); + state bool usedFile = false; + + // Avoid unnecessary conflict by prevent taskbucket's automatic timeout extension + // because the following transaction loop extends and updates the task. + wait(task->extendMutex.take()); + state FlowLock::Releaser releaser(task->extendMutex, 1); + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + // Update the start key of the task so if this transaction completes but the task then fails + // when it is restarted it will continue where this execution left off. 
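// A minimal standalone sketch of the resume-from-checkpoint idea in the comment above: the next
// begin key is persisted together with the progress that was just made durable, so a restarted
// task continues where the previous execution stopped. Plain C++ with hypothetical keys; the real
// code does this inside a transaction combined with taskBucket->extendTimeout().
#include <cstdio>
#include <string>
#include <vector>

struct Checkpoint {
    std::string nextBeginKey; // persisted atomically with each completed range file
};

static void runRangeTask(Checkpoint& durable, const std::vector<std::string>& fileEndKeys) {
    for (const std::string& end : fileEndKeys) {
        if (end <= durable.nextBeginKey)
            continue; // already covered by an earlier (possibly failed and restarted) execution
        std::printf("backed up ['%s','%s')\n", durable.nextBeginKey.c_str(), end.c_str());
        durable.nextBeginKey = end; // "commit" the checkpoint
    }
}

int main() {
    Checkpoint cp{ "a" };
    runRangeTask(cp, { "f", "m" });
    runRangeTask(cp, { "f", "m", "z" }); // simulated restart: finished work is skipped, [m,z) resumes
    return 0;
}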
+ Params.beginKey().set(task, range.end); + + // Save and extend the task with the new begin parameter + state Version newTimeout = wait(taskBucket->extendTimeout(tr, task, true)); + + // Update the range bytes written in the backup config + backup.rangeBytesWritten().atomicOp(tr, file->size(), MutationRef::AddValue); + backup.snapshotRangeFileCount().atomicOp(tr, 1, MutationRef::AddValue); + + // See if there is already a file for this key which has an earlier begin, update the map if not. + Optional s = wait(backup.snapshotRangeFileMap().get(tr, range.end)); + if (!s.present() || s.get().begin >= range.begin) { + backup.snapshotRangeFileMap().set( + tr, range.end, { range.begin, version, file->getFileName(), file->size() }); + usedFile = true; } - // Set new blockEnd - self->blockEnd += self->blockSize; - - // write Header - wait(self->file->append((uint8_t *)&self->fileVersion, sizeof(self->fileVersion))); + wait(tr->commit()); + task->timeoutVersion = newTimeout; + break; + } catch (Error& e) { + wait(tr->onError(e)); } + } - wait(self->file->appendStringRefWithLen(k)); - wait(self->file->appendStringRefWithLen(v)); + return usedFile; + } - // At this point we should be in whatever the current block is or the block size is too small - if(self->file->size() > self->blockEnd) - throw backup_bad_block_size(); + ACTOR static Future addTask(Reference tr, + Reference taskBucket, + Reference parentTask, + int priority, + Key begin, + Key end, + TaskCompletionKey completionKey, + Reference waitFor = Reference(), + Version scheduledVersion = invalidVersion) { + Key key = wait(addBackupTask( + BackupRangeTaskFunc::name, + BackupRangeTaskFunc::version, + tr, + taskBucket, + completionKey, + BackupConfig(parentTask), + waitFor, + [=](Reference task) { + Params.beginKey().set(task, begin); + Params.endKey().set(task, end); + Params.addBackupRangeTasks().set(task, false); + if (scheduledVersion != invalidVersion) + ReservedTaskParams::scheduledVersion().set(task, scheduledVersion); + }, + priority)); + return key; + } + ACTOR static Future _execute(Database cx, + Reference taskBucket, + Reference futureBucket, + Reference task) { + state Reference lock(new FlowLock(CLIENT_KNOBS->BACKUP_LOCK_BYTES)); + + wait(checkTaskVersion(cx, task, BackupRangeTaskFunc::name, BackupRangeTaskFunc::version)); + + state Key beginKey = Params.beginKey().get(task); + state Key endKey = Params.endKey().get(task); + + TraceEvent("FileBackupRangeStart") + .suppressFor(60) + .detail("BackupUID", BackupConfig(task).getUid()) + .detail("BeginKey", Params.beginKey().get(task).printable()) + .detail("EndKey", Params.endKey().get(task).printable()) + .detail("TaskKey", task->key.printable()); + + // When a key range task saves the last chunk of progress and then the executor dies, when the task continues + // its beginKey and endKey will be equal but there is no work to be done. + if (beginKey == endKey) + return Void(); + + // Find out if there is a shard boundary in(beginKey, endKey) + Standalone> keys = wait(runRYWTransaction( + cx, [=](Reference tr) { return getBlockOfShards(tr, beginKey, endKey, 1); })); + if (keys.size() > 0) { + Params.addBackupRangeTasks().set(task, true); return Void(); } - Future writeKV(Key k, Value v) { return writeKV_impl(this, k, v); } + // Read everything from beginKey to endKey, write it to an output file, run the output file processor, and + // then set on_done. If we are still writing after X seconds, end the output file and insert a new backup_range + // task for the remainder. 
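// A minimal standalone sketch of the loop described above: consume a stream of (readVersion, batch)
// results and roll to a new output file whenever the read version changes, finishing the open file
// at end of stream. The stream and "file" here are hypothetical stand-ins, not readCommitted()
// or IBackupFile:
#include <cstdio>
#include <string>
#include <vector>

struct Batch {
    long long version;             // read version the batch was fetched at
    std::vector<std::string> keys; // keys in the batch
};

int main() {
    std::vector<Batch> stream = { { 100, { "a", "b" } }, { 100, { "c" } }, { 120, { "d", "e" } } };
    long long outVersion = -1; // no file open yet

    for (const Batch& b : stream) {
        if (b.version != outVersion) {
            if (outVersion != -1)
                std::printf("finish range file @%lld\n", outVersion); // flush, truncate, rename
            outVersion = b.version;
            std::printf("open new range file @%lld\n", outVersion);
        }
        for (const std::string& k : b.keys)
            std::printf("  write %s\n", k.c_str());
    }
    if (outVersion != -1)
        std::printf("finish range file @%lld\n", outVersion); // end of stream
    return 0;
}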
+ state Reference outFile; + state Version outVersion = invalidVersion; + state Key lastKey; - Reference file; - int blockSize; + // retrieve kvData + state PromiseStream results; - private: - int64_t blockEnd; - uint32_t fileVersion; + state Future rc = readCommitted(cx, results, lock, KeyRangeRef(beginKey, endKey), true, true, true); + state RangeFileWriter rangeFile; + state BackupConfig backup(task); + + // Don't need to check keepRunning(task) here because we will do that while finishing each output file, but if + // bc is false then clearly the backup is no longer in progress + state Reference bc = wait(backup.backupContainer().getD(cx)); + if (!bc) { + return Void(); + } + + state bool done = false; + state int64_t nrKeys = 0; + + loop { + state RangeResultWithVersion values; + try { + RangeResultWithVersion _values = waitNext(results.getFuture()); + values = _values; + lock->release(values.first.expectedSize()); + } catch (Error& e) { + if (e.code() == error_code_end_of_stream) + done = true; + else + throw; + } + + // If we've seen a new read version OR hit the end of the stream, then if we were writing a file finish it. + if (values.second != outVersion || done) { + if (outFile) { + TEST(outVersion != invalidVersion); // Backup range task wrote multiple versions + state Key nextKey = done ? endKey : keyAfter(lastKey); + wait(rangeFile.writeKey(nextKey)); + + if (BUGGIFY) { + wait(rangeFile.padEnd()); + } + + bool usedFile = wait( + finishRangeFile(outFile, cx, task, taskBucket, KeyRangeRef(beginKey, nextKey), outVersion)); + TraceEvent("FileBackupWroteRangeFile") + .suppressFor(60) + .detail("BackupUID", backup.getUid()) + .detail("Size", outFile->size()) + .detail("Keys", nrKeys) + .detail("ReadVersion", outVersion) + .detail("BeginKey", beginKey.printable()) + .detail("EndKey", nextKey.printable()) + .detail("AddedFileToMap", usedFile); + + nrKeys = 0; + beginKey = nextKey; + } + + if (done) + return Void(); + + // Start writing a new file after verifying this task should keep running as of a new read version + // (which must be >= outVersion) + outVersion = values.second; + // block size must be at least large enough for 3 max size keys and 2 max size values + overhead so 250k + // conservatively. + state int blockSize = + BUGGIFY ? 
deterministicRandom()->randomInt(250e3, 4e6) : CLIENT_KNOBS->BACKUP_RANGEFILE_BLOCK_SIZE; + state Version snapshotBeginVersion; + state int64_t snapshotRangeFileCount; + + state Reference tr(new ReadYourWritesTransaction(cx)); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + wait(taskBucket->keepRunning(tr, task) && + storeOrThrow(snapshotBeginVersion, backup.snapshotBeginVersion().get(tr)) && + store(snapshotRangeFileCount, backup.snapshotRangeFileCount().getD(tr))); + + break; + } catch (Error& e) { + wait(tr->onError(e)); + } + } + + Reference f = + wait(bc->writeRangeFile(snapshotBeginVersion, snapshotRangeFileCount, outVersion, blockSize)); + outFile = f; + + // Initialize range file writer and write begin key + rangeFile = RangeFileWriter(outFile, blockSize); + wait(rangeFile.writeKey(beginKey)); + } + + // write kvData to file, update lastKey and key count + if (values.first.size() != 0) { + state size_t i = 0; + for (; i < values.first.size(); ++i) { + wait(rangeFile.writeKV(values.first[i].key, values.first[i].value)); + } + lastKey = values.first.back().key; + nrKeys += values.first.size(); + } + } + } + + ACTOR static Future startBackupRangeInternal(Reference tr, + Reference taskBucket, + Reference futureBucket, + Reference task, + Reference onDone) { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + state Key nextKey = Params.beginKey().get(task); + state Key endKey = Params.endKey().get(task); + + state Standalone> keys = + wait(getBlockOfShards(tr, nextKey, endKey, CLIENT_KNOBS->BACKUP_SHARD_TASK_LIMIT)); + + std::vector> addTaskVector; + for (int idx = 0; idx < keys.size(); ++idx) { + if (nextKey != keys[idx]) { + addTaskVector.push_back(addTask(tr, + taskBucket, + task, + task->getPriority(), + nextKey, + keys[idx], + TaskCompletionKey::joinWith(onDone))); + TraceEvent("FileBackupRangeSplit") + .suppressFor(60) + .detail("BackupUID", BackupConfig(task).getUid()) + .detail("BeginKey", Params.beginKey().get(task).printable()) + .detail("EndKey", Params.endKey().get(task).printable()) + .detail("SliceBeginKey", nextKey.printable()) + .detail("SliceEndKey", keys[idx].printable()); + } + nextKey = keys[idx]; + } + + wait(waitForAll(addTaskVector)); + + if (nextKey != endKey) { + // Add task to cover nextKey to the end, using the priority of the current task + wait(success(addTask(tr, + taskBucket, + task, + task->getPriority(), + nextKey, + endKey, + TaskCompletionKey::joinWith(onDone), + Reference(), + task->getPriority()))); + } + + return Void(); + } + + ACTOR static Future _finish(Reference tr, + Reference taskBucket, + Reference futureBucket, + Reference task) { + state Reference taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]); + + if (Params.addBackupRangeTasks().get(task)) { + wait(startBackupRangeInternal(tr, taskBucket, futureBucket, task, taskFuture)); + } else { + wait(taskFuture->set(tr, taskBucket)); + } + + wait(taskBucket->finish(tr, task)); + + TraceEvent("FileBackupRangeFinish") + .suppressFor(60) + .detail("BackupUID", BackupConfig(task).getUid()) + .detail("BeginKey", Params.beginKey().get(task).printable()) + .detail("EndKey", Params.endKey().get(task).printable()) + .detail("TaskKey", task->key.printable()); + + return Void(); + } +}; +StringRef BackupRangeTaskFunc::name = LiteralStringRef("file_backup_write_range_5.2"); +const uint32_t BackupRangeTaskFunc::version = 1; 
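// A minimal standalone sketch of the splitting done by startBackupRangeInternal() above: divide
// [begin, end) at known shard boundaries, emitting one sub-range per boundary plus a final
// sub-range up to end. Boundary values here are hypothetical.
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

static std::vector<std::pair<std::string, std::string>> splitAtBoundaries(const std::string& begin,
                                                                          const std::string& end,
                                                                          const std::vector<std::string>& bounds) {
    std::vector<std::pair<std::string, std::string>> out;
    std::string next = begin;
    for (const std::string& b : bounds) {
        if (next != b)
            out.push_back({ next, b }); // would become one child backup range task
        next = b;
    }
    if (next != end)
        out.push_back({ next, end }); // cover the remainder up to the requested end key
    return out;
}

int main() {
    for (auto& r : splitAtBoundaries("a", "z", { "f", "m", "t" }))
        std::printf("['%s','%s')\n", r.first.c_str(), r.second.c_str()); // [a,f) [f,m) [m,t) [t,z)
    return 0;
}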
+REGISTER_TASKFUNC(BackupRangeTaskFunc); + +struct BackupSnapshotDispatchTask : BackupTaskFuncBase { + static StringRef name; + static const uint32_t version; + + static struct { + // Set by Execute, used by Finish + static TaskParam shardsBehind() { return LiteralStringRef(__FUNCTION__); } + // Set by Execute, used by Finish + static TaskParam snapshotFinished() { return LiteralStringRef(__FUNCTION__); } + // Set by Execute, used by Finish + static TaskParam nextDispatchVersion() { return LiteralStringRef(__FUNCTION__); } + } Params; + + StringRef getName() const { return name; }; + + Future execute(Database cx, Reference tb, Reference fb, Reference task) { + return _execute(cx, tb, fb, task); + }; + Future finish(Reference tr, + Reference tb, + Reference fb, + Reference task) { + return _finish(tr, tb, fb, task); }; - ACTOR Future>> decodeLogFileBlock(Reference file, int64_t offset, int len) { - state Standalone buf = makeString(len); - int rLen = wait(file->read(mutateString(buf), len, offset)); - if(rLen != len) - throw restore_bad_read(); + ACTOR static Future addTask(Reference tr, + Reference taskBucket, + Reference parentTask, + int priority, + TaskCompletionKey completionKey, + Reference waitFor = Reference(), + Version scheduledVersion = invalidVersion) { + Key key = wait(addBackupTask( + name, + version, + tr, + taskBucket, + completionKey, + BackupConfig(parentTask), + waitFor, + [=](Reference task) { + if (scheduledVersion != invalidVersion) + ReservedTaskParams::scheduledVersion().set(task, scheduledVersion); + }, + priority)); + return key; + } - Standalone> results({}, buf.arena()); - state StringRefReader reader(buf, restore_corrupted_data()); + enum DispatchState { SKIP = 0, DONE = 1, NOT_DONE_MIN = 2 }; + ACTOR static Future _execute(Database cx, + Reference taskBucket, + Reference futureBucket, + Reference task) { + state Reference lock(new FlowLock(CLIENT_KNOBS->BACKUP_LOCK_BYTES)); + wait(checkTaskVersion(cx, task, name, version)); + + state double startTime = timer(); + state Reference tr(new ReadYourWritesTransaction(cx)); + + // The shard map will use 3 values classes. Exactly SKIP, exactly DONE, then any number >= NOT_DONE_MIN which + // will mean not done. This is to enable an efficient coalesce() call to squash adjacent ranges which are not + // yet finished to enable efficiently finding random database shards which are not done. 
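// A toy illustration of the three value classes described above (SKIP, DONE, and unique "not done"
// sequence numbers >= NOT_DONE_MIN), using a plain std::map keyed by shard begin key rather than
// FDB's KeyRangeMap. Shard keys are hypothetical.
#include <cstdio>
#include <map>
#include <string>

enum { SKIP = 0, DONE = 1, NOT_DONE_MIN = 2 };

int main() {
    int notDoneSequence = NOT_DONE_MIN;
    std::map<std::string, int> shardMap; // shard begin key -> class value
    shardMap["a"] = notDoneSequence++;
    shardMap["f"] = notDoneSequence++;
    shardMap["m"] = notDoneSequence++;

    shardMap["f"] = DONE; // already dispatched in an earlier batch
    shardMap["m"] = SKIP; // outside the configured backup ranges

    int done = 0, notDone = 0;
    for (auto& s : shardMap) {
        if (s.second == DONE)
            ++done;
        else if (s.second >= NOT_DONE_MIN)
            ++notDone;
    }
    std::printf("done=%d notDone=%d\n", done, notDone); // done=1 notDone=1
    return 0;
}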
+ state int notDoneSequence = NOT_DONE_MIN; + state KeyRangeMap shardMap(notDoneSequence++, normalKeys.end); + state Key beginKey = normalKeys.begin; + + // Read all shard boundaries and add them to the map + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + state Future>> shardBoundaries = + getBlockOfShards(tr, beginKey, normalKeys.end, CLIENT_KNOBS->TOO_MANY); + wait(success(shardBoundaries) && taskBucket->keepRunning(tr, task)); + + if (shardBoundaries.get().size() == 0) + break; + + for (auto& boundary : shardBoundaries.get()) { + shardMap.rawInsert(boundary, notDoneSequence++); + } + + beginKey = keyAfter(shardBoundaries.get().back()); + tr->reset(); + } catch (Error& e) { + wait(tr->onError(e)); + } + } + + // Read required stuff from backup config + state BackupConfig config(task); + state Version recentReadVersion; + state Version snapshotBeginVersion; + state Version snapshotTargetEndVersion; + state int64_t snapshotIntervalSeconds; + state Optional latestSnapshotEndVersion; + state std::vector backupRanges; + state Optional snapshotBatchFutureKey; + state Reference snapshotBatchFuture; + state Optional snapshotBatchSize; + + tr->reset(); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + wait(store(snapshotBeginVersion, config.snapshotBeginVersion().getOrThrow(tr)) && + store(snapshotTargetEndVersion, config.snapshotTargetEndVersion().getOrThrow(tr)) && + store(backupRanges, config.backupRanges().getOrThrow(tr)) && + store(snapshotIntervalSeconds, config.snapshotIntervalSeconds().getOrThrow(tr)) + // The next two parameters are optional + && store(snapshotBatchFutureKey, config.snapshotBatchFuture().get(tr)) && + store(snapshotBatchSize, config.snapshotBatchSize().get(tr)) && + store(latestSnapshotEndVersion, config.latestSnapshotEndVersion().get(tr)) && + store(recentReadVersion, tr->getReadVersion()) && taskBucket->keepRunning(tr, task)); + + // If the snapshot batch future key does not exist, this is the first execution of this dispatch task so + // - create and set the snapshot batch future key + // - initialize the batch size to 0 + // - initialize the target snapshot end version if it is not yet set + // - commit + if (!snapshotBatchFutureKey.present()) { + snapshotBatchFuture = futureBucket->future(tr); + config.snapshotBatchFuture().set(tr, snapshotBatchFuture->pack()); + snapshotBatchSize = 0; + config.snapshotBatchSize().set(tr, snapshotBatchSize.get()); + + // The dispatch of this batch can take multiple separate executions if the executor fails + // so store a completion key for the dispatch finish() to set when dispatching the batch is done. + state TaskCompletionKey dispatchCompletionKey = TaskCompletionKey::joinWith(snapshotBatchFuture); + // this is a bad hack - but flow doesn't work well with lambda functions and caputring + // state variables... 
+ auto cfg = &config; + auto tx = &tr; + wait(map(dispatchCompletionKey.get(tr, taskBucket), [cfg, tx](Key const& k) { + cfg->snapshotBatchDispatchDoneKey().set(*tx, k); + return Void(); + })); + wait(tr->commit()); + } else { + ASSERT(snapshotBatchSize.present()); + // Batch future key exists in the config so create future from it + snapshotBatchFuture = + Reference(new TaskFuture(futureBucket, snapshotBatchFutureKey.get())); + } + + break; + } catch (Error& e) { + wait(tr->onError(e)); + } + } + + // Read all dispatched ranges + state std::vector> dispatchBoundaries; + tr->reset(); + beginKey = normalKeys.begin; + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + state Future>> bounds = config.snapshotRangeDispatchMap().getRange( + tr, beginKey, keyAfter(normalKeys.end), CLIENT_KNOBS->TOO_MANY); + wait(success(bounds) && taskBucket->keepRunning(tr, task) && + store(recentReadVersion, tr->getReadVersion())); + + if (bounds.get().empty()) + break; + + dispatchBoundaries.reserve(dispatchBoundaries.size() + bounds.get().size()); + dispatchBoundaries.insert(dispatchBoundaries.end(), bounds.get().begin(), bounds.get().end()); + + beginKey = keyAfter(bounds.get().back().first); + tr->reset(); + } catch (Error& e) { + wait(tr->onError(e)); + } + } + + // The next few sections involve combining the results above. Yields are used after operations + // that could have operated on many thousands of things and in loops which could have many + // thousands of iterations. + // Declare some common iterators which must be state vars and will be used multiple times. + state int i; + state RangeMap::Iterator iShard; + state RangeMap::Iterator iShardEnd; + + // Set anything inside a dispatched range to DONE. + // Also ensure that the boundary value are true, false, [true, false]... + if (dispatchBoundaries.size() > 0) { + state bool lastValue = false; + state Key lastKey; + for (i = 0; i < dispatchBoundaries.size(); ++i) { + const std::pair& boundary = dispatchBoundaries[i]; + + // Values must alternate + ASSERT(boundary.second == !lastValue); + + // If this was the end of a dispatched range + if (!boundary.second) { + // Ensure that the dispatched boundaries exist AND set all shard ranges in the dispatched range to + // DONE. + RangeMap::Ranges shardRanges = + shardMap.modify(KeyRangeRef(lastKey, boundary.first)); + iShard = shardRanges.begin(); + iShardEnd = shardRanges.end(); + for (; iShard != iShardEnd; ++iShard) { + iShard->value() = DONE; + wait(yield()); + } + } + lastValue = dispatchBoundaries[i].second; + lastKey = dispatchBoundaries[i].first; + + wait(yield()); + } + ASSERT(lastValue == false); + } + + // Set anything outside the backup ranges to SKIP. We can use insert() here instead of modify() + // because it's OK to delete shard boundaries in the skipped ranges. + if (backupRanges.size() > 0) { + shardMap.insert(KeyRangeRef(normalKeys.begin, backupRanges.front().begin), SKIP); + wait(yield()); + + for (i = 0; i < backupRanges.size() - 1; ++i) { + shardMap.insert(KeyRangeRef(backupRanges[i].end, backupRanges[i + 1].begin), SKIP); + wait(yield()); + } + + shardMap.insert(KeyRangeRef(backupRanges.back().end, normalKeys.end), SKIP); + wait(yield()); + } + + state int countShardsDone = 0; + state int countShardsNotDone = 0; + + // Scan through the shard map, counting the DONE and NOT_DONE shards. 
+ RangeMap::Ranges shardRanges = shardMap.ranges(); + iShard = shardRanges.begin(); + iShardEnd = shardRanges.end(); + for (; iShard != iShardEnd; ++iShard) { + if (iShard->value() == DONE) { + ++countShardsDone; + } else if (iShard->value() >= NOT_DONE_MIN) + ++countShardsNotDone; + + wait(yield()); + } + + // Coalesce the shard map to make random selection below more efficient. + shardMap.coalesce(normalKeys); + wait(yield()); + + // In this context "all" refers to all of the shards relevant for this particular backup + state int countAllShards = countShardsDone + countShardsNotDone; + + if (countShardsNotDone == 0) { + TraceEvent("FileBackupSnapshotDispatchFinished") + .detail("BackupUID", config.getUid()) + .detail("AllShards", countAllShards) + .detail("ShardsDone", countShardsDone) + .detail("ShardsNotDone", countShardsNotDone) + .detail("SnapshotBeginVersion", snapshotBeginVersion) + .detail("SnapshotTargetEndVersion", snapshotTargetEndVersion) + .detail("CurrentVersion", recentReadVersion) + .detail("SnapshotIntervalSeconds", snapshotIntervalSeconds); + Params.snapshotFinished().set(task, true); + return Void(); + } + + // Decide when the next snapshot dispatch should run. + state Version nextDispatchVersion; + + // In simulation, use snapshot interval / 5 to ensure multiple dispatches run + // Otherwise, use the knob for the number of seconds between snapshot dispatch tasks. + if (g_network->isSimulated()) + nextDispatchVersion = + recentReadVersion + CLIENT_KNOBS->CORE_VERSIONSPERSECOND * (snapshotIntervalSeconds / 5.0); + else + nextDispatchVersion = recentReadVersion + CLIENT_KNOBS->CORE_VERSIONSPERSECOND * + CLIENT_KNOBS->BACKUP_SNAPSHOT_DISPATCH_INTERVAL_SEC; + + // If nextDispatchVersion is greater than snapshotTargetEndVersion (which could be in the past) then just use + // the greater of recentReadVersion or snapshotTargetEndVersion. Any range tasks created in this dispatch will + // be scheduled at a random time between recentReadVersion and nextDispatchVersion, + // so nextDispatchVersion shouldn't be less than recentReadVersion. + if (nextDispatchVersion > snapshotTargetEndVersion) + nextDispatchVersion = std::max(recentReadVersion, snapshotTargetEndVersion); + + Params.nextDispatchVersion().set(task, nextDispatchVersion); + + // Calculate number of shards that should be done before the next interval end + // timeElapsed is between 0 and 1 and represents what portion of the shards we should have completed by now + double timeElapsed; + Version snapshotScheduledVersionInterval = snapshotTargetEndVersion - snapshotBeginVersion; + if (snapshotTargetEndVersion > snapshotBeginVersion) + timeElapsed = std::min( + 1.0, (double)(nextDispatchVersion - snapshotBeginVersion) / (snapshotScheduledVersionInterval)); + else + timeElapsed = 1.0; + + state int countExpectedShardsDone = countAllShards * timeElapsed; + state int countShardsToDispatch = std::max(0, countExpectedShardsDone - countShardsDone); + + // Calculate the number of shards that would have been dispatched by a normal (on-schedule) + // BackupSnapshotDispatchTask given the dispatch window and the start and expected-end versions of the current + // snapshot. 
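The pacing arithmetic above, and the normal-window comparison computed next, can be made concrete with a worked example (made-up numbers; variable names mirror the ones in this function, and this is an editor's sketch rather than code from the patch): progress is expected to be linear across the snapshot's version interval, this dispatch only schedules enough shards to get back onto that line, and anything beyond one normal dispatch window's worth of work counts as behind.

// Editor's worked example with invented inputs.
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
	const int64_t snapshotBeginVersion = 0;
	const int64_t snapshotTargetEndVersion = 1000; // snapshot should take 1000 versions
	const int64_t recentReadVersion = 300;
	const int64_t nextDispatchVersion = 400;       // next dispatch runs 100 versions from now
	const int countAllShards = 50;
	const int countShardsDone = 12;
	const int64_t snapshotBatchSize = 2;           // shards already dispatched but not finished

	const int64_t snapshotScheduledVersionInterval = snapshotTargetEndVersion - snapshotBeginVersion;

	// By nextDispatchVersion we should be 40% of the way through the snapshot.
	double timeElapsed = std::min(
	    1.0, double(nextDispatchVersion - snapshotBeginVersion) / snapshotScheduledVersionInterval);
	int countExpectedShardsDone = countAllShards * timeElapsed;                         // 20
	int countShardsToDispatch = std::max(0, countExpectedShardsDone - countShardsDone); // 8

	// A 100-version window over a 1000-version snapshot normally covers 10% of the shards.
	int64_t dispatchWindow = nextDispatchVersion - recentReadVersion;
	int countShardsExpectedPerNormalWindow =
	    (double(dispatchWindow) / snapshotScheduledVersionInterval) * countAllShards;   // 5

	// Work beyond one normal window's worth (including shards still in flight) is "behind".
	int64_t countShardsBehind =
	    std::max<int64_t>(0, countShardsToDispatch + snapshotBatchSize - countShardsExpectedPerNormalWindow);

	assert(countExpectedShardsDone == 20);
	assert(countShardsToDispatch == 8);
	assert(countShardsExpectedPerNormalWindow == 5);
	assert(countShardsBehind == 5);
	return 0;
}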
+ int64_t dispatchWindow = nextDispatchVersion - recentReadVersion; + + // If the scheduled snapshot interval is 0 (such as for initial, as-fast-as-possible snapshot) then all shards + // are considered late + int countShardsExpectedPerNormalWindow; + if (snapshotScheduledVersionInterval == 0) { + countShardsExpectedPerNormalWindow = 0; + } else { + // A dispatchWindow of 0 means the target end version is <= now which also results in all shards being + // considered late + countShardsExpectedPerNormalWindow = + (double(dispatchWindow) / snapshotScheduledVersionInterval) * countAllShards; + } + + // The number of shards 'behind' the snapshot is the count of how may additional shards beyond normal are being + // dispatched, if any. + int countShardsBehind = + std::max(0, countShardsToDispatch + snapshotBatchSize.get() - countShardsExpectedPerNormalWindow); + Params.shardsBehind().set(task, countShardsBehind); + + TraceEvent("FileBackupSnapshotDispatchStats") + .detail("BackupUID", config.getUid()) + .detail("AllShards", countAllShards) + .detail("ShardsDone", countShardsDone) + .detail("ShardsNotDone", countShardsNotDone) + .detail("ExpectedShardsDone", countExpectedShardsDone) + .detail("ShardsToDispatch", countShardsToDispatch) + .detail("ShardsBehind", countShardsBehind) + .detail("SnapshotBeginVersion", snapshotBeginVersion) + .detail("SnapshotTargetEndVersion", snapshotTargetEndVersion) + .detail("NextDispatchVersion", nextDispatchVersion) + .detail("CurrentVersion", recentReadVersion) + .detail("TimeElapsed", timeElapsed) + .detail("SnapshotIntervalSeconds", snapshotIntervalSeconds); + + // Dispatch random shards to catch up to the expected progress + while (countShardsToDispatch > 0) { + // First select ranges to add + state std::vector rangesToAdd; + + // Limit number of tasks added per transaction + int taskBatchSize = BUGGIFY ? deterministicRandom()->randomInt(1, countShardsToDispatch + 1) + : CLIENT_KNOBS->BACKUP_DISPATCH_ADDTASK_SIZE; + int added = 0; + + while (countShardsToDispatch > 0 && added < taskBatchSize && shardMap.size() > 0) { + // Get a random range. + auto it = shardMap.randomRange(); + // Find a NOT_DONE range and add it to rangesToAdd + while (1) { + if (it->value() >= NOT_DONE_MIN) { + rangesToAdd.push_back(it->range()); + it->value() = DONE; + shardMap.coalesce(Key(it->begin())); + ++added; + ++countShardsDone; + --countShardsToDispatch; + --countShardsNotDone; + break; + } + if (it->end() == shardMap.mapEnd) + break; + ++it; + } + } + + state int64_t oldBatchSize = snapshotBatchSize.get(); + state int64_t newBatchSize = oldBatchSize + rangesToAdd.size(); + + // Now add the selected ranges in a single transaction. + tr->reset(); + loop { + try { + TraceEvent("FileBackupSnapshotDispatchAddingTasks") + .suppressFor(2) + .detail("TasksToAdd", rangesToAdd.size()) + .detail("NewBatchSize", newBatchSize); + + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + // For each range, make sure it isn't set in the dispatched range map. 
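The dispatched-range encoding that the next transaction checks can be illustrated with an ordinary std::map (string keys, editor's sketch only): every key is a boundary, true marks the begin of a dispatched range and false its end, so a forward scan must see strictly alternating values and each false closes the range opened by the preceding true, which is the same invariant asserted when the boundaries were read earlier in this function.

// Editor's sketch of the boundary encoding; keys are illustrative.
#include <cassert>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
	// Two dispatched ranges: [b, d) and [f, g)
	std::map<std::string, bool> dispatchMap{ { "b", true }, { "d", false }, { "f", true }, { "g", false } };

	std::vector<std::pair<std::string, std::string>> dispatched;
	bool lastValue = false;
	std::string lastKey;
	for (const auto& [key, isBegin] : dispatchMap) {
		assert(isBegin == !lastValue); // boundary values must alternate
		if (!isBegin)
			dispatched.push_back({ lastKey, key }); // close the range opened at lastKey
		lastValue = isBegin;
		lastKey = key;
	}
	assert(lastValue == false); // the last boundary must close a range

	assert(dispatched.size() == 2);
	assert(dispatched[0] == std::make_pair(std::string("b"), std::string("d")));
	return 0;
}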
+ state std::vector>> beginReads; + state std::vector>> endReads; + + for (auto& range : rangesToAdd) { + beginReads.push_back(config.snapshotRangeDispatchMap().get(tr, range.begin)); + endReads.push_back(config.snapshotRangeDispatchMap().get(tr, range.end)); + } + + wait(store(snapshotBatchSize.get(), config.snapshotBatchSize().getOrThrow(tr)) && + waitForAll(beginReads) && waitForAll(endReads) && taskBucket->keepRunning(tr, task)); + + // Snapshot batch size should be either oldBatchSize or newBatchSize. If new, this transaction is + // already done. + if (snapshotBatchSize.get() == newBatchSize) { + break; + } else { + ASSERT(snapshotBatchSize.get() == oldBatchSize); + config.snapshotBatchSize().set(tr, newBatchSize); + snapshotBatchSize = newBatchSize; + config.snapshotDispatchLastShardsBehind().set(tr, Params.shardsBehind().get(task)); + config.snapshotDispatchLastVersion().set(tr, tr->getReadVersion().get()); + } + + state std::vector> addTaskFutures; + + for (i = 0; i < beginReads.size(); ++i) { + KeyRange& range = rangesToAdd[i]; + + // This loop might have made changes to begin or end boundaries in a prior + // iteration. If so, the updated values exist in the RYW cache so re-read both entries. + Optional beginValue = config.snapshotRangeDispatchMap().get(tr, range.begin).get(); + Optional endValue = config.snapshotRangeDispatchMap().get(tr, range.end).get(); + + ASSERT(!beginValue.present() || !endValue.present() || beginValue != endValue); + + // If begin is present, it must be a range end so value must be false + // If end is present, it must be a range begin so value must be true + if ((!beginValue.present() || !beginValue.get()) && (!endValue.present() || endValue.get())) { + if (beginValue.present()) { + config.snapshotRangeDispatchMap().erase(tr, range.begin); + } else { + config.snapshotRangeDispatchMap().set(tr, range.begin, true); + } + if (endValue.present()) { + config.snapshotRangeDispatchMap().erase(tr, range.end); + } else { + config.snapshotRangeDispatchMap().set(tr, range.end, false); + } + + Version scheduledVersion = invalidVersion; + // If the next dispatch version is in the future, choose a random version at which to start + // the new task. + if (nextDispatchVersion > recentReadVersion) + scheduledVersion = recentReadVersion + deterministicRandom()->random01() * + (nextDispatchVersion - recentReadVersion); + + // Range tasks during the initial snapshot should run at a higher priority + int priority = latestSnapshotEndVersion.present() ? 0 : 1; + addTaskFutures.push_back( + success(BackupRangeTaskFunc::addTask(tr, + taskBucket, + task, + priority, + range.begin, + range.end, + TaskCompletionKey::joinWith(snapshotBatchFuture), + Reference(), + scheduledVersion))); + + TraceEvent("FileBackupSnapshotRangeDispatched") + .suppressFor(2) + .detail("BackupUID", config.getUid()) + .detail("CurrentVersion", recentReadVersion) + .detail("ScheduledVersion", scheduledVersion) + .detail("BeginKey", range.begin.printable()) + .detail("EndKey", range.end.printable()); + } else { + // This shouldn't happen because if the transaction was already done or if another execution + // of this task is making progress it should have been detected above. 
+ ASSERT(false); + } + } + + wait(waitForAll(addTaskFutures)); + wait(tr->commit()); + break; + } catch (Error& e) { + wait(tr->onError(e)); + } + } + } + + if (countShardsNotDone == 0) { + TraceEvent("FileBackupSnapshotDispatchFinished") + .detail("BackupUID", config.getUid()) + .detail("AllShards", countAllShards) + .detail("ShardsDone", countShardsDone) + .detail("ShardsNotDone", countShardsNotDone) + .detail("SnapshotBeginVersion", snapshotBeginVersion) + .detail("SnapshotTargetEndVersion", snapshotTargetEndVersion) + .detail("CurrentVersion", recentReadVersion) + .detail("SnapshotIntervalSeconds", snapshotIntervalSeconds) + .detail("DispatchTimeSeconds", timer() - startTime); + Params.snapshotFinished().set(task, true); + } + + return Void(); + } + + // This function is just a wrapper for BackupSnapshotManifest::addTask() which is defined below. + // The BackupSnapshotDispatchTask and BackupSnapshotManifest tasks reference each other so in order to keep their + // execute and finish phases defined together inside their class definitions this wrapper is declared here but + // defined after BackupSnapshotManifest is defined. + static Future addSnapshotManifestTask(Reference tr, + Reference taskBucket, + Reference parentTask, + TaskCompletionKey completionKey, + Reference waitFor = Reference()); + + ACTOR static Future _finish(Reference tr, + Reference taskBucket, + Reference futureBucket, + Reference task) { + state BackupConfig config(task); + + // Get the batch future and dispatch done keys, then clear them. + state Key snapshotBatchFutureKey; + state Key snapshotBatchDispatchDoneKey; + + wait(store(snapshotBatchFutureKey, config.snapshotBatchFuture().getOrThrow(tr)) && + store(snapshotBatchDispatchDoneKey, config.snapshotBatchDispatchDoneKey().getOrThrow(tr))); + + state Reference snapshotBatchFuture = futureBucket->unpack(snapshotBatchFutureKey); + state Reference snapshotBatchDispatchDoneFuture = + futureBucket->unpack(snapshotBatchDispatchDoneKey); + config.snapshotBatchFuture().clear(tr); + config.snapshotBatchDispatchDoneKey().clear(tr); + config.snapshotBatchSize().clear(tr); + + // Update shardsBehind here again in case the execute phase did not actually have to create any shard tasks + config.snapshotDispatchLastShardsBehind().set(tr, Params.shardsBehind().getOrDefault(task, 0)); + config.snapshotDispatchLastVersion().set(tr, tr->getReadVersion().get()); + + state Reference snapshotFinishedFuture = task->getDoneFuture(futureBucket); + + // If the snapshot is finished, the next task is to write a snapshot manifest, otherwise it's another snapshot + // dispatch task. In either case, the task should wait for snapshotBatchFuture. The snapshot done key, passed to + // the current task, is also passed on. + if (Params.snapshotFinished().getOrDefault(task, false)) { + wait(success(addSnapshotManifestTask( + tr, taskBucket, task, TaskCompletionKey::signal(snapshotFinishedFuture), snapshotBatchFuture))); + } else { + wait(success(addTask(tr, + taskBucket, + task, + 1, + TaskCompletionKey::signal(snapshotFinishedFuture), + snapshotBatchFuture, + Params.nextDispatchVersion().get(task)))); + } + + // This snapshot batch is finished, so set the batch done future. 
+ wait(snapshotBatchDispatchDoneFuture->set(tr, taskBucket)); + + wait(taskBucket->finish(tr, task)); + + return Void(); + } +}; +StringRef BackupSnapshotDispatchTask::name = LiteralStringRef("file_backup_dispatch_ranges_5.2"); +const uint32_t BackupSnapshotDispatchTask::version = 1; +REGISTER_TASKFUNC(BackupSnapshotDispatchTask); + +struct BackupLogRangeTaskFunc : BackupTaskFuncBase { + static StringRef name; + static const uint32_t version; + + static struct { + static TaskParam addBackupLogRangeTasks() { return LiteralStringRef(__FUNCTION__); } + static TaskParam fileSize() { return LiteralStringRef(__FUNCTION__); } + static TaskParam beginVersion() { return LiteralStringRef(__FUNCTION__); } + static TaskParam endVersion() { return LiteralStringRef(__FUNCTION__); } + } Params; + + StringRef getName() const { return name; }; + + Future execute(Database cx, Reference tb, Reference fb, Reference task) { + return _execute(cx, tb, fb, task); + }; + Future finish(Reference tr, + Reference tb, + Reference fb, + Reference task) { + return _finish(tr, tb, fb, task); + }; + + ACTOR static Future _execute(Database cx, + Reference taskBucket, + Reference futureBucket, + Reference task) { + state Reference lock(new FlowLock(CLIENT_KNOBS->BACKUP_LOCK_BYTES)); + + wait(checkTaskVersion(cx, task, BackupLogRangeTaskFunc::name, BackupLogRangeTaskFunc::version)); + + state Version beginVersion = Params.beginVersion().get(task); + state Version endVersion = Params.endVersion().get(task); + + state BackupConfig config(task); + state Reference bc; + + state Reference tr(new ReadYourWritesTransaction(cx)); + loop { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + // Wait for the read version to pass endVersion + try { + wait(taskBucket->keepRunning(tr, task)); + + if (!bc) { + // Backup container must be present if we're still here + Reference _bc = wait(config.backupContainer().getOrThrow(tr)); + bc = _bc; + } + + Version currentVersion = tr->getReadVersion().get(); + if (endVersion < currentVersion) + break; + + wait(delay(std::max(CLIENT_KNOBS->BACKUP_RANGE_MINWAIT, + (double)(endVersion - currentVersion) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND))); + tr->reset(); + } catch (Error& e) { + wait(tr->onError(e)); + } + } + + Key destUidValue = wait(config.destUidValue().getOrThrow(tr)); + state Standalone> ranges = getLogRanges(beginVersion, endVersion, destUidValue); + if (ranges.size() > CLIENT_KNOBS->BACKUP_MAX_LOG_RANGES) { + Params.addBackupLogRangeTasks().set(task, true); + return Void(); + } + + // Block size must be at least large enough for 1 max size key, 1 max size value, and overhead, so + // conservatively 125k. + state int blockSize = + BUGGIFY ? 
deterministicRandom()->randomInt(125e3, 4e6) : CLIENT_KNOBS->BACKUP_LOGFILE_BLOCK_SIZE; + state Reference outFile = wait(bc->writeLogFile(beginVersion, endVersion, blockSize)); + state LogFileWriter logFile(outFile, blockSize); + + state PromiseStream results; + state std::vector> rc; + + for (auto& range : ranges) { + rc.push_back(readCommitted(cx, results, lock, range, false, true, true)); + } + + state Future sendEOS = map(errorOr(waitForAll(rc)), [=](ErrorOr const& result) { + if (result.isError()) + results.sendError(result.getError()); + else + results.sendError(end_of_stream()); + return Void(); + }); + + state Version lastVersion; try { - // Read header, currently only decoding version 2001 - if(reader.consume() != 2001) - throw restore_unsupported_file_version(); + loop { + state RangeResultWithVersion r = waitNext(results.getFuture()); + lock->release(r.first.expectedSize()); - // Read k/v pairs. Block ends either at end of last value exactly or with 0xFF as first key len byte. - while(1) { - // If eof reached or first key len bytes is 0xFF then end of block was reached. - if(reader.eof() || *reader.rptr == 0xFF) + state int i = 0; + for (; i < r.first.size(); ++i) { + // Remove the backupLogPrefix + UID bytes from the key + wait(logFile.writeKV(r.first[i].key.substr(backupLogPrefixBytes + 16), r.first[i].value)); + lastVersion = r.second; + } + } + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) + throw; + + if (e.code() != error_code_end_of_stream) { + state Error err = e; + wait(config.logError(cx, err, format("Failed to write to file `%s'", outFile->getFileName().c_str()))); + throw err; + } + } + + // Make sure this task is still alive, if it's not then the data read above could be incomplete. + wait(taskBucket->keepRunning(cx, task)); + + wait(outFile->finish()); + + TraceEvent("FileBackupWroteLogFile") + .suppressFor(60) + .detail("BackupUID", config.getUid()) + .detail("Size", outFile->size()) + .detail("BeginVersion", beginVersion) + .detail("EndVersion", endVersion) + .detail("LastReadVersion", latestVersion); + + Params.fileSize().set(task, outFile->size()); + + return Void(); + } + + ACTOR static Future addTask(Reference tr, + Reference taskBucket, + Reference parentTask, + int priority, + Version beginVersion, + Version endVersion, + TaskCompletionKey completionKey, + Reference waitFor = Reference()) { + Key key = wait(addBackupTask( + BackupLogRangeTaskFunc::name, + BackupLogRangeTaskFunc::version, + tr, + taskBucket, + completionKey, + BackupConfig(parentTask), + waitFor, + [=](Reference task) { + Params.beginVersion().set(task, beginVersion); + Params.endVersion().set(task, endVersion); + Params.addBackupLogRangeTasks().set(task, false); + }, + priority)); + return key; + } + + ACTOR static Future startBackupLogRangeInternal(Reference tr, + Reference taskBucket, + Reference futureBucket, + Reference task, + Reference taskFuture, + Version beginVersion, + Version endVersion) { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + std::vector> addTaskVector; + int tasks = 0; + for (int64_t vblock = beginVersion / CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE; + vblock < (endVersion + CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE - 1) / CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE; + vblock += CLIENT_KNOBS->BACKUP_MAX_LOG_RANGES) { + Version bv = std::max(beginVersion, vblock * CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE); + + if (tasks >= CLIENT_KNOBS->BACKUP_SHARD_TASK_LIMIT) { + addTaskVector.push_back(addTask(tr, + 
taskBucket, + task, + task->getPriority(), + bv, + endVersion, + TaskCompletionKey::joinWith(taskFuture))); + break; + } + + Version ev = std::min(endVersion, + (vblock + CLIENT_KNOBS->BACKUP_MAX_LOG_RANGES) * CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE); + addTaskVector.push_back( + addTask(tr, taskBucket, task, task->getPriority(), bv, ev, TaskCompletionKey::joinWith(taskFuture))); + tasks++; + } + + wait(waitForAll(addTaskVector)); + + return Void(); + } + + ACTOR static Future _finish(Reference tr, + Reference taskBucket, + Reference futureBucket, + Reference task) { + state Version beginVersion = Params.beginVersion().get(task); + state Version endVersion = Params.endVersion().get(task); + state Reference taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]); + state BackupConfig config(task); + + if (Params.fileSize().exists(task)) { + config.logBytesWritten().atomicOp(tr, Params.fileSize().get(task), MutationRef::AddValue); + } + + if (Params.addBackupLogRangeTasks().get(task)) { + wait(startBackupLogRangeInternal(tr, taskBucket, futureBucket, task, taskFuture, beginVersion, endVersion)); + endVersion = beginVersion; + } else { + wait(taskFuture->set(tr, taskBucket)); + } + + wait(taskBucket->finish(tr, task)); + return Void(); + } +}; + +StringRef BackupLogRangeTaskFunc::name = LiteralStringRef("file_backup_write_logs_5.2"); +const uint32_t BackupLogRangeTaskFunc::version = 1; +REGISTER_TASKFUNC(BackupLogRangeTaskFunc); + +// This task stopped being used in 6.2, however the code remains here to handle upgrades. +struct EraseLogRangeTaskFunc : BackupTaskFuncBase { + static StringRef name; + static const uint32_t version; + StringRef getName() const { return name; }; + + static struct { + static TaskParam beginVersion() { return LiteralStringRef(__FUNCTION__); } + static TaskParam endVersion() { return LiteralStringRef(__FUNCTION__); } + static TaskParam destUidValue() { return LiteralStringRef(__FUNCTION__); } + } Params; + + ACTOR static Future addTask(Reference tr, + Reference taskBucket, + UID logUid, + TaskCompletionKey completionKey, + Key destUidValue, + Version endVersion = 0, + Reference waitFor = Reference()) { + Key key = wait(addBackupTask( + EraseLogRangeTaskFunc::name, + EraseLogRangeTaskFunc::version, + tr, + taskBucket, + completionKey, + BackupConfig(logUid), + waitFor, + [=](Reference task) { + Params.beginVersion().set(task, 1); // FIXME: remove in 6.X, only needed for 5.2 backward compatibility + Params.endVersion().set(task, endVersion); + Params.destUidValue().set(task, destUidValue); + }, + 0, + false)); + + return key; + } + + ACTOR static Future _finish(Reference tr, + Reference taskBucket, + Reference futureBucket, + Reference task) { + state Reference taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]); + + wait(checkTaskVersion(tr->getDatabase(), task, EraseLogRangeTaskFunc::name, EraseLogRangeTaskFunc::version)); + + state Version endVersion = Params.endVersion().get(task); + state Key destUidValue = Params.destUidValue().get(task); + + state BackupConfig config(task); + state Key logUidValue = config.getUidAsKey(); + + wait(taskFuture->set(tr, taskBucket) && taskBucket->finish(tr, task) && + eraseLogData( + tr, logUidValue, destUidValue, endVersion != 0 ? 
Optional(endVersion) : Optional())); + + return Void(); + } + + Future execute(Database cx, Reference tb, Reference fb, Reference task) { + return Void(); + }; + Future finish(Reference tr, + Reference tb, + Reference fb, + Reference task) { + return _finish(tr, tb, fb, task); + }; +}; +StringRef EraseLogRangeTaskFunc::name = LiteralStringRef("file_backup_erase_logs_5.2"); +const uint32_t EraseLogRangeTaskFunc::version = 1; +REGISTER_TASKFUNC(EraseLogRangeTaskFunc); + +struct BackupLogsDispatchTask : BackupTaskFuncBase { + static StringRef name; + static const uint32_t version; + + static struct { + static TaskParam prevBeginVersion() { return LiteralStringRef(__FUNCTION__); } + static TaskParam beginVersion() { return LiteralStringRef(__FUNCTION__); } + } Params; + + ACTOR static Future _finish(Reference tr, + Reference taskBucket, + Reference futureBucket, + Reference task) { + wait(checkTaskVersion(tr->getDatabase(), task, BackupLogsDispatchTask::name, BackupLogsDispatchTask::version)); + + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + state Reference onDone = task->getDoneFuture(futureBucket); + state Version prevBeginVersion = Params.prevBeginVersion().get(task); + state Version beginVersion = Params.beginVersion().get(task); + state BackupConfig config(task); + config.latestLogEndVersion().set(tr, beginVersion); + + state bool stopWhenDone; + state Optional restorableVersion; + state EBackupState backupState; + state Optional tag; + state Optional latestSnapshotEndVersion; + + wait(store(stopWhenDone, config.stopWhenDone().getOrThrow(tr)) && + store(restorableVersion, config.getLatestRestorableVersion(tr)) && + store(backupState, config.stateEnum().getOrThrow(tr)) && store(tag, config.tag().get(tr)) && + store(latestSnapshotEndVersion, config.latestSnapshotEndVersion().get(tr))); + + // If restorable, update the last restorable version for this tag + if (restorableVersion.present() && tag.present()) { + FileBackupAgent().setLastRestorable(tr, StringRef(tag.get()), restorableVersion.get()); + } + + // If the backup is restorable but the state is not differential then set state to differential + if (restorableVersion.present() && backupState != BackupAgentBase::STATE_RUNNING_DIFFERENTIAL) + config.stateEnum().set(tr, BackupAgentBase::STATE_RUNNING_DIFFERENTIAL); + + // If stopWhenDone is set and there is a restorable version, set the done future and do not create further + // tasks. + if (stopWhenDone && restorableVersion.present()) { + wait(onDone->set(tr, taskBucket) && taskBucket->finish(tr, task)); + + TraceEvent("FileBackupLogsDispatchDone") + .detail("BackupUID", config.getUid()) + .detail("BeginVersion", beginVersion) + .detail("RestorableVersion", restorableVersion.orDefault(-1)); + + return Void(); + } + + state Version endVersion = std::max(tr->getReadVersion().get() + 1, + beginVersion + (CLIENT_KNOBS->BACKUP_MAX_LOG_RANGES - 1) * + CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE); + + TraceEvent("FileBackupLogDispatch") + .suppressFor(60) + .detail("BeginVersion", beginVersion) + .detail("EndVersion", endVersion) + .detail("RestorableVersion", restorableVersion.orDefault(-1)); + + state Reference logDispatchBatchFuture = futureBucket->future(tr); + + // If a snapshot has ended for this backup then mutations are higher priority to reduce backup lag + state int priority = latestSnapshotEndVersion.present() ? 
1 : 0; + + // Add the initial log range task to read/copy the mutations and the next logs dispatch task which will run + // after this batch is done + wait(success(BackupLogRangeTaskFunc::addTask(tr, + taskBucket, + task, + priority, + beginVersion, + endVersion, + TaskCompletionKey::joinWith(logDispatchBatchFuture)))); + wait(success(BackupLogsDispatchTask::addTask(tr, + taskBucket, + task, + priority, + beginVersion, + endVersion, + TaskCompletionKey::signal(onDone), + logDispatchBatchFuture))); + + // Do not erase at the first time + if (prevBeginVersion > 0) { + state Key destUidValue = wait(config.destUidValue().getOrThrow(tr)); + wait(eraseLogData(tr, config.getUidAsKey(), destUidValue, Optional(beginVersion))); + } + + wait(taskBucket->finish(tr, task)); + + TraceEvent("FileBackupLogsDispatchContinuing") + .suppressFor(60) + .detail("BackupUID", config.getUid()) + .detail("BeginVersion", beginVersion) + .detail("EndVersion", endVersion); + + return Void(); + } + + ACTOR static Future addTask(Reference tr, + Reference taskBucket, + Reference parentTask, + int priority, + Version prevBeginVersion, + Version beginVersion, + TaskCompletionKey completionKey, + Reference waitFor = Reference()) { + Key key = wait(addBackupTask( + BackupLogsDispatchTask::name, + BackupLogsDispatchTask::version, + tr, + taskBucket, + completionKey, + BackupConfig(parentTask), + waitFor, + [=](Reference task) { + Params.prevBeginVersion().set(task, prevBeginVersion); + Params.beginVersion().set(task, beginVersion); + }, + priority)); + return key; + } + + StringRef getName() const { return name; }; + + Future execute(Database cx, Reference tb, Reference fb, Reference task) { + return Void(); + }; + Future finish(Reference tr, + Reference tb, + Reference fb, + Reference task) { + return _finish(tr, tb, fb, task); + }; +}; +StringRef BackupLogsDispatchTask::name = LiteralStringRef("file_backup_dispatch_logs_5.2"); +const uint32_t BackupLogsDispatchTask::version = 1; +REGISTER_TASKFUNC(BackupLogsDispatchTask); + +struct FileBackupFinishedTask : BackupTaskFuncBase { + static StringRef name; + static const uint32_t version; + + StringRef getName() const { return name; }; + + ACTOR static Future _finish(Reference tr, + Reference taskBucket, + Reference futureBucket, + Reference task) { + wait(checkTaskVersion(tr->getDatabase(), task, FileBackupFinishedTask::name, FileBackupFinishedTask::version)); + + state BackupConfig backup(task); + state UID uid = backup.getUid(); + + tr->setOption(FDBTransactionOptions::COMMIT_ON_FIRST_PROXY); + state Key destUidValue = wait(backup.destUidValue().getOrThrow(tr)); + wait(eraseLogData(tr, backup.getUidAsKey(), destUidValue)); + + backup.stateEnum().set(tr, EBackupState::STATE_COMPLETED); + + wait(taskBucket->finish(tr, task)); + + TraceEvent("FileBackupFinished").detail("BackupUID", uid); + + return Void(); + } + + ACTOR static Future addTask(Reference tr, + Reference taskBucket, + Reference parentTask, + TaskCompletionKey completionKey, + Reference waitFor = Reference()) { + Key key = wait(addBackupTask(FileBackupFinishedTask::name, + FileBackupFinishedTask::version, + tr, + taskBucket, + completionKey, + BackupConfig(parentTask), + waitFor)); + return key; + } + + Future execute(Database cx, Reference tb, Reference fb, Reference task) { + return Void(); + }; + Future finish(Reference tr, + Reference tb, + Reference fb, + Reference task) { + return _finish(tr, tb, fb, task); + }; +}; +StringRef FileBackupFinishedTask::name = LiteralStringRef("file_backup_finished_5.2"); +const 
uint32_t FileBackupFinishedTask::version = 1; +REGISTER_TASKFUNC(FileBackupFinishedTask); + +struct BackupSnapshotManifest : BackupTaskFuncBase { + static StringRef name; + static const uint32_t version; + static struct { + static TaskParam endVersion() { return LiteralStringRef(__FUNCTION__); } + } Params; + + ACTOR static Future _execute(Database cx, + Reference taskBucket, + Reference futureBucket, + Reference task) { + state BackupConfig config(task); + state Reference bc; + + state Reference tr(new ReadYourWritesTransaction(cx)); + + // Read the entire range file map into memory, then walk it backwards from its last entry to produce a list of + // non overlapping key range files + state std::map localmap; + state Key startKey; + state int batchSize = BUGGIFY ? 1 : 1000000; + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + wait(taskBucket->keepRunning(tr, task)); + + if (!bc) { + // Backup container must be present if we're still here + wait(store(bc, config.backupContainer().getOrThrow(tr))); + } + + BackupConfig::RangeFileMapT::PairsType rangeresults = + wait(config.snapshotRangeFileMap().getRange(tr, startKey, {}, batchSize)); + + for (auto& p : rangeresults) { + localmap.insert(p); + } + + if (rangeresults.size() < batchSize) break; - // Read key and value. If anything throws then there is a problem. - uint32_t kLen = reader.consumeNetworkUInt32(); - const uint8_t *k = reader.consume(kLen); - uint32_t vLen = reader.consumeNetworkUInt32(); - const uint8_t *v = reader.consume(vLen); - - results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen))); + startKey = keyAfter(rangeresults.back().first); + tr->reset(); + } catch (Error& e) { + wait(tr->onError(e)); } - - // Make sure any remaining bytes in the block are 0xFF - for(auto b : reader.remainder()) - if(b != 0xFF) - throw restore_corrupted_data_padding(); - - return results; - - } catch(Error &e) { - TraceEvent(SevWarn, "FileRestoreCorruptLogFileBlock") - .error(e) - .detail("Filename", file->getFilename()) - .detail("BlockOffset", offset) - .detail("BlockLen", len) - .detail("ErrorRelativeOffset", reader.rptr - buf.begin()) - .detail("ErrorAbsoluteOffset", reader.rptr - buf.begin() + offset); - throw; } - } - ACTOR Future checkTaskVersion(Database cx, Reference task, StringRef name, uint32_t version) { - uint32_t taskVersion = task->getVersion(); - if (taskVersion > version) { - state Error err = task_invalid_version(); + std::vector files; + state Version maxVer = 0; + state Version minVer = std::numeric_limits::max(); + state int64_t totalBytes = 0; - TraceEvent(SevWarn, "BA_BackupRangeTaskFuncExecute").detail("TaskVersion", taskVersion).detail("Name", name).detail("Version", version); - if (KeyBackedConfig::TaskParams.uid().exists(task)) { - std::string msg = format("%s task version `%lu' is greater than supported version `%lu'", task->params[Task::reservedTaskParamKeyType].toString().c_str(), (unsigned long)taskVersion, (unsigned long)version); - wait(BackupConfig(task).logError(cx, err, msg)); + if (!localmap.empty()) { + // Get iterator that points to greatest key, start there. 
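The backwards walk performed below can be sketched with an ordinary std::map (string keys and invented file names, editor's illustration only): the map is keyed by range end, so starting from the greatest end and repeatedly jumping to the last range that ends at or before the current range's begin yields a non-overlapping cover and skips files that overlap ranges already chosen.

// Editor's sketch of the non-overlapping walk; types and names are simplified.
#include <cassert>
#include <iterator>
#include <map>
#include <string>
#include <vector>

struct RangeSlice {
	std::string begin; // the range is [begin, map key)
	std::string fileName;
};

int main() {
	// end key -> slice; note ["c","f") overlaps ["e","h") and must be skipped.
	std::map<std::string, RangeSlice> rangeFileMap{
		{ "c", { "a", "range-1" } },
		{ "f", { "c", "range-2-overlapping" } },
		{ "h", { "e", "range-3" } },
	};

	std::vector<std::string> files;
	auto i = std::prev(rangeFileMap.end()); // start at the greatest end key
	while (true) {
		files.push_back(i->second.fileName);
		// Find the last range whose end is <= this range's begin.
		i = rangeFileMap.upper_bound(i->second.begin);
		if (i == rangeFileMap.begin()) break; // nothing ends at or before 'begin'
		--i;
	}

	// The walk keeps [e,h) and [a,c) and skips the overlapping [c,f).
	assert((files == std::vector<std::string>{ "range-3", "range-1" }));
	return 0;
}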
+ auto ri = localmap.rbegin(); + auto i = (++ri).base(); + + while (1) { + const BackupConfig::RangeSlice& r = i->second; + + // Add file to final file list + files.push_back(r.fileName); + + // Update version range seen + if (r.version < minVer) + minVer = r.version; + if (r.version > maxVer) + maxVer = r.version; + + // Update total bytes counted. + totalBytes += r.fileSize; + + // Jump to file that either ends where this file begins or has the greatest end that is less than + // the begin of this file. In other words find the map key that is <= begin of this file. To do this + // find the first end strictly greater than begin and then back up one. + i = localmap.upper_bound(i->second.begin); + // If we get begin then we're done, there are no more ranges that end at or before the last file's begin + if (i == localmap.begin()) + break; + --i; } - - throw err; } + Params.endVersion().set(task, maxVer); + wait(bc->writeKeyspaceSnapshotFile(files, totalBytes)); + + TraceEvent(SevInfo, "FileBackupWroteSnapshotManifest") + .detail("BackupUID", config.getUid()) + .detail("BeginVersion", minVer) + .detail("EndVersion", maxVer) + .detail("TotalBytes", totalBytes); + return Void(); } - ACTOR static Future abortFiveZeroBackup(FileBackupAgent* backupAgent, Reference tr, std::string tagName) { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); + ACTOR static Future _finish(Reference tr, + Reference taskBucket, + Reference futureBucket, + Reference task) { + wait(checkTaskVersion(tr->getDatabase(), task, BackupSnapshotManifest::name, BackupSnapshotManifest::version)); - state Subspace tagNames = backupAgent->subspace.get(BackupAgentBase::keyTagName); - Optional uidStr = wait(tr->get(tagNames.pack(Key(tagName)))); - if (!uidStr.present()) { - TraceEvent(SevWarn, "FileBackupAbortIncompatibleBackup_TagNotFound").detail("TagName", tagName.c_str()); - return Void(); + state BackupConfig config(task); + + // Set the latest snapshot end version, which was set during the execute phase + config.latestSnapshotEndVersion().set(tr, Params.endVersion().get(task)); + + state bool stopWhenDone; + state EBackupState backupState; + state Optional restorableVersion; + state Optional firstSnapshotEndVersion; + state Optional tag; + + wait(store(stopWhenDone, config.stopWhenDone().getOrThrow(tr)) && + store(backupState, config.stateEnum().getOrThrow(tr)) && + store(restorableVersion, config.getLatestRestorableVersion(tr)) && + store(firstSnapshotEndVersion, config.firstSnapshotEndVersion().get(tr)) && + store(tag, config.tag().get(tr))); + + // If restorable, update the last restorable version for this tag + if (restorableVersion.present() && tag.present()) { + FileBackupAgent().setLastRestorable(tr, StringRef(tag.get()), restorableVersion.get()); } - state UID uid = BinaryReader::fromStringRef(uidStr.get(), Unversioned()); - state Subspace statusSpace = backupAgent->subspace.get(BackupAgentBase::keyStates).get(uid.toString()); - state Subspace globalConfig = backupAgent->subspace.get(BackupAgentBase::keyConfig).get(uid.toString()); - state Subspace newConfigSpace = uidPrefixKey(LiteralStringRef("uid->config/").withPrefix(fileBackupPrefixRange.begin), uid); + if (!firstSnapshotEndVersion.present()) { + config.firstSnapshotEndVersion().set(tr, Params.endVersion().get(task)); + } - Optional statusStr = wait(tr->get(statusSpace.pack(FileBackupAgent::keyStateStatus))); - state EBackupState status = !statusStr.present() ? 
FileBackupAgent::STATE_NEVERRAN : BackupAgentBase::getState(statusStr.get().toString()); + // If the backup is restorable and the state isn't differential the set state to differential + if (restorableVersion.present() && backupState != BackupAgentBase::STATE_RUNNING_DIFFERENTIAL) + config.stateEnum().set(tr, BackupAgentBase::STATE_RUNNING_DIFFERENTIAL); - TraceEvent(SevInfo, "FileBackupAbortIncompatibleBackup") - .detail("TagName", tagName.c_str()) - .detail("Status", BackupAgentBase::getStateText(status)); - - // Clear the folder id to prevent future tasks from executing at all - tr->clear(singleKeyRange(StringRef(globalConfig.pack(FileBackupAgent::keyFolderId)))); - - // Clear the mutations logging config and data - Key configPath = uidPrefixKey(logRangesRange.begin, uid); - Key logsPath = uidPrefixKey(backupLogKeys.begin, uid); - tr->clear(KeyRangeRef(configPath, strinc(configPath))); - tr->clear(KeyRangeRef(logsPath, strinc(logsPath))); - - // Clear the new-style config space - tr->clear(newConfigSpace.range()); - - Key statusKey = StringRef(statusSpace.pack(FileBackupAgent::keyStateStatus)); - - // Set old style state key to Aborted if it was Runnable - if(backupAgent->isRunnable(status)) - tr->set(statusKey, StringRef(FileBackupAgent::getStateText(BackupAgentBase::STATE_ABORTED))); + // Unless we are to stop, start the next snapshot using the default interval + Reference snapshotDoneFuture = task->getDoneFuture(futureBucket); + if (!stopWhenDone) { + wait(config.initNewSnapshot(tr) && + success(BackupSnapshotDispatchTask::addTask( + tr, taskBucket, task, 1, TaskCompletionKey::signal(snapshotDoneFuture)))); + } else { + // Set the done future as the snapshot is now complete. + wait(snapshotDoneFuture->set(tr, taskBucket)); + } + wait(taskBucket->finish(tr, task)); return Void(); } - struct AbortFiveZeroBackupTask : TaskFuncBase { - static StringRef name; - ACTOR static Future _finish(Reference tr, Reference taskBucket, Reference futureBucket, Reference task) { - state FileBackupAgent backupAgent; - state std::string tagName = task->params[BackupAgentBase::keyConfigBackupTag].toString(); + ACTOR static Future addTask(Reference tr, + Reference taskBucket, + Reference parentTask, + TaskCompletionKey completionKey, + Reference waitFor = Reference()) { + Key key = wait(addBackupTask(BackupSnapshotManifest::name, + BackupSnapshotManifest::version, + tr, + taskBucket, + completionKey, + BackupConfig(parentTask), + waitFor, + NOP_SETUP_TASK_FN, + 1)); + return key; + } - TEST(true); // Canceling old backup task + StringRef getName() const { return name; }; - TraceEvent(SevInfo, "FileBackupCancelOldTask") - .detail("Task", task->params[Task::reservedTaskParamKeyType]) - .detail("TagName", tagName); - wait(abortFiveZeroBackup(&backupAgent, tr, tagName)); - - wait(taskBucket->finish(tr, task)); - return Void(); - } - - virtual StringRef getName() const { - TraceEvent(SevError, "FileBackupError").detail("Cause", "AbortFiveZeroBackupTaskFunc::name() should never be called"); - ASSERT(false); - return StringRef(); - } - - Future execute(Database cx, Reference tb, Reference fb, Reference task) { return Future(Void()); }; - Future finish(Reference tr, Reference tb, Reference fb, Reference task) { return _finish(tr, tb, fb, task); }; + Future execute(Database cx, Reference tb, Reference fb, Reference task) { + return _execute(cx, tb, fb, task); }; - StringRef AbortFiveZeroBackupTask::name = LiteralStringRef("abort_legacy_backup"); - REGISTER_TASKFUNC(AbortFiveZeroBackupTask); - 
REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_diff_logs); - REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_log_range); - REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_logs); - REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_range); - REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_restorable); - REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_finish_full_backup); - REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_finished_full_backup); - REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_start_full_backup); + Future finish(Reference tr, + Reference tb, + Reference fb, + Reference task) { + return _finish(tr, tb, fb, task); + }; +}; +StringRef BackupSnapshotManifest::name = LiteralStringRef("file_backup_write_snapshot_manifest_5.2"); +const uint32_t BackupSnapshotManifest::version = 1; +REGISTER_TASKFUNC(BackupSnapshotManifest); - ACTOR static Future abortFiveOneBackup(FileBackupAgent* backupAgent, Reference tr, std::string tagName) { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); +Future BackupSnapshotDispatchTask::addSnapshotManifestTask(Reference tr, + Reference taskBucket, + Reference parentTask, + TaskCompletionKey completionKey, + Reference waitFor) { + return BackupSnapshotManifest::addTask(tr, taskBucket, parentTask, completionKey, waitFor); +} - state KeyBackedTag tag = makeBackupTag(tagName); - state UidAndAbortedFlagT current = wait(tag.getOrThrow(tr, false, backup_unneeded())); +struct StartFullBackupTaskFunc : BackupTaskFuncBase { + static StringRef name; + static const uint32_t version; - state BackupConfig config(current.first); - EBackupState status = wait(config.stateEnum().getD(tr, false, EBackupState::STATE_NEVERRAN)); + static struct { + static TaskParam beginVersion() { return LiteralStringRef(__FUNCTION__); } + } Params; - if (!backupAgent->isRunnable((BackupAgentBase::enumState)status)) { - throw backup_unneeded(); + ACTOR static Future _execute(Database cx, + Reference taskBucket, + Reference futureBucket, + Reference task) { + wait(checkTaskVersion(cx, task, StartFullBackupTaskFunc::name, StartFullBackupTaskFunc::version)); + + loop { + state Reference tr(new ReadYourWritesTransaction(cx)); + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + Version startVersion = wait(tr->getReadVersion()); + + Params.beginVersion().set(task, startVersion); + break; + } catch (Error& e) { + wait(tr->onError(e)); + } } - TraceEvent(SevInfo, "FBA_AbortFileOneBackup") - .detail("TagName", tagName.c_str()) - .detail("Status", BackupAgentBase::getStateText(status)); - - // Cancel backup task through tag - wait(tag.cancel(tr)); - - Key configPath = uidPrefixKey(logRangesRange.begin, config.getUid()); - Key logsPath = uidPrefixKey(backupLogKeys.begin, config.getUid()); - - tr->clear(KeyRangeRef(configPath, strinc(configPath))); - tr->clear(KeyRangeRef(logsPath, strinc(logsPath))); - - config.stateEnum().set(tr, EBackupState::STATE_ABORTED); - return Void(); } - struct AbortFiveOneBackupTask : TaskFuncBase { - static StringRef name; - ACTOR static Future _finish(Reference tr, Reference taskBucket, Reference futureBucket, Reference task) { - state FileBackupAgent backupAgent; - state BackupConfig config(task); - state std::string tagName = wait(config.tag().getOrThrow(tr)); + ACTOR static Future _finish(Reference tr, + Reference taskBucket, + Reference futureBucket, + 
Reference task) { + state BackupConfig config(task); + state Version beginVersion = Params.beginVersion().get(task); - TEST(true); // Canceling 5.1 backup task + state Future> backupRangesFuture = config.backupRanges().getOrThrow(tr); + state Future destUidValueFuture = config.destUidValue().getOrThrow(tr); + wait(success(backupRangesFuture) && success(destUidValueFuture)); + std::vector backupRanges = backupRangesFuture.get(); + Key destUidValue = destUidValueFuture.get(); - TraceEvent(SevInfo, "FileBackupCancelFiveOneTask") - .detail("Task", task->params[Task::reservedTaskParamKeyType]) - .detail("TagName", tagName); - wait(abortFiveOneBackup(&backupAgent, tr, tagName)); - - wait(taskBucket->finish(tr, task)); - return Void(); + // Start logging the mutations for the specified ranges of the tag + for (auto& backupRange : backupRanges) { + config.startMutationLogs(tr, backupRange, destUidValue); } - virtual StringRef getName() const { - TraceEvent(SevError, "FileBackupError").detail("Cause", "AbortFiveOneBackupTaskFunc::name() should never be called"); - ASSERT(false); - return StringRef(); - } + config.stateEnum().set(tr, EBackupState::STATE_RUNNING); - Future execute(Database cx, Reference tb, Reference fb, Reference task) { return Future(Void()); }; - Future finish(Reference tr, Reference tb, Reference fb, Reference task) { return _finish(tr, tb, fb, task); }; + state Reference backupFinished = futureBucket->future(tr); + + // Initialize the initial snapshot and create tasks to continually write logs and snapshots + // The initial snapshot has a desired duration of 0, meaning go as fast as possible. + wait(config.initNewSnapshot(tr, 0)); + + // Using priority 1 for both of these to at least start both tasks soon + wait(success( + BackupSnapshotDispatchTask::addTask(tr, taskBucket, task, 1, TaskCompletionKey::joinWith(backupFinished)))); + wait(success(BackupLogsDispatchTask::addTask( + tr, taskBucket, task, 1, 0, beginVersion, TaskCompletionKey::joinWith(backupFinished)))); + + // If a clean stop is requested, the log and snapshot tasks will quit after the backup is restorable, then the + // following task will clean up and set the completed state. 
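The gating described here can be pictured with a small in-memory stand-in for the completion future (the real TaskFuture is persisted in the database by TaskBucket; everything below is an editor's sketch with invented names): the snapshot chain and the log chain each join the backupFinished future, and the cleanup task waiting on it only becomes runnable once both have finished.

// Editor's simplified model of the join/signal pattern; not the TaskBucket implementation.
#include <cassert>
#include <functional>
#include <vector>

struct SimpleTaskFuture {
	int outstanding = 0;                      // participants joined but not yet done
	bool isSet = false;
	std::vector<std::function<void()>> onSet; // tasks waiting on this future

	void joinWith() { ++outstanding; }        // a chain of work joins the future
	void participantDone() {                  // one joined chain has finished
		if (--outstanding == 0) set();
	}
	void set() {
		isSet = true;
		for (auto& fn : onSet) fn();          // waiting tasks become runnable
	}
};

int main() {
	SimpleTaskFuture backupFinished;
	bool finishedTaskRunnable = false;
	backupFinished.onSet.push_back([&] { finishedTaskRunnable = true; }); // cleanup task

	backupFinished.joinWith(); // snapshot dispatch/manifest chain
	backupFinished.joinWith(); // logs dispatch chain

	backupFinished.participantDone(); // snapshot chain quits once the backup is restorable
	assert(!finishedTaskRunnable);    // still waiting on the log chain

	backupFinished.participantDone(); // log chain quits
	assert(finishedTaskRunnable && backupFinished.isSet);
	return 0;
}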
+ wait(success( + FileBackupFinishedTask::addTask(tr, taskBucket, task, TaskCompletionKey::noSignal(), backupFinished))); + + wait(taskBucket->finish(tr, task)); + return Void(); + } + + ACTOR static Future addTask(Reference tr, + Reference taskBucket, + UID uid, + TaskCompletionKey completionKey, + Reference waitFor = Reference()) { + Key key = wait(addBackupTask(StartFullBackupTaskFunc::name, + StartFullBackupTaskFunc::version, + tr, + taskBucket, + completionKey, + BackupConfig(uid), + waitFor)); + return key; + } + + StringRef getName() const { return name; }; + + Future execute(Database cx, Reference tb, Reference fb, Reference task) { + return _execute(cx, tb, fb, task); }; - StringRef AbortFiveOneBackupTask::name = LiteralStringRef("abort_legacy_backup_5.2"); - REGISTER_TASKFUNC(AbortFiveOneBackupTask); - REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_write_range); - REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_dispatch_ranges); - REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_write_logs); - REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_erase_logs); - REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_dispatch_logs); - REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_finished); - REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_write_snapshot_manifest); - REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_start); + Future finish(Reference tr, + Reference tb, + Reference fb, + Reference task) { + return _finish(tr, tb, fb, task); + }; +}; +StringRef StartFullBackupTaskFunc::name = LiteralStringRef("file_backup_start_5.2"); +const uint32_t StartFullBackupTaskFunc::version = 1; +REGISTER_TASKFUNC(StartFullBackupTaskFunc); - std::function)> NOP_SETUP_TASK_FN = [](Reference task) { /* NOP */ }; - ACTOR static Future addBackupTask(StringRef name, - uint32_t version, - Reference tr, - Reference taskBucket, - TaskCompletionKey completionKey, - BackupConfig config, - Reference waitFor = Reference(), - std::function)> setupTaskFn = NOP_SETUP_TASK_FN, - int priority = 0, - bool setValidation = true) { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); +struct RestoreCompleteTaskFunc : RestoreTaskFuncBase { + ACTOR static Future _finish(Reference tr, + Reference taskBucket, + Reference futureBucket, + Reference task) { + wait(checkTaskVersion(tr->getDatabase(), task, name, version)); + state RestoreConfig restore(task); + restore.stateEnum().set(tr, ERestoreState::COMPLETED); + tr->atomicOp(metadataVersionKey, metadataVersionRequiredValue, MutationRef::SetVersionstampedValue); + // Clear the file map now since it could be huge. + restore.fileSet().clear(tr); + + // TODO: Validate that the range version map has exactly the restored ranges in it. This means that for any + // restore operation the ranges to restore must be within the backed up ranges, otherwise from the restore + // perspective it will appear that some key ranges were missing and so the backup set is incomplete and the + // restore has failed. This validation cannot be done currently because Restore only supports a single restore + // range but backups can have many ranges. + + // Clear the applyMutations stuff, including any unapplied mutations from versions beyond the restored version. 
+ restore.clearApplyMutationsKeys(tr); + + wait(taskBucket->finish(tr, task)); + wait(unlockDatabase(tr, restore.getUid())); + + return Void(); + } + + ACTOR static Future addTask(Reference tr, + Reference taskBucket, + Reference parentTask, + TaskCompletionKey completionKey, + Reference waitFor = Reference()) { Key doneKey = wait(completionKey.get(tr, taskBucket)); - state Reference task(new Task(name, version, doneKey, priority)); + state Reference task(new Task(RestoreCompleteTaskFunc::name, RestoreCompleteTaskFunc::version, doneKey)); - // Bind backup config to new task - wait(config.toTask(tr, task, setValidation)); - - // Set task specific params - setupTaskFn(task); + // Get restore config from parent task and bind it to new task + wait(RestoreConfig(parentTask).toTask(tr, task)); if (!waitFor) { return taskBucket->addTask(tr, task); } - wait(waitFor->onSetAddTask(tr, taskBucket, task)); + wait(waitFor->onSetAddTask(tr, taskBucket, task)); return LiteralStringRef("OnSetAddTask"); } - // Backup and Restore taskFunc definitions will inherit from one of the following classes which - // servers to catch and log to the appropriate config any error that execute/finish didn't catch and log. - struct RestoreTaskFuncBase : TaskFuncBase { - virtual Future handleError(Database cx, Reference task, Error const &error) { - return RestoreConfig(task).logError(cx, error, format("'%s' on '%s'", error.what(), task->params[Task::reservedTaskParamKeyType].printable().c_str())); - } - virtual std::string toString(Reference task) - { - return ""; - } + static StringRef name; + static const uint32_t version; + StringRef getName() const { return name; }; + + Future execute(Database cx, Reference tb, Reference fb, Reference task) { + return Void(); }; - - struct BackupTaskFuncBase : TaskFuncBase { - virtual Future handleError(Database cx, Reference task, Error const &error) { - return BackupConfig(task).logError(cx, error, format("'%s' on '%s'", error.what(), task->params[Task::reservedTaskParamKeyType].printable().c_str())); - } - virtual std::string toString(Reference task) - { - return ""; - } + Future finish(Reference tr, + Reference tb, + Reference fb, + Reference task) { + return _finish(tr, tb, fb, task); }; +}; +StringRef RestoreCompleteTaskFunc::name = LiteralStringRef("restore_complete"); +const uint32_t RestoreCompleteTaskFunc::version = 1; +REGISTER_TASKFUNC(RestoreCompleteTaskFunc); - ACTOR static Future>> getBlockOfShards(Reference tr, Key beginKey, Key endKey, int limit) { +struct RestoreFileTaskFuncBase : RestoreTaskFuncBase { + struct InputParams { + static TaskParam inputFile() { return LiteralStringRef(__FUNCTION__); } + static TaskParam readOffset() { return LiteralStringRef(__FUNCTION__); } + static TaskParam readLen() { return LiteralStringRef(__FUNCTION__); } + } Params; - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - state Standalone> results; - Standalone values = wait(tr->getRange(KeyRangeRef(keyAfter(beginKey.withPrefix(keyServersPrefix)), endKey.withPrefix(keyServersPrefix)), limit)); + std::string toString(Reference task) { + return format("fileName '%s' readLen %lld readOffset %lld", + Params.inputFile().get(task).fileName.c_str(), + Params.readLen().get(task), + Params.readOffset().get(task)); + } +}; - for (auto &s : values) { - KeyRef k = s.key.removePrefix(keyServersPrefix); - results.push_back_deep(results.arena(), k); +struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase { + static struct : InputParams { + 
// The range of data that the (possibly empty) data represented, which is set if it intersects the target + // restore range + static TaskParam originalFileRange() { return LiteralStringRef(__FUNCTION__); } + static TaskParam> originalFileRanges() { return LiteralStringRef(__FUNCTION__); } + + static std::vector getOriginalFileRanges(Reference task) { + if (originalFileRanges().exists(task)) { + return Params.originalFileRanges().get(task); + } else { + std::vector range; + if (originalFileRange().exists(task)) + range.push_back(Params.originalFileRange().get(task)); + return range; + } } + } Params; - return results; + std::string toString(Reference task) { + std::string returnStr = RestoreFileTaskFuncBase::toString(task); + for (auto& range : Params.getOriginalFileRanges(task)) + returnStr += format(" originalFileRange '%s'", printable(range).c_str()); + return returnStr; } - struct BackupRangeTaskFunc : BackupTaskFuncBase { - static StringRef name; - static const uint32_t version; - - static struct { - static TaskParam beginKey() { - return LiteralStringRef(__FUNCTION__); - } - static TaskParam endKey() { - return LiteralStringRef(__FUNCTION__); - } - static TaskParam addBackupRangeTasks() { - return LiteralStringRef(__FUNCTION__); - } - } Params; - - std::string toString(Reference task) { - return format("beginKey '%s' endKey '%s' addTasks %d", - Params.beginKey().get(task).printable().c_str(), - Params.endKey().get(task).printable().c_str(), - Params.addBackupRangeTasks().get(task) - ); - } - - StringRef getName() const { return name; }; - - Future execute(Database cx, Reference tb, Reference fb, Reference task) { return _execute(cx, tb, fb, task); }; - Future finish(Reference tr, Reference tb, Reference fb, Reference task) { return _finish(tr, tb, fb, task); }; - - // Finish (which flushes/syncs) the file, and then in a single transaction, make some range backup progress durable. - // This means: - // - increment the backup config's range bytes written - // - update the range file map - // - update the task begin key - // - save/extend the task with the new params - // Returns whether or not the caller should continue executing the task. - ACTOR static Future finishRangeFile(Reference file, Database cx, Reference task, Reference taskBucket, KeyRange range, Version version) { - wait(file->finish()); - - // Ignore empty ranges. - if(range.empty()) - return false; - - state Reference tr(new ReadYourWritesTransaction(cx)); - state BackupConfig backup(task); - state bool usedFile = false; - - // Avoid unnecessary conflict by prevent taskbucket's automatic timeout extension - // because the following transaction loop extends and updates the task. - wait(task->extendMutex.take()); - state FlowLock::Releaser releaser(task->extendMutex, 1); - - loop { - try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - - // Update the start key of the task so if this transaction completes but the task then fails - // when it is restarted it will continue where this execution left off. 
- Params.beginKey().set(task, range.end); - - // Save and extend the task with the new begin parameter - state Version newTimeout = wait(taskBucket->extendTimeout(tr, task, true)); - - // Update the range bytes written in the backup config - backup.rangeBytesWritten().atomicOp(tr, file->size(), MutationRef::AddValue); - backup.snapshotRangeFileCount().atomicOp(tr, 1, MutationRef::AddValue); - - // See if there is already a file for this key which has an earlier begin, update the map if not. - Optional s = wait(backup.snapshotRangeFileMap().get(tr, range.end)); - if(!s.present() || s.get().begin >= range.begin) { - backup.snapshotRangeFileMap().set(tr, range.end, {range.begin, version, file->getFileName(), file->size()}); - usedFile = true; - } - - wait(tr->commit()); - task->timeoutVersion = newTimeout; - break; - } catch(Error &e) { - wait(tr->onError(e)); - } - } - - return usedFile; - } - - ACTOR static Future addTask(Reference tr, Reference taskBucket, Reference parentTask, int priority, Key begin, Key end, TaskCompletionKey completionKey, Reference waitFor = Reference(), Version scheduledVersion = invalidVersion) { - Key key = wait(addBackupTask(BackupRangeTaskFunc::name, - BackupRangeTaskFunc::version, - tr, taskBucket, completionKey, - BackupConfig(parentTask), - waitFor, - [=](Reference task) { - Params.beginKey().set(task, begin); - Params.endKey().set(task, end); - Params.addBackupRangeTasks().set(task, false); - if(scheduledVersion != invalidVersion) - ReservedTaskParams::scheduledVersion().set(task, scheduledVersion); - }, - priority)); - return key; - } - - ACTOR static Future _execute(Database cx, Reference taskBucket, Reference futureBucket, Reference task) { - state Reference lock(new FlowLock(CLIENT_KNOBS->BACKUP_LOCK_BYTES)); - - wait(checkTaskVersion(cx, task, BackupRangeTaskFunc::name, BackupRangeTaskFunc::version)); - - state Key beginKey = Params.beginKey().get(task); - state Key endKey = Params.endKey().get(task); - - TraceEvent("FileBackupRangeStart") - .suppressFor(60) - .detail("BackupUID", BackupConfig(task).getUid()) - .detail("BeginKey", Params.beginKey().get(task).printable()) - .detail("EndKey", Params.endKey().get(task).printable()) - .detail("TaskKey", task->key.printable()); - - // When a key range task saves the last chunk of progress and then the executor dies, when the task continues - // its beginKey and endKey will be equal but there is no work to be done. - if(beginKey == endKey) - return Void(); - - // Find out if there is a shard boundary in(beginKey, endKey) - Standalone> keys = wait(runRYWTransaction(cx, [=](Reference tr){ return getBlockOfShards(tr, beginKey, endKey, 1); })); - if (keys.size() > 0) { - Params.addBackupRangeTasks().set(task, true); - return Void(); - } - - // Read everything from beginKey to endKey, write it to an output file, run the output file processor, and - // then set on_done. If we are still writing after X seconds, end the output file and insert a new backup_range - // task for the remainder. 
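
// A minimal, self-contained sketch of the control flow described in the comment
// above and implemented by the removed _execute() body below: consume a stream of
// (read version, key/value) results and roll over to a new range file whenever the
// read version changes or the stream ends. Types here are stand-ins (std::string
// for Key, printf instead of writeRangeFile()/finishRangeFile()).
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

int main() {
    const long long invalidVersion = -1;
    // Stand-in for the readCommitted() PromiseStream: (read version, key) pairs.
    std::vector<std::pair<long long, std::string>> results = {
        { 100, "a" }, { 100, "b" }, { 120, "c" }, { 120, "d" }
    };

    long long outVersion = invalidVersion; // version of the file being written
    bool fileOpen = false;
    for (const auto& r : results) {
        if (r.first != outVersion) {
            if (fileOpen)
                printf("finish range file @%lld\n", outVersion); // finishRangeFile()
            outVersion = r.first;
            fileOpen = true;
            printf("open range file @%lld\n", outVersion); // bc->writeRangeFile()
        }
        printf("  writeKV %s\n", r.second.c_str()); // rangeFile.writeKV()
    }
    if (fileOpen)
        printf("finish range file @%lld\n", outVersion); // end-of-stream case
}
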
- state Reference outFile; - state Version outVersion = invalidVersion; - state Key lastKey; - - // retrieve kvData - state PromiseStream results; - - state Future rc = readCommitted(cx, results, lock, KeyRangeRef(beginKey, endKey), true, true, true); - state RangeFileWriter rangeFile; - state BackupConfig backup(task); - - // Don't need to check keepRunning(task) here because we will do that while finishing each output file, but if bc - // is false then clearly the backup is no longer in progress - state Reference bc = wait(backup.backupContainer().getD(cx)); - if(!bc) { - return Void(); - } - - state bool done = false; - state int64_t nrKeys = 0; - - loop{ - state RangeResultWithVersion values; - try { - RangeResultWithVersion _values = waitNext(results.getFuture()); - values = _values; - lock->release(values.first.expectedSize()); - } catch(Error &e) { - if(e.code() == error_code_end_of_stream) - done = true; - else - throw; - } - - // If we've seen a new read version OR hit the end of the stream, then if we were writing a file finish it. - if (values.second != outVersion || done) { - if (outFile){ - TEST(outVersion != invalidVersion); // Backup range task wrote multiple versions - state Key nextKey = done ? endKey : keyAfter(lastKey); - wait(rangeFile.writeKey(nextKey)); - - if(BUGGIFY) { - wait(rangeFile.padEnd()); - } - - bool usedFile = wait(finishRangeFile(outFile, cx, task, taskBucket, KeyRangeRef(beginKey, nextKey), outVersion)); - TraceEvent("FileBackupWroteRangeFile") - .suppressFor(60) - .detail("BackupUID", backup.getUid()) - .detail("Size", outFile->size()) - .detail("Keys", nrKeys) - .detail("ReadVersion", outVersion) - .detail("BeginKey", beginKey.printable()) - .detail("EndKey", nextKey.printable()) - .detail("AddedFileToMap", usedFile); - - nrKeys = 0; - beginKey = nextKey; - } - - if(done) - return Void(); - - // Start writing a new file after verifying this task should keep running as of a new read version (which must be >= outVersion) - outVersion = values.second; - // block size must be at least large enough for 3 max size keys and 2 max size values + overhead so 250k conservatively. - state int blockSize = BUGGIFY ? 
deterministicRandom()->randomInt(250e3, 4e6) : CLIENT_KNOBS->BACKUP_RANGEFILE_BLOCK_SIZE; - state Version snapshotBeginVersion; - state int64_t snapshotRangeFileCount; - - state Reference tr(new ReadYourWritesTransaction(cx)); - loop { - try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - - wait(taskBucket->keepRunning(tr, task) - && storeOrThrow(snapshotBeginVersion, backup.snapshotBeginVersion().get(tr)) - && store(snapshotRangeFileCount, backup.snapshotRangeFileCount().getD(tr)) - ); - - break; - } catch(Error &e) { - wait(tr->onError(e)); - } - } - - Reference f = wait(bc->writeRangeFile(snapshotBeginVersion, snapshotRangeFileCount, outVersion, blockSize)); - outFile = f; - - // Initialize range file writer and write begin key - rangeFile = RangeFileWriter(outFile, blockSize); - wait(rangeFile.writeKey(beginKey)); - } - - // write kvData to file, update lastKey and key count - if(values.first.size() != 0) { - state size_t i = 0; - for (; i < values.first.size(); ++i) { - wait(rangeFile.writeKV(values.first[i].key, values.first[i].value)); - } - lastKey = values.first.back().key; - nrKeys += values.first.size(); - } - } - } - - ACTOR static Future startBackupRangeInternal(Reference tr, Reference taskBucket, Reference futureBucket, Reference task, Reference onDone) { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - state Key nextKey = Params.beginKey().get(task); - state Key endKey = Params.endKey().get(task); - - state Standalone> keys = wait(getBlockOfShards(tr, nextKey, endKey, CLIENT_KNOBS->BACKUP_SHARD_TASK_LIMIT)); - - std::vector> addTaskVector; - for (int idx = 0; idx < keys.size(); ++idx) { - if (nextKey != keys[idx]) { - addTaskVector.push_back(addTask(tr, taskBucket, task, task->getPriority(), nextKey, keys[idx], TaskCompletionKey::joinWith(onDone))); - TraceEvent("FileBackupRangeSplit") - .suppressFor(60) - .detail("BackupUID", BackupConfig(task).getUid()) - .detail("BeginKey", Params.beginKey().get(task).printable()) - .detail("EndKey", Params.endKey().get(task).printable()) - .detail("SliceBeginKey", nextKey.printable()) - .detail("SliceEndKey", keys[idx].printable()); - } - nextKey = keys[idx]; - } - - wait(waitForAll(addTaskVector)); - - if (nextKey != endKey) { - // Add task to cover nextKey to the end, using the priority of the current task - wait(success(addTask(tr, taskBucket, task, task->getPriority(), nextKey, endKey, TaskCompletionKey::joinWith(onDone), Reference(), task->getPriority()))); - } - - return Void(); - } - - ACTOR static Future _finish(Reference tr, Reference taskBucket, Reference futureBucket, Reference task) { - state Reference taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]); - - if (Params.addBackupRangeTasks().get(task)) { - wait(startBackupRangeInternal(tr, taskBucket, futureBucket, task, taskFuture)); - } - else { - wait(taskFuture->set(tr, taskBucket)); - } - - wait(taskBucket->finish(tr, task)); - - TraceEvent("FileBackupRangeFinish") - .suppressFor(60) - .detail("BackupUID", BackupConfig(task).getUid()) - .detail("BeginKey", Params.beginKey().get(task).printable()) - .detail("EndKey", Params.endKey().get(task).printable()) - .detail("TaskKey", task->key.printable()); - - return Void(); - } - - }; - StringRef BackupRangeTaskFunc::name = LiteralStringRef("file_backup_write_range_5.2"); - const uint32_t BackupRangeTaskFunc::version = 1; - REGISTER_TASKFUNC(BackupRangeTaskFunc); - - 
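
// A small stand-alone illustration of the shard splitting done by the removed
// BackupRangeTaskFunc above: _execute() asks getBlockOfShards() for a single
// boundary inside (beginKey, endKey) and, if one exists, _finish() runs
// startBackupRangeInternal() to emit one sub-task per shard slice. std::string
// stands in for Key and printf for addTask(); this is a sketch, not the real API.
#include <cstdio>
#include <string>
#include <vector>

int main() {
    std::string beginKey = "a", endKey = "z";
    // Boundaries returned by getBlockOfShards(), sorted, within (beginKey, endKey).
    std::vector<std::string> shardBoundaries = { "f", "m", "t" };

    std::string nextKey = beginKey;
    for (const std::string& boundary : shardBoundaries) {
        if (nextKey != boundary)
            printf("addTask [%s, %s)\n", nextKey.c_str(), boundary.c_str());
        nextKey = boundary;
    }
    // Final task covers the remainder up to endKey, as in the real code.
    if (nextKey != endKey)
        printf("addTask [%s, %s)\n", nextKey.c_str(), endKey.c_str());
}
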
struct BackupSnapshotDispatchTask : BackupTaskFuncBase { - static StringRef name; - static const uint32_t version; - - static struct { - // Set by Execute, used by Finish - static TaskParam shardsBehind() { - return LiteralStringRef(__FUNCTION__); - } - // Set by Execute, used by Finish - static TaskParam snapshotFinished() { - return LiteralStringRef(__FUNCTION__); - } - // Set by Execute, used by Finish - static TaskParam nextDispatchVersion() { - return LiteralStringRef(__FUNCTION__); - } - } Params; - - StringRef getName() const { return name; }; - - Future execute(Database cx, Reference tb, Reference fb, Reference task) { return _execute(cx, tb, fb, task); }; - Future finish(Reference tr, Reference tb, Reference fb, Reference task) { return _finish(tr, tb, fb, task); }; - - ACTOR static Future addTask(Reference tr, Reference taskBucket, Reference parentTask, int priority, TaskCompletionKey completionKey, Reference waitFor = Reference(), Version scheduledVersion = invalidVersion) { - Key key = wait(addBackupTask(name, - version, - tr, taskBucket, completionKey, - BackupConfig(parentTask), - waitFor, - [=](Reference task) { - if(scheduledVersion != invalidVersion) - ReservedTaskParams::scheduledVersion().set(task, scheduledVersion); - }, - priority)); - return key; - } - - enum DispatchState { SKIP=0, DONE=1, NOT_DONE_MIN=2}; - - ACTOR static Future _execute(Database cx, Reference taskBucket, Reference futureBucket, Reference task) { - state Reference lock(new FlowLock(CLIENT_KNOBS->BACKUP_LOCK_BYTES)); - wait(checkTaskVersion(cx, task, name, version)); - - state double startTime = timer(); - state Reference tr(new ReadYourWritesTransaction(cx)); - - // The shard map will use 3 values classes. Exactly SKIP, exactly DONE, then any number >= NOT_DONE_MIN which will mean not done. - // This is to enable an efficient coalesce() call to squash adjacent ranges which are not yet finished to enable efficiently - // finding random database shards which are not done. 
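
// The comment above describes the three value classes of the shard map. A tiny
// stand-alone model of the idea (a std::map keyed by range-begin instead of
// KeyRangeMap): coalescing merges only adjacent entries with equal values, so
// DONE and SKIP runs collapse while every not-done shard, having a unique value
// >= NOT_DONE_MIN, keeps its own boundary for random selection.
#include <cstdio>
#include <iterator>
#include <map>
#include <string>

int main() {
    enum { SKIP = 0, DONE = 1, NOT_DONE_MIN = 2 };
    int seq = NOT_DONE_MIN;
    // range-begin -> value, standing in for KeyRangeMap<int>
    std::map<std::string, int> shardMap = {
        { "", DONE }, { "b", DONE }, { "c", seq++ }, { "d", seq++ }, { "e", SKIP }, { "f", SKIP }
    };
    // Coalesce: drop an entry whose value equals its predecessor's.
    for (auto it = std::next(shardMap.begin()); it != shardMap.end();) {
        if (it->second == std::prev(it)->second)
            it = shardMap.erase(it);
        else
            ++it;
    }
    for (const auto& [rangeBegin, value] : shardMap)
        printf("'%s' -> %d\n", rangeBegin.c_str(), value);
    // The DONE run and the SKIP run each merge; "c" and "d" remain separate shards.
}
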
- state int notDoneSequence = NOT_DONE_MIN; - state KeyRangeMap shardMap(notDoneSequence++, normalKeys.end); - state Key beginKey = normalKeys.begin; - - // Read all shard boundaries and add them to the map - loop { - try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - - state Future>> shardBoundaries = getBlockOfShards(tr, beginKey, normalKeys.end, CLIENT_KNOBS->TOO_MANY); - wait(success(shardBoundaries) && taskBucket->keepRunning(tr, task)); - - if(shardBoundaries.get().size() == 0) - break; - - for(auto &boundary : shardBoundaries.get()) { - shardMap.rawInsert(boundary, notDoneSequence++); - } - - beginKey = keyAfter(shardBoundaries.get().back()); - tr->reset(); - } catch(Error &e) { - wait(tr->onError(e)); - } - } - - // Read required stuff from backup config - state BackupConfig config(task); - state Version recentReadVersion; - state Version snapshotBeginVersion; - state Version snapshotTargetEndVersion; - state int64_t snapshotIntervalSeconds; - state Optional latestSnapshotEndVersion; - state std::vector backupRanges; - state Optional snapshotBatchFutureKey; - state Reference snapshotBatchFuture; - state Optional snapshotBatchSize; - - tr->reset(); - loop { - try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - - wait( store(snapshotBeginVersion, config.snapshotBeginVersion().getOrThrow(tr)) - && store(snapshotTargetEndVersion, config.snapshotTargetEndVersion().getOrThrow(tr)) - && store(backupRanges, config.backupRanges().getOrThrow(tr)) - && store(snapshotIntervalSeconds, config.snapshotIntervalSeconds().getOrThrow(tr)) - // The next two parameters are optional - && store(snapshotBatchFutureKey, config.snapshotBatchFuture().get(tr)) - && store(snapshotBatchSize, config.snapshotBatchSize().get(tr)) - && store(latestSnapshotEndVersion, config.latestSnapshotEndVersion().get(tr)) - && store(recentReadVersion, tr->getReadVersion()) - && taskBucket->keepRunning(tr, task)); - - // If the snapshot batch future key does not exist, this is the first execution of this dispatch task so - // - create and set the snapshot batch future key - // - initialize the batch size to 0 - // - initialize the target snapshot end version if it is not yet set - // - commit - if(!snapshotBatchFutureKey.present()) { - snapshotBatchFuture = futureBucket->future(tr); - config.snapshotBatchFuture().set(tr, snapshotBatchFuture->pack()); - snapshotBatchSize = 0; - config.snapshotBatchSize().set(tr, snapshotBatchSize.get()); - - // The dispatch of this batch can take multiple separate executions if the executor fails - // so store a completion key for the dispatch finish() to set when dispatching the batch is done. - state TaskCompletionKey dispatchCompletionKey = TaskCompletionKey::joinWith(snapshotBatchFuture); - // this is a bad hack - but flow doesn't work well with lambda functions and caputring - // state variables... 
- auto cfg = &config; - auto tx = &tr; - wait(map(dispatchCompletionKey.get(tr, taskBucket), [cfg, tx](Key const& k) { - cfg->snapshotBatchDispatchDoneKey().set(*tx, k); - return Void(); - })); - wait(tr->commit()); - } - else { - ASSERT(snapshotBatchSize.present()); - // Batch future key exists in the config so create future from it - snapshotBatchFuture = Reference(new TaskFuture(futureBucket, snapshotBatchFutureKey.get())); - } - - break; - } catch(Error &e) { - wait(tr->onError(e)); - } - } - - // Read all dispatched ranges - state std::vector> dispatchBoundaries; - tr->reset(); - beginKey = normalKeys.begin; - loop { - try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - - state Future>> bounds = config.snapshotRangeDispatchMap().getRange(tr, beginKey, keyAfter(normalKeys.end), CLIENT_KNOBS->TOO_MANY); - wait(success(bounds) && taskBucket->keepRunning(tr, task) && store(recentReadVersion, tr->getReadVersion())); - - if(bounds.get().empty()) - break; - - dispatchBoundaries.reserve(dispatchBoundaries.size() + bounds.get().size()); - dispatchBoundaries.insert(dispatchBoundaries.end(), bounds.get().begin(), bounds.get().end()); - - beginKey = keyAfter(bounds.get().back().first); - tr->reset(); - } catch(Error &e) { - wait(tr->onError(e)); - } - } - - // The next few sections involve combining the results above. Yields are used after operations - // that could have operated on many thousands of things and in loops which could have many - // thousands of iterations. - // Declare some common iterators which must be state vars and will be used multiple times. - state int i; - state RangeMap::Iterator iShard; - state RangeMap::Iterator iShardEnd; - - // Set anything inside a dispatched range to DONE. - // Also ensure that the boundary value are true, false, [true, false]... - if(dispatchBoundaries.size() > 0) { - state bool lastValue = false; - state Key lastKey; - for(i = 0; i < dispatchBoundaries.size(); ++i) { - const std::pair &boundary = dispatchBoundaries[i]; - - // Values must alternate - ASSERT(boundary.second == !lastValue); - - // If this was the end of a dispatched range - if(!boundary.second) { - // Ensure that the dispatched boundaries exist AND set all shard ranges in the dispatched range to DONE. - RangeMap::Ranges shardRanges = shardMap.modify(KeyRangeRef(lastKey, boundary.first)); - iShard = shardRanges.begin(); - iShardEnd = shardRanges.end(); - for(; iShard != iShardEnd; ++iShard) { - iShard->value() = DONE; - wait(yield()); - } - } - lastValue = dispatchBoundaries[i].second; - lastKey = dispatchBoundaries[i].first; - - wait(yield()); - } - ASSERT(lastValue == false); - } - - // Set anything outside the backup ranges to SKIP. We can use insert() here instead of modify() - // because it's OK to delete shard boundaries in the skipped ranges. - if(backupRanges.size() > 0) { - shardMap.insert(KeyRangeRef(normalKeys.begin, backupRanges.front().begin), SKIP); - wait(yield()); - - for(i = 0; i < backupRanges.size() - 1; ++i) { - shardMap.insert(KeyRangeRef(backupRanges[i].end, backupRanges[i + 1].begin), SKIP); - wait(yield()); - } - - shardMap.insert(KeyRangeRef(backupRanges.back().end, normalKeys.end), SKIP); - wait(yield()); - } - - state int countShardsDone = 0; - state int countShardsNotDone = 0; - - // Scan through the shard map, counting the DONE and NOT_DONE shards. 
- RangeMap::Ranges shardRanges = shardMap.ranges(); - iShard = shardRanges.begin(); - iShardEnd = shardRanges.end(); - for(; iShard != iShardEnd; ++iShard) { - if(iShard->value() == DONE) { - ++countShardsDone; - } - else if(iShard->value() >= NOT_DONE_MIN) - ++countShardsNotDone; - - wait(yield()); - } - - // Coalesce the shard map to make random selection below more efficient. - shardMap.coalesce(normalKeys); - wait(yield()); - - // In this context "all" refers to all of the shards relevant for this particular backup - state int countAllShards = countShardsDone + countShardsNotDone; - - if(countShardsNotDone == 0) { - TraceEvent("FileBackupSnapshotDispatchFinished") - .detail("BackupUID", config.getUid()) - .detail("AllShards", countAllShards) - .detail("ShardsDone", countShardsDone) - .detail("ShardsNotDone", countShardsNotDone) - .detail("SnapshotBeginVersion", snapshotBeginVersion) - .detail("SnapshotTargetEndVersion", snapshotTargetEndVersion) - .detail("CurrentVersion", recentReadVersion) - .detail("SnapshotIntervalSeconds", snapshotIntervalSeconds); - Params.snapshotFinished().set(task, true); - return Void(); - } - - // Decide when the next snapshot dispatch should run. - state Version nextDispatchVersion; - - // In simulation, use snapshot interval / 5 to ensure multiple dispatches run - // Otherwise, use the knob for the number of seconds between snapshot dispatch tasks. - if(g_network->isSimulated()) - nextDispatchVersion = recentReadVersion + CLIENT_KNOBS->CORE_VERSIONSPERSECOND * (snapshotIntervalSeconds / 5.0); - else - nextDispatchVersion = recentReadVersion + CLIENT_KNOBS->CORE_VERSIONSPERSECOND * CLIENT_KNOBS->BACKUP_SNAPSHOT_DISPATCH_INTERVAL_SEC; - - // If nextDispatchVersion is greater than snapshotTargetEndVersion (which could be in the past) then just use - // the greater of recentReadVersion or snapshotTargetEndVersion. Any range tasks created in this dispatch will - // be scheduled at a random time between recentReadVersion and nextDispatchVersion, - // so nextDispatchVersion shouldn't be less than recentReadVersion. - if(nextDispatchVersion > snapshotTargetEndVersion) - nextDispatchVersion = std::max(recentReadVersion, snapshotTargetEndVersion); - - Params.nextDispatchVersion().set(task, nextDispatchVersion); - - // Calculate number of shards that should be done before the next interval end - // timeElapsed is between 0 and 1 and represents what portion of the shards we should have completed by now - double timeElapsed; - Version snapshotScheduledVersionInterval = snapshotTargetEndVersion - snapshotBeginVersion; - if(snapshotTargetEndVersion > snapshotBeginVersion) - timeElapsed = std::min(1.0, (double)(nextDispatchVersion - snapshotBeginVersion) / (snapshotScheduledVersionInterval)); - else - timeElapsed = 1.0; - - state int countExpectedShardsDone = countAllShards * timeElapsed; - state int countShardsToDispatch = std::max(0, countExpectedShardsDone - countShardsDone); - - // Calculate the number of shards that would have been dispatched by a normal (on-schedule) BackupSnapshotDispatchTask given - // the dispatch window and the start and expected-end versions of the current snapshot. 
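
// The pacing arithmetic above and below is easier to follow with concrete
// numbers. All values here are made up for illustration; the real code uses
// CLIENT_KNOBS (CORE_VERSIONSPERSECOND is assumed to be 1e6, its usual value).
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t versionsPerSecond = 1000000;                     // CORE_VERSIONSPERSECOND
    int64_t snapshotBeginVersion = 0;
    int64_t snapshotTargetEndVersion = 3600 * versionsPerSecond;   // a 1 hour snapshot
    int64_t nextDispatchVersion = 900 * versionsPerSecond;         // 15 minutes in
    int64_t recentReadVersion = 600 * versionsPerSecond;           // "now"

    int countAllShards = 1000, countShardsDone = 150, snapshotBatchSize = 0;
    int64_t interval = snapshotTargetEndVersion - snapshotBeginVersion;

    double timeElapsed = std::min(1.0, double(nextDispatchVersion - snapshotBeginVersion) / interval);
    int countExpectedShardsDone = countAllShards * timeElapsed;                         // 250
    int countShardsToDispatch = std::max(0, countExpectedShardsDone - countShardsDone); // 100

    int64_t dispatchWindow = nextDispatchVersion - recentReadVersion;
    int countShardsExpectedPerNormalWindow = (double(dispatchWindow) / interval) * countAllShards; // ~83
    int countShardsBehind =
        std::max(0, countShardsToDispatch + snapshotBatchSize - countShardsExpectedPerNormalWindow); // ~17

    printf("expected=%d toDispatch=%d perNormalWindow=%d behind=%d\n",
           countExpectedShardsDone, countShardsToDispatch,
           countShardsExpectedPerNormalWindow, countShardsBehind);
}
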
- int64_t dispatchWindow = nextDispatchVersion - recentReadVersion; - - // If the scheduled snapshot interval is 0 (such as for initial, as-fast-as-possible snapshot) then all shards are considered late - int countShardsExpectedPerNormalWindow; - if(snapshotScheduledVersionInterval == 0) { - countShardsExpectedPerNormalWindow = 0; - } - else { - // A dispatchWindow of 0 means the target end version is <= now which also results in all shards being considered late - countShardsExpectedPerNormalWindow = (double(dispatchWindow) / snapshotScheduledVersionInterval) * countAllShards; - } - - // The number of shards 'behind' the snapshot is the count of how may additional shards beyond normal are being dispatched, if any. - int countShardsBehind = std::max(0, countShardsToDispatch + snapshotBatchSize.get() - countShardsExpectedPerNormalWindow); - Params.shardsBehind().set(task, countShardsBehind); - - TraceEvent("FileBackupSnapshotDispatchStats") - .detail("BackupUID", config.getUid()) - .detail("AllShards", countAllShards) - .detail("ShardsDone", countShardsDone) - .detail("ShardsNotDone", countShardsNotDone) - .detail("ExpectedShardsDone", countExpectedShardsDone) - .detail("ShardsToDispatch", countShardsToDispatch) - .detail("ShardsBehind", countShardsBehind) - .detail("SnapshotBeginVersion", snapshotBeginVersion) - .detail("SnapshotTargetEndVersion", snapshotTargetEndVersion) - .detail("NextDispatchVersion", nextDispatchVersion) - .detail("CurrentVersion", recentReadVersion) - .detail("TimeElapsed", timeElapsed) - .detail("SnapshotIntervalSeconds", snapshotIntervalSeconds); - - // Dispatch random shards to catch up to the expected progress - while(countShardsToDispatch > 0) { - // First select ranges to add - state std::vector rangesToAdd; - - // Limit number of tasks added per transaction - int taskBatchSize = BUGGIFY ? deterministicRandom()->randomInt(1, countShardsToDispatch + 1) : CLIENT_KNOBS->BACKUP_DISPATCH_ADDTASK_SIZE; - int added = 0; - - while(countShardsToDispatch > 0 && added < taskBatchSize && shardMap.size() > 0) { - // Get a random range. - auto it = shardMap.randomRange(); - // Find a NOT_DONE range and add it to rangesToAdd - while(1) { - if(it->value() >= NOT_DONE_MIN) { - rangesToAdd.push_back(it->range()); - it->value() = DONE; - shardMap.coalesce(Key(it->begin())); - ++added; - ++countShardsDone; - --countShardsToDispatch; - --countShardsNotDone; - break; - } - if(it->end() == shardMap.mapEnd) - break; - ++it; - } - } - - state int64_t oldBatchSize = snapshotBatchSize.get(); - state int64_t newBatchSize = oldBatchSize + rangesToAdd.size(); - - // Now add the selected ranges in a single transaction. - tr->reset(); - loop { - try { - TraceEvent("FileBackupSnapshotDispatchAddingTasks") - .suppressFor(2) - .detail("TasksToAdd", rangesToAdd.size()) - .detail("NewBatchSize", newBatchSize); - - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - - // For each range, make sure it isn't set in the dispatched range map. - state std::vector>> beginReads; - state std::vector>> endReads; - - for(auto &range : rangesToAdd) { - beginReads.push_back(config.snapshotRangeDispatchMap().get(tr, range.begin)); - endReads.push_back( config.snapshotRangeDispatchMap().get(tr, range.end)); - } - - wait(store(snapshotBatchSize.get(), config.snapshotBatchSize().getOrThrow(tr)) - && waitForAll(beginReads) && waitForAll(endReads) && taskBucket->keepRunning(tr, task)); - - // Snapshot batch size should be either oldBatchSize or newBatchSize. 
If new, this transaction is already done. - if(snapshotBatchSize.get() == newBatchSize) { - break; - } - else { - ASSERT(snapshotBatchSize.get() == oldBatchSize); - config.snapshotBatchSize().set(tr, newBatchSize); - snapshotBatchSize = newBatchSize; - config.snapshotDispatchLastShardsBehind().set(tr, Params.shardsBehind().get(task)); - config.snapshotDispatchLastVersion().set(tr, tr->getReadVersion().get()); - } - - state std::vector> addTaskFutures; - - for(i = 0; i < beginReads.size(); ++i) { - KeyRange &range = rangesToAdd[i]; - - // This loop might have made changes to begin or end boundaries in a prior - // iteration. If so, the updated values exist in the RYW cache so re-read both entries. - Optional beginValue = config.snapshotRangeDispatchMap().get(tr, range.begin).get(); - Optional endValue = config.snapshotRangeDispatchMap().get(tr, range.end).get(); - - ASSERT(!beginValue.present() || !endValue.present() || beginValue != endValue); - - // If begin is present, it must be a range end so value must be false - // If end is present, it must be a range begin so value must be true - if( (!beginValue.present() || !beginValue.get()) - && (!endValue.present() || endValue.get()) ) - { - if(beginValue.present()) { - config.snapshotRangeDispatchMap().erase(tr, range.begin); - } - else { - config.snapshotRangeDispatchMap().set(tr, range.begin, true); - } - if(endValue.present()) { - config.snapshotRangeDispatchMap().erase(tr, range.end); - } - else { - config.snapshotRangeDispatchMap().set(tr, range.end, false); - } - - Version scheduledVersion = invalidVersion; - // If the next dispatch version is in the future, choose a random version at which to start the new task. - if(nextDispatchVersion > recentReadVersion) - scheduledVersion = recentReadVersion + deterministicRandom()->random01() * (nextDispatchVersion - recentReadVersion); - - // Range tasks during the initial snapshot should run at a higher priority - int priority = latestSnapshotEndVersion.present() ? 0 : 1; - addTaskFutures.push_back(success(BackupRangeTaskFunc::addTask(tr, taskBucket, task, priority, range.begin, range.end, TaskCompletionKey::joinWith(snapshotBatchFuture), Reference(), scheduledVersion))); - - TraceEvent("FileBackupSnapshotRangeDispatched") - .suppressFor(2) - .detail("BackupUID", config.getUid()) - .detail("CurrentVersion", recentReadVersion) - .detail("ScheduledVersion", scheduledVersion) - .detail("BeginKey", range.begin.printable()) - .detail("EndKey", range.end.printable()); - } - else { - // This shouldn't happen because if the transaction was already done or if another execution - // of this task is making progress it should have been detected above. - ASSERT(false); - } - } - - wait(waitForAll(addTaskFutures)); - wait(tr->commit()); - break; - } catch(Error &e) { - wait(tr->onError(e)); - } - } - } - - if(countShardsNotDone == 0) { - TraceEvent("FileBackupSnapshotDispatchFinished") - .detail("BackupUID", config.getUid()) - .detail("AllShards", countAllShards) - .detail("ShardsDone", countShardsDone) - .detail("ShardsNotDone", countShardsNotDone) - .detail("SnapshotBeginVersion", snapshotBeginVersion) - .detail("SnapshotTargetEndVersion", snapshotTargetEndVersion) - .detail("CurrentVersion", recentReadVersion) - .detail("SnapshotIntervalSeconds", snapshotIntervalSeconds) - .detail("DispatchTimeSeconds", timer() - startTime); - Params.snapshotFinished().set(task, true); - } - - return Void(); - } - - // This function is just a wrapper for BackupSnapshotManifest::addTask() which is defined below. 
- // The BackupSnapshotDispatchTask and BackupSnapshotManifest tasks reference each other so in order to keep their execute and finish phases - // defined together inside their class definitions this wrapper is declared here but defined after BackupSnapshotManifest is defined. - static Future addSnapshotManifestTask(Reference tr, Reference taskBucket, Reference parentTask, TaskCompletionKey completionKey, Reference waitFor = Reference()); - - ACTOR static Future _finish(Reference tr, Reference taskBucket, Reference futureBucket, Reference task) { - state BackupConfig config(task); - - // Get the batch future and dispatch done keys, then clear them. - state Key snapshotBatchFutureKey; - state Key snapshotBatchDispatchDoneKey; - - wait( store(snapshotBatchFutureKey, config.snapshotBatchFuture().getOrThrow(tr)) - && store(snapshotBatchDispatchDoneKey, config.snapshotBatchDispatchDoneKey().getOrThrow(tr))); - - state Reference snapshotBatchFuture = futureBucket->unpack(snapshotBatchFutureKey); - state Reference snapshotBatchDispatchDoneFuture = futureBucket->unpack(snapshotBatchDispatchDoneKey); - config.snapshotBatchFuture().clear(tr); - config.snapshotBatchDispatchDoneKey().clear(tr); - config.snapshotBatchSize().clear(tr); - - // Update shardsBehind here again in case the execute phase did not actually have to create any shard tasks - config.snapshotDispatchLastShardsBehind().set(tr, Params.shardsBehind().getOrDefault(task, 0)); - config.snapshotDispatchLastVersion().set(tr, tr->getReadVersion().get()); - - state Reference snapshotFinishedFuture = task->getDoneFuture(futureBucket); - - // If the snapshot is finished, the next task is to write a snapshot manifest, otherwise it's another snapshot dispatch task. - // In either case, the task should wait for snapshotBatchFuture. - // The snapshot done key, passed to the current task, is also passed on. - if(Params.snapshotFinished().getOrDefault(task, false)) { - wait(success(addSnapshotManifestTask(tr, taskBucket, task, TaskCompletionKey::signal(snapshotFinishedFuture), snapshotBatchFuture))); - } - else { - wait(success(addTask(tr, taskBucket, task, 1, TaskCompletionKey::signal(snapshotFinishedFuture), snapshotBatchFuture, Params.nextDispatchVersion().get(task)))); - } - - // This snapshot batch is finished, so set the batch done future. 
- wait(snapshotBatchDispatchDoneFuture->set(tr, taskBucket)); - - wait(taskBucket->finish(tr, task)); - - return Void(); - } - - }; - StringRef BackupSnapshotDispatchTask::name = LiteralStringRef("file_backup_dispatch_ranges_5.2"); - const uint32_t BackupSnapshotDispatchTask::version = 1; - REGISTER_TASKFUNC(BackupSnapshotDispatchTask); - - struct BackupLogRangeTaskFunc : BackupTaskFuncBase { - static StringRef name; - static const uint32_t version; - - static struct { - static TaskParam addBackupLogRangeTasks() { - return LiteralStringRef(__FUNCTION__); - } - static TaskParam fileSize() { - return LiteralStringRef(__FUNCTION__); - } - static TaskParam beginVersion() { - return LiteralStringRef(__FUNCTION__); - } - static TaskParam endVersion() { - return LiteralStringRef(__FUNCTION__); - } - } Params; - - StringRef getName() const { return name; }; - - Future execute(Database cx, Reference tb, Reference fb, Reference task) { return _execute(cx, tb, fb, task); }; - Future finish(Reference tr, Reference tb, Reference fb, Reference task) { return _finish(tr, tb, fb, task); }; - - ACTOR static Future _execute(Database cx, Reference taskBucket, Reference futureBucket, Reference task) { - state Reference lock(new FlowLock(CLIENT_KNOBS->BACKUP_LOCK_BYTES)); - - wait(checkTaskVersion(cx, task, BackupLogRangeTaskFunc::name, BackupLogRangeTaskFunc::version)); - - state Version beginVersion = Params.beginVersion().get(task); - state Version endVersion = Params.endVersion().get(task); - - state BackupConfig config(task); - state Reference bc; - - state Reference tr(new ReadYourWritesTransaction(cx)); - loop{ + ACTOR static Future _execute(Database cx, + Reference taskBucket, + Reference futureBucket, + Reference task) { + state RestoreConfig restore(task); + + state RestoreFile rangeFile = Params.inputFile().get(task); + state int64_t readOffset = Params.readOffset().get(task); + state int64_t readLen = Params.readLen().get(task); + + TraceEvent("FileRestoreRangeStart") + .suppressFor(60) + .detail("RestoreUID", restore.getUid()) + .detail("FileName", rangeFile.fileName) + .detail("FileVersion", rangeFile.version) + .detail("FileSize", rangeFile.fileSize) + .detail("ReadOffset", readOffset) + .detail("ReadLen", readLen) + .detail("TaskInstance", THIS_ADDR); + + state Reference tr(new ReadYourWritesTransaction(cx)); + state Future> bc; + state Future> restoreRanges; + state Future addPrefix; + state Future removePrefix; + + loop { + try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); - // Wait for the read version to pass endVersion - try { - wait(taskBucket->keepRunning(tr, task)); - if(!bc) { - // Backup container must be present if we're still here - Reference _bc = wait(config.backupContainer().getOrThrow(tr)); - bc = _bc; - } + bc = restore.sourceContainer().getOrThrow(tr); + restoreRanges = restore.getRestoreRangesOrDefault(tr); + addPrefix = restore.addPrefix().getD(tr); + removePrefix = restore.removePrefix().getD(tr); - Version currentVersion = tr->getReadVersion().get(); - if(endVersion < currentVersion) - break; + wait(taskBucket->keepRunning(tr, task)); - wait(delay(std::max(CLIENT_KNOBS->BACKUP_RANGE_MINWAIT, (double) (endVersion-currentVersion)/CLIENT_KNOBS->CORE_VERSIONSPERSECOND))); - tr->reset(); - } - catch (Error &e) { - wait(tr->onError(e)); - } + wait(success(bc) && success(restoreRanges) && success(addPrefix) && success(removePrefix) && + checkTaskVersion(tr->getDatabase(), task, name, version)); + break; + + } catch 
(Error& e) { + wait(tr->onError(e)); } - - Key destUidValue = wait(config.destUidValue().getOrThrow(tr)); - state Standalone> ranges = getLogRanges(beginVersion, endVersion, destUidValue); - if (ranges.size() > CLIENT_KNOBS->BACKUP_MAX_LOG_RANGES) { - Params.addBackupLogRangeTasks().set(task, true); - return Void(); - } - - // Block size must be at least large enough for 1 max size key, 1 max size value, and overhead, so conservatively 125k. - state int blockSize = BUGGIFY ? deterministicRandom()->randomInt(125e3, 4e6) : CLIENT_KNOBS->BACKUP_LOGFILE_BLOCK_SIZE; - state Reference outFile = wait(bc->writeLogFile(beginVersion, endVersion, blockSize)); - state LogFileWriter logFile(outFile, blockSize); - - state PromiseStream results; - state std::vector> rc; - - for (auto &range : ranges) { - rc.push_back(readCommitted(cx, results, lock, range, false, true, true)); - } - - state Future sendEOS = map(errorOr(waitForAll(rc)), [=](ErrorOr const &result) { - if(result.isError()) - results.sendError(result.getError()); - else - results.sendError(end_of_stream()); - return Void(); - }); - - state Version lastVersion; - try { - loop { - state RangeResultWithVersion r = waitNext(results.getFuture()); - lock->release(r.first.expectedSize()); - - state int i = 0; - for (; i < r.first.size(); ++i) { - // Remove the backupLogPrefix + UID bytes from the key - wait(logFile.writeKV(r.first[i].key.substr(backupLogPrefixBytes + 16), r.first[i].value)); - lastVersion = r.second; - } - } - } catch (Error &e) { - if(e.code() == error_code_actor_cancelled) - throw; - - if (e.code() != error_code_end_of_stream) { - state Error err = e; - wait(config.logError(cx, err, format("Failed to write to file `%s'", outFile->getFileName().c_str()))); - throw err; - } - } - - // Make sure this task is still alive, if it's not then the data read above could be incomplete. 
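
// Context for the wait(taskBucket->keepRunning(cx, task)) below: the mutation
// read loop above streamed data without a transaction that re-verified task
// ownership, so if this executor lost the task (for example it timed out and
// another agent picked it up) the stream may have been cut short. Re-checking
// keepRunning() before finishing the output file is what keeps a possibly
// incomplete log file from being finalized and recorded.
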
- wait(taskBucket->keepRunning(cx, task)); - - wait(outFile->finish()); - - TraceEvent("FileBackupWroteLogFile") - .suppressFor(60) - .detail("BackupUID", config.getUid()) - .detail("Size", outFile->size()) - .detail("BeginVersion", beginVersion) - .detail("EndVersion", endVersion) - .detail("LastReadVersion", latestVersion); - - Params.fileSize().set(task, outFile->size()); - - return Void(); } - ACTOR static Future addTask(Reference tr, Reference taskBucket, Reference parentTask, int priority, Version beginVersion, Version endVersion, TaskCompletionKey completionKey, Reference waitFor = Reference()) { - Key key = wait(addBackupTask(BackupLogRangeTaskFunc::name, - BackupLogRangeTaskFunc::version, - tr, taskBucket, completionKey, - BackupConfig(parentTask), - waitFor, - [=](Reference task) { - Params.beginVersion().set(task, beginVersion); - Params.endVersion().set(task, endVersion); - Params.addBackupLogRangeTasks().set(task, false); - }, - priority)); - return key; - } + state Reference inFile = wait(bc.get()->readFile(rangeFile.fileName)); + state Standalone> blockData = wait(decodeRangeFileBlock(inFile, readOffset, readLen)); - ACTOR static Future startBackupLogRangeInternal(Reference tr, Reference taskBucket, Reference futureBucket, Reference task, Reference taskFuture, Version beginVersion, Version endVersion) { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); + // First and last key are the range for this file + state KeyRange fileRange = KeyRangeRef(blockData.front().key, blockData.back().key); + state std::vector originalFileRanges; + // If fileRange doesn't intersect restore range then we're done. + state int index; + for (index = 0; index < restoreRanges.get().size(); index++) { + auto& restoreRange = restoreRanges.get()[index]; + if (!fileRange.intersects(restoreRange)) + continue; - std::vector> addTaskVector; - int tasks = 0; - for (int64_t vblock = beginVersion / CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE; vblock < (endVersion + CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE - 1) / CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE; vblock += CLIENT_KNOBS->BACKUP_MAX_LOG_RANGES) { - Version bv = std::max(beginVersion, vblock * CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE); + // We know the file range intersects the restore range but there could still be keys outside the restore + // range. Find the subvector of kv pairs that intersect the restore range. 
Note that the first and last + // keys are just the range endpoints for this file + int rangeStart = 1; + int rangeEnd = blockData.size() - 1; + // Slide start forward, stop if something in range is found + while (rangeStart < rangeEnd && !restoreRange.contains(blockData[rangeStart].key)) + ++rangeStart; + // Side end backward, stop if something in range is found + while (rangeEnd > rangeStart && !restoreRange.contains(blockData[rangeEnd - 1].key)) + --rangeEnd; - if( tasks >= CLIENT_KNOBS->BACKUP_SHARD_TASK_LIMIT ) { - addTaskVector.push_back(addTask(tr, taskBucket, task, task->getPriority(), bv, endVersion, TaskCompletionKey::joinWith(taskFuture))); - break; - } + state VectorRef data = blockData.slice(rangeStart, rangeEnd); - Version ev = std::min(endVersion, (vblock + CLIENT_KNOBS->BACKUP_MAX_LOG_RANGES) * CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE); - addTaskVector.push_back(addTask(tr, taskBucket, task, task->getPriority(), bv, ev, TaskCompletionKey::joinWith(taskFuture))); - tasks++; - } + // Shrink file range to be entirely within restoreRange and translate it to the new prefix + // First, use the untranslated file range to create the shrunk original file range which must be used in the + // kv range version map for applying mutations + state KeyRange originalFileRange = + KeyRangeRef(std::max(fileRange.begin, restoreRange.begin), std::min(fileRange.end, restoreRange.end)); + originalFileRanges.push_back(originalFileRange); - wait(waitForAll(addTaskVector)); - - return Void(); - } - - ACTOR static Future _finish(Reference tr, Reference taskBucket, Reference futureBucket, Reference task) { - state Version beginVersion = Params.beginVersion().get(task); - state Version endVersion = Params.endVersion().get(task); - state Reference taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]); - state BackupConfig config(task); - - if(Params.fileSize().exists(task)) { - config.logBytesWritten().atomicOp(tr, Params.fileSize().get(task), MutationRef::AddValue); - } - - if (Params.addBackupLogRangeTasks().get(task)) { - wait(startBackupLogRangeInternal(tr, taskBucket, futureBucket, task, taskFuture, beginVersion, endVersion)); - endVersion = beginVersion; + // Now shrink and translate fileRange + Key fileEnd = std::min(fileRange.end, restoreRange.end); + if (fileEnd == (removePrefix.get() == StringRef() ? normalKeys.end : strinc(removePrefix.get()))) { + fileEnd = addPrefix.get() == StringRef() ? normalKeys.end : strinc(addPrefix.get()); } else { - wait(taskFuture->set(tr, taskBucket)); + fileEnd = fileEnd.removePrefix(removePrefix.get()).withPrefix(addPrefix.get()); } - - wait(taskBucket->finish(tr, task)); - return Void(); - } - }; - - StringRef BackupLogRangeTaskFunc::name = LiteralStringRef("file_backup_write_logs_5.2"); - const uint32_t BackupLogRangeTaskFunc::version = 1; - REGISTER_TASKFUNC(BackupLogRangeTaskFunc); - - //This task stopped being used in 6.2, however the code remains here to handle upgrades. 
- struct EraseLogRangeTaskFunc : BackupTaskFuncBase { - static StringRef name; - static const uint32_t version; - StringRef getName() const { return name; }; - - static struct { - static TaskParam beginVersion() { - return LiteralStringRef(__FUNCTION__); - } - static TaskParam endVersion() { - return LiteralStringRef(__FUNCTION__); - } - static TaskParam destUidValue() { - return LiteralStringRef(__FUNCTION__); - } - } Params; - - ACTOR static Future addTask(Reference tr, Reference taskBucket, UID logUid, TaskCompletionKey completionKey, Key destUidValue, Version endVersion = 0, Reference waitFor = Reference()) { - Key key = wait(addBackupTask(EraseLogRangeTaskFunc::name, - EraseLogRangeTaskFunc::version, - tr, taskBucket, completionKey, - BackupConfig(logUid), - waitFor, - [=](Reference task) { - Params.beginVersion().set(task, 1); //FIXME: remove in 6.X, only needed for 5.2 backward compatibility - Params.endVersion().set(task, endVersion); - Params.destUidValue().set(task, destUidValue); - }, - 0, false)); - - return key; - } - - ACTOR static Future _finish(Reference tr, Reference taskBucket, Reference futureBucket, Reference task) { - state Reference taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]); - - wait(checkTaskVersion(tr->getDatabase(), task, EraseLogRangeTaskFunc::name, EraseLogRangeTaskFunc::version)); - - state Version endVersion = Params.endVersion().get(task); - state Key destUidValue = Params.destUidValue().get(task); - - state BackupConfig config(task); - state Key logUidValue = config.getUidAsKey(); - - wait(taskFuture->set(tr, taskBucket) && taskBucket->finish(tr, task) && eraseLogData(tr, logUidValue, destUidValue, endVersion != 0 ? Optional(endVersion) : Optional())); - - return Void(); - } - - Future execute(Database cx, Reference tb, Reference fb, Reference task) { return Void(); }; - Future finish(Reference tr, Reference tb, Reference fb, Reference task) { return _finish(tr, tb, fb, task); }; - }; - StringRef EraseLogRangeTaskFunc::name = LiteralStringRef("file_backup_erase_logs_5.2"); - const uint32_t EraseLogRangeTaskFunc::version = 1; - REGISTER_TASKFUNC(EraseLogRangeTaskFunc); - - - - struct BackupLogsDispatchTask : BackupTaskFuncBase { - static StringRef name; - static const uint32_t version; - - static struct { - static TaskParam prevBeginVersion() { - return LiteralStringRef(__FUNCTION__); - } - static TaskParam beginVersion() { - return LiteralStringRef(__FUNCTION__); - } - } Params; - - ACTOR static Future _finish(Reference tr, Reference taskBucket, Reference futureBucket, Reference task) { - wait(checkTaskVersion(tr->getDatabase(), task, BackupLogsDispatchTask::name, BackupLogsDispatchTask::version)); - - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - - state Reference onDone = task->getDoneFuture(futureBucket); - state Version prevBeginVersion = Params.prevBeginVersion().get(task); - state Version beginVersion = Params.beginVersion().get(task); - state BackupConfig config(task); - config.latestLogEndVersion().set(tr, beginVersion); - - state bool stopWhenDone; - state Optional restorableVersion; - state EBackupState backupState; - state Optional tag; - state Optional latestSnapshotEndVersion; - - wait(store(stopWhenDone, config.stopWhenDone().getOrThrow(tr)) - && store(restorableVersion, config.getLatestRestorableVersion(tr)) - && store(backupState, config.stateEnum().getOrThrow(tr)) - && store(tag, config.tag().get(tr)) - && store(latestSnapshotEndVersion, 
config.latestSnapshotEndVersion().get(tr))); - - // If restorable, update the last restorable version for this tag - if(restorableVersion.present() && tag.present()) { - FileBackupAgent().setLastRestorable(tr, StringRef(tag.get()), restorableVersion.get()); - } - - // If the backup is restorable but the state is not differential then set state to differential - if(restorableVersion.present() && backupState != BackupAgentBase::STATE_RUNNING_DIFFERENTIAL) - config.stateEnum().set(tr, BackupAgentBase::STATE_RUNNING_DIFFERENTIAL); - - // If stopWhenDone is set and there is a restorable version, set the done future and do not create further tasks. - if(stopWhenDone && restorableVersion.present()) { - wait(onDone->set(tr, taskBucket) && taskBucket->finish(tr, task)); - - TraceEvent("FileBackupLogsDispatchDone") - .detail("BackupUID", config.getUid()) - .detail("BeginVersion", beginVersion) - .detail("RestorableVersion", restorableVersion.orDefault(-1)); - - return Void(); - } - - state Version endVersion = std::max( tr->getReadVersion().get() + 1, beginVersion + (CLIENT_KNOBS->BACKUP_MAX_LOG_RANGES-1)*CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE ); - - TraceEvent("FileBackupLogDispatch") - .suppressFor(60) - .detail("BeginVersion", beginVersion) - .detail("EndVersion", endVersion) - .detail("RestorableVersion", restorableVersion.orDefault(-1)); - - state Reference logDispatchBatchFuture = futureBucket->future(tr); - - // If a snapshot has ended for this backup then mutations are higher priority to reduce backup lag - state int priority = latestSnapshotEndVersion.present() ? 1 : 0; - - // Add the initial log range task to read/copy the mutations and the next logs dispatch task which will run after this batch is done - wait(success(BackupLogRangeTaskFunc::addTask(tr, taskBucket, task, priority, beginVersion, endVersion, TaskCompletionKey::joinWith(logDispatchBatchFuture)))); - wait(success(BackupLogsDispatchTask::addTask(tr, taskBucket, task, priority, beginVersion, endVersion, TaskCompletionKey::signal(onDone), logDispatchBatchFuture))); - - // Do not erase at the first time - if (prevBeginVersion > 0) { - state Key destUidValue = wait(config.destUidValue().getOrThrow(tr)); - wait( eraseLogData(tr, config.getUidAsKey(), destUidValue, Optional(beginVersion)) ); - } - - wait(taskBucket->finish(tr, task)); - - TraceEvent("FileBackupLogsDispatchContinuing") - .suppressFor(60) - .detail("BackupUID", config.getUid()) - .detail("BeginVersion", beginVersion) - .detail("EndVersion", endVersion); - - return Void(); - } - - ACTOR static Future addTask(Reference tr, Reference taskBucket, Reference parentTask, int priority, Version prevBeginVersion, Version beginVersion, TaskCompletionKey completionKey, Reference waitFor = Reference()) { - Key key = wait(addBackupTask(BackupLogsDispatchTask::name, - BackupLogsDispatchTask::version, - tr, taskBucket, completionKey, - BackupConfig(parentTask), - waitFor, - [=](Reference task) { - Params.prevBeginVersion().set(task, prevBeginVersion); - Params.beginVersion().set(task, beginVersion); - }, - priority)); - return key; - } - - StringRef getName() const { return name; }; - - Future execute(Database cx, Reference tb, Reference fb, Reference task) { return Void(); }; - Future finish(Reference tr, Reference tb, Reference fb, Reference task) { return _finish(tr, tb, fb, task); }; - }; - StringRef BackupLogsDispatchTask::name = LiteralStringRef("file_backup_dispatch_logs_5.2"); - const uint32_t BackupLogsDispatchTask::version = 1; - REGISTER_TASKFUNC(BackupLogsDispatchTask); - - 
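
// A stand-alone sketch of how the removed log tasks above size their work.
// BackupLogsDispatchTask picks an end version reaching at least the current
// read version and at least (BACKUP_MAX_LOG_RANGES - 1) version blocks past
// beginVersion; BackupLogRangeTaskFunc then splits any batch covering more
// than BACKUP_MAX_LOG_RANGES log ranges via startBackupLogRangeInternal()
// (the BACKUP_SHARD_TASK_LIMIT cap is omitted here). Knob values are stand-ins.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t LOG_RANGE_BLOCK_SIZE = 1000000;
    const int64_t BACKUP_MAX_LOG_RANGES = 21;

    int64_t beginVersion = 5000000;  // latestLogEndVersion from the previous batch
    int64_t readVersion = 60000000;  // current read version

    int64_t endVersion = std::max(readVersion + 1,
                                  beginVersion + (BACKUP_MAX_LOG_RANGES - 1) * LOG_RANGE_BLOCK_SIZE);
    printf("log batch [%lld, %lld)\n", (long long)beginVersion, (long long)endVersion);

    // Split into sub-tasks of at most BACKUP_MAX_LOG_RANGES version blocks each,
    // mirroring the vblock loop in startBackupLogRangeInternal().
    for (int64_t vblock = beginVersion / LOG_RANGE_BLOCK_SIZE;
         vblock < (endVersion + LOG_RANGE_BLOCK_SIZE - 1) / LOG_RANGE_BLOCK_SIZE;
         vblock += BACKUP_MAX_LOG_RANGES) {
        int64_t bv = std::max(beginVersion, vblock * LOG_RANGE_BLOCK_SIZE);
        int64_t ev = std::min(endVersion, (vblock + BACKUP_MAX_LOG_RANGES) * LOG_RANGE_BLOCK_SIZE);
        printf("  sub-task [%lld, %lld)\n", (long long)bv, (long long)ev);
    }
}
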
struct FileBackupFinishedTask : BackupTaskFuncBase { - static StringRef name; - static const uint32_t version; - - StringRef getName() const { return name; }; - - ACTOR static Future _finish(Reference tr, Reference taskBucket, Reference futureBucket, Reference task) { - wait(checkTaskVersion(tr->getDatabase(), task, FileBackupFinishedTask::name, FileBackupFinishedTask::version)); - - state BackupConfig backup(task); - state UID uid = backup.getUid(); - - tr->setOption(FDBTransactionOptions::COMMIT_ON_FIRST_PROXY); - state Key destUidValue = wait(backup.destUidValue().getOrThrow(tr)); - wait( eraseLogData(tr, backup.getUidAsKey(), destUidValue) ); - - backup.stateEnum().set(tr, EBackupState::STATE_COMPLETED); - - wait(taskBucket->finish(tr, task)); - - TraceEvent("FileBackupFinished").detail("BackupUID", uid); - - return Void(); - } - - ACTOR static Future addTask(Reference tr, Reference taskBucket, Reference parentTask, TaskCompletionKey completionKey, Reference waitFor = Reference()) { - Key key = wait(addBackupTask(FileBackupFinishedTask::name, - FileBackupFinishedTask::version, - tr, taskBucket, completionKey, - BackupConfig(parentTask), waitFor)); - return key; - } - - Future execute(Database cx, Reference tb, Reference fb, Reference task) { return Void(); }; - Future finish(Reference tr, Reference tb, Reference fb, Reference task) { return _finish(tr, tb, fb, task); }; - }; - StringRef FileBackupFinishedTask::name = LiteralStringRef("file_backup_finished_5.2"); - const uint32_t FileBackupFinishedTask::version = 1; - REGISTER_TASKFUNC(FileBackupFinishedTask); - - struct BackupSnapshotManifest : BackupTaskFuncBase { - static StringRef name; - static const uint32_t version; - static struct { - static TaskParam endVersion() { return LiteralStringRef(__FUNCTION__); } - } Params; - - ACTOR static Future _execute(Database cx, Reference taskBucket, Reference futureBucket, Reference task) { - state BackupConfig config(task); - state Reference bc; - - state Reference tr(new ReadYourWritesTransaction(cx)); - - // Read the entire range file map into memory, then walk it backwards from its last entry to produce a list of non overlapping key range files - state std::map localmap; - state Key startKey; - state int batchSize = BUGGIFY ? 1 : 1000000; - - loop { - try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - - wait(taskBucket->keepRunning(tr, task)); - - if(!bc) { - // Backup container must be present if we're still here - wait(store(bc, config.backupContainer().getOrThrow(tr))); - } - - BackupConfig::RangeFileMapT::PairsType rangeresults = wait(config.snapshotRangeFileMap().getRange(tr, startKey, {}, batchSize)); - - for(auto &p : rangeresults) { - localmap.insert(p); - } - - if(rangeresults.size() < batchSize) - break; - - startKey = keyAfter(rangeresults.back().first); - tr->reset(); - } catch(Error &e) { - wait(tr->onError(e)); - } - } - - std::vector files; - state Version maxVer = 0; - state Version minVer = std::numeric_limits::max(); - state int64_t totalBytes = 0; - - if(!localmap.empty()) { - // Get iterator that points to greatest key, start there. - auto ri = localmap.rbegin(); - auto i = (++ri).base(); - - while(1) { - const BackupConfig::RangeSlice &r = i->second; - - // Add file to final file list - files.push_back(r.fileName); - - // Update version range seen - if(r.version < minVer) - minVer = r.version; - if(r.version > maxVer) - maxVer = r.version; - - // Update total bytes counted. 
- totalBytes += r.fileSize; - - // Jump to file that either ends where this file begins or has the greatest end that is less than - // the begin of this file. In other words find the map key that is <= begin of this file. To do this - // find the first end strictly greater than begin and then back up one. - i = localmap.upper_bound(i->second.begin); - // If we get begin then we're done, there are no more ranges that end at or before the last file's begin - if(i == localmap.begin()) - break; - --i; - } - } - - Params.endVersion().set(task, maxVer); - wait(bc->writeKeyspaceSnapshotFile(files, totalBytes)); - - TraceEvent(SevInfo, "FileBackupWroteSnapshotManifest") - .detail("BackupUID", config.getUid()) - .detail("BeginVersion", minVer) - .detail("EndVersion", maxVer) - .detail("TotalBytes", totalBytes); - - return Void(); - } - - ACTOR static Future _finish(Reference tr, Reference taskBucket, Reference futureBucket, Reference task) { - wait(checkTaskVersion(tr->getDatabase(), task, BackupSnapshotManifest::name, BackupSnapshotManifest::version)); - - state BackupConfig config(task); - - // Set the latest snapshot end version, which was set during the execute phase - config.latestSnapshotEndVersion().set(tr, Params.endVersion().get(task)); - - state bool stopWhenDone; - state EBackupState backupState; - state Optional restorableVersion; - state Optional firstSnapshotEndVersion; - state Optional tag; - - wait(store(stopWhenDone, config.stopWhenDone().getOrThrow(tr)) - && store(backupState, config.stateEnum().getOrThrow(tr)) - && store(restorableVersion, config.getLatestRestorableVersion(tr)) - && store(firstSnapshotEndVersion, config.firstSnapshotEndVersion().get(tr)) - && store(tag, config.tag().get(tr))); - - // If restorable, update the last restorable version for this tag - if(restorableVersion.present() && tag.present()) { - FileBackupAgent().setLastRestorable(tr, StringRef(tag.get()), restorableVersion.get()); - } - - if(!firstSnapshotEndVersion.present()) { - config.firstSnapshotEndVersion().set(tr, Params.endVersion().get(task)); - } - - // If the backup is restorable and the state isn't differential the set state to differential - if(restorableVersion.present() && backupState != BackupAgentBase::STATE_RUNNING_DIFFERENTIAL) - config.stateEnum().set(tr, BackupAgentBase::STATE_RUNNING_DIFFERENTIAL); - - // Unless we are to stop, start the next snapshot using the default interval - Reference snapshotDoneFuture = task->getDoneFuture(futureBucket); - if(!stopWhenDone) { - wait(config.initNewSnapshot(tr) && success(BackupSnapshotDispatchTask::addTask(tr, taskBucket, task, 1, TaskCompletionKey::signal(snapshotDoneFuture)))); - } else { - // Set the done future as the snapshot is now complete. 
- wait(snapshotDoneFuture->set(tr, taskBucket)); - } - - wait(taskBucket->finish(tr, task)); - return Void(); - } - - ACTOR static Future addTask(Reference tr, Reference taskBucket, Reference parentTask, TaskCompletionKey completionKey, Reference waitFor = Reference()) { - Key key = wait(addBackupTask(BackupSnapshotManifest::name, - BackupSnapshotManifest::version, - tr, taskBucket, completionKey, - BackupConfig(parentTask), waitFor, NOP_SETUP_TASK_FN, 1)); - return key; - } - - StringRef getName() const { return name; }; - - Future execute(Database cx, Reference tb, Reference fb, Reference task) { return _execute(cx, tb, fb, task); }; - Future finish(Reference tr, Reference tb, Reference fb, Reference task) { return _finish(tr, tb, fb, task); }; - }; - StringRef BackupSnapshotManifest::name = LiteralStringRef("file_backup_write_snapshot_manifest_5.2"); - const uint32_t BackupSnapshotManifest::version = 1; - REGISTER_TASKFUNC(BackupSnapshotManifest); - - Future BackupSnapshotDispatchTask::addSnapshotManifestTask(Reference tr, Reference taskBucket, Reference parentTask, TaskCompletionKey completionKey, Reference waitFor) { - return BackupSnapshotManifest::addTask(tr, taskBucket, parentTask, completionKey, waitFor); - } - - struct StartFullBackupTaskFunc : BackupTaskFuncBase { - static StringRef name; - static const uint32_t version; - - static struct { - static TaskParam beginVersion() { return LiteralStringRef(__FUNCTION__); } - } Params; - - ACTOR static Future _execute(Database cx, Reference taskBucket, Reference futureBucket, Reference task) { - wait(checkTaskVersion(cx, task, StartFullBackupTaskFunc::name, StartFullBackupTaskFunc::version)); - - loop{ - state Reference tr(new ReadYourWritesTransaction(cx)); - try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - Version startVersion = wait(tr->getReadVersion()); - - Params.beginVersion().set(task, startVersion); - break; - } - catch (Error &e) { - wait(tr->onError(e)); - } - } - - return Void(); - } - - ACTOR static Future _finish(Reference tr, Reference taskBucket, Reference futureBucket, Reference task) { - state BackupConfig config(task); - state Version beginVersion = Params.beginVersion().get(task); - - state Future> backupRangesFuture = config.backupRanges().getOrThrow(tr); - state Future destUidValueFuture = config.destUidValue().getOrThrow(tr); - wait(success(backupRangesFuture) && success(destUidValueFuture)); - std::vector backupRanges = backupRangesFuture.get(); - Key destUidValue = destUidValueFuture.get(); - - // Start logging the mutations for the specified ranges of the tag - for (auto &backupRange : backupRanges) { - config.startMutationLogs(tr, backupRange, destUidValue); - } - - config.stateEnum().set(tr, EBackupState::STATE_RUNNING); - - state Reference backupFinished = futureBucket->future(tr); - - // Initialize the initial snapshot and create tasks to continually write logs and snapshots - // The initial snapshot has a desired duration of 0, meaning go as fast as possible. 
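
// A rough sketch of the task graph that the removed _finish() below sets up,
// using the task names from this file (plain ASCII, arrows mean "signals or
// is waited on"):
//
//   StartFullBackupTaskFunc::_finish
//     +-- BackupSnapshotDispatchTask (priority 1) --+  joinWith(backupFinished)
//     +-- BackupLogsDispatchTask (priority 1)     --+
//     +-- FileBackupFinishedTask: waits on backupFinished, then erases log
//         data and sets the backup state to COMPLETED.
//
// Each dispatch task keeps adding a successor of its own kind until stopWhenDone
// and a restorable version are reached, at which point backupFinished fires.
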
- wait(config.initNewSnapshot(tr, 0)); - - // Using priority 1 for both of these to at least start both tasks soon - wait(success(BackupSnapshotDispatchTask::addTask(tr, taskBucket, task, 1, TaskCompletionKey::joinWith(backupFinished)))); - wait(success(BackupLogsDispatchTask::addTask(tr, taskBucket, task, 1, 0, beginVersion, TaskCompletionKey::joinWith(backupFinished)))); - - // If a clean stop is requested, the log and snapshot tasks will quit after the backup is restorable, then the following - // task will clean up and set the completed state. - wait(success(FileBackupFinishedTask::addTask(tr, taskBucket, task, TaskCompletionKey::noSignal(), backupFinished))); - - wait(taskBucket->finish(tr, task)); - return Void(); - } - - ACTOR static Future addTask(Reference tr, Reference taskBucket, UID uid, TaskCompletionKey completionKey, Reference waitFor = Reference()) - { - Key key = wait(addBackupTask(StartFullBackupTaskFunc::name, - StartFullBackupTaskFunc::version, - tr, taskBucket, completionKey, - BackupConfig(uid), waitFor)); - return key; - } - - StringRef getName() const { return name; }; - - Future execute(Database cx, Reference tb, Reference fb, Reference task) { return _execute(cx, tb, fb, task); }; - Future finish(Reference tr, Reference tb, Reference fb, Reference task) { return _finish(tr, tb, fb, task); }; - }; - StringRef StartFullBackupTaskFunc::name = LiteralStringRef("file_backup_start_5.2"); - const uint32_t StartFullBackupTaskFunc::version = 1; - REGISTER_TASKFUNC(StartFullBackupTaskFunc); - - struct RestoreCompleteTaskFunc : RestoreTaskFuncBase { - ACTOR static Future _finish(Reference tr, Reference taskBucket, Reference futureBucket, Reference task) { - wait(checkTaskVersion(tr->getDatabase(), task, name, version)); - - state RestoreConfig restore(task); - restore.stateEnum().set(tr, ERestoreState::COMPLETED); - tr->atomicOp(metadataVersionKey, metadataVersionRequiredValue, MutationRef::SetVersionstampedValue); - // Clear the file map now since it could be huge. - restore.fileSet().clear(tr); - - // TODO: Validate that the range version map has exactly the restored ranges in it. This means that for any restore operation - // the ranges to restore must be within the backed up ranges, otherwise from the restore perspective it will appear that some - // key ranges were missing and so the backup set is incomplete and the restore has failed. - // This validation cannot be done currently because Restore only supports a single restore range but backups can have many ranges. - - // Clear the applyMutations stuff, including any unapplied mutations from versions beyond the restored version. 
- restore.clearApplyMutationsKeys(tr); - - wait(taskBucket->finish(tr, task)); - wait(unlockDatabase(tr, restore.getUid())); - - return Void(); - } - - ACTOR static Future addTask(Reference tr, Reference taskBucket, Reference parentTask, TaskCompletionKey completionKey, Reference waitFor = Reference()) { - Key doneKey = wait(completionKey.get(tr, taskBucket)); - state Reference task(new Task(RestoreCompleteTaskFunc::name, RestoreCompleteTaskFunc::version, doneKey)); - - // Get restore config from parent task and bind it to new task - wait(RestoreConfig(parentTask).toTask(tr, task)); - - if (!waitFor) { - return taskBucket->addTask(tr, task); - } - - wait(waitFor->onSetAddTask(tr, taskBucket, task)); - return LiteralStringRef("OnSetAddTask"); - } - - static StringRef name; - static const uint32_t version; - StringRef getName() const { return name; }; - - Future execute(Database cx, Reference tb, Reference fb, Reference task) { return Void(); }; - Future finish(Reference tr, Reference tb, Reference fb, Reference task) { return _finish(tr, tb, fb, task); }; - - }; - StringRef RestoreCompleteTaskFunc::name = LiteralStringRef("restore_complete"); - const uint32_t RestoreCompleteTaskFunc::version = 1; - REGISTER_TASKFUNC(RestoreCompleteTaskFunc); - - struct RestoreFileTaskFuncBase : RestoreTaskFuncBase { - struct InputParams { - static TaskParam inputFile() { return LiteralStringRef(__FUNCTION__); } - static TaskParam readOffset() { return LiteralStringRef(__FUNCTION__); } - static TaskParam readLen() { return LiteralStringRef(__FUNCTION__); } - } Params; - - std::string toString(Reference task) { - return format("fileName '%s' readLen %lld readOffset %lld", - Params.inputFile().get(task).fileName.c_str(), - Params.readLen().get(task), - Params.readOffset().get(task)); - } - }; - - struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase { - static struct : InputParams { - // The range of data that the (possibly empty) data represented, which is set if it intersects the target restore range - static TaskParam originalFileRange() { return LiteralStringRef(__FUNCTION__); } - static TaskParam> originalFileRanges() { return LiteralStringRef(__FUNCTION__); } - - static std::vector getOriginalFileRanges(Reference task) { - if (originalFileRanges().exists(task)) { - return Params.originalFileRanges().get(task); - } - else { - std::vector range; - if (originalFileRange().exists(task)) - range.push_back(Params.originalFileRange().get(task)); - return range; - } - } - } Params; - - std::string toString(Reference task) { - std::string returnStr = RestoreFileTaskFuncBase::toString(task); - for(auto &range : Params.getOriginalFileRanges(task)) - returnStr += format(" originalFileRange '%s'", printable(range).c_str()); - return returnStr; - } - - ACTOR static Future _execute(Database cx, Reference taskBucket, Reference futureBucket, Reference task) { - state RestoreConfig restore(task); - - state RestoreFile rangeFile = Params.inputFile().get(task); - state int64_t readOffset = Params.readOffset().get(task); - state int64_t readLen = Params.readLen().get(task); - - TraceEvent("FileRestoreRangeStart") - .suppressFor(60) - .detail("RestoreUID", restore.getUid()) - .detail("FileName", rangeFile.fileName) - .detail("FileVersion", rangeFile.version) - .detail("FileSize", rangeFile.fileSize) - .detail("ReadOffset", readOffset) - .detail("ReadLen", readLen) - .detail("TaskInstance", THIS_ADDR); - - state Reference tr(new ReadYourWritesTransaction(cx)); - state Future> bc; - state Future> restoreRanges; - state Future 
addPrefix; - state Future removePrefix; - - loop{ - try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - - bc = restore.sourceContainer().getOrThrow(tr); - restoreRanges = restore.getRestoreRangesOrDefault(tr); - addPrefix = restore.addPrefix().getD(tr); - removePrefix = restore.removePrefix().getD(tr); - - wait(taskBucket->keepRunning(tr, task)); - - wait(success(bc) && success(restoreRanges) && success(addPrefix) && success(removePrefix) && checkTaskVersion(tr->getDatabase(), task, name, version)); - break; - - } - catch (Error &e) { - wait(tr->onError(e)); - } - } - - state Reference inFile = wait(bc.get()->readFile(rangeFile.fileName)); - state Standalone> blockData = wait(decodeRangeFileBlock(inFile, readOffset, readLen)); - - // First and last key are the range for this file - state KeyRange fileRange = KeyRangeRef(blockData.front().key, blockData.back().key); - state std::vector originalFileRanges; - // If fileRange doesn't intersect restore range then we're done. - state int index; - for (index = 0; index < restoreRanges.get().size(); index++) { - auto &restoreRange = restoreRanges.get()[index]; - if (!fileRange.intersects(restoreRange)) - continue; - - // We know the file range intersects the restore range but there could still be keys outside the restore range. - // Find the subvector of kv pairs that intersect the restore range. Note that the first and last keys are just the range endpoints for this file - int rangeStart = 1; - int rangeEnd = blockData.size() - 1; - // Slide start forward, stop if something in range is found - while (rangeStart < rangeEnd && !restoreRange.contains(blockData[rangeStart].key)) - ++rangeStart; - // Side end backward, stop if something in range is found - while (rangeEnd > rangeStart && !restoreRange.contains(blockData[rangeEnd - 1].key)) - --rangeEnd; - - state VectorRef data = blockData.slice(rangeStart, rangeEnd); - - // Shrink file range to be entirely within restoreRange and translate it to the new prefix - // First, use the untranslated file range to create the shrunk original file range which must be used in the kv range version map for applying mutations - state KeyRange originalFileRange = KeyRangeRef(std::max(fileRange.begin, restoreRange.begin), std::min(fileRange.end, restoreRange.end)); - originalFileRanges.push_back(originalFileRange); - - // Now shrink and translate fileRange - Key fileEnd = std::min(fileRange.end, restoreRange.end); - if (fileEnd == (removePrefix.get() == StringRef() ? normalKeys.end : strinc(removePrefix.get()))) { - fileEnd = addPrefix.get() == StringRef() ? normalKeys.end : strinc(addPrefix.get()); - } - else { - fileEnd = fileEnd.removePrefix(removePrefix.get()).withPrefix(addPrefix.get()); - } - fileRange = KeyRangeRef(std::max(fileRange.begin, restoreRange.begin).removePrefix(removePrefix.get()).withPrefix(addPrefix.get()), fileEnd); - - state int start = 0; - state int end = data.size(); - state int dataSizeLimit = BUGGIFY ? 
deterministicRandom()->randomInt(256 * 1024, 10e6) : CLIENT_KNOBS->RESTORE_WRITE_TX_SIZE; - - tr->reset(); - loop{ - try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - - state int i = start; - state int txBytes = 0; - state int iend = start; - - // find iend that results in the desired transaction size - for (; iend < end && txBytes < dataSizeLimit; ++iend) { - txBytes += data[iend].key.expectedSize(); - txBytes += data[iend].value.expectedSize(); - } - - // Clear the range we are about to set. - // If start == 0 then use fileBegin for the start of the range, else data[start] - // If iend == end then use fileEnd for the end of the range, else data[iend] - state KeyRange trRange = KeyRangeRef((start == 0) ? fileRange.begin : data[start].key.removePrefix(removePrefix.get()).withPrefix(addPrefix.get()) - , (iend == end) ? fileRange.end : data[iend].key.removePrefix(removePrefix.get()).withPrefix(addPrefix.get())); - - tr->clear(trRange); - - for (; i < iend; ++i) { - tr->setOption(FDBTransactionOptions::NEXT_WRITE_NO_WRITE_CONFLICT_RANGE); - tr->set(data[i].key.removePrefix(removePrefix.get()).withPrefix(addPrefix.get()), data[i].value); - } - - // Add to bytes written count - restore.bytesWritten().atomicOp(tr, txBytes, MutationRef::Type::AddValue); - - state Future checkLock = checkDatabaseLock(tr, restore.getUid()); - - wait(taskBucket->keepRunning(tr, task)); - - wait(checkLock); - - wait(tr->commit()); - - TraceEvent("FileRestoreCommittedRange") - .suppressFor(60) - .detail("RestoreUID", restore.getUid()) - .detail("FileName", rangeFile.fileName) - .detail("FileVersion", rangeFile.version) - .detail("FileSize", rangeFile.fileSize) - .detail("ReadOffset", readOffset) - .detail("ReadLen", readLen) - .detail("CommitVersion", tr->getCommittedVersion()) - .detail("BeginRange", trRange.begin) - .detail("EndRange", trRange.end) - .detail("StartIndex", start) - .detail("EndIndex", i) - .detail("DataSize", data.size()) - .detail("Bytes", txBytes) - .detail("OriginalFileRange", originalFileRange) - .detail("TaskInstance", THIS_ADDR); - - // Commit succeeded, so advance starting point - start = i; - - if (start == end) - break; - tr->reset(); - } - catch (Error &e) { - if (e.code() == error_code_transaction_too_large) - dataSizeLimit /= 2; - else - wait(tr->onError(e)); - } - } - } - if (!originalFileRanges.empty()) { - if (BUGGIFY && restoreRanges.get().size() == 1) { - Params.originalFileRange().set(task, originalFileRanges[0]); - } - else { - Params.originalFileRanges().set(task, originalFileRanges); - } - } - return Void(); - } - - ACTOR static Future _finish(Reference tr, Reference taskBucket, Reference futureBucket, Reference task) { - state RestoreConfig restore(task); - restore.fileBlocksFinished().atomicOp(tr, 1, MutationRef::Type::AddValue); - - // Update the KV range map if originalFileRange is set - std::vector> updateMap; - std::vector ranges = Params.getOriginalFileRanges(task); - for (auto &range : ranges) { - Value versionEncoded = BinaryWriter::toValue(Params.inputFile().get(task).version, Unversioned()); - updateMap.push_back(krmSetRange(tr, restore.applyMutationsMapPrefix(), range, versionEncoded)); - } - - state Reference taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]); - wait(taskFuture->set(tr, taskBucket) && - taskBucket->finish(tr, task) && waitForAll(updateMap)); - - return Void(); - } - - ACTOR static Future addTask(Reference tr, Reference taskBucket, Reference parentTask, 
RestoreFile rf, int64_t offset, int64_t len, TaskCompletionKey completionKey, Reference waitFor = Reference()) { - Key doneKey = wait(completionKey.get(tr, taskBucket)); - state Reference task(new Task(RestoreRangeTaskFunc::name, RestoreRangeTaskFunc::version, doneKey)); - - // Create a restore config from the current task and bind it to the new task. - wait(RestoreConfig(parentTask).toTask(tr, task)); - - Params.inputFile().set(task, rf); - Params.readOffset().set(task, offset); - Params.readLen().set(task, len); - - if (!waitFor) { - return taskBucket->addTask(tr, task); - } - - wait(waitFor->onSetAddTask(tr, taskBucket, task)); - return LiteralStringRef("OnSetAddTask"); - } - - static StringRef name; - static const uint32_t version; - StringRef getName() const { return name; }; - - Future execute(Database cx, Reference tb, Reference fb, Reference task) { return _execute(cx, tb, fb, task); }; - Future finish(Reference tr, Reference tb, Reference fb, Reference task) { return _finish(tr, tb, fb, task); }; - }; - StringRef RestoreRangeTaskFunc::name = LiteralStringRef("restore_range_data"); - const uint32_t RestoreRangeTaskFunc::version = 1; - REGISTER_TASKFUNC(RestoreRangeTaskFunc); - - struct RestoreLogDataTaskFunc : RestoreFileTaskFuncBase { - static StringRef name; - static const uint32_t version; - StringRef getName() const { return name; }; - - static struct : InputParams { - } Params; - - ACTOR static Future _execute(Database cx, Reference taskBucket, Reference futureBucket, Reference task) { - state RestoreConfig restore(task); - - state RestoreFile logFile = Params.inputFile().get(task); - state int64_t readOffset = Params.readOffset().get(task); - state int64_t readLen = Params.readLen().get(task); - - TraceEvent("FileRestoreLogStart") - .suppressFor(60) - .detail("RestoreUID", restore.getUid()) - .detail("FileName", logFile.fileName) - .detail("FileBeginVersion", logFile.version) - .detail("FileEndVersion", logFile.endVersion) - .detail("FileSize", logFile.fileSize) - .detail("ReadOffset", readOffset) - .detail("ReadLen", readLen) - .detail("TaskInstance", THIS_ADDR); - - state Reference tr( new ReadYourWritesTransaction(cx) ); - state Reference bc; - - loop { - try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - - Reference _bc = wait(restore.sourceContainer().getOrThrow(tr)); - bc = _bc; - - wait(checkTaskVersion(tr->getDatabase(), task, name, version)); - wait(taskBucket->keepRunning(tr, task)); - - break; - } catch(Error &e) { - wait(tr->onError(e)); - } - } - - state Key mutationLogPrefix = restore.mutationLogPrefix(); - state Reference inFile = wait(bc->readFile(logFile.fileName)); - state Standalone> data = wait(decodeLogFileBlock(inFile, readOffset, readLen)); + fileRange = KeyRangeRef(std::max(fileRange.begin, restoreRange.begin) + .removePrefix(removePrefix.get()) + .withPrefix(addPrefix.get()), + fileEnd); state int start = 0; state int end = data.size(); - state int dataSizeLimit = BUGGIFY ? deterministicRandom()->randomInt(256 * 1024, 10e6) : CLIENT_KNOBS->RESTORE_WRITE_TX_SIZE; + state int dataSizeLimit = + BUGGIFY ? 
deterministicRandom()->randomInt(256 * 1024, 10e6) : CLIENT_KNOBS->RESTORE_WRITE_TX_SIZE; tr->reset(); loop { try { - if(start == end) - return Void(); - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); state int i = start; state int txBytes = 0; - for(; i < end && txBytes < dataSizeLimit; ++i) { - Key k = data[i].key.withPrefix(mutationLogPrefix); - ValueRef v = data[i].value; - tr->set(k, v); - txBytes += k.expectedSize(); - txBytes += v.expectedSize(); + state int iend = start; + + // find iend that results in the desired transaction size + for (; iend < end && txBytes < dataSizeLimit; ++iend) { + txBytes += data[iend].key.expectedSize(); + txBytes += data[iend].value.expectedSize(); } - state Future checkLock = checkDatabaseLock(tr, restore.getUid()); + // Clear the range we are about to set. + // If start == 0 then use fileBegin for the start of the range, else data[start] + // If iend == end then use fileEnd for the end of the range, else data[iend] + state KeyRange trRange = KeyRangeRef( + (start == 0) ? fileRange.begin + : data[start].key.removePrefix(removePrefix.get()).withPrefix(addPrefix.get()), + (iend == end) ? fileRange.end + : data[iend].key.removePrefix(removePrefix.get()).withPrefix(addPrefix.get())); - wait(taskBucket->keepRunning(tr, task)); - wait( checkLock ); + tr->clear(trRange); + + for (; i < iend; ++i) { + tr->setOption(FDBTransactionOptions::NEXT_WRITE_NO_WRITE_CONFLICT_RANGE); + tr->set(data[i].key.removePrefix(removePrefix.get()).withPrefix(addPrefix.get()), + data[i].value); + } // Add to bytes written count restore.bytesWritten().atomicOp(tr, txBytes, MutationRef::Type::AddValue); + state Future checkLock = checkDatabaseLock(tr, restore.getUid()); + + wait(taskBucket->keepRunning(tr, task)); + + wait(checkLock); + wait(tr->commit()); - TraceEvent("FileRestoreCommittedLog") - .suppressFor(60) - .detail("RestoreUID", restore.getUid()) - .detail("FileName", logFile.fileName) - .detail("FileBeginVersion", logFile.version) - .detail("FileEndVersion", logFile.endVersion) - .detail("FileSize", logFile.fileSize) - .detail("ReadOffset", readOffset) - .detail("ReadLen", readLen) - .detail("CommitVersion", tr->getCommittedVersion()) - .detail("StartIndex", start) - .detail("EndIndex", i) - .detail("DataSize", data.size()) - .detail("Bytes", txBytes) - .detail("TaskInstance", THIS_ADDR); + TraceEvent("FileRestoreCommittedRange") + .suppressFor(60) + .detail("RestoreUID", restore.getUid()) + .detail("FileName", rangeFile.fileName) + .detail("FileVersion", rangeFile.version) + .detail("FileSize", rangeFile.fileSize) + .detail("ReadOffset", readOffset) + .detail("ReadLen", readLen) + .detail("CommitVersion", tr->getCommittedVersion()) + .detail("BeginRange", trRange.begin) + .detail("EndRange", trRange.end) + .detail("StartIndex", start) + .detail("EndIndex", i) + .detail("DataSize", data.size()) + .detail("Bytes", txBytes) + .detail("OriginalFileRange", originalFileRange) + .detail("TaskInstance", THIS_ADDR); // Commit succeeded, so advance starting point start = i; + + if (start == end) + break; tr->reset(); - } catch(Error &e) { - if(e.code() == error_code_transaction_too_large) + } catch (Error& e) { + if (e.code() == error_code_transaction_too_large) dataSizeLimit /= 2; else wait(tr->onError(e)); } } } - - ACTOR static Future _finish(Reference tr, Reference taskBucket, Reference futureBucket, Reference task) { - RestoreConfig(task).fileBlocksFinished().atomicOp(tr, 1, MutationRef::Type::AddValue); - - state 
Reference taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]); - - // TODO: Check to see if there is a leak in the FutureBucket since an invalid task (validation key fails) will never set its taskFuture. - wait(taskFuture->set(tr, taskBucket) && - taskBucket->finish(tr, task)); - - return Void(); + if (!originalFileRanges.empty()) { + if (BUGGIFY && restoreRanges.get().size() == 1) { + Params.originalFileRange().set(task, originalFileRanges[0]); + } else { + Params.originalFileRanges().set(task, originalFileRanges); + } } - - ACTOR static Future addTask(Reference tr, Reference taskBucket, Reference parentTask, RestoreFile lf, int64_t offset, int64_t len, TaskCompletionKey completionKey, Reference waitFor = Reference()) { - Key doneKey = wait(completionKey.get(tr, taskBucket)); - state Reference task(new Task(RestoreLogDataTaskFunc::name, RestoreLogDataTaskFunc::version, doneKey)); - - // Create a restore config from the current task and bind it to the new task. - wait(RestoreConfig(parentTask).toTask(tr, task)); - Params.inputFile().set(task, lf); - Params.readOffset().set(task, offset); - Params.readLen().set(task, len); - - if (!waitFor) { - return taskBucket->addTask(tr, task); - } - - wait(waitFor->onSetAddTask(tr, taskBucket, task)); - return LiteralStringRef("OnSetAddTask"); - } - - Future execute(Database cx, Reference tb, Reference fb, Reference task) { return _execute(cx, tb, fb, task); }; - Future finish(Reference tr, Reference tb, Reference fb, Reference task) { return _finish(tr, tb, fb, task); }; - }; - StringRef RestoreLogDataTaskFunc::name = LiteralStringRef("restore_log_data"); - const uint32_t RestoreLogDataTaskFunc::version = 1; - REGISTER_TASKFUNC(RestoreLogDataTaskFunc); - - struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { - static StringRef name; - static const uint32_t version; - StringRef getName() const { return name; }; - - static struct { - static TaskParam beginVersion() { return LiteralStringRef(__FUNCTION__); } - static TaskParam beginFile() { return LiteralStringRef(__FUNCTION__); } - static TaskParam beginBlock() { return LiteralStringRef(__FUNCTION__); } - static TaskParam batchSize() { return LiteralStringRef(__FUNCTION__); } - static TaskParam remainingInBatch() { return LiteralStringRef(__FUNCTION__); } - } Params; - - ACTOR static Future _finish(Reference tr, Reference taskBucket, Reference futureBucket, Reference task) { - state RestoreConfig restore(task); - - state Version beginVersion = Params.beginVersion().get(task); - state Reference onDone = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]); - - state int64_t remainingInBatch = Params.remainingInBatch().get(task); - state bool addingToExistingBatch = remainingInBatch > 0; - state Version restoreVersion; - - wait(store(restoreVersion, restore.restoreVersion().getOrThrow(tr)) - && checkTaskVersion(tr->getDatabase(), task, name, version)); - - // If not adding to an existing batch then update the apply mutations end version so the mutations from the - // previous batch can be applied. Only do this once beginVersion is > 0 (it will be 0 for the initial dispatch). - if(!addingToExistingBatch && beginVersion > 0) { - restore.setApplyEndVersion(tr, std::min(beginVersion, restoreVersion + 1)); - } - - // The applyLag must be retrieved AFTER potentially updating the apply end version. 
- state int64_t applyLag = wait(restore.getApplyVersionLag(tr)); - state int64_t batchSize = Params.batchSize().get(task); - - // If starting a new batch and the apply lag is too large then re-queue and wait - if(!addingToExistingBatch && applyLag > (BUGGIFY ? 1 : CLIENT_KNOBS->CORE_VERSIONSPERSECOND * 300)) { - // Wait a small amount of time and then re-add this same task. - wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY)); - wait(success(RestoreDispatchTaskFunc::addTask(tr, taskBucket, task, beginVersion, "", 0, batchSize, remainingInBatch))); - - TraceEvent("FileRestoreDispatch") - .detail("RestoreUID", restore.getUid()) - .detail("BeginVersion", beginVersion) - .detail("ApplyLag", applyLag) - .detail("BatchSize", batchSize) - .detail("Decision", "too_far_behind") - .detail("TaskInstance", THIS_ADDR); - - wait(taskBucket->finish(tr, task)); - return Void(); - } - - state std::string beginFile = Params.beginFile().getOrDefault(task); - // Get a batch of files. We're targeting batchSize blocks being dispatched so query for batchSize files (each of which is 0 or more blocks). - state int taskBatchSize = BUGGIFY ? 1 : CLIENT_KNOBS->RESTORE_DISPATCH_ADDTASK_SIZE; - state RestoreConfig::FileSetT::Values files = wait(restore.fileSet().getRange(tr, {beginVersion, beginFile}, {}, taskBatchSize)); - - // allPartsDone will be set once all block tasks in the current batch are finished. - state Reference allPartsDone; - - // If adding to existing batch then join the new block tasks to the existing batch future - if(addingToExistingBatch) { - Key fKey = wait(restore.batchFuture().getD(tr)); - allPartsDone = Reference(new TaskFuture(futureBucket, fKey)); - } - else { - // Otherwise create a new future for the new batch - allPartsDone = futureBucket->future(tr); - restore.batchFuture().set(tr, allPartsDone->pack()); - // Set batch quota remaining to batch size - remainingInBatch = batchSize; - } - - // If there were no files to load then this batch is done and restore is almost done. - if(files.size() == 0) { - // If adding to existing batch then blocks could be in progress so create a new Dispatch task that waits for them to finish - if(addingToExistingBatch) { - // Setting next begin to restoreVersion + 1 so that any files in the file map at the restore version won't be dispatched again. 
- wait(success(RestoreDispatchTaskFunc::addTask(tr, taskBucket, task, restoreVersion + 1, "", 0, batchSize, 0, TaskCompletionKey::noSignal(), allPartsDone))); - - TraceEvent("FileRestoreDispatch") - .detail("RestoreUID", restore.getUid()) - .detail("BeginVersion", beginVersion) - .detail("BeginFile", Params.beginFile().get(task)) - .detail("BeginBlock", Params.beginBlock().get(task)) - .detail("RestoreVersion", restoreVersion) - .detail("ApplyLag", applyLag) - .detail("Decision", "end_of_final_batch") - .detail("TaskInstance", THIS_ADDR); - } - else if(beginVersion < restoreVersion) { - // If beginVersion is less than restoreVersion then do one more dispatch task to get there - wait(success(RestoreDispatchTaskFunc::addTask(tr, taskBucket, task, restoreVersion, "", 0, batchSize))); - - TraceEvent("FileRestoreDispatch") - .detail("RestoreUID", restore.getUid()) - .detail("BeginVersion", beginVersion) - .detail("BeginFile", Params.beginFile().get(task)) - .detail("BeginBlock", Params.beginBlock().get(task)) - .detail("RestoreVersion", restoreVersion) - .detail("ApplyLag", applyLag) - .detail("Decision", "apply_to_restore_version") - .detail("TaskInstance", THIS_ADDR); - } - else if(applyLag == 0) { - // If apply lag is 0 then we are done so create the completion task - wait(success(RestoreCompleteTaskFunc::addTask(tr, taskBucket, task, TaskCompletionKey::noSignal()))); - - TraceEvent("FileRestoreDispatch") - .detail("RestoreUID", restore.getUid()) - .detail("BeginVersion", beginVersion) - .detail("BeginFile", Params.beginFile().get(task)) - .detail("BeginBlock", Params.beginBlock().get(task)) - .detail("ApplyLag", applyLag) - .detail("Decision", "restore_complete") - .detail("TaskInstance", THIS_ADDR); - } else { - // Applying of mutations is not yet finished so wait a small amount of time and then re-add this same task. - wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY)); - wait(success(RestoreDispatchTaskFunc::addTask(tr, taskBucket, task, beginVersion, "", 0, batchSize))); - - TraceEvent("FileRestoreDispatch") - .detail("RestoreUID", restore.getUid()) - .detail("BeginVersion", beginVersion) - .detail("ApplyLag", applyLag) - .detail("Decision", "apply_still_behind") - .detail("TaskInstance", THIS_ADDR); - } - - // If adding to existing batch then task is joined with a batch future so set done future - // Note that this must be done after joining at least one task with the batch future in case all other blockers already finished. - Future setDone = addingToExistingBatch ? onDone->set(tr, taskBucket) : Void(); - - wait(taskBucket->finish(tr, task) && setDone); - return Void(); - } - - // Start moving through the file list and queuing up blocks. Only queue up to RESTORE_DISPATCH_ADDTASK_SIZE blocks per Dispatch task - // and target batchSize total per batch but a batch must end on a complete version boundary so exceed the limit if necessary - // to reach the end of a version of files. 
- state std::vector> addTaskFutures; - state Version endVersion = files[0].version; - state int blocksDispatched = 0; - state int64_t beginBlock = Params.beginBlock().getOrDefault(task); - state int i = 0; - - for(; i < files.size(); ++i) { - RestoreConfig::RestoreFile &f = files[i]; - - // Here we are "between versions" (prior to adding the first block of the first file of a new version) so this is an opportunity - // to end the current dispatch batch (which must end on a version boundary) if the batch size has been reached or exceeded - if(f.version != endVersion && remainingInBatch <= 0) { - // Next start will be at the first version after endVersion at the first file first block - ++endVersion; - beginFile = ""; - beginBlock = 0; - break; - } - - // Set the starting point for the next task in case we stop inside this file - endVersion = f.version; - beginFile = f.fileName; - - state int64_t j = beginBlock * f.blockSize; - // For each block of the file - for(; j < f.fileSize; j += f.blockSize) { - // Stop if we've reached the addtask limit - if(blocksDispatched == taskBatchSize) - break; - - if(f.isRange) { - addTaskFutures.push_back(RestoreRangeTaskFunc::addTask(tr, taskBucket, task, - f, j, std::min(f.blockSize, f.fileSize - j), - TaskCompletionKey::joinWith(allPartsDone))); - } - else { - addTaskFutures.push_back(RestoreLogDataTaskFunc::addTask(tr, taskBucket, task, - f, j, std::min(f.blockSize, f.fileSize - j), - TaskCompletionKey::joinWith(allPartsDone))); - } - - // Increment beginBlock for the file and total blocks dispatched for this task - ++beginBlock; - ++blocksDispatched; - --remainingInBatch; - } - - // Stop if we've reached the addtask limit - if(blocksDispatched == taskBatchSize) - break; - - // We just completed an entire file so the next task should start at the file after this one within endVersion (or later) - // if this iteration ends up being the last for this task - beginFile = beginFile + '\x00'; - beginBlock = 0; - - TraceEvent("FileRestoreDispatchedFile") - .suppressFor(60) - .detail("RestoreUID", restore.getUid()) - .detail("FileName", f.fileName) - .detail("TaskInstance", THIS_ADDR); - } - - // If no blocks were dispatched then the next dispatch task should run now and be joined with the allPartsDone future - if(blocksDispatched == 0) { - std::string decision; - - // If no files were dispatched either then the batch size wasn't large enough to catch all of the files at the next lowest non-dispatched - // version, so increase the batch size. - if(i == 0) { - batchSize *= 2; - decision = "increased_batch_size"; - } - else - decision = "all_files_were_empty"; - - TraceEvent("FileRestoreDispatch") - .detail("RestoreUID", restore.getUid()) - .detail("BeginVersion", beginVersion) - .detail("BeginFile", Params.beginFile().get(task)) - .detail("BeginBlock", Params.beginBlock().get(task)) - .detail("EndVersion", endVersion) - .detail("ApplyLag", applyLag) - .detail("BatchSize", batchSize) - .detail("Decision", decision) - .detail("TaskInstance", THIS_ADDR) - .detail("RemainingInBatch", remainingInBatch); - - wait(success(RestoreDispatchTaskFunc::addTask(tr, taskBucket, task, endVersion, beginFile, beginBlock, batchSize, remainingInBatch, TaskCompletionKey::joinWith((allPartsDone))))); - - // If adding to existing batch then task is joined with a batch future so set done future. - // Note that this must be done after joining at least one task with the batch future in case all other blockers already finished. - Future setDone = addingToExistingBatch ? 
onDone->set(tr, taskBucket) : Void(); - - wait(setDone && taskBucket->finish(tr, task)); - - return Void(); - } - - // Increment the number of blocks dispatched in the restore config - restore.filesBlocksDispatched().atomicOp(tr, blocksDispatched, MutationRef::Type::AddValue); - - // If beginFile is not empty then we had to stop in the middle of a version (possibly within a file) so we cannot end - // the batch here because we do not know if we got all of the files and blocks from the last version queued, so - // make sure remainingInBatch is at least 1. - if(!beginFile.empty()) - remainingInBatch = std::max(1, remainingInBatch); - - // If more blocks need to be dispatched in this batch then add a follow-on task that is part of the allPartsDone group which will won't wait - // to run and will add more block tasks. - if(remainingInBatch > 0) - addTaskFutures.push_back(RestoreDispatchTaskFunc::addTask(tr, taskBucket, task, endVersion, beginFile, beginBlock, batchSize, remainingInBatch, TaskCompletionKey::joinWith(allPartsDone))); - else // Otherwise, add a follow-on task to continue after all previously dispatched blocks are done - addTaskFutures.push_back(RestoreDispatchTaskFunc::addTask(tr, taskBucket, task, endVersion, beginFile, beginBlock, batchSize, 0, TaskCompletionKey::noSignal(), allPartsDone)); - - wait(waitForAll(addTaskFutures)); - - // If adding to existing batch then task is joined with a batch future so set done future. - Future setDone = addingToExistingBatch ? onDone->set(tr, taskBucket) : Void(); - - wait(setDone && taskBucket->finish(tr, task)); - - TraceEvent("FileRestoreDispatch") - .detail("RestoreUID", restore.getUid()) - .detail("BeginVersion", beginVersion) - .detail("BeginFile", Params.beginFile().get(task)) - .detail("BeginBlock", Params.beginBlock().get(task)) - .detail("EndVersion", endVersion) - .detail("ApplyLag", applyLag) - .detail("BatchSize", batchSize) - .detail("Decision", "dispatched_files") - .detail("FilesDispatched", i) - .detail("BlocksDispatched", blocksDispatched) - .detail("TaskInstance", THIS_ADDR) - .detail("RemainingInBatch", remainingInBatch); - - return Void(); - } - - ACTOR static Future addTask(Reference tr, Reference taskBucket, Reference parentTask, Version beginVersion, std::string beginFile, int64_t beginBlock, int64_t batchSize, int64_t remainingInBatch = 0, TaskCompletionKey completionKey = TaskCompletionKey::noSignal(), Reference waitFor = Reference()) { - Key doneKey = wait(completionKey.get(tr, taskBucket)); - - // Use high priority for dispatch tasks that have to queue more blocks for the current batch - unsigned int priority = (remainingInBatch > 0) ? 
1 : 0; - state Reference task(new Task(RestoreDispatchTaskFunc::name, RestoreDispatchTaskFunc::version, doneKey, priority)); - - // Create a config from the parent task and bind it to the new task - wait(RestoreConfig(parentTask).toTask(tr, task)); - Params.beginVersion().set(task, beginVersion); - Params.batchSize().set(task, batchSize); - Params.remainingInBatch().set(task, remainingInBatch); - Params.beginBlock().set(task, beginBlock); - Params.beginFile().set(task, beginFile); - - if (!waitFor) { - return taskBucket->addTask(tr, task); - } - - wait(waitFor->onSetAddTask(tr, taskBucket, task)); - return LiteralStringRef("OnSetAddTask"); - } - - Future execute(Database cx, Reference tb, Reference fb, Reference task) { return Void(); }; - Future finish(Reference tr, Reference tb, Reference fb, Reference task) { return _finish(tr, tb, fb, task); }; - }; - StringRef RestoreDispatchTaskFunc::name = LiteralStringRef("restore_dispatch"); - const uint32_t RestoreDispatchTaskFunc::version = 1; - REGISTER_TASKFUNC(RestoreDispatchTaskFunc); - - ACTOR Future restoreStatus(Reference tr, Key tagName) { - tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - - state std::vector tags; - if(tagName.size() == 0) { - std::vector t = wait(getAllRestoreTags(tr)); - tags = t; - } - else - tags.push_back(makeRestoreTag(tagName.toString())); - - state std::string result; - state int i = 0; - - for(; i < tags.size(); ++i) { - UidAndAbortedFlagT u = wait(tags[i].getD(tr)); - std::string s = wait(RestoreConfig(u.first).getFullStatus(tr)); - result.append(s); - result.append("\n\n"); - } - - return result; + return Void(); } - ACTOR Future abortRestore(Reference tr, Key tagName) { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - tr->setOption(FDBTransactionOptions::COMMIT_ON_FIRST_PROXY); + ACTOR static Future _finish(Reference tr, + Reference taskBucket, + Reference futureBucket, + Reference task) { + state RestoreConfig restore(task); + restore.fileBlocksFinished().atomicOp(tr, 1, MutationRef::Type::AddValue); - state KeyBackedTag tag = makeRestoreTag(tagName.toString()); - state Optional current = wait(tag.get(tr)); - if(!current.present()) - return ERestoreState::UNITIALIZED; + // Update the KV range map if originalFileRange is set + std::vector> updateMap; + std::vector ranges = Params.getOriginalFileRanges(task); + for (auto& range : ranges) { + Value versionEncoded = BinaryWriter::toValue(Params.inputFile().get(task).version, Unversioned()); + updateMap.push_back(krmSetRange(tr, restore.applyMutationsMapPrefix(), range, versionEncoded)); + } - state RestoreConfig restore(current.get().first); + state Reference taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]); + wait(taskFuture->set(tr, taskBucket) && taskBucket->finish(tr, task) && waitForAll(updateMap)); - state ERestoreState status = wait(restore.stateEnum().getD(tr)); - state bool runnable = wait(restore.isRunnable(tr)); - - if (!runnable) - return status; - - restore.stateEnum().set(tr, ERestoreState::ABORTED); - - // Clear all of the ApplyMutations stuff - restore.clearApplyMutationsKeys(tr); - - // Cancel the backup tasks on this tag - wait(tag.cancel(tr)); - wait(unlockDatabase(tr, current.get().first)); - return ERestoreState::ABORTED; + return Void(); } - ACTOR Future abortRestore(Database cx, Key tagName) { - state Reference tr = 
Reference( new ReadYourWritesTransaction(cx) ); + ACTOR static Future addTask(Reference tr, + Reference taskBucket, + Reference parentTask, + RestoreFile rf, + int64_t offset, + int64_t len, + TaskCompletionKey completionKey, + Reference waitFor = Reference()) { + Key doneKey = wait(completionKey.get(tr, taskBucket)); + state Reference task(new Task(RestoreRangeTaskFunc::name, RestoreRangeTaskFunc::version, doneKey)); - loop { - try { - ERestoreState estate = wait( abortRestore(tr, tagName) ); - if(estate != ERestoreState::ABORTED) { - return estate; - } - wait(tr->commit()); - break; - } catch( Error &e ) { - wait( tr->onError(e) ); - } + // Create a restore config from the current task and bind it to the new task. + wait(RestoreConfig(parentTask).toTask(tr, task)); + + Params.inputFile().set(task, rf); + Params.readOffset().set(task, offset); + Params.readLen().set(task, len); + + if (!waitFor) { + return taskBucket->addTask(tr, task); } - - tr = Reference( new ReadYourWritesTransaction(cx) ); - //Commit a dummy transaction before returning success, to ensure the mutation applier has stopped submitting mutations + wait(waitFor->onSetAddTask(tr, taskBucket, task)); + return LiteralStringRef("OnSetAddTask"); + } + + static StringRef name; + static const uint32_t version; + StringRef getName() const { return name; }; + + Future execute(Database cx, Reference tb, Reference fb, Reference task) { + return _execute(cx, tb, fb, task); + }; + Future finish(Reference tr, + Reference tb, + Reference fb, + Reference task) { + return _finish(tr, tb, fb, task); + }; +}; +StringRef RestoreRangeTaskFunc::name = LiteralStringRef("restore_range_data"); +const uint32_t RestoreRangeTaskFunc::version = 1; +REGISTER_TASKFUNC(RestoreRangeTaskFunc); + +struct RestoreLogDataTaskFunc : RestoreFileTaskFuncBase { + static StringRef name; + static const uint32_t version; + StringRef getName() const { return name; }; + + static struct : InputParams { + } Params; + + ACTOR static Future _execute(Database cx, + Reference taskBucket, + Reference futureBucket, + Reference task) { + state RestoreConfig restore(task); + + state RestoreFile logFile = Params.inputFile().get(task); + state int64_t readOffset = Params.readOffset().get(task); + state int64_t readLen = Params.readLen().get(task); + + TraceEvent("FileRestoreLogStart") + .suppressFor(60) + .detail("RestoreUID", restore.getUid()) + .detail("FileName", logFile.fileName) + .detail("FileBeginVersion", logFile.version) + .detail("FileEndVersion", logFile.endVersion) + .detail("FileSize", logFile.fileSize) + .detail("ReadOffset", readOffset) + .detail("ReadLen", readLen) + .detail("TaskInstance", THIS_ADDR); + + state Reference tr(new ReadYourWritesTransaction(cx)); + state Reference bc; + loop { try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); - tr->setOption(FDBTransactionOptions::COMMIT_ON_FIRST_PROXY); - tr->addReadConflictRange(singleKeyRange(KeyRef())); - tr->addWriteConflictRange(singleKeyRange(KeyRef())); + + Reference _bc = wait(restore.sourceContainer().getOrThrow(tr)); + bc = _bc; + + wait(checkTaskVersion(tr->getDatabase(), task, name, version)); + wait(taskBucket->keepRunning(tr, task)); + + break; + } catch (Error& e) { + wait(tr->onError(e)); + } + } + + state Key mutationLogPrefix = restore.mutationLogPrefix(); + state Reference inFile = wait(bc->readFile(logFile.fileName)); + state Standalone> data = wait(decodeLogFileBlock(inFile, readOffset, readLen)); + + state int start = 0; + state 
int end = data.size(); + state int dataSizeLimit = + BUGGIFY ? deterministicRandom()->randomInt(256 * 1024, 10e6) : CLIENT_KNOBS->RESTORE_WRITE_TX_SIZE; + + tr->reset(); + loop { + try { + if (start == end) + return Void(); + + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + state int i = start; + state int txBytes = 0; + for (; i < end && txBytes < dataSizeLimit; ++i) { + Key k = data[i].key.withPrefix(mutationLogPrefix); + ValueRef v = data[i].value; + tr->set(k, v); + txBytes += k.expectedSize(); + txBytes += v.expectedSize(); + } + + state Future checkLock = checkDatabaseLock(tr, restore.getUid()); + + wait(taskBucket->keepRunning(tr, task)); + wait(checkLock); + + // Add to bytes written count + restore.bytesWritten().atomicOp(tr, txBytes, MutationRef::Type::AddValue); + wait(tr->commit()); - return ERestoreState::ABORTED; - } catch( Error &e ) { - wait( tr->onError(e) ); + + TraceEvent("FileRestoreCommittedLog") + .suppressFor(60) + .detail("RestoreUID", restore.getUid()) + .detail("FileName", logFile.fileName) + .detail("FileBeginVersion", logFile.version) + .detail("FileEndVersion", logFile.endVersion) + .detail("FileSize", logFile.fileSize) + .detail("ReadOffset", readOffset) + .detail("ReadLen", readLen) + .detail("CommitVersion", tr->getCommittedVersion()) + .detail("StartIndex", start) + .detail("EndIndex", i) + .detail("DataSize", data.size()) + .detail("Bytes", txBytes) + .detail("TaskInstance", THIS_ADDR); + + // Commit succeeded, so advance starting point + start = i; + tr->reset(); + } catch (Error& e) { + if (e.code() == error_code_transaction_too_large) + dataSizeLimit /= 2; + else + wait(tr->onError(e)); } } } - struct StartFullRestoreTaskFunc : RestoreTaskFuncBase { - static StringRef name; - static const uint32_t version; + ACTOR static Future _finish(Reference tr, + Reference taskBucket, + Reference futureBucket, + Reference task) { + RestoreConfig(task).fileBlocksFinished().atomicOp(tr, 1, MutationRef::Type::AddValue); - static struct { - static TaskParam firstVersion() { return LiteralStringRef(__FUNCTION__); } - } Params; + state Reference taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]); - ACTOR static Future _execute(Database cx, Reference taskBucket, Reference futureBucket, Reference task) { - state Reference tr(new ReadYourWritesTransaction(cx)); - state RestoreConfig restore(task); - state Version restoreVersion; - state Reference bc; + // TODO: Check to see if there is a leak in the FutureBucket since an invalid task (validation key fails) will + // never set its taskFuture. 
+ wait(taskFuture->set(tr, taskBucket) && taskBucket->finish(tr, task)); - loop { - try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); + return Void(); + } - wait(checkTaskVersion(tr->getDatabase(), task, name, version)); - Version _restoreVersion = wait(restore.restoreVersion().getOrThrow(tr)); - restoreVersion = _restoreVersion; - wait(taskBucket->keepRunning(tr, task)); + ACTOR static Future addTask(Reference tr, + Reference taskBucket, + Reference parentTask, + RestoreFile lf, + int64_t offset, + int64_t len, + TaskCompletionKey completionKey, + Reference waitFor = Reference()) { + Key doneKey = wait(completionKey.get(tr, taskBucket)); + state Reference task(new Task(RestoreLogDataTaskFunc::name, RestoreLogDataTaskFunc::version, doneKey)); - ERestoreState oldState = wait(restore.stateEnum().getD(tr)); - if(oldState != ERestoreState::QUEUED && oldState != ERestoreState::STARTING) { - wait(restore.logError(cx, restore_error(), format("StartFullRestore: Encountered unexpected state(%d)", oldState), THIS)); - return Void(); - } - restore.stateEnum().set(tr, ERestoreState::STARTING); - restore.fileSet().clear(tr); - restore.fileBlockCount().clear(tr); - restore.fileCount().clear(tr); - Reference _bc = wait(restore.sourceContainer().getOrThrow(tr)); - bc = _bc; + // Create a restore config from the current task and bind it to the new task. + wait(RestoreConfig(parentTask).toTask(tr, task)); + Params.inputFile().set(task, lf); + Params.readOffset().set(task, offset); + Params.readLen().set(task, len); - wait(tr->commit()); - break; - } catch(Error &e) { - wait(tr->onError(e)); - } - } - - tr->reset(); - loop { - try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - Version destVersion = wait(tr->getReadVersion()); - TraceEvent("FileRestoreVersionUpgrade").detail("RestoreVersion", restoreVersion).detail("Dest", destVersion); - if (destVersion <= restoreVersion) { - TEST(true); // Forcing restored cluster to higher version - tr->set(minRequiredCommitVersionKey, BinaryWriter::toValue(restoreVersion+1, Unversioned())); - wait(tr->commit()); - } else { - break; - } - } catch( Error &e ) { - wait(tr->onError(e)); - } - } - - Optional restorable = wait(bc->getRestoreSet(restoreVersion)); - - if(!restorable.present()) - throw restore_missing_data(); - - // First version for which log data should be applied - Params.firstVersion().set(task, restorable.get().snapshot.beginVersion); - - // Convert the two lists in restorable (logs and ranges) to a single list of RestoreFiles. - // Order does not matter, they will be put in order when written to the restoreFileMap below. 
- state std::vector files; - - for(const RangeFile &f : restorable.get().ranges) { - files.push_back({f.version, f.fileName, true, f.blockSize, f.fileSize}); - } - for(const LogFile &f : restorable.get().logs) { - files.push_back({f.beginVersion, f.fileName, false, f.blockSize, f.fileSize, f.endVersion}); - } - - state std::vector::iterator start = files.begin(); - state std::vector::iterator end = files.end(); - - tr->reset(); - while(start != end) { - try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - - wait(taskBucket->keepRunning(tr, task)); - - state std::vector::iterator i = start; - - state int txBytes = 0; - state int nFileBlocks = 0; - state int nFiles = 0; - auto fileSet = restore.fileSet(); - for(; i != end && txBytes < 1e6; ++i) { - txBytes += fileSet.insert(tr, *i); - nFileBlocks += (i->fileSize + i->blockSize - 1) / i->blockSize; - ++nFiles; - } - - // Increment counts - restore.fileCount().atomicOp(tr, nFiles, MutationRef::Type::AddValue); - restore.fileBlockCount().atomicOp(tr, nFileBlocks, MutationRef::Type::AddValue); - - wait(tr->commit()); - - TraceEvent("FileRestoreLoadedFiles") - .detail("RestoreUID", restore.getUid()) - .detail("FileCount", nFiles) - .detail("FileBlockCount", nFileBlocks) - .detail("TransactionBytes", txBytes) - .detail("TaskInstance", THIS_ADDR); - - start = i; - tr->reset(); - } catch(Error &e) { - wait(tr->onError(e)); - } - } - - return Void(); + if (!waitFor) { + return taskBucket->addTask(tr, task); } - ACTOR static Future _finish(Reference tr, Reference taskBucket, Reference futureBucket, Reference task) { - state RestoreConfig restore(task); + wait(waitFor->onSetAddTask(tr, taskBucket, task)); + return LiteralStringRef("OnSetAddTask"); + } - state Version firstVersion = Params.firstVersion().getOrDefault(task, invalidVersion); - if(firstVersion == invalidVersion) { - wait(restore.logError(tr->getDatabase(), restore_missing_data(), "StartFullRestore: The backup had no data.", THIS)); - std::string tag = wait(restore.tag().getD(tr)); - wait(success(abortRestore(tr, StringRef(tag)))); - return Void(); - } + Future execute(Database cx, Reference tb, Reference fb, Reference task) { + return _execute(cx, tb, fb, task); + }; + Future finish(Reference tr, + Reference tb, + Reference fb, + Reference task) { + return _finish(tr, tb, fb, task); + }; +}; +StringRef RestoreLogDataTaskFunc::name = LiteralStringRef("restore_log_data"); +const uint32_t RestoreLogDataTaskFunc::version = 1; +REGISTER_TASKFUNC(RestoreLogDataTaskFunc); - restore.stateEnum().set(tr, ERestoreState::RUNNING); +struct RestoreDispatchTaskFunc : RestoreTaskFuncBase { + static StringRef name; + static const uint32_t version; + StringRef getName() const { return name; }; - // Set applyMutation versions - restore.setApplyBeginVersion(tr, firstVersion); - restore.setApplyEndVersion(tr, firstVersion); + static struct { + static TaskParam beginVersion() { return LiteralStringRef(__FUNCTION__); } + static TaskParam beginFile() { return LiteralStringRef(__FUNCTION__); } + static TaskParam beginBlock() { return LiteralStringRef(__FUNCTION__); } + static TaskParam batchSize() { return LiteralStringRef(__FUNCTION__); } + static TaskParam remainingInBatch() { return LiteralStringRef(__FUNCTION__); } + } Params; - // Apply range data and log data in order - wait(success(RestoreDispatchTaskFunc::addTask(tr, taskBucket, task, 0, "", 0, CLIENT_KNOBS->RESTORE_DISPATCH_BATCH_SIZE))); + ACTOR static Future _finish(Reference tr, + 
Reference taskBucket, + Reference futureBucket, + Reference task) { + state RestoreConfig restore(task); + + state Version beginVersion = Params.beginVersion().get(task); + state Reference onDone = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]); + + state int64_t remainingInBatch = Params.remainingInBatch().get(task); + state bool addingToExistingBatch = remainingInBatch > 0; + state Version restoreVersion; + + wait(store(restoreVersion, restore.restoreVersion().getOrThrow(tr)) && + checkTaskVersion(tr->getDatabase(), task, name, version)); + + // If not adding to an existing batch then update the apply mutations end version so the mutations from the + // previous batch can be applied. Only do this once beginVersion is > 0 (it will be 0 for the initial + // dispatch). + if (!addingToExistingBatch && beginVersion > 0) { + restore.setApplyEndVersion(tr, std::min(beginVersion, restoreVersion + 1)); + } + + // The applyLag must be retrieved AFTER potentially updating the apply end version. + state int64_t applyLag = wait(restore.getApplyVersionLag(tr)); + state int64_t batchSize = Params.batchSize().get(task); + + // If starting a new batch and the apply lag is too large then re-queue and wait + if (!addingToExistingBatch && applyLag > (BUGGIFY ? 1 : CLIENT_KNOBS->CORE_VERSIONSPERSECOND * 300)) { + // Wait a small amount of time and then re-add this same task. + wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY)); + wait(success(RestoreDispatchTaskFunc::addTask( + tr, taskBucket, task, beginVersion, "", 0, batchSize, remainingInBatch))); + + TraceEvent("FileRestoreDispatch") + .detail("RestoreUID", restore.getUid()) + .detail("BeginVersion", beginVersion) + .detail("ApplyLag", applyLag) + .detail("BatchSize", batchSize) + .detail("Decision", "too_far_behind") + .detail("TaskInstance", THIS_ADDR); wait(taskBucket->finish(tr, task)); return Void(); } - ACTOR static Future addTask(Reference tr, Reference taskBucket, UID uid, TaskCompletionKey completionKey, Reference waitFor = Reference()) - { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); + state std::string beginFile = Params.beginFile().getOrDefault(task); + // Get a batch of files. We're targeting batchSize blocks being dispatched so query for batchSize files (each + // of which is 0 or more blocks). + state int taskBatchSize = BUGGIFY ? 1 : CLIENT_KNOBS->RESTORE_DISPATCH_ADDTASK_SIZE; + state RestoreConfig::FileSetT::Values files = + wait(restore.fileSet().getRange(tr, { beginVersion, beginFile }, {}, taskBatchSize)); - Key doneKey = wait(completionKey.get(tr, taskBucket)); - state Reference task(new Task(StartFullRestoreTaskFunc::name, StartFullRestoreTaskFunc::version, doneKey)); + // allPartsDone will be set once all block tasks in the current batch are finished. 
+ state Reference allPartsDone; - state RestoreConfig restore(uid); - // Bind the restore config to the new task - wait(restore.toTask(tr, task)); - - if (!waitFor) { - return taskBucket->addTask(tr, task); - } - - wait(waitFor->onSetAddTask(tr, taskBucket, task)); - return LiteralStringRef("OnSetAddTask"); + // If adding to existing batch then join the new block tasks to the existing batch future + if (addingToExistingBatch) { + Key fKey = wait(restore.batchFuture().getD(tr)); + allPartsDone = Reference(new TaskFuture(futureBucket, fKey)); + } else { + // Otherwise create a new future for the new batch + allPartsDone = futureBucket->future(tr); + restore.batchFuture().set(tr, allPartsDone->pack()); + // Set batch quota remaining to batch size + remainingInBatch = batchSize; } - StringRef getName() const { return name; }; + // If there were no files to load then this batch is done and restore is almost done. + if (files.size() == 0) { + // If adding to existing batch then blocks could be in progress so create a new Dispatch task that waits for + // them to finish + if (addingToExistingBatch) { + // Setting next begin to restoreVersion + 1 so that any files in the file map at the restore version + // won't be dispatched again. + wait(success(RestoreDispatchTaskFunc::addTask(tr, + taskBucket, + task, + restoreVersion + 1, + "", + 0, + batchSize, + 0, + TaskCompletionKey::noSignal(), + allPartsDone))); - Future execute(Database cx, Reference tb, Reference fb, Reference task) { return _execute(cx, tb, fb, task); }; - Future finish(Reference tr, Reference tb, Reference fb, Reference task) { return _finish(tr, tb, fb, task); }; + TraceEvent("FileRestoreDispatch") + .detail("RestoreUID", restore.getUid()) + .detail("BeginVersion", beginVersion) + .detail("BeginFile", Params.beginFile().get(task)) + .detail("BeginBlock", Params.beginBlock().get(task)) + .detail("RestoreVersion", restoreVersion) + .detail("ApplyLag", applyLag) + .detail("Decision", "end_of_final_batch") + .detail("TaskInstance", THIS_ADDR); + } else if (beginVersion < restoreVersion) { + // If beginVersion is less than restoreVersion then do one more dispatch task to get there + wait(success(RestoreDispatchTaskFunc::addTask(tr, taskBucket, task, restoreVersion, "", 0, batchSize))); + + TraceEvent("FileRestoreDispatch") + .detail("RestoreUID", restore.getUid()) + .detail("BeginVersion", beginVersion) + .detail("BeginFile", Params.beginFile().get(task)) + .detail("BeginBlock", Params.beginBlock().get(task)) + .detail("RestoreVersion", restoreVersion) + .detail("ApplyLag", applyLag) + .detail("Decision", "apply_to_restore_version") + .detail("TaskInstance", THIS_ADDR); + } else if (applyLag == 0) { + // If apply lag is 0 then we are done so create the completion task + wait(success(RestoreCompleteTaskFunc::addTask(tr, taskBucket, task, TaskCompletionKey::noSignal()))); + + TraceEvent("FileRestoreDispatch") + .detail("RestoreUID", restore.getUid()) + .detail("BeginVersion", beginVersion) + .detail("BeginFile", Params.beginFile().get(task)) + .detail("BeginBlock", Params.beginBlock().get(task)) + .detail("ApplyLag", applyLag) + .detail("Decision", "restore_complete") + .detail("TaskInstance", THIS_ADDR); + } else { + // Applying of mutations is not yet finished so wait a small amount of time and then re-add this same + // task. 
+ wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY)); + wait(success(RestoreDispatchTaskFunc::addTask(tr, taskBucket, task, beginVersion, "", 0, batchSize))); + + TraceEvent("FileRestoreDispatch") + .detail("RestoreUID", restore.getUid()) + .detail("BeginVersion", beginVersion) + .detail("ApplyLag", applyLag) + .detail("Decision", "apply_still_behind") + .detail("TaskInstance", THIS_ADDR); + } + + // If adding to existing batch then task is joined with a batch future so set done future + // Note that this must be done after joining at least one task with the batch future in case all other + // blockers already finished. + Future setDone = addingToExistingBatch ? onDone->set(tr, taskBucket) : Void(); + + wait(taskBucket->finish(tr, task) && setDone); + return Void(); + } + + // Start moving through the file list and queuing up blocks. Only queue up to RESTORE_DISPATCH_ADDTASK_SIZE + // blocks per Dispatch task and target batchSize total per batch but a batch must end on a complete version + // boundary so exceed the limit if necessary to reach the end of a version of files. + state std::vector> addTaskFutures; + state Version endVersion = files[0].version; + state int blocksDispatched = 0; + state int64_t beginBlock = Params.beginBlock().getOrDefault(task); + state int i = 0; + + for (; i < files.size(); ++i) { + RestoreConfig::RestoreFile& f = files[i]; + + // Here we are "between versions" (prior to adding the first block of the first file of a new version) so + // this is an opportunity to end the current dispatch batch (which must end on a version boundary) if the + // batch size has been reached or exceeded + if (f.version != endVersion && remainingInBatch <= 0) { + // Next start will be at the first version after endVersion at the first file first block + ++endVersion; + beginFile = ""; + beginBlock = 0; + break; + } + + // Set the starting point for the next task in case we stop inside this file + endVersion = f.version; + beginFile = f.fileName; + + state int64_t j = beginBlock * f.blockSize; + // For each block of the file + for (; j < f.fileSize; j += f.blockSize) { + // Stop if we've reached the addtask limit + if (blocksDispatched == taskBatchSize) + break; + + if (f.isRange) { + addTaskFutures.push_back( + RestoreRangeTaskFunc::addTask(tr, + taskBucket, + task, + f, + j, + std::min(f.blockSize, f.fileSize - j), + TaskCompletionKey::joinWith(allPartsDone))); + } else { + addTaskFutures.push_back( + RestoreLogDataTaskFunc::addTask(tr, + taskBucket, + task, + f, + j, + std::min(f.blockSize, f.fileSize - j), + TaskCompletionKey::joinWith(allPartsDone))); + } + + // Increment beginBlock for the file and total blocks dispatched for this task + ++beginBlock; + ++blocksDispatched; + --remainingInBatch; + } + + // Stop if we've reached the addtask limit + if (blocksDispatched == taskBatchSize) + break; + + // We just completed an entire file so the next task should start at the file after this one within + // endVersion (or later) if this iteration ends up being the last for this task + beginFile = beginFile + '\x00'; + beginBlock = 0; + + TraceEvent("FileRestoreDispatchedFile") + .suppressFor(60) + .detail("RestoreUID", restore.getUid()) + .detail("FileName", f.fileName) + .detail("TaskInstance", THIS_ADDR); + } + + // If no blocks were dispatched then the next dispatch task should run now and be joined with the allPartsDone + // future + if (blocksDispatched == 0) { + std::string decision; + + // If no files were dispatched either then the batch size wasn't large enough to catch all of 
the files at + // the next lowest non-dispatched version, so increase the batch size. + if (i == 0) { + batchSize *= 2; + decision = "increased_batch_size"; + } else + decision = "all_files_were_empty"; + + TraceEvent("FileRestoreDispatch") + .detail("RestoreUID", restore.getUid()) + .detail("BeginVersion", beginVersion) + .detail("BeginFile", Params.beginFile().get(task)) + .detail("BeginBlock", Params.beginBlock().get(task)) + .detail("EndVersion", endVersion) + .detail("ApplyLag", applyLag) + .detail("BatchSize", batchSize) + .detail("Decision", decision) + .detail("TaskInstance", THIS_ADDR) + .detail("RemainingInBatch", remainingInBatch); + + wait(success(RestoreDispatchTaskFunc::addTask(tr, + taskBucket, + task, + endVersion, + beginFile, + beginBlock, + batchSize, + remainingInBatch, + TaskCompletionKey::joinWith((allPartsDone))))); + + // If adding to existing batch then task is joined with a batch future so set done future. + // Note that this must be done after joining at least one task with the batch future in case all other + // blockers already finished. + Future setDone = addingToExistingBatch ? onDone->set(tr, taskBucket) : Void(); + + wait(setDone && taskBucket->finish(tr, task)); + + return Void(); + } + + // Increment the number of blocks dispatched in the restore config + restore.filesBlocksDispatched().atomicOp(tr, blocksDispatched, MutationRef::Type::AddValue); + + // If beginFile is not empty then we had to stop in the middle of a version (possibly within a file) so we + // cannot end the batch here because we do not know if we got all of the files and blocks from the last version + // queued, so make sure remainingInBatch is at least 1. + if (!beginFile.empty()) + remainingInBatch = std::max(1, remainingInBatch); + + // If more blocks need to be dispatched in this batch then add a follow-on task that is part of the allPartsDone + // group which will won't wait to run and will add more block tasks. + if (remainingInBatch > 0) + addTaskFutures.push_back(RestoreDispatchTaskFunc::addTask(tr, + taskBucket, + task, + endVersion, + beginFile, + beginBlock, + batchSize, + remainingInBatch, + TaskCompletionKey::joinWith(allPartsDone))); + else // Otherwise, add a follow-on task to continue after all previously dispatched blocks are done + addTaskFutures.push_back(RestoreDispatchTaskFunc::addTask(tr, + taskBucket, + task, + endVersion, + beginFile, + beginBlock, + batchSize, + 0, + TaskCompletionKey::noSignal(), + allPartsDone)); + + wait(waitForAll(addTaskFutures)); + + // If adding to existing batch then task is joined with a batch future so set done future. + Future setDone = addingToExistingBatch ? 
onDone->set(tr, taskBucket) : Void(); + + wait(setDone && taskBucket->finish(tr, task)); + + TraceEvent("FileRestoreDispatch") + .detail("RestoreUID", restore.getUid()) + .detail("BeginVersion", beginVersion) + .detail("BeginFile", Params.beginFile().get(task)) + .detail("BeginBlock", Params.beginBlock().get(task)) + .detail("EndVersion", endVersion) + .detail("ApplyLag", applyLag) + .detail("BatchSize", batchSize) + .detail("Decision", "dispatched_files") + .detail("FilesDispatched", i) + .detail("BlocksDispatched", blocksDispatched) + .detail("TaskInstance", THIS_ADDR) + .detail("RemainingInBatch", remainingInBatch); + + return Void(); + } + + ACTOR static Future addTask(Reference tr, + Reference taskBucket, + Reference parentTask, + Version beginVersion, + std::string beginFile, + int64_t beginBlock, + int64_t batchSize, + int64_t remainingInBatch = 0, + TaskCompletionKey completionKey = TaskCompletionKey::noSignal(), + Reference waitFor = Reference()) { + Key doneKey = wait(completionKey.get(tr, taskBucket)); + + // Use high priority for dispatch tasks that have to queue more blocks for the current batch + unsigned int priority = (remainingInBatch > 0) ? 1 : 0; + state Reference task( + new Task(RestoreDispatchTaskFunc::name, RestoreDispatchTaskFunc::version, doneKey, priority)); + + // Create a config from the parent task and bind it to the new task + wait(RestoreConfig(parentTask).toTask(tr, task)); + Params.beginVersion().set(task, beginVersion); + Params.batchSize().set(task, batchSize); + Params.remainingInBatch().set(task, remainingInBatch); + Params.beginBlock().set(task, beginBlock); + Params.beginFile().set(task, beginFile); + + if (!waitFor) { + return taskBucket->addTask(tr, task); + } + + wait(waitFor->onSetAddTask(tr, taskBucket, task)); + return LiteralStringRef("OnSetAddTask"); + } + + Future execute(Database cx, Reference tb, Reference fb, Reference task) { + return Void(); }; - StringRef StartFullRestoreTaskFunc::name = LiteralStringRef("restore_start"); - const uint32_t StartFullRestoreTaskFunc::version = 1; - REGISTER_TASKFUNC(StartFullRestoreTaskFunc); + Future finish(Reference tr, + Reference tb, + Reference fb, + Reference task) { + return _finish(tr, tb, fb, task); + }; +}; +StringRef RestoreDispatchTaskFunc::name = LiteralStringRef("restore_dispatch"); +const uint32_t RestoreDispatchTaskFunc::version = 1; +REGISTER_TASKFUNC(RestoreDispatchTaskFunc); + +ACTOR Future restoreStatus(Reference tr, Key tagName) { + tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + state std::vector tags; + if (tagName.size() == 0) { + std::vector t = wait(getAllRestoreTags(tr)); + tags = t; + } else + tags.push_back(makeRestoreTag(tagName.toString())); + + state std::string result; + state int i = 0; + + for (; i < tags.size(); ++i) { + UidAndAbortedFlagT u = wait(tags[i].getD(tr)); + std::string s = wait(RestoreConfig(u.first).getFullStatus(tr)); + result.append(s); + result.append("\n\n"); + } + + return result; } +ACTOR Future abortRestore(Reference tr, Key tagName) { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + tr->setOption(FDBTransactionOptions::COMMIT_ON_FIRST_PROXY); + + state KeyBackedTag tag = makeRestoreTag(tagName.toString()); + state Optional current = wait(tag.get(tr)); + if (!current.present()) + return ERestoreState::UNITIALIZED; + + state RestoreConfig 
restore(current.get().first); + + state ERestoreState status = wait(restore.stateEnum().getD(tr)); + state bool runnable = wait(restore.isRunnable(tr)); + + if (!runnable) + return status; + + restore.stateEnum().set(tr, ERestoreState::ABORTED); + + // Clear all of the ApplyMutations stuff + restore.clearApplyMutationsKeys(tr); + + // Cancel the backup tasks on this tag + wait(tag.cancel(tr)); + wait(unlockDatabase(tr, current.get().first)); + return ERestoreState::ABORTED; +} + +ACTOR Future abortRestore(Database cx, Key tagName) { + state Reference tr = + Reference(new ReadYourWritesTransaction(cx)); + + loop { + try { + ERestoreState estate = wait(abortRestore(tr, tagName)); + if (estate != ERestoreState::ABORTED) { + return estate; + } + wait(tr->commit()); + break; + } catch (Error& e) { + wait(tr->onError(e)); + } + } + + tr = Reference(new ReadYourWritesTransaction(cx)); + + // Commit a dummy transaction before returning success, to ensure the mutation applier has stopped submitting + // mutations + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + tr->setOption(FDBTransactionOptions::COMMIT_ON_FIRST_PROXY); + tr->addReadConflictRange(singleKeyRange(KeyRef())); + tr->addWriteConflictRange(singleKeyRange(KeyRef())); + wait(tr->commit()); + return ERestoreState::ABORTED; + } catch (Error& e) { + wait(tr->onError(e)); + } + } +} + +struct StartFullRestoreTaskFunc : RestoreTaskFuncBase { + static StringRef name; + static const uint32_t version; + + static struct { + static TaskParam firstVersion() { return LiteralStringRef(__FUNCTION__); } + } Params; + + ACTOR static Future _execute(Database cx, + Reference taskBucket, + Reference futureBucket, + Reference task) { + state Reference tr(new ReadYourWritesTransaction(cx)); + state RestoreConfig restore(task); + state Version restoreVersion; + state Reference bc; + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + wait(checkTaskVersion(tr->getDatabase(), task, name, version)); + Version _restoreVersion = wait(restore.restoreVersion().getOrThrow(tr)); + restoreVersion = _restoreVersion; + wait(taskBucket->keepRunning(tr, task)); + + ERestoreState oldState = wait(restore.stateEnum().getD(tr)); + if (oldState != ERestoreState::QUEUED && oldState != ERestoreState::STARTING) { + wait(restore.logError(cx, + restore_error(), + format("StartFullRestore: Encountered unexpected state(%d)", oldState), + THIS)); + return Void(); + } + restore.stateEnum().set(tr, ERestoreState::STARTING); + restore.fileSet().clear(tr); + restore.fileBlockCount().clear(tr); + restore.fileCount().clear(tr); + Reference _bc = wait(restore.sourceContainer().getOrThrow(tr)); + bc = _bc; + + wait(tr->commit()); + break; + } catch (Error& e) { + wait(tr->onError(e)); + } + } + + tr->reset(); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + Version destVersion = wait(tr->getReadVersion()); + TraceEvent("FileRestoreVersionUpgrade") + .detail("RestoreVersion", restoreVersion) + .detail("Dest", destVersion); + if (destVersion <= restoreVersion) { + TEST(true); // Forcing restored cluster to higher version + tr->set(minRequiredCommitVersionKey, BinaryWriter::toValue(restoreVersion + 1, Unversioned())); + wait(tr->commit()); + } else { + break; + } + } catch (Error& e) { + wait(tr->onError(e)); + } + } + + Optional restorable = 
wait(bc->getRestoreSet(restoreVersion)); + + if (!restorable.present()) + throw restore_missing_data(); + + // First version for which log data should be applied + Params.firstVersion().set(task, restorable.get().snapshot.beginVersion); + + // Convert the two lists in restorable (logs and ranges) to a single list of RestoreFiles. + // Order does not matter, they will be put in order when written to the restoreFileMap below. + state std::vector files; + + for (const RangeFile& f : restorable.get().ranges) { + files.push_back({ f.version, f.fileName, true, f.blockSize, f.fileSize }); + } + for (const LogFile& f : restorable.get().logs) { + files.push_back({ f.beginVersion, f.fileName, false, f.blockSize, f.fileSize, f.endVersion }); + } + + state std::vector::iterator start = files.begin(); + state std::vector::iterator end = files.end(); + + tr->reset(); + while (start != end) { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + wait(taskBucket->keepRunning(tr, task)); + + state std::vector::iterator i = start; + + state int txBytes = 0; + state int nFileBlocks = 0; + state int nFiles = 0; + auto fileSet = restore.fileSet(); + for (; i != end && txBytes < 1e6; ++i) { + txBytes += fileSet.insert(tr, *i); + nFileBlocks += (i->fileSize + i->blockSize - 1) / i->blockSize; + ++nFiles; + } + + // Increment counts + restore.fileCount().atomicOp(tr, nFiles, MutationRef::Type::AddValue); + restore.fileBlockCount().atomicOp(tr, nFileBlocks, MutationRef::Type::AddValue); + + wait(tr->commit()); + + TraceEvent("FileRestoreLoadedFiles") + .detail("RestoreUID", restore.getUid()) + .detail("FileCount", nFiles) + .detail("FileBlockCount", nFileBlocks) + .detail("TransactionBytes", txBytes) + .detail("TaskInstance", THIS_ADDR); + + start = i; + tr->reset(); + } catch (Error& e) { + wait(tr->onError(e)); + } + } + + return Void(); + } + + ACTOR static Future _finish(Reference tr, + Reference taskBucket, + Reference futureBucket, + Reference task) { + state RestoreConfig restore(task); + + state Version firstVersion = Params.firstVersion().getOrDefault(task, invalidVersion); + if (firstVersion == invalidVersion) { + wait(restore.logError( + tr->getDatabase(), restore_missing_data(), "StartFullRestore: The backup had no data.", THIS)); + std::string tag = wait(restore.tag().getD(tr)); + wait(success(abortRestore(tr, StringRef(tag)))); + return Void(); + } + + restore.stateEnum().set(tr, ERestoreState::RUNNING); + + // Set applyMutation versions + restore.setApplyBeginVersion(tr, firstVersion); + restore.setApplyEndVersion(tr, firstVersion); + + // Apply range data and log data in order + wait(success(RestoreDispatchTaskFunc::addTask( + tr, taskBucket, task, 0, "", 0, CLIENT_KNOBS->RESTORE_DISPATCH_BATCH_SIZE))); + + wait(taskBucket->finish(tr, task)); + return Void(); + } + + ACTOR static Future addTask(Reference tr, + Reference taskBucket, + UID uid, + TaskCompletionKey completionKey, + Reference waitFor = Reference()) { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + Key doneKey = wait(completionKey.get(tr, taskBucket)); + state Reference task( + new Task(StartFullRestoreTaskFunc::name, StartFullRestoreTaskFunc::version, doneKey)); + + state RestoreConfig restore(uid); + // Bind the restore config to the new task + wait(restore.toTask(tr, task)); + + if (!waitFor) { + return taskBucket->addTask(tr, task); + } + + wait(waitFor->onSetAddTask(tr, taskBucket, task)); + 
return LiteralStringRef("OnSetAddTask"); + } + + StringRef getName() const { return name; }; + + Future execute(Database cx, Reference tb, Reference fb, Reference task) { + return _execute(cx, tb, fb, task); + }; + Future finish(Reference tr, + Reference tb, + Reference fb, + Reference task) { + return _finish(tr, tb, fb, task); + }; +}; +StringRef StartFullRestoreTaskFunc::name = LiteralStringRef("restore_start"); +const uint32_t StartFullRestoreTaskFunc::version = 1; +REGISTER_TASKFUNC(StartFullRestoreTaskFunc); +} // namespace fileBackup + struct LogInfo : public ReferenceCounted { std::string fileName; Reference logFile; @@ -3495,7 +3927,7 @@ struct LogInfo : public ReferenceCounted { Version endVersion; int64_t offset; - LogInfo() : offset(0) {}; + LogInfo() : offset(0){}; }; class FileBackupAgentImpl { @@ -3504,7 +3936,12 @@ public: // This method will return the final status of the backup at tag, and return the URL that was used on the tag // when that status value was read. - ACTOR static Future waitBackup(FileBackupAgent* backupAgent, Database cx, std::string tagName, bool stopWhenDone, Reference *pContainer = nullptr, UID *pUID = nullptr) { + ACTOR static Future waitBackup(FileBackupAgent* backupAgent, + Database cx, + std::string tagName, + bool stopWhenDone, + Reference* pContainer = nullptr, + UID* pUID = nullptr) { state std::string backTrace; state KeyBackedTag tag = makeBackupTag(tagName); @@ -3525,45 +3962,53 @@ public: // Break, if one of the following is true // - no longer runnable // - in differential mode (restorable) and stopWhenDone is not enabled - if( !FileBackupAgent::isRunnable(status) || ((!stopWhenDone) && (BackupAgentBase::STATE_RUNNING_DIFFERENTIAL == status) )) { + if (!FileBackupAgent::isRunnable(status) || + ((!stopWhenDone) && (BackupAgentBase::STATE_RUNNING_DIFFERENTIAL == status))) { - if(pContainer != nullptr) { - Reference c = wait(config.backupContainer().getOrThrow(tr, false, backup_invalid_info())); + if (pContainer != nullptr) { + Reference c = + wait(config.backupContainer().getOrThrow(tr, false, backup_invalid_info())); *pContainer = c; } - if(pUID != nullptr) { + if (pUID != nullptr) { *pUID = oldUidAndAborted.get().first; } return status; } - state Future watchFuture = tr->watch( config.stateEnum().key ); - wait( tr->commit() ); - wait( watchFuture ); - } - catch (Error &e) { + state Future watchFuture = tr->watch(config.stateEnum().key); + wait(tr->commit()); + wait(watchFuture); + } catch (Error& e) { wait(tr->onError(e)); } } } - ACTOR static Future submitBackup(FileBackupAgent* backupAgent, Reference tr, Key outContainer, int snapshotIntervalSeconds, std::string tagName, Standalone> backupRanges, bool stopWhenDone) { + ACTOR static Future submitBackup(FileBackupAgent* backupAgent, + Reference tr, + Key outContainer, + int snapshotIntervalSeconds, + std::string tagName, + Standalone> backupRanges, + bool stopWhenDone) { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); tr->setOption(FDBTransactionOptions::COMMIT_ON_FIRST_PROXY); TraceEvent(SevInfo, "FBA_SubmitBackup") - .detail("TagName", tagName.c_str()) - .detail("StopWhenDone", stopWhenDone) - .detail("OutContainer", outContainer.toString()); + .detail("TagName", tagName.c_str()) + .detail("StopWhenDone", stopWhenDone) + .detail("OutContainer", outContainer.toString()); state KeyBackedTag tag = makeBackupTag(tagName); Optional uidAndAbortedFlag = wait(tag.get(tr)); if (uidAndAbortedFlag.present()) { state BackupConfig 
prevConfig(uidAndAbortedFlag.get().first); - state EBackupState prevBackupStatus = wait(prevConfig.stateEnum().getD(tr, false, EBackupState::STATE_NEVERRAN)); + state EBackupState prevBackupStatus = + wait(prevConfig.stateEnum().getD(tr, false, EBackupState::STATE_NEVERRAN)); if (FileBackupAgent::isRunnable(prevBackupStatus)) { throw backup_duplicate(); } @@ -3579,17 +4024,17 @@ public: state Standalone nowStr = BackupAgentBase::getCurrentTime(); state std::string backupContainer = outContainer.toString(); - // To be consistent with directory handling behavior since FDB backup was first released, if the container string - // describes a local directory then "/backup-" will be added to it. - if(backupContainer.find("file://") == 0) { + // To be consistent with directory handling behavior since FDB backup was first released, if the container + // string describes a local directory then "/backup-" will be added to it. + if (backupContainer.find("file://") == 0) { backupContainer = joinPath(backupContainer, std::string("backup-") + nowStr.toString()); } state Reference bc = IBackupContainer::openContainer(backupContainer); try { wait(timeoutError(bc->create(), 30)); - } catch(Error &e) { - if(e.code() == error_code_actor_cancelled) + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) throw; fprintf(stderr, "ERROR: Could not create backup container: %s\n", e.what()); throw backup_error(); @@ -3598,7 +4043,9 @@ public: Optional lastBackupTimestamp = wait(backupAgent->lastBackupTimestamp().get(tr)); if ((lastBackupTimestamp.present()) && (lastBackupTimestamp.get() >= nowStr)) { - fprintf(stderr, "ERROR: The last backup `%s' happened in the future.\n", printable(lastBackupTimestamp.get()).c_str()); + fprintf(stderr, + "ERROR: The last backup `%s' happened in the future.\n", + printable(lastBackupTimestamp.get()).c_str()); throw backup_error(); } @@ -3620,26 +4067,32 @@ public: state Key destUidValue(BinaryWriter::toValue(uid, Unversioned())); if (normalizedRanges.size() == 1) { - Standalone existingDestUidValues = wait(tr->getRange(KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY)); + Standalone existingDestUidValues = wait( + tr->getRange(KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY)); bool found = false; - for(auto it : existingDestUidValues) { - if( BinaryReader::fromStringRef(it.key.removePrefix(destUidLookupPrefix), IncludeVersion()) == normalizedRanges[0] ) { + for (auto it : existingDestUidValues) { + if (BinaryReader::fromStringRef(it.key.removePrefix(destUidLookupPrefix), IncludeVersion()) == + normalizedRanges[0]) { destUidValue = it.value; found = true; break; } } - if( !found ) { + if (!found) { destUidValue = BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned()); - tr->set(BinaryWriter::toValue(normalizedRanges[0], IncludeVersion(ProtocolVersion::withSharedMutations())).withPrefix(destUidLookupPrefix), destUidValue); + tr->set( + BinaryWriter::toValue(normalizedRanges[0], IncludeVersion(ProtocolVersion::withSharedMutations())) + .withPrefix(destUidLookupPrefix), + destUidValue); } } - tr->set(config.getUidAsKey().withPrefix(destUidValue).withPrefix(backupLatestVersionsPrefix), BinaryWriter::toValue(tr->getReadVersion().get(), Unversioned())); + tr->set(config.getUidAsKey().withPrefix(destUidValue).withPrefix(backupLatestVersionsPrefix), + BinaryWriter::toValue(tr->getReadVersion().get(), Unversioned())); config.destUidValue().set(tr, destUidValue); // Point the tag to 
this new uid - tag.set(tr, {uid, false}); + tag.set(tr, { uid, false }); backupAgent->lastBackupTimestamp().set(tr, nowStr); @@ -3651,12 +4104,22 @@ public: config.backupRanges().set(tr, normalizedRanges); config.snapshotIntervalSeconds().set(tr, snapshotIntervalSeconds); - Key taskKey = wait(fileBackup::StartFullBackupTaskFunc::addTask(tr, backupAgent->taskBucket, uid, TaskCompletionKey::noSignal())); + Key taskKey = wait(fileBackup::StartFullBackupTaskFunc::addTask( + tr, backupAgent->taskBucket, uid, TaskCompletionKey::noSignal())); return Void(); } - ACTOR static Future submitRestore(FileBackupAgent* backupAgent, Reference tr, Key tagName, Key backupURL, Standalone> ranges, Version restoreVersion, Key addPrefix, Key removePrefix, bool lockDB, UID uid) { + ACTOR static Future submitRestore(FileBackupAgent* backupAgent, + Reference tr, + Key tagName, + Key backupURL, + Standalone> ranges, + Version restoreVersion, + Key addPrefix, + Key removePrefix, + bool lockDB, + UID uid) { KeyRangeMap restoreRangeSet; for (auto& range : ranges) { restoreRangeSet.insert(range, 1); @@ -3668,7 +4131,7 @@ public: restoreRanges.push_back(KeyRange(KeyRangeRef(restoreRange.range().begin, restoreRange.range().end))); } } - for (auto &restoreRange : restoreRanges) + for (auto& restoreRange : restoreRanges) ASSERT(restoreRange.contains(removePrefix) || removePrefix.size() == 0); tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); @@ -3677,12 +4140,11 @@ public: // Get old restore config for this tag state KeyBackedTag tag = makeRestoreTag(tagName.toString()); state Optional oldUidAndAborted = wait(tag.get(tr)); - if(oldUidAndAborted.present()) { + if (oldUidAndAborted.present()) { if (oldUidAndAborted.get().first == uid) { if (oldUidAndAborted.get().second) { throw restore_duplicate_uid(); - } - else { + } else { return Void(); } } @@ -3702,7 +4164,9 @@ public: state int index; for (index = 0; index < restoreRanges.size(); index++) { - KeyRange restoreIntoRange = KeyRangeRef(restoreRanges[index].begin, restoreRanges[index].end).removePrefix(removePrefix).withPrefix(addPrefix); + KeyRange restoreIntoRange = KeyRangeRef(restoreRanges[index].begin, restoreRanges[index].end) + .removePrefix(removePrefix) + .withPrefix(addPrefix); Standalone existingRows = wait(tr->getRange(restoreIntoRange, 1)); if (existingRows.size() > 0) { throw restore_destination_not_empty(); @@ -3712,7 +4176,7 @@ public: state RestoreConfig restore(uid); // Point the tag to the new uid - tag.set(tr, {uid, false}); + tag.set(tr, { uid, false }); Reference bc = IBackupContainer::openContainer(backupURL.toString()); @@ -3723,14 +4187,14 @@ public: restore.restoreVersion().set(tr, restoreVersion); if (BUGGIFY && restoreRanges.size() == 1) { restore.restoreRange().set(tr, restoreRanges[0]); - } - else { + } else { restore.restoreRanges().set(tr, restoreRanges); } // this also sets restore.add/removePrefix. 
restore.initApplyMutations(tr, addPrefix, removePrefix); - Key taskKey = wait(fileBackup::StartFullRestoreTaskFunc::addTask(tr, backupAgent->taskBucket, uid, TaskCompletionKey::noSignal())); + Key taskKey = wait(fileBackup::StartFullRestoreTaskFunc::addTask( + tr, backupAgent->taskBucket, uid, TaskCompletionKey::noSignal())); if (lockDB) wait(lockDatabase(tr, uid)); @@ -3752,15 +4216,17 @@ public: state KeyBackedTag tag = makeRestoreTag(tagName.toString()); Optional current = wait(tag.get(tr)); - if(!current.present()) { - if(verbose) - printf("Tag: %s State: %s\n", tagName.toString().c_str(), FileBackupAgent::restoreStateText(ERestoreState::UNITIALIZED).toString().c_str()); + if (!current.present()) { + if (verbose) + printf("Tag: %s State: %s\n", + tagName.toString().c_str(), + FileBackupAgent::restoreStateText(ERestoreState::UNITIALIZED).toString().c_str()); return ERestoreState::UNITIALIZED; } state RestoreConfig restore(current.get().first); - if(verbose) { + if (verbose) { state std::string details = wait(restore.getProgress(tr)); printf("%s\n", details.c_str()); } @@ -3776,12 +4242,11 @@ public: // Wait for a change state Future watchFuture = tr->watch(restore.stateEnum().key); wait(tr->commit()); - if(verbose) + if (verbose) wait(watchFuture || delay(1)); else wait(watchFuture); - } - catch (Error &e) { + } catch (Error& e) { wait(tr->onError(e)); } } @@ -3789,7 +4254,9 @@ public: return status; } - ACTOR static Future discontinueBackup(FileBackupAgent* backupAgent, Reference tr, Key tagName) { + ACTOR static Future discontinueBackup(FileBackupAgent* backupAgent, + Reference tr, + Key tagName) { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); @@ -3802,16 +4269,16 @@ public: throw backup_unneeded(); } - // If the backup is already restorable then 'mostly' abort it - cancel all tasks via the tag + // If the backup is already restorable then 'mostly' abort it - cancel all tasks via the tag // and clear the mutation logging config and data - but set its state as COMPLETED instead of ABORTED. state Optional latestRestorableVersion = wait(config.getLatestRestorableVersion(tr)); TraceEvent(SevInfo, "FBA_DiscontinueBackup") - .detail("AlreadyRestorable", latestRestorableVersion.present() ? "Yes" : "No") - .detail("TagName", tag.tagName.c_str()) - .detail("Status", BackupAgentBase::getStateText(status)); + .detail("AlreadyRestorable", latestRestorableVersion.present() ? 
"Yes" : "No") + .detail("TagName", tag.tagName.c_str()) + .detail("Status", BackupAgentBase::getStateText(status)); - if(latestRestorableVersion.present()) { + if (latestRestorableVersion.present()) { // Cancel all backup tasks through tag wait(tag.cancel(tr)); @@ -3819,7 +4286,7 @@ public: state Key destUidValue = wait(config.destUidValue().getOrThrow(tr)); wait(success(tr->getReadVersion())); - wait( eraseLogData(tr, config.getUidAsKey(), destUidValue) ); + wait(eraseLogData(tr, config.getUidAsKey(), destUidValue)); config.stateEnum().set(tr, EBackupState::STATE_COMPLETED); @@ -3837,7 +4304,9 @@ public: return Void(); } - ACTOR static Future abortBackup(FileBackupAgent* backupAgent, Reference tr, std::string tagName) { + ACTOR static Future abortBackup(FileBackupAgent* backupAgent, + Reference tr, + std::string tagName) { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); @@ -3853,12 +4322,12 @@ public: } TraceEvent(SevInfo, "FBA_AbortBackup") - .detail("TagName", tagName.c_str()) - .detail("Status", BackupAgentBase::getStateText(status)); + .detail("TagName", tagName.c_str()) + .detail("Status", BackupAgentBase::getStateText(status)); // Cancel backup task through tag wait(tag.cancel(tr)); - + wait(eraseLogData(tr, config.getUidAsKey(), destUidValue)); config.stateEnum().set(tr, EBackupState::STATE_ABORTED); @@ -3870,15 +4339,13 @@ public: Optional version; Optional epochs; - bool present() const { - return version.present(); - } + bool present() const { return version.present(); } JsonBuilderObject toJSON() const { JsonBuilderObject doc; - if(version.present()) { + if (version.present()) { doc.setKey("Version", version.get()); - if(epochs.present()) { + if (epochs.present()) { doc.setKey("EpochSeconds", epochs.get()); doc.setKey("Timestamp", timeStampToString(epochs)); } @@ -3889,10 +4356,11 @@ public: // Helper actor for generating status // If f is present, lookup epochs using timekeeper and tr, return TimestampedVersion - ACTOR static Future getTimestampedVersion(Reference tr, Future> f) { + ACTOR static Future getTimestampedVersion(Reference tr, + Future> f) { state TimestampedVersion tv; wait(store(tv.version, f)); - if(tv.version.present()) { + if (tv.version.present()) { wait(store(tv.epochs, timeKeeperEpochsFromVersion(tv.version.get(), tr))); } return tv; @@ -3914,17 +4382,19 @@ public: state Optional paused; state Version recentReadVersion; - wait( store(paused, tr->get(backupAgent->taskBucket->getPauseKey())) && store(uidAndAbortedFlag, tag.get(tr)) && store(recentReadVersion, tr->getReadVersion()) ); + wait(store(paused, tr->get(backupAgent->taskBucket->getPauseKey())) && + store(uidAndAbortedFlag, tag.get(tr)) && store(recentReadVersion, tr->getReadVersion())); doc.setKey("BackupAgentsPaused", paused.present()); doc.setKey("Tag", tag.tagName); - if(uidAndAbortedFlag.present()) { + if (uidAndAbortedFlag.present()) { doc.setKey("UID", uidAndAbortedFlag.get().first.toString()); state BackupConfig config(uidAndAbortedFlag.get().first); - state EBackupState backupState = wait(config.stateEnum().getD(tr, false, EBackupState::STATE_NEVERRAN)); + state EBackupState backupState = + wait(config.stateEnum().getD(tr, false, EBackupState::STATE_NEVERRAN)); JsonBuilderObject statusDoc; statusDoc.setKey("Name", BackupAgentBase::getStateName(backupState)); statusDoc.setKey("Description", BackupAgentBase::getStateText(backupState)); @@ -3934,27 +4404,30 @@ public: state Future done = Void(); - if(backupState != 
BackupAgentBase::STATE_NEVERRAN) { + if (backupState != BackupAgentBase::STATE_NEVERRAN) { state Reference bc; state TimestampedVersion latestRestorable; - wait( store(latestRestorable, getTimestampedVersion(tr, config.getLatestRestorableVersion(tr))) - && store(bc, config.backupContainer().getOrThrow(tr)) - ); + wait( + store(latestRestorable, getTimestampedVersion(tr, config.getLatestRestorableVersion(tr))) && + store(bc, config.backupContainer().getOrThrow(tr))); doc.setKey("Restorable", latestRestorable.present()); - if(latestRestorable.present()) { + if (latestRestorable.present()) { JsonBuilderObject o = latestRestorable.toJSON(); - if(backupState != BackupAgentBase::STATE_COMPLETED) { - o.setKey("LagSeconds", (recentReadVersion - latestRestorable.version.get()) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND); + if (backupState != BackupAgentBase::STATE_COMPLETED) { + o.setKey("LagSeconds", + (recentReadVersion - latestRestorable.version.get()) / + CLIENT_KNOBS->CORE_VERSIONSPERSECOND); } doc.setKey("LatestRestorablePoint", o); } doc.setKey("DestinationURL", bc->getURL()); } - if(backupState == BackupAgentBase::STATE_RUNNING_DIFFERENTIAL || backupState == BackupAgentBase::STATE_RUNNING) { + if (backupState == BackupAgentBase::STATE_RUNNING_DIFFERENTIAL || + backupState == BackupAgentBase::STATE_RUNNING) { state int64_t snapshotInterval; state int64_t logBytesWritten; state int64_t rangeBytesWritten; @@ -3966,37 +4439,40 @@ public: state TimestampedVersion snapshotLastDispatch; state Optional snapshotLastDispatchShardsBehind; - wait( store(snapshotInterval, config.snapshotIntervalSeconds().getOrThrow(tr)) - && store(logBytesWritten, config.logBytesWritten().getD(tr)) - && store(rangeBytesWritten, config.rangeBytesWritten().getD(tr)) - && store(stopWhenDone, config.stopWhenDone().getOrThrow(tr)) - && store(snapshotBegin, getTimestampedVersion(tr, config.snapshotBeginVersion().get(tr))) - && store(snapshotTargetEnd, getTimestampedVersion(tr, config.snapshotTargetEndVersion().get(tr))) - && store(latestLogEnd, getTimestampedVersion(tr, config.latestLogEndVersion().get(tr))) - && store(latestSnapshotEnd, getTimestampedVersion(tr, config.latestSnapshotEndVersion().get(tr))) - && store(snapshotLastDispatch, getTimestampedVersion(tr, config.snapshotDispatchLastVersion().get(tr))) - && store(snapshotLastDispatchShardsBehind, config.snapshotDispatchLastShardsBehind().get(tr)) - ); + wait( + store(snapshotInterval, config.snapshotIntervalSeconds().getOrThrow(tr)) && + store(logBytesWritten, config.logBytesWritten().getD(tr)) && + store(rangeBytesWritten, config.rangeBytesWritten().getD(tr)) && + store(stopWhenDone, config.stopWhenDone().getOrThrow(tr)) && + store(snapshotBegin, getTimestampedVersion(tr, config.snapshotBeginVersion().get(tr))) && + store(snapshotTargetEnd, + getTimestampedVersion(tr, config.snapshotTargetEndVersion().get(tr))) && + store(latestLogEnd, getTimestampedVersion(tr, config.latestLogEndVersion().get(tr))) && + store(latestSnapshotEnd, + getTimestampedVersion(tr, config.latestSnapshotEndVersion().get(tr))) && + store(snapshotLastDispatch, + getTimestampedVersion(tr, config.snapshotDispatchLastVersion().get(tr))) && + store(snapshotLastDispatchShardsBehind, config.snapshotDispatchLastShardsBehind().get(tr))); doc.setKey("StopAfterSnapshot", stopWhenDone); doc.setKey("SnapshotIntervalSeconds", snapshotInterval); doc.setKey("LogBytesWritten", logBytesWritten); doc.setKey("RangeBytesWritten", rangeBytesWritten); - if(latestLogEnd.present()) { + if (latestLogEnd.present()) { 
doc.setKey("LatestLogEnd", latestLogEnd.toJSON()); } - if(latestSnapshotEnd.present()) { + if (latestSnapshotEnd.present()) { doc.setKey("LatestSnapshotEnd", latestSnapshotEnd.toJSON()); } JsonBuilderObject snapshot; - if(snapshotBegin.present()) { + if (snapshotBegin.present()) { snapshot.setKey("Begin", snapshotBegin.toJSON()); - if(snapshotTargetEnd.present()) { + if (snapshotTargetEnd.present()) { snapshot.setKey("EndTarget", snapshotTargetEnd.toJSON()); Version interval = snapshotTargetEnd.version.get() - snapshotBegin.version.get(); @@ -4008,7 +4484,7 @@ public: } JsonBuilderObject dispatchDoc = snapshotLastDispatch.toJSON(); - if(snapshotLastDispatchShardsBehind.present()) { + if (snapshotLastDispatchShardsBehind.present()) { dispatchDoc.setKey("ShardsBehind", snapshotLastDispatchShardsBehind.get()); } snapshot.setKey("LastDispatch", dispatchDoc); @@ -4017,28 +4493,33 @@ public: doc.setKey("CurrentSnapshot", snapshot); } - KeyBackedMap>::PairsType errors = wait(config.lastErrorPerType().getRange(tr, 0, std::numeric_limits::max(), CLIENT_KNOBS->TOO_MANY)); + KeyBackedMap>::PairsType errors = + wait(config.lastErrorPerType().getRange( + tr, 0, std::numeric_limits::max(), CLIENT_KNOBS->TOO_MANY)); JsonBuilderArray errorList; - for(auto &e : errors) { + for (auto& e : errors) { std::string msg = e.second.first; Version ver = e.second.second; JsonBuilderObject errDoc; errDoc.setKey("Message", msg.c_str()); - errDoc.setKey("RelativeSeconds", (ver - recentReadVersion) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND); + errDoc.setKey("RelativeSeconds", + (ver - recentReadVersion) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND); } doc.setKey("Errors", errorList); } return doc.getJson(); - } - catch (Error &e) { + } catch (Error& e) { wait(tr->onError(e)); } } } - ACTOR static Future getStatus(FileBackupAgent* backupAgent, Database cx, bool showErrors, std::string tagName) { + ACTOR static Future getStatus(FileBackupAgent* backupAgent, + Database cx, + bool showErrors, + std::string tagName) { state Reference tr(new ReadYourWritesTransaction(cx)); state std::string statusText; @@ -4068,37 +4549,41 @@ public: state Reference bc; state Optional latestRestorableVersion; state Version recentReadVersion; - - wait( store(latestRestorableVersion, config.getLatestRestorableVersion(tr)) - && store(bc, config.backupContainer().getOrThrow(tr)) - && store(recentReadVersion, tr->getReadVersion()) - ); + + wait(store(latestRestorableVersion, config.getLatestRestorableVersion(tr)) && + store(bc, config.backupContainer().getOrThrow(tr)) && + store(recentReadVersion, tr->getReadVersion())); bool snapshotProgress = false; switch (backupState) { - case BackupAgentBase::STATE_SUBMITTED: - statusText += "The backup on tag `" + tagName + "' is in progress (just started) to " + bc->getURL() + ".\n"; - break; - case BackupAgentBase::STATE_RUNNING: - statusText += "The backup on tag `" + tagName + "' is in progress to " + bc->getURL() + ".\n"; - snapshotProgress = true; - break; - case BackupAgentBase::STATE_RUNNING_DIFFERENTIAL: - statusText += "The backup on tag `" + tagName + "' is restorable but continuing to " + bc->getURL() + ".\n"; - snapshotProgress = true; - break; - case BackupAgentBase::STATE_COMPLETED: - statusText += "The previous backup on tag `" + tagName + "' at " + bc->getURL() + " completed at version " + format("%lld", latestRestorableVersion.orDefault(-1)) + ".\n"; - break; - default: - statusText += "The previous backup on tag `" + tagName + "' at " + bc->getURL() + " " + backupStatus + ".\n"; - break; + case 
BackupAgentBase::STATE_SUBMITTED: + statusText += "The backup on tag `" + tagName + "' is in progress (just started) to " + + bc->getURL() + ".\n"; + break; + case BackupAgentBase::STATE_RUNNING: + statusText += "The backup on tag `" + tagName + "' is in progress to " + bc->getURL() + ".\n"; + snapshotProgress = true; + break; + case BackupAgentBase::STATE_RUNNING_DIFFERENTIAL: + statusText += "The backup on tag `" + tagName + "' is restorable but continuing to " + + bc->getURL() + ".\n"; + snapshotProgress = true; + break; + case BackupAgentBase::STATE_COMPLETED: + statusText += "The previous backup on tag `" + tagName + "' at " + bc->getURL() + + " completed at version " + format("%lld", latestRestorableVersion.orDefault(-1)) + + ".\n"; + break; + default: + statusText += "The previous backup on tag `" + tagName + "' at " + bc->getURL() + " " + + backupStatus + ".\n"; + break; } statusText += format("BackupUID: %s\n", uidAndAbortedFlag.get().first.toString().c_str()); statusText += format("BackupURL: %s\n", bc->getURL().c_str()); - if(snapshotProgress) { + if (snapshotProgress) { state int64_t snapshotInterval; state Version snapshotBeginVersion; state Version snapshotTargetEndVersion; @@ -4112,82 +4597,99 @@ public: state Optional snapshotTargetEndVersionTimestamp; state bool stopWhenDone; - wait( store(snapshotBeginVersion, config.snapshotBeginVersion().getOrThrow(tr)) - && store(snapshotTargetEndVersion, config.snapshotTargetEndVersion().getOrThrow(tr)) - && store(snapshotInterval, config.snapshotIntervalSeconds().getOrThrow(tr)) - && store(logBytesWritten, config.logBytesWritten().get(tr)) - && store(rangeBytesWritten, config.rangeBytesWritten().get(tr)) - && store(latestLogEndVersion, config.latestLogEndVersion().get(tr)) - && store(latestSnapshotEndVersion, config.latestSnapshotEndVersion().get(tr)) - && store(stopWhenDone, config.stopWhenDone().getOrThrow(tr)) - ); + wait(store(snapshotBeginVersion, config.snapshotBeginVersion().getOrThrow(tr)) && + store(snapshotTargetEndVersion, config.snapshotTargetEndVersion().getOrThrow(tr)) && + store(snapshotInterval, config.snapshotIntervalSeconds().getOrThrow(tr)) && + store(logBytesWritten, config.logBytesWritten().get(tr)) && + store(rangeBytesWritten, config.rangeBytesWritten().get(tr)) && + store(latestLogEndVersion, config.latestLogEndVersion().get(tr)) && + store(latestSnapshotEndVersion, config.latestSnapshotEndVersion().get(tr)) && + store(stopWhenDone, config.stopWhenDone().getOrThrow(tr))); - wait( store(latestSnapshotEndVersionTimestamp, getTimestampFromVersion(latestSnapshotEndVersion, tr)) - && store(latestLogEndVersionTimestamp, getTimestampFromVersion(latestLogEndVersion, tr)) - && store(snapshotBeginVersionTimestamp, timeKeeperEpochsFromVersion(snapshotBeginVersion, tr)) - && store(snapshotTargetEndVersionTimestamp, timeKeeperEpochsFromVersion(snapshotTargetEndVersion, tr)) - ); + wait(store(latestSnapshotEndVersionTimestamp, + getTimestampFromVersion(latestSnapshotEndVersion, tr)) && + store(latestLogEndVersionTimestamp, getTimestampFromVersion(latestLogEndVersion, tr)) && + store(snapshotBeginVersionTimestamp, + timeKeeperEpochsFromVersion(snapshotBeginVersion, tr)) && + store(snapshotTargetEndVersionTimestamp, + timeKeeperEpochsFromVersion(snapshotTargetEndVersion, tr))); statusText += format("Snapshot interval is %lld seconds. 
", snapshotInterval); - if(backupState == BackupAgentBase::STATE_RUNNING_DIFFERENTIAL) - statusText += format("Current snapshot progress target is %3.2f%% (>100%% means the snapshot is supposed to be done)\n", 100.0 * (recentReadVersion - snapshotBeginVersion) / (snapshotTargetEndVersion - snapshotBeginVersion)) ; + if (backupState == BackupAgentBase::STATE_RUNNING_DIFFERENTIAL) + statusText += format("Current snapshot progress target is %3.2f%% (>100%% means the " + "snapshot is supposed to be done)\n", + 100.0 * (recentReadVersion - snapshotBeginVersion) / + (snapshotTargetEndVersion - snapshotBeginVersion)); else statusText += "The initial snapshot is still running.\n"; - + statusText += format("\nDetails:\n LogBytes written - %ld\n RangeBytes written - %ld\n " - "Last complete log version and timestamp - %s, %s\n " - "Last complete snapshot version and timestamp - %s, %s\n " - "Current Snapshot start version and timestamp - %s, %s\n " - "Expected snapshot end version and timestamp - %s, %s\n " - "Backup supposed to stop at next snapshot completion - %s\n", - logBytesWritten.orDefault(0), rangeBytesWritten.orDefault(0), - versionToString(latestLogEndVersion).c_str(), timeStampToString(latestLogEndVersionTimestamp).c_str(), - versionToString(latestSnapshotEndVersion).c_str(), timeStampToString(latestSnapshotEndVersionTimestamp).c_str(), - versionToString(snapshotBeginVersion).c_str(), timeStampToString(snapshotBeginVersionTimestamp).c_str(), - versionToString(snapshotTargetEndVersion).c_str(), timeStampToString(snapshotTargetEndVersionTimestamp).c_str(), - boolToYesOrNo(stopWhenDone).c_str()); + "Last complete log version and timestamp - %s, %s\n " + "Last complete snapshot version and timestamp - %s, %s\n " + "Current Snapshot start version and timestamp - %s, %s\n " + "Expected snapshot end version and timestamp - %s, %s\n " + "Backup supposed to stop at next snapshot completion - %s\n", + logBytesWritten.orDefault(0), + rangeBytesWritten.orDefault(0), + versionToString(latestLogEndVersion).c_str(), + timeStampToString(latestLogEndVersionTimestamp).c_str(), + versionToString(latestSnapshotEndVersion).c_str(), + timeStampToString(latestSnapshotEndVersionTimestamp).c_str(), + versionToString(snapshotBeginVersion).c_str(), + timeStampToString(snapshotBeginVersionTimestamp).c_str(), + versionToString(snapshotTargetEndVersion).c_str(), + timeStampToString(snapshotTargetEndVersionTimestamp).c_str(), + boolToYesOrNo(stopWhenDone).c_str()); } // Append the errors, if requested if (showErrors) { - KeyBackedMap>::PairsType errors = wait(config.lastErrorPerType().getRange(tr, 0, std::numeric_limits::max(), CLIENT_KNOBS->TOO_MANY)); + KeyBackedMap>::PairsType errors = + wait(config.lastErrorPerType().getRange( + tr, 0, std::numeric_limits::max(), CLIENT_KNOBS->TOO_MANY)); std::string recentErrors; std::string pastErrors; - for(auto &e : errors) { + for (auto& e : errors) { Version v = e.second.second; - std::string msg = format("%s ago : %s\n", secondsToTimeFormat((recentReadVersion - v) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND).c_str(), e.second.first.c_str()); + std::string msg = format( + "%s ago : %s\n", + secondsToTimeFormat((recentReadVersion - v) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND) + .c_str(), + e.second.first.c_str()); - // If error version is at or more recent than the latest restorable version then it could be inhibiting progress - if(v >= latestRestorableVersion.orDefault(0)) { + // If error version is at or more recent than the latest restorable version then it could be + // 
inhibiting progress + if (v >= latestRestorableVersion.orDefault(0)) { recentErrors += msg; - } - else { + } else { pastErrors += msg; } } if (!recentErrors.empty()) { if (latestRestorableVersion.present()) - statusText += format("Recent Errors (since latest restorable point %s ago)\n", - secondsToTimeFormat((recentReadVersion - latestRestorableVersion.get()) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND).c_str()) - + recentErrors; + statusText += + format("Recent Errors (since latest restorable point %s ago)\n", + secondsToTimeFormat((recentReadVersion - latestRestorableVersion.get()) / + CLIENT_KNOBS->CORE_VERSIONSPERSECOND) + .c_str()) + + recentErrors; else statusText += "Recent Errors (since initialization)\n" + recentErrors; } - if(!pastErrors.empty()) + if (!pastErrors.empty()) statusText += "Older Errors\n" + pastErrors; } } Optional paused = wait(fPaused); - if(paused.present()) { + if (paused.present()) { statusText += format("\nAll backup agents have been paused.\n"); } break; - } - catch (Error &e) { + } catch (Error& e) { wait(tr->onError(e)); } } @@ -4195,7 +4697,10 @@ public: return statusText; } - ACTOR static Future getLastRestorable(FileBackupAgent* backupAgent, Reference tr, Key tagName, bool snapshot) { + ACTOR static Future getLastRestorable(FileBackupAgent* backupAgent, + Reference tr, + Key tagName, + bool snapshot) { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); state Optional version = wait(tr->get(backupAgent->lastRestorable.pack(tagName), snapshot)); @@ -4204,36 +4709,52 @@ public: } static StringRef read(StringRef& data, int bytes) { - if (bytes > data.size()) throw restore_error(); + if (bytes > data.size()) + throw restore_error(); StringRef r = data.substr(0, bytes); data = data.substr(bytes); return r; } - ACTOR static Future restore(FileBackupAgent* backupAgent, Database cx, Optional cxOrig, Key tagName, Key url, Standalone> ranges, bool waitForComplete, Version targetVersion, bool verbose, Key addPrefix, Key removePrefix, bool lockDB, UID randomUid) { + ACTOR static Future restore(FileBackupAgent* backupAgent, + Database cx, + Optional cxOrig, + Key tagName, + Key url, + Standalone> ranges, + bool waitForComplete, + Version targetVersion, + bool verbose, + Key addPrefix, + Key removePrefix, + bool lockDB, + UID randomUid) { state Reference bc = IBackupContainer::openContainer(url.toString()); state BackupDescription desc = wait(bc->describeBackup()); - if(cxOrig.present()) { + if (cxOrig.present()) { wait(desc.resolveVersionTimes(cxOrig.get())); } printf("Backup Description\n%s", desc.toString().c_str()); - if(targetVersion == invalidVersion && desc.maxRestorableVersion.present()) + if (targetVersion == invalidVersion && desc.maxRestorableVersion.present()) targetVersion = desc.maxRestorableVersion.get(); Optional restoreSet = wait(bc->getRestoreSet(targetVersion)); - if(!restoreSet.present()) { + if (!restoreSet.present()) { TraceEvent(SevWarn, "FileBackupAgentRestoreNotPossible") - .detail("BackupContainer", bc->getURL()) - .detail("TargetVersion", targetVersion); - fprintf(stderr, "ERROR: Restore version %" PRId64 " is not possible from %s\n", targetVersion, bc->getURL().c_str()); + .detail("BackupContainer", bc->getURL()) + .detail("TargetVersion", targetVersion); + fprintf(stderr, + "ERROR: Restore version %" PRId64 " is not possible from %s\n", + targetVersion, + bc->getURL().c_str()); throw restore_invalid_version(); } if (verbose) { - printf("Restoring backup to version: %lld\n", (long long) 
targetVersion); + printf("Restoring backup to version: %lld\n", (long long)targetVersion); } state Reference tr(new ReadYourWritesTransaction(cx)); @@ -4241,30 +4762,37 @@ public: try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); - wait(submitRestore(backupAgent, tr, tagName, url, ranges, targetVersion, addPrefix, removePrefix, lockDB, randomUid)); + wait(submitRestore( + backupAgent, tr, tagName, url, ranges, targetVersion, addPrefix, removePrefix, lockDB, randomUid)); wait(tr->commit()); break; - } catch(Error &e) { - if(e.code() == error_code_restore_duplicate_tag) { + } catch (Error& e) { + if (e.code() == error_code_restore_duplicate_tag) { throw; } wait(tr->onError(e)); } } - if(waitForComplete) { + if (waitForComplete) { ERestoreState finalState = wait(waitRestore(cx, tagName, verbose)); - if(finalState != ERestoreState::COMPLETED) + if (finalState != ERestoreState::COMPLETED) throw restore_error(); } return targetVersion; } - //used for correctness only, locks the database before discontinuing the backup and that same lock is then used while doing the restore. - //the tagname of the backup must be the same as the restore. - ACTOR static Future atomicRestore(FileBackupAgent* backupAgent, Database cx, Key tagName, Standalone> ranges, Key addPrefix, Key removePrefix) { - state Reference ryw_tr = Reference(new ReadYourWritesTransaction(cx)); + // used for correctness only, locks the database before discontinuing the backup and that same lock is then used + // while doing the restore. the tagname of the backup must be the same as the restore. + ACTOR static Future atomicRestore(FileBackupAgent* backupAgent, + Database cx, + Key tagName, + Standalone> ranges, + Key addPrefix, + Key removePrefix) { + state Reference ryw_tr = + Reference(new ReadYourWritesTransaction(cx)); state BackupConfig backupConfig; loop { try { @@ -4275,17 +4803,17 @@ public: backupConfig = BackupConfig(uidFlag.first); state EBackupState status = wait(backupConfig.stateEnum().getOrThrow(ryw_tr)); - if (status != BackupAgentBase::STATE_RUNNING_DIFFERENTIAL ) { + if (status != BackupAgentBase::STATE_RUNNING_DIFFERENTIAL) { throw backup_duplicate(); } break; - } catch( Error &e ) { - wait( ryw_tr->onError(e) ); + } catch (Error& e) { + wait(ryw_tr->onError(e)); } } - - //Lock src, record commit version + + // Lock src, record commit version state Transaction tr(cx); state Version commitVersion; state UID randomUid = deterministicRandom()->randomUniqueID(); @@ -4294,12 +4822,12 @@ public: // We must get a commit version so add a conflict range that won't likely cause conflicts // but will ensure that the transaction is actually submitted. 
tr.addWriteConflictRange(backupConfig.snapshotRangeDispatchMap().space.range()); - wait( lockDatabase(&tr, randomUid) ); + wait(lockDatabase(&tr, randomUid)); wait(tr.commit()); commitVersion = tr.getCommittedVersion(); TraceEvent("AS_Locked").detail("CommitVer", commitVersion); break; - } catch( Error &e ) { + } catch (Error& e) { wait(tr.onError(e)); } } @@ -4307,35 +4835,35 @@ public: ryw_tr->reset(); loop { try { - Optional restoreVersion = wait( backupConfig.getLatestRestorableVersion(ryw_tr) ); - if(restoreVersion.present() && restoreVersion.get() >= commitVersion) { + Optional restoreVersion = wait(backupConfig.getLatestRestorableVersion(ryw_tr)); + if (restoreVersion.present() && restoreVersion.get() >= commitVersion) { TraceEvent("AS_RestoreVersion").detail("RestoreVer", restoreVersion.get()); break; } else { ryw_tr->reset(); wait(delay(0.2)); } - } catch( Error &e ) { - wait( ryw_tr->onError(e) ); + } catch (Error& e) { + wait(ryw_tr->onError(e)); } } ryw_tr->reset(); loop { try { - wait( discontinueBackup(backupAgent, ryw_tr, tagName) ); - wait( ryw_tr->commit() ); + wait(discontinueBackup(backupAgent, ryw_tr, tagName)); + wait(ryw_tr->commit()); TraceEvent("AS_DiscontinuedBackup"); break; - } catch( Error &e ) { - if(e.code() == error_code_backup_unneeded || e.code() == error_code_backup_duplicate){ + } catch (Error& e) { + if (e.code() == error_code_backup_unneeded || e.code() == error_code_backup_duplicate) { break; } - wait( ryw_tr->onError(e) ); + wait(ryw_tr->onError(e)); } } - wait(success( waitBackup(backupAgent, cx, tagName.toString(), true) )); + wait(success(waitBackup(backupAgent, cx, tagName.toString(), true))); TraceEvent("AS_BackupStopped"); ryw_tr->reset(); @@ -4343,22 +4871,34 @@ public: try { ryw_tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); ryw_tr->setOption(FDBTransactionOptions::LOCK_AWARE); - for (auto &range : ranges) { + for (auto& range : ranges) { ryw_tr->addReadConflictRange(range); ryw_tr->clear(range); } - wait( ryw_tr->commit() ); + wait(ryw_tr->commit()); TraceEvent("AS_ClearedRange"); break; - } catch( Error &e ) { - wait( ryw_tr->onError(e) ); + } catch (Error& e) { + wait(ryw_tr->onError(e)); } } Reference bc = wait(backupConfig.backupContainer().getOrThrow(cx)); TraceEvent("AS_StartRestore"); - Version ver = wait( restore(backupAgent, cx, cx, tagName, KeyRef(bc->getURL()), ranges, true, -1, true, addPrefix, removePrefix, true, randomUid) ); + Version ver = wait(restore(backupAgent, + cx, + cx, + tagName, + KeyRef(bc->getURL()), + ranges, + true, + -1, + true, + addPrefix, + removePrefix, + true, + randomUid)); return ver; } }; @@ -4367,11 +4907,37 @@ const std::string BackupAgentBase::defaultTagName = "default"; const int BackupAgentBase::logHeaderSize = 12; const int FileBackupAgent::dataFooterSize = 20; -Future FileBackupAgent::restore(Database cx, Optional cxOrig, Key tagName, Key url, Standalone> ranges, bool waitForComplete, Version targetVersion, bool verbose, Key addPrefix, Key removePrefix, bool lockDB) { - return FileBackupAgentImpl::restore(this, cx, cxOrig, tagName, url, ranges, waitForComplete, targetVersion, verbose, addPrefix, removePrefix, lockDB, deterministicRandom()->randomUniqueID()); +Future FileBackupAgent::restore(Database cx, + Optional cxOrig, + Key tagName, + Key url, + Standalone> ranges, + bool waitForComplete, + Version targetVersion, + bool verbose, + Key addPrefix, + Key removePrefix, + bool lockDB) { + return FileBackupAgentImpl::restore(this, + cx, + cxOrig, + tagName, + url, + ranges, + 
waitForComplete, + targetVersion, + verbose, + addPrefix, + removePrefix, + lockDB, + deterministicRandom()->randomUniqueID()); } -Future FileBackupAgent::atomicRestore(Database cx, Key tagName, Standalone> ranges, Key addPrefix, Key removePrefix) { +Future FileBackupAgent::atomicRestore(Database cx, + Key tagName, + Standalone> ranges, + Key addPrefix, + Key removePrefix) { return FileBackupAgentImpl::atomicRestore(this, cx, tagName, ranges, addPrefix, removePrefix); } @@ -4391,15 +4957,21 @@ Future FileBackupAgent::waitRestore(Database cx, Key tagName, boo return FileBackupAgentImpl::waitRestore(cx, tagName, verbose); }; -Future FileBackupAgent::submitBackup(Reference tr, Key outContainer, int snapshotIntervalSeconds, std::string tagName, Standalone> backupRanges, bool stopWhenDone) { - return FileBackupAgentImpl::submitBackup(this, tr, outContainer, snapshotIntervalSeconds, tagName, backupRanges, stopWhenDone); +Future FileBackupAgent::submitBackup(Reference tr, + Key outContainer, + int snapshotIntervalSeconds, + std::string tagName, + Standalone> backupRanges, + bool stopWhenDone) { + return FileBackupAgentImpl::submitBackup( + this, tr, outContainer, snapshotIntervalSeconds, tagName, backupRanges, stopWhenDone); } -Future FileBackupAgent::discontinueBackup(Reference tr, Key tagName){ +Future FileBackupAgent::discontinueBackup(Reference tr, Key tagName) { return FileBackupAgentImpl::discontinueBackup(this, tr, tagName); } -Future FileBackupAgent::abortBackup(Reference tr, std::string tagName){ +Future FileBackupAgent::abortBackup(Reference tr, std::string tagName) { return FileBackupAgentImpl::abortBackup(this, tr, tagName); } @@ -4411,7 +4983,9 @@ Future FileBackupAgent::getStatusJSON(Database cx, std::string tagN return FileBackupAgentImpl::getStatusJSON(this, cx, tagName); } -Future FileBackupAgent::getLastRestorable(Reference tr, Key tagName, bool snapshot) { +Future FileBackupAgent::getLastRestorable(Reference tr, + Key tagName, + bool snapshot) { return FileBackupAgentImpl::getLastRestorable(this, tr, tagName, snapshot); } @@ -4421,7 +4995,10 @@ void FileBackupAgent::setLastRestorable(Reference tr, tr->set(lastRestorable.pack(tagName), BinaryWriter::toValue(version, Unversioned())); } -Future FileBackupAgent::waitBackup(Database cx, std::string tagName, bool stopWhenDone, Reference *pContainer, UID *pUID) { +Future FileBackupAgent::waitBackup(Database cx, + std::string tagName, + bool stopWhenDone, + Reference* pContainer, + UID* pUID) { return FileBackupAgentImpl::waitBackup(this, cx, tagName, stopWhenDone, pContainer, pUID); } - diff --git a/fdbclient/Knobs.cpp b/fdbclient/Knobs.cpp index 9b4156cd72..e43d74f975 100644 --- a/fdbclient/Knobs.cpp +++ b/fdbclient/Knobs.cpp @@ -24,176 +24,237 @@ ClientKnobs const* CLIENT_KNOBS = new ClientKnobs(); -#define init( knob, value ) initKnob( knob, value, #knob ) +#define init(knob, value) initKnob(knob, value, #knob) ClientKnobs::ClientKnobs(bool randomize) { // FIXME: These are not knobs, get them out of ClientKnobs! 
BYTE_LIMIT_UNLIMITED = GetRangeLimits::BYTE_LIMIT_UNLIMITED; ROW_LIMIT_UNLIMITED = GetRangeLimits::ROW_LIMIT_UNLIMITED; - init( TOO_MANY, 1000000 ); + init(TOO_MANY, 1000000); - init( SYSTEM_MONITOR_INTERVAL, 5.0 ); + init(SYSTEM_MONITOR_INTERVAL, 5.0); - init( FAILURE_MAX_DELAY, 5.0 ); - init( FAILURE_MIN_DELAY, 4.0 ); if( randomize && BUGGIFY ) FAILURE_MIN_DELAY = 1.0; - init( FAILURE_TIMEOUT_DELAY, FAILURE_MIN_DELAY ); - init( CLIENT_FAILURE_TIMEOUT_DELAY, FAILURE_MIN_DELAY ); - init( FAILURE_EMERGENCY_DELAY, 30.0 ); - init( FAILURE_MAX_GENERATIONS, 10 ); - init( RECOVERY_DELAY_START_GENERATION, 70 ); - init( RECOVERY_DELAY_SECONDS_PER_GENERATION, 60.0 ); - init( MAX_GENERATIONS, 100 ); - init( MAX_GENERATIONS_OVERRIDE, 0 ); + init(FAILURE_MAX_DELAY, 5.0); + init(FAILURE_MIN_DELAY, 4.0); + if (randomize && BUGGIFY) + FAILURE_MIN_DELAY = 1.0; + init(FAILURE_TIMEOUT_DELAY, FAILURE_MIN_DELAY); + init(CLIENT_FAILURE_TIMEOUT_DELAY, FAILURE_MIN_DELAY); + init(FAILURE_EMERGENCY_DELAY, 30.0); + init(FAILURE_MAX_GENERATIONS, 10); + init(RECOVERY_DELAY_START_GENERATION, 70); + init(RECOVERY_DELAY_SECONDS_PER_GENERATION, 60.0); + init(MAX_GENERATIONS, 100); + init(MAX_GENERATIONS_OVERRIDE, 0); - init( COORDINATOR_RECONNECTION_DELAY, 1.0 ); - init( CLIENT_EXAMPLE_AMOUNT, 20 ); - init( MAX_CLIENT_STATUS_AGE, 1.0 ); - init( MAX_PROXY_CONNECTIONS, 5 ); if( randomize && BUGGIFY ) MAX_PROXY_CONNECTIONS = 1; - init( STATUS_IDLE_TIMEOUT, 120.0 ); + init(COORDINATOR_RECONNECTION_DELAY, 1.0); + init(CLIENT_EXAMPLE_AMOUNT, 20); + init(MAX_CLIENT_STATUS_AGE, 1.0); + init(MAX_PROXY_CONNECTIONS, 5); + if (randomize && BUGGIFY) + MAX_PROXY_CONNECTIONS = 1; + init(STATUS_IDLE_TIMEOUT, 120.0); // wrong_shard_server sometimes comes from the only nonfailed server, so we need to avoid a fast spin - init( WRONG_SHARD_SERVER_DELAY, .01 ); if( randomize && BUGGIFY ) WRONG_SHARD_SERVER_DELAY = deterministicRandom()->random01(); // FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is mostly wrong (e.g. dumping the database after a test) - init( FUTURE_VERSION_RETRY_DELAY, .01 ); if( randomize && BUGGIFY ) FUTURE_VERSION_RETRY_DELAY = deterministicRandom()->random01();// FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; - init( REPLY_BYTE_LIMIT, 80000 ); - init( DEFAULT_BACKOFF, .01 ); if( randomize && BUGGIFY ) DEFAULT_BACKOFF = deterministicRandom()->random01(); - init( DEFAULT_MAX_BACKOFF, 1.0 ); - init( BACKOFF_GROWTH_RATE, 2.0 ); - init( RESOURCE_CONSTRAINED_MAX_BACKOFF, 30.0 ); - init( PROXY_COMMIT_OVERHEAD_BYTES, 23 ); //The size of serializing 7 tags (3 primary, 3 remote, 1 log router) + 2 for the tag length + init(WRONG_SHARD_SERVER_DELAY, .01); + if (randomize && BUGGIFY) + WRONG_SHARD_SERVER_DELAY = + deterministicRandom() + ->random01(); // FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; // SOMEDAY: This delay can limit performance of + // retrieving data when the cache is mostly wrong (e.g. 
dumping the database after a test) + init(FUTURE_VERSION_RETRY_DELAY, .01); + if (randomize && BUGGIFY) + FUTURE_VERSION_RETRY_DELAY = deterministicRandom()->random01(); // FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; + init(REPLY_BYTE_LIMIT, 80000); + init(DEFAULT_BACKOFF, .01); + if (randomize && BUGGIFY) + DEFAULT_BACKOFF = deterministicRandom()->random01(); + init(DEFAULT_MAX_BACKOFF, 1.0); + init(BACKOFF_GROWTH_RATE, 2.0); + init(RESOURCE_CONSTRAINED_MAX_BACKOFF, 30.0); + init(PROXY_COMMIT_OVERHEAD_BYTES, + 23); // The size of serializing 7 tags (3 primary, 3 remote, 1 log router) + 2 for the tag length - init( TRANSACTION_SIZE_LIMIT, 1e7 ); - init( KEY_SIZE_LIMIT, 1e4 ); - init( SYSTEM_KEY_SIZE_LIMIT, 3e4 ); - init( VALUE_SIZE_LIMIT, 1e5 ); - init( SPLIT_KEY_SIZE_LIMIT, KEY_SIZE_LIMIT/2 ); if( randomize && BUGGIFY ) SPLIT_KEY_SIZE_LIMIT = KEY_SIZE_LIMIT - 31;//serverKeysPrefixFor(UID()).size() - 1; - init( METADATA_VERSION_CACHE_SIZE, 1000 ); + init(TRANSACTION_SIZE_LIMIT, 1e7); + init(KEY_SIZE_LIMIT, 1e4); + init(SYSTEM_KEY_SIZE_LIMIT, 3e4); + init(VALUE_SIZE_LIMIT, 1e5); + init(SPLIT_KEY_SIZE_LIMIT, KEY_SIZE_LIMIT / 2); + if (randomize && BUGGIFY) + SPLIT_KEY_SIZE_LIMIT = KEY_SIZE_LIMIT - 31; // serverKeysPrefixFor(UID()).size() - 1; + init(METADATA_VERSION_CACHE_SIZE, 1000); - init( MAX_BATCH_SIZE, 1000 ); if( randomize && BUGGIFY ) MAX_BATCH_SIZE = 1; - init( GRV_BATCH_TIMEOUT, 0.005 ); if( randomize && BUGGIFY ) GRV_BATCH_TIMEOUT = 0.1; - init( BROADCAST_BATCH_SIZE, 20 ); if( randomize && BUGGIFY ) BROADCAST_BATCH_SIZE = 1; - init( TRANSACTION_TIMEOUT_DELAY_INTERVAL, 10.0 ); if( randomize && BUGGIFY ) TRANSACTION_TIMEOUT_DELAY_INTERVAL = 1.0; + init(MAX_BATCH_SIZE, 1000); + if (randomize && BUGGIFY) + MAX_BATCH_SIZE = 1; + init(GRV_BATCH_TIMEOUT, 0.005); + if (randomize && BUGGIFY) + GRV_BATCH_TIMEOUT = 0.1; + init(BROADCAST_BATCH_SIZE, 20); + if (randomize && BUGGIFY) + BROADCAST_BATCH_SIZE = 1; + init(TRANSACTION_TIMEOUT_DELAY_INTERVAL, 10.0); + if (randomize && BUGGIFY) + TRANSACTION_TIMEOUT_DELAY_INTERVAL = 1.0; - init( LOCATION_CACHE_EVICTION_SIZE, 300000 ); - init( LOCATION_CACHE_EVICTION_SIZE_SIM, 10 ); if( randomize && BUGGIFY ) LOCATION_CACHE_EVICTION_SIZE_SIM = 3; + init(LOCATION_CACHE_EVICTION_SIZE, 300000); + init(LOCATION_CACHE_EVICTION_SIZE_SIM, 10); + if (randomize && BUGGIFY) + LOCATION_CACHE_EVICTION_SIZE_SIM = 3; - init( GET_RANGE_SHARD_LIMIT, 2 ); - init( WARM_RANGE_SHARD_LIMIT, 100 ); - init( STORAGE_METRICS_SHARD_LIMIT, 100 ); if( randomize && BUGGIFY ) STORAGE_METRICS_SHARD_LIMIT = 3; - init( SHARD_COUNT_LIMIT, 80 ); if( randomize && BUGGIFY ) SHARD_COUNT_LIMIT = 3; - init( STORAGE_METRICS_UNFAIR_SPLIT_LIMIT, 2.0/3.0 ); - init( STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, 15.0 ); - init( AGGREGATE_HEALTH_METRICS_MAX_STALENESS, 0.5 ); - init( DETAILED_HEALTH_METRICS_MAX_STALENESS, 5.0 ); + init(GET_RANGE_SHARD_LIMIT, 2); + init(WARM_RANGE_SHARD_LIMIT, 100); + init(STORAGE_METRICS_SHARD_LIMIT, 100); + if (randomize && BUGGIFY) + STORAGE_METRICS_SHARD_LIMIT = 3; + init(SHARD_COUNT_LIMIT, 80); + if (randomize && BUGGIFY) + SHARD_COUNT_LIMIT = 3; + init(STORAGE_METRICS_UNFAIR_SPLIT_LIMIT, 2.0 / 3.0); + init(STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, 15.0); + init(AGGREGATE_HEALTH_METRICS_MAX_STALENESS, 0.5); + init(DETAILED_HEALTH_METRICS_MAX_STALENESS, 5.0); - //KeyRangeMap - init( KRM_GET_RANGE_LIMIT, 1e5 ); if( randomize && BUGGIFY ) KRM_GET_RANGE_LIMIT = 10; - init( KRM_GET_RANGE_LIMIT_BYTES, 1e8 ); if( randomize && BUGGIFY ) KRM_GET_RANGE_LIMIT_BYTES = 10000; //This must be 
sufficiently larger than KEY_SIZE_LIMIT to ensure that at least two entries will be returned from an attempt to read a key range map + // KeyRangeMap + init(KRM_GET_RANGE_LIMIT, 1e5); + if (randomize && BUGGIFY) + KRM_GET_RANGE_LIMIT = 10; + init(KRM_GET_RANGE_LIMIT_BYTES, 1e8); + if (randomize && BUGGIFY) + KRM_GET_RANGE_LIMIT_BYTES = 10000; // This must be sufficiently larger than KEY_SIZE_LIMIT to ensure that at + // least two entries will be returned from an attempt to read a key range map - init( DEFAULT_MAX_OUTSTANDING_WATCHES, 1e4 ); - init( ABSOLUTE_MAX_WATCHES, 1e6 ); - init( WATCH_POLLING_TIME, 1.0 ); if( randomize && BUGGIFY ) WATCH_POLLING_TIME = 5.0; - init( NO_RECENT_UPDATES_DURATION, 20.0 ); if( randomize && BUGGIFY ) NO_RECENT_UPDATES_DURATION = 0.1; - init( FAST_WATCH_TIMEOUT, 20.0 ); if( randomize && BUGGIFY ) FAST_WATCH_TIMEOUT = 1.0; - init( WATCH_TIMEOUT, 900.0 ); if( randomize && BUGGIFY ) WATCH_TIMEOUT = 20.0; + init(DEFAULT_MAX_OUTSTANDING_WATCHES, 1e4); + init(ABSOLUTE_MAX_WATCHES, 1e6); + init(WATCH_POLLING_TIME, 1.0); + if (randomize && BUGGIFY) + WATCH_POLLING_TIME = 5.0; + init(NO_RECENT_UPDATES_DURATION, 20.0); + if (randomize && BUGGIFY) + NO_RECENT_UPDATES_DURATION = 0.1; + init(FAST_WATCH_TIMEOUT, 20.0); + if (randomize && BUGGIFY) + FAST_WATCH_TIMEOUT = 1.0; + init(WATCH_TIMEOUT, 900.0); + if (randomize && BUGGIFY) + WATCH_TIMEOUT = 20.0; // Core - init( CORE_VERSIONSPERSECOND, 1e6 ); - init( LOG_RANGE_BLOCK_SIZE, 1e6 ); //Dependent on CORE_VERSIONSPERSECOND - init( MUTATION_BLOCK_SIZE, 10000 ); + init(CORE_VERSIONSPERSECOND, 1e6); + init(LOG_RANGE_BLOCK_SIZE, 1e6); // Dependent on CORE_VERSIONSPERSECOND + init(MUTATION_BLOCK_SIZE, 10000); // TaskBucket - init( TASKBUCKET_LOGGING_DELAY, 5.0 ); - init( TASKBUCKET_MAX_PRIORITY, 1 ); - init( TASKBUCKET_CHECK_TIMEOUT_CHANCE, 0.02 ); if( randomize && BUGGIFY ) TASKBUCKET_CHECK_TIMEOUT_CHANCE = 1.0; - init( TASKBUCKET_TIMEOUT_JITTER_OFFSET, 0.9 ); - init( TASKBUCKET_TIMEOUT_JITTER_RANGE, 0.2 ); - init( TASKBUCKET_CHECK_ACTIVE_DELAY, 0.5 ); - init( TASKBUCKET_CHECK_ACTIVE_AMOUNT, 10 ); - init( TASKBUCKET_TIMEOUT_VERSIONS, 60*CORE_VERSIONSPERSECOND ); if( randomize && BUGGIFY ) TASKBUCKET_TIMEOUT_VERSIONS = 30*CORE_VERSIONSPERSECOND; - init( TASKBUCKET_MAX_TASK_KEYS, 1000 ); if( randomize && BUGGIFY ) TASKBUCKET_MAX_TASK_KEYS = 20; + init(TASKBUCKET_LOGGING_DELAY, 5.0); + init(TASKBUCKET_MAX_PRIORITY, 1); + init(TASKBUCKET_CHECK_TIMEOUT_CHANCE, 0.02); + if (randomize && BUGGIFY) + TASKBUCKET_CHECK_TIMEOUT_CHANCE = 1.0; + init(TASKBUCKET_TIMEOUT_JITTER_OFFSET, 0.9); + init(TASKBUCKET_TIMEOUT_JITTER_RANGE, 0.2); + init(TASKBUCKET_CHECK_ACTIVE_DELAY, 0.5); + init(TASKBUCKET_CHECK_ACTIVE_AMOUNT, 10); + init(TASKBUCKET_TIMEOUT_VERSIONS, 60 * CORE_VERSIONSPERSECOND); + if (randomize && BUGGIFY) + TASKBUCKET_TIMEOUT_VERSIONS = 30 * CORE_VERSIONSPERSECOND; + init(TASKBUCKET_MAX_TASK_KEYS, 1000); + if (randomize && BUGGIFY) + TASKBUCKET_MAX_TASK_KEYS = 20; - //Backup - init( BACKUP_LOCAL_FILE_WRITE_BLOCK, 1024*1024 ); - init( BACKUP_CONCURRENT_DELETES, 100 ); - init( BACKUP_SIMULATED_LIMIT_BYTES, 1e6 ); if( randomize && BUGGIFY ) BACKUP_SIMULATED_LIMIT_BYTES = 1000; - init( BACKUP_GET_RANGE_LIMIT_BYTES, 1e6 ); - init( BACKUP_LOCK_BYTES, 1e8 ); - init( BACKUP_RANGE_TIMEOUT, TASKBUCKET_TIMEOUT_VERSIONS/CORE_VERSIONSPERSECOND/2.0 ); - init( BACKUP_RANGE_MINWAIT, std::max(1.0, BACKUP_RANGE_TIMEOUT/2.0)); - init( BACKUP_SNAPSHOT_DISPATCH_INTERVAL_SEC, 10 * 60 ); // 10 minutes - init( BACKUP_DEFAULT_SNAPSHOT_INTERVAL_SEC, 3600 * 
24 * 10); // 10 days - init( BACKUP_SHARD_TASK_LIMIT, 1000 ); if( randomize && BUGGIFY ) BACKUP_SHARD_TASK_LIMIT = 4; - init( BACKUP_AGGREGATE_POLL_RATE_UPDATE_INTERVAL, 60); - init( BACKUP_AGGREGATE_POLL_RATE, 2.0 ); // polls per second target for all agents on the cluster - init( BACKUP_LOG_WRITE_BATCH_MAX_SIZE, 1e6 ); //Must be much smaller than TRANSACTION_SIZE_LIMIT - init( BACKUP_LOG_ATOMIC_OPS_SIZE, 1000 ); - init( BACKUP_OPERATION_COST_OVERHEAD, 50 ); - init( BACKUP_MAX_LOG_RANGES, 21 ); if( randomize && BUGGIFY ) BACKUP_MAX_LOG_RANGES = 4; - init( BACKUP_SIM_COPY_LOG_RANGES, 100 ); - init( BACKUP_VERSION_DELAY, 5*CORE_VERSIONSPERSECOND ); + // Backup + init(BACKUP_LOCAL_FILE_WRITE_BLOCK, 1024 * 1024); + init(BACKUP_CONCURRENT_DELETES, 100); + init(BACKUP_SIMULATED_LIMIT_BYTES, 1e6); + if (randomize && BUGGIFY) + BACKUP_SIMULATED_LIMIT_BYTES = 1000; + init(BACKUP_GET_RANGE_LIMIT_BYTES, 1e6); + init(BACKUP_LOCK_BYTES, 1e8); + init(BACKUP_RANGE_TIMEOUT, TASKBUCKET_TIMEOUT_VERSIONS / CORE_VERSIONSPERSECOND / 2.0); + init(BACKUP_RANGE_MINWAIT, std::max(1.0, BACKUP_RANGE_TIMEOUT / 2.0)); + init(BACKUP_SNAPSHOT_DISPATCH_INTERVAL_SEC, 10 * 60); // 10 minutes + init(BACKUP_DEFAULT_SNAPSHOT_INTERVAL_SEC, 3600 * 24 * 10); // 10 days + init(BACKUP_SHARD_TASK_LIMIT, 1000); + if (randomize && BUGGIFY) + BACKUP_SHARD_TASK_LIMIT = 4; + init(BACKUP_AGGREGATE_POLL_RATE_UPDATE_INTERVAL, 60); + init(BACKUP_AGGREGATE_POLL_RATE, 2.0); // polls per second target for all agents on the cluster + init(BACKUP_LOG_WRITE_BATCH_MAX_SIZE, 1e6); // Must be much smaller than TRANSACTION_SIZE_LIMIT + init(BACKUP_LOG_ATOMIC_OPS_SIZE, 1000); + init(BACKUP_OPERATION_COST_OVERHEAD, 50); + init(BACKUP_MAX_LOG_RANGES, 21); + if (randomize && BUGGIFY) + BACKUP_MAX_LOG_RANGES = 4; + init(BACKUP_SIM_COPY_LOG_RANGES, 100); + init(BACKUP_VERSION_DELAY, 5 * CORE_VERSIONSPERSECOND); bool buggifyMapLimits = randomize && BUGGIFY; - init( BACKUP_MAP_KEY_LOWER_LIMIT, 1e4 ); if( buggifyMapLimits ) BACKUP_MAP_KEY_LOWER_LIMIT = 4; - init( BACKUP_MAP_KEY_UPPER_LIMIT, 1e5 ); if( buggifyMapLimits ) BACKUP_MAP_KEY_UPPER_LIMIT = 30; - init( BACKUP_COPY_TASKS, 90 ); - init( BACKUP_BLOCK_SIZE, LOG_RANGE_BLOCK_SIZE/10 ); - init( BACKUP_TASKS_PER_AGENT, 10 ); - init( SIM_BACKUP_TASKS_PER_AGENT, 10 ); - init( BACKUP_RANGEFILE_BLOCK_SIZE, 1024 * 1024); - init( BACKUP_LOGFILE_BLOCK_SIZE, 1024 * 1024); - init( BACKUP_DISPATCH_ADDTASK_SIZE, 50 ); - init( RESTORE_DISPATCH_ADDTASK_SIZE, 150 ); - init( RESTORE_DISPATCH_BATCH_SIZE, 30000 ); if( randomize && BUGGIFY ) RESTORE_DISPATCH_BATCH_SIZE = 20; - init( RESTORE_WRITE_TX_SIZE, 256 * 1024 ); - init( APPLY_MAX_LOCK_BYTES, 1e9 ); - init( APPLY_MIN_LOCK_BYTES, 11e6 ); //Must be bigger than TRANSACTION_SIZE_LIMIT - init( APPLY_BLOCK_SIZE, LOG_RANGE_BLOCK_SIZE/5 ); - init( APPLY_MAX_DECAY_RATE, 0.99 ); - init( APPLY_MAX_INCREASE_FACTOR, 1.1 ); - init( BACKUP_ERROR_DELAY, 10.0 ); - init( BACKUP_STATUS_DELAY, 40.0 ); - init( BACKUP_STATUS_JITTER, 0.05 ); - init( MIN_CLEANUP_SECONDS, 3600.0 ); + init(BACKUP_MAP_KEY_LOWER_LIMIT, 1e4); + if (buggifyMapLimits) + BACKUP_MAP_KEY_LOWER_LIMIT = 4; + init(BACKUP_MAP_KEY_UPPER_LIMIT, 1e5); + if (buggifyMapLimits) + BACKUP_MAP_KEY_UPPER_LIMIT = 30; + init(BACKUP_COPY_TASKS, 90); + init(BACKUP_BLOCK_SIZE, LOG_RANGE_BLOCK_SIZE / 10); + init(BACKUP_TASKS_PER_AGENT, 10); + init(SIM_BACKUP_TASKS_PER_AGENT, 10); + init(BACKUP_RANGEFILE_BLOCK_SIZE, 1024 * 1024); + init(BACKUP_LOGFILE_BLOCK_SIZE, 1024 * 1024); + init(BACKUP_DISPATCH_ADDTASK_SIZE, 50); + 
init(RESTORE_DISPATCH_ADDTASK_SIZE, 150); + init(RESTORE_DISPATCH_BATCH_SIZE, 30000); + if (randomize && BUGGIFY) + RESTORE_DISPATCH_BATCH_SIZE = 20; + init(RESTORE_WRITE_TX_SIZE, 256 * 1024); + init(APPLY_MAX_LOCK_BYTES, 1e9); + init(APPLY_MIN_LOCK_BYTES, 11e6); // Must be bigger than TRANSACTION_SIZE_LIMIT + init(APPLY_BLOCK_SIZE, LOG_RANGE_BLOCK_SIZE / 5); + init(APPLY_MAX_DECAY_RATE, 0.99); + init(APPLY_MAX_INCREASE_FACTOR, 1.1); + init(BACKUP_ERROR_DELAY, 10.0); + init(BACKUP_STATUS_DELAY, 40.0); + init(BACKUP_STATUS_JITTER, 0.05); + init(MIN_CLEANUP_SECONDS, 3600.0); // Configuration - init( DEFAULT_AUTO_PROXIES, 3 ); - init( DEFAULT_AUTO_RESOLVERS, 1 ); - init( DEFAULT_AUTO_LOGS, 3 ); + init(DEFAULT_AUTO_PROXIES, 3); + init(DEFAULT_AUTO_RESOLVERS, 1); + init(DEFAULT_AUTO_LOGS, 3); - init( IS_ACCEPTABLE_DELAY, 1.5 ); + init(IS_ACCEPTABLE_DELAY, 1.5); - init( HTTP_READ_SIZE, 128*1024 ); - init( HTTP_SEND_SIZE, 32*1024 ); - init( HTTP_VERBOSE_LEVEL, 0 ); - init( HTTP_REQUEST_ID_HEADER, "" ); - init( BLOBSTORE_CONNECT_TRIES, 10 ); - init( BLOBSTORE_CONNECT_TIMEOUT, 10 ); - init( BLOBSTORE_MAX_CONNECTION_LIFE, 120 ); - init( BLOBSTORE_REQUEST_TRIES, 10 ); - init( BLOBSTORE_REQUEST_TIMEOUT, 60 ); + init(HTTP_READ_SIZE, 128 * 1024); + init(HTTP_SEND_SIZE, 32 * 1024); + init(HTTP_VERBOSE_LEVEL, 0); + init(HTTP_REQUEST_ID_HEADER, ""); + init(BLOBSTORE_CONNECT_TRIES, 10); + init(BLOBSTORE_CONNECT_TIMEOUT, 10); + init(BLOBSTORE_MAX_CONNECTION_LIFE, 120); + init(BLOBSTORE_REQUEST_TRIES, 10); + init(BLOBSTORE_REQUEST_TIMEOUT, 60); - init( BLOBSTORE_CONCURRENT_UPLOADS, BACKUP_TASKS_PER_AGENT*2 ); - init( BLOBSTORE_CONCURRENT_LISTS, 20 ); - init( BLOBSTORE_CONCURRENT_REQUESTS, BLOBSTORE_CONCURRENT_UPLOADS + BLOBSTORE_CONCURRENT_LISTS + 5); + init(BLOBSTORE_CONCURRENT_UPLOADS, BACKUP_TASKS_PER_AGENT * 2); + init(BLOBSTORE_CONCURRENT_LISTS, 20); + init(BLOBSTORE_CONCURRENT_REQUESTS, BLOBSTORE_CONCURRENT_UPLOADS + BLOBSTORE_CONCURRENT_LISTS + 5); - init( BLOBSTORE_CONCURRENT_WRITES_PER_FILE, 5 ); - init( BLOBSTORE_CONCURRENT_READS_PER_FILE, 3 ); - init( BLOBSTORE_READ_BLOCK_SIZE, 1024 * 1024 ); - init( BLOBSTORE_READ_AHEAD_BLOCKS, 0 ); - init( BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE, 2 ); - init( BLOBSTORE_MULTIPART_MAX_PART_SIZE, 20000000 ); - init( BLOBSTORE_MULTIPART_MIN_PART_SIZE, 5242880 ); + init(BLOBSTORE_CONCURRENT_WRITES_PER_FILE, 5); + init(BLOBSTORE_CONCURRENT_READS_PER_FILE, 3); + init(BLOBSTORE_READ_BLOCK_SIZE, 1024 * 1024); + init(BLOBSTORE_READ_AHEAD_BLOCKS, 0); + init(BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE, 2); + init(BLOBSTORE_MULTIPART_MAX_PART_SIZE, 20000000); + init(BLOBSTORE_MULTIPART_MIN_PART_SIZE, 5242880); // These are basically unlimited by default but can be used to reduce blob IO if needed - init( BLOBSTORE_REQUESTS_PER_SECOND, 200 ); - init( BLOBSTORE_MAX_SEND_BYTES_PER_SECOND, 1e9 ); - init( BLOBSTORE_MAX_RECV_BYTES_PER_SECOND, 1e9 ); + init(BLOBSTORE_REQUESTS_PER_SECOND, 200); + init(BLOBSTORE_MAX_SEND_BYTES_PER_SECOND, 1e9); + init(BLOBSTORE_MAX_RECV_BYTES_PER_SECOND, 1e9); - init( BLOBSTORE_LIST_REQUESTS_PER_SECOND, 200 ); - init( BLOBSTORE_WRITE_REQUESTS_PER_SECOND, 50 ); - init( BLOBSTORE_READ_REQUESTS_PER_SECOND, 100 ); - init( BLOBSTORE_DELETE_REQUESTS_PER_SECOND, 200 ); + init(BLOBSTORE_LIST_REQUESTS_PER_SECOND, 200); + init(BLOBSTORE_WRITE_REQUESTS_PER_SECOND, 50); + init(BLOBSTORE_READ_REQUESTS_PER_SECOND, 100); + init(BLOBSTORE_DELETE_REQUESTS_PER_SECOND, 200); // Client Status Info init(CSI_SAMPLING_PROBABILITY, -1.0); @@ -202,12 +263,12 @@ 
ClientKnobs::ClientKnobs(bool randomize) { CSI_SAMPLING_PROBABILITY = deterministicRandom()->random01() / 10; // rand range 0 - 0.1 CSI_SIZE_LIMIT = deterministicRandom()->randomInt(1024 * 1024, 100 * 1024 * 1024); // 1 MB - 100 MB } - init(CSI_STATUS_DELAY, 10.0 ); + init(CSI_STATUS_DELAY, 10.0); - init( CONSISTENCY_CHECK_RATE_LIMIT_MAX, 50e6 ); // Limit in per sec - init( CONSISTENCY_CHECK_ONE_ROUND_TARGET_COMPLETION_TIME, 7 * 24 * 60 * 60 ); // 7 days - - //fdbcli - init( CLI_CONNECT_PARALLELISM, 400 ); - init( CLI_CONNECT_TIMEOUT, 10.0 ); + init(CONSISTENCY_CHECK_RATE_LIMIT_MAX, 50e6); // Limit in per sec + init(CONSISTENCY_CHECK_ONE_ROUND_TARGET_COMPLETION_TIME, 7 * 24 * 60 * 60); // 7 days + + // fdbcli + init(CLI_CONNECT_PARALLELISM, 400); + init(CLI_CONNECT_TIMEOUT, 10.0); } diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 0c62c09fe9..9c129ddcfa 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -20,12 +20,13 @@ #pragma once -// When actually compiled (NO_INTELLISENSE), include the generated version of this file. In intellisense use the source version. +// When actually compiled (NO_INTELLISENSE), include the generated version of this file. In intellisense use the source +// version. #if defined(NO_INTELLISENSE) && !defined(FLOW_ASYNCFILENONDURABLE_ACTOR_G_H) - #define FLOW_ASYNCFILENONDURABLE_ACTOR_G_H - #include "fdbrpc/AsyncFileNonDurable.actor.g.h" +#define FLOW_ASYNCFILENONDURABLE_ACTOR_G_H +#include "fdbrpc/AsyncFileNonDurable.actor.g.h" #elif !defined(FLOW_ASYNCFILENONDURABLE_ACTOR_H) - #define FLOW_ASYNCFILENONDURABLE_ACTOR_H +#define FLOW_ASYNCFILENONDURABLE_ACTOR_H #include "flow/flow.h" #include "fdbrpc/IAsyncFile.h" @@ -33,153 +34,155 @@ #include "fdbrpc/simulator.h" #include "fdbrpc/TraceFileIO.h" #include "fdbrpc/RangeMap.h" -#include "flow/actorcompiler.h" // This must be the last #include. +#include "flow/actorcompiler.h" // This must be the last #include. 
#undef max #undef min -ACTOR Future sendOnProcess( ISimulator::ProcessInfo* process, Promise promise, TaskPriority taskID ); -ACTOR Future sendErrorOnProcess( ISimulator::ProcessInfo* process, Promise promise, Error e, TaskPriority taskID ); +ACTOR Future sendOnProcess(ISimulator::ProcessInfo* process, Promise promise, TaskPriority taskID); +ACTOR Future sendErrorOnProcess(ISimulator::ProcessInfo* process, + Promise promise, + Error e, + TaskPriority taskID); -ACTOR template -Future sendErrorOnShutdown( Future in ) { +ACTOR template +Future sendErrorOnShutdown(Future in) { choose { - when( wait(success( g_simulator.getCurrentProcess()->shutdownSignal.getFuture() )) ) { + when(wait(success(g_simulator.getCurrentProcess()->shutdownSignal.getFuture()))) { throw io_error().asInjectedFault(); } - when( T rep = wait( in ) ) { - return rep; - } + when(T rep = wait(in)) { return rep; } } } -class AsyncFileDetachable sealed : public IAsyncFile, public ReferenceCounted{ +class AsyncFileDetachable sealed : public IAsyncFile, public ReferenceCounted { private: Reference file; Future shutdown; public: - explicit AsyncFileDetachable( Reference file ) : file(file) { - shutdown = doShutdown(this); - } + explicit AsyncFileDetachable(Reference file) : file(file) { shutdown = doShutdown(this); } - ACTOR Future doShutdown( AsyncFileDetachable* self ) { - wait(success( g_simulator.getCurrentProcess()->shutdownSignal.getFuture() )); + ACTOR Future doShutdown(AsyncFileDetachable* self) { + wait(success(g_simulator.getCurrentProcess()->shutdownSignal.getFuture())); self->file = Reference(); return Void(); } - - ACTOR static Future> open( Future> wrappedFile ) { + + ACTOR static Future> open(Future> wrappedFile) { choose { - when( wait(success( g_simulator.getCurrentProcess()->shutdownSignal.getFuture() )) ) { + when(wait(success(g_simulator.getCurrentProcess()->shutdownSignal.getFuture()))) { throw io_error().asInjectedFault(); } - when( Reference f = wait( wrappedFile ) ) { - return Reference( new AsyncFileDetachable(f) ); + when(Reference f = wait(wrappedFile)) { + return Reference(new AsyncFileDetachable(f)); } } } - virtual void addref() { - ReferenceCounted::addref(); - } - virtual void delref() { - ReferenceCounted::delref(); + virtual void addref() { ReferenceCounted::addref(); } + virtual void delref() { ReferenceCounted::delref(); } + + Future read(void* data, int length, int64_t offset) { + if (!file.getPtr() || g_simulator.getCurrentProcess()->shutdownSignal.getFuture().isReady()) + return io_error().asInjectedFault(); + return sendErrorOnShutdown(file->read(data, length, offset)); } - Future read(void *data, int length, int64_t offset) { - if( !file.getPtr() || g_simulator.getCurrentProcess()->shutdownSignal.getFuture().isReady() ) + Future write(void const* data, int length, int64_t offset) { + if (!file.getPtr() || g_simulator.getCurrentProcess()->shutdownSignal.getFuture().isReady()) return io_error().asInjectedFault(); - return sendErrorOnShutdown( file->read( data, length, offset ) ); + return sendErrorOnShutdown(file->write(data, length, offset)); } - Future write(void const *data, int length, int64_t offset) { - if( !file.getPtr() || g_simulator.getCurrentProcess()->shutdownSignal.getFuture().isReady() ) - return io_error().asInjectedFault(); - return sendErrorOnShutdown( file->write( data, length, offset ) ); - } - Future truncate(int64_t size) { - if( !file.getPtr() || g_simulator.getCurrentProcess()->shutdownSignal.getFuture().isReady() ) + if (!file.getPtr() || 
g_simulator.getCurrentProcess()->shutdownSignal.getFuture().isReady()) return io_error().asInjectedFault(); - return sendErrorOnShutdown( file->truncate( size ) ); + return sendErrorOnShutdown(file->truncate(size)); } Future sync() { - if( !file.getPtr() || g_simulator.getCurrentProcess()->shutdownSignal.getFuture().isReady() ) + if (!file.getPtr() || g_simulator.getCurrentProcess()->shutdownSignal.getFuture().isReady()) return io_error().asInjectedFault(); - return sendErrorOnShutdown( file->sync() ); + return sendErrorOnShutdown(file->sync()); } Future size() { - if( !file.getPtr() || g_simulator.getCurrentProcess()->shutdownSignal.getFuture().isReady() ) + if (!file.getPtr() || g_simulator.getCurrentProcess()->shutdownSignal.getFuture().isReady()) return io_error().asInjectedFault(); - return sendErrorOnShutdown( file->size() ); + return sendErrorOnShutdown(file->size()); } int64_t debugFD() { - if( !file.getPtr() ) + if (!file.getPtr()) throw io_error().asInjectedFault(); return file->debugFD(); } std::string getFilename() { - if( !file.getPtr() ) + if (!file.getPtr()) throw io_error().asInjectedFault(); return file->getFilename(); } }; -//An async file implementation which wraps another async file and will randomly destroy sectors that it is writing when killed -//This is used to simulate a power failure which prevents all written data from being persisted to disk -class AsyncFileNonDurable sealed : public IAsyncFile, public ReferenceCounted{ +// An async file implementation which wraps another async file and will randomly destroy sectors that it is writing when +// killed This is used to simulate a power failure which prevents all written data from being persisted to disk +class AsyncFileNonDurable sealed : public IAsyncFile, public ReferenceCounted { public: UID id; std::string filename; - //An approximation of the size of the file; .size() should be used instead of this variable in most cases + // An approximation of the size of the file; .size() should be used instead of this variable in most cases int64_t approximateSize; - //The address of the machine that opened the file + // The address of the machine that opened the file NetworkAddress openedAddress; bool aio; private: - //The wrapped IAsyncFile + // The wrapped IAsyncFile Reference file; - //The maximum amount of time a write is delayed before being passed along to the underlying file + // The maximum amount of time a write is delayed before being passed along to the underlying file double maxWriteDelay; - //Modifications which haven't been pushed to file, mapped by the location in the file that is being modified - RangeMap< uint64_t, Future > pendingModifications; + // Modifications which haven't been pushed to file, mapped by the location in the file that is being modified + RangeMap> pendingModifications; - //Will be blocked whenever kill is running + // Will be blocked whenever kill is running Promise killed; Promise killComplete; - //Used by sync (and kill) to force writes which have not yet been passed along. - //If true is sent, then writes will be durable. If false, then they may not be durable. + // Used by sync (and kill) to force writes which have not yet been passed along. + // If true is sent, then writes will be durable. If false, then they may not be durable. 
Promise startSyncPromise; - //The performance parameters of the simulated disk + // The performance parameters of the simulated disk Reference diskParameters; - //Set to true the first time sync is called on the file + // Set to true the first time sync is called on the file bool hasBeenSynced; - //Used to describe what corruption is allowed by the file as well as the type of corruption being used on a particular page + // Used to describe what corruption is allowed by the file as well as the type of corruption being used on a + // particular page enum KillMode { NO_CORRUPTION = 0, DROP_ONLY = 1, FULL_CORRUPTION = 2 }; - //Limits what types of corruption are applied to writes from this file + // Limits what types of corruption are applied to writes from this file KillMode killMode; - ActorCollection reponses; //cannot call getResult on this actor collection, since the actors will be on different processes + ActorCollection + reponses; // cannot call getResult on this actor collection, since the actors will be on different processes - AsyncFileNonDurable(const std::string& filename, Reference file, Reference diskParameters, NetworkAddress openedAddress, bool aio) - : openedAddress(openedAddress), pendingModifications(uint64_t(-1)), approximateSize(0), reponses(false), aio(aio) { + AsyncFileNonDurable(const std::string& filename, + Reference file, + Reference diskParameters, + NetworkAddress openedAddress, + bool aio) + : openedAddress(openedAddress), pendingModifications(uint64_t(-1)), approximateSize(0), reponses(false), + aio(aio) { - //This is only designed to work in simulation + // This is only designed to work in simulation ASSERT(g_network->isSimulated()); this->id = deterministicRandom()->randomUniqueID(); @@ -197,52 +200,59 @@ private: public: static std::map> filesBeingDeleted; - //Creates a new AsyncFileNonDurable which wraps the provided IAsyncFile - ACTOR static Future> open(std::string filename, std::string actualFilename, Future> wrappedFile, Reference diskParameters, bool aio) { + // Creates a new AsyncFileNonDurable which wraps the provided IAsyncFile + ACTOR static Future> open(std::string filename, + std::string actualFilename, + Future> wrappedFile, + Reference diskParameters, + bool aio) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); state TaskPriority currentTaskID = g_network->getCurrentTask(); state Future shutdown = success(currentProcess->shutdownSignal.getFuture()); //TraceEvent("AsyncFileNonDurableOpenBegin").detail("Filename", filename).detail("Addr", g_simulator.getCurrentProcess()->address); - wait( g_simulator.onMachine( currentProcess ) ); + wait(g_simulator.onMachine(currentProcess)); try { wait(success(wrappedFile) || shutdown); - if(shutdown.isReady()) + if (shutdown.isReady()) throw io_error().asInjectedFault(); state Reference file = wrappedFile.get(); - //If we are in the process of deleting a file, we can't let someone else modify it at the same time. We therefore block the creation of new files until deletion is complete + // If we are in the process of deleting a file, we can't let someone else modify it at the same time. 
We + // therefore block the creation of new files until deletion is complete state std::map>::iterator deletedFile = filesBeingDeleted.find(filename); - if(deletedFile != filesBeingDeleted.end()) { + if (deletedFile != filesBeingDeleted.end()) { //TraceEvent("AsyncFileNonDurableOpenWaitOnDelete1").detail("Filename", filename); - wait( deletedFile->second || shutdown ); + wait(deletedFile->second || shutdown); //TraceEvent("AsyncFileNonDurableOpenWaitOnDelete2").detail("Filename", filename); - if(shutdown.isReady()) + if (shutdown.isReady()) throw io_error().asInjectedFault(); } - state Reference nonDurableFile( new AsyncFileNonDurable(filename, file, diskParameters, currentProcess->address, aio) ); + state Reference nonDurableFile( + new AsyncFileNonDurable(filename, file, diskParameters, currentProcess->address, aio)); - //Causes the approximateSize member to be set + // Causes the approximateSize member to be set state Future sizeFuture = nonDurableFile->size(); wait(success(sizeFuture) || shutdown); - if(shutdown.isReady()) + if (shutdown.isReady()) throw io_error().asInjectedFault(); //TraceEvent("AsyncFileNonDurableOpenComplete").detail("Filename", filename); - wait( g_simulator.onProcess( currentProcess, currentTaskID ) ); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); return nonDurableFile; - } catch( Error &e ) { + } catch (Error& e) { state Error err = e; - std::string currentFilename = ( wrappedFile.isReady() && !wrappedFile.isError() ) ? wrappedFile.get()->getFilename() : actualFilename; - currentProcess->machine->openFiles.erase( currentFilename ); + std::string currentFilename = + (wrappedFile.isReady() && !wrappedFile.isError()) ? wrappedFile.get()->getFilename() : actualFilename; + currentProcess->machine->openFiles.erase(currentFilename); //TraceEvent("AsyncFileNonDurableOpenError").error(e, true).detail("Filename", filename).detail("Address", currentProcess->address).detail("Addr", g_simulator.getCurrentProcess()->address); - wait( g_simulator.onProcess( currentProcess, currentTaskID ) ); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); throw err; } } @@ -251,29 +261,26 @@ public: //TraceEvent("AsyncFileNonDurable_Destroy", id).detail("Filename", filename); } - virtual void addref() { - ReferenceCounted::addref(); - } - virtual void delref() { - if(delref_no_destroy()) { + virtual void addref() { ReferenceCounted::addref(); } + virtual void delref() { + if (delref_no_destroy()) { ASSERT(filesBeingDeleted.count(filename) == 0); //TraceEvent("AsyncFileNonDurable_StartDelete", id).detail("Filename", filename); Future deleteFuture = deleteFile(this); - if(!deleteFuture.isReady()) + if (!deleteFuture.isReady()) filesBeingDeleted[filename] = deleteFuture; } } - //Passes along reads straight to the underlying file, waiting for any outstanding changes that could affect the results - Future read(void *data, int length, int64_t offset) { - return read(this, data, length, offset); - } + // Passes along reads straight to the underlying file, waiting for any outstanding changes that could affect the + // results + Future read(void* data, int length, int64_t offset) { return read(this, data, length, offset); } - //Writes data to the file. Writes are delayed a random amount of time before being - //passed to the underlying file - Future write(void const *data, int length, int64_t offset) { + // Writes data to the file. 
Writes are delayed a random amount of time before being + // passed to the underlying file + Future write(void const* data, int length, int64_t offset) { //TraceEvent("AsyncFileNonDurable_Write", id).detail("Filename", filename).detail("Offset", offset).detail("Length", length); - if(length == 0) { + if (length == 0) { TraceEvent(SevWarnAlways, "AsyncFileNonDurable_EmptyModification", id).detail("Filename", filename); return Void(); } @@ -285,9 +292,9 @@ public: writeEnded.send(write(this, writeStarted, writeEnded.getFuture(), data, length, offset)); return writeStarted.getFuture(); } - - //Truncates the file. Truncates are delayed a random amount of time before being - //passed to the underlying file + + // Truncates the file. Truncates are delayed a random amount of time before being + // passed to the underlying file Future truncate(int64_t size) { //TraceEvent("AsyncFileNonDurable_Truncate", id).detail("Filename", filename).detail("Offset", size); debugFileTruncate("AsyncFileNonDurableTruncate", filename, size); @@ -298,40 +305,33 @@ public: return truncateStarted.getFuture(); } - //Fsyncs the file. This allows all delayed modifications to the file to complete before - //syncing the underlying file + // Fsyncs the file. This allows all delayed modifications to the file to complete before + // syncing the underlying file Future sync() { //TraceEvent("AsyncFileNonDurable_Sync", id).detail("Filename", filename); Future syncFuture = sync(this, true); - reponses.add( syncFuture ); + reponses.add(syncFuture); return syncFuture; } - //Passes along size requests to the underlying file, augmenting with any writes past the end of the file - Future size() { - return size(this); - } + // Passes along size requests to the underlying file, augmenting with any writes past the end of the file + Future size() { return size(this); } - int64_t debugFD() { - return file->debugFD(); - } + int64_t debugFD() { return file->debugFD(); } - std::string getFilename() { - return file->getFilename(); - } + std::string getFilename() { return file->getFilename(); } - //Forces a non-durable sync (some writes are not made or made incorrectly) - //This is used when the file should 'die' without first completing its operations + // Forces a non-durable sync (some writes are not made or made incorrectly) + // This is used when the file should 'die' without first completing its operations //(e.g. to simulate power failure) Future kill() { TraceEvent("AsyncFileNonDurable_Kill", id).detail("Filename", filename); - TEST(true); //AsyncFileNonDurable was killed + TEST(true); // AsyncFileNonDurable was killed return sync(this, false); } private: - - //Returns a future that is used to ensure the waiter ends up on the main thread + // Returns a future that is used to ensure the waiter ends up on the main thread Future returnToMainThread() { Promise p; Future f = p.getFuture(); @@ -339,32 +339,37 @@ private: return f; } - //Gets existing modifications that overlap the specified range. Optionally inserts a new modification into the map - std::vector> getModificationsAndInsert(int64_t offset, int64_t length, bool insertModification = false, Future value = Void()) { - auto modification = RangeMapRange(offset, length>=0 ? offset+length : uint64_t(-1)); + // Gets existing modifications that overlap the specified range. 
Optionally inserts a new modification into the map + std::vector> getModificationsAndInsert(int64_t offset, + int64_t length, + bool insertModification = false, + Future value = Void()) { + auto modification = RangeMapRange(offset, length >= 0 ? offset + length : uint64_t(-1)); auto priorModifications = pendingModifications.intersectingRanges(modification); - //Aggregate existing modifications in this range + // Aggregate existing modifications in this range std::vector> modificationFutures; - for(auto itr = priorModifications.begin(); itr != priorModifications.end(); ++itr) { - if(itr.value().isValid() && (!itr.value().isReady() || itr.value().isError())) { + for (auto itr = priorModifications.begin(); itr != priorModifications.end(); ++itr) { + if (itr.value().isValid() && (!itr.value().isReady() || itr.value().isError())) { modificationFutures.push_back(itr.value()); } } - //Add the modification if we are doing a write or truncate - if(insertModification) + // Add the modification if we are doing a write or truncate + if (insertModification) pendingModifications.insert(modification, value); return modificationFutures; } - //Checks if the file is killed. If so, then the current sync is completed if running and then an error is thrown - ACTOR Future checkKilled(AsyncFileNonDurable *self, std::string context) { - if(self->killed.isSet()) { + // Checks if the file is killed. If so, then the current sync is completed if running and then an error is thrown + ACTOR Future checkKilled(AsyncFileNonDurable* self, std::string context) { + if (self->killed.isSet()) { //TraceEvent("AsyncFileNonDurable_KilledInCheck", self->id).detail("In", context).detail("Filename", self->filename); wait(self->killComplete.getFuture()); - TraceEvent("AsyncFileNonDurable_KilledFileOperation", self->id).detail("In", context).detail("Filename", self->filename); + TraceEvent("AsyncFileNonDurable_KilledFileOperation", self->id) + .detail("In", context) + .detail("Filename", self->filename); TEST(true); // AsyncFileNonDurable operation killed throw io_error().asInjectedFault(); } @@ -372,50 +377,56 @@ private: return Void(); } - //Passes along reads straight to the underlying file, waiting for any outstanding changes that could affect the results - ACTOR Future onRead(AsyncFileNonDurable *self, void *data, int length, int64_t offset) { + // Passes along reads straight to the underlying file, waiting for any outstanding changes that could affect the + // results + ACTOR Future onRead(AsyncFileNonDurable* self, void* data, int length, int64_t offset) { wait(self->checkKilled(self, "Read")); vector> priorModifications = self->getModificationsAndInsert(offset, length); wait(waitForAll(priorModifications)); state Future readFuture = self->file->read(data, length, offset); - wait( success( readFuture ) || self->killed.getFuture() ); + wait(success(readFuture) || self->killed.getFuture()); // throws if we were killed wait(self->checkKilled(self, "ReadEnd")); debugFileCheck("AsyncFileNonDurableRead", self->filename, data, offset, length); - //if(g_simulator.getCurrentProcess()->rebooting) - //TraceEvent("AsyncFileNonDurable_ReadEnd", self->id).detail("Filename", self->filename); + // if(g_simulator.getCurrentProcess()->rebooting) + //TraceEvent("AsyncFileNonDurable_ReadEnd", self->id).detail("Filename", self->filename); return readFuture.get(); } - ACTOR Future read(AsyncFileNonDurable *self, void *data, int length, int64_t offset) { + ACTOR Future read(AsyncFileNonDurable* self, void* data, int length, int64_t offset) { state 
ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); state TaskPriority currentTaskID = g_network->getCurrentTask(); - wait( g_simulator.onMachine( currentProcess ) ); + wait(g_simulator.onMachine(currentProcess)); try { - state int rep = wait( self->onRead( self, data, length, offset ) ); - wait( g_simulator.onProcess( currentProcess, currentTaskID ) ); + state int rep = wait(self->onRead(self, data, length, offset)); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); return rep; - } catch( Error &e ) { + } catch (Error& e) { state Error err = e; - wait( g_simulator.onProcess( currentProcess, currentTaskID ) ); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); throw err; } } - //Delays writes a random amount of time before passing them through to the underlying file. - //If a kill interrupts the delay, then the output could be the correct write, part of the write, - //or none of the write. It may also corrupt parts of sectors which have not been written correctly - ACTOR Future write(AsyncFileNonDurable *self, Promise writeStarted, Future> ownFuture, void const* data, int length, int64_t offset) { + // Delays writes a random amount of time before passing them through to the underlying file. + // If a kill interrupts the delay, then the output could be the correct write, part of the write, + // or none of the write. It may also corrupt parts of sectors which have not been written correctly + ACTOR Future write(AsyncFileNonDurable* self, + Promise writeStarted, + Future> ownFuture, + void const* data, + int length, + int64_t offset) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); state TaskPriority currentTaskID = g_network->getCurrentTask(); - wait( g_simulator.onMachine( currentProcess ) ); - + wait(g_simulator.onMachine(currentProcess)); + state double delayDuration = deterministicRandom()->random01() * self->maxWriteDelay; state Standalone dataCopy(StringRef((uint8_t*)data, length)); @@ -426,33 +437,34 @@ private: wait(self->checkKilled(self, "Write")); Future writeEnded = wait(ownFuture); - std::vector> priorModifications = self->getModificationsAndInsert(offset, length, true, writeEnded); + std::vector> priorModifications = + self->getModificationsAndInsert(offset, length, true, writeEnded); - if(BUGGIFY_WITH_PROB(0.001)) - priorModifications.push_back(delay(deterministicRandom()->random01() * FLOW_KNOBS->MAX_PRIOR_MODIFICATION_DELAY) || self->killed.getFuture()); + if (BUGGIFY_WITH_PROB(0.001)) + priorModifications.push_back( + delay(deterministicRandom()->random01() * FLOW_KNOBS->MAX_PRIOR_MODIFICATION_DELAY) || + self->killed.getFuture()); else - priorModifications.push_back(waitUntilDiskReady(self->diskParameters, length) || self->killed.getFuture()); + priorModifications.push_back(waitUntilDiskReady(self->diskParameters, length) || + self->killed.getFuture()); wait(waitForAll(priorModifications)); self->approximateSize = std::max(self->approximateSize, length + offset); - self->reponses.add( sendOnProcess( currentProcess, writeStarted, currentTaskID ) ); - } - catch(Error &e) { - self->reponses.add( sendErrorOnProcess( currentProcess, writeStarted, e, currentTaskID ) ); + self->reponses.add(sendOnProcess(currentProcess, writeStarted, currentTaskID)); + } catch (Error& e) { + self->reponses.add(sendErrorOnProcess(currentProcess, writeStarted, e, currentTaskID)); throw; } //TraceEvent("AsyncFileNonDurable_WriteDoneWithPreviousMods", self->id).detail("Delay", delayDuration).detail("Filename", 
self->filename).detail("WriteLength", length).detail("Offset", offset); - //Wait a random amount of time or until a sync/kill is issued + // Wait a random amount of time or until a sync/kill is issued state bool saveDurable = true; choose { - when(wait(delay(delayDuration))) { } - when(bool durable = wait(startSyncFuture)) { - saveDurable = durable; - } + when(wait(delay(delayDuration))) {} + when(bool durable = wait(startSyncFuture)) { saveDurable = durable; } } debugFileCheck("AsyncFileNonDurableWriteAfterWait", self->filename, dataCopy.begin(), offset, length); @@ -460,48 +472,56 @@ private: // In AIO mode, only page-aligned writes are supported ASSERT(!self->aio || (offset % 4096 == 0 && length % 4096 == 0)); - //Non-durable writes should introduce errors at the page level and corrupt at the sector level - //Otherwise, we can perform the entire write at once + // Non-durable writes should introduce errors at the page level and corrupt at the sector level + // Otherwise, we can perform the entire write at once int diskPageLength = saveDurable ? length : 4096; int diskSectorLength = saveDurable ? length : 512; vector> writeFutures; - for(int writeOffset = 0; writeOffset < length; ) { + for (int writeOffset = 0; writeOffset < length;) { // Number of bytes until the next diskPageLength file offset within the write or the end of the write. int pageLength = diskPageLength; - if(!self->aio && !saveDurable) { + if (!self->aio && !saveDurable) { // If not in AIO mode, and the save is not durable, then we can't perform the entire write all at once // and the first and last pages touched by the write could be partial. - pageLength = std::min((int64_t)length - writeOffset, diskPageLength - ((offset + writeOffset) % diskPageLength)); + pageLength = std::min((int64_t)length - writeOffset, + diskPageLength - ((offset + writeOffset) % diskPageLength)); } - //choose a random action to perform on this page write (write correctly, corrupt, or don't write) + // choose a random action to perform on this page write (write correctly, corrupt, or don't write) KillMode pageKillMode = (KillMode)deterministicRandom()->randomInt(0, self->killMode + 1); - - for(int pageOffset = 0; pageOffset < pageLength; ) { + + for (int pageOffset = 0; pageOffset < pageLength;) { // Number of bytes until the next diskSectorLength file offset within the write or the end of the write. int sectorLength = diskSectorLength; - if(!self->aio && !saveDurable) { - // If not in AIO mode, and the save is not durable, then we can't perform the entire write all at once - // and the first and last sectors touched by the write could be partial. - sectorLength = std::min((int64_t)length - (writeOffset + pageOffset), diskSectorLength - ((offset + writeOffset + pageOffset) % diskSectorLength)); + if (!self->aio && !saveDurable) { + // If not in AIO mode, and the save is not durable, then we can't perform the entire write all at + // once and the first and last sectors touched by the write could be partial. + sectorLength = + std::min((int64_t)length - (writeOffset + pageOffset), + diskSectorLength - ((offset + writeOffset + pageOffset) % diskSectorLength)); } - //If saving durable, then perform the write correctly. Otherwise, perform the write correcly with a probability of 1/3. 
- //If corrupting the write, then this sector will be written correctly with a 1/4 chance - if(saveDurable || pageKillMode == NO_CORRUPTION || (pageKillMode == FULL_CORRUPTION && deterministicRandom()->random01() < 0.25)) { - //if (!saveDurable) TraceEvent(SevInfo, "AsyncFileNonDurableWrite", self->id).detail("Filename", self->filename).detail("Offset", offset+writeOffset+pageOffset).detail("Length", sectorLength); - writeFutures.push_back(self->file->write(dataCopy.begin() + writeOffset + pageOffset, sectorLength, offset + writeOffset + pageOffset)); + // If saving durable, then perform the write correctly. Otherwise, perform the write correcly with a + // probability of 1/3. If corrupting the write, then this sector will be written correctly with a 1/4 + // chance + if (saveDurable || pageKillMode == NO_CORRUPTION || + (pageKillMode == FULL_CORRUPTION && deterministicRandom()->random01() < 0.25)) { + // if (!saveDurable) TraceEvent(SevInfo, "AsyncFileNonDurableWrite", self->id).detail("Filename", + // self->filename).detail("Offset", offset+writeOffset+pageOffset).detail("Length", sectorLength); + writeFutures.push_back(self->file->write( + dataCopy.begin() + writeOffset + pageOffset, sectorLength, offset + writeOffset + pageOffset)); } - //If the write is not durable, then the write will either be corrupted or not written at all. If corrupted, there is 1/4 chance that a given - //sector will not be written - else if(pageKillMode == FULL_CORRUPTION && deterministicRandom()->random01() < 0.66667) { - //The incorrect part of the write can be the rightmost bytes (side = 0), the leftmost bytes (side = 1), or the entire write (side = 2) + // If the write is not durable, then the write will either be corrupted or not written at all. If + // corrupted, there is 1/4 chance that a given sector will not be written + else if (pageKillMode == FULL_CORRUPTION && deterministicRandom()->random01() < 0.66667) { + // The incorrect part of the write can be the rightmost bytes (side = 0), the leftmost bytes (side = + // 1), or the entire write (side = 2) int side = deterministicRandom()->randomInt(0, 3); - //There is a 1/2 chance that a bad write will have garbage written into its bad portion - //The chance is increased to 1 if the entire write is bad + // There is a 1/2 chance that a bad write will have garbage written into its bad portion + // The chance is increased to 1 if the entire write is bad bool garbage = side == 2 || deterministicRandom()->random01() < 0.5; int64_t goodStart = 0; @@ -509,37 +529,52 @@ private: int64_t badStart = 0; int64_t badEnd = sectorLength; - if(side == 0) { + if (side == 0) { goodEnd = deterministicRandom()->randomInt(0, sectorLength); badStart = goodEnd; - } - else if(side == 1) { + } else if (side == 1) { badEnd = deterministicRandom()->randomInt(0, sectorLength); goodStart = badEnd; - } - else + } else goodEnd = 0; - //Write randomly generated bytes, if required - if(garbage && badStart != badEnd) { - uint8_t *badData = const_cast(&dataCopy.begin()[badStart + writeOffset + pageOffset]); - for(int i = 0; i < badEnd - badStart; i += sizeof(uint32_t)) { + // Write randomly generated bytes, if required + if (garbage && badStart != badEnd) { + uint8_t* badData = const_cast(&dataCopy.begin()[badStart + writeOffset + pageOffset]); + for (int i = 0; i < badEnd - badStart; i += sizeof(uint32_t)) { uint32_t val = deterministicRandom()->randomUInt32(); memcpy(&badData[i], &val, std::min(badEnd - badStart - i, (int64_t)sizeof(uint32_t))); } - 
writeFutures.push_back(self->file->write(dataCopy.begin() + writeOffset + pageOffset, sectorLength, offset + writeOffset + pageOffset)); - debugFileSet("AsyncFileNonDurableBadWrite", self->filename, dataCopy.begin() + writeOffset + pageOffset, offset + writeOffset + pageOffset, sectorLength); - } - else if(goodStart != goodEnd) - writeFutures.push_back(self->file->write(dataCopy.begin() + goodStart + writeOffset + pageOffset, goodEnd - goodStart, goodStart + offset + writeOffset + pageOffset)); + writeFutures.push_back(self->file->write(dataCopy.begin() + writeOffset + pageOffset, + sectorLength, + offset + writeOffset + pageOffset)); + debugFileSet("AsyncFileNonDurableBadWrite", + self->filename, + dataCopy.begin() + writeOffset + pageOffset, + offset + writeOffset + pageOffset, + sectorLength); + } else if (goodStart != goodEnd) + writeFutures.push_back( + self->file->write(dataCopy.begin() + goodStart + writeOffset + pageOffset, + goodEnd - goodStart, + goodStart + offset + writeOffset + pageOffset)); - TraceEvent("AsyncFileNonDurable_BadWrite", self->id).detail("Offset", offset + writeOffset + pageOffset).detail("Length", sectorLength).detail("GoodStart", goodStart).detail("GoodEnd", goodEnd).detail("HasGarbage", garbage).detail("Side", side).detail("Filename", self->filename); - TEST(true); //AsyncFileNonDurable bad write - } - else { - TraceEvent("AsyncFileNonDurable_DroppedWrite", self->id).detail("Offset", offset + writeOffset + pageOffset).detail("Length", sectorLength).detail("Filename", self->filename); - TEST(true); //AsyncFileNonDurable dropped write + TraceEvent("AsyncFileNonDurable_BadWrite", self->id) + .detail("Offset", offset + writeOffset + pageOffset) + .detail("Length", sectorLength) + .detail("GoodStart", goodStart) + .detail("GoodEnd", goodEnd) + .detail("HasGarbage", garbage) + .detail("Side", side) + .detail("Filename", self->filename); + TEST(true); // AsyncFileNonDurable bad write + } else { + TraceEvent("AsyncFileNonDurable_DroppedWrite", self->id) + .detail("Offset", offset + writeOffset + pageOffset) + .detail("Length", sectorLength) + .detail("Filename", self->filename); + TEST(true); // AsyncFileNonDurable dropped write } pageOffset += sectorLength; @@ -547,19 +582,22 @@ private: writeOffset += pageLength; } - + wait(waitForAll(writeFutures)); //TraceEvent("AsyncFileNonDurable_WriteDone", self->id).detail("Delay", delayDuration).detail("Filename", self->filename).detail("WriteLength", length).detail("Offset", offset); return Void(); } - //Delays truncates a random amount of time before passing them through to the underlying file. - //If a kill interrupts the delay, then the truncate may or may not be performed - ACTOR Future truncate(AsyncFileNonDurable *self, Promise truncateStarted, Future> ownFuture, int64_t size) { + // Delays truncates a random amount of time before passing them through to the underlying file. 
+ // If a kill interrupts the delay, then the truncate may or may not be performed + ACTOR Future truncate(AsyncFileNonDurable* self, + Promise truncateStarted, + Future> ownFuture, + int64_t size) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); state TaskPriority currentTaskID = g_network->getCurrentTask(); - wait( g_simulator.onMachine( currentProcess ) ); - + wait(g_simulator.onMachine(currentProcess)); + state double delayDuration = deterministicRandom()->random01() * self->maxWriteDelay; state Future startSyncFuture = self->startSyncPromise.getFuture(); @@ -568,10 +606,13 @@ private: wait(self->checkKilled(self, "Truncate")); Future truncateEnded = wait(ownFuture); - std::vector> priorModifications = self->getModificationsAndInsert(size, -1, true, truncateEnded); + std::vector> priorModifications = + self->getModificationsAndInsert(size, -1, true, truncateEnded); - if(BUGGIFY_WITH_PROB(0.001)) - priorModifications.push_back(delay(deterministicRandom()->random01() * FLOW_KNOBS->MAX_PRIOR_MODIFICATION_DELAY) || self->killed.getFuture()); + if (BUGGIFY_WITH_PROB(0.001)) + priorModifications.push_back( + delay(deterministicRandom()->random01() * FLOW_KNOBS->MAX_PRIOR_MODIFICATION_DELAY) || + self->killed.getFuture()); else priorModifications.push_back(waitUntilDiskReady(self->diskParameters, 0) || self->killed.getFuture()); @@ -579,101 +620,103 @@ private: self->approximateSize = size; - self->reponses.add( sendOnProcess( currentProcess, truncateStarted, currentTaskID ) ); - } - catch(Error &e) { - self->reponses.add( sendErrorOnProcess( currentProcess, truncateStarted, e, currentTaskID ) ); + self->reponses.add(sendOnProcess(currentProcess, truncateStarted, currentTaskID)); + } catch (Error& e) { + self->reponses.add(sendErrorOnProcess(currentProcess, truncateStarted, e, currentTaskID)); throw; } - //Wait a random amount of time or until a sync/kill is issued + // Wait a random amount of time or until a sync/kill is issued state bool saveDurable = true; choose { - when(wait(delay(delayDuration))) { } - when(bool durable = wait(startSyncFuture)) { - saveDurable = durable; - } + when(wait(delay(delayDuration))) {} + when(bool durable = wait(startSyncFuture)) { saveDurable = durable; } } - if(g_network->check_yield(TaskPriority::DefaultYield)) { + if (g_network->check_yield(TaskPriority::DefaultYield)) { wait(delay(0, TaskPriority::DefaultYield)); } - //If performing a durable truncate, then pass it through to the file. Otherwise, pass it through with a 1/2 chance - if(saveDurable || self->killMode == NO_CORRUPTION || deterministicRandom()->random01() < 0.5) + // If performing a durable truncate, then pass it through to the file. 
Otherwise, pass it through with a 1/2 + // chance + if (saveDurable || self->killMode == NO_CORRUPTION || deterministicRandom()->random01() < 0.5) wait(self->file->truncate(size)); else { TraceEvent("AsyncFileNonDurable_DroppedTruncate", self->id).detail("Size", size); - TEST(true); //AsyncFileNonDurable dropped truncate + TEST(true); // AsyncFileNonDurable dropped truncate } return Void(); } - //Waits for delayed modifications to the file to complete and then syncs the underlying file - //If durable is false, then some of the delayed modifications will not be applied or will be - //applied incorrectly - ACTOR Future onSync(AsyncFileNonDurable *self, bool durable) { + // Waits for delayed modifications to the file to complete and then syncs the underlying file + // If durable is false, then some of the delayed modifications will not be applied or will be + // applied incorrectly + ACTOR Future onSync(AsyncFileNonDurable* self, bool durable) { //TraceEvent("AsyncFileNonDurable_ImplSync", self->id).detail("Filename", self->filename).detail("Durable", durable); ASSERT(durable || !self->killed.isSet()); // this file is kill()ed only once - if(durable) { + if (durable) { self->hasBeenSynced = true; wait(waitUntilDiskReady(self->diskParameters, 0, true) || self->killed.getFuture()); } wait(self->checkKilled(self, durable ? "Sync" : "Kill")); - - if(!durable) - self->killed.send( Void() ); - //Get all outstanding modifications + if (!durable) + self->killed.send(Void()); + + // Get all outstanding modifications std::vector> outstandingModifications; std::vector> stillPendingModifications; auto rangeItr = self->pendingModifications.ranges(); - for(auto itr = rangeItr.begin(); itr != rangeItr.end(); ++itr) { - if(itr.value().isValid() && (!itr->value().isReady() || itr->value().isError())) { + for (auto itr = rangeItr.begin(); itr != rangeItr.end(); ++itr) { + if (itr.value().isValid() && (!itr->value().isReady() || itr->value().isError())) { outstandingModifications.push_back(itr->value()); - if(!itr.value().isReady()) + if (!itr.value().isReady()) stillPendingModifications.push_back(itr->range()); } } Future allModifications = waitForAll(outstandingModifications); - //Clear out the pending modifications map of all completed modifications + // Clear out the pending modifications map of all completed modifications self->pendingModifications.insert(RangeMapRange(0, -1), Void()); - for(auto itr = stillPendingModifications.begin(); itr != stillPendingModifications.end(); ++itr) - self->pendingModifications.insert(*itr, success(allModifications)); //waitForAll cannot wait on the same future more than once, so wrap the future with success + for (auto itr = stillPendingModifications.begin(); itr != stillPendingModifications.end(); ++itr) + self->pendingModifications.insert( + *itr, success(allModifications)); // waitForAll cannot wait on the same future more than once, so wrap + // the future with success - //Signal all modifications to end their delay and reset the startSyncPromise + // Signal all modifications to end their delay and reset the startSyncPromise Promise startSyncPromise = self->startSyncPromise; self->startSyncPromise = Promise(); - //Writes will be durable in a kill with a 10% probability + // Writes will be durable in a kill with a 10% probability state bool writeDurable = durable || deterministicRandom()->random01() < 0.1; startSyncPromise.send(writeDurable); - //Wait for outstanding writes to complete - if(durable) + // Wait for outstanding writes to complete + if (durable) 
wait(allModifications); else wait(success(errorOr(allModifications))); - if(!durable) { - //Sometimes sync the file if writes were made durably. Before a file is first synced, it is stored in a temporary file and then renamed to the correct - //location once sync is called. By not calling sync, we simulate a failure to fsync the directory storing the file - if(self->hasBeenSynced && writeDurable && deterministicRandom()->random01() < 0.5) { - TEST(true); //AsyncFileNonDurable kill was durable and synced + if (!durable) { + // Sometimes sync the file if writes were made durably. Before a file is first synced, it is stored in a + // temporary file and then renamed to the correct location once sync is called. By not calling sync, we + // simulate a failure to fsync the directory storing the file + if (self->hasBeenSynced && writeDurable && deterministicRandom()->random01() < 0.5) { + TEST(true); // AsyncFileNonDurable kill was durable and synced wait(success(errorOr(self->file->sync()))); } - //Setting this promise could trigger the deletion of the AsyncFileNonDurable; after this none of its members should be used + // Setting this promise could trigger the deletion of the AsyncFileNonDurable; after this none of its + // members should be used //TraceEvent("AsyncFileNonDurable_ImplSyncEnd", self->id).detail("Filename", self->filename).detail("Durable", durable); self->killComplete.send(Void()); } - //A killed file cannot be allowed to report that it successfully synced + // A killed file cannot be allowed to report that it successfully synced else { wait(self->checkKilled(self, "SyncEnd")); wait(self->file->sync()); @@ -683,94 +726,96 @@ private: return Void(); } - ACTOR Future sync(AsyncFileNonDurable *self, bool durable) { + ACTOR Future sync(AsyncFileNonDurable* self, bool durable) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); state TaskPriority currentTaskID = g_network->getCurrentTask(); - wait( g_simulator.onMachine( currentProcess ) ); + wait(g_simulator.onMachine(currentProcess)); try { - wait( self->onSync( self, durable ) ); - wait( g_simulator.onProcess( currentProcess, currentTaskID ) ); + wait(self->onSync(self, durable)); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); return Void(); - } catch( Error &e ) { + } catch (Error& e) { state Error err = e; - wait( g_simulator.onProcess( currentProcess, currentTaskID ) ); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); throw err; } } - //Passes along size requests to the underlying file, augmenting with any writes past the end of the file - ACTOR Future onSize(AsyncFileNonDurable *self) { + // Passes along size requests to the underlying file, augmenting with any writes past the end of the file + ACTOR Future onSize(AsyncFileNonDurable* self) { //TraceEvent("AsyncFileNonDurable_Size", self->id).detail("Filename", self->filename); wait(self->checkKilled(self, "Size")); state Future sizeFuture = self->file->size(); - wait( success( sizeFuture ) || self->killed.getFuture() ); + wait(success(sizeFuture) || self->killed.getFuture()); wait(self->checkKilled(self, "SizeEnd")); - //Include any modifications which extend past the end of the file + // Include any modifications which extend past the end of the file uint64_t maxModification = self->pendingModifications.lastItem().begin(); self->approximateSize = std::max(sizeFuture.get(), maxModification); return self->approximateSize; } - ACTOR Future size(AsyncFileNonDurable *self) { + ACTOR Future size(AsyncFileNonDurable* self) { 
state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); state TaskPriority currentTaskID = g_network->getCurrentTask(); - wait( g_simulator.onMachine( currentProcess ) ); + wait(g_simulator.onMachine(currentProcess)); try { - state int64_t rep = wait( self->onSize( self ) ); - wait( g_simulator.onProcess( currentProcess, currentTaskID ) ); + state int64_t rep = wait(self->onSize(self)); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); return rep; - } catch( Error &e ) { + } catch (Error& e) { state Error err = e; - wait( g_simulator.onProcess( currentProcess, currentTaskID ) ); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); throw err; } } - //Finishes all outstanding actors on an AsyncFileNonDurable and then deletes it - ACTOR Future deleteFile(AsyncFileNonDurable *self) { + // Finishes all outstanding actors on an AsyncFileNonDurable and then deletes it + ACTOR Future deleteFile(AsyncFileNonDurable* self) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); state TaskPriority currentTaskID = g_network->getCurrentTask(); state std::string filename = self->filename; - wait( g_simulator.onMachine( currentProcess ) ); + wait(g_simulator.onMachine(currentProcess)); try { - //Make sure all writes have gone through. + // Make sure all writes have gone through. Promise startSyncPromise = self->startSyncPromise; self->startSyncPromise = Promise(); startSyncPromise.send(true); std::vector> outstandingModifications; - for(auto itr = self->pendingModifications.ranges().begin(); itr != self->pendingModifications.ranges().end(); ++itr) - if(itr->value().isValid() && !itr->value().isReady()) + for (auto itr = self->pendingModifications.ranges().begin(); + itr != self->pendingModifications.ranges().end(); + ++itr) + if (itr->value().isValid() && !itr->value().isReady()) outstandingModifications.push_back(itr->value()); - //Ignore errors here so that all modifications can finish + // Ignore errors here so that all modifications can finish wait(waitForAllReady(outstandingModifications)); - //Make sure we aren't in the process of killing the file - if(self->killed.isSet()) + // Make sure we aren't in the process of killing the file + if (self->killed.isSet()) wait(self->killComplete.getFuture()); - //Remove this file from the filesBeingDeleted map so that new files can be created with this filename - g_simulator.getMachineByNetworkAddress( self->openedAddress )->closingFiles.erase(self->getFilename()); - g_simulator.getMachineByNetworkAddress( self->openedAddress )->deletingFiles.erase(self->getFilename()); + // Remove this file from the filesBeingDeleted map so that new files can be created with this filename + g_simulator.getMachineByNetworkAddress(self->openedAddress)->closingFiles.erase(self->getFilename()); + g_simulator.getMachineByNetworkAddress(self->openedAddress)->deletingFiles.erase(self->getFilename()); AsyncFileNonDurable::filesBeingDeleted.erase(self->filename); //TraceEvent("AsyncFileNonDurable_FinishDelete", self->id).detail("Filename", self->filename); delete self; - wait( g_simulator.onProcess( currentProcess, currentTaskID ) ); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); return Void(); - } catch( Error &e ) { + } catch (Error& e) { state Error err = e; - wait( g_simulator.onProcess( currentProcess, currentTaskID ) ); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); throw err; } } diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 7365e0dcb0..3c2f37335b 100644 --- 
a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -37,24 +37,30 @@ #include "fdbrpc/Replication.h" #include "fdbrpc/ReplicationUtils.h" #include "fdbrpc/AsyncFileWriteChecker.h" -#include "flow/actorcompiler.h" // This must be the last #include. +#include "flow/actorcompiler.h" // This must be the last #include. -bool simulator_should_inject_fault( const char* context, const char* file, int line, int error_code ) { - if (!g_network->isSimulated()) return false; +bool simulator_should_inject_fault(const char* context, const char* file, int line, int error_code) { + if (!g_network->isSimulated()) + return false; auto p = g_simulator.getCurrentProcess(); - if (p->fault_injection_p2 && deterministicRandom()->random01() < p->fault_injection_p2 && !g_simulator.speedUpSimulation) { + if (p->fault_injection_p2 && deterministicRandom()->random01() < p->fault_injection_p2 && + !g_simulator.speedUpSimulation) { uint32_t h1 = line + (p->fault_injection_r >> 32); - if (h1 < p->fault_injection_p1*std::numeric_limits::max()) { - TEST(true); // A fault was injected - TEST(error_code == error_code_io_timeout); // An io timeout was injected - TEST(error_code == error_code_io_error); // An io error was injected - TEST(error_code == error_code_platform_error); // A platform error was injected. - TraceEvent(SevWarn, "FaultInjected").detail("Context", context).detail("File", file).detail("Line", line).detail("ErrorCode", error_code); - if(error_code == error_code_io_timeout) { - g_network->setGlobal(INetwork::enASIOTimedOut, (flowGlobalType)true); + if (h1 < p->fault_injection_p1 * std::numeric_limits::max()) { + TEST(true); // A fault was injected + TEST(error_code == error_code_io_timeout); // An io timeout was injected + TEST(error_code == error_code_io_error); // An io error was injected + TEST(error_code == error_code_platform_error); // A platform error was injected. + TraceEvent(SevWarn, "FaultInjected") + .detail("Context", context) + .detail("File", file) + .detail("Line", line) + .detail("ErrorCode", error_code); + if (error_code == error_code_io_timeout) { + g_network->setGlobal(INetwork::enASIOTimedOut, (flowGlobalType) true); } return true; } @@ -63,24 +69,36 @@ bool simulator_should_inject_fault( const char* context, const char* file, int l return false; } -void ISimulator::displayWorkers() const -{ +void ISimulator::displayWorkers() const { std::map> machineMap; // Create a map of machine Id for (auto processInfo : getAllProcesses()) { - std::string dataHall = processInfo->locality.dataHallId().present() ? processInfo->locality.dataHallId().get().printable() : "[unset]"; - std::string machineId = processInfo->locality.machineId().present() ? processInfo->locality.machineId().get().printable() : "[unset]"; + std::string dataHall = processInfo->locality.dataHallId().present() + ? processInfo->locality.dataHallId().get().printable() + : "[unset]"; + std::string machineId = processInfo->locality.machineId().present() + ? 
processInfo->locality.machineId().get().printable() + : "[unset]"; machineMap[format("%-8s %s", dataHall.c_str(), machineId.c_str())].push_back(processInfo); } printf("DataHall MachineId\n"); - printf(" Address Name Class Excluded Failed Rebooting Cleared Role DataFolder\n"); + printf(" Address Name Class Excluded Failed Rebooting Cleared Role " + " DataFolder\n"); for (auto& machineRecord : machineMap) { printf("\n%s\n", machineRecord.first.c_str()); for (auto& processInfo : machineRecord.second) { printf(" %9s %-10s%-13s%-8s %-6s %-9s %-8s %-48s %-40s\n", - processInfo->address.toString().c_str(), processInfo->name, processInfo->startingClass.toString().c_str(), (processInfo->isExcluded() ? "True" : "False"), (processInfo->failed ? "True" : "False"), (processInfo->rebooting ? "True" : "False"), (processInfo->isCleared() ? "True" : "False"), getRoles(processInfo->address).c_str(), processInfo->dataFolder); + processInfo->address.toString().c_str(), + processInfo->name, + processInfo->startingClass.toString().c_str(), + (processInfo->isExcluded() ? "True" : "False"), + (processInfo->failed ? "True" : "False"), + (processInfo->rebooting ? "True" : "False"), + (processInfo->isCleared() ? "True" : "False"), + getRoles(processInfo->address).c_str(), + processInfo->dataFolder); } } @@ -88,15 +106,12 @@ void ISimulator::displayWorkers() const } namespace std { -template<> +template <> class hash { public: - size_t operator()(const Endpoint &s) const - { - return hashlittle(&s, sizeof(s), 0); - } + size_t operator()(const Endpoint& s) const { return hashlittle(&s, sizeof(s), 0); } }; -} +} // namespace std bool onlyBeforeSimulatorInit() { return g_network->isSimulated() && g_simulator.getAllProcesses().empty(); @@ -107,36 +122,36 @@ const UID TOKEN_ENDPOINT_NOT_FOUND(-1, -1); int openCount = 0; struct SimClogging { - double getSendDelay( NetworkAddress from, NetworkAddress to ) { + double getSendDelay(NetworkAddress from, NetworkAddress to) { return halfLatency(); double tnow = now(); double t = tnow + halfLatency(); - if (!g_simulator.speedUpSimulation && clogSendUntil.count( to.ip )) - t = std::max( t, clogSendUntil[ to.ip ] ); + if (!g_simulator.speedUpSimulation && clogSendUntil.count(to.ip)) + t = std::max(t, clogSendUntil[to.ip]); return t - tnow; } - double getRecvDelay( NetworkAddress from, NetworkAddress to ) { - auto pair = std::make_pair( from.ip, to.ip ); + double getRecvDelay(NetworkAddress from, NetworkAddress to) { + auto pair = std::make_pair(from.ip, to.ip); double tnow = now(); double t = tnow + halfLatency(); - if(!g_simulator.speedUpSimulation) - t += clogPairLatency[ pair ]; + if (!g_simulator.speedUpSimulation) + t += clogPairLatency[pair]; - if (!g_simulator.speedUpSimulation && clogPairUntil.count( pair )) - t = std::max( t, clogPairUntil[ pair ] ); + if (!g_simulator.speedUpSimulation && clogPairUntil.count(pair)) + t = std::max(t, clogPairUntil[pair]); - if (!g_simulator.speedUpSimulation && clogRecvUntil.count( to.ip )) - t = std::max( t, clogRecvUntil[ to.ip ] ); + if (!g_simulator.speedUpSimulation && clogRecvUntil.count(to.ip)) + t = std::max(t, clogRecvUntil[to.ip]); return t - tnow; } void clogPairFor(const IPAddress& from, const IPAddress& to, double t) { - auto& u = clogPairUntil[ std::make_pair( from, to ) ]; + auto& u = clogPairUntil[std::make_pair(from, to)]; u = std::max(u, now() + t); } void clogSendFor(const IPAddress& from, double t) { @@ -148,9 +163,9 @@ struct SimClogging { u = std::max(u, now() + t); } double setPairLatencyIfNotSet(const IPAddress& 
from, const IPAddress& to, double t) { - auto i = clogPairLatency.find( std::make_pair(from,to) ); + auto i = clogPairLatency.find(std::make_pair(from, to)); if (i == clogPairLatency.end()) - i = clogPairLatency.insert( std::make_pair( std::make_pair(from,to), t ) ).first; + i = clogPairLatency.insert(std::make_pair(std::make_pair(from, to), t)).first; return i->second; } @@ -163,10 +178,12 @@ private: const double pFast = 0.999; if (a <= pFast) { a = a / pFast; - return 0.5 * (FLOW_KNOBS->MIN_NETWORK_LATENCY * (1-a) + FLOW_KNOBS->FAST_NETWORK_LATENCY/pFast * a); // 0.5ms average + return 0.5 * (FLOW_KNOBS->MIN_NETWORK_LATENCY * (1 - a) + + FLOW_KNOBS->FAST_NETWORK_LATENCY / pFast * a); // 0.5ms average } else { - a = (a-pFast) / (1-pFast); // uniform 0-1 again - return 0.5 * (FLOW_KNOBS->MIN_NETWORK_LATENCY * (1-a) + FLOW_KNOBS->SLOW_NETWORK_LATENCY*a); // long tail up to X ms + a = (a - pFast) / (1 - pFast); // uniform 0-1 again + return 0.5 * (FLOW_KNOBS->MIN_NETWORK_LATENCY * (1 - a) + + FLOW_KNOBS->SLOW_NETWORK_LATENCY * a); // long tail up to X ms } } }; @@ -174,66 +191,70 @@ private: SimClogging g_clogging; struct Sim2Conn : IConnection, ReferenceCounted { - Sim2Conn( ISimulator::ProcessInfo* process ) - : process(process), dbgid( deterministicRandom()->randomUniqueID() ), opened(false), closedByCaller(false), stopReceive(Never()) - { + Sim2Conn(ISimulator::ProcessInfo* process) + : process(process), dbgid(deterministicRandom()->randomUniqueID()), opened(false), closedByCaller(false), + stopReceive(Never()) { pipes = sender(this) && receiver(this); } - // connect() is called on a pair of connections immediately after creation; logically it is part of the constructor and no other method may be called previously! - void connect( Reference peer, NetworkAddress peerEndpoint ) { + // connect() is called on a pair of connections immediately after creation; logically it is part of the constructor + // and no other method may be called previously! 
+ void connect(Reference peer, NetworkAddress peerEndpoint) { this->peer = peer; this->peerProcess = peer->process; this->peerId = peer->dbgid; this->peerEndpoint = peerEndpoint; - // Every one-way connection gets a random permanent latency and a random send buffer for the duration of the connection - auto latency = g_clogging.setPairLatencyIfNotSet( peerProcess->address.ip, process->address.ip, FLOW_KNOBS->MAX_CLOGGING_LATENCY*deterministicRandom()->random01() ); - sendBufSize = std::max( deterministicRandom()->randomInt(0, 5000000), 25e6 * (latency + .002) ); + // Every one-way connection gets a random permanent latency and a random send buffer for the duration of the + // connection + auto latency = + g_clogging.setPairLatencyIfNotSet(peerProcess->address.ip, + process->address.ip, + FLOW_KNOBS->MAX_CLOGGING_LATENCY * deterministicRandom()->random01()); + sendBufSize = std::max(deterministicRandom()->randomInt(0, 5000000), 25e6 * (latency + .002)); TraceEvent("Sim2Connection").detail("SendBufSize", sendBufSize).detail("Latency", latency); } - ~Sim2Conn() { - ASSERT_ABORT( !opened || closedByCaller ); - } + ~Sim2Conn() { ASSERT_ABORT(!opened || closedByCaller); } virtual void addref() { ReferenceCounted::addref(); } virtual void delref() { ReferenceCounted::delref(); } - virtual void close() { closedByCaller = true; closeInternal(); } + virtual void close() { + closedByCaller = true; + closeInternal(); + } - virtual Future acceptHandshake() { return delay(0.01*deterministicRandom()->random01()); } - virtual Future connectHandshake() { return delay(0.01*deterministicRandom()->random01()); } + virtual Future acceptHandshake() { return delay(0.01 * deterministicRandom()->random01()); } + virtual Future connectHandshake() { return delay(0.01 * deterministicRandom()->random01()); } virtual Future onWritable() { return whenWritable(this); } virtual Future onReadable() { return whenReadable(this); } - bool isPeerGone() { - return !peer || peerProcess->failed; - } + bool isPeerGone() { return !peer || peerProcess->failed; } void peerClosed() { leakedConnectionTracker = trackLeakedConnection(this); stopReceive = delay(1.0); } - // Reads as many bytes as possible from the read buffer into [begin,end) and returns the number of bytes read (might be 0) - // (or may throw an error if the connection dies) - virtual int read( uint8_t* begin, uint8_t* end ) { + // Reads as many bytes as possible from the read buffer into [begin,end) and returns the number of bytes read (might + // be 0) (or may throw an error if the connection dies) + virtual int read(uint8_t* begin, uint8_t* end) { rollRandomClose(); - int64_t avail = receivedBytes.get() - readBytes.get(); // SOMEDAY: random? 
- int toRead = std::min( end-begin, avail ); - ASSERT( toRead >= 0 && toRead <= recvBuf.size() && toRead <= end-begin ); - for(int i=0; i(end - begin, avail); + ASSERT(toRead >= 0 && toRead <= recvBuf.size() && toRead <= end - begin); + for (int i = 0; i < toRead; i++) begin[i] = recvBuf[i]; - recvBuf.erase( recvBuf.begin(), recvBuf.begin() + toRead ); - readBytes.set( readBytes.get() + toRead ); + recvBuf.erase(recvBuf.begin(), recvBuf.begin() + toRead); + readBytes.set(readBytes.get() + toRead); return toRead; } - // Writes as many bytes as possible from the given SendBuffer chain into the write buffer and returns the number of bytes written (might be 0) - // (or may throw an error if the connection dies) - virtual int write( SendBuffer const* buffer, int limit) { + // Writes as many bytes as possible from the given SendBuffer chain into the write buffer and returns the number of + // bytes written (might be 0) (or may throw an error if the connection dies) + virtual int write(SendBuffer const* buffer, int limit) { rollRandomClose(); ASSERT(limit > 0); @@ -241,49 +262,50 @@ struct Sim2Conn : IConnection, ReferenceCounted { if (BUGGIFY) { toSend = std::min(limit, buffer->bytes_written - buffer->bytes_sent); } else { - for(auto p = buffer; p; p=p->next) { + for (auto p = buffer; p; p = p->next) { toSend += p->bytes_written - p->bytes_sent; - if(toSend >= limit) { - if(toSend > limit) + if (toSend >= limit) { + if (toSend > limit) toSend = limit; break; } } } ASSERT(toSend); - if (BUGGIFY) toSend = std::min(toSend, deterministicRandom()->randomInt(0, 1000)); + if (BUGGIFY) + toSend = std::min(toSend, deterministicRandom()->randomInt(0, 1000)); - if (!peer) return toSend; - toSend = std::min( toSend, peer->availableSendBufferForPeer() ); - ASSERT( toSend >= 0 ); + if (!peer) + return toSend; + toSend = std::min(toSend, peer->availableSendBufferForPeer()); + ASSERT(toSend >= 0); int leftToSend = toSend; - for(auto p = buffer; p && leftToSend>0; p=p->next) { + for (auto p = buffer; p && leftToSend > 0; p = p->next) { int ts = std::min(leftToSend, p->bytes_written - p->bytes_sent); - peer->recvBuf.insert( peer->recvBuf.end(), p->data + p->bytes_sent, p->data + p->bytes_sent + ts ); + peer->recvBuf.insert(peer->recvBuf.end(), p->data + p->bytes_sent, p->data + p->bytes_sent + ts); leftToSend -= ts; } - ASSERT( leftToSend == 0 ); - peer->writtenBytes.set( peer->writtenBytes.get() + toSend ); + ASSERT(leftToSend == 0); + peer->writtenBytes.set(peer->writtenBytes.get() + toSend); return toSend; } - // Returns the network address and port of the other end of the connection. In the case of an incoming connection, this may not - // be an address we can connect to! + // Returns the network address and port of the other end of the connection. In the case of an incoming connection, + // this may not be an address we can connect to! virtual NetworkAddress getPeerAddress() { return peerEndpoint; } virtual UID getDebugID() { return dbgid; } bool opened, closedByCaller; private: - ISimulator::ProcessInfo* process, *peerProcess; + ISimulator::ProcessInfo *process, *peerProcess; UID dbgid, peerId; NetworkAddress peerEndpoint; - std::deque< uint8_t > recvBuf; // Includes bytes written but not yet received! + std::deque recvBuf; // Includes bytes written but not yet received! 
AsyncVar readBytes, // bytes already pulled from recvBuf (location of the beginning of recvBuf) - receivedBytes, - sentBytes, - writtenBytes; // location of the end of recvBuf ( == recvBuf.size() + readBytes.get() ) + receivedBytes, sentBytes, + writtenBytes; // location of the end of recvBuf ( == recvBuf.size() + readBytes.get() ) Reference peer; int sendBufSize; @@ -292,10 +314,12 @@ private: Future pipes; Future stopReceive; - int availableSendBufferForPeer() const { return sendBufSize - (writtenBytes.get() - receivedBytes.get()); } // SOMEDAY: acknowledgedBytes instead of receivedBytes + int availableSendBufferForPeer() const { + return sendBufSize - (writtenBytes.get() - receivedBytes.get()); + } // SOMEDAY: acknowledgedBytes instead of receivedBytes void closeInternal() { - if(peer) { + if (peer) { peer->peerClosed(); stopReceive = delay(1.0); } @@ -303,92 +327,107 @@ private: peer.clear(); } - ACTOR static Future sender( Sim2Conn* self ) { + ACTOR static Future sender(Sim2Conn* self) { loop { - wait( self->writtenBytes.onChange() ); // takes place on peer! - ASSERT( g_simulator.getCurrentProcess() == self->peerProcess ); - wait( delay( .002 * deterministicRandom()->random01() ) ); - self->sentBytes.set( self->writtenBytes.get() ); // or possibly just some sometimes... + wait(self->writtenBytes.onChange()); // takes place on peer! + ASSERT(g_simulator.getCurrentProcess() == self->peerProcess); + wait(delay(.002 * deterministicRandom()->random01())); + self->sentBytes.set(self->writtenBytes.get()); // or possibly just some sometimes... } } - ACTOR static Future receiver( Sim2Conn* self ) { + ACTOR static Future receiver(Sim2Conn* self) { loop { if (self->sentBytes.get() != self->receivedBytes.get()) - wait( g_simulator.onProcess( self->peerProcess ) ); - while ( self->sentBytes.get() == self->receivedBytes.get() ) - wait( self->sentBytes.onChange() ); - ASSERT( g_simulator.getCurrentProcess() == self->peerProcess ); - state int64_t pos = deterministicRandom()->random01() < .5 ? self->sentBytes.get() : deterministicRandom()->randomInt64( self->receivedBytes.get(), self->sentBytes.get()+1 ); - wait( delay( g_clogging.getSendDelay( self->process->address, self->peerProcess->address ) ) ); - wait( g_simulator.onProcess( self->process ) ); - ASSERT( g_simulator.getCurrentProcess() == self->process ); - wait( delay( g_clogging.getRecvDelay( self->process->address, self->peerProcess->address ) ) ); - ASSERT( g_simulator.getCurrentProcess() == self->process ); - if(self->stopReceive.isReady()) { + wait(g_simulator.onProcess(self->peerProcess)); + while (self->sentBytes.get() == self->receivedBytes.get()) + wait(self->sentBytes.onChange()); + ASSERT(g_simulator.getCurrentProcess() == self->peerProcess); + state int64_t pos = + deterministicRandom()->random01() < .5 + ? 
self->sentBytes.get() + : deterministicRandom()->randomInt64(self->receivedBytes.get(), self->sentBytes.get() + 1); + wait(delay(g_clogging.getSendDelay(self->process->address, self->peerProcess->address))); + wait(g_simulator.onProcess(self->process)); + ASSERT(g_simulator.getCurrentProcess() == self->process); + wait(delay(g_clogging.getRecvDelay(self->process->address, self->peerProcess->address))); + ASSERT(g_simulator.getCurrentProcess() == self->process); + if (self->stopReceive.isReady()) { wait(Future(Never())); } - self->receivedBytes.set( pos ); - wait( Future(Void()) ); // Prior notification can delete self and cancel this actor - ASSERT( g_simulator.getCurrentProcess() == self->process ); + self->receivedBytes.set(pos); + wait(Future(Void())); // Prior notification can delete self and cancel this actor + ASSERT(g_simulator.getCurrentProcess() == self->process); } } - ACTOR static Future whenReadable( Sim2Conn* self ) { + ACTOR static Future whenReadable(Sim2Conn* self) { try { loop { if (self->readBytes.get() != self->receivedBytes.get()) { - ASSERT( g_simulator.getCurrentProcess() == self->process ); + ASSERT(g_simulator.getCurrentProcess() == self->process); return Void(); } - wait( self->receivedBytes.onChange() ); + wait(self->receivedBytes.onChange()); self->rollRandomClose(); } } catch (Error& e) { - ASSERT( g_simulator.getCurrentProcess() == self->process ); + ASSERT(g_simulator.getCurrentProcess() == self->process); throw; } } - ACTOR static Future whenWritable( Sim2Conn* self ) { + ACTOR static Future whenWritable(Sim2Conn* self) { try { loop { - if (!self->peer) return Void(); + if (!self->peer) + return Void(); if (self->peer->availableSendBufferForPeer() > 0) { - ASSERT( g_simulator.getCurrentProcess() == self->process ); + ASSERT(g_simulator.getCurrentProcess() == self->process); return Void(); } try { - wait( self->peer->receivedBytes.onChange() ); - ASSERT( g_simulator.getCurrentProcess() == self->peerProcess ); + wait(self->peer->receivedBytes.onChange()); + ASSERT(g_simulator.getCurrentProcess() == self->peerProcess); } catch (Error& e) { - if (e.code() != error_code_broken_promise) throw; + if (e.code() != error_code_broken_promise) + throw; } - wait( g_simulator.onProcess( self->process ) ); + wait(g_simulator.onProcess(self->process)); } } catch (Error& e) { - ASSERT( g_simulator.getCurrentProcess() == self->process ); + ASSERT(g_simulator.getCurrentProcess() == self->process); throw; } } void rollRandomClose() { - if (now() - g_simulator.lastConnectionFailure > g_simulator.connectionFailuresDisableDuration && deterministicRandom()->random01() < .00001) { + if (now() - g_simulator.lastConnectionFailure > g_simulator.connectionFailuresDisableDuration && + deterministicRandom()->random01() < .00001) { g_simulator.lastConnectionFailure = now(); double a = deterministicRandom()->random01(), b = deterministicRandom()->random01(); - TEST(true); // Simulated connection failure - TraceEvent("ConnectionFailure", dbgid).detail("MyAddr", process->address).detail("PeerAddr", peerProcess->address).detail("SendClosed", a > .33).detail("RecvClosed", a < .66).detail("Explicit", b < .3); - if (a < .66 && peer) peer->closeInternal(); - if (a > .33) closeInternal(); - // At the moment, we occasionally notice the connection failed immediately. In principle, this could happen but only after a delay. 
+ TEST(true); // Simulated connection failure + TraceEvent("ConnectionFailure", dbgid) + .detail("MyAddr", process->address) + .detail("PeerAddr", peerProcess->address) + .detail("SendClosed", a > .33) + .detail("RecvClosed", a < .66) + .detail("Explicit", b < .3); + if (a < .66 && peer) + peer->closeInternal(); + if (a > .33) + closeInternal(); + // At the moment, we occasionally notice the connection failed immediately. In principle, this could happen + // but only after a delay. if (b < .3) throw connection_failed(); } } - ACTOR static Future trackLeakedConnection( Sim2Conn* self ) { - wait( g_simulator.onProcess( self->process ) ); + ACTOR static Future trackLeakedConnection(Sim2Conn* self) { + wait(g_simulator.onProcess(self->process)); if (self->process->address.isPublic()) { - wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 1.5 ) ); + wait( + delay(FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 1.5)); } else { - wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 1.5 ) ); + wait(delay(FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 1.5)); } TraceEvent(SevError, "LeakedConnection", self->dbgid) .error(connection_leaked()) @@ -403,7 +442,7 @@ private: #include #include -int sf_open( const char* filename, int flags, int convFlags, int mode ); +int sf_open(const char* filename, int flags, int convFlags, int mode); #if defined(_WIN32) #include @@ -419,8 +458,8 @@ int sf_open( const char* filename, int flags, int convFlags, int mode ); #define _chsize ::ftruncate #define O_BINARY 0 -int sf_open( const char* filename, int flags, int convFlags, int mode ) { - return _open( filename, convFlags, mode ); +int sf_open(const char* filename, int flags, int convFlags, int mode) { + return _open(filename, convFlags, mode); } #else @@ -433,17 +472,21 @@ public: static bool should_poll() { return false; } - ACTOR static Future> open( std::string filename, int flags, int mode, - Reference diskParameters = Reference(new DiskParameters(25000, 150000000)), bool delayOnWrite = true ) { + ACTOR static Future> open( + std::string filename, + int flags, + int mode, + Reference diskParameters = Reference(new DiskParameters(25000, 150000000)), + bool delayOnWrite = true) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); state TaskPriority currentTaskID = g_network->getCurrentTask(); - if(++openCount >= 3000) { + if (++openCount >= 3000) { TraceEvent(SevError, "TooManyFiles"); ASSERT(false); } - if(openCount == 2000) { + if (openCount == 2000) { TraceEvent(SevWarnAlways, "DisableConnectionFailures_TooManyFiles"); g_simulator.speedUpSimulation = true; g_simulator.connectionFailuresDisableDuration = 1e6; @@ -451,34 +494,39 @@ public: // Filesystems on average these days seem to start to have limits of around 255 characters for a // filename. We add ".part" below, so we need to stay under 250. 
- ASSERT( basename(filename).size() < 250 ); + ASSERT(basename(filename).size() < 250); - wait( g_simulator.onMachine( currentProcess ) ); + wait(g_simulator.onMachine(currentProcess)); try { - wait( delay(FLOW_KNOBS->MIN_OPEN_TIME + deterministicRandom()->random01() * (FLOW_KNOBS->MAX_OPEN_TIME - FLOW_KNOBS->MIN_OPEN_TIME) ) ); + wait(delay(FLOW_KNOBS->MIN_OPEN_TIME + + deterministicRandom()->random01() * (FLOW_KNOBS->MAX_OPEN_TIME - FLOW_KNOBS->MIN_OPEN_TIME))); std::string open_filename = filename; if (flags & OPEN_ATOMIC_WRITE_AND_CREATE) { - ASSERT( (flags & OPEN_CREATE) && (flags & OPEN_READWRITE) && !(flags & OPEN_EXCLUSIVE) ); + ASSERT((flags & OPEN_CREATE) && (flags & OPEN_READWRITE) && !(flags & OPEN_EXCLUSIVE)); open_filename = filename + ".part"; } - int h = sf_open( open_filename.c_str(), flags, flagConversion(flags), mode ); - if( h == -1 ) { + int h = sf_open(open_filename.c_str(), flags, flagConversion(flags), mode); + if (h == -1) { bool notFound = errno == ENOENT; Error e = notFound ? file_not_found() : io_error(); - TraceEvent(notFound ? SevWarn : SevWarnAlways, "FileOpenError").error(e).GetLastError().detail("File", filename).detail("Flags", flags); + TraceEvent(notFound ? SevWarn : SevWarnAlways, "FileOpenError") + .error(e) + .GetLastError() + .detail("File", filename) + .detail("Flags", flags); throw e; } platform::makeTemporary(open_filename.c_str()); - SimpleFile *simpleFile = new SimpleFile( h, diskParameters, delayOnWrite, filename, open_filename, flags ); - state Reference file = Reference( simpleFile ); - wait( g_simulator.onProcess( currentProcess, currentTaskID ) ); + SimpleFile* simpleFile = new SimpleFile(h, diskParameters, delayOnWrite, filename, open_filename, flags); + state Reference file = Reference(simpleFile); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); return file; - } catch( Error &e ) { + } catch (Error& e) { state Error err = e; - wait( g_simulator.onProcess( currentProcess, currentTaskID ) ); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); throw err; } } @@ -488,86 +536,97 @@ public: virtual int64_t debugFD() { return (int64_t)h; } - virtual Future read( void* data, int length, int64_t offset ) { - return read_impl( this, data, length, offset ); + virtual Future read(void* data, int length, int64_t offset) { return read_impl(this, data, length, offset); } + + virtual Future write(void const* data, int length, int64_t offset) { + return write_impl(this, StringRef((const uint8_t*)data, length), offset); } - virtual Future write( void const* data, int length, int64_t offset ) { - return write_impl( this, StringRef((const uint8_t*)data, length), offset ); - } + virtual Future truncate(int64_t size) { return truncate_impl(this, size); } - virtual Future truncate( int64_t size ) { - return truncate_impl( this, size ); - } + virtual Future sync() { return sync_impl(this); } - virtual Future sync() { - return sync_impl( this ); - } + virtual Future size() { return size_impl(this); } - virtual Future size() { - return size_impl( this ); - } + virtual std::string getFilename() { return actualFilename; } - virtual std::string getFilename() { - return actualFilename; - } - - ~SimpleFile() { - _close( h ); - } + ~SimpleFile() { _close(h); } private: int h; - //Performance parameters of simulated disk + // Performance parameters of simulated disk Reference diskParameters; std::string filename, actualFilename; int flags; UID dbgId; - //If true, then writes/truncates will be preceded by a delay (like other operations). 
If false, then they will not - //This is to support AsyncFileNonDurable, which issues its own delays for writes and truncates + // If true, then writes/truncates will be preceded by a delay (like other operations). If false, then they will not + // This is to support AsyncFileNonDurable, which issues its own delays for writes and truncates bool delayOnWrite; - SimpleFile(int h, Reference diskParameters, bool delayOnWrite, const std::string& filename, const std::string& actualFilename, int flags) - : h(h), diskParameters(diskParameters), delayOnWrite(delayOnWrite), filename(filename), actualFilename(actualFilename), dbgId(deterministicRandom()->randomUniqueID()), flags(flags) {} + SimpleFile(int h, + Reference diskParameters, + bool delayOnWrite, + const std::string& filename, + const std::string& actualFilename, + int flags) + : h(h), diskParameters(diskParameters), delayOnWrite(delayOnWrite), filename(filename), + actualFilename(actualFilename), dbgId(deterministicRandom()->randomUniqueID()), flags(flags) {} - static int flagConversion( int flags ) { + static int flagConversion(int flags) { int outFlags = O_BINARY | O_CLOEXEC; - if( flags&OPEN_READWRITE ) outFlags |= O_RDWR; - if( flags&OPEN_CREATE ) outFlags |= O_CREAT; - if( flags&OPEN_READONLY ) outFlags |= O_RDONLY; - if( flags&OPEN_EXCLUSIVE ) outFlags |= O_EXCL; - if( flags&OPEN_ATOMIC_WRITE_AND_CREATE ) outFlags |= O_TRUNC; + if (flags & OPEN_READWRITE) + outFlags |= O_RDWR; + if (flags & OPEN_CREATE) + outFlags |= O_CREAT; + if (flags & OPEN_READONLY) + outFlags |= O_RDONLY; + if (flags & OPEN_EXCLUSIVE) + outFlags |= O_EXCL; + if (flags & OPEN_ATOMIC_WRITE_AND_CREATE) + outFlags |= O_TRUNC; return outFlags; } - ACTOR static Future read_impl( SimpleFile* self, void* data, int length, int64_t offset ) { - ASSERT( ( self->flags & IAsyncFile::OPEN_NO_AIO ) != 0 || - ( (uintptr_t)data % 4096 == 0 && length % 4096 == 0 && offset % 4096 == 0 ) ); // Required by KAIO. + ACTOR static Future read_impl(SimpleFile* self, void* data, int length, int64_t offset) { + ASSERT((self->flags & IAsyncFile::OPEN_NO_AIO) != 0 || + ((uintptr_t)data % 4096 == 0 && length % 4096 == 0 && offset % 4096 == 0)); // Required by KAIO. 
state UID opId = deterministicRandom()->randomUniqueID(); if (randLog) - fprintf( randLog, "SFR1 %s %s %s %d %" PRId64 "\n", self->dbgId.shortString().c_str(), self->filename.c_str(), opId.shortString().c_str(), length, offset ); + fprintf(randLog, + "SFR1 %s %s %s %d %" PRId64 "\n", + self->dbgId.shortString().c_str(), + self->filename.c_str(), + opId.shortString().c_str(), + length, + offset); - wait( waitUntilDiskReady( self->diskParameters, length ) ); + wait(waitUntilDiskReady(self->diskParameters, length)); - if( _lseeki64( self->h, offset, SEEK_SET ) == -1 ) { + if (_lseeki64(self->h, offset, SEEK_SET) == -1) { TraceEvent(SevWarn, "SimpleFileIOError").detail("Location", 1); throw io_error(); } unsigned int read_bytes = 0; - if( ( read_bytes = _read( self->h, data, (unsigned int) length ) ) == -1 ) { + if ((read_bytes = _read(self->h, data, (unsigned int)length)) == -1) { TraceEvent(SevWarn, "SimpleFileIOError").detail("Location", 2); throw io_error(); } if (randLog) { - uint32_t a=0, b=0; - hashlittle2( data, read_bytes, &a, &b ); - fprintf( randLog, "SFR2 %s %s %s %d %d\n", self->dbgId.shortString().c_str(), self->filename.c_str(), opId.shortString().c_str(), read_bytes, a ); + uint32_t a = 0, b = 0; + hashlittle2(data, read_bytes, &a, &b); + fprintf(randLog, + "SFR2 %s %s %s %d %d\n", + self->dbgId.shortString().c_str(), + self->filename.c_str(), + opId.shortString().c_str(), + read_bytes, + a); } debugFileCheck("SimpleFileRead", self->filename, data, offset, length); @@ -578,35 +637,46 @@ private: return read_bytes; } - ACTOR static Future write_impl( SimpleFile* self, StringRef data, int64_t offset ) { + ACTOR static Future write_impl(SimpleFile* self, StringRef data, int64_t offset) { state UID opId = deterministicRandom()->randomUniqueID(); if (randLog) { - uint32_t a=0, b=0; - hashlittle2( data.begin(), data.size(), &a, &b ); - fprintf( randLog, "SFW1 %s %s %s %d %d %" PRId64 "\n", self->dbgId.shortString().c_str(), self->filename.c_str(), opId.shortString().c_str(), a, data.size(), offset ); + uint32_t a = 0, b = 0; + hashlittle2(data.begin(), data.size(), &a, &b); + fprintf(randLog, + "SFW1 %s %s %s %d %d %" PRId64 "\n", + self->dbgId.shortString().c_str(), + self->filename.c_str(), + opId.shortString().c_str(), + a, + data.size(), + offset); } - if(self->delayOnWrite) - wait( waitUntilDiskReady( self->diskParameters, data.size() ) ); + if (self->delayOnWrite) + wait(waitUntilDiskReady(self->diskParameters, data.size())); - if( _lseeki64( self->h, offset, SEEK_SET ) == -1 ) { + if (_lseeki64(self->h, offset, SEEK_SET) == -1) { TraceEvent(SevWarn, "SimpleFileIOError").detail("Location", 3); throw io_error(); } unsigned int write_bytes = 0; - if ( ( write_bytes = _write( self->h, (void*)data.begin(), data.size() ) ) == -1 ) { + if ((write_bytes = _write(self->h, (void*)data.begin(), data.size())) == -1) { TraceEvent(SevWarn, "SimpleFileIOError").detail("Location", 4); throw io_error(); } - if ( write_bytes != data.size() ) { + if (write_bytes != data.size()) { TraceEvent(SevWarn, "SimpleFileIOError").detail("Location", 5); throw io_error(); } if (randLog) { - fprintf( randLog, "SFW2 %s %s %s\n", self->dbgId.shortString().c_str(), self->filename.c_str(), opId.shortString().c_str()); + fprintf(randLog, + "SFW2 %s %s %s\n", + self->dbgId.shortString().c_str(), + self->filename.c_str(), + opId.shortString().c_str()); } debugFileCheck("SimpleFileWrite", self->filename, (void*)data.begin(), offset, data.size()); @@ -617,49 +687,71 @@ private: return Void(); } - ACTOR static Future 
truncate_impl( SimpleFile* self, int64_t size ) { + ACTOR static Future truncate_impl(SimpleFile* self, int64_t size) { state UID opId = deterministicRandom()->randomUniqueID(); if (randLog) - fprintf( randLog, "SFT1 %s %s %s %" PRId64 "\n", self->dbgId.shortString().c_str(), self->filename.c_str(), opId.shortString().c_str(), size ); + fprintf(randLog, + "SFT1 %s %s %s %" PRId64 "\n", + self->dbgId.shortString().c_str(), + self->filename.c_str(), + opId.shortString().c_str(), + size); // KAIO will return EINVAL, as len==0 is an error. - if( (self->flags & IAsyncFile::OPEN_NO_AIO) == 0 && size == 0) { + if ((self->flags & IAsyncFile::OPEN_NO_AIO) == 0 && size == 0) { throw io_error(); } - if(self->delayOnWrite) - wait( waitUntilDiskReady( self->diskParameters, 0 ) ); + if (self->delayOnWrite) + wait(waitUntilDiskReady(self->diskParameters, 0)); - if( _chsize( self->h, (long) size ) == -1 ) { - TraceEvent(SevWarn, "SimpleFileIOError").detail("Location", 6).detail("Filename", self->filename).detail("Size", size).detail("Fd", self->h).GetLastError(); + if (_chsize(self->h, (long)size) == -1) { + TraceEvent(SevWarn, "SimpleFileIOError") + .detail("Location", 6) + .detail("Filename", self->filename) + .detail("Size", size) + .detail("Fd", self->h) + .GetLastError(); throw io_error(); } if (randLog) - fprintf( randLog, "SFT2 %s %s %s\n", self->dbgId.shortString().c_str(), self->filename.c_str(), opId.shortString().c_str()); + fprintf(randLog, + "SFT2 %s %s %s\n", + self->dbgId.shortString().c_str(), + self->filename.c_str(), + opId.shortString().c_str()); - INJECT_FAULT( io_timeout, "SimpleFile::truncate" ); - INJECT_FAULT( io_error, "SimpleFile::truncate" ); + INJECT_FAULT(io_timeout, "SimpleFile::truncate"); + INJECT_FAULT(io_error, "SimpleFile::truncate"); return Void(); } - ACTOR static Future sync_impl( SimpleFile* self ) { + ACTOR static Future sync_impl(SimpleFile* self) { state UID opId = deterministicRandom()->randomUniqueID(); if (randLog) - fprintf( randLog, "SFC1 %s %s %s\n", self->dbgId.shortString().c_str(), self->filename.c_str(), opId.shortString().c_str()); + fprintf(randLog, + "SFC1 %s %s %s\n", + self->dbgId.shortString().c_str(), + self->filename.c_str(), + opId.shortString().c_str()); - if(self->delayOnWrite) - wait( waitUntilDiskReady( self->diskParameters, 0, true ) ); + if (self->delayOnWrite) + wait(waitUntilDiskReady(self->diskParameters, 0, true)); if (self->flags & OPEN_ATOMIC_WRITE_AND_CREATE) { self->flags &= ~OPEN_ATOMIC_WRITE_AND_CREATE; auto& machineCache = g_simulator.getCurrentProcess()->machine->openFiles; std::string sourceFilename = self->filename + ".part"; - if(machineCache.count(sourceFilename)) { - TraceEvent("SimpleFileRename").detail("From", sourceFilename).detail("To", self->filename).detail("SourceCount", machineCache.count(sourceFilename)).detail("FileCount", machineCache.count(self->filename)); - renameFile( sourceFilename.c_str(), self->filename.c_str() ); + if (machineCache.count(sourceFilename)) { + TraceEvent("SimpleFileRename") + .detail("From", sourceFilename) + .detail("To", self->filename) + .detail("SourceCount", machineCache.count(sourceFilename)) + .detail("FileCount", machineCache.count(self->filename)); + renameFile(sourceFilename.c_str(), self->filename.c_str()); ASSERT(!machineCache.count(self->filename)); machineCache[self->filename] = machineCache[sourceFilename]; @@ -669,30 +761,43 @@ private: } if (randLog) - fprintf( randLog, "SFC2 %s %s %s\n", self->dbgId.shortString().c_str(), self->filename.c_str(), 
opId.shortString().c_str()); + fprintf(randLog, + "SFC2 %s %s %s\n", + self->dbgId.shortString().c_str(), + self->filename.c_str(), + opId.shortString().c_str()); - INJECT_FAULT( io_timeout, "SimpleFile::sync" ); - INJECT_FAULT( io_error, "SimpleFile::sync" ); + INJECT_FAULT(io_timeout, "SimpleFile::sync"); + INJECT_FAULT(io_error, "SimpleFile::sync"); return Void(); } - ACTOR static Future size_impl( SimpleFile* self ) { + ACTOR static Future size_impl(SimpleFile* self) { state UID opId = deterministicRandom()->randomUniqueID(); if (randLog) - fprintf(randLog, "SFS1 %s %s %s\n", self->dbgId.shortString().c_str(), self->filename.c_str(), opId.shortString().c_str()); + fprintf(randLog, + "SFS1 %s %s %s\n", + self->dbgId.shortString().c_str(), + self->filename.c_str(), + opId.shortString().c_str()); - wait( waitUntilDiskReady( self->diskParameters, 0 ) ); + wait(waitUntilDiskReady(self->diskParameters, 0)); - int64_t pos = _lseeki64( self->h, 0L, SEEK_END ); - if( pos == -1 ) { + int64_t pos = _lseeki64(self->h, 0L, SEEK_END); + if (pos == -1) { TraceEvent(SevWarn, "SimpleFileIOError").detail("Location", 8); throw io_error(); } if (randLog) - fprintf(randLog, "SFS2 %s %s %s %" PRId64 "\n", self->dbgId.shortString().c_str(), self->filename.c_str(), opId.shortString().c_str(), pos); - INJECT_FAULT( io_error, "SimpleFile::size" ); + fprintf(randLog, + "SFS2 %s %s %s %" PRId64 "\n", + self->dbgId.shortString().c_str(), + self->filename.c_str(), + opId.shortString().c_str(), + pos); + INJECT_FAULT(io_error, "SimpleFile::size"); return pos; } @@ -700,46 +805,43 @@ private: struct SimDiskSpace { int64_t totalSpace; - int64_t baseFreeSpace; //The original free space of the disk + deltas from simulated external modifications + int64_t baseFreeSpace; // The original free space of the disk + deltas from simulated external modifications double lastUpdate; }; -void doReboot( ISimulator::ProcessInfo* const& p, ISimulator::KillType const& kt ); +void doReboot(ISimulator::ProcessInfo* const& p, ISimulator::KillType const& kt); struct Sim2Listener : IListener, ReferenceCounted { - explicit Sim2Listener( ISimulator::ProcessInfo* process, const NetworkAddress& listenAddr ) - : process(process), - address(listenAddr) {} + explicit Sim2Listener(ISimulator::ProcessInfo* process, const NetworkAddress& listenAddr) + : process(process), address(listenAddr) {} - void incomingConnection( double seconds, Reference conn ) { // Called by another process! - incoming( Reference::addRef( this ), seconds, conn ); + void incomingConnection(double seconds, Reference conn) { // Called by another process! 
+ incoming(Reference::addRef(this), seconds, conn); } virtual void addref() { ReferenceCounted::addref(); } virtual void delref() { ReferenceCounted::delref(); } - virtual Future> accept() { - return popOne( nextConnection.getFuture() ); - } + virtual Future> accept() { return popOne(nextConnection.getFuture()); } virtual NetworkAddress getListenAddress() { return address; } private: ISimulator::ProcessInfo* process; - PromiseStream< Reference > nextConnection; + PromiseStream> nextConnection; - ACTOR static void incoming( Reference self, double seconds, Reference conn ) { - wait( g_simulator.onProcess(self->process) ); - wait( delay( seconds ) ); - if (((Sim2Conn*)conn.getPtr())->isPeerGone() && deterministicRandom()->random01()<0.5) + ACTOR static void incoming(Reference self, double seconds, Reference conn) { + wait(g_simulator.onProcess(self->process)); + wait(delay(seconds)); + if (((Sim2Conn*)conn.getPtr())->isPeerGone() && deterministicRandom()->random01() < 0.5) return; TraceEvent("Sim2IncomingConn", conn->getDebugID()) - .detail("ListenAddress", self->getListenAddress()) - .detail("PeerAddress", conn->getPeerAddress()); - self->nextConnection.send( conn ); + .detail("ListenAddress", self->getListenAddress()) + .detail("PeerAddress", conn->getPeerAddress()); + self->nextConnection.send(conn); } - ACTOR static Future> popOne( FutureStream< Reference > conns ) { - Reference c = waitNext( conns ); + ACTOR static Future> popOne(FutureStream> conns) { + Reference c = waitNext(conns); ((Sim2Conn*)c.getPtr())->opened = true; return c; } @@ -752,72 +854,75 @@ private: class Sim2 : public ISimulator, public INetworkConnections { public: // Implement INetwork interface - // Everything actually network related is delegated to the Sim2Net class; Sim2 is only concerned with simulating machines and time + // Everything actually network related is delegated to the Sim2Net class; Sim2 is only concerned with simulating + // machines and time virtual double now() { return time; } // timer() can be up to 0.1 seconds ahead of now() virtual double timer() { - timerTime += deterministicRandom()->random01()*(time+0.1-timerTime)/2.0; - return timerTime; + timerTime += deterministicRandom()->random01() * (time + 0.1 - timerTime) / 2.0; + return timerTime; } - virtual Future delay( double seconds, TaskPriority taskID ) { + virtual Future delay(double seconds, TaskPriority taskID) { ASSERT(taskID >= TaskPriority::Min && taskID <= TaskPriority::Max); - return delay( seconds, taskID, currentProcess ); + return delay(seconds, taskID, currentProcess); } - Future delay( double seconds, TaskPriority taskID, ProcessInfo* machine ) { - ASSERT( seconds >= -0.0001 ); + Future delay(double seconds, TaskPriority taskID, ProcessInfo* machine) { + ASSERT(seconds >= -0.0001); seconds = std::max(0.0, seconds); Future f; - if(!currentProcess->rebooting && machine == currentProcess && !currentProcess->shutdownSignal.isSet() && FLOW_KNOBS->MAX_BUGGIFIED_DELAY > 0 && deterministicRandom()->random01() < 0.25) { //FIXME: why doesnt this work when we are changing machines? - seconds += FLOW_KNOBS->MAX_BUGGIFIED_DELAY*pow(deterministicRandom()->random01(),1000.0); + if (!currentProcess->rebooting && machine == currentProcess && !currentProcess->shutdownSignal.isSet() && + FLOW_KNOBS->MAX_BUGGIFIED_DELAY > 0 && + deterministicRandom()->random01() < 0.25) { // FIXME: why doesnt this work when we are changing machines? 
+ seconds += FLOW_KNOBS->MAX_BUGGIFIED_DELAY * pow(deterministicRandom()->random01(), 1000.0); } mutex.enter(); - tasks.push( Task( time + seconds, taskID, taskCount++, machine, f ) ); + tasks.push(Task(time + seconds, taskID, taskCount++, machine, f)); mutex.leave(); return f; } - ACTOR static Future checkShutdown(Sim2 *self, TaskPriority taskID) { + ACTOR static Future checkShutdown(Sim2* self, TaskPriority taskID) { wait(success(self->getCurrentProcess()->shutdownSignal.getFuture())); self->setCurrentTask(taskID); return Void(); } - virtual Future yield( TaskPriority taskID ) { - if (taskID == TaskPriority::DefaultYield) taskID = currentTaskID; + virtual Future yield(TaskPriority taskID) { + if (taskID == TaskPriority::DefaultYield) + taskID = currentTaskID; if (check_yield(taskID)) { - // We want to check that yielders can handle actual time elapsing (it sometimes will outside simulation), but - // don't want to prevent instantaneous shutdown of "rebooted" machines. - return delay(getCurrentProcess()->rebooting ? 0 : .001,taskID) || checkShutdown(this, taskID); + // We want to check that yielders can handle actual time elapsing (it sometimes will outside simulation), + // but don't want to prevent instantaneous shutdown of "rebooted" machines. + return delay(getCurrentProcess()->rebooting ? 0 : .001, taskID) || checkShutdown(this, taskID); } setCurrentTask(taskID); return Void(); } - virtual bool check_yield( TaskPriority taskID ) { - if (yielded) return true; + virtual bool check_yield(TaskPriority taskID) { + if (yielded) + return true; if (--yield_limit <= 0) { - yield_limit = deterministicRandom()->randomInt(1, 150); // If yield returns false *too* many times in a row, there could be a stack overflow, since we can't deterministically check stack size as the real network does + yield_limit = deterministicRandom()->randomInt( + 1, 150); // If yield returns false *too* many times in a row, there could be a stack overflow, since we + // can't deterministically check stack size as the real network does return yielded = true; } return yielded = BUGGIFY_WITH_PROB(0.01); } - virtual TaskPriority getCurrentTask() { - return currentTaskID; - } - virtual void setCurrentTask(TaskPriority taskID ) { - currentTaskID = taskID; - } + virtual TaskPriority getCurrentTask() { return currentTaskID; } + virtual void setCurrentTask(TaskPriority taskID) { currentTaskID = taskID; } // Sets the taskID/priority of the current task, without yielding - virtual Future> connect( NetworkAddress toAddr, std::string host ) { - ASSERT( host.empty()); - if (!addressMap.count( toAddr )) { - return waitForProcessAndConnect( toAddr, this ); + virtual Future> connect(NetworkAddress toAddr, std::string host) { + ASSERT(host.empty()); + if (!addressMap.count(toAddr)) { + return waitForProcessAndConnect(toAddr, this); } auto peerp = getProcessByAddress(toAddr); - Reference myc( new Sim2Conn( getCurrentProcess() ) ); - Reference peerc( new Sim2Conn( peerp ) ); + Reference myc(new Sim2Conn(getCurrentProcess())); + Reference peerc(new Sim2Conn(peerp)); myc->connect(peerc, toAddr); IPAddress localIp; @@ -829,19 +934,22 @@ public: } else { localIp = IPAddress(getCurrentProcess()->address.ip.toV4() + deterministicRandom()->randomInt(0, 256)); } - peerc->connect(myc, NetworkAddress(localIp, deterministicRandom()->randomInt(40000, 60000), false, toAddr.isTLS())); + peerc->connect(myc, + NetworkAddress(localIp, deterministicRandom()->randomInt(40000, 60000), false, toAddr.isTLS())); - 
((Sim2Listener*)peerp->getListener(toAddr).getPtr())->incomingConnection( 0.5*deterministicRandom()->random01(), Reference(peerc) ); - return onConnect( ::delay(0.5*deterministicRandom()->random01()), myc ); + ((Sim2Listener*)peerp->getListener(toAddr).getPtr()) + ->incomingConnection(0.5 * deterministicRandom()->random01(), Reference(peerc)); + return onConnect(::delay(0.5 * deterministicRandom()->random01()), myc); } - virtual Future> resolveTCPEndpoint( std::string host, std::string service) { + virtual Future> resolveTCPEndpoint(std::string host, std::string service) { throw lookup_failed(); } - ACTOR static Future> onConnect( Future ready, Reference conn ) { + ACTOR static Future> onConnect(Future ready, Reference conn) { wait(ready); if (conn->isPeerGone()) { conn.clear(); - if(FLOW_KNOBS->SIM_CONNECT_ERROR_MODE == 1 || (FLOW_KNOBS->SIM_CONNECT_ERROR_MODE == 2 && deterministicRandom()->random01() > 0.5)) { + if (FLOW_KNOBS->SIM_CONNECT_ERROR_MODE == 1 || + (FLOW_KNOBS->SIM_CONNECT_ERROR_MODE == 2 && deterministicRandom()->random01() > 0.5)) { throw connection_failed(); } wait(Never()); @@ -849,18 +957,18 @@ public: conn->opened = true; return conn; } - virtual Reference listen( NetworkAddress localAddr ) { - Reference listener( getCurrentProcess()->getListener(localAddr) ); + virtual Reference listen(NetworkAddress localAddr) { + Reference listener(getCurrentProcess()->getListener(localAddr)); ASSERT(listener); return listener; } - ACTOR static Future> waitForProcessAndConnect( - NetworkAddress toAddr, INetworkConnections *self ) { + ACTOR static Future> waitForProcessAndConnect(NetworkAddress toAddr, + INetworkConnections* self) { // We have to be able to connect to processes that don't yet exist, so we do some silly polling loop { - wait( ::delay( 0.1 * deterministicRandom()->random01() ) ); + wait(::delay(0.1 * deterministicRandom()->random01())); if (g_sim2.addressMap.count(toAddr)) { - Reference c = wait( self->connect( toAddr ) ); + Reference c = wait(self->connect(toAddr)); return c; } } @@ -870,29 +978,25 @@ public: return emptyConfig; } - virtual void stop() { - isStopped = true; - } - virtual void addStopCallback( std::function fn ) { - stopCallbacks.emplace_back(std::move(fn)); - } + virtual void stop() { isStopped = true; } + virtual void addStopCallback(std::function fn) { stopCallbacks.emplace_back(std::move(fn)); } virtual bool isSimulated() const { return true; } struct SimThreadArgs { - THREAD_FUNC_RETURN (*func) (void*); - void *arg; + THREAD_FUNC_RETURN (*func)(void*); + void* arg; - ISimulator::ProcessInfo *currentProcess; + ISimulator::ProcessInfo* currentProcess; - SimThreadArgs(THREAD_FUNC_RETURN (*func) (void*), void *arg) : func(func), arg(arg) { + SimThreadArgs(THREAD_FUNC_RETURN (*func)(void*), void* arg) : func(func), arg(arg) { ASSERT(g_network->isSimulated()); currentProcess = g_simulator.getCurrentProcess(); } }; - //Starts a new thread, making sure to set any thread local state - THREAD_FUNC simStartThread(void *arg) { - SimThreadArgs *simArgs = (SimThreadArgs*)arg; + // Starts a new thread, making sure to set any thread local state + THREAD_FUNC simStartThread(void* arg) { + SimThreadArgs* simArgs = (SimThreadArgs*)arg; ISimulator::currentProcess = simArgs->currentProcess; simArgs->func(simArgs->arg); @@ -900,36 +1004,44 @@ public: THREAD_RETURN; } - virtual THREAD_HANDLE startThread( THREAD_FUNC_RETURN (*func) (void*), void *arg ) { - SimThreadArgs *simArgs = new SimThreadArgs(func, arg); + virtual THREAD_HANDLE startThread(THREAD_FUNC_RETURN 
(*func)(void*), void* arg) { + SimThreadArgs* simArgs = new SimThreadArgs(func, arg); return ::startThread(simStartThread, simArgs); } - virtual void getDiskBytes( std::string const& directory, int64_t& free, int64_t& total) { - ProcessInfo *proc = getCurrentProcess(); - SimDiskSpace &diskSpace = diskSpaceMap[proc->address.ip]; + virtual void getDiskBytes(std::string const& directory, int64_t& free, int64_t& total) { + ProcessInfo* proc = getCurrentProcess(); + SimDiskSpace& diskSpace = diskSpaceMap[proc->address.ip]; int64_t totalFileSize = 0; int numFiles = 0; - //Get the size of all files we've created on the server and subtract them from the free space - for(auto file = proc->machine->openFiles.begin(); file != proc->machine->openFiles.end(); ++file) { - if( file->second.isReady() ) { + // Get the size of all files we've created on the server and subtract them from the free space + for (auto file = proc->machine->openFiles.begin(); file != proc->machine->openFiles.end(); ++file) { + if (file->second.isReady()) { totalFileSize += ((AsyncFileNonDurable*)file->second.get().getPtr())->approximateSize; } numFiles++; } - if(diskSpace.totalSpace == 0) { - diskSpace.totalSpace = 5e9 + deterministicRandom()->random01() * 100e9; //Total space between 5GB and 105GB - diskSpace.baseFreeSpace = std::min(diskSpace.totalSpace, std::max(5e9, (deterministicRandom()->random01() * (1 - .075) + .075) * diskSpace.totalSpace) + totalFileSize); //Minimum 5GB or 7.5% total disk space, whichever is higher + if (diskSpace.totalSpace == 0) { + diskSpace.totalSpace = 5e9 + deterministicRandom()->random01() * 100e9; // Total space between 5GB and 105GB + diskSpace.baseFreeSpace = std::min( + diskSpace.totalSpace, + std::max(5e9, (deterministicRandom()->random01() * (1 - .075) + .075) * diskSpace.totalSpace) + + totalFileSize); // Minimum 5GB or 7.5% total disk space, whichever is higher - TraceEvent("Sim2DiskSpaceInitialization").detail("TotalSpace", diskSpace.totalSpace).detail("BaseFreeSpace", diskSpace.baseFreeSpace).detail("TotalFileSize", totalFileSize).detail("NumFiles", numFiles); - } - else { - int64_t maxDelta = std::min(5.0, (now() - diskSpace.lastUpdate)) * (BUGGIFY ? 10e6 : 1e6); //External processes modifying the disk + TraceEvent("Sim2DiskSpaceInitialization") + .detail("TotalSpace", diskSpace.totalSpace) + .detail("BaseFreeSpace", diskSpace.baseFreeSpace) + .detail("TotalFileSize", totalFileSize) + .detail("NumFiles", numFiles); + } else { + int64_t maxDelta = std::min(5.0, (now() - diskSpace.lastUpdate)) * + (BUGGIFY ? 
10e6 : 1e6); // External processes modifying the disk int64_t delta = -maxDelta + deterministicRandom()->random01() * maxDelta * 2; - diskSpace.baseFreeSpace = std::min(diskSpace.totalSpace, std::max(diskSpace.baseFreeSpace + delta, totalFileSize)); + diskSpace.baseFreeSpace = std::min( + diskSpace.totalSpace, std::max(diskSpace.baseFreeSpace + delta, totalFileSize)); } diskSpace.lastUpdate = now(); @@ -937,59 +1049,61 @@ public: total = diskSpace.totalSpace; free = std::max(0, diskSpace.baseFreeSpace - totalFileSize); - if(free == 0) - TraceEvent(SevWarnAlways, "Sim2NoFreeSpace").detail("TotalSpace", diskSpace.totalSpace).detail("BaseFreeSpace", diskSpace.baseFreeSpace).detail("TotalFileSize", totalFileSize).detail("NumFiles", numFiles); - } - virtual bool isAddressOnThisHost( NetworkAddress const& addr ) { - return addr.ip == getCurrentProcess()->address.ip; + if (free == 0) + TraceEvent(SevWarnAlways, "Sim2NoFreeSpace") + .detail("TotalSpace", diskSpace.totalSpace) + .detail("BaseFreeSpace", diskSpace.baseFreeSpace) + .detail("TotalFileSize", totalFileSize) + .detail("NumFiles", numFiles); } + virtual bool isAddressOnThisHost(NetworkAddress const& addr) { return addr.ip == getCurrentProcess()->address.ip; } - ACTOR static Future deleteFileImpl( Sim2* self, std::string filename, bool mustBeDurable ) { + ACTOR static Future deleteFileImpl(Sim2* self, std::string filename, bool mustBeDurable) { // This is a _rudimentary_ simulation of the untrustworthiness of non-durable deletes and the possibility of // rebooting during a durable one. It isn't perfect: for example, on real filesystems testing // for the existence of a non-durably deleted file BEFORE a reboot will show that it apparently doesn't exist. - if(g_simulator.getCurrentProcess()->machine->openFiles.count(filename)) { + if (g_simulator.getCurrentProcess()->machine->openFiles.count(filename)) { g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); g_simulator.getCurrentProcess()->machine->deletingFiles.insert(filename); } - if ( mustBeDurable || deterministicRandom()->random01() < 0.5 ) { + if (mustBeDurable || deterministicRandom()->random01() < 0.5) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); state TaskPriority currentTaskID = g_network->getCurrentTask(); - wait( g_simulator.onMachine( currentProcess ) ); + wait(g_simulator.onMachine(currentProcess)); try { - wait( ::delay(0.05 * deterministicRandom()->random01()) ); + wait(::delay(0.05 * deterministicRandom()->random01())); if (!currentProcess->rebooting) { auto f = IAsyncFileSystem::filesystem(self->net2)->deleteFile(filename, false); - ASSERT( f.isReady() ); - wait( ::delay(0.05 * deterministicRandom()->random01()) ); - TEST( true ); // Simulated durable delete + ASSERT(f.isReady()); + wait(::delay(0.05 * deterministicRandom()->random01())); + TEST(true); // Simulated durable delete } - wait( g_simulator.onProcess( currentProcess, currentTaskID ) ); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); return Void(); - } catch( Error &e ) { + } catch (Error& e) { state Error err = e; - wait( g_simulator.onProcess( currentProcess, currentTaskID ) ); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); throw err; } } else { - TEST( true ); // Simulated non-durable delete + TEST(true); // Simulated non-durable delete return Void(); } } - ACTOR static Future runLoop(Sim2 *self) { - state ISimulator::ProcessInfo *callingMachine = self->currentProcess; - while ( !self->isStopped ) { - wait( 
self->net2->yield(TaskPriority::DefaultYield) ); + ACTOR static Future runLoop(Sim2* self) { + state ISimulator::ProcessInfo* callingMachine = self->currentProcess; + while (!self->isStopped) { + wait(self->net2->yield(TaskPriority::DefaultYield)); self->mutex.enter(); - if( self->tasks.size() == 0 ) { + if (self->tasks.size() == 0) { self->mutex.leave(); ASSERT(false); } - //if (!randLog/* && now() >= 32.0*/) + // if (!randLog/* && now() >= 32.0*/) // randLog = fopen("randLog.txt", "wt"); - Task t = std::move( self->tasks.top() ); // Unfortunately still a copy under gcc where .top() returns const& + Task t = std::move(self->tasks.top()); // Unfortunately still a copy under gcc where .top() returns const& self->currentTaskID = t.taskID; self->tasks.pop(); self->mutex.leave(); @@ -999,61 +1113,68 @@ public: } self->currentProcess = callingMachine; self->net2->stop(); - for ( auto& fn : self->stopCallbacks ) { + for (auto& fn : self->stopCallbacks) { fn(); } return Void(); } - ACTOR Future _run(Sim2 *self) { + ACTOR Future _run(Sim2* self) { Future loopFuture = self->runLoop(self); self->net2->run(); - wait( loopFuture ); + wait(loopFuture); return Void(); } // Implement ISimulator interface - virtual void run() { - _run(this); - } - virtual ProcessInfo* newProcess(const char* name, IPAddress ip, uint16_t port, bool sslEnabled, uint16_t listenPerProcess, - LocalityData locality, ProcessClass startingClass, const char* dataFolder, + virtual void run() { _run(this); } + virtual ProcessInfo* newProcess(const char* name, + IPAddress ip, + uint16_t port, + bool sslEnabled, + uint16_t listenPerProcess, + LocalityData locality, + ProcessClass startingClass, + const char* dataFolder, const char* coordinationFolder) { - ASSERT( locality.machineId().present() ); - MachineInfo& machine = machines[ locality.machineId().get() ]; + ASSERT(locality.machineId().present()); + MachineInfo& machine = machines[locality.machineId().get()]; if (!machine.machineId.present()) machine.machineId = locality.machineId(); - for( int i = 0; i < machine.processes.size(); i++ ) { - if( machine.processes[i]->locality.machineId() != locality.machineId() ) { // SOMEDAY: compute ip from locality to avoid this check + for (int i = 0; i < machine.processes.size(); i++) { + if (machine.processes[i]->locality.machineId() != + locality.machineId()) { // SOMEDAY: compute ip from locality to avoid this check TraceEvent("Sim2Mismatch") .detail("IP", format("%s", ip.toString().c_str())) .detail("MachineId", locality.machineId()) .detail("NewName", name) .detail("ExistingMachineId", machine.processes[i]->locality.machineId()) .detail("ExistingName", machine.processes[i]->name); - ASSERT( false ); + ASSERT(false); } - ASSERT( machine.processes[i]->address.port != port ); + ASSERT(machine.processes[i]->address.port != port); } // This is for async operations on non-durable files. // These files must live on after process kills for sim purposes. 
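// The deleteFileImpl actor above models the untrustworthiness of non-durable deletes: a delete
// that must be durable, or that wins a 50% coin flip, is actually performed after short simulated
// I/O delays, while the remaining cases silently behave as a delete that could be lost across a
// reboot. A minimal standalone sketch of that decision, assuming a hypothetical doDelete callback
// in place of IAsyncFileSystem::deleteFile and a plain RNG in place of deterministicRandom():

#include <functional>
#include <random>

// Returns true when the delete is really applied (the "Simulated durable delete" path) and
// false when it is dropped (the "Simulated non-durable delete" path).
bool simulateDelete(bool mustBeDurable, std::mt19937_64& rng, const std::function<void()>& doDelete) {
    std::uniform_real_distribution<double> r01(0.0, 1.0);
    if (mustBeDurable || r01(rng) < 0.5) {
        doDelete();
        return true;
    }
    return false;
}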
- if( machine.machineProcess == 0 ) { + if (machine.machineProcess == 0) { NetworkAddress machineAddress(ip, 0, false, false); - machine.machineProcess = new ProcessInfo("Machine", locality, startingClass, {machineAddress}, this, "", ""); + machine.machineProcess = + new ProcessInfo("Machine", locality, startingClass, { machineAddress }, this, "", ""); machine.machineProcess->machine = &machine; } NetworkAddressList addresses; addresses.address = NetworkAddress(ip, port, true, sslEnabled); - if(listenPerProcess == 2) { - addresses.secondaryAddress = NetworkAddress(ip, port+1, true, false); + if (listenPerProcess == 2) { + addresses.secondaryAddress = NetworkAddress(ip, port + 1, true, false); } - ProcessInfo* m = new ProcessInfo(name, locality, startingClass, addresses, this, dataFolder, coordinationFolder); + ProcessInfo* m = + new ProcessInfo(name, locality, startingClass, addresses, this, dataFolder, coordinationFolder); for (int processPort = port; processPort < port + listenPerProcess; ++processPort) { NetworkAddress address(ip, processPort, true, sslEnabled && processPort == port); - m->listenerMap[address] = Reference( new Sim2Listener(m, address) ); + m->listenerMap[address] = Reference(new Sim2Listener(m, address)); addressMap[address] = m; } m->machine = &machine; @@ -1062,18 +1183,22 @@ public: m->excluded = g_simulator.isExcluded(addresses.address); m->cleared = g_simulator.isCleared(addresses.address); - m->setGlobal(enTDMetrics, (flowGlobalType) &m->tdmetrics); - m->setGlobal(enNetworkConnections, (flowGlobalType) m->network); + m->setGlobal(enTDMetrics, (flowGlobalType)&m->tdmetrics); + m->setGlobal(enNetworkConnections, (flowGlobalType)m->network); m->setGlobal(enASIOTimedOut, (flowGlobalType) false); - TraceEvent("NewMachine").detail("Name", name).detail("Address", m->address).detail("MachineId", m->locality.machineId()).detail("Excluded", m->excluded).detail("Cleared", m->cleared); + TraceEvent("NewMachine") + .detail("Name", name) + .detail("Address", m->address) + .detail("MachineId", m->locality.machineId()) + .detail("Excluded", m->excluded) + .detail("Cleared", m->cleared); // FIXME: Sometimes, connections to/from this process will explicitly close return m; } - virtual bool isAvailable() const - { + virtual bool isAvailable() const { std::vector processesLeft, processesDead; for (auto processInfo : getAllProcesses()) { if (processInfo->isAvailableClass()) { @@ -1087,9 +1212,8 @@ public: return canKillProcesses(processesLeft, processesDead, KillInstantly, NULL); } - virtual bool datacenterDead(Optional> dcId) const - { - if(!dcId.present()) { + virtual bool datacenterDead(Optional> dcId) const { + if (!dcId.present()) { return false; } @@ -1109,23 +1233,30 @@ public: } std::vector badCombo; - bool primaryTLogsDead = tLogWriteAntiQuorum ? !validateAllCombinations(badCombo, primaryProcessesDead, tLogPolicy, primaryLocalitiesLeft, tLogWriteAntiQuorum, false) : primaryProcessesDead.validate(tLogPolicy); - if(usableRegions > 1 && remoteTLogPolicy && !primaryTLogsDead) { + bool primaryTLogsDead = + tLogWriteAntiQuorum + ? 
!validateAllCombinations( + badCombo, primaryProcessesDead, tLogPolicy, primaryLocalitiesLeft, tLogWriteAntiQuorum, false) + : primaryProcessesDead.validate(tLogPolicy); + if (usableRegions > 1 && remoteTLogPolicy && !primaryTLogsDead) { primaryTLogsDead = primaryProcessesDead.validate(remoteTLogPolicy); } return primaryTLogsDead || primaryProcessesDead.validate(storagePolicy); } - // The following function will determine if the specified configuration of available and dead processes can allow the cluster to survive - virtual bool canKillProcesses(std::vector const& availableProcesses, std::vector const& deadProcesses, KillType kt, KillType* newKillType) const - { + // The following function will determine if the specified configuration of available and dead processes can allow + // the cluster to survive + virtual bool canKillProcesses(std::vector const& availableProcesses, + std::vector const& deadProcesses, + KillType kt, + KillType* newKillType) const { bool canSurvive = true; - int nQuorum = ((desiredCoordinators+1)/2)*2-1; + int nQuorum = ((desiredCoordinators + 1) / 2) * 2 - 1; KillType newKt = kt; - if ((kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk) || (kt == RebootAndDelete) || (kt == RebootProcessAndDelete)) - { + if ((kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk) || (kt == RebootAndDelete) || + (kt == RebootProcessAndDelete)) { LocalityGroup primaryProcessesLeft, primaryProcessesDead; LocalityGroup primarySatelliteProcessesLeft, primarySatelliteProcessesDead; LocalityGroup remoteProcessesLeft, remoteProcessesDead; @@ -1139,7 +1270,7 @@ public: std::vector badCombo; std::set>> uniqueMachines; - if(!primaryDcId.present()) { + if (!primaryDcId.present()) { for (auto processInfo : availableProcesses) { primaryProcessesLeft.add(processInfo->locality); primaryLocalitiesLeft.push_back(processInfo->locality); @@ -1152,31 +1283,39 @@ public: } else { for (auto processInfo : availableProcesses) { uniqueMachines.insert(processInfo->locality.zoneId()); - if(processInfo->locality.dcId() == primaryDcId) { + if (processInfo->locality.dcId() == primaryDcId) { primaryProcessesLeft.add(processInfo->locality); primaryLocalitiesLeft.push_back(processInfo->locality); - } else if(processInfo->locality.dcId() == remoteDcId) { + } else if (processInfo->locality.dcId() == remoteDcId) { remoteProcessesLeft.add(processInfo->locality); remoteLocalitiesLeft.push_back(processInfo->locality); - } else if(std::find(primarySatelliteDcIds.begin(), primarySatelliteDcIds.end(), processInfo->locality.dcId()) != primarySatelliteDcIds.end()) { + } else if (std::find(primarySatelliteDcIds.begin(), + primarySatelliteDcIds.end(), + processInfo->locality.dcId()) != primarySatelliteDcIds.end()) { primarySatelliteProcessesLeft.add(processInfo->locality); primarySatelliteLocalitiesLeft.push_back(processInfo->locality); - } else if(std::find(remoteSatelliteDcIds.begin(), remoteSatelliteDcIds.end(), processInfo->locality.dcId()) != remoteSatelliteDcIds.end()) { + } else if (std::find(remoteSatelliteDcIds.begin(), + remoteSatelliteDcIds.end(), + processInfo->locality.dcId()) != remoteSatelliteDcIds.end()) { remoteSatelliteProcessesLeft.add(processInfo->locality); remoteSatelliteLocalitiesLeft.push_back(processInfo->locality); } } for (auto processInfo : deadProcesses) { - if(processInfo->locality.dcId() == primaryDcId) { + if (processInfo->locality.dcId() == primaryDcId) { primaryProcessesDead.add(processInfo->locality); primaryLocalitiesDead.push_back(processInfo->locality); - } else 
if(processInfo->locality.dcId() == remoteDcId) { + } else if (processInfo->locality.dcId() == remoteDcId) { remoteProcessesDead.add(processInfo->locality); remoteLocalitiesDead.push_back(processInfo->locality); - } else if(std::find(primarySatelliteDcIds.begin(), primarySatelliteDcIds.end(), processInfo->locality.dcId()) != primarySatelliteDcIds.end()) { + } else if (std::find(primarySatelliteDcIds.begin(), + primarySatelliteDcIds.end(), + processInfo->locality.dcId()) != primarySatelliteDcIds.end()) { primarySatelliteProcessesDead.add(processInfo->locality); primarySatelliteLocalitiesDead.push_back(processInfo->locality); - } else if(std::find(remoteSatelliteDcIds.begin(), remoteSatelliteDcIds.end(), processInfo->locality.dcId()) != remoteSatelliteDcIds.end()) { + } else if (std::find(remoteSatelliteDcIds.begin(), + remoteSatelliteDcIds.end(), + processInfo->locality.dcId()) != remoteSatelliteDcIds.end()) { remoteSatelliteProcessesDead.add(processInfo->locality); remoteSatelliteLocalitiesDead.push_back(processInfo->locality); } @@ -1185,42 +1324,99 @@ public: bool tooManyDead = false; bool notEnoughLeft = false; - bool primaryTLogsDead = tLogWriteAntiQuorum ? !validateAllCombinations(badCombo, primaryProcessesDead, tLogPolicy, primaryLocalitiesLeft, tLogWriteAntiQuorum, false) : primaryProcessesDead.validate(tLogPolicy); - if(usableRegions > 1 && remoteTLogPolicy && !primaryTLogsDead) { + bool primaryTLogsDead = + tLogWriteAntiQuorum + ? !validateAllCombinations( + badCombo, primaryProcessesDead, tLogPolicy, primaryLocalitiesLeft, tLogWriteAntiQuorum, false) + : primaryProcessesDead.validate(tLogPolicy); + if (usableRegions > 1 && remoteTLogPolicy && !primaryTLogsDead) { primaryTLogsDead = primaryProcessesDead.validate(remoteTLogPolicy); } - if(!primaryDcId.present()) { + if (!primaryDcId.present()) { tooManyDead = primaryTLogsDead || primaryProcessesDead.validate(storagePolicy); - notEnoughLeft = !primaryProcessesLeft.validate(tLogPolicy) || !primaryProcessesLeft.validate(storagePolicy); + notEnoughLeft = + !primaryProcessesLeft.validate(tLogPolicy) || !primaryProcessesLeft.validate(storagePolicy); } else { - bool remoteTLogsDead = tLogWriteAntiQuorum ? !validateAllCombinations(badCombo, remoteProcessesDead, tLogPolicy, remoteLocalitiesLeft, tLogWriteAntiQuorum, false) : remoteProcessesDead.validate(tLogPolicy); - if(usableRegions > 1 && remoteTLogPolicy && !remoteTLogsDead) { + bool remoteTLogsDead = tLogWriteAntiQuorum ? 
!validateAllCombinations(badCombo, + remoteProcessesDead, + tLogPolicy, + remoteLocalitiesLeft, + tLogWriteAntiQuorum, + false) + : remoteProcessesDead.validate(tLogPolicy); + if (usableRegions > 1 && remoteTLogPolicy && !remoteTLogsDead) { remoteTLogsDead = remoteProcessesDead.validate(remoteTLogPolicy); } - if(!hasSatelliteReplication) { - if(usableRegions > 1) { - tooManyDead = primaryTLogsDead || remoteTLogsDead || ( primaryProcessesDead.validate(storagePolicy) && remoteProcessesDead.validate(storagePolicy) ); - notEnoughLeft = !primaryProcessesLeft.validate(tLogPolicy) || !primaryProcessesLeft.validate(remoteTLogPolicy) || !primaryProcessesLeft.validate(storagePolicy) || !remoteProcessesLeft.validate(tLogPolicy) || !remoteProcessesLeft.validate(remoteTLogPolicy) || !remoteProcessesLeft.validate(storagePolicy); + if (!hasSatelliteReplication) { + if (usableRegions > 1) { + tooManyDead = primaryTLogsDead || remoteTLogsDead || + (primaryProcessesDead.validate(storagePolicy) && + remoteProcessesDead.validate(storagePolicy)); + notEnoughLeft = !primaryProcessesLeft.validate(tLogPolicy) || + !primaryProcessesLeft.validate(remoteTLogPolicy) || + !primaryProcessesLeft.validate(storagePolicy) || + !remoteProcessesLeft.validate(tLogPolicy) || + !remoteProcessesLeft.validate(remoteTLogPolicy) || + !remoteProcessesLeft.validate(storagePolicy); } else { - tooManyDead = primaryTLogsDead || remoteTLogsDead || primaryProcessesDead.validate(storagePolicy) || remoteProcessesDead.validate(storagePolicy); - notEnoughLeft = !primaryProcessesLeft.validate(tLogPolicy) || !primaryProcessesLeft.validate(storagePolicy) || !remoteProcessesLeft.validate(tLogPolicy) || !remoteProcessesLeft.validate(storagePolicy); + tooManyDead = primaryTLogsDead || remoteTLogsDead || + primaryProcessesDead.validate(storagePolicy) || + remoteProcessesDead.validate(storagePolicy); + notEnoughLeft = !primaryProcessesLeft.validate(tLogPolicy) || + !primaryProcessesLeft.validate(storagePolicy) || + !remoteProcessesLeft.validate(tLogPolicy) || + !remoteProcessesLeft.validate(storagePolicy); } } else { - bool primarySatelliteTLogsDead = satelliteTLogWriteAntiQuorumFallback ? !validateAllCombinations(badCombo, primarySatelliteProcessesDead, satelliteTLogPolicyFallback, primarySatelliteLocalitiesLeft, satelliteTLogWriteAntiQuorumFallback, false) : primarySatelliteProcessesDead.validate(satelliteTLogPolicyFallback); - bool remoteSatelliteTLogsDead = satelliteTLogWriteAntiQuorumFallback ? !validateAllCombinations(badCombo, remoteSatelliteProcessesDead, satelliteTLogPolicyFallback, remoteSatelliteLocalitiesLeft, satelliteTLogWriteAntiQuorumFallback, false) : remoteSatelliteProcessesDead.validate(satelliteTLogPolicyFallback); + bool primarySatelliteTLogsDead = + satelliteTLogWriteAntiQuorumFallback + ? !validateAllCombinations(badCombo, + primarySatelliteProcessesDead, + satelliteTLogPolicyFallback, + primarySatelliteLocalitiesLeft, + satelliteTLogWriteAntiQuorumFallback, + false) + : primarySatelliteProcessesDead.validate(satelliteTLogPolicyFallback); + bool remoteSatelliteTLogsDead = + satelliteTLogWriteAntiQuorumFallback + ? 
!validateAllCombinations(badCombo, + remoteSatelliteProcessesDead, + satelliteTLogPolicyFallback, + remoteSatelliteLocalitiesLeft, + satelliteTLogWriteAntiQuorumFallback, + false) + : remoteSatelliteProcessesDead.validate(satelliteTLogPolicyFallback); - if(usableRegions > 1) { - notEnoughLeft = !primaryProcessesLeft.validate(tLogPolicy) || !primaryProcessesLeft.validate(remoteTLogPolicy) || !primaryProcessesLeft.validate(storagePolicy) || !primarySatelliteProcessesLeft.validate(satelliteTLogPolicy) || !remoteProcessesLeft.validate(tLogPolicy) || !remoteProcessesLeft.validate(remoteTLogPolicy) || !remoteProcessesLeft.validate(storagePolicy) || !remoteSatelliteProcessesLeft.validate(satelliteTLogPolicy); + if (usableRegions > 1) { + notEnoughLeft = !primaryProcessesLeft.validate(tLogPolicy) || + !primaryProcessesLeft.validate(remoteTLogPolicy) || + !primaryProcessesLeft.validate(storagePolicy) || + !primarySatelliteProcessesLeft.validate(satelliteTLogPolicy) || + !remoteProcessesLeft.validate(tLogPolicy) || + !remoteProcessesLeft.validate(remoteTLogPolicy) || + !remoteProcessesLeft.validate(storagePolicy) || + !remoteSatelliteProcessesLeft.validate(satelliteTLogPolicy); } else { - notEnoughLeft = !primaryProcessesLeft.validate(tLogPolicy) || !primaryProcessesLeft.validate(storagePolicy) || !primarySatelliteProcessesLeft.validate(satelliteTLogPolicy) || !remoteProcessesLeft.validate(tLogPolicy) || !remoteProcessesLeft.validate(storagePolicy) || !remoteSatelliteProcessesLeft.validate(satelliteTLogPolicy); + notEnoughLeft = !primaryProcessesLeft.validate(tLogPolicy) || + !primaryProcessesLeft.validate(storagePolicy) || + !primarySatelliteProcessesLeft.validate(satelliteTLogPolicy) || + !remoteProcessesLeft.validate(tLogPolicy) || + !remoteProcessesLeft.validate(storagePolicy) || + !remoteSatelliteProcessesLeft.validate(satelliteTLogPolicy); } - if(usableRegions > 1 && allowLogSetKills) { - tooManyDead = ( primaryTLogsDead && primarySatelliteTLogsDead ) || ( remoteTLogsDead && remoteSatelliteTLogsDead ) || ( primaryTLogsDead && remoteTLogsDead ) || ( primaryProcessesDead.validate(storagePolicy) && remoteProcessesDead.validate(storagePolicy) ); + if (usableRegions > 1 && allowLogSetKills) { + tooManyDead = (primaryTLogsDead && primarySatelliteTLogsDead) || + (remoteTLogsDead && remoteSatelliteTLogsDead) || + (primaryTLogsDead && remoteTLogsDead) || + (primaryProcessesDead.validate(storagePolicy) && + remoteProcessesDead.validate(storagePolicy)); } else { - tooManyDead = primaryTLogsDead || remoteTLogsDead || primaryProcessesDead.validate(storagePolicy) || remoteProcessesDead.validate(storagePolicy); + tooManyDead = primaryTLogsDead || remoteTLogsDead || + primaryProcessesDead.validate(storagePolicy) || + remoteProcessesDead.validate(storagePolicy); } } } @@ -1229,126 +1425,179 @@ public: if (tooManyDead) { newKt = Reboot; canSurvive = false; - TraceEvent("KillChanged").detail("KillType", kt).detail("NewKillType", newKt).detail("TLogPolicy", tLogPolicy->info()).detail("Reason", "tLogPolicy validates against dead processes."); + TraceEvent("KillChanged") + .detail("KillType", kt) + .detail("NewKillType", newKt) + .detail("TLogPolicy", tLogPolicy->info()) + .detail("Reason", "tLogPolicy validates against dead processes."); } // Reboot and Delete if remaining machines do NOT fulfill policies else if ((kt < RebootAndDelete) && notEnoughLeft) { newKt = RebootAndDelete; canSurvive = false; - TraceEvent("KillChanged").detail("KillType", kt).detail("NewKillType", newKt).detail("TLogPolicy", 
tLogPolicy->info()).detail("Reason", "tLogPolicy does not validates against remaining processes."); - } - else if ((kt < RebootAndDelete) && (nQuorum > uniqueMachines.size())) { + TraceEvent("KillChanged") + .detail("KillType", kt) + .detail("NewKillType", newKt) + .detail("TLogPolicy", tLogPolicy->info()) + .detail("Reason", "tLogPolicy does not validates against remaining processes."); + } else if ((kt < RebootAndDelete) && (nQuorum > uniqueMachines.size())) { newKt = RebootAndDelete; canSurvive = false; - TraceEvent("KillChanged").detail("KillType", kt).detail("NewKillType", newKt).detail("StoragePolicy", storagePolicy->info()).detail("Quorum", nQuorum).detail("Machines", uniqueMachines.size()).detail("Reason", "Not enough unique machines to perform auto configuration of coordinators."); - } - else { - TraceEvent("CanSurviveKills").detail("KillType", kt).detail("TLogPolicy", tLogPolicy->info()).detail("StoragePolicy", storagePolicy->info()).detail("Quorum", nQuorum).detail("Machines", uniqueMachines.size()); + TraceEvent("KillChanged") + .detail("KillType", kt) + .detail("NewKillType", newKt) + .detail("StoragePolicy", storagePolicy->info()) + .detail("Quorum", nQuorum) + .detail("Machines", uniqueMachines.size()) + .detail("Reason", "Not enough unique machines to perform auto configuration of coordinators."); + } else { + TraceEvent("CanSurviveKills") + .detail("KillType", kt) + .detail("TLogPolicy", tLogPolicy->info()) + .detail("StoragePolicy", storagePolicy->info()) + .detail("Quorum", nQuorum) + .detail("Machines", uniqueMachines.size()); } } - if (newKillType) *newKillType = newKt; + if (newKillType) + *newKillType = newKt; return canSurvive; } - virtual void destroyProcess( ISimulator::ProcessInfo *p ) { - TraceEvent("ProcessDestroyed").detail("Name", p->name).detail("Address", p->address).detail("MachineId", p->locality.machineId()); + virtual void destroyProcess(ISimulator::ProcessInfo* p) { + TraceEvent("ProcessDestroyed") + .detail("Name", p->name) + .detail("Address", p->address) + .detail("MachineId", p->locality.machineId()); currentlyRebootingProcesses.insert(std::pair(p->address, p)); - std::vector& processes = machines[ p->locality.machineId().get() ].processes; - if( p != processes.back() ) { - auto it = std::find( processes.begin(), processes.end(), p ); - std::swap( *it, processes.back() ); + std::vector& processes = machines[p->locality.machineId().get()].processes; + if (p != processes.back()) { + auto it = std::find(processes.begin(), processes.end(), p); + std::swap(*it, processes.back()); } processes.pop_back(); - killProcess_internal( p, KillInstantly ); + killProcess_internal(p, KillInstantly); } - void killProcess_internal( ProcessInfo* machine, KillType kt ) { - TEST( true ); // Simulated machine was killed with any kill type - TEST( kt == KillInstantly ); // Simulated machine was killed instantly - TEST( kt == InjectFaults ); // Simulated machine was killed with faults - TEST( kt == FailDisk ); // Simulated machine was killed with a failed disk + void killProcess_internal(ProcessInfo* machine, KillType kt) { + TEST(true); // Simulated machine was killed with any kill type + TEST(kt == KillInstantly); // Simulated machine was killed instantly + TEST(kt == InjectFaults); // Simulated machine was killed with faults + TEST(kt == FailDisk); // Simulated machine was killed with a failed disk if (kt == KillInstantly) { - TraceEvent(SevWarn, "FailMachine").detail("Name", machine->name).detail("Address", machine->address).detail("ZoneId", 
machine->locality.zoneId()).detail("Process", machine->toString()).detail("Rebooting", machine->rebooting).detail("Protected", protectedAddresses.count(machine->address)).backtrace(); + TraceEvent(SevWarn, "FailMachine") + .detail("Name", machine->name) + .detail("Address", machine->address) + .detail("ZoneId", machine->locality.zoneId()) + .detail("Process", machine->toString()) + .detail("Rebooting", machine->rebooting) + .detail("Protected", protectedAddresses.count(machine->address)) + .backtrace(); // This will remove all the "tracked" messages that came from the machine being killed latestEventCache.clear(); machine->failed = true; } else if (kt == InjectFaults) { - TraceEvent(SevWarn, "FaultMachine").detail("Name", machine->name).detail("Address", machine->address).detail("ZoneId", machine->locality.zoneId()).detail("Process", machine->toString()).detail("Rebooting", machine->rebooting).detail("Protected", protectedAddresses.count(machine->address)).backtrace(); + TraceEvent(SevWarn, "FaultMachine") + .detail("Name", machine->name) + .detail("Address", machine->address) + .detail("ZoneId", machine->locality.zoneId()) + .detail("Process", machine->toString()) + .detail("Rebooting", machine->rebooting) + .detail("Protected", protectedAddresses.count(machine->address)) + .backtrace(); should_inject_fault = simulator_should_inject_fault; machine->fault_injection_r = deterministicRandom()->randomUniqueID().first(); machine->fault_injection_p1 = 0.1; machine->fault_injection_p2 = deterministicRandom()->random01(); } else if (kt == FailDisk) { - TraceEvent(SevWarn, "FailDiskMachine").detail("Name", machine->name).detail("Address", machine->address).detail("ZoneId", machine->locality.zoneId()).detail("Process", machine->toString()).detail("Rebooting", machine->rebooting).detail("Protected", protectedAddresses.count(machine->address)).backtrace(); + TraceEvent(SevWarn, "FailDiskMachine") + .detail("Name", machine->name) + .detail("Address", machine->address) + .detail("ZoneId", machine->locality.zoneId()) + .detail("Process", machine->toString()) + .detail("Rebooting", machine->rebooting) + .detail("Protected", protectedAddresses.count(machine->address)) + .backtrace(); machine->failedDisk = true; } else { - ASSERT( false ); + ASSERT(false); } ASSERT(!protectedAddresses.count(machine->address) || machine->rebooting); } - virtual void rebootProcess( ProcessInfo* process, KillType kt ) { - if( kt == RebootProcessAndDelete && protectedAddresses.count(process->address) ) { - TraceEvent("RebootChanged").detail("ZoneId", process->locality.describeZone()).detail("KillType", RebootProcess).detail("OrigKillType", kt).detail("Reason", "Protected process"); + virtual void rebootProcess(ProcessInfo* process, KillType kt) { + if (kt == RebootProcessAndDelete && protectedAddresses.count(process->address)) { + TraceEvent("RebootChanged") + .detail("ZoneId", process->locality.describeZone()) + .detail("KillType", RebootProcess) + .detail("OrigKillType", kt) + .detail("Reason", "Protected process"); kt = RebootProcess; } - doReboot( process, kt ); + doReboot(process, kt); } - virtual void rebootProcess(Optional> zoneId, bool allProcesses ) { - if( allProcesses ) { + virtual void rebootProcess(Optional> zoneId, bool allProcesses) { + if (allProcesses) { auto processes = getAllProcesses(); - for( int i = 0; i < processes.size(); i++ ) - if( processes[i]->locality.zoneId() == zoneId && !processes[i]->rebooting ) - doReboot( processes[i], RebootProcess ); + for (int i = 0; i < processes.size(); i++) + if 
(processes[i]->locality.zoneId() == zoneId && !processes[i]->rebooting) + doReboot(processes[i], RebootProcess); } else { auto processes = getAllProcesses(); - for( int i = 0; i < processes.size(); i++ ) { - if( processes[i]->locality.zoneId() != zoneId || processes[i]->rebooting ) { + for (int i = 0; i < processes.size(); i++) { + if (processes[i]->locality.zoneId() != zoneId || processes[i]->rebooting) { swapAndPop(&processes, i--); } } - if( processes.size() ) - doReboot( deterministicRandom()->randomChoice( processes ), RebootProcess ); + if (processes.size()) + doReboot(deterministicRandom()->randomChoice(processes), RebootProcess); } } - virtual void killProcess( ProcessInfo* machine, KillType kt ) { + virtual void killProcess(ProcessInfo* machine, KillType kt) { TraceEvent("AttemptingKillProcess"); - if (kt < RebootAndDelete ) { - killProcess_internal( machine, kt ); + if (kt < RebootAndDelete) { + killProcess_internal(machine, kt); } } - virtual void killInterface( NetworkAddress address, KillType kt ) { - if (kt < RebootAndDelete ) { - std::vector& processes = machines[ addressMap[address]->locality.machineId() ].processes; - for( int i = 0; i < processes.size(); i++ ) - killProcess_internal( processes[i], kt ); + virtual void killInterface(NetworkAddress address, KillType kt) { + if (kt < RebootAndDelete) { + std::vector& processes = machines[addressMap[address]->locality.machineId()].processes; + for (int i = 0; i < processes.size(); i++) + killProcess_internal(processes[i], kt); } } virtual bool killZone(Optional> zoneId, KillType kt, bool forceKill, KillType* ktFinal) { auto processes = getAllProcesses(); std::set>> zoneMachines; for (auto& process : processes) { - if(process->locality.zoneId() == zoneId) { + if (process->locality.zoneId() == zoneId) { zoneMachines.insert(process->locality.machineId()); } } bool result = false; - for(auto& machineId : zoneMachines) { - if(killMachine(machineId, kt, forceKill, ktFinal)) { + for (auto& machineId : zoneMachines) { + if (killMachine(machineId, kt, forceKill, ktFinal)) { result = true; } } return result; } - virtual bool killMachine(Optional> machineId, KillType kt, bool forceKill, KillType* ktFinal) { + virtual bool killMachine(Optional> machineId, + KillType kt, + bool forceKill, + KillType* ktFinal) { auto ktOrig = kt; TEST(true); // Trying to killing a machine TEST(kt == KillInstantly); // Trying to kill instantly - TEST(kt == InjectFaults); // Trying to kill by injecting faults + TEST(kt == InjectFaults); // Trying to kill by injecting faults - if(speedUpSimulation && !forceKill) { - TraceEvent(SevWarn, "AbortedKill").detail("MachineId", machineId).detail("Reason", "Unforced kill within speedy simulation.").backtrace(); - if (ktFinal) *ktFinal = None; + if (speedUpSimulation && !forceKill) { + TraceEvent(SevWarn, "AbortedKill") + .detail("MachineId", machineId) + .detail("Reason", "Unforced kill within speedy simulation.") + .backtrace(); + if (ktFinal) + *ktFinal = None; return false; } @@ -1365,36 +1614,38 @@ public: // Do nothing, if no processes to kill if (processesOnMachine == 0) { - TraceEvent(SevWarn, "AbortedKill").detail("MachineId", machineId).detail("Reason", "The target had no processes running.").detail("Processes", processesOnMachine).detail("ProcessesPerMachine", processesPerMachine).backtrace(); - if (ktFinal) *ktFinal = None; + TraceEvent(SevWarn, "AbortedKill") + .detail("MachineId", machineId) + .detail("Reason", "The target had no processes running.") + .detail("Processes", processesOnMachine) + 
.detail("ProcessesPerMachine", processesPerMachine) + .backtrace(); + if (ktFinal) + *ktFinal = None; return false; } // Check if machine can be removed, if requested - if (!forceKill && ((kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk) || (kt == RebootAndDelete) || (kt == RebootProcessAndDelete))) - { + if (!forceKill && ((kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk) || + (kt == RebootAndDelete) || (kt == RebootProcessAndDelete))) { std::vector processesLeft, processesDead; - int protectedWorker = 0, unavailable = 0, excluded = 0, cleared = 0; + int protectedWorker = 0, unavailable = 0, excluded = 0, cleared = 0; for (auto processInfo : getAllProcesses()) { if (processInfo->isAvailableClass()) { if (processInfo->isExcluded()) { processesDead.push_back(processInfo); excluded++; - } - else if (processInfo->isCleared()) { + } else if (processInfo->isCleared()) { processesDead.push_back(processInfo); cleared++; - } - else if (!processInfo->isAvailable()) { + } else if (!processInfo->isAvailable()) { processesDead.push_back(processInfo); unavailable++; - } - else if (protectedAddresses.count(processInfo->address)) { + } else if (protectedAddresses.count(processInfo->address)) { processesLeft.push_back(processInfo); protectedWorker++; - } - else if (processInfo->locality.machineId() != machineId) { + } else if (processInfo->locality.machineId() != machineId) { processesLeft.push_back(processInfo); } else { processesDead.push_back(processInfo); @@ -1402,70 +1653,152 @@ public: } } if (!canKillProcesses(processesLeft, processesDead, kt, &kt)) { - TraceEvent("ChangedKillMachine").detail("MachineId", machineId).detail("KillType", kt).detail("OrigKillType", ktOrig).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("TotalProcesses", machines.size()).detail("ProcessesPerMachine", processesPerMachine).detail("Protected", protectedWorker).detail("Unavailable", unavailable).detail("Excluded", excluded).detail("Cleared", cleared).detail("ProtectedTotal", protectedAddresses.size()).detail("TLogPolicy", tLogPolicy->info()).detail("StoragePolicy", storagePolicy->info()); - } - else if ((kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk)) { - TraceEvent("DeadMachine").detail("MachineId", machineId).detail("KillType", kt).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("TotalProcesses", machines.size()).detail("ProcessesPerMachine", processesPerMachine).detail("TLogPolicy", tLogPolicy->info()).detail("StoragePolicy", storagePolicy->info()); + TraceEvent("ChangedKillMachine") + .detail("MachineId", machineId) + .detail("KillType", kt) + .detail("OrigKillType", ktOrig) + .detail("ProcessesLeft", processesLeft.size()) + .detail("ProcessesDead", processesDead.size()) + .detail("TotalProcesses", machines.size()) + .detail("ProcessesPerMachine", processesPerMachine) + .detail("Protected", protectedWorker) + .detail("Unavailable", unavailable) + .detail("Excluded", excluded) + .detail("Cleared", cleared) + .detail("ProtectedTotal", protectedAddresses.size()) + .detail("TLogPolicy", tLogPolicy->info()) + .detail("StoragePolicy", storagePolicy->info()); + } else if ((kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk)) { + TraceEvent("DeadMachine") + .detail("MachineId", machineId) + .detail("KillType", kt) + .detail("ProcessesLeft", processesLeft.size()) + .detail("ProcessesDead", processesDead.size()) + .detail("TotalProcesses", machines.size()) + 
.detail("ProcessesPerMachine", processesPerMachine) + .detail("TLogPolicy", tLogPolicy->info()) + .detail("StoragePolicy", storagePolicy->info()); for (auto process : processesLeft) { - TraceEvent("DeadMachineSurvivors").detail("MachineId", machineId).detail("KillType", kt).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("SurvivingProcess", process->toString()); + TraceEvent("DeadMachineSurvivors") + .detail("MachineId", machineId) + .detail("KillType", kt) + .detail("ProcessesLeft", processesLeft.size()) + .detail("ProcessesDead", processesDead.size()) + .detail("SurvivingProcess", process->toString()); } for (auto process : processesDead) { - TraceEvent("DeadMachineVictims").detail("MachineId", machineId).detail("KillType", kt).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("VictimProcess", process->toString()); + TraceEvent("DeadMachineVictims") + .detail("MachineId", machineId) + .detail("KillType", kt) + .detail("ProcessesLeft", processesLeft.size()) + .detail("ProcessesDead", processesDead.size()) + .detail("VictimProcess", process->toString()); } - } - else { - TraceEvent("ClearMachine").detail("MachineId", machineId).detail("KillType", kt).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("TotalProcesses", machines.size()).detail("ProcessesPerMachine", processesPerMachine).detail("TLogPolicy", tLogPolicy->info()).detail("StoragePolicy", storagePolicy->info()); + } else { + TraceEvent("ClearMachine") + .detail("MachineId", machineId) + .detail("KillType", kt) + .detail("ProcessesLeft", processesLeft.size()) + .detail("ProcessesDead", processesDead.size()) + .detail("TotalProcesses", machines.size()) + .detail("ProcessesPerMachine", processesPerMachine) + .detail("TLogPolicy", tLogPolicy->info()) + .detail("StoragePolicy", storagePolicy->info()); for (auto process : processesLeft) { - TraceEvent("ClearMachineSurvivors").detail("MachineId", machineId).detail("KillType", kt).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("SurvivingProcess", process->toString()); + TraceEvent("ClearMachineSurvivors") + .detail("MachineId", machineId) + .detail("KillType", kt) + .detail("ProcessesLeft", processesLeft.size()) + .detail("ProcessesDead", processesDead.size()) + .detail("SurvivingProcess", process->toString()); } for (auto process : processesDead) { - TraceEvent("ClearMachineVictims").detail("MachineId", machineId).detail("KillType", kt).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("VictimProcess", process->toString()); + TraceEvent("ClearMachineVictims") + .detail("MachineId", machineId) + .detail("KillType", kt) + .detail("ProcessesLeft", processesLeft.size()) + .detail("ProcessesDead", processesDead.size()) + .detail("VictimProcess", process->toString()); } } } - TEST(originalKt != kt); // Kill type was changed from requested to reboot. + TEST(originalKt != kt); // Kill type was changed from requested to reboot. 
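// canKillProcesses above decides whether the requested kill would still leave a viable cluster
// and downgrades it otherwise. A simplified standalone sketch of those downgrade rules, using
// hypothetical plain-bool inputs in place of the replication-policy validation and ignoring the
// guard on the original kill type: nQuorum is desiredCoordinators rounded down to an odd count
// (the size coordinator auto-configuration would use), and a kill that violates the policies or
// leaves too few unique machines for that quorum is weakened to a reboot, with or without data
// deletion.

enum class SimKillType { Reboot, RebootAndDelete, KillInstantly };

SimKillType downgradeKill(SimKillType requested,
                          bool tooManyDead,   // dead processes already violate the tLog/storage policies
                          bool notEnoughLeft, // surviving processes cannot satisfy the policies
                          int desiredCoordinators,
                          int uniqueMachines) {
    const int nQuorum = ((desiredCoordinators + 1) / 2) * 2 - 1;
    if (tooManyDead)
        return SimKillType::Reboot;          // corresponds to the "KillChanged" -> Reboot case
    if (notEnoughLeft || nQuorum > uniqueMachines)
        return SimKillType::RebootAndDelete; // corresponds to the "KillChanged" -> RebootAndDelete cases
    return requested;                        // "CanSurviveKills": the kill proceeds as requested
}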
// Check if any processes on machine are rebooting - if( processesOnMachine != processesPerMachine && kt >= RebootAndDelete ) { - TEST(true); //Attempted reboot, but the target did not have all of its processes running - TraceEvent(SevWarn, "AbortedKill").detail("KillType", kt).detail("MachineId", machineId).detail("Reason", "Machine processes does not match number of processes per machine").detail("Processes", processesOnMachine).detail("ProcessesPerMachine", processesPerMachine).backtrace(); - if (ktFinal) *ktFinal = None; + if (processesOnMachine != processesPerMachine && kt >= RebootAndDelete) { + TEST(true); // Attempted reboot, but the target did not have all of its processes running + TraceEvent(SevWarn, "AbortedKill") + .detail("KillType", kt) + .detail("MachineId", machineId) + .detail("Reason", "Machine processes does not match number of processes per machine") + .detail("Processes", processesOnMachine) + .detail("ProcessesPerMachine", processesPerMachine) + .backtrace(); + if (ktFinal) + *ktFinal = None; return false; } // Check if any processes on machine are rebooting - if ( processesOnMachine != processesPerMachine ) { - TEST(true); //Attempted reboot, but the target did not have all of its processes running - TraceEvent(SevWarn, "AbortedKill").detail("KillType", kt).detail("MachineId", machineId).detail("Reason", "Machine processes does not match number of processes per machine").detail("Processes", processesOnMachine).detail("ProcessesPerMachine", processesPerMachine).backtrace(); - if (ktFinal) *ktFinal = None; + if (processesOnMachine != processesPerMachine) { + TEST(true); // Attempted reboot, but the target did not have all of its processes running + TraceEvent(SevWarn, "AbortedKill") + .detail("KillType", kt) + .detail("MachineId", machineId) + .detail("Reason", "Machine processes does not match number of processes per machine") + .detail("Processes", processesOnMachine) + .detail("ProcessesPerMachine", processesPerMachine) + .backtrace(); + if (ktFinal) + *ktFinal = None; return false; } - TraceEvent("KillMachine").detail("MachineId", machineId).detail("Kt", kt).detail("KtOrig", ktOrig).detail("KillableMachines", processesOnMachine).detail("ProcessPerMachine", processesPerMachine).detail("KillChanged", kt!=ktOrig); - if ( kt < RebootAndDelete ) { - if((kt == InjectFaults || kt == FailDisk) && machines[machineId].machineProcess != nullptr) - killProcess_internal( machines[machineId].machineProcess, kt ); + TraceEvent("KillMachine") + .detail("MachineId", machineId) + .detail("Kt", kt) + .detail("KtOrig", ktOrig) + .detail("KillableMachines", processesOnMachine) + .detail("ProcessPerMachine", processesPerMachine) + .detail("KillChanged", kt != ktOrig); + if (kt < RebootAndDelete) { + if ((kt == InjectFaults || kt == FailDisk) && machines[machineId].machineProcess != nullptr) + killProcess_internal(machines[machineId].machineProcess, kt); for (auto& process : machines[machineId].processes) { - TraceEvent("KillMachineProcess").detail("KillType", kt).detail("Process", process->toString()).detail("StartingClass", process->startingClass.toString()).detail("Failed", process->failed).detail("Excluded", process->excluded).detail("Cleared", process->cleared).detail("Rebooting", process->rebooting); + TraceEvent("KillMachineProcess") + .detail("KillType", kt) + .detail("Process", process->toString()) + .detail("StartingClass", process->startingClass.toString()) + .detail("Failed", process->failed) + .detail("Excluded", process->excluded) + .detail("Cleared", process->cleared) + 
.detail("Rebooting", process->rebooting); if (process->startingClass != ProcessClass::TesterClass) - killProcess_internal( process, kt ); + killProcess_internal(process, kt); } - } - else if ( kt == Reboot || kt == RebootAndDelete ) { + } else if (kt == Reboot || kt == RebootAndDelete) { for (auto& process : machines[machineId].processes) { - TraceEvent("KillMachineProcess").detail("KillType", kt).detail("Process", process->toString()).detail("StartingClass", process->startingClass.toString()).detail("Failed", process->failed).detail("Excluded", process->excluded).detail("Cleared", process->cleared).detail("Rebooting", process->rebooting); + TraceEvent("KillMachineProcess") + .detail("KillType", kt) + .detail("Process", process->toString()) + .detail("StartingClass", process->startingClass.toString()) + .detail("Failed", process->failed) + .detail("Excluded", process->excluded) + .detail("Cleared", process->cleared) + .detail("Rebooting", process->rebooting); if (process->startingClass != ProcessClass::TesterClass) - doReboot(process, kt ); + doReboot(process, kt); } } TEST(kt == RebootAndDelete); // Resulted in a reboot and delete TEST(kt == Reboot); // Resulted in a reboot TEST(kt == KillInstantly); // Resulted in an instant kill - TEST(kt == InjectFaults); // Resulted in a kill by injecting faults + TEST(kt == InjectFaults); // Resulted in a kill by injecting faults - if (ktFinal) *ktFinal = kt; + if (ktFinal) + *ktFinal = kt; return true; } @@ -1473,7 +1806,7 @@ public: auto ktOrig = kt; auto processes = getAllProcesses(); std::map>, int> datacenterMachines; - int dcProcesses = 0; + int dcProcesses = 0; // Switch to a reboot, if anything protected on machine for (auto& procRecord : processes) { @@ -1483,23 +1816,33 @@ public: if (processDcId.present() && (processDcId == dcId)) { if ((kt != Reboot) && (protectedAddresses.count(procRecord->address))) { kt = Reboot; - TraceEvent(SevWarn, "DcKillChanged").detail("DataCenter", dcId).detail("KillType", kt).detail("OrigKillType", ktOrig) - .detail("Reason", "Datacenter has protected process").detail("ProcessAddress", procRecord->address).detail("Failed", procRecord->failed).detail("Rebooting", procRecord->rebooting).detail("Excluded", procRecord->excluded).detail("Cleared", procRecord->cleared).detail("Process", procRecord->toString()); + TraceEvent(SevWarn, "DcKillChanged") + .detail("DataCenter", dcId) + .detail("KillType", kt) + .detail("OrigKillType", ktOrig) + .detail("Reason", "Datacenter has protected process") + .detail("ProcessAddress", procRecord->address) + .detail("Failed", procRecord->failed) + .detail("Rebooting", procRecord->rebooting) + .detail("Excluded", procRecord->excluded) + .detail("Cleared", procRecord->cleared) + .detail("Process", procRecord->toString()); } - datacenterMachines[processMachineId.get()] ++; - dcProcesses ++; + datacenterMachines[processMachineId.get()]++; + dcProcesses++; } } // Check if machine can be removed, if requested - if (!forceKill && ((kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk) || (kt == RebootAndDelete) || (kt == RebootProcessAndDelete))) - { - std::vector processesLeft, processesDead; + if (!forceKill && ((kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk) || + (kt == RebootAndDelete) || (kt == RebootProcessAndDelete))) { + std::vector processesLeft, processesDead; for (auto processInfo : getAllProcesses()) { if (processInfo->isAvailableClass()) { if (processInfo->isExcluded() || processInfo->isCleared() || !processInfo->isAvailable()) { 
processesDead.push_back(processInfo); - } else if (protectedAddresses.count(processInfo->address) || datacenterMachines.find(processInfo->locality.machineId()) == datacenterMachines.end()) { + } else if (protectedAddresses.count(processInfo->address) || + datacenterMachines.find(processInfo->locality.machineId()) == datacenterMachines.end()) { processesLeft.push_back(processInfo); } else { processesDead.push_back(processInfo); @@ -1508,90 +1851,117 @@ public: } if (!canKillProcesses(processesLeft, processesDead, kt, &kt)) { - TraceEvent(SevWarn, "DcKillChanged").detail("DataCenter", dcId).detail("KillType", kt).detail("OrigKillType", ktOrig); - } - else { - TraceEvent("DeadDataCenter").detail("DataCenter", dcId).detail("KillType", kt).detail("DcZones", datacenterMachines.size()).detail("DcProcesses", dcProcesses).detail("ProcessesDead", processesDead.size()).detail("ProcessesLeft", processesLeft.size()).detail("TLogPolicy", tLogPolicy->info()).detail("StoragePolicy", storagePolicy->info()); + TraceEvent(SevWarn, "DcKillChanged") + .detail("DataCenter", dcId) + .detail("KillType", kt) + .detail("OrigKillType", ktOrig); + } else { + TraceEvent("DeadDataCenter") + .detail("DataCenter", dcId) + .detail("KillType", kt) + .detail("DcZones", datacenterMachines.size()) + .detail("DcProcesses", dcProcesses) + .detail("ProcessesDead", processesDead.size()) + .detail("ProcessesLeft", processesLeft.size()) + .detail("TLogPolicy", tLogPolicy->info()) + .detail("StoragePolicy", storagePolicy->info()); for (auto process : processesLeft) { - TraceEvent("DeadDcSurvivors").detail("MachineId", process->locality.machineId()).detail("KillType", kt).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("SurvivingProcess", process->toString()); + TraceEvent("DeadDcSurvivors") + .detail("MachineId", process->locality.machineId()) + .detail("KillType", kt) + .detail("ProcessesLeft", processesLeft.size()) + .detail("ProcessesDead", processesDead.size()) + .detail("SurvivingProcess", process->toString()); } for (auto process : processesDead) { - TraceEvent("DeadDcVictims").detail("MachineId", process->locality.machineId()).detail("KillType", kt).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("VictimProcess", process->toString()); + TraceEvent("DeadDcVictims") + .detail("MachineId", process->locality.machineId()) + .detail("KillType", kt) + .detail("ProcessesLeft", processesLeft.size()) + .detail("ProcessesDead", processesDead.size()) + .detail("VictimProcess", process->toString()); } } } - KillType ktResult, ktMin = kt; + KillType ktResult, ktMin = kt; for (auto& datacenterMachine : datacenterMachines) { - if(deterministicRandom()->random01() < 0.99) { + if (deterministicRandom()->random01() < 0.99) { killMachine(datacenterMachine.first, kt, true, &ktResult); if (ktResult != kt) { TraceEvent(SevWarn, "KillDCFail") - .detail("Zone", datacenterMachine.first) - .detail("KillType", kt) - .detail("KillTypeResult", ktResult) - .detail("KillTypeOrig", ktOrig); + .detail("Zone", datacenterMachine.first) + .detail("KillType", kt) + .detail("KillTypeResult", ktResult) + .detail("KillTypeOrig", ktOrig); ASSERT(ktResult == None); } - ktMin = std::min( ktResult, ktMin ); + ktMin = std::min(ktResult, ktMin); } } TraceEvent("KillDataCenter") - .detail("DcZones", datacenterMachines.size()) - .detail("DcProcesses", dcProcesses) - .detail("DCID", dcId) - .detail("KillType", kt) - .detail("KillTypeOrig", ktOrig) - .detail("KillTypeMin", 
ktMin) - .detail("KilledDC", kt==ktMin); + .detail("DcZones", datacenterMachines.size()) + .detail("DcProcesses", dcProcesses) + .detail("DCID", dcId) + .detail("KillType", kt) + .detail("KillTypeOrig", ktOrig) + .detail("KillTypeMin", ktMin) + .detail("KilledDC", kt == ktMin); TEST(kt != ktMin); // DataCenter kill was rejected by killMachine - TEST((kt==ktMin) && (kt == RebootAndDelete)); // Resulted in a reboot and delete - TEST((kt==ktMin) && (kt == Reboot)); // Resulted in a reboot - TEST((kt==ktMin) && (kt == KillInstantly)); // Resulted in an instant kill - TEST((kt==ktMin) && (kt == InjectFaults)); // Resulted in a kill by injecting faults - TEST((kt==ktMin) && (kt != ktOrig)); // Kill request was downgraded - TEST((kt==ktMin) && (kt == ktOrig)); // Requested kill was done + TEST((kt == ktMin) && (kt == RebootAndDelete)); // Resulted in a reboot and delete + TEST((kt == ktMin) && (kt == Reboot)); // Resulted in a reboot + TEST((kt == ktMin) && (kt == KillInstantly)); // Resulted in an instant kill + TEST((kt == ktMin) && (kt == InjectFaults)); // Resulted in a kill by injecting faults + TEST((kt == ktMin) && (kt != ktOrig)); // Kill request was downgraded + TEST((kt == ktMin) && (kt == ktOrig)); // Requested kill was done - if (ktFinal) *ktFinal = ktMin; + if (ktFinal) + *ktFinal = ktMin; return (kt == ktMin); } virtual void clogInterface(const IPAddress& ip, double seconds, ClogMode mode = ClogDefault) { if (mode == ClogDefault) { double a = deterministicRandom()->random01(); - if ( a < 0.3 ) mode = ClogSend; - else if (a < 0.6 ) mode = ClogReceive; - else mode = ClogAll; + if (a < 0.3) + mode = ClogSend; + else if (a < 0.6) + mode = ClogReceive; + else + mode = ClogAll; } TraceEvent("ClogInterface") .detail("IP", ip.toString()) .detail("Delay", seconds) - .detail("Queue", mode == ClogSend ? "Send" : mode == ClogReceive ? "Receive" : "All"); + .detail("Queue", + mode == ClogSend ? "Send" + : mode == ClogReceive ? 
"Receive" + : "All"); - if (mode == ClogSend || mode==ClogAll) - g_clogging.clogSendFor( ip, seconds ); - if (mode == ClogReceive || mode==ClogAll) - g_clogging.clogRecvFor( ip, seconds ); + if (mode == ClogSend || mode == ClogAll) + g_clogging.clogSendFor(ip, seconds); + if (mode == ClogReceive || mode == ClogAll) + g_clogging.clogRecvFor(ip, seconds); } virtual void clogPair(const IPAddress& from, const IPAddress& to, double seconds) { - g_clogging.clogPairFor( from, to, seconds ); + g_clogging.clogPairFor(from, to, seconds); } virtual std::vector getAllProcesses() const { std::vector processes; - for( auto& c : machines ) { - processes.insert( processes.end(), c.second.processes.begin(), c.second.processes.end() ); + for (auto& c : machines) { + processes.insert(processes.end(), c.second.processes.begin(), c.second.processes.end()); } - for( auto& c : currentlyRebootingProcesses ) { - processes.push_back( c.second ); + for (auto& c : currentlyRebootingProcesses) { + processes.push_back(c.second); } return processes; } - virtual ProcessInfo* getProcessByAddress( NetworkAddress const& address ) { + virtual ProcessInfo* getProcessByAddress(NetworkAddress const& address) { NetworkAddress normalizedAddress(address.ip, address.port, true, address.isTLS()); - ASSERT( addressMap.count( normalizedAddress ) ); - return addressMap[ normalizedAddress ]; + ASSERT(addressMap.count(normalizedAddress)); + return addressMap[normalizedAddress]; } virtual MachineInfo* getMachineByNetworkAddress(NetworkAddress const& address) { @@ -1602,22 +1972,30 @@ public: return &machines[machineId]; } - virtual void destroyMachine(Optional> const& machineId ) { + virtual void destroyMachine(Optional> const& machineId) { auto& machine = machines[machineId]; - for( auto process : machine.processes ) { - ASSERT( process->failed ); + for (auto process : machine.processes) { + ASSERT(process->failed); } - if( machine.machineProcess ) { - killProcess_internal( machine.machineProcess, KillInstantly ); + if (machine.machineProcess) { + killProcess_internal(machine.machineProcess, KillInstantly); } machines.erase(machineId); } - Sim2() : time(0.0), timerTime(0.0), taskCount(0), yielded(false), yield_limit(0), currentTaskID(TaskPriority::Zero) { + Sim2() + : time(0.0), timerTime(0.0), taskCount(0), yielded(false), yield_limit(0), currentTaskID(TaskPriority::Zero) { // Not letting currentProcess be NULL eliminates some annoying special cases - currentProcess = new ProcessInfo("NoMachine", LocalityData(Optional>(), StringRef(), StringRef(), StringRef()), ProcessClass(), {NetworkAddress()}, this, "", ""); + currentProcess = + new ProcessInfo("NoMachine", + LocalityData(Optional>(), StringRef(), StringRef(), StringRef()), + ProcessClass(), + { NetworkAddress() }, + this, + "", + ""); g_network = net2 = newNet2(TLSConfig(), false, true); - g_network->addStopCallback( Net2FileSystem::stop ); + g_network->addStopCallback(Net2FileSystem::stop); Net2FileSystem::newFileSystem(); check_yield(TaskPriority::Zero); } @@ -1629,16 +2007,38 @@ public: uint64_t stable; ProcessInfo* machine; Promise action; - Task( double time, TaskPriority taskID, uint64_t stable, ProcessInfo* machine, Promise&& action ) : time(time), taskID(taskID), stable(stable), machine(machine), action(std::move(action)) {} - Task( double time, TaskPriority taskID, uint64_t stable, ProcessInfo* machine, Future& future ) : time(time), taskID(taskID), stable(stable), machine(machine) { future = action.getFuture(); } - Task(Task&& rhs) BOOST_NOEXCEPT : time(rhs.time), 
taskID(rhs.taskID), stable(rhs.stable), machine(rhs.machine), action(std::move(rhs.action)) {}
-    void operator= ( Task const& rhs ) { taskID = rhs.taskID; time = rhs.time; stable = rhs.stable; machine = rhs.machine; action = rhs.action; }
-    Task( Task const& rhs ) : taskID(rhs.taskID), time(rhs.time), stable(rhs.stable), machine(rhs.machine), action(rhs.action) {}
-    void operator= (Task&& rhs) BOOST_NOEXCEPT { time = rhs.time; taskID = rhs.taskID; stable = rhs.stable; machine = rhs.machine; action = std::move(rhs.action); }
+    Task(double time, TaskPriority taskID, uint64_t stable, ProcessInfo* machine, Promise<Void>&& action)
+      : time(time), taskID(taskID), stable(stable), machine(machine), action(std::move(action)) {}
+    Task(double time, TaskPriority taskID, uint64_t stable, ProcessInfo* machine, Future<Void>& future)
+      : time(time), taskID(taskID), stable(stable), machine(machine) {
+        future = action.getFuture();
+    }
+    Task(Task&& rhs) BOOST_NOEXCEPT : time(rhs.time),
+                                      taskID(rhs.taskID),
+                                      stable(rhs.stable),
+                                      machine(rhs.machine),
+                                      action(std::move(rhs.action)) {}
+    void operator=(Task const& rhs) {
+        taskID = rhs.taskID;
+        time = rhs.time;
+        stable = rhs.stable;
+        machine = rhs.machine;
+        action = rhs.action;
+    }
+    Task(Task const& rhs)
+      : taskID(rhs.taskID), time(rhs.time), stable(rhs.stable), machine(rhs.machine), action(rhs.action) {}
+    void operator=(Task&& rhs) BOOST_NOEXCEPT {
+        time = rhs.time;
+        taskID = rhs.taskID;
+        stable = rhs.stable;
+        machine = rhs.machine;
+        action = std::move(rhs.action);
+    }
 
-    bool operator < (Task const& rhs) const {
+    bool operator<(Task const& rhs) const {
         // Ordering is reversed for priority_queue
-        if (time != rhs.time) return time > rhs.time;
+        if (time != rhs.time)
+            return time > rhs.time;
         return stable > rhs.stable;
     }
 };
 
@@ -1646,8 +2046,7 @@ public:
     void execTask(struct Task& t) {
         if (t.machine->failed) {
             t.action.send(Never());
-        }
-        else {
+        } else {
             mutex.enter();
             this->time = t.time;
             this->timerTime = std::max(this->timerTime, this->time);
@@ -1655,204 +2054,243 @@ public:
             this->currentProcess = t.machine;
             try {
-                //auto before = getCPUTicks();
+                // auto before = getCPUTicks();
                 t.action.send(Void());
-                ASSERT( this->currentProcess == t.machine );
+                ASSERT(this->currentProcess == t.machine);
                 /*auto elapsed = getCPUTicks() - before;
                 currentProcess->cpuTicks += elapsed;
                 if (deterministicRandom()->random01() < 0.01){
-                    TraceEvent("TaskDuration").detail("CpuTicks", currentProcess->cpuTicks);
-                    currentProcess->cpuTicks = 0;
+                    TraceEvent("TaskDuration").detail("CpuTicks", currentProcess->cpuTicks);
+                    currentProcess->cpuTicks = 0;
                 }*/
             } catch (Error& e) {
                 TraceEvent(SevError, "UnhandledSimulationEventError").error(e, true);
                 killProcess(t.machine, KillInstantly);
             }
 
-            //if( this->time > 45.522817 ) {
+            // if( this->time > 45.522817 ) {
             //    printf("foo\n");
             //}
 
             if (randLog)
-                fprintf( randLog, "T %f %d %s %" PRId64 "\n", this->time, int(deterministicRandom()->peek() % 10000), t.machine ? t.machine->name : "none", t.stable);
+                fprintf(randLog,
+                        "T %f %d %s %" PRId64 "\n",
+                        this->time,
+                        int(deterministicRandom()->peek() % 10000),
+                        t.machine ? t.machine->name : "none",
+                        t.stable);
         }
     }
 
-    virtual void onMainThread( Promise<Void>&& signal, TaskPriority taskID ) {
+    virtual void onMainThread(Promise<Void>&& signal, TaskPriority taskID) {
         // This is presumably coming from either a "fake" thread pool thread, i.e. it is actually on this thread
         // or a thread created with g_network->startThread
         ASSERT(getCurrentProcess());
 
         mutex.enter();
         ASSERT(taskID >= TaskPriority::Min && taskID <= TaskPriority::Max);
-        tasks.push( Task( time, taskID, taskCount++, getCurrentProcess(), std::move(signal) ) );
+        tasks.push(Task(time, taskID, taskCount++, getCurrentProcess(), std::move(signal)));
         mutex.leave();
     }
-    bool isOnMainThread() const override {
-        return net2->isOnMainThread();
+    bool isOnMainThread() const override { return net2->isOnMainThread(); }
+    virtual Future<Void> onProcess(ISimulator::ProcessInfo* process, TaskPriority taskID) {
+        return delay(0, taskID, process);
     }
-    virtual Future<Void> onProcess( ISimulator::ProcessInfo *process, TaskPriority taskID ) {
-        return delay( 0, taskID, process );
-    }
-    virtual Future<Void> onMachine( ISimulator::ProcessInfo *process, TaskPriority taskID ) {
-        if( process->machine == 0 )
+    virtual Future<Void> onMachine(ISimulator::ProcessInfo* process, TaskPriority taskID) {
+        if (process->machine == 0)
             return Void();
-        return delay( 0, taskID, process->machine->machineProcess );
+        return delay(0, taskID, process->machine->machineProcess);
    }
 
-    //time is guarded by ISimulator::mutex. It is not necessary to guard reads on the main thread because
-    //time should only be modified from the main thread.
+    // time is guarded by ISimulator::mutex. It is not necessary to guard reads on the main thread because
+    // time should only be modified from the main thread.
    double time;
    double timerTime;
    TaskPriority currentTaskID;
 
-    //taskCount is guarded by ISimulator::mutex
+    // taskCount is guarded by ISimulator::mutex
    uint64_t taskCount;
 
-    std::map<Optional<Standalone<StringRef>>, MachineInfo > machines;
+    std::map<Optional<Standalone<StringRef>>, MachineInfo> machines;
    std::map<NetworkAddress, ProcessInfo*> addressMap;
    std::map> filesDeadMap;
 
-    //tasks is guarded by ISimulator::mutex
+    // tasks is guarded by ISimulator::mutex
    std::priority_queue<Task, std::vector<Task>> tasks;
 
    std::vector> stopCallbacks;
 
-    //Sim2Net network;
-    INetwork *net2;
+    // Sim2Net network;
+    INetwork* net2;
 
-    //Map from machine IP -> machine disk space info
+    // Map from machine IP -> machine disk space info
    std::map diskSpaceMap;
 
-    //Whether or not yield has returned true during the current iteration of the run loop
+    // Whether or not yield has returned true during the current iteration of the run loop
    bool yielded;
-    int yield_limit; // how many more times yield may return false before next returning true
+    int yield_limit; // how many more times yield may return false before next returning true
 };
 
 void startNewSimulator() {
-    ASSERT( !g_network );
+    ASSERT(!g_network);
    g_network = g_pSimulator = new Sim2();
    g_simulator.connectionFailuresDisableDuration = deterministicRandom()->random01() < 0.5 ? 0 : 1e6;
 }
 
-ACTOR void doReboot( ISimulator::ProcessInfo *p, ISimulator::KillType kt ) {
-    TraceEvent("RebootingProcessAttempt").detail("ZoneId", p->locality.zoneId()).detail("KillType", kt).detail("Process", p->toString()).detail("StartingClass", p->startingClass.toString()).detail("Failed", p->failed).detail("Excluded", p->excluded).detail("Cleared", p->cleared).detail("Rebooting", p->rebooting).detail("TaskPriorityDefaultDelay", TaskPriority::DefaultDelay);
+ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
+    TraceEvent("RebootingProcessAttempt")
+        .detail("ZoneId", p->locality.zoneId())
+        .detail("KillType", kt)
+        .detail("Process", p->toString())
+        .detail("StartingClass", p->startingClass.toString())
+        .detail("Failed", p->failed)
+        .detail("Excluded", p->excluded)
+        .detail("Cleared", p->cleared)
+        .detail("Rebooting", p->rebooting)
+        .detail("TaskPriorityDefaultDelay", TaskPriority::DefaultDelay);
 
-    wait( g_sim2.delay( 0, TaskPriority::DefaultDelay, p ) ); // Switch to the machine in question
+    wait(g_sim2.delay(0, TaskPriority::DefaultDelay, p)); // Switch to the machine in question
 
    try {
-        ASSERT( kt == ISimulator::RebootProcess || kt == ISimulator::Reboot || kt == ISimulator::RebootAndDelete || kt == ISimulator::RebootProcessAndDelete );
+        ASSERT(kt == ISimulator::RebootProcess || kt == ISimulator::Reboot || kt == ISimulator::RebootAndDelete ||
+               kt == ISimulator::RebootProcessAndDelete);
 
-        TEST( kt == ISimulator::RebootProcess ); // Simulated process rebooted
-        TEST( kt == ISimulator::Reboot ); // Simulated machine rebooted
-        TEST( kt == ISimulator::RebootAndDelete ); // Simulated machine rebooted with data and coordination state deletion
-        TEST( kt == ISimulator::RebootProcessAndDelete ); // Simulated process rebooted with data and coordination state deletion
+        TEST(kt == ISimulator::RebootProcess); // Simulated process rebooted
+        TEST(kt == ISimulator::Reboot); // Simulated machine rebooted
+        TEST(kt == ISimulator::RebootAndDelete); // Simulated machine rebooted with data and coordination state deletion
+        TEST(
+            kt ==
+            ISimulator::RebootProcessAndDelete); // Simulated process rebooted with data and coordination state deletion
 
-        if( p->rebooting || !p->isReliable() )
+        if (p->rebooting || !p->isReliable())
            return;
-        TraceEvent("RebootingProcess").detail("KillType", kt).detail("Address", p->address).detail("ZoneId", p->locality.zoneId()).detail("DataHall", p->locality.dataHallId()).detail("Locality", p->locality.toString()).detail("Failed", p->failed).detail("Excluded", p->excluded).detail("Cleared", p->cleared).backtrace();
+        TraceEvent("RebootingProcess")
+            .detail("KillType", kt)
+            .detail("Address", p->address)
+            .detail("ZoneId", p->locality.zoneId())
+            .detail("DataHall", p->locality.dataHallId())
+            .detail("Locality", p->locality.toString())
+            .detail("Failed", p->failed)
+            .detail("Excluded", p->excluded)
+            .detail("Cleared", p->cleared)
+            .backtrace();
        p->rebooting = true;
        if ((kt == ISimulator::RebootAndDelete) || (kt == ISimulator::RebootProcessAndDelete)) {
            p->cleared = true;
            g_simulator.clearAddress(p->address);
        }
-        p->shutdownSignal.send( kt );
+        p->shutdownSignal.send(kt);
    } catch (Error& e) {
        TraceEvent(SevError, "RebootError").error(e);
-        p->shutdownSignal.sendError(e); // ?
+        p->shutdownSignal.sendError(e); // ?
        throw; // goes nowhere!
    }
 }
 
-//Simulates delays for performing operations on disk
-Future<Void> waitUntilDiskReady( Reference<DiskParameters> diskParameters, int64_t size, bool sync ) {
-    if(g_simulator.getCurrentProcess()->failedDisk) {
+// Simulates delays for performing operations on disk
+Future<Void> waitUntilDiskReady(Reference<DiskParameters> diskParameters, int64_t size, bool sync) {
+    if (g_simulator.getCurrentProcess()->failedDisk) {
        return Never();
    }
-    if(g_simulator.connectionFailuresDisableDuration > 1e4)
+    if (g_simulator.connectionFailuresDisableDuration > 1e4)
        return delay(0.0001);
 
-    if( diskParameters->nextOperation < now() ) diskParameters->nextOperation = now();
-    diskParameters->nextOperation += ( 1.0 / diskParameters->iops ) + ( size / diskParameters->bandwidth );
+    if (diskParameters->nextOperation < now())
+        diskParameters->nextOperation = now();
+    diskParameters->nextOperation += (1.0 / diskParameters->iops) + (size / diskParameters->bandwidth);
 
    double randomLatency;
-    if(sync) {
+    if (sync) {
        randomLatency = .005 + deterministicRandom()->random01() * (BUGGIFY ? 1.0 : .010);
    } else
        randomLatency = 10 * deterministicRandom()->random01() / diskParameters->iops;
 
-    return delayUntil( diskParameters->nextOperation + randomLatency );
+    return delayUntil(diskParameters->nextOperation + randomLatency);
 }
 
 #if defined(_WIN32)
 
 /* Opening with FILE_SHARE_DELETE lets simulation actually work on windows - previously renames were always failing.
-   FIXME: Use an actual platform abstraction for this stuff! Is there any reason we can't use underlying net2 for example? */
+   FIXME: Use an actual platform abstraction for this stuff! Is there any reason we can't use underlying net2 for
+   example? */
 #include 
-int sf_open( const char* filename, int flags, int convFlags, int mode ) {
-    HANDLE wh = CreateFile( filename, GENERIC_READ | ((flags&IAsyncFile::OPEN_READWRITE) ? GENERIC_WRITE : 0),
-        FILE_SHARE_READ|FILE_SHARE_WRITE|FILE_SHARE_DELETE, NULL,
-        (flags&IAsyncFile::OPEN_EXCLUSIVE) ? CREATE_NEW :
-            (flags&IAsyncFile::OPEN_CREATE) ? OPEN_ALWAYS :
-                OPEN_EXISTING,
-        FILE_ATTRIBUTE_NORMAL,
-        NULL );
+int sf_open(const char* filename, int flags, int convFlags, int mode) {
+    HANDLE wh = CreateFile(filename,
+                           GENERIC_READ | ((flags & IAsyncFile::OPEN_READWRITE) ? GENERIC_WRITE : 0),
+                           FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+                           NULL,
+                           (flags & IAsyncFile::OPEN_EXCLUSIVE) ? CREATE_NEW
+                           : (flags & IAsyncFile::OPEN_CREATE)  ? OPEN_ALWAYS
+                                                                : OPEN_EXISTING,
+                           FILE_ATTRIBUTE_NORMAL,
+                           NULL);
    int h = -1;
-    if (wh != INVALID_HANDLE_VALUE) h = _open_osfhandle( (intptr_t)wh, convFlags );
-    else errno = GetLastError() == ERROR_FILE_NOT_FOUND ? ENOENT : EFAULT;
+    if (wh != INVALID_HANDLE_VALUE)
+        h = _open_osfhandle((intptr_t)wh, convFlags);
+    else
+        errno = GetLastError() == ERROR_FILE_NOT_FOUND ? ENOENT : EFAULT;
    return h;
 }
 #endif
 
 // Opens a file for asynchronous I/O
-Future< Reference<IAsyncFile> > Sim2FileSystem::open( std::string filename, int64_t flags, int64_t mode )
-{
-    ASSERT( (flags & IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE) ||
-            !(flags & IAsyncFile::OPEN_CREATE) ||
-            StringRef(filename).endsWith(LiteralStringRef(".fdb-lock")) ); // We don't use "ordinary" non-atomic file creation right now except for folder locking, and we don't have code to simulate its unsafeness.
+Future<Reference<IAsyncFile>> Sim2FileSystem::open(std::string filename, int64_t flags, int64_t mode) {
+    ASSERT((flags & IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE) || !(flags & IAsyncFile::OPEN_CREATE) ||
+           StringRef(filename).endsWith(
+               LiteralStringRef(".fdb-lock"))); // We don't use "ordinary" non-atomic file creation right now except for
+                                                // folder locking, and we don't have code to simulate its unsafeness.
 
-    if ( (flags & IAsyncFile::OPEN_EXCLUSIVE) ) ASSERT( flags & IAsyncFile::OPEN_CREATE );
+    if ((flags & IAsyncFile::OPEN_EXCLUSIVE))
+        ASSERT(flags & IAsyncFile::OPEN_CREATE);
    if (flags & IAsyncFile::OPEN_UNCACHED) {
        auto& machineCache = g_simulator.getCurrentProcess()->machine->openFiles;
        std::string actualFilename = filename;
-        if ( machineCache.find(filename) == machineCache.end() ) {
-            if(flags & IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE) {
+        if (machineCache.find(filename) == machineCache.end()) {
+            if (flags & IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE) {
                actualFilename = filename + ".part";
                auto partFile = machineCache.find(actualFilename);
-                if(partFile != machineCache.end()) {
+                if (partFile != machineCache.end()) {
                    Future<Reference<IAsyncFile>> f = AsyncFileDetachable::open(partFile->second);
-                    if(FLOW_KNOBS->PAGE_WRITE_CHECKSUM_HISTORY > 0)
-                        f = map(f, [=](Reference<IAsyncFile> r) { return Reference<IAsyncFile>(new AsyncFileWriteChecker(r)); });
+                    if (FLOW_KNOBS->PAGE_WRITE_CHECKSUM_HISTORY > 0)
+                        f = map(f, [=](Reference<IAsyncFile> r) {
+                            return Reference<IAsyncFile>(new AsyncFileWriteChecker(r));
+                        });
                    return f;
                }
            }
-            //Simulated disk parameters are shared by the AsyncFileNonDurable and the underlying SimpleFile. This way, they can both keep up with the time to start the next operation
-            Reference<DiskParameters> diskParameters(new DiskParameters(FLOW_KNOBS->SIM_DISK_IOPS, FLOW_KNOBS->SIM_DISK_BANDWIDTH));
-            machineCache[actualFilename] = AsyncFileNonDurable::open(filename, actualFilename, SimpleFile::open(filename, flags, mode, diskParameters, false), diskParameters, (flags & IAsyncFile::OPEN_NO_AIO) == 0);
+            // Simulated disk parameters are shared by the AsyncFileNonDurable and the underlying SimpleFile. This way,
+            // they can both keep up with the time to start the next operation
+            Reference<DiskParameters> diskParameters(
+                new DiskParameters(FLOW_KNOBS->SIM_DISK_IOPS, FLOW_KNOBS->SIM_DISK_BANDWIDTH));
+            machineCache[actualFilename] =
+                AsyncFileNonDurable::open(filename,
+                                          actualFilename,
+                                          SimpleFile::open(filename, flags, mode, diskParameters, false),
+                                          diskParameters,
+                                          (flags & IAsyncFile::OPEN_NO_AIO) == 0);
        }
-        Future<Reference<IAsyncFile>> f = AsyncFileDetachable::open( machineCache[actualFilename] );
-        if(FLOW_KNOBS->PAGE_WRITE_CHECKSUM_HISTORY > 0)
+        Future<Reference<IAsyncFile>> f = AsyncFileDetachable::open(machineCache[actualFilename]);
+        if (FLOW_KNOBS->PAGE_WRITE_CHECKSUM_HISTORY > 0)
            f = map(f, [=](Reference<IAsyncFile> r) { return Reference<IAsyncFile>(new AsyncFileWriteChecker(r)); });
        return f;
-    }
-    else
+    } else
        return AsyncFileCached::open(filename, flags, mode);
 }
 
-// Deletes the given file. If mustBeDurable, returns only when the file is guaranteed to be deleted even after a power failure.
-Future< Void > Sim2FileSystem::deleteFile( std::string filename, bool mustBeDurable )
-{
+// Deletes the given file. If mustBeDurable, returns only when the file is guaranteed to be deleted even after a power
+// failure.
+Future<Void> Sim2FileSystem::deleteFile(std::string filename, bool mustBeDurable) {
    return Sim2::deleteFileImpl(&g_sim2, filename, mustBeDurable);
 }
 
 ACTOR Future<Void> renameFileImpl(std::string from, std::string to) {
-    wait(delay(0.5*deterministicRandom()->random01()));
+    wait(delay(0.5 * deterministicRandom()->random01()));
    ::renameFile(from, to);
-    wait(delay(0.5*deterministicRandom()->random01()));
+    wait(delay(0.5 * deterministicRandom()->random01()));
    return Void();
 }
 
@@ -1860,7 +2298,7 @@ Future<Void> Sim2FileSystem::renameFile(std::string const& from, std::string con
    return renameFileImpl(from, to);
 }
 
-Future< std::time_t > Sim2FileSystem::lastWriteTime( std::string filename ) {
+Future<std::time_t> Sim2FileSystem::lastWriteTime(std::string filename) {
    // TODO: update this map upon file writes.
    static std::map fileWrites;
    if (BUGGIFY && deterministicRandom()->random01() < 0.01) {
@@ -1869,7 +2307,6 @@ Future< std::time_t > Sim2FileSystem::lastWriteTime( std::string filename ) {
    return fileWrites[filename];
 }
 
-void Sim2FileSystem::newFileSystem()
-{
+void Sim2FileSystem::newFileSystem() {
    g_network->setGlobal(INetwork::enFileSystem, (flowGlobalType) new Sim2FileSystem());
 }
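
The waitUntilDiskReady() function reformatted above is the simulator's disk pacing model: each operation pushes the disk's next-free time forward by one IOP's worth of time plus the transfer time for the requested size, and the caller is delayed until that point plus a small random latency (larger for sync operations). The following is a minimal standalone sketch of that rule in plain C++, outside the FDB flow framework; the names DiskParams and diskOpCompletionTime are illustrative only (the real class is DiskParameters), and the BUGGIFY inflation of sync latency is omitted.

// Standalone sketch (not FDB flow code) of the pacing rule used by waitUntilDiskReady().
#include <cstdint>
#include <random>

struct DiskParams {
    double iops = 10000;      // simulated operations per second
    double bandwidth = 50e6;  // simulated bytes per second
    double nextOperation = 0; // simulated time at which the disk is next free
};

// Returns the simulated time at which an operation of `size` bytes issued at `now` completes,
// advancing the disk's nextOperation the same way waitUntilDiskReady() does.
double diskOpCompletionTime(DiskParams& disk, double now, int64_t size, bool sync, std::mt19937& rng) {
    if (disk.nextOperation < now)
        disk.nextOperation = now;
    disk.nextOperation += (1.0 / disk.iops) + (size / disk.bandwidth);

    std::uniform_real_distribution<double> u(0.0, 1.0);
    double randomLatency = sync ? 0.005 + u(rng) * 0.010 // sync operations pay a larger fixed cost
                                : 10 * u(rng) / disk.iops;
    return disk.nextOperation + randomLatency;
}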