From 57534cc876982f2e592d91702b3f7b2dbe202cce Mon Sep 17 00:00:00 2001 From: Will Wilson Date: Wed, 15 Jun 2022 11:52:29 -0400 Subject: [PATCH 001/216] Guard a few uses of gsimulator. --- fdbserver/storageserver.actor.cpp | 2 +- fdbserver/workloads/WriteDuringRead.actor.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index f44a2d6655..42e5858385 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -4308,7 +4308,7 @@ ACTOR Future getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe // Even if TSS mode is Disabled, this may be the second test in a restarting test where the first run // had it enabled. - state int byteLimit = (BUGGIFY && g_simulator.tssMode == ISimulator::TSSMode::Disabled && + state int byteLimit = (BUGGIFY && g_network->isSimulated() && g_simulator.tssMode == ISimulator::TSSMode::Disabled && !data->isTss() && !data->isSSWithTSSPair()) ? 1 : CLIENT_KNOBS->REPLY_BYTE_LIMIT; diff --git a/fdbserver/workloads/WriteDuringRead.actor.cpp b/fdbserver/workloads/WriteDuringRead.actor.cpp index a462877aba..0bd0a83e2c 100644 --- a/fdbserver/workloads/WriteDuringRead.actor.cpp +++ b/fdbserver/workloads/WriteDuringRead.actor.cpp @@ -87,7 +87,7 @@ struct WriteDuringReadWorkload : TestWorkload { TEST(adjacentKeys && (nodes + minNode) > CLIENT_KNOBS->KEY_SIZE_LIMIT); // WriteDuringReadWorkload testing large keys - useExtraDB = g_simulator.extraDB != nullptr; + useExtraDB = g_network->isSimulated() && g_simulator.extraDB != nullptr; if (useExtraDB) { auto extraFile = makeReference(*g_simulator.extraDB); extraDB = Database::createDatabase(extraFile, -1); From 09e0feb26d8674b2875e142f004e8f752ba15bc2 Mon Sep 17 00:00:00 2001 From: Will Wilson Date: Wed, 15 Jun 2022 12:07:19 -0400 Subject: [PATCH 002/216] Add stdexcept inclusion to TLS plugin. 
--- FDBLibTLS/FDBLibTLSPolicy.cpp | 1 + FDBLibTLS/FDBLibTLSVerify.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/FDBLibTLS/FDBLibTLSPolicy.cpp b/FDBLibTLS/FDBLibTLSPolicy.cpp index 6f81f91335..d97932659b 100644 --- a/FDBLibTLS/FDBLibTLSPolicy.cpp +++ b/FDBLibTLS/FDBLibTLSPolicy.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include diff --git a/FDBLibTLS/FDBLibTLSVerify.cpp b/FDBLibTLS/FDBLibTLSVerify.cpp index 216966f4c0..4aeea07c15 100644 --- a/FDBLibTLS/FDBLibTLSVerify.cpp +++ b/FDBLibTLS/FDBLibTLSVerify.cpp @@ -28,6 +28,7 @@ #include #include #include +#include static int hexValue(char c) { static char const digits[] = "0123456789ABCDEF"; From ba5c8bd86e467f60ab5e776fbce565c86278b02c Mon Sep 17 00:00:00 2001 From: Fuheng Zhao Date: Wed, 6 Jul 2022 16:59:21 -0700 Subject: [PATCH 003/216] start on RedwoodIO lauch limit --- fdbclient/ServerKnobs.cpp | 1 + fdbclient/ServerKnobs.h | 2 ++ fdbserver/VersionedBTree.actor.cpp | 29 +++++++++++++++++++++-------- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index a4dfbbdd6e..e84ac5c53e 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -859,6 +859,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( REDWOOD_HISTOGRAM_INTERVAL, 30.0 ); init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; } init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); } + init( REDWOOD_PRIORITY_LAUNCH_LIMITS, "32,32,32,32"); // Server request latency measurement init( LATENCY_SAMPLE_SIZE, 100000 ); diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index 2122a980ac..eb5b0c158a 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -826,6 +826,8 @@ public: bool REDWOOD_EVICT_UPDATED_PAGES; // Whether to 
prioritize eviction of updated pages from cache. int REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT; // Minimum height for which to keep and reuse page decode caches + std::string REDWOOD_PRIORITY_LAUNCH_LIMITS; + // Server request latency measurement int LATENCY_SAMPLE_SIZE; double LATENCY_METRICS_LOGGING_INTERVAL; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 6eb2ac0972..594b160916 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -143,8 +143,17 @@ private: #endif public: - PriorityMultiLock(int concurrency, int maxPriority, int launchLimit = std::numeric_limits::max()) - : concurrency(concurrency), available(concurrency), waiting(0), launchLimit(launchLimit) { + PriorityMultiLock(int concurrency, int maxPriority, std::string launchLimit) + : concurrency(concurrency), available(concurrency), waiting(0) { + this->launchLimit.resize(maxPriority + 1); + std::stringstream launchLimitStream(launchLimit); + size_t index = 0; + while (launchLimitStream.good()) { + std::string limit; + getline(launchLimitStream, limit, ','); + this->launchLimit[index++] = std::stoi(limit); + } + ASSERT(index == maxPriority + 1); waiters.resize(maxPriority + 1); fRunner = runner(this); } @@ -215,6 +224,7 @@ private: // Priority to try to run tasks from next state int priority = maxPriority; + state int ioLaunchLimit = self->launchLimit[priority]; state Queue* pQueue = &self->waiters[maxPriority]; // Track the number of waiters unlocked at the same priority in a row @@ -242,7 +252,7 @@ private: lastPriorityCount, self->toString().c_str()); - while (!pQueue->empty() && ++lastPriorityCount < self->launchLimit) { + while (!pQueue->empty() && ++lastPriorityCount < ioLaunchLimit) { Waiter w = pQueue->front(); pQueue->pop_front(); --self->waiting; @@ -280,7 +290,7 @@ private: } else { --priority; } - + ioLaunchLimit = self->launchLimit[priority]; pQueue = &self->waiters[priority]; lastPriorityCount = 0; } @@ -290,7 
+300,7 @@ private: int concurrency; int available; int waiting; - int launchLimit; + std::vector launchLimit; std::vector waiters; Deque> runners; Future fRunner; @@ -417,7 +427,7 @@ std::string toString(const std::pair& o) { constexpr static int ioMinPriority = 0; constexpr static int ioLeafPriority = 1; constexpr static int ioMaxPriority = 3; - +constexpr static int maxConcurrentReadsLaunchLimit = std::numeric_limits::max(); // A FIFO queue of T stored as a linked list of pages. // Main operations are pop(), pushBack(), pushFront(), and flush(). // @@ -2195,7 +2205,8 @@ public: bool memoryOnly, std::shared_ptr keyProvider, Promise errorPromise = {}) - : keyProvider(keyProvider), ioLock(FLOW_KNOBS->MAX_OUTSTANDING, ioMaxPriority, FLOW_KNOBS->MAX_OUTSTANDING / 2), + : keyProvider(keyProvider), + ioLock(FLOW_KNOBS->MAX_OUTSTANDING, ioMaxPriority, SERVER_KNOBS->REDWOOD_PRIORITY_LAUNCH_LIMITS), pageCacheBytes(pageCacheSizeBytes), desiredPageSize(desiredPageSize), desiredExtentSize(desiredExtentSize), filename(filename), memoryOnly(memoryOnly), errorPromise(errorPromise), remapCleanupWindowBytes(remapCleanupWindowBytes), concurrentExtentReads(new FlowLock(concurrentExtentReads)) { @@ -7676,7 +7687,9 @@ RedwoodRecordRef VersionedBTree::dbEnd(LiteralStringRef("\xff\xff\xff\xff\xff")) class KeyValueStoreRedwood : public IKeyValueStore { public: KeyValueStoreRedwood(std::string filename, UID logID) - : m_filename(filename), m_concurrentReads(SERVER_KNOBS->REDWOOD_KVSTORE_CONCURRENT_READS, 0), + : m_filename(filename), m_concurrentReads(SERVER_KNOBS->REDWOOD_KVSTORE_CONCURRENT_READS, + 0, + std::to_string(maxConcurrentReadsLaunchLimit)), prefetch(SERVER_KNOBS->REDWOOD_KVSTORE_RANGE_PREFETCH) { int pageSize = From 095541941808b0c6bbd97f2deba7900451f43c01 Mon Sep 17 00:00:00 2001 From: Fuheng Zhao Date: Mon, 11 Jul 2022 11:13:32 -0700 Subject: [PATCH 004/216] move ParsingStringVector function to genericactor class --- fdbclient/ServerKnobs.cpp | 1 + fdbclient/ServerKnobs.h | 
1 + fdbserver/VersionedBTree.actor.cpp | 19 +++++++------------ flow/genericactors.actor.h | 15 +++++++++++++++ 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index e84ac5c53e..acdce71125 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -859,6 +859,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( REDWOOD_HISTOGRAM_INTERVAL, 30.0 ); init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; } init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); } + init( REDWOOD_IO_MAX_PRIORITY, 3); init( REDWOOD_PRIORITY_LAUNCH_LIMITS, "32,32,32,32"); // Server request latency measurement diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index eb5b0c158a..2469a2450c 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -826,6 +826,7 @@ public: bool REDWOOD_EVICT_UPDATED_PAGES; // Whether to prioritize eviction of updated pages from cache. 
int REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT; // Minimum height for which to keep and reuse page decode caches + int REDWOOD_IO_MAX_PRIORITY; std::string REDWOOD_PRIORITY_LAUNCH_LIMITS; // Server request latency measurement diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 594b160916..ca30289788 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -145,15 +145,8 @@ private: public: PriorityMultiLock(int concurrency, int maxPriority, std::string launchLimit) : concurrency(concurrency), available(concurrency), waiting(0) { - this->launchLimit.resize(maxPriority + 1); - std::stringstream launchLimitStream(launchLimit); - size_t index = 0; - while (launchLimitStream.good()) { - std::string limit; - getline(launchLimitStream, limit, ','); - this->launchLimit[index++] = std::stoi(limit); - } - ASSERT(index == maxPriority + 1); + this->launchLimit = parseStringToVector(launchLimit, ','); + ASSERT(this->launchLimit.size() == maxPriority + 1); waiters.resize(maxPriority + 1); fRunner = runner(this); } @@ -426,7 +419,8 @@ std::string toString(const std::pair& o) { constexpr static int ioMinPriority = 0; constexpr static int ioLeafPriority = 1; -constexpr static int ioMaxPriority = 3; +// constexpr static int ioMaxPriority = 3; +static int ioMaxPriority = SERVER_KNOBS->REDWOOD_IO_MAX_PRIORITY; constexpr static int maxConcurrentReadsLaunchLimit = std::numeric_limits::max(); // A FIFO queue of T stored as a linked list of pages. // Main operations are pop(), pushBack(), pushFront(), and flush(). 
@@ -2205,8 +2199,9 @@ public: bool memoryOnly, std::shared_ptr keyProvider, Promise errorPromise = {}) - : keyProvider(keyProvider), - ioLock(FLOW_KNOBS->MAX_OUTSTANDING, ioMaxPriority, SERVER_KNOBS->REDWOOD_PRIORITY_LAUNCH_LIMITS), + : keyProvider(keyProvider), ioLock(FLOW_KNOBS->MAX_OUTSTANDING, + SERVER_KNOBS->REDWOOD_IO_MAX_PRIORITY, + SERVER_KNOBS->REDWOOD_PRIORITY_LAUNCH_LIMITS), pageCacheBytes(pageCacheSizeBytes), desiredPageSize(desiredPageSize), desiredExtentSize(desiredExtentSize), filename(filename), memoryOnly(memoryOnly), errorPromise(errorPromise), remapCleanupWindowBytes(remapCleanupWindowBytes), concurrentExtentReads(new FlowLock(concurrentExtentReads)) { diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 4955fb6212..bb700cee2a 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -100,6 +100,21 @@ T sorted(T range) { return range; } +template +std::vector parseStringToVector(std::string str, char delim) { + std::vector result; + std::stringstream stream(str); + std::string token; + while (stream.good()) { + getline(stream, token, delim); + std::istringstream tokenStream(token); + T item; + tokenStream >> item; + result.push_back(item); + } + return result; +} + template ErrorOr errorOr(T t) { return ErrorOr(t); From 39b37a80efe5527bb7689bf19ba15ed33845f919 Mon Sep 17 00:00:00 2001 From: Fuheng Zhao Date: Mon, 11 Jul 2022 16:10:38 -0700 Subject: [PATCH 005/216] remove comments and format --- fdbclient/ServerKnobs.cpp | 4 ++-- fdbclient/include/fdbclient/ServerKnobs.h | 2 +- fdbserver/VersionedBTree.actor.cpp | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 68f9a432ca..e627a6a3b3 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -873,8 +873,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( REDWOOD_HISTOGRAM_INTERVAL, 30.0 ); init( REDWOOD_EVICT_UPDATED_PAGES, 
true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; } init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); } - init( REDWOOD_IO_MAX_PRIORITY, 3); - init( REDWOOD_PRIORITY_LAUNCH_LIMITS, "32,32,32,32"); + init( REDWOOD_IO_MAX_PRIORITY, 3 ); + init( REDWOOD_PRIORITY_LAUNCHS, "32,32,32,32" ); // Server request latency measurement init( LATENCY_SAMPLE_SIZE, 100000 ); diff --git a/fdbclient/include/fdbclient/ServerKnobs.h b/fdbclient/include/fdbclient/ServerKnobs.h index daa6521a4a..393dc3bce3 100644 --- a/fdbclient/include/fdbclient/ServerKnobs.h +++ b/fdbclient/include/fdbclient/ServerKnobs.h @@ -851,7 +851,7 @@ public: int REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT; // Minimum height for which to keep and reuse page decode caches int REDWOOD_IO_MAX_PRIORITY; - std::string REDWOOD_PRIORITY_LAUNCH_LIMITS; + std::string REDWOOD_PRIORITY_LAUNCHS; // Server request latency measurement int LATENCY_SAMPLE_SIZE; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 0ca6a6ecb0..8224b19446 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -418,7 +418,6 @@ std::string toString(const std::pair& o) { constexpr static int ioMinPriority = 0; constexpr static int ioLeafPriority = 1; -// constexpr static int ioMaxPriority = 3; static int ioMaxPriority = SERVER_KNOBS->REDWOOD_IO_MAX_PRIORITY; constexpr static int maxConcurrentReadsLaunchLimit = std::numeric_limits::max(); // A FIFO queue of T stored as a linked list of pages. 
@@ -2200,7 +2199,7 @@ public: Promise errorPromise = {}) : keyProvider(keyProvider), ioLock(FLOW_KNOBS->MAX_OUTSTANDING, SERVER_KNOBS->REDWOOD_IO_MAX_PRIORITY, - SERVER_KNOBS->REDWOOD_PRIORITY_LAUNCH_LIMITS), + SERVER_KNOBS->REDWOOD_PRIORITY_LAUNCHS), pageCacheBytes(pageCacheSizeBytes), desiredPageSize(desiredPageSize), desiredExtentSize(desiredExtentSize), filename(filename), memoryOnly(memoryOnly), errorPromise(errorPromise), remapCleanupWindowBytes(remapCleanupWindowBytes), concurrentExtentReads(new FlowLock(concurrentExtentReads)) { From d77695b77fa1ea4378dc559e3ab80be70be23cb0 Mon Sep 17 00:00:00 2001 From: Fuheng Zhao Date: Tue, 12 Jul 2022 11:54:10 -0700 Subject: [PATCH 006/216] use explicit number for ioMaxPriority --- fdbclient/ServerKnobs.cpp | 2 +- fdbserver/VersionedBTree.actor.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index e627a6a3b3..e3c525a4c7 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -874,7 +874,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; } init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); } init( REDWOOD_IO_MAX_PRIORITY, 3 ); - init( REDWOOD_PRIORITY_LAUNCHS, "32,32,32,32" ); + init( REDWOOD_PRIORITY_LAUNCHS, "32,32,32,32" ); // Server request latency measurement init( LATENCY_SAMPLE_SIZE, 100000 ); diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 8224b19446..374de55518 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -418,7 +418,7 @@ std::string toString(const std::pair& o) { constexpr static int ioMinPriority = 0; constexpr static int ioLeafPriority = 1; -static int ioMaxPriority = 
SERVER_KNOBS->REDWOOD_IO_MAX_PRIORITY; +constexpr static int ioMaxPriority = 3; constexpr static int maxConcurrentReadsLaunchLimit = std::numeric_limits::max(); // A FIFO queue of T stored as a linked list of pages. // Main operations are pop(), pushBack(), pushFront(), and flush(). From 312e160a1248b461a2e10300a664bd6c3f4e3ca5 Mon Sep 17 00:00:00 2001 From: Fuheng Zhao Date: Thu, 14 Jul 2022 15:29:54 -0700 Subject: [PATCH 007/216] use PriorityMultiLock in storage server --- fdbclient/NativeAPI.actor.cpp | 5 +- fdbclient/ServerKnobs.cpp | 1 + fdbclient/include/fdbclient/FDBTypes.h | 8 + fdbclient/include/fdbclient/NativeAPI.actor.h | 1 + fdbclient/include/fdbclient/ServerKnobs.h | 1 + .../KeyValueStoreCompressTestData.actor.cpp | 11 +- fdbserver/KeyValueStoreMemory.actor.cpp | 9 +- fdbserver/KeyValueStoreRocksDB.actor.cpp | 28 +-- fdbserver/KeyValueStoreSQLite.actor.cpp | 18 +- .../KeyValueStoreShardedRocksDB.actor.cpp | 52 ++--- fdbserver/RocksDBCheckpointUtils.actor.cpp | 4 +- fdbserver/ServerCheckpoint.actor.cpp | 2 +- fdbserver/StorageCache.actor.cpp | 5 +- fdbserver/VersionedBTree.actor.cpp | 205 +----------------- fdbserver/include/fdbserver/IKeyValueStore.h | 8 - .../fdbserver/RemoteIKeyValueStore.actor.h | 6 +- fdbserver/storageserver.actor.cpp | 74 +++---- flow/include/flow/genericactors.actor.h | 175 +++++++++++++++ flow/include/flow/network.h | 2 +- 19 files changed, 279 insertions(+), 336 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 6aef2a5f01..9cda18bbd4 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -4264,7 +4264,7 @@ Future getRange(Reference trState, req.arena.dependsOn(mapper.arena()); setMatchIndex(req, matchIndex); req.tenantInfo = useTenant ? 
trState->getTenantInfo() : TenantInfo(); - req.isFetchKeys = (trState->taskID == TaskPriority::FetchKeys); + req.isFetchKeys = (trState->readType == ReadType::FETCH); req.version = readVersion; trState->cx->getLatestCommitVersions( @@ -4719,9 +4719,8 @@ ACTOR Future getRangeStreamFragment(Reference trState, req.spanContext = spanContext; req.limit = reverse ? -CLIENT_KNOBS->REPLY_BYTE_LIMIT : CLIENT_KNOBS->REPLY_BYTE_LIMIT; req.limitBytes = std::numeric_limits::max(); - // leaving the flag off for now to prevent data fetches stall under heavy load // it is used to inform the storage that the rangeRead is for Fetch - // req.isFetchKeys = (trState->taskID == TaskPriority::FetchKeys); + req.isFetchKeys = (trState->readType == ReadType::FETCH); trState->cx->getLatestCommitVersions( locations[shard].locations, req.version, trState, req.ssLatestCommitVersions); diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index e3c525a4c7..9eacc643c5 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -740,6 +740,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( MAX_PARALLEL_QUICK_GET_VALUE, 50 ); if ( randomize && BUGGIFY ) MAX_PARALLEL_QUICK_GET_VALUE = deterministicRandom()->randomInt(1, 100); init( QUICK_GET_KEY_VALUES_LIMIT, 2000 ); init( QUICK_GET_KEY_VALUES_LIMIT_BYTES, 1e7 ); + init( STORAGESERVER_READ_PRIORITIES, "32,8,12,32,48" ); //Wait Failure init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2; diff --git a/fdbclient/include/fdbclient/FDBTypes.h b/fdbclient/include/fdbclient/FDBTypes.h index 15cd4fc527..ef85a01794 100644 --- a/fdbclient/include/fdbclient/FDBTypes.h +++ b/fdbclient/include/fdbclient/FDBTypes.h @@ -1283,6 +1283,14 @@ struct WorkerBackupStatus { enum class TransactionPriority : uint8_t { BATCH, DEFAULT, IMMEDIATE, MIN = BATCH, MAX = IMMEDIATE }; +enum class ReadType { + EAGER = 0, + FETCH = 1, + LOW = 2, + 
NORMAL = 3, + HIGH = 4, +}; + const std::array allTransactionPriorities = { TransactionPriority::BATCH, TransactionPriority::DEFAULT, diff --git a/fdbclient/include/fdbclient/NativeAPI.actor.h b/fdbclient/include/fdbclient/NativeAPI.actor.h index 3abab222bb..7903b4481f 100644 --- a/fdbclient/include/fdbclient/NativeAPI.actor.h +++ b/fdbclient/include/fdbclient/NativeAPI.actor.h @@ -243,6 +243,7 @@ struct TransactionState : ReferenceCounted { Optional debugID; TaskPriority taskID; + ReadType readType = ReadType::NORMAL; SpanContext spanContext; UseProvisionalProxies useProvisionalProxies = UseProvisionalProxies::False; bool readVersionObtainedFromGrvProxy; diff --git a/fdbclient/include/fdbclient/ServerKnobs.h b/fdbclient/include/fdbclient/ServerKnobs.h index 393dc3bce3..75e0b5b810 100644 --- a/fdbclient/include/fdbclient/ServerKnobs.h +++ b/fdbclient/include/fdbclient/ServerKnobs.h @@ -698,6 +698,7 @@ public: int CHECKPOINT_TRANSFER_BLOCK_BYTES; int QUICK_GET_KEY_VALUES_LIMIT; int QUICK_GET_KEY_VALUES_LIMIT_BYTES; + std::string STORAGESERVER_READ_PRIORITIES; // Wait Failure int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS; diff --git a/fdbserver/KeyValueStoreCompressTestData.actor.cpp b/fdbserver/KeyValueStoreCompressTestData.actor.cpp index 3121232b58..77d3fb7a52 100644 --- a/fdbserver/KeyValueStoreCompressTestData.actor.cpp +++ b/fdbserver/KeyValueStoreCompressTestData.actor.cpp @@ -56,7 +56,7 @@ struct KeyValueStoreCompressTestData final : IKeyValueStore { void clear(KeyRangeRef range, const Arena* arena = nullptr) override { store->clear(range, arena); } Future commit(bool sequential = false) override { return store->commit(sequential); } - Future> readValue(KeyRef key, IKeyValueStore::ReadType, Optional debugID) override { + Future> readValue(KeyRef key, ReadType, Optional debugID) override { return doReadValue(store, key, debugID); } @@ -64,22 +64,19 @@ struct KeyValueStoreCompressTestData final : IKeyValueStore { // problem is still present if you are using this 
storage interface, but this storage interface is not used by // customers ever. However, if you want to try to test malicious atomic op workloads with compressed values for some // reason, you will need to fix this. - Future> readValuePrefix(KeyRef key, - int maxLength, - IKeyValueStore::ReadType, - Optional debugID) override { + Future> readValuePrefix(KeyRef key, int maxLength, ReadType, Optional debugID) override { return doReadValuePrefix(store, key, maxLength, debugID); } // If rowLimit>=0, reads first rows sorted ascending, otherwise reads last rows sorted descending // The total size of the returned value (less the last entry) will be less than byteLimit - Future readRange(KeyRangeRef keys, int rowLimit, int byteLimit, IKeyValueStore::ReadType) override { + Future readRange(KeyRangeRef keys, int rowLimit, int byteLimit, ReadType) override { return doReadRange(store, keys, rowLimit, byteLimit); } private: ACTOR static Future> doReadValue(IKeyValueStore* store, Key key, Optional debugID) { - Optional v = wait(store->readValue(key, IKeyValueStore::ReadType::NORMAL, debugID)); + Optional v = wait(store->readValue(key, ReadType::NORMAL, debugID)); if (!v.present()) return v; return unpack(v.get()); diff --git a/fdbserver/KeyValueStoreMemory.actor.cpp b/fdbserver/KeyValueStoreMemory.actor.cpp index c548891b85..0350ce5318 100644 --- a/fdbserver/KeyValueStoreMemory.actor.cpp +++ b/fdbserver/KeyValueStoreMemory.actor.cpp @@ -198,7 +198,7 @@ public: return c; } - Future> readValue(KeyRef key, IKeyValueStore::ReadType, Optional debugID) override { + Future> readValue(KeyRef key, ReadType, Optional debugID) override { if (recovering.isError()) throw recovering.getError(); if (!recovering.isReady()) @@ -210,10 +210,7 @@ public: return Optional(it.getValue()); } - Future> readValuePrefix(KeyRef key, - int maxLength, - IKeyValueStore::ReadType, - Optional debugID) override { + Future> readValuePrefix(KeyRef key, int maxLength, ReadType, Optional debugID) override { if 
(recovering.isError()) throw recovering.getError(); if (!recovering.isReady()) @@ -232,7 +229,7 @@ public: // If rowLimit>=0, reads first rows sorted ascending, otherwise reads last rows sorted descending // The total size of the returned value (less the last entry) will be less than byteLimit - Future readRange(KeyRangeRef keys, int rowLimit, int byteLimit, IKeyValueStore::ReadType) override { + Future readRange(KeyRangeRef keys, int rowLimit, int byteLimit, ReadType) override { if (recovering.isError()) throw recovering.getError(); if (!recovering.isReady()) diff --git a/fdbserver/KeyValueStoreRocksDB.actor.cpp b/fdbserver/KeyValueStoreRocksDB.actor.cpp index f06629f707..8d5aefc040 100644 --- a/fdbserver/KeyValueStoreRocksDB.actor.cpp +++ b/fdbserver/KeyValueStoreRocksDB.actor.cpp @@ -1804,8 +1804,8 @@ struct RocksDBKeyValueStore : IKeyValueStore { // We don't throttle eager reads and reads to the FF keyspace because FDB struggles when those reads fail. // Thus far, they have been low enough volume to not cause an issue. - static bool shouldThrottle(IKeyValueStore::ReadType type, KeyRef key) { - return type != IKeyValueStore::ReadType::EAGER && !(key.startsWith(systemKeys.begin)); + static bool shouldThrottle(ReadType type, KeyRef key) { + return type != ReadType::EAGER && !(key.startsWith(systemKeys.begin)); } ACTOR template @@ -1826,7 +1826,7 @@ struct RocksDBKeyValueStore : IKeyValueStore { return result; } - Future> readValue(KeyRef key, IKeyValueStore::ReadType type, Optional debugID) override { + Future> readValue(KeyRef key, ReadType type, Optional debugID) override { if (!shouldThrottle(type, key)) { auto a = new Reader::ReadValueAction(key, debugID); auto res = a->result.getFuture(); @@ -1834,18 +1834,15 @@ struct RocksDBKeyValueStore : IKeyValueStore { return res; } - auto& semaphore = (type == IKeyValueStore::ReadType::FETCH) ? fetchSemaphore : readSemaphore; - int maxWaiters = (type == IKeyValueStore::ReadType::FETCH) ? 
numFetchWaiters : numReadWaiters; + auto& semaphore = (type == ReadType::FETCH) ? fetchSemaphore : readSemaphore; + int maxWaiters = (type == ReadType::FETCH) ? numFetchWaiters : numReadWaiters; checkWaiters(semaphore, maxWaiters); auto a = std::make_unique(key, debugID); return read(a.release(), &semaphore, readThreads.getPtr(), &counters.failedToAcquire); } - Future> readValuePrefix(KeyRef key, - int maxLength, - IKeyValueStore::ReadType type, - Optional debugID) override { + Future> readValuePrefix(KeyRef key, int maxLength, ReadType type, Optional debugID) override { if (!shouldThrottle(type, key)) { auto a = new Reader::ReadValuePrefixAction(key, maxLength, debugID); auto res = a->result.getFuture(); @@ -1853,8 +1850,8 @@ struct RocksDBKeyValueStore : IKeyValueStore { return res; } - auto& semaphore = (type == IKeyValueStore::ReadType::FETCH) ? fetchSemaphore : readSemaphore; - int maxWaiters = (type == IKeyValueStore::ReadType::FETCH) ? numFetchWaiters : numReadWaiters; + auto& semaphore = (type == ReadType::FETCH) ? fetchSemaphore : readSemaphore; + int maxWaiters = (type == ReadType::FETCH) ? numFetchWaiters : numReadWaiters; checkWaiters(semaphore, maxWaiters); auto a = std::make_unique(key, maxLength, debugID); @@ -1881,10 +1878,7 @@ struct RocksDBKeyValueStore : IKeyValueStore { return result; } - Future readRange(KeyRangeRef keys, - int rowLimit, - int byteLimit, - IKeyValueStore::ReadType type) override { + Future readRange(KeyRangeRef keys, int rowLimit, int byteLimit, ReadType type) override { if (!shouldThrottle(type, keys.begin)) { auto a = new Reader::ReadRangeAction(keys, rowLimit, byteLimit); auto res = a->result.getFuture(); @@ -1892,8 +1886,8 @@ struct RocksDBKeyValueStore : IKeyValueStore { return res; } - auto& semaphore = (type == IKeyValueStore::ReadType::FETCH) ? fetchSemaphore : readSemaphore; - int maxWaiters = (type == IKeyValueStore::ReadType::FETCH) ? numFetchWaiters : numReadWaiters; + auto& semaphore = (type == ReadType::FETCH) ? 
fetchSemaphore : readSemaphore; + int maxWaiters = (type == ReadType::FETCH) ? numFetchWaiters : numReadWaiters; checkWaiters(semaphore, maxWaiters); auto a = std::make_unique(keys, rowLimit, byteLimit); diff --git a/fdbserver/KeyValueStoreSQLite.actor.cpp b/fdbserver/KeyValueStoreSQLite.actor.cpp index f65ce27199..5f27fbe726 100644 --- a/fdbserver/KeyValueStoreSQLite.actor.cpp +++ b/fdbserver/KeyValueStoreSQLite.actor.cpp @@ -1589,12 +1589,9 @@ public: void clear(KeyRangeRef range, const Arena* arena = nullptr) override; Future commit(bool sequential = false) override; - Future> readValue(KeyRef key, IKeyValueStore::ReadType, Optional debugID) override; - Future> readValuePrefix(KeyRef key, - int maxLength, - IKeyValueStore::ReadType, - Optional debugID) override; - Future readRange(KeyRangeRef keys, int rowLimit, int byteLimit, IKeyValueStore::ReadType) override; + Future> readValue(KeyRef key, ReadType, Optional debugID) override; + Future> readValuePrefix(KeyRef key, int maxLength, ReadType, Optional debugID) override; + Future readRange(KeyRangeRef keys, int rowLimit, int byteLimit, ReadType) override; KeyValueStoreSQLite(std::string const& filename, UID logID, @@ -2216,7 +2213,7 @@ Future KeyValueStoreSQLite::commit(bool sequential) { writeThread->post(p); return f; } -Future> KeyValueStoreSQLite::readValue(KeyRef key, IKeyValueStore::ReadType, Optional debugID) { +Future> KeyValueStoreSQLite::readValue(KeyRef key, ReadType, Optional debugID) { ++readsRequested; auto p = new Reader::ReadValueAction(key, debugID); auto f = p->result.getFuture(); @@ -2225,7 +2222,7 @@ Future> KeyValueStoreSQLite::readValue(KeyRef key, IKeyValueStor } Future> KeyValueStoreSQLite::readValuePrefix(KeyRef key, int maxLength, - IKeyValueStore::ReadType, + ReadType, Optional debugID) { ++readsRequested; auto p = new Reader::ReadValuePrefixAction(key, maxLength, debugID); @@ -2233,10 +2230,7 @@ Future> KeyValueStoreSQLite::readValuePrefix(KeyRef key, readThreads->post(p); return f; } 
-Future KeyValueStoreSQLite::readRange(KeyRangeRef keys, - int rowLimit, - int byteLimit, - IKeyValueStore::ReadType) { +Future KeyValueStoreSQLite::readRange(KeyRangeRef keys, int rowLimit, int byteLimit, ReadType) { ++readsRequested; auto p = new Reader::ReadRangeAction(keys, rowLimit, byteLimit); auto f = p->result.getFuture(); diff --git a/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp b/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp index 40531e0d8f..dadc60cf16 100644 --- a/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp +++ b/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp @@ -2235,8 +2235,8 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { // We don't throttle eager reads and reads to the FF keyspace because FDB struggles when those reads fail. // Thus far, they have been low enough volume to not cause an issue. - static bool shouldThrottle(IKeyValueStore::ReadType type, KeyRef key) { - return type != IKeyValueStore::ReadType::EAGER && !(key.startsWith(systemKeys.begin)); + static bool shouldThrottle(ReadType type, KeyRef key) { + return type != ReadType::EAGER && !(key.startsWith(systemKeys.begin)); } ACTOR template @@ -2257,7 +2257,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { return result; } - Future> readValue(KeyRef key, IKeyValueStore::ReadType type, Optional debugID) override { + Future> readValue(KeyRef key, ReadType type, Optional debugID) override { auto* shard = shardManager.getDataShard(key); if (shard == nullptr || !shard->physicalShard->initialized()) { // TODO: read non-exist system key range should not cause an error. @@ -2272,18 +2272,15 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { return res; } - auto& semaphore = (type == IKeyValueStore::ReadType::FETCH) ? fetchSemaphore : readSemaphore; - int maxWaiters = (type == IKeyValueStore::ReadType::FETCH) ? numFetchWaiters : numReadWaiters; + auto& semaphore = (type == ReadType::FETCH) ? 
fetchSemaphore : readSemaphore; + int maxWaiters = (type == ReadType::FETCH) ? numFetchWaiters : numReadWaiters; checkWaiters(semaphore, maxWaiters); auto a = std::make_unique(key, shard->physicalShard, debugID); return read(a.release(), &semaphore, readThreads.getPtr(), &counters.failedToAcquire); } - Future> readValuePrefix(KeyRef key, - int maxLength, - IKeyValueStore::ReadType type, - Optional debugID) override { + Future> readValuePrefix(KeyRef key, int maxLength, ReadType type, Optional debugID) override { auto* shard = shardManager.getDataShard(key); if (shard == nullptr || !shard->physicalShard->initialized()) { // TODO: read non-exist system key range should not cause an error. @@ -2300,8 +2297,8 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { return res; } - auto& semaphore = (type == IKeyValueStore::ReadType::FETCH) ? fetchSemaphore : readSemaphore; - int maxWaiters = (type == IKeyValueStore::ReadType::FETCH) ? numFetchWaiters : numReadWaiters; + auto& semaphore = (type == ReadType::FETCH) ? fetchSemaphore : readSemaphore; + int maxWaiters = (type == ReadType::FETCH) ? numFetchWaiters : numReadWaiters; checkWaiters(semaphore, maxWaiters); auto a = std::make_unique(key, maxLength, shard->physicalShard, debugID); @@ -2328,10 +2325,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { return result; } - Future readRange(KeyRangeRef keys, - int rowLimit, - int byteLimit, - IKeyValueStore::ReadType type) override { + Future readRange(KeyRangeRef keys, int rowLimit, int byteLimit, ReadType type) override { TraceEvent(SevVerbose, "ShardedRocksReadRangeBegin", this->id).detail("Range", keys); auto shards = shardManager.getDataShardsByRange(keys); @@ -2348,8 +2342,8 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { return res; } - auto& semaphore = (type == IKeyValueStore::ReadType::FETCH) ? fetchSemaphore : readSemaphore; - int maxWaiters = (type == IKeyValueStore::ReadType::FETCH) ? 
numFetchWaiters : numReadWaiters; + auto& semaphore = (type == ReadType::FETCH) ? fetchSemaphore : readSemaphore; + int maxWaiters = (type == ReadType::FETCH) ? numFetchWaiters : numReadWaiters; checkWaiters(semaphore, maxWaiters); auto a = std::make_unique(keys, shards, rowLimit, byteLimit); @@ -2515,24 +2509,21 @@ TEST_CASE("noSim/ShardedRocksDB/RangeOps") { // Range read // Read forward full range. - RangeResult result = - wait(kvStore->readRange(KeyRangeRef("0"_sr, ":"_sr), 1000, 10000, IKeyValueStore::ReadType::NORMAL)); + RangeResult result = wait(kvStore->readRange(KeyRangeRef("0"_sr, ":"_sr), 1000, 10000, ReadType::NORMAL)); ASSERT_EQ(result.size(), expectedRows.size()); for (int i = 0; i < expectedRows.size(); ++i) { ASSERT(result[i] == expectedRows[i]); } // Read backward full range. - RangeResult result = - wait(kvStore->readRange(KeyRangeRef("0"_sr, ":"_sr), -1000, 10000, IKeyValueStore::ReadType::NORMAL)); + RangeResult result = wait(kvStore->readRange(KeyRangeRef("0"_sr, ":"_sr), -1000, 10000, ReadType::NORMAL)); ASSERT_EQ(result.size(), expectedRows.size()); for (int i = 0; i < expectedRows.size(); ++i) { ASSERT(result[i] == expectedRows[59 - i]); } // Forward with row limit. - RangeResult result = - wait(kvStore->readRange(KeyRangeRef("2"_sr, "6"_sr), 10, 10000, IKeyValueStore::ReadType::NORMAL)); + RangeResult result = wait(kvStore->readRange(KeyRangeRef("2"_sr, "6"_sr), 10, 10000, ReadType::NORMAL)); ASSERT_EQ(result.size(), 10); for (int i = 0; i < 10; ++i) { ASSERT(result[i] == expectedRows[20 + i]); @@ -2558,16 +2549,14 @@ TEST_CASE("noSim/ShardedRocksDB/RangeOps") { wait(kvStore->init()); // Read all values. 
- RangeResult result = - wait(kvStore->readRange(KeyRangeRef("0"_sr, ":"_sr), 1000, 10000, IKeyValueStore::ReadType::NORMAL)); + RangeResult result = wait(kvStore->readRange(KeyRangeRef("0"_sr, ":"_sr), 1000, 10000, ReadType::NORMAL)); ASSERT_EQ(result.size(), expectedRows.size()); for (int i = 0; i < expectedRows.size(); ++i) { ASSERT(result[i] == expectedRows[i]); } // Read partial range with row limit - RangeResult result = - wait(kvStore->readRange(KeyRangeRef("5"_sr, ":"_sr), 35, 10000, IKeyValueStore::ReadType::NORMAL)); + RangeResult result = wait(kvStore->readRange(KeyRangeRef("5"_sr, ":"_sr), 35, 10000, ReadType::NORMAL)); ASSERT_EQ(result.size(), 35); for (int i = 0; i < result.size(); ++i) { ASSERT(result[i] == expectedRows[40 + i]); @@ -2577,8 +2566,7 @@ TEST_CASE("noSim/ShardedRocksDB/RangeOps") { kvStore->clear(KeyRangeRef("40"_sr, "45"_sr)); wait(kvStore->commit(false)); - RangeResult result = - wait(kvStore->readRange(KeyRangeRef("4"_sr, "5"_sr), 20, 10000, IKeyValueStore::ReadType::NORMAL)); + RangeResult result = wait(kvStore->readRange(KeyRangeRef("4"_sr, "5"_sr), 20, 10000, ReadType::NORMAL)); ASSERT_EQ(result.size(), 5); // Clear a single value. 
@@ -2598,12 +2586,10 @@ TEST_CASE("noSim/ShardedRocksDB/RangeOps") { kvStore = new ShardedRocksDBKeyValueStore(rocksDBTestDir, deterministicRandom()->randomUniqueID()); wait(kvStore->init()); - RangeResult result = - wait(kvStore->readRange(KeyRangeRef("1"_sr, "8"_sr), 1000, 10000, IKeyValueStore::ReadType::NORMAL)); + RangeResult result = wait(kvStore->readRange(KeyRangeRef("1"_sr, "8"_sr), 1000, 10000, ReadType::NORMAL)); ASSERT_EQ(result.size(), 0); - RangeResult result = - wait(kvStore->readRange(KeyRangeRef("0"_sr, ":"_sr), 1000, 10000, IKeyValueStore::ReadType::NORMAL)); + RangeResult result = wait(kvStore->readRange(KeyRangeRef("0"_sr, ":"_sr), 1000, 10000, ReadType::NORMAL)); ASSERT_EQ(result.size(), 19); Future closed = kvStore->onClosed(); diff --git a/fdbserver/RocksDBCheckpointUtils.actor.cpp b/fdbserver/RocksDBCheckpointUtils.actor.cpp index 9edc6923fd..4b6ea1797b 100644 --- a/fdbserver/RocksDBCheckpointUtils.actor.cpp +++ b/fdbserver/RocksDBCheckpointUtils.actor.cpp @@ -437,7 +437,7 @@ private: } ACTOR static Future doClose(RocksDBCFCheckpointReader* self) { - wait(delay(0, TaskPriority::FetchKeys)); + wait(delay(0, TaskPriority::CheckPoint)); delete self; return Void(); } @@ -791,7 +791,7 @@ ACTOR Future deleteRocksCheckpoint(CheckpointMetaData checkpoint) { TraceEvent(SevInfo, "DeleteCheckpointRemovedDir", checkpoint.checkpointID) .detail("CheckpointID", checkpoint.checkpointID) .detail("Dir", dir); - wait(delay(0, TaskPriority::FetchKeys)); + wait(delay(0, TaskPriority::CheckPoint)); } return Void(); diff --git a/fdbserver/ServerCheckpoint.actor.cpp b/fdbserver/ServerCheckpoint.actor.cpp index 91d0145fd8..cc74b3cb22 100644 --- a/fdbserver/ServerCheckpoint.actor.cpp +++ b/fdbserver/ServerCheckpoint.actor.cpp @@ -35,7 +35,7 @@ ICheckpointReader* newCheckpointReader(const CheckpointMetaData& checkpoint, UID } ACTOR Future deleteCheckpoint(CheckpointMetaData checkpoint) { - wait(delay(0, TaskPriority::FetchKeys)); + wait(delay(0, 
TaskPriority::CheckPoint)); state CheckpointFormat format = checkpoint.getFormat(); if (format == RocksDBColumnFamily || format == RocksDB) { wait(deleteRocksCheckpoint(checkpoint)); diff --git a/fdbserver/StorageCache.actor.cpp b/fdbserver/StorageCache.actor.cpp index 1be4f63698..5565b8492f 100644 --- a/fdbserver/StorageCache.actor.cpp +++ b/fdbserver/StorageCache.actor.cpp @@ -736,7 +736,7 @@ ACTOR Future getKeyValues(StorageCacheData* data, GetKeyValuesRequest req) // so we need to downgrade here TaskPriority taskType = TaskPriority::DefaultEndpoint; if (SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && req.isFetchKeys) { - taskType = TaskPriority::FetchKeys; + taskType = TaskPriority::LowPriorityRead; // } else if (false) { // // Placeholder for up-prioritizing fetches for important requests // taskType = TaskPriority::DefaultDelay; @@ -1192,7 +1192,8 @@ ACTOR Future tryFetchRange(Database cx, ASSERT(!cx->switchable); tr.setVersion(version); - tr.trState->taskID = TaskPriority::FetchKeys; + // tr.trState->taskID = TaskPriority::FetchKeys; + tr.trState->readType = ReadType::FETCH; limits.minRows = 0; try { diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 374de55518..0d10f8ff78 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -104,202 +104,6 @@ std::string addPrefix(std::string prefix, std::string lines) { #define PRIORITYMULTILOCK_DEBUG 0 -// A multi user lock with a concurrent holder limit where waiters are granted the lock according to -// an integer priority from 0 to maxPriority, inclusive, where higher integers are given priority. -// -// The interface is similar to FlowMutex except that lock holders can drop the lock to release it. 
-// -// Usage: -// Lock lock = wait(prioritylock.lock(priorityLevel)); -// lock.release(); // Explicit release, or -// // let lock and all copies of lock go out of scope to release -class PriorityMultiLock { - -public: - // Waiting on the lock returns a Lock, which is really just a Promise - // Calling release() is not necessary, it exists in case the Lock holder wants to explicitly release - // the Lock before it goes out of scope. - struct Lock { - void release() { promise.send(Void()); } - - // This is exposed in case the caller wants to use/copy it directly - Promise promise; - }; - -private: - struct Waiter { - Waiter() : queuedTime(now()) {} - Promise lockPromise; - double queuedTime; - }; - - typedef Deque Queue; - -#if PRIORITYMULTILOCK_DEBUG -#define prioritylock_printf(...) printf(__VA_ARGS__) -#else -#define prioritylock_printf(...) -#endif - -public: - PriorityMultiLock(int concurrency, int maxPriority, std::string launchLimit) - : concurrency(concurrency), available(concurrency), waiting(0) { - this->launchLimit = parseStringToVector(launchLimit, ','); - ASSERT(this->launchLimit.size() == maxPriority + 1); - waiters.resize(maxPriority + 1); - fRunner = runner(this); - } - - ~PriorityMultiLock() { prioritylock_printf("destruct"); } - - Future lock(int priority = 0) { - prioritylock_printf("lock begin %s\n", toString().c_str()); - - // This shortcut may enable a waiter to jump the line when the releaser loop yields - if (available > 0) { - --available; - Lock p; - addRunner(p); - prioritylock_printf("lock exit immediate %s\n", toString().c_str()); - return p; - } - - Waiter w; - waiters[priority].push_back(w); - ++waiting; - prioritylock_printf("lock exit queued %s\n", toString().c_str()); - return w.lockPromise.getFuture(); - } - - std::string toString() const { - int runnersDone = 0; - for (int i = 0; i < runners.size(); ++i) { - if (runners[i].isReady()) { - ++runnersDone; - } - } - - std::string s = - format("{ ptr=%p concurrency=%d available=%d 
running=%d waiting=%d runnersQueue=%d runnersDone=%d ", - this, - concurrency, - available, - concurrency - available, - waiting, - runners.size(), - runnersDone); - - for (int i = 0; i < waiters.size(); ++i) { - s += format("p%d_waiters=%u ", i, waiters[i].size()); - } - - s += "}"; - return s; - } - -private: - void addRunner(Lock& lock) { - runners.push_back(map(ready(lock.promise.getFuture()), [=](Void) { - prioritylock_printf("Lock released\n"); - ++available; - if (waiting > 0 || runners.size() > 100) { - release.trigger(); - } - return Void(); - })); - } - - ACTOR static Future runner(PriorityMultiLock* self) { - state int sinceYield = 0; - state Future error = self->brokenOnDestruct.getFuture(); - state int maxPriority = self->waiters.size() - 1; - - // Priority to try to run tasks from next - state int priority = maxPriority; - state int ioLaunchLimit = self->launchLimit[priority]; - state Queue* pQueue = &self->waiters[maxPriority]; - - // Track the number of waiters unlocked at the same priority in a row - state int lastPriorityCount = 0; - - loop { - // Cleanup finished runner futures at the front of the runner queue. 
- while (!self->runners.empty() && self->runners.front().isReady()) { - self->runners.pop_front(); - } - - // Wait for a runner to release its lock - wait(self->release.onTrigger()); - prioritylock_printf("runner wakeup %s\n", self->toString().c_str()); - - if (++sinceYield == 1000) { - sinceYield = 0; - wait(delay(0)); - } - - // While there are available slots and there are waiters, launch tasks - while (self->available > 0 && self->waiting > 0) { - prioritylock_printf("Checking priority=%d lastPriorityCount=%d %s\n", - priority, - lastPriorityCount, - self->toString().c_str()); - - while (!pQueue->empty() && ++lastPriorityCount < ioLaunchLimit) { - Waiter w = pQueue->front(); - pQueue->pop_front(); - --self->waiting; - Lock lock; - prioritylock_printf(" Running waiter priority=%d wait=%f %s\n", - priority, - now() - w.queuedTime, - self->toString().c_str()); - w.lockPromise.send(lock); - - // Self may have been destructed during the lock callback - if (error.isReady()) { - throw error.getError(); - } - - // If the lock was not already released, add it to the runners future queue - if (lock.promise.canBeSet()) { - self->addRunner(lock); - - // A slot has been consumed, so stop reading from this queue if there aren't any more - if (--self->available == 0) { - break; - } - } - } - - // If there are no more slots available, then don't move to the next priority - if (self->available == 0) { - break; - } - - // Decrease priority, wrapping around to max from 0 - if (priority == 0) { - priority = maxPriority; - } else { - --priority; - } - ioLaunchLimit = self->launchLimit[priority]; - pQueue = &self->waiters[priority]; - lastPriorityCount = 0; - } - } - } - - int concurrency; - int available; - int waiting; - std::vector launchLimit; - std::vector waiters; - Deque> runners; - Future fRunner; - AsyncTrigger release; - Promise brokenOnDestruct; -}; - // Some convenience functions for debugging to stringify various structures // Classes can add compatibility by either 
specializing toString or implementing // std::string toString() const; @@ -7799,7 +7603,7 @@ public: m_tree->set(keyValue); } - Future readRange(KeyRangeRef keys, int rowLimit, int byteLimit, IKeyValueStore::ReadType) override { + Future readRange(KeyRangeRef keys, int rowLimit, int byteLimit, ReadType) override { debug_printf("READRANGE %s\n", printable(keys).c_str()); return catchError(readRange_impl(this, keys, rowLimit, byteLimit)); } @@ -7968,14 +7772,11 @@ public: return Optional(); } - Future> readValue(KeyRef key, IKeyValueStore::ReadType, Optional debugID) override { + Future> readValue(KeyRef key, ReadType, Optional debugID) override { return catchError(readValue_impl(this, key, debugID)); } - Future> readValuePrefix(KeyRef key, - int maxLength, - IKeyValueStore::ReadType, - Optional debugID) override { + Future> readValuePrefix(KeyRef key, int maxLength, ReadType, Optional debugID) override { return catchError(map(readValue_impl(this, key, debugID), [maxLength](Optional v) { if (v.present() && v.get().size() > maxLength) { v.get().contents() = v.get().substr(0, maxLength); diff --git a/fdbserver/include/fdbserver/IKeyValueStore.h b/fdbserver/include/fdbserver/IKeyValueStore.h index 942ff37539..8f3ed8d0ee 100644 --- a/fdbserver/include/fdbserver/IKeyValueStore.h +++ b/fdbserver/include/fdbserver/IKeyValueStore.h @@ -67,14 +67,6 @@ public: virtual Future commit( bool sequential = false) = 0; // returns when prior sets and clears are (atomically) durable - enum class ReadType { - EAGER, - FETCH, - LOW, - NORMAL, - HIGH, - }; - virtual Future> readValue(KeyRef key, ReadType type = ReadType::NORMAL, Optional debugID = Optional()) = 0; diff --git a/fdbserver/include/fdbserver/RemoteIKeyValueStore.actor.h b/fdbserver/include/fdbserver/RemoteIKeyValueStore.actor.h index 0465e4afbb..6aa2c40c83 100644 --- a/fdbserver/include/fdbserver/RemoteIKeyValueStore.actor.h +++ b/fdbserver/include/fdbserver/RemoteIKeyValueStore.actor.h @@ -155,7 +155,7 @@ struct 
OpenKVStoreRequest { struct IKVSGetValueRequest { constexpr static FileIdentifier file_identifier = 1029439; KeyRef key; - IKeyValueStore::ReadType type; + ReadType type; Optional debugID = Optional(); ReplyPromise> reply; @@ -202,7 +202,7 @@ struct IKVSReadValuePrefixRequest { constexpr static FileIdentifier file_identifier = 1928374; KeyRef key; int maxLength; - IKeyValueStore::ReadType type; + ReadType type; Optional debugID = Optional(); ReplyPromise> reply; @@ -246,7 +246,7 @@ struct IKVSReadRangeRequest { KeyRangeRef keys; int rowLimit; int byteLimit; - IKeyValueStore::ReadType type; + ReadType type; ReplyPromise reply; template diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 10125b09f9..346d791bb3 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -278,19 +278,19 @@ struct StorageServerDisk { // - "a", if key "a" exist // - "b", if key "a" doesn't exist, and "b" is the next existing key in total order // - allKeys.end, if keyrange [a, allKeys.end) is empty - Future readNextKeyInclusive(KeyRef key, IKeyValueStore::ReadType type = IKeyValueStore::ReadType::NORMAL) { + Future readNextKeyInclusive(KeyRef key, ReadType type = ReadType::NORMAL) { ++(*kvScans); return readFirstKey(storage, KeyRangeRef(key, allKeys.end), type); } Future> readValue(KeyRef key, - IKeyValueStore::ReadType type = IKeyValueStore::ReadType::NORMAL, + ReadType type = ReadType::NORMAL, Optional debugID = Optional()) { ++(*kvGets); return storage->readValue(key, type, debugID); } Future> readValuePrefix(KeyRef key, int maxLength, - IKeyValueStore::ReadType type = IKeyValueStore::ReadType::NORMAL, + ReadType type = ReadType::NORMAL, Optional debugID = Optional()) { ++(*kvGets); return storage->readValuePrefix(key, maxLength, type, debugID); @@ -298,7 +298,7 @@ struct StorageServerDisk { Future readRange(KeyRangeRef keys, int rowLimit = 1 << 30, int byteLimit = 1 << 30, - IKeyValueStore::ReadType type = 
IKeyValueStore::ReadType::NORMAL) { + ReadType type = ReadType::NORMAL) { ++(*kvScans); return storage->readRange(keys, rowLimit, byteLimit, type); } @@ -327,7 +327,7 @@ private: IKeyValueStore* storage; void writeMutations(const VectorRef& mutations, Version debugVersion, const char* debugContext); - ACTOR static Future readFirstKey(IKeyValueStore* storage, KeyRangeRef range, IKeyValueStore::ReadType type) { + ACTOR static Future readFirstKey(IKeyValueStore* storage, KeyRangeRef range, ReadType type) { RangeResult r = wait(storage->readRange(range, 1, 1 << 30, type)); if (r.size()) return r[0].key; @@ -852,6 +852,8 @@ public: FlowLock serveFetchCheckpointParallelismLock; + PriorityMultiLock reqSSLock; + int64_t instanceID; Promise otherError; @@ -1059,6 +1061,7 @@ public: fetchChangeFeedParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM), fetchKeysBytesBudget(SERVER_KNOBS->STORAGE_FETCH_BYTES), fetchKeysBudgetUsed(false), serveFetchCheckpointParallelismLock(SERVER_KNOBS->SERVE_FETCH_CHECKPOINT_PARALLELISM), + reqSSLock(1, (int)ReadType::HIGH, SERVER_KNOBS->STORAGESERVER_READ_PRIORITIES), instanceID(deterministicRandom()->randomUniqueID().first()), shuttingDown(false), behind(false), versionBehind(false), debug_inApplyUpdate(false), debug_lastValidateTime(0), lastBytesInputEBrake(0), lastDurableVersionEBrake(0), maxQueryQueue(0), transactionTagCounter(ssi.id()), counters(this), @@ -1596,7 +1599,7 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { path = 1; } else if (!i || !i->isClearTo() || i->getEndKey() <= req.key) { path = 2; - Optional vv = wait(data->storage.readValue(req.key, IKeyValueStore::ReadType::NORMAL, req.debugID)); + Optional vv = wait(data->storage.readValue(req.key, ReadType::NORMAL, req.debugID)); data->counters.kvGetBytes += vv.expectedSize(); // Validate that while we were reading the data we didn't lose the version or shard if (version < data->storageVersion()) { @@ -2999,7 +3002,7 @@ ACTOR Future readRange(StorageServer* 
data, int limit, int* pLimitBytes, SpanContext parentSpan, - IKeyValueStore::ReadType type, + ReadType type, Optional tenantPrefix) { state GetKeyValuesReply result; state StorageServer::VersionedData::ViewAtVersion view = data->data().at(version); @@ -3238,7 +3241,7 @@ ACTOR Future findKey(StorageServer* data, KeyRange range, int* pOffset, SpanContext parentSpan, - IKeyValueStore::ReadType type) + ReadType type) // Attempts to find the key indicated by sel in the data at version, within range. // Precondition: selectorInRange(sel, range) // If it is found, offset is set to 0 and a key is returned which falls inside range. @@ -3358,8 +3361,9 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) { state Span span("SS:getKeyValues"_loc, req.spanContext); state int64_t resultSize = 0; - state IKeyValueStore::ReadType type = - req.isFetchKeys ? IKeyValueStore::ReadType::FETCH : IKeyValueStore::ReadType::NORMAL; + state ReadType type = req.isFetchKeys ? ReadType::FETCH : ReadType::NORMAL; + state int readPriority = + (req.isFetchKeys && SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY) ? 
(int)ReadType::FETCH : (int)ReadType::NORMAL; if (req.tenantInfo.name.present()) { span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); @@ -3375,11 +3379,8 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - if (SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && req.isFetchKeys) { - wait(delay(0, TaskPriority::FetchKeys)); - } else { - wait(data->getQueryDelay()); - } + wait(data->getQueryDelay()); + state PriorityMultiLock::Lock lock = wait(data->reqSSLock.lock(readPriority)); try { if (req.debugID.present()) @@ -4078,8 +4079,9 @@ ACTOR Future getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe { state Span span("SS:getMappedKeyValues"_loc, req.spanContext); state int64_t resultSize = 0; - state IKeyValueStore::ReadType type = - req.isFetchKeys ? IKeyValueStore::ReadType::FETCH : IKeyValueStore::ReadType::NORMAL; + state ReadType type = req.isFetchKeys ? ReadType::FETCH : ReadType::NORMAL; + state int readPriority = + (req.isFetchKeys && SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY) ? 
(int)ReadType::FETCH : (int)ReadType::NORMAL; if (req.tenantInfo.name.present()) { span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); @@ -4095,11 +4097,8 @@ ACTOR Future getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - if (SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && req.isFetchKeys) { - wait(delay(0, TaskPriority::FetchKeys)); - } else { - wait(data->getQueryDelay()); - } + wait(data->getQueryDelay()); + state PriorityMultiLock::Lock lock = wait(data->reqSSLock.lock(readPriority)); try { if (req.debugID.present()) @@ -4286,8 +4285,9 @@ ACTOR Future getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe { state Span span("SS:getKeyValuesStream"_loc, req.spanContext); state int64_t resultSize = 0; - state IKeyValueStore::ReadType type = - req.isFetchKeys ? IKeyValueStore::ReadType::FETCH : IKeyValueStore::ReadType::NORMAL; + state ReadType type = req.isFetchKeys ? ReadType::FETCH : ReadType::NORMAL; + state int readPriority = + (req.isFetchKeys && SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY) ? 
(int)ReadType::FETCH : (int)ReadType::NORMAL; if (req.tenantInfo.name.present()) { span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); @@ -4302,11 +4302,7 @@ ACTOR Future getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - if (SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && req.isFetchKeys) { - wait(delay(0, TaskPriority::FetchKeys)); - } else { - wait(delay(0, TaskPriority::DefaultEndpoint)); - } + state PriorityMultiLock::Lock lock = wait(data->reqSSLock.lock(readPriority)); try { if (req.debugID.present()) @@ -4463,11 +4459,11 @@ ACTOR Future getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe end = lastKey; } - if (SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && req.isFetchKeys) { - wait(delay(0, TaskPriority::FetchKeys)); + /*if (SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && req.isFetchKeys) { + wait(store(lock, data->reqSSLock.lock((int)ReadType::FETCH))); } else { - wait(delay(0, TaskPriority::DefaultEndpoint)); - } + wait(store(lock, data->reqSSLock.lock((int)ReadType::NORMAL))); + }*/ data->transactionTagCounter.addRequest(req.tags, resultSize); } @@ -4519,8 +4515,8 @@ ACTOR Future getKeyQ(StorageServer* data, GetKeyRequest req) { KeyRangeRef searchRange = data->clampRangeToTenant(shard, tenantEntry, req.arena); state int offset; - Key absoluteKey = wait( - findKey(data, req.sel, version, searchRange, &offset, req.spanContext, IKeyValueStore::ReadType::NORMAL)); + Key absoluteKey = + wait(findKey(data, req.sel, version, searchRange, &offset, req.spanContext, ReadType::NORMAL)); data->checkChangeCounter(changeCounter, KeyRangeRef(std::min(req.sel.getKey(), absoluteKey), @@ -4620,7 +4616,7 @@ ACTOR Future doEagerReads(StorageServer* data, UpdateEagerReadInfo* eager) if (SERVER_KNOBS->ENABLE_CLEAR_RANGE_EAGER_READS) { std::vector> keyEnd(eager->keyBegin.size()); for (int i = 0; i < keyEnd.size(); i++) - keyEnd[i] = 
data->storage.readNextKeyInclusive(eager->keyBegin[i], IKeyValueStore::ReadType::EAGER); + keyEnd[i] = data->storage.readNextKeyInclusive(eager->keyBegin[i], ReadType::EAGER); data->counters.eagerReadsKeys += keyEnd.size(); state Future> futureKeyEnds = getAll(keyEnd); @@ -4633,8 +4629,7 @@ ACTOR Future doEagerReads(StorageServer* data, UpdateEagerReadInfo* eager) std::vector>> value(eager->keys.size()); for (int i = 0; i < value.size(); i++) - value[i] = - data->storage.readValuePrefix(eager->keys[i].first, eager->keys[i].second, IKeyValueStore::ReadType::EAGER); + value[i] = data->storage.readValuePrefix(eager->keys[i].first, eager->keys[i].second, ReadType::EAGER); state Future>> futureValues = getAll(value); std::vector> optionalValues = wait(futureValues); @@ -5984,7 +5979,8 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { while (!shard->updates.empty() && shard->updates[0].version <= fetchVersion) shard->updates.pop_front(); tr.setVersion(fetchVersion); - tr.trState->taskID = TaskPriority::FetchKeys; + tr.trState->readType = ReadType::FETCH; + state PromiseStream results; state Future hold = SERVER_KNOBS->FETCH_USING_STREAMING ? tr.getRangeStream(results, keys, GetRangeLimits(), Snapshot::True) diff --git a/flow/include/flow/genericactors.actor.h b/flow/include/flow/genericactors.actor.h index 025a969ec7..c24150b0e3 100644 --- a/flow/include/flow/genericactors.actor.h +++ b/flow/include/flow/genericactors.actor.h @@ -2148,6 +2148,181 @@ private: static std::unordered_map> instanceMap; }; +// A multi user lock with a concurrent holder limit where waiters are granted the lock according to +// an integer priority from 0 to maxPriority, inclusive, where higher integers are given priority. +// +// The interface is similar to FlowMutex except that lock holders can drop the lock to release it. 
+// +// Usage: +// Lock lock = wait(prioritylock.lock(priorityLevel)); +// lock.release(); // Explicit release, or +// // let lock and all copies of lock go out of scope to release +class PriorityMultiLock { + +public: + // Waiting on the lock returns a Lock, which is really just a Promise + // Calling release() is not necessary, it exists in case the Lock holder wants to explicitly release + // the Lock before it goes out of scope. + struct Lock { + void release() { promise.send(Void()); } + + // This is exposed in case the caller wants to use/copy it directly + Promise promise; + }; + + PriorityMultiLock(int concurrency, int maxPriority, std::string launchLimit) + : concurrency(concurrency), available(concurrency), waiting(0) { + this->launchLimit = parseStringToVector(launchLimit, ','); + ASSERT(this->launchLimit.size() == maxPriority + 1); + waiters.resize(maxPriority + 1); + fRunner = runner(this); + } + + ~PriorityMultiLock() {} + + Future lock(int priority = 0) { + + // This shortcut may enable a waiter to jump the line when the releaser loop yields + if (available > 0) { + --available; + Lock p; + addRunner(p); + return p; + } + + Waiter w; + waiters[priority].push_back(w); + ++waiting; + return w.lockPromise.getFuture(); + } + + std::string toString() const { + int runnersDone = 0; + for (int i = 0; i < runners.size(); ++i) { + if (runners[i].isReady()) { + ++runnersDone; + } + } + + std::string s = + format("{ ptr=%p concurrency=%d available=%d running=%d waiting=%d runnersQueue=%d runnersDone=%d ", + this, + concurrency, + available, + concurrency - available, + waiting, + runners.size(), + runnersDone); + + for (int i = 0; i < waiters.size(); ++i) { + s += format("p%d_waiters=%u ", i, waiters[i].size()); + } + + s += "}"; + return s; + } + +private: + struct Waiter { + Waiter() : queuedTime(now()) {} + Promise lockPromise; + double queuedTime; + }; + + int concurrency; + int available; + int waiting; + typedef Deque Queue; + std::vector launchLimit; + 
std::vector waiters; + Deque> runners; + Future fRunner; + AsyncTrigger release; + Promise brokenOnDestruct; + + void addRunner(Lock& lock) { + runners.push_back(map(ready(lock.promise.getFuture()), [=](Void) { + ++available; + if (waiting > 0 || runners.size() > 100) { + release.trigger(); + } + return Void(); + })); + } + + ACTOR static Future runner(PriorityMultiLock* self) { + state int sinceYield = 0; + state Future error = self->brokenOnDestruct.getFuture(); + state int maxPriority = self->waiters.size() - 1; + + // Priority to try to run tasks from next + state int priority = maxPriority; + state int ioLaunchLimit = self->launchLimit[priority]; + state Queue* pQueue = &self->waiters[maxPriority]; + + // Track the number of waiters unlocked at the same priority in a row + state int lastPriorityCount = 0; + + loop { + // Cleanup finished runner futures at the front of the runner queue. + while (!self->runners.empty() && self->runners.front().isReady()) { + self->runners.pop_front(); + } + + // Wait for a runner to release its lock + wait(self->release.onTrigger()); + + if (++sinceYield == 1000) { + sinceYield = 0; + wait(delay(0)); + } + + // While there are available slots and there are waiters, launch tasks + while (self->available > 0 && self->waiting > 0) { + + while (!pQueue->empty() && ++lastPriorityCount < ioLaunchLimit) { + Waiter w = pQueue->front(); + pQueue->pop_front(); + --self->waiting; + Lock lock; + + w.lockPromise.send(lock); + + // Self may have been destructed during the lock callback + if (error.isReady()) { + throw error.getError(); + } + + // If the lock was not already released, add it to the runners future queue + if (lock.promise.canBeSet()) { + self->addRunner(lock); + + // A slot has been consumed, so stop reading from this queue if there aren't any more + if (--self->available == 0) { + break; + } + } + } + + // If there are no more slots available, then don't move to the next priority + if (self->available == 0) { + break; + } + + 
// Decrease priority, wrapping around to max from 0 + if (priority == 0) { + priority = maxPriority; + } else { + --priority; + } + ioLaunchLimit = self->launchLimit[priority]; + pQueue = &self->waiters[priority]; + lastPriorityCount = 0; + } + } + } +}; + template std::unordered_map> FlowSingleton::instanceMap; diff --git a/flow/include/flow/network.h b/flow/include/flow/network.h index 62613f402e..d414badbc8 100644 --- a/flow/include/flow/network.h +++ b/flow/include/flow/network.h @@ -104,7 +104,7 @@ enum class TaskPriority { BlobWorkerReadChangeFeed = 2720, BlobWorkerUpdateFDB = 2710, BlobWorkerUpdateStorage = 2700, - FetchKeys = 2500, + CheckPoint = 2500, RestoreApplierWriteDB = 2310, RestoreApplierReceiveMutations = 2300, RestoreLoaderFinishVersionBatch = 2220, From 2322730345b688c81115d595853bf3df539c8fc3 Mon Sep 17 00:00:00 2001 From: Fuheng Zhao Date: Tue, 19 Jul 2022 11:19:30 -0700 Subject: [PATCH 008/216] fix issues related to terminate storage server --- fdbclient/NativeAPI.actor.cpp | 3 +- fdbclient/include/fdbclient/FDBTypes.h | 8 +--- fdbserver/storageserver.actor.cpp | 52 ++++++++++++++----------- flow/include/flow/genericactors.actor.h | 12 ++++++ 4 files changed, 45 insertions(+), 30 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 9cda18bbd4..481e74348b 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -4720,7 +4720,8 @@ ACTOR Future getRangeStreamFragment(Reference trState, req.limit = reverse ? 
-CLIENT_KNOBS->REPLY_BYTE_LIMIT : CLIENT_KNOBS->REPLY_BYTE_LIMIT; req.limitBytes = std::numeric_limits::max(); // it is used to inform the storage that the rangeRead is for Fetch - req.isFetchKeys = (trState->readType == ReadType::FETCH); + // req.isFetchKeys = (trState->readType == ReadType::FETCH); + req.isFetchKeys = false; trState->cx->getLatestCommitVersions( locations[shard].locations, req.version, trState, req.ssLatestCommitVersions); diff --git a/fdbclient/include/fdbclient/FDBTypes.h b/fdbclient/include/fdbclient/FDBTypes.h index ef85a01794..4f4d48cb75 100644 --- a/fdbclient/include/fdbclient/FDBTypes.h +++ b/fdbclient/include/fdbclient/FDBTypes.h @@ -1283,13 +1283,7 @@ struct WorkerBackupStatus { enum class TransactionPriority : uint8_t { BATCH, DEFAULT, IMMEDIATE, MIN = BATCH, MAX = IMMEDIATE }; -enum class ReadType { - EAGER = 0, - FETCH = 1, - LOW = 2, - NORMAL = 3, - HIGH = 4, -}; +enum class ReadType { EAGER = 0, FETCH = 1, LOW = 2, NORMAL = 3, HIGH = 4, MIN = EAGER, MAX = HIGH }; const std::array allTransactionPriorities = { TransactionPriority::BATCH, diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 346d791bb3..9b0dba7d98 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -852,7 +852,7 @@ public: FlowLock serveFetchCheckpointParallelismLock; - PriorityMultiLock reqSSLock; + PriorityMultiLock ssLock; int64_t instanceID; @@ -1061,7 +1061,7 @@ public: fetchChangeFeedParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM), fetchKeysBytesBudget(SERVER_KNOBS->STORAGE_FETCH_BYTES), fetchKeysBudgetUsed(false), serveFetchCheckpointParallelismLock(SERVER_KNOBS->SERVE_FETCH_CHECKPOINT_PARALLELISM), - reqSSLock(1, (int)ReadType::HIGH, SERVER_KNOBS->STORAGESERVER_READ_PRIORITIES), + ssLock(FLOW_KNOBS->MAX_OUTSTANDING, (int)ReadType::MAX, SERVER_KNOBS->STORAGESERVER_READ_PRIORITIES), instanceID(deterministicRandom()->randomUniqueID().first()), shuttingDown(false), behind(false), 
versionBehind(false), debug_inApplyUpdate(false), debug_lastValidateTime(0), lastBytesInputEBrake(0), lastDurableVersionEBrake(0), maxQueryQueue(0), transactionTagCounter(ssi.id()), counters(this), @@ -1167,13 +1167,14 @@ public: // Normally the storage server prefers to serve read requests over making mutations // durable to disk. However, when the storage server falls to far behind on // making mutations durable, this function will change the priority to prefer writes. - Future getQueryDelay() { + + int getQueryPriority() { if ((version.get() - durableVersion.get() > SERVER_KNOBS->LOW_PRIORITY_DURABILITY_LAG) || (queueSize() > SERVER_KNOBS->LOW_PRIORITY_STORAGE_QUEUE_BYTES)) { ++counters.lowPriorityQueries; - return delay(0, TaskPriority::LowPriorityRead); + return (int)ReadType::LOW; } - return delay(0, TaskPriority::DefaultEndpoint); + return (int)ReadType::NORMAL; } template @@ -1549,6 +1550,7 @@ Optional StorageServer::getTenantEntry(Version version, TenantIn ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { state int64_t resultSize = 0; + state PriorityMultiLock::Lock lock; Span span("SS:getValue"_loc, req.spanContext); if (req.tenantInfo.name.present()) { span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); @@ -1566,7 +1568,9 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait(data->getQueryDelay()); + wait(delay(0)); + state int readPriority = data->getQueryPriority(); + wait(store(lock, data->ssLock.lock(readPriority))); if (req.debugID.present()) g_traceBatch.addEvent("GetValueDebug", @@ -3362,8 +3366,6 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) state Span span("SS:getKeyValues"_loc, req.spanContext); state int64_t resultSize = 0; state ReadType type = req.isFetchKeys ? 
ReadType::FETCH : ReadType::NORMAL; - state int readPriority = - (req.isFetchKeys && SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY) ? (int)ReadType::FETCH : (int)ReadType::NORMAL; if (req.tenantInfo.name.present()) { span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); @@ -3379,8 +3381,10 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait(data->getQueryDelay()); - state PriorityMultiLock::Lock lock = wait(data->reqSSLock.lock(readPriority)); + wait(delay(0)); + state int readPriority = + (req.isFetchKeys && SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY) ? (int)ReadType::FETCH : data->getQueryPriority(); + state PriorityMultiLock::Lock lock = wait(data->ssLock.lock(readPriority)); try { if (req.debugID.present()) @@ -4080,8 +4084,6 @@ ACTOR Future getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe state Span span("SS:getMappedKeyValues"_loc, req.spanContext); state int64_t resultSize = 0; state ReadType type = req.isFetchKeys ? ReadType::FETCH : ReadType::NORMAL; - state int readPriority = - (req.isFetchKeys && SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY) ? (int)ReadType::FETCH : (int)ReadType::NORMAL; if (req.tenantInfo.name.present()) { span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); @@ -4097,8 +4099,10 @@ ACTOR Future getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait(data->getQueryDelay()); - state PriorityMultiLock::Lock lock = wait(data->reqSSLock.lock(readPriority)); + wait(delay(0)); + state int readPriority = + (req.isFetchKeys && SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY) ? 
(int)ReadType::FETCH : data->getQueryPriority(); + state PriorityMultiLock::Lock lock = wait(data->ssLock.lock(readPriority)); try { if (req.debugID.present()) @@ -4302,7 +4306,8 @@ ACTOR Future getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - state PriorityMultiLock::Lock lock = wait(data->reqSSLock.lock(readPriority)); + wait(delay(0)); + state PriorityMultiLock::Lock lock = wait(data->ssLock.lock(readPriority)); try { if (req.debugID.present()) @@ -4459,12 +4464,6 @@ ACTOR Future getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe end = lastKey; } - /*if (SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && req.isFetchKeys) { - wait(store(lock, data->reqSSLock.lock((int)ReadType::FETCH))); - } else { - wait(store(lock, data->reqSSLock.lock((int)ReadType::NORMAL))); - }*/ - data->transactionTagCounter.addRequest(req.tags, resultSize); } } @@ -4485,6 +4484,7 @@ ACTOR Future getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe ACTOR Future getKeyQ(StorageServer* data, GetKeyRequest req) { state Span span("SS:getKey"_loc, req.spanContext); + state PriorityMultiLock::Lock lock; if (req.tenantInfo.name.present()) { span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); } @@ -4499,7 +4499,9 @@ ACTOR Future getKeyQ(StorageServer* data, GetKeyRequest req) { // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait(data->getQueryDelay()); + wait(delay(0)); + state int readPriority = data->getQueryPriority(); + wait(store(lock, data->ssLock.lock(readPriority))); try { Version commitVersion = getLatestCommitVersion(req.ssLatestCommitVersions, data->tag); @@ -9485,6 +9487,9 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, // If the storage server dies while something that uses self is still on the stack, // we want that actor to complete before we 
terminate and that memory goes out of scope + + self.ssLock.kill(); + state Error err = e; if (storageServerTerminated(self, persistentData, err)) { ssCore.cancel(); @@ -9597,6 +9602,9 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, throw internal_error(); } catch (Error& e) { + + self.ssLock.kill(); + if (self.byteSampleRecovery.isValid()) { self.byteSampleRecovery.cancel(); } diff --git a/flow/include/flow/genericactors.actor.h b/flow/include/flow/genericactors.actor.h index c24150b0e3..42d0298aa9 100644 --- a/flow/include/flow/genericactors.actor.h +++ b/flow/include/flow/genericactors.actor.h @@ -2196,6 +2196,18 @@ public: return w.lockPromise.getFuture(); } + void kill() { + for (int i = 0; i < runners.size(); ++i) { + if (!runners[i].isReady()) { + runners[i].cancel(); + } + } + runners.clear(); + brokenOnDestruct.sendError(broken_promise()); + waiting = 0; + waiters.clear(); + } + std::string toString() const { int runnersDone = 0; for (int i = 0; i < runners.size(); ++i) { From 6eaf39c6ccd34dadac398c42a722a2d914921098 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 21 Jul 2022 11:37:26 -0700 Subject: [PATCH 009/216] change reference to pointer; default constructor enabled --- fdbserver/DataDistribution.actor.cpp | 16 +++--- fdbserver/DataDistributionTracker.actor.cpp | 53 +++++++++++-------- .../fdbserver/DataDistribution.actor.h | 4 +- flow/include/flow/ActorCollection.h | 2 +- 4 files changed, 43 insertions(+), 32 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 0acdba7a00..02f66f99a5 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -536,14 +536,14 @@ ACTOR Future dataDistribution(Reference self, // FIXME: wrap the bootstrap process into class DataDistributor state Reference primaryTeamCollection; state Reference remoteTeamCollection; - state bool trackerCancelled; + state std::shared_ptr trackerCancelled(new bool(false)); state 
bool ddIsTenantAware = SERVER_KNOBS->DD_TENANT_AWARENESS_ENABLED; loop { - trackerCancelled = false; + *trackerCancelled = false; // Stored outside of data distribution tracker to avoid slow tasks // when tracker is cancelled - state KeyRangeMap shards; + state std::shared_ptr> shards(new KeyRangeMap); state Promise removeFailedServer; try { wait(DataDistributor::init(self, ddEnabledState)); @@ -603,8 +603,8 @@ ACTOR Future dataDistribution(Reference self, readyToStart, anyZeroHealthyTeams, self->ddId, - &shards, - &trackerCancelled), + shards, + trackerCancelled), "DDTracker", self->ddId, &normalDDQueueErrors())); @@ -691,7 +691,7 @@ ACTOR Future dataDistribution(Reference self, wait(waitForAll(actors)); return Void(); } catch (Error& e) { - trackerCancelled = true; + *trackerCancelled = true; state Error err = e; TraceEvent("DataDistributorDestroyTeamCollections").error(e); state std::vector teamForDroppedRange; @@ -715,10 +715,10 @@ ACTOR Future dataDistribution(Reference self, if (!g_network->isSimulated()) { TraceEvent(SevWarnAlways, "DataDistributorCancelled"); } - shards.clear(); + shards->clear(); throw e; } else { - wait(shards.clearAsync()); + wait(shards->clearAsync()); } TraceEvent("DataDistributorTeamCollectionsDestroyed").error(err); if (removeFailedServer.getFuture().isReady() && !removeFailedServer.getFuture().isError()) { diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index 6b504a6f8a..ff96703557 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -71,10 +71,11 @@ ACTOR Future updateMaxShardSize(Reference> dbSizeEstimat struct DataDistributionTracker { Database cx; UID distributorId; - KeyRangeMap& shards; + // At now, the lifetime of shards is guaranteed longer than DataDistributionTracker. 
+ std::shared_ptr> shards; ActorCollection sizeChanges; - int64_t systemSizeEstimate; + int64_t systemSizeEstimate = 0; Reference> dbSizeEstimate; Reference>> maxShardSize; Future maxShardSizeUpdater; @@ -92,7 +93,7 @@ struct DataDistributionTracker { // The reference to trackerCancelled must be extracted by actors, // because by the time (trackerCancelled == true) this memory cannot // be accessed - bool& trackerCancelled; + std::shared_ptr trackerCancelled; // This class extracts the trackerCancelled reference from a DataDistributionTracker object // Because some actors spawned by the dataDistributionTracker outlive the DataDistributionTracker @@ -104,7 +105,7 @@ struct DataDistributionTracker { public: SafeAccessor(DataDistributionTracker* tracker) - : trackerCancelled(tracker->trackerCancelled), tracker(*tracker) { + : trackerCancelled(*tracker->trackerCancelled), tracker(*tracker) { ASSERT(!trackerCancelled); } @@ -117,21 +118,23 @@ struct DataDistributionTracker { } }; + DataDistributionTracker() = default; + DataDistributionTracker(Database cx, UID distributorId, Promise const& readyToStart, PromiseStream const& output, Reference shardsAffectedByTeamFailure, Reference> anyZeroHealthyTeams, - KeyRangeMap& shards, - bool& trackerCancelled) + std::shared_ptr> shards, + std::shared_ptr trackerCancelled) : cx(cx), distributorId(distributorId), shards(shards), sizeChanges(false), systemSizeEstimate(0), dbSizeEstimate(new AsyncVar()), maxShardSize(new AsyncVar>()), output(output), shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), readyToStart(readyToStart), anyZeroHealthyTeams(anyZeroHealthyTeams), trackerCancelled(trackerCancelled) {} ~DataDistributionTracker() { - trackerCancelled = true; + *trackerCancelled = true; // Cancel all actors so they aren't waiting on sizeChanged broken promise sizeChanges.clear(false); } @@ -399,7 +402,7 @@ ACTOR Future getFirstSize(Reference>> s ACTOR Future changeSizes(DataDistributionTracker* self, KeyRange keys, int64_t 
oldShardsEndingSize) { state std::vector> sizes; state std::vector> systemSizes; - for (auto it : self->shards.intersectingRanges(keys)) { + for (auto it : self->shards->intersectingRanges(keys)) { Future thisSize = getFirstSize(it->value().stats); sizes.push_back(thisSize); if (it->range().begin >= systemKeys.begin) { @@ -556,8 +559,8 @@ Future shardMerger(DataDistributionTracker* self, Reference>> shardSize) { int64_t maxShardSize = self->maxShardSize->get().get(); - auto prevIter = self->shards.rangeContaining(keys.begin); - auto nextIter = self->shards.rangeContaining(keys.begin); + auto prevIter = self->shards->rangeContaining(keys.begin); + auto nextIter = self->shards->rangeContaining(keys.begin); CODE_PROBE(true, "shard to be merged"); ASSERT(keys.begin > allKeys.begin); @@ -778,7 +781,7 @@ ACTOR Future shardTracker(DataDistributionTracker::SafeAccessor self, } void restartShardTrackers(DataDistributionTracker* self, KeyRangeRef keys, Optional startingMetrics) { - auto ranges = self->shards.getAffectedRangesAfterInsertion(keys, ShardTrackedData()); + auto ranges = self->shards->getAffectedRangesAfterInsertion(keys, ShardTrackedData()); for (int i = 0; i < ranges.size(); i++) { if (!ranges[i].value.trackShard.isValid() && ranges[i].begin != keys.begin) { // When starting, key space will be full of "dummy" default contructed entries. 
@@ -805,7 +808,7 @@ void restartShardTrackers(DataDistributionTracker* self, KeyRangeRef keys, Optio data.stats = shardMetrics; data.trackShard = shardTracker(DataDistributionTracker::SafeAccessor(self), ranges[i], shardMetrics); data.trackBytes = trackShardMetrics(DataDistributionTracker::SafeAccessor(self), ranges[i], shardMetrics); - self->shards.insert(ranges[i], data); + self->shards->insert(ranges[i], data); } } @@ -848,7 +851,7 @@ ACTOR Future fetchTopKShardMetrics_impl(DataDistributionTracker* self, Get for (i = 0; i < SERVER_KNOBS->DD_SHARD_COMPARE_LIMIT && i < req.keys.size(); ++i) { auto range = req.keys[i]; StorageMetrics metrics; - for (auto t : self->shards.intersectingRanges(range)) { + for (auto t : self->shards->intersectingRanges(range)) { auto& stats = t.value().stats; if (!stats->get().present()) { onChange = stats->onChange(); @@ -915,7 +918,7 @@ ACTOR Future fetchShardMetrics_impl(DataDistributionTracker* self, GetMetr loop { Future onChange; StorageMetrics returnMetrics; - for (auto t : self->shards.intersectingRanges(req.keys)) { + for (auto t : self->shards->intersectingRanges(req.keys)) { auto& stats = t.value().stats; if (!stats->get().present()) { onChange = stats->onChange(); @@ -959,8 +962,8 @@ ACTOR Future fetchShardMetricsList_impl(DataDistributionTracker* self, Get // list of metrics, regenerate on loop when full range unsuccessful Standalone> result; Future onChange; - auto beginIter = self->shards.containedRanges(req.keys).begin(); - auto endIter = self->shards.intersectingRanges(req.keys).end(); + auto beginIter = self->shards->containedRanges(req.keys).begin(); + auto endIter = self->shards->intersectingRanges(req.keys).end(); for (auto t = beginIter; t != endIter; ++t) { auto& stats = t.value().stats; if (!stats->get().present()) { @@ -1008,16 +1011,16 @@ ACTOR Future dataDistributionTracker(Reference in Promise readyToStart, Reference> anyZeroHealthyTeams, UID distributorId, - KeyRangeMap* shards, - bool* trackerCancelled) { + 
std::shared_ptr> shards, + std::shared_ptr trackerCancelled) { state DataDistributionTracker self(cx, distributorId, readyToStart, output, shardsAffectedByTeamFailure, anyZeroHealthyTeams, - *shards, - *trackerCancelled); + shards, + trackerCancelled); state Future loggingTrigger = Void(); state Future readHotDetect = readHotDetector(&self); state Reference ddTrackerStatsEventHolder = makeReference("DDTrackerStats"); @@ -1031,7 +1034,7 @@ ACTOR Future dataDistributionTracker(Reference in } when(wait(loggingTrigger)) { TraceEvent("DDTrackerStats", self.distributorId) - .detail("Shards", self.shards.size()) + .detail("Shards", self.shards->size()) .detail("TotalSizeBytes", self.dbSizeEstimate->get()) .detail("SystemSizeBytes", self.systemSizeEstimate) .trackLatest(ddTrackerStatsEventHolder->trackingKey); @@ -1221,3 +1224,11 @@ void ShardsAffectedByTeamFailure::check() const { } } } + +namespace data_distribution_test { +DataDistributionTracker createDDTrackerForUnitTest() {} +} // namespace data_distribution_test +TEST_CASE("/DataDistributor/Tracker/FetchTopK") { + // state DataDistributionTracker self; + return Void(); +} \ No newline at end of file diff --git a/fdbserver/include/fdbserver/DataDistribution.actor.h b/fdbserver/include/fdbserver/DataDistribution.actor.h index 508b7a11ab..c565d2d043 100644 --- a/fdbserver/include/fdbserver/DataDistribution.actor.h +++ b/fdbserver/include/fdbserver/DataDistribution.actor.h @@ -401,8 +401,8 @@ ACTOR Future dataDistributionTracker(Reference in Promise readyToStart, Reference> zeroHealthyTeams, UID distributorId, - KeyRangeMap* shards, - bool* trackerCancelled); + std::shared_ptr> shards, + std::shared_ptr trackerCancelled); ACTOR Future dataDistributionQueue(Database cx, PromiseStream output, diff --git a/flow/include/flow/ActorCollection.h b/flow/include/flow/ActorCollection.h index ae4567f178..c1fbae1129 100644 --- a/flow/include/flow/ActorCollection.h +++ b/flow/include/flow/ActorCollection.h @@ -67,7 +67,7 @@ class 
ActorCollection : NonCopyable { Future m_out; public: - explicit ActorCollection(bool returnWhenEmptied) { + explicit ActorCollection(bool returnWhenEmptied = false) { m_out = actorCollection(m_add.getFuture(), nullptr, nullptr, nullptr, nullptr, returnWhenEmptied); } From 86165689e55a5e50be18821baf72e6a141d6aaed Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 22 Jul 2022 16:34:29 -0700 Subject: [PATCH 010/216] add basic mock global state utils --- fdbserver/DataDistributionTracker.actor.cpp | 31 ++++++++-- fdbserver/MockGlobalState.cpp | 33 ++++++++++ fdbserver/SimulatedCluster.actor.cpp | 20 +++--- fdbserver/include/fdbserver/MockGlobalState.h | 62 +++++++++++++++++++ .../include/fdbserver/SimulatedCluster.h | 12 ++++ 5 files changed, 145 insertions(+), 13 deletions(-) create mode 100644 fdbserver/MockGlobalState.cpp create mode 100644 fdbserver/include/fdbserver/MockGlobalState.h diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index 8925e568f8..412bf05a69 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -22,6 +22,7 @@ #include "fdbclient/SystemData.h" #include "fdbserver/DataDistribution.actor.h" #include "fdbserver/Knobs.h" +#include "fdbserver/workloads/workloads.actor.h" #include "fdbclient/DatabaseContext.h" #include "flow/ActorCollection.h" #include "flow/FastRef.h" @@ -844,8 +845,8 @@ ACTOR Future fetchTopKShardMetrics_impl(DataDistributionTracker* self, Get loop { onChange = Future(); returnMetrics.clear(); - state int64_t minReadLoad = std::numeric_limits::max(); - state int64_t maxReadLoad = std::numeric_limits::min(); + state int64_t minReadLoad = -1; + state int64_t maxReadLoad = -1; state int i; for (i = 0; i < SERVER_KNOBS->DD_SHARD_COMPARE_LIMIT && i < req.keys.size(); ++i) { auto range = req.keys[i]; @@ -865,7 +866,7 @@ ACTOR Future fetchTopKShardMetrics_impl(DataDistributionTracker* self, Get } if (metrics.bytesReadPerKSecond 
> 0) { - minReadLoad = std::min(metrics.bytesReadPerKSecond, minReadLoad); + minReadLoad = std::min(metrics.bytesReadPerKSecond, std::max((decltype(minReadLoad))0, minReadLoad)); maxReadLoad = std::max(metrics.bytesReadPerKSecond, maxReadLoad); if (req.minBytesReadPerKSecond <= metrics.bytesReadPerKSecond && metrics.bytesReadPerKSecond <= req.maxBytesReadPerKSecond) { @@ -1224,9 +1225,29 @@ void ShardsAffectedByTeamFailure::check() const { } namespace data_distribution_test { -DataDistributionTracker createDDTrackerForUnitTest() {} } // namespace data_distribution_test TEST_CASE("/DataDistributor/Tracker/FetchTopK") { - // state DataDistributionTracker self; + state DataDistributionTracker self; + state GetTopKMetricsRequest req; + req.topK = 3; + for(int i = 1; i <= 10; i += 2) { + KeyRange keys(KeyRangeRef(doubleToTestKey(i), doubleToTestKey(i+2))); + req.keys.push_back(keys); + // std::cout << "here: " << req.keys.back().begin.toString() << "\n"; + } + req.minBytesReadPerKSecond = 1000; + req.minBytesReadPerKSecond = 10000; + + self.shards = std::make_shared>(); + double targetDensities[10] = {2, 1, 3, 5, 4, 10, 6, 8, 7, 0}; + for(int i = 0; i <= 5; ++ i) { + + } + wait(fetchTopKShardMetrics_impl(&self, req)); + auto& reply = req.reply.getFuture().get(); + ASSERT(reply.shardMetrics.empty()); + ASSERT(reply.maxReadLoad == -1); + ASSERT(reply.minReadLoad == -1); + return Void(); } \ No newline at end of file diff --git a/fdbserver/MockGlobalState.cpp b/fdbserver/MockGlobalState.cpp new file mode 100644 index 0000000000..a1a882ffe1 --- /dev/null +++ b/fdbserver/MockGlobalState.cpp @@ -0,0 +1,33 @@ +/* + * MockGlobalState.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbserver/MockGlobalState.h" + +void MockGlobalState::initialAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace) { + ASSERT(conf.storageTeamSize > 0); + configuration = conf; + std::vector allServers; + for(int i = 1; i <= conf.storageTeamSize; ++ i) { + allServers.emplace_back(UID(i, 0)); + servers[allServers.back()] = MockStorageServer(allServers.back(), defaultDiskSpace); + } + keyServers.insert(allKeys.begin, allServers); +} + diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index a3f0af0f63..b245f64639 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -48,6 +48,7 @@ #include "flow/FaultInjection.h" #include "flow/CodeProbeUtils.h" #include "flow/actorcompiler.h" // This must be the last #include. 
+#include "fdbserver/SimulatedCluster.h" #undef max #undef min @@ -80,7 +81,7 @@ bool destructed = false; // Configuration details specified in workload test files that change the simulation // environment details -class TestConfig { +class TestConfig: public BasicTestConfig { class ConfigBuilder { using value_type = toml::basic_value; using base_variant = std::variant, ConfigDBType>; @@ -289,11 +290,9 @@ class TestConfig { public: int extraDB = 0; - int minimumReplication = 0; int minimumRegions = 0; bool configureLocked = false; bool startIncompatibleProcess = false; - int logAntiQuorum = -1; bool isFirstTestInRestart = false; // 7.0 cannot be downgraded to 6.3 after enabling TSS, so disable TSS for 6.3 downgrade tests bool disableTss = false; @@ -312,17 +311,15 @@ public: // 5 = "ssd-sharded-rocksdb" // Requires a comma-separated list of numbers WITHOUT whitespaces std::vector storageEngineExcludeTypes; + Optional datacenters, stderrSeverity, processesPerMachine; // Set the maximum TLog version that can be selected for a test // Refer to FDBTypes.h::TLogVersion. Defaults to the maximum supported version. 
int maxTLogVersion = TLogVersion::MAX_SUPPORTED; - // Set true to simplify simulation configs for easier debugging - bool simpleConfig = false; int extraMachineCountDC = 0; + Optional generateFearless, buggify; - Optional datacenters, desiredTLogCount, commitProxyCount, grvProxyCount, resolverCount, storageEngineType, - stderrSeverity, machineCount, processesPerMachine, coordinators; - bool blobGranulesEnabled = false; Optional config; + bool blobGranulesEnabled = false; bool randomlyRenameZoneId = false; bool allowDefaultTenant = true; @@ -2516,3 +2513,10 @@ ACTOR void setupAndRun(std::string dataFolder, wait(Never()); ASSERT(false); } + +DatabaseConfiguration generateNormalDatabaseConfiguration(const BasicTestConfig& testConfig, uint64_t defaultDiskSpace) { + TestConfig config; + config.BasicTestConfig::operator=(testConfig); + SimulationConfig simConf(config); + return simConf.db; +} diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h new file mode 100644 index 0000000000..201370c4f7 --- /dev/null +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -0,0 +1,62 @@ +/* + * MockGlobalState.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef FOUNDATIONDB_MOCKGLOBALSTATE_H_H +#define FOUNDATIONDB_MOCKGLOBALSTATE_H_H + +#include "StorageMetrics.h" +#include "fdbclient/KeyRangeMap.h" +#include "fdbclient/StorageServerInterface.h" +#include "fdbclient/DatabaseConfiguration.h" +#include "SimulatedCluster.h" + +class MockStorageServer { +public: + // control plane statistics associated with a real storage server + uint64_t usedDiskSpace = 0, availableDiskSpace; + KeyRangeMap shardTotalBytes; // randomly generated in setup phase + + // sampled metrics + StorageServerMetrics metrics; + CoalescedKeyRangeMap> byteSampleClears; + + StorageServerInterface ssi; // serve RPC requests + UID id; + + MockStorageServer() = default; + MockStorageServer(const UID& id, uint64_t availableDiskSpace, uint64_t usedDiskSpace = 0) + : usedDiskSpace(usedDiskSpace), availableDiskSpace(availableDiskSpace), id(id) {} +}; + +class MockGlobalState { +public: + KeyRangeMap> keyServers; // a shard belongs to which servers + std::map servers; // all mock servers + DatabaseConfiguration configuration; + + // user defined parameters for mock workload purpose + double emptyProb; // probability of doing an empty read + uint32_t minByteSize, maxByteSize; // the size band of a point data operation + + void initialAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, + uint64_t defaultDiskSpace = 1000LL * 1024 * 1024 * 1024); +}; + +#endif // FOUNDATIONDB_MOCKGLOBALSTATE_H_H diff --git a/fdbserver/include/fdbserver/SimulatedCluster.h b/fdbserver/include/fdbserver/SimulatedCluster.h index d92d3dbb98..2637b52a25 100644 --- a/fdbserver/include/fdbserver/SimulatedCluster.h +++ b/fdbserver/include/fdbserver/SimulatedCluster.h @@ -28,4 +28,16 @@ void setupAndRun(std::string const& dataFolder, bool const& restoring, std::string const& whitelistBinPath); +class BasicTestConfig { +public: + int minimumReplication = 0; + int logAntiQuorum = -1; + // Set true to simplify simulation configs for easier debugging + bool simpleConfig = false; 
+ Optional desiredTLogCount, commitProxyCount, grvProxyCount, resolverCount, storageEngineType, machineCount, + coordinators; +}; + +DatabaseConfiguration generateNormalDatabaseConfiguration(const BasicTestConfig& testConfig); + #endif From f761f9a03a537a282f457410b38c52ec70070ebb Mon Sep 17 00:00:00 2001 From: Fuheng Zhao Date: Mon, 25 Jul 2022 10:10:42 -0700 Subject: [PATCH 011/216] use DefaultEndPoint as the default priority for storage server reads --- fdbclient/ServerKnobs.cpp | 2 + fdbclient/include/fdbclient/ServerKnobs.h | 2 + fdbserver/storageserver.actor.cpp | 56 +++++++++++++++++------ flow/include/flow/genericactors.actor.h | 25 ++++++++-- 4 files changed, 67 insertions(+), 18 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 9eacc643c5..6cda742bb0 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -740,6 +740,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( MAX_PARALLEL_QUICK_GET_VALUE, 50 ); if ( randomize && BUGGIFY ) MAX_PARALLEL_QUICK_GET_VALUE = deterministicRandom()->randomInt(1, 100); init( QUICK_GET_KEY_VALUES_LIMIT, 2000 ); init( QUICK_GET_KEY_VALUES_LIMIT_BYTES, 1e7 ); + init( STORAGESERVER_MAX_RANK, 4 ); + init( STORAGESERVER_READ_RANKS, "0,1,2,3,4" ); init( STORAGESERVER_READ_PRIORITIES, "32,8,12,32,48" ); //Wait Failure diff --git a/fdbclient/include/fdbclient/ServerKnobs.h b/fdbclient/include/fdbclient/ServerKnobs.h index 75e0b5b810..4b471db40e 100644 --- a/fdbclient/include/fdbclient/ServerKnobs.h +++ b/fdbclient/include/fdbclient/ServerKnobs.h @@ -698,6 +698,8 @@ public: int CHECKPOINT_TRANSFER_BLOCK_BYTES; int QUICK_GET_KEY_VALUES_LIMIT; int QUICK_GET_KEY_VALUES_LIMIT_BYTES; + int STORAGESERVER_MAX_RANK; + std::string STORAGESERVER_READ_RANKS; std::string STORAGESERVER_READ_PRIORITIES; // Wait Failure diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 9b0dba7d98..0df7536b27 100644 --- 
a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -853,6 +853,7 @@ public: FlowLock serveFetchCheckpointParallelismLock; PriorityMultiLock ssLock; + std::vector readPriorityRanks; int64_t instanceID; @@ -1061,12 +1062,15 @@ public: fetchChangeFeedParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM), fetchKeysBytesBudget(SERVER_KNOBS->STORAGE_FETCH_BYTES), fetchKeysBudgetUsed(false), serveFetchCheckpointParallelismLock(SERVER_KNOBS->SERVE_FETCH_CHECKPOINT_PARALLELISM), - ssLock(FLOW_KNOBS->MAX_OUTSTANDING, (int)ReadType::MAX, SERVER_KNOBS->STORAGESERVER_READ_PRIORITIES), + ssLock(FLOW_KNOBS->MAX_OUTSTANDING, + SERVER_KNOBS->STORAGESERVER_MAX_RANK, + SERVER_KNOBS->STORAGESERVER_READ_PRIORITIES), instanceID(deterministicRandom()->randomUniqueID().first()), shuttingDown(false), behind(false), versionBehind(false), debug_inApplyUpdate(false), debug_lastValidateTime(0), lastBytesInputEBrake(0), lastDurableVersionEBrake(0), maxQueryQueue(0), transactionTagCounter(ssi.id()), counters(this), storageServerSourceTLogIDEventHolder( makeReference(ssi.id().toString() + "/StorageServerSourceTLogID")) { + readPriorityRanks = parseStringToVector(SERVER_KNOBS->STORAGESERVER_READ_RANKS, ','); version.initMetric(LiteralStringRef("StorageServer.Version"), counters.cc.id); oldestVersion.initMetric(LiteralStringRef("StorageServer.OldestVersion"), counters.cc.id); durableVersion.initMetric(LiteralStringRef("StorageServer.DurableVersion"), counters.cc.id); @@ -1568,9 +1572,9 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait(delay(0)); - state int readPriority = data->getQueryPriority(); - wait(store(lock, data->ssLock.lock(readPriority))); + wait(delay(0, TaskPriority::DefaultEndpoint)); + state int rankIndex = data->getQueryPriority(); + wait(store(lock, data->ssLock.lock(data->readPriorityRanks[rankIndex]))); if 
(req.debugID.present()) g_traceBatch.addEvent("GetValueDebug", @@ -3381,10 +3385,10 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait(delay(0)); - state int readPriority = + wait(delay(0, TaskPriority::DefaultEndpoint)); + state int rankIndex = (req.isFetchKeys && SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY) ? (int)ReadType::FETCH : data->getQueryPriority(); - state PriorityMultiLock::Lock lock = wait(data->ssLock.lock(readPriority)); + state PriorityMultiLock::Lock lock = wait(data->ssLock.lock(data->readPriorityRanks[rankIndex])); try { if (req.debugID.present()) @@ -4099,10 +4103,10 @@ ACTOR Future getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait(delay(0)); - state int readPriority = + wait(delay(0, TaskPriority::DefaultEndpoint)); + state int rankIndex = (req.isFetchKeys && SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY) ? (int)ReadType::FETCH : data->getQueryPriority(); - state PriorityMultiLock::Lock lock = wait(data->ssLock.lock(readPriority)); + state PriorityMultiLock::Lock lock = wait(data->ssLock.lock(data->readPriorityRanks[rankIndex])); try { if (req.debugID.present()) @@ -4290,8 +4294,9 @@ ACTOR Future getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe state Span span("SS:getKeyValuesStream"_loc, req.spanContext); state int64_t resultSize = 0; state ReadType type = req.isFetchKeys ? ReadType::FETCH : ReadType::NORMAL; - state int readPriority = + state int rankIndex = (req.isFetchKeys && SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY) ? 
(int)ReadType::FETCH : (int)ReadType::NORMAL; + state int readPriority = data->readPriorityRanks[rankIndex]; if (req.tenantInfo.name.present()) { span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); @@ -4306,7 +4311,7 @@ ACTOR Future getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait(delay(0)); + wait(delay(0, TaskPriority::DefaultEndpoint)); state PriorityMultiLock::Lock lock = wait(data->ssLock.lock(readPriority)); try { @@ -4464,6 +4469,9 @@ ACTOR Future getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe end = lastKey; } + lock.release(); + wait(store(lock, data->ssLock.lock(readPriority))); + data->transactionTagCounter.addRequest(req.tags, resultSize); } } @@ -4499,9 +4507,9 @@ ACTOR Future getKeyQ(StorageServer* data, GetKeyRequest req) { // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait(delay(0)); - state int readPriority = data->getQueryPriority(); - wait(store(lock, data->ssLock.lock(readPriority))); + wait(delay(0, TaskPriority::DefaultEndpoint)); + state int rankIndex = data->getQueryPriority(); + wait(store(lock, data->ssLock.lock(data->readPriorityRanks[rankIndex]))); try { Version commitVersion = getLatestCommitVersion(req.ssLatestCommitVersions, data->tag); @@ -8671,6 +8679,24 @@ ACTOR Future metricsCore(StorageServer* self, StorageServerInterface ssi) [self = self](TraceEvent& te) { te.detail("StorageEngine", self->storage.getKeyValueStoreType().toString()); te.detail("Tag", self->tag.toString()); + std::vector rpr = self->readPriorityRanks; + te.detail("ActiveReads", self->ssLock.totalWorkers()); + te.detail("AwaitReads", self->ssLock.totalWaiters()); + int type = (int)ReadType::EAGER; + te.detail("ActiveEager", self->ssLock.numWorkers(rpr[type])); + te.detail("AwaitEager", self->ssLock.numWaiters(rpr[type])); + type = 
(int)ReadType::FETCH; + te.detail("ActiveFetch", self->ssLock.numWorkers(rpr[type])); + te.detail("AwaitFetch", self->ssLock.numWaiters(rpr[type])); + type = (int)ReadType::LOW; + te.detail("ActiveLow", self->ssLock.numWorkers(rpr[type])); + te.detail("AwaitLow", self->ssLock.numWaiters(rpr[type])); + type = (int)ReadType::NORMAL; + te.detail("ActiveNormal", self->ssLock.numWorkers(rpr[type])); + te.detail("AwaitNormal", self->ssLock.numWaiters(rpr[type])); + type = (int)ReadType::HIGH; + te.detail("ActiveHigh", self->ssLock.numWorkers(rpr[type])); + te.detail("AwaitHigh", self->ssLock.numWaiters(rpr[type])); StorageBytes sb = self->storage.getStorageBytes(); te.detail("KvstoreBytesUsed", sb.used); te.detail("KvstoreBytesFree", sb.free); diff --git a/flow/include/flow/genericactors.actor.h b/flow/include/flow/genericactors.actor.h index 42d0298aa9..ec215cb5f0 100644 --- a/flow/include/flow/genericactors.actor.h +++ b/flow/include/flow/genericactors.actor.h @@ -2175,6 +2175,7 @@ public: this->launchLimit = parseStringToVector(launchLimit, ','); ASSERT(this->launchLimit.size() == maxPriority + 1); waiters.resize(maxPriority + 1); + workerCounts.resize(maxPriority + 1, 0); fRunner = runner(this); } @@ -2185,8 +2186,9 @@ public: // This shortcut may enable a waiter to jump the line when the releaser loop yields if (available > 0) { --available; + workerCounts[priority] += 1; Lock p; - addRunner(p); + addRunner(p, priority); return p; } @@ -2234,6 +2236,20 @@ public: return s; } + int totalWaiters() { return waiting; } + + int numWaiters(const unsigned int priority) { + ASSERT(priority < waiters.size()); + return waiters[priority].size(); + } + + int totalWorkers() { return concurrency - available; } + + int numWorkers(const unsigned int priority) { + ASSERT(priority < waiters.size()); + return workerCounts[priority]; + } + private: struct Waiter { Waiter() : queuedTime(now()) {} @@ -2247,14 +2263,16 @@ private: typedef Deque Queue; std::vector launchLimit; std::vector 
waiters; + std::vector workerCounts; Deque> runners; Future fRunner; AsyncTrigger release; Promise brokenOnDestruct; - void addRunner(Lock& lock) { + void addRunner(Lock& lock, int priority) { runners.push_back(map(ready(lock.promise.getFuture()), [=](Void) { ++available; + workerCounts[priority] -= 1; if (waiting > 0 || runners.size() > 100) { release.trigger(); } @@ -2307,7 +2325,8 @@ private: // If the lock was not already released, add it to the runners future queue if (lock.promise.canBeSet()) { - self->addRunner(lock); + self->workerCounts[priority] += 1; + self->addRunner(lock, priority); // A slot has been consumed, so stop reading from this queue if there aren't any more if (--self->available == 0) { From 1eb8b8ba6e1684ac68154d8279a54ac9cfad0d0c Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 25 Jul 2022 23:25:08 -0700 Subject: [PATCH 012/216] getInitialDataDistribution; simplify mgs keyServers --- fdbserver/DDTxnProcessor.actor.cpp | 51 +++++++++++++++++++ fdbserver/MockGlobalState.cpp | 20 ++++++-- fdbserver/include/fdbserver/DDTxnProcessor.h | 24 +++++++-- fdbserver/include/fdbserver/MockGlobalState.h | 31 ++++++++--- 4 files changed, 112 insertions(+), 14 deletions(-) diff --git a/fdbserver/DDTxnProcessor.actor.cpp b/fdbserver/DDTxnProcessor.actor.cpp index a9be37ea86..5e98d8f4e4 100644 --- a/fdbserver/DDTxnProcessor.actor.cpp +++ b/fdbserver/DDTxnProcessor.actor.cpp @@ -431,3 +431,54 @@ Future> DDTxnProcessor::getInitialDataDistrib Future DDTxnProcessor::waitForDataDistributionEnabled(const DDEnabledState* ddEnabledState) const { return DDTxnProcessorImpl::waitForDataDistributionEnabled(cx, ddEnabledState); } + +Future>> +DDMockTxnProcessor::getServerListAndProcessClasses() { + std::vector> res; + for (auto& [_, mss] : mgs->servers) { + res.emplace_back(mss.ssi, ProcessClass(ProcessClass::StorageClass, ProcessClass::DBSource)); + } + return res; +} + +std::set> DDMockTxnProcessor::getPrimaryTeams() const { + std::set> res; + for (auto& [idx, 
team] : mgs->teams) { + res.emplace(team.getServerIds()); + } + return res; +} + +std::vector DDMockTxnProcessor::getDDShardInfos() const { + std::vector res; + res.reserve(mgs->keyServers.size() - 1); + for (auto& [beginK, value] : mgs->keyServers) { + if (beginK == allKeys.end) + break; + + // FIXME: now just use anonymousShardId + DDShardInfo info(beginK, anonymousShardId, anonymousShardId); + info.primarySrc = mgs->teams.at(value.srcIdx).getServerIds(); + if (value.destIdx.present()) { + info.primaryDest = mgs->teams.at(value.destIdx.get()).getServerIds(); + info.hasDest = true; + } + } + res.emplace_back(allKeys.end); +} + +Future> DDMockTxnProcessor::getInitialDataDistribution( + const UID& distributorId, + const MoveKeysLock& moveKeysLock, + const std::vector>& remoteDcIds, + const DDEnabledState* ddEnabledState) { + + // FIXME: now we just ignore ddEnabledState and moveKeysLock, will fix it in the future + Reference res = makeReference(); + res->mode = 1; + res->allServers = getServerListAndProcessClasses().get(); + // TODO: consider remote region setting. 
For now assume all server is in primary dc + res->shards = getDDShardInfos(); + res->primaryTeams = getPrimaryTeams(); + return res; +} diff --git a/fdbserver/MockGlobalState.cpp b/fdbserver/MockGlobalState.cpp index a1a882ffe1..d7364cbcb5 100644 --- a/fdbserver/MockGlobalState.cpp +++ b/fdbserver/MockGlobalState.cpp @@ -23,11 +23,21 @@ void MockGlobalState::initialAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace) { ASSERT(conf.storageTeamSize > 0); configuration = conf; - std::vector allServers; - for(int i = 1; i <= conf.storageTeamSize; ++ i) { - allServers.emplace_back(UID(i, 0)); - servers[allServers.back()] = MockStorageServer(allServers.back(), defaultDiskSpace); + Team seedTeam; + seedTeam.teamIdx = 1; + for (int i = 1; i <= conf.storageTeamSize; ++i) { + seedTeam.serverIdx.emplace_back(i); + servers[i] = MockStorageServer(indexToUID(i), defaultDiskSpace); } - keyServers.insert(allKeys.begin, allServers); + teams[seedTeam.teamIdx] = seedTeam; + keyServers[allKeys.begin] = { seedTeam.teamIdx, Optional() }; + keyServers[allKeys.end] = { 0, Optional() }; } +std::vector MockGlobalState::Team::getServerIds() const { + std::vector res(serverIdx.size()); + for (int i = 0; i < serverIdx.size(); ++i) { + res[i] = indexToUID(serverIdx.at(i)); + } + return res; +} diff --git a/fdbserver/include/fdbserver/DDTxnProcessor.h b/fdbserver/include/fdbserver/DDTxnProcessor.h index 8926a51b10..3bd92ac74f 100644 --- a/fdbserver/include/fdbserver/DDTxnProcessor.h +++ b/fdbserver/include/fdbserver/DDTxnProcessor.h @@ -23,8 +23,10 @@ #include "fdbserver/Knobs.h" #include "fdbserver/MoveKeys.actor.h" +#include "fdbserver/MockGlobalState.h" struct InitialDataDistribution; +struct DDShardInfo; /* Testability Contract: * a. The DataDistributor has to use this interface to interact with data-plane (aka. 
run transaction), because the @@ -37,7 +39,7 @@ public: std::vector srcServers, completeSources; // the same as RelocateData.src, RelocateData.completeSources; }; // get the source server list and complete source server list for range - virtual Future getSourceServersForRange(const KeyRangeRef range) = 0; + virtual Future getSourceServersForRange(const KeyRangeRef range) { return SourceServers{}; }; // get the storage server list and Process class virtual Future>> getServerListAndProcessClasses() = 0; @@ -60,7 +62,7 @@ public: return Void(); } - virtual Future waitForDataDistributionEnabled(const DDEnabledState* ddEnabledState) const = 0; + virtual Future waitForDataDistributionEnabled(const DDEnabledState* ddEnabledState) const { return Void(); }; }; class DDTxnProcessorImpl; @@ -99,6 +101,22 @@ public: // A mock transaction implementation for test usage. // Contract: every function involving mock transaction should return immediately to mimic the ACI property of real // transaction. -class DDMockTxnProcessor : public IDDTxnProcessor {}; +class DDMockTxnProcessor : public IDDTxnProcessor { + std::shared_ptr mgs; + + std::vector getDDShardInfos() const; + std::set> getPrimaryTeams() const; + +public: + explicit DDMockTxnProcessor(std::shared_ptr mgs = nullptr) : mgs(mgs){}; + + Future>> getServerListAndProcessClasses() override; + + Future> getInitialDataDistribution( + const UID& distributorId, + const MoveKeysLock& moveKeysLock, + const std::vector>& remoteDcIds, + const DDEnabledState* ddEnabledState) override; +}; #endif // FOUNDATIONDB_DDTXNPROCESSOR_H diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index 201370c4f7..1d53e74741 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -18,8 +18,8 @@ * limitations under the License. 
*/ -#ifndef FOUNDATIONDB_MOCKGLOBALSTATE_H_H -#define FOUNDATIONDB_MOCKGLOBALSTATE_H_H +#ifndef FOUNDATIONDB_MOCKGLOBALSTATE_H +#define FOUNDATIONDB_MOCKGLOBALSTATE_H #include "StorageMetrics.h" #include "fdbclient/KeyRangeMap.h" @@ -42,21 +42,40 @@ public: MockStorageServer() = default; MockStorageServer(const UID& id, uint64_t availableDiskSpace, uint64_t usedDiskSpace = 0) - : usedDiskSpace(usedDiskSpace), availableDiskSpace(availableDiskSpace), id(id) {} + : usedDiskSpace(usedDiskSpace), availableDiskSpace(availableDiskSpace), id(id) { + ssi.uniqueID = id; + } }; class MockGlobalState { public: - KeyRangeMap> keyServers; // a shard belongs to which servers - std::map servers; // all mock servers + // Index starting from 1. 0 indicates invalid index; + typedef uint32_t TeamIndex; + typedef uint64_t ServerIndex; + struct Team { + TeamIndex teamIdx; + std::vector serverIdx; + + std::vector getServerIds() const; + }; + + struct ShardTeamValue { + TeamIndex srcIdx; + Optional destIdx; + }; + + std::map keyServers; // a shard belongs to which teams, key is the beginning key of a shard + std::map servers; // all mock servers + std::map teams; DatabaseConfiguration configuration; // user defined parameters for mock workload purpose double emptyProb; // probability of doing an empty read uint32_t minByteSize, maxByteSize; // the size band of a point data operation + static UID indexToUID(uint64_t a) { return UID(a, a); } void initialAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace = 1000LL * 1024 * 1024 * 1024); }; -#endif // FOUNDATIONDB_MOCKGLOBALSTATE_H_H +#endif // FOUNDATIONDB_MOCKGLOBALSTATE_H From 32ab7421e1c396ce11233d7b1858810cee63c321 Mon Sep 17 00:00:00 2001 From: Fuheng Zhao Date: Tue, 26 Jul 2022 10:12:54 -0700 Subject: [PATCH 013/216] add readtype to requests --- fdbclient/NativeAPI.actor.cpp | 4 +- .../fdbclient/StorageServerInterface.h | 18 ++++---- fdbserver/StorageCache.actor.cpp | 2 +- fdbserver/storageserver.actor.cpp | 
41 ++++++++----------- 4 files changed, 28 insertions(+), 37 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 481e74348b..126a509ad0 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -4264,7 +4264,7 @@ Future getRange(Reference trState, req.arena.dependsOn(mapper.arena()); setMatchIndex(req, matchIndex); req.tenantInfo = useTenant ? trState->getTenantInfo() : TenantInfo(); - req.isFetchKeys = (trState->readType == ReadType::FETCH); + req.readType = trState->readType; req.version = readVersion; trState->cx->getLatestCommitVersions( @@ -4721,7 +4721,7 @@ ACTOR Future getRangeStreamFragment(Reference trState, req.limitBytes = std::numeric_limits::max(); // it is used to inform the storage that the rangeRead is for Fetch // req.isFetchKeys = (trState->readType == ReadType::FETCH); - req.isFetchKeys = false; + req.readType = trState->readType; trState->cx->getLatestCommitVersions( locations[shard].locations, req.version, trState, req.ssLatestCommitVersions); diff --git a/fdbclient/include/fdbclient/StorageServerInterface.h b/fdbclient/include/fdbclient/StorageServerInterface.h index febbc1311b..6db0ef6bd7 100644 --- a/fdbclient/include/fdbclient/StorageServerInterface.h +++ b/fdbclient/include/fdbclient/StorageServerInterface.h @@ -370,7 +370,7 @@ struct GetKeyValuesRequest : TimedRequest { KeyRef mapper = KeyRef(); Version version; // or latestVersion int limit, limitBytes; - bool isFetchKeys; + ReadType readType; Optional tags; Optional debugID; ReplyPromise reply; @@ -378,7 +378,7 @@ struct GetKeyValuesRequest : TimedRequest { // to this client, of all storage replicas that // serve the given key - GetKeyValuesRequest() : isFetchKeys(false) {} + GetKeyValuesRequest() : readType(ReadType::NORMAL) {} template void serialize(Ar& ar) { @@ -388,7 +388,7 @@ struct GetKeyValuesRequest : TimedRequest { version, limit, limitBytes, - isFetchKeys, + readType, tags, debugID, reply, @@ -427,7 +427,7 @@ 
struct GetMappedKeyValuesRequest : TimedRequest { Version version; // or latestVersion int limit, limitBytes; int matchIndex; - bool isFetchKeys; + ReadType readType; Optional tags; Optional debugID; ReplyPromise reply; @@ -435,7 +435,7 @@ struct GetMappedKeyValuesRequest : TimedRequest { // to this client, of all storage replicas that // serve the given key range - GetMappedKeyValuesRequest() : isFetchKeys(false) {} + GetMappedKeyValuesRequest() : readType(ReadType::NORMAL) {} template void serialize(Ar& ar) { serializer(ar, @@ -445,7 +445,7 @@ struct GetMappedKeyValuesRequest : TimedRequest { version, limit, limitBytes, - isFetchKeys, + readType, tags, debugID, reply, @@ -492,7 +492,7 @@ struct GetKeyValuesStreamRequest { KeySelectorRef begin, end; Version version; // or latestVersion int limit, limitBytes; - bool isFetchKeys; + ReadType readType; Optional tags; Optional debugID; ReplyPromiseStream reply; @@ -500,7 +500,7 @@ struct GetKeyValuesStreamRequest { // to this client, of all storage replicas that // serve the given key range - GetKeyValuesStreamRequest() : isFetchKeys(false) {} + GetKeyValuesStreamRequest() : readType(ReadType::NORMAL) {} template void serialize(Ar& ar) { @@ -510,7 +510,7 @@ struct GetKeyValuesStreamRequest { version, limit, limitBytes, - isFetchKeys, + readType, tags, debugID, reply, diff --git a/fdbserver/StorageCache.actor.cpp b/fdbserver/StorageCache.actor.cpp index 5565b8492f..604b9f8f83 100644 --- a/fdbserver/StorageCache.actor.cpp +++ b/fdbserver/StorageCache.actor.cpp @@ -735,7 +735,7 @@ ACTOR Future getKeyValues(StorageCacheData* data, GetKeyValuesRequest req) // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here TaskPriority taskType = TaskPriority::DefaultEndpoint; - if (SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && req.isFetchKeys) { + if (SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && req.readType == ReadType::FETCH) { taskType = TaskPriority::LowPriorityRead; // } 
else if (false) { // // Placeholder for up-prioritizing fetches for important requests diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 0df7536b27..88639da973 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -1171,14 +1171,13 @@ public: // Normally the storage server prefers to serve read requests over making mutations // durable to disk. However, when the storage server falls to far behind on // making mutations durable, this function will change the priority to prefer writes. - - int getQueryPriority() { + Future getQueryDelay() { if ((version.get() - durableVersion.get() > SERVER_KNOBS->LOW_PRIORITY_DURABILITY_LAG) || (queueSize() > SERVER_KNOBS->LOW_PRIORITY_STORAGE_QUEUE_BYTES)) { ++counters.lowPriorityQueries; - return (int)ReadType::LOW; + return delay(0, TaskPriority::LowPriorityRead); } - return (int)ReadType::NORMAL; + return delay(0, TaskPriority::DefaultEndpoint); } template @@ -1572,9 +1571,8 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait(delay(0, TaskPriority::DefaultEndpoint)); - state int rankIndex = data->getQueryPriority(); - wait(store(lock, data->ssLock.lock(data->readPriorityRanks[rankIndex]))); + wait(data->getQueryDelay()); + wait(store(lock, data->ssLock.lock(data->readPriorityRanks[(int)ReadType::NORMAL]))); if (req.debugID.present()) g_traceBatch.addEvent("GetValueDebug", @@ -3369,7 +3367,7 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) { state Span span("SS:getKeyValues"_loc, req.spanContext); state int64_t resultSize = 0; - state ReadType type = req.isFetchKeys ? 
ReadType::FETCH : ReadType::NORMAL; + state ReadType type = req.readType; if (req.tenantInfo.name.present()) { span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); @@ -3385,10 +3383,8 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait(delay(0, TaskPriority::DefaultEndpoint)); - state int rankIndex = - (req.isFetchKeys && SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY) ? (int)ReadType::FETCH : data->getQueryPriority(); - state PriorityMultiLock::Lock lock = wait(data->ssLock.lock(data->readPriorityRanks[rankIndex])); + wait(data->getQueryDelay()); + state PriorityMultiLock::Lock lock = wait(data->ssLock.lock(data->readPriorityRanks[(int)type])); try { if (req.debugID.present()) @@ -3574,7 +3570,7 @@ ACTOR Future quickGetKeyValues( // TODO: Use remainingLimit, remainingLimitBytes rather than separate knobs. req.limit = SERVER_KNOBS->QUICK_GET_KEY_VALUES_LIMIT; req.limitBytes = SERVER_KNOBS->QUICK_GET_KEY_VALUES_LIMIT_BYTES; - req.isFetchKeys = false; + req.readType = ReadType::NORMAL; req.tags = pOriginalReq->tags; req.ssLatestCommitVersions = VersionVector(); req.debugID = pOriginalReq->debugID; @@ -4087,7 +4083,7 @@ ACTOR Future getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe { state Span span("SS:getMappedKeyValues"_loc, req.spanContext); state int64_t resultSize = 0; - state ReadType type = req.isFetchKeys ? 
ReadType::FETCH : ReadType::NORMAL; + state ReadType type = req.readType; if (req.tenantInfo.name.present()) { span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); @@ -4103,10 +4099,8 @@ ACTOR Future getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait(delay(0, TaskPriority::DefaultEndpoint)); - state int rankIndex = - (req.isFetchKeys && SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY) ? (int)ReadType::FETCH : data->getQueryPriority(); - state PriorityMultiLock::Lock lock = wait(data->ssLock.lock(data->readPriorityRanks[rankIndex])); + wait(data->getQueryDelay()); + state PriorityMultiLock::Lock lock = wait(data->ssLock.lock(data->readPriorityRanks[(int)type])); try { if (req.debugID.present()) @@ -4293,10 +4287,8 @@ ACTOR Future getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe { state Span span("SS:getKeyValuesStream"_loc, req.spanContext); state int64_t resultSize = 0; - state ReadType type = req.isFetchKeys ? ReadType::FETCH : ReadType::NORMAL; - state int rankIndex = - (req.isFetchKeys && SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY) ? 
(int)ReadType::FETCH : (int)ReadType::NORMAL; - state int readPriority = data->readPriorityRanks[rankIndex]; + state ReadType type = req.readType; + state int readPriority = data->readPriorityRanks[(int)type]; if (req.tenantInfo.name.present()) { span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); @@ -4507,9 +4499,8 @@ ACTOR Future getKeyQ(StorageServer* data, GetKeyRequest req) { // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait(delay(0, TaskPriority::DefaultEndpoint)); - state int rankIndex = data->getQueryPriority(); - wait(store(lock, data->ssLock.lock(data->readPriorityRanks[rankIndex]))); + wait(data->getQueryDelay()); + wait(store(lock, data->ssLock.lock(data->readPriorityRanks[(int)ReadType::NORMAL]))); try { Version commitVersion = getLatestCommitVersion(req.ssLatestCommitVersions, data->tag); From 53d1e1abf37c05f6720fbb9eb522846a00a06795 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 26 Jul 2022 10:15:51 -0700 Subject: [PATCH 014/216] create ShardsAffectedByTeamFailure files --- fdbserver/DataDistributionTracker.actor.cpp | 179 +---------------- fdbserver/ShardsAffectedByTeamFailure.cpp | 185 ++++++++++++++++++ .../fdbserver/DataDistribution.actor.h | 78 +------- .../fdbserver/ShardsAffectedByTeamFailure.h | 104 ++++++++++ flow/include/flow/FastRef.h | 1 + 5 files changed, 298 insertions(+), 249 deletions(-) create mode 100644 fdbserver/ShardsAffectedByTeamFailure.cpp create mode 100644 fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index 412bf05a69..1dde048f7a 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -866,7 +866,8 @@ ACTOR Future fetchTopKShardMetrics_impl(DataDistributionTracker* self, Get } if (metrics.bytesReadPerKSecond > 0) { - minReadLoad = std::min(metrics.bytesReadPerKSecond, 
std::max((decltype(minReadLoad))0, minReadLoad)); + minReadLoad = + std::min(metrics.bytesReadPerKSecond, std::max((decltype(minReadLoad))0, minReadLoad)); maxReadLoad = std::max(metrics.bytesReadPerKSecond, maxReadLoad); if (req.minBytesReadPerKSecond <= metrics.bytesReadPerKSecond && metrics.bytesReadPerKSecond <= req.maxBytesReadPerKSecond) { @@ -1060,178 +1061,13 @@ ACTOR Future dataDistributionTracker(Reference in } } -std::vector ShardsAffectedByTeamFailure::getShardsFor(Team team) const { - std::vector r; - for (auto it = team_shards.lower_bound(std::pair(team, KeyRangeRef())); - it != team_shards.end() && it->first == team; - ++it) - r.push_back(it->second); - return r; -} - -bool ShardsAffectedByTeamFailure::hasShards(Team team) const { - auto it = team_shards.lower_bound(std::pair(team, KeyRangeRef())); - return it != team_shards.end() && it->first == team; -} - -int ShardsAffectedByTeamFailure::getNumberOfShards(UID ssID) const { - auto it = storageServerShards.find(ssID); - return it == storageServerShards.end() ? 
0 : it->second; -} - -std::pair, std::vector> -ShardsAffectedByTeamFailure::getTeamsFor(KeyRangeRef keys) { - return shard_teams[keys.begin]; -} - -void ShardsAffectedByTeamFailure::erase(Team team, KeyRange const& range) { - if (team_shards.erase(std::pair(team, range)) > 0) { - for (auto uid = team.servers.begin(); uid != team.servers.end(); ++uid) { - // Safeguard against going negative after eraseServer() sets value to 0 - if (storageServerShards[*uid] > 0) { - storageServerShards[*uid]--; - } - } - } -} - -void ShardsAffectedByTeamFailure::insert(Team team, KeyRange const& range) { - if (team_shards.insert(std::pair(team, range)).second) { - for (auto uid = team.servers.begin(); uid != team.servers.end(); ++uid) - storageServerShards[*uid]++; - } -} - -void ShardsAffectedByTeamFailure::defineShard(KeyRangeRef keys) { - std::vector teams; - std::vector prevTeams; - auto rs = shard_teams.intersectingRanges(keys); - for (auto it = rs.begin(); it != rs.end(); ++it) { - for (auto t = it->value().first.begin(); t != it->value().first.end(); ++t) { - teams.push_back(*t); - erase(*t, it->range()); - } - for (auto t = it->value().second.begin(); t != it->value().second.end(); ++t) { - prevTeams.push_back(*t); - } - } - uniquify(teams); - uniquify(prevTeams); - - /*TraceEvent("ShardsAffectedByTeamFailureDefine") - .detail("KeyBegin", keys.begin) - .detail("KeyEnd", keys.end) - .detail("TeamCount", teams.size());*/ - - auto affectedRanges = shard_teams.getAffectedRangesAfterInsertion(keys); - shard_teams.insert(keys, std::make_pair(teams, prevTeams)); - - for (auto r = affectedRanges.begin(); r != affectedRanges.end(); ++r) { - auto& t = shard_teams[r->begin]; - for (auto it = t.first.begin(); it != t.first.end(); ++it) { - insert(*it, *r); - } - } - check(); -} - -// Move keys to destinationTeams by updating shard_teams -void ShardsAffectedByTeamFailure::moveShard(KeyRangeRef keys, std::vector destinationTeams) { - /*TraceEvent("ShardsAffectedByTeamFailureMove") - 
.detail("KeyBegin", keys.begin) - .detail("KeyEnd", keys.end) - .detail("NewTeamSize", destinationTeam.size()) - .detail("NewTeam", describe(destinationTeam));*/ - - auto ranges = shard_teams.intersectingRanges(keys); - std::vector, std::vector>, KeyRange>> modifiedShards; - for (auto it = ranges.begin(); it != ranges.end(); ++it) { - if (keys.contains(it->range())) { - // erase the many teams that were associated with this one shard - for (auto t = it->value().first.begin(); t != it->value().first.end(); ++t) { - erase(*t, it->range()); - } - - // save this modification for later insertion - std::vector prevTeams = it->value().second; - prevTeams.insert(prevTeams.end(), it->value().first.begin(), it->value().first.end()); - uniquify(prevTeams); - - modifiedShards.push_back(std::pair, std::vector>, KeyRange>( - std::make_pair(destinationTeams, prevTeams), it->range())); - } else { - // for each range that touches this move, add our team as affecting this range - for (auto& team : destinationTeams) { - insert(team, it->range()); - } - - // if we are not in the list of teams associated with this shard, add us in - auto& teams = it->value(); - teams.second.insert(teams.second.end(), teams.first.begin(), teams.first.end()); - uniquify(teams.second); - - teams.first.insert(teams.first.end(), destinationTeams.begin(), destinationTeams.end()); - uniquify(teams.first); - } - } - - // we cannot modify the KeyRangeMap while iterating through it, so add saved modifications now - for (int i = 0; i < modifiedShards.size(); i++) { - for (auto& t : modifiedShards[i].first.first) { - insert(t, modifiedShards[i].second); - } - shard_teams.insert(modifiedShards[i].second, modifiedShards[i].first); - } - - check(); -} - -void ShardsAffectedByTeamFailure::finishMove(KeyRangeRef keys) { - auto ranges = shard_teams.containedRanges(keys); - for (auto it = ranges.begin(); it != ranges.end(); ++it) { - it.value().second.clear(); - } -} - -void ShardsAffectedByTeamFailure::check() const { - 
if (EXPENSIVE_VALIDATION) { - for (auto t = team_shards.begin(); t != team_shards.end(); ++t) { - auto i = shard_teams.rangeContaining(t->second.begin); - if (i->range() != t->second || !std::count(i->value().first.begin(), i->value().first.end(), t->first)) { - ASSERT(false); - } - } - auto rs = shard_teams.ranges(); - for (auto i = rs.begin(); i != rs.end(); ++i) { - for (auto t = i->value().first.begin(); t != i->value().first.end(); ++t) { - if (!team_shards.count(std::make_pair(*t, i->range()))) { - std::string teamDesc, shards; - for (int k = 0; k < t->servers.size(); k++) - teamDesc += format("%llx ", t->servers[k].first()); - for (auto x = team_shards.lower_bound(std::make_pair(*t, KeyRangeRef())); - x != team_shards.end() && x->first == *t; - ++x) - shards += printable(x->second.begin) + "-" + printable(x->second.end) + ","; - TraceEvent(SevError, "SATFInvariantError2") - .detail("KB", i->begin()) - .detail("KE", i->end()) - .detail("Team", teamDesc) - .detail("Shards", shards); - ASSERT(false); - } - } - } - } -} - -namespace data_distribution_test { -} // namespace data_distribution_test +namespace data_distribution_test {} // namespace data_distribution_test TEST_CASE("/DataDistributor/Tracker/FetchTopK") { state DataDistributionTracker self; state GetTopKMetricsRequest req; req.topK = 3; - for(int i = 1; i <= 10; i += 2) { - KeyRange keys(KeyRangeRef(doubleToTestKey(i), doubleToTestKey(i+2))); + for (int i = 1; i <= 10; i += 2) { + KeyRange keys(KeyRangeRef(doubleToTestKey(i), doubleToTestKey(i + 2))); req.keys.push_back(keys); // std::cout << "here: " << req.keys.back().begin.toString() << "\n"; } @@ -1239,9 +1075,8 @@ TEST_CASE("/DataDistributor/Tracker/FetchTopK") { req.minBytesReadPerKSecond = 10000; self.shards = std::make_shared>(); - double targetDensities[10] = {2, 1, 3, 5, 4, 10, 6, 8, 7, 0}; - for(int i = 0; i <= 5; ++ i) { - + double targetDensities[10] = { 2, 1, 3, 5, 4, 10, 6, 8, 7, 0 }; + for (int i = 0; i <= 5; ++i) { } 
wait(fetchTopKShardMetrics_impl(&self, req)); auto& reply = req.reply.getFuture().get(); diff --git a/fdbserver/ShardsAffectedByTeamFailure.cpp b/fdbserver/ShardsAffectedByTeamFailure.cpp new file mode 100644 index 0000000000..aa7bbcd55d --- /dev/null +++ b/fdbserver/ShardsAffectedByTeamFailure.cpp @@ -0,0 +1,185 @@ +/* + * ShardsAffectedByTeamFailure.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbserver/ShardsAffectedByTeamFailure.h" + +std::vector ShardsAffectedByTeamFailure::getShardsFor(Team team) const { + std::vector r; + for (auto it = team_shards.lower_bound(std::pair(team, KeyRangeRef())); + it != team_shards.end() && it->first == team; + ++it) + r.push_back(it->second); + return r; +} + +bool ShardsAffectedByTeamFailure::hasShards(Team team) const { + auto it = team_shards.lower_bound(std::pair(team, KeyRangeRef())); + return it != team_shards.end() && it->first == team; +} + +int ShardsAffectedByTeamFailure::getNumberOfShards(UID ssID) const { + auto it = storageServerShards.find(ssID); + return it == storageServerShards.end() ? 
0 : it->second; +} + +std::pair, std::vector> +ShardsAffectedByTeamFailure::getTeamsFor(KeyRangeRef keys) { + return shard_teams[keys.begin]; +} + +void ShardsAffectedByTeamFailure::erase(Team team, KeyRange const& range) { + if (team_shards.erase(std::pair(team, range)) > 0) { + for (auto uid = team.servers.begin(); uid != team.servers.end(); ++uid) { + // Safeguard against going negative after eraseServer() sets value to 0 + if (storageServerShards[*uid] > 0) { + storageServerShards[*uid]--; + } + } + } +} + +void ShardsAffectedByTeamFailure::insert(Team team, KeyRange const& range) { + if (team_shards.insert(std::pair(team, range)).second) { + for (auto uid = team.servers.begin(); uid != team.servers.end(); ++uid) + storageServerShards[*uid]++; + } +} + +void ShardsAffectedByTeamFailure::defineShard(KeyRangeRef keys) { + std::vector teams; + std::vector prevTeams; + auto rs = shard_teams.intersectingRanges(keys); + for (auto it = rs.begin(); it != rs.end(); ++it) { + for (auto t = it->value().first.begin(); t != it->value().first.end(); ++t) { + teams.push_back(*t); + erase(*t, it->range()); + } + for (auto t = it->value().second.begin(); t != it->value().second.end(); ++t) { + prevTeams.push_back(*t); + } + } + uniquify(teams); + uniquify(prevTeams); + + /*TraceEvent("ShardsAffectedByTeamFailureDefine") + .detail("KeyBegin", keys.begin) + .detail("KeyEnd", keys.end) + .detail("TeamCount", teams.size());*/ + + auto affectedRanges = shard_teams.getAffectedRangesAfterInsertion(keys); + shard_teams.insert(keys, std::make_pair(teams, prevTeams)); + + for (auto r = affectedRanges.begin(); r != affectedRanges.end(); ++r) { + auto& t = shard_teams[r->begin]; + for (auto it = t.first.begin(); it != t.first.end(); ++it) { + insert(*it, *r); + } + } + check(); +} + +// Move keys to destinationTeams by updating shard_teams +void ShardsAffectedByTeamFailure::moveShard(KeyRangeRef keys, std::vector destinationTeams) { + /*TraceEvent("ShardsAffectedByTeamFailureMove") + 
.detail("KeyBegin", keys.begin) + .detail("KeyEnd", keys.end) + .detail("NewTeamSize", destinationTeam.size()) + .detail("NewTeam", describe(destinationTeam));*/ + + auto ranges = shard_teams.intersectingRanges(keys); + std::vector, std::vector>, KeyRange>> modifiedShards; + for (auto it = ranges.begin(); it != ranges.end(); ++it) { + if (keys.contains(it->range())) { + // erase the many teams that were associated with this one shard + for (auto t = it->value().first.begin(); t != it->value().first.end(); ++t) { + erase(*t, it->range()); + } + + // save this modification for later insertion + std::vector prevTeams = it->value().second; + prevTeams.insert(prevTeams.end(), it->value().first.begin(), it->value().first.end()); + uniquify(prevTeams); + + modifiedShards.push_back(std::pair, std::vector>, KeyRange>( + std::make_pair(destinationTeams, prevTeams), it->range())); + } else { + // for each range that touches this move, add our team as affecting this range + for (auto& team : destinationTeams) { + insert(team, it->range()); + } + + // if we are not in the list of teams associated with this shard, add us in + auto& teams = it->value(); + teams.second.insert(teams.second.end(), teams.first.begin(), teams.first.end()); + uniquify(teams.second); + + teams.first.insert(teams.first.end(), destinationTeams.begin(), destinationTeams.end()); + uniquify(teams.first); + } + } + + // we cannot modify the KeyRangeMap while iterating through it, so add saved modifications now + for (int i = 0; i < modifiedShards.size(); i++) { + for (auto& t : modifiedShards[i].first.first) { + insert(t, modifiedShards[i].second); + } + shard_teams.insert(modifiedShards[i].second, modifiedShards[i].first); + } + + check(); +} + +void ShardsAffectedByTeamFailure::finishMove(KeyRangeRef keys) { + auto ranges = shard_teams.containedRanges(keys); + for (auto it = ranges.begin(); it != ranges.end(); ++it) { + it.value().second.clear(); + } +} + +void ShardsAffectedByTeamFailure::check() const { + 
if (EXPENSIVE_VALIDATION) { + for (auto t = team_shards.begin(); t != team_shards.end(); ++t) { + auto i = shard_teams.rangeContaining(t->second.begin); + if (i->range() != t->second || !std::count(i->value().first.begin(), i->value().first.end(), t->first)) { + ASSERT(false); + } + } + auto rs = shard_teams.ranges(); + for (auto i = rs.begin(); i != rs.end(); ++i) { + for (auto t = i->value().first.begin(); t != i->value().first.end(); ++t) { + if (!team_shards.count(std::make_pair(*t, i->range()))) { + std::string teamDesc, shards; + for (int k = 0; k < t->servers.size(); k++) + teamDesc += format("%llx ", t->servers[k].first()); + for (auto x = team_shards.lower_bound(std::make_pair(*t, KeyRangeRef())); + x != team_shards.end() && x->first == *t; + ++x) + shards += printable(x->second.begin) + "-" + printable(x->second.end) + ","; + TraceEvent(SevError, "SATFInvariantError2") + .detail("KB", i->begin()) + .detail("KE", i->end()) + .detail("Team", teamDesc) + .detail("Shards", shards); + ASSERT(false); + } + } + } + } +} \ No newline at end of file diff --git a/fdbserver/include/fdbserver/DataDistribution.actor.h b/fdbserver/include/fdbserver/DataDistribution.actor.h index 08a3a58ec8..5a270f84f0 100644 --- a/fdbserver/include/fdbserver/DataDistribution.actor.h +++ b/fdbserver/include/fdbserver/DataDistribution.actor.h @@ -27,12 +27,12 @@ #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/RunTransaction.actor.h" #include "fdbserver/DDTxnProcessor.h" +#include "fdbserver/ShardsAffectedByTeamFailure.h" #include "fdbserver/Knobs.h" #include "fdbserver/LogSystem.h" #include "fdbserver/MoveKeys.actor.h" #include #include - #include "flow/actorcompiler.h" // This must be the last #include. 
enum class RelocateReason { INVALID = -1, OTHER, REBALANCE_DISK, REBALANCE_READ }; @@ -282,82 +282,6 @@ struct TeamCollectionInterface { PromiseStream getTeam; }; -class ShardsAffectedByTeamFailure : public ReferenceCounted { -public: - ShardsAffectedByTeamFailure() {} - - struct Team { - std::vector servers; // sorted - bool primary; - - Team() : primary(true) {} - Team(std::vector const& servers, bool primary) : servers(servers), primary(primary) {} - - bool operator<(const Team& r) const { - if (servers == r.servers) - return primary < r.primary; - return servers < r.servers; - } - bool operator>(const Team& r) const { return r < *this; } - bool operator<=(const Team& r) const { return !(*this > r); } - bool operator>=(const Team& r) const { return !(*this < r); } - bool operator==(const Team& r) const { return servers == r.servers && primary == r.primary; } - bool operator!=(const Team& r) const { return !(*this == r); } - - std::string toString() const { return describe(servers); }; - }; - - // This tracks the data distribution on the data distribution server so that teamTrackers can - // relocate the right shards when a team is degraded. - - // The following are important to make sure that failure responses don't revert splits or merges: - // - The shards boundaries in the two data structures reflect "queued" RelocateShard requests - // (i.e. reflects the desired set of shards being tracked by dataDistributionTracker, - // rather than the status quo). These boundaries are modified in defineShard and the content - // of what servers correspond to each shard is a copy or union of the shards already there - // - The teams associated with each shard reflect either the sources for non-moving shards - // or the destination team for in-flight shards (the change is atomic with respect to team selection). - // moveShard() changes the servers associated with a shard and will never adjust the shard - // boundaries. 
If a move is received for a shard that has been redefined (the exact shard is - // no longer in the map), the servers will be set for all contained shards and added to all - // intersecting shards. - - int getNumberOfShards(UID ssID) const; - std::vector getShardsFor(Team team) const; - bool hasShards(Team team) const; - - // The first element of the pair is either the source for non-moving shards or the destination team for in-flight - // shards The second element of the pair is all previous sources for in-flight shards - std::pair, std::vector> getTeamsFor(KeyRangeRef keys); - - void defineShard(KeyRangeRef keys); - void moveShard(KeyRangeRef keys, std::vector destinationTeam); - void finishMove(KeyRangeRef keys); - void check() const; - - PromiseStream restartShardTracker; - -private: - struct OrderByTeamKey { - bool operator()(const std::pair& lhs, const std::pair& rhs) const { - if (lhs.first < rhs.first) - return true; - if (lhs.first > rhs.first) - return false; - return lhs.second.begin < rhs.second.begin; - } - }; - - KeyRangeMap, std::vector>> - shard_teams; // A shard can be affected by the failure of multiple teams if it is a queued merge, or when - // usable_regions > 1 - std::set, OrderByTeamKey> team_shards; - std::map storageServerShards; - - void erase(Team team, KeyRange const& range); - void insert(Team team, KeyRange const& range); -}; - // DDShardInfo is so named to avoid link-time name collision with ShardInfo within the StorageServer struct DDShardInfo { Key key; diff --git a/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h b/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h new file mode 100644 index 0000000000..542fe1e736 --- /dev/null +++ b/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h @@ -0,0 +1,104 @@ +/* + * ShardsAffectedByTeamFailure.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef FOUNDATIONDB_SHARDSAFFECTEDBYTEAMFAILURE_H +#define FOUNDATIONDB_SHARDSAFFECTEDBYTEAMFAILURE_H + +#include "flow/FastRef.h" +#include "flow/IRandom.h" +#include "fdbclient/FDBTypes.h" +#include "fdbclient/KeyRangeMap.h" + +class ShardsAffectedByTeamFailure : public ReferenceCounted { +public: + ShardsAffectedByTeamFailure() {} + + struct Team { + std::vector servers; // sorted + bool primary; + + Team() : primary(true) {} + Team(std::vector const& servers, bool primary) : servers(servers), primary(primary) {} + + bool operator<(const Team& r) const { + if (servers == r.servers) + return primary < r.primary; + return servers < r.servers; + } + bool operator>(const Team& r) const { return r < *this; } + bool operator<=(const Team& r) const { return !(*this > r); } + bool operator>=(const Team& r) const { return !(*this < r); } + bool operator==(const Team& r) const { return servers == r.servers && primary == r.primary; } + bool operator!=(const Team& r) const { return !(*this == r); } + + std::string toString() const { return describe(servers); }; + }; + + // This tracks the data distribution on the data distribution server so that teamTrackers can + // relocate the right shards when a team is degraded. 
+ + // The following are important to make sure that failure responses don't revert splits or merges: + // - The shards boundaries in the two data structures reflect "queued" RelocateShard requests + // (i.e. reflects the desired set of shards being tracked by dataDistributionTracker, + // rather than the status quo). These boundaries are modified in defineShard and the content + // of what servers correspond to each shard is a copy or union of the shards already there + // - The teams associated with each shard reflect either the sources for non-moving shards + // or the destination team for in-flight shards (the change is atomic with respect to team selection). + // moveShard() changes the servers associated with a shard and will never adjust the shard + // boundaries. If a move is received for a shard that has been redefined (the exact shard is + // no longer in the map), the servers will be set for all contained shards and added to all + // intersecting shards. + + int getNumberOfShards(UID ssID) const; + std::vector getShardsFor(Team team) const; + bool hasShards(Team team) const; + + // The first element of the pair is either the source for non-moving shards or the destination team for in-flight + // shards The second element of the pair is all previous sources for in-flight shards + std::pair, std::vector> getTeamsFor(KeyRangeRef keys); + + void defineShard(KeyRangeRef keys); + void moveShard(KeyRangeRef keys, std::vector destinationTeam); + void finishMove(KeyRangeRef keys); + void check() const; + + PromiseStream restartShardTracker; + +private: + struct OrderByTeamKey { + bool operator()(const std::pair& lhs, const std::pair& rhs) const { + if (lhs.first < rhs.first) + return true; + if (lhs.first > rhs.first) + return false; + return lhs.second.begin < rhs.second.begin; + } + }; + + KeyRangeMap, std::vector>> + shard_teams; // A shard can be affected by the failure of multiple teams if it is a queued merge, or when + // usable_regions > 1 + std::set, 
OrderByTeamKey> team_shards; + std::map storageServerShards; + + void erase(Team team, KeyRange const& range); + void insert(Team team, KeyRange const& range); +}; + +#endif // FOUNDATIONDB_SHARDSAFFECTEDBYTEAMFAILURE_H diff --git a/flow/include/flow/FastRef.h b/flow/include/flow/FastRef.h index 3c65cc6002..2e49b34de2 100644 --- a/flow/include/flow/FastRef.h +++ b/flow/include/flow/FastRef.h @@ -24,6 +24,7 @@ #include #include +#include // The thread safety this class provides is that it's safe to call addref and // delref on the same object concurrently in different threads. Subclass does From b1d4626e4010eb7515e692dc7923d965ac890c4e Mon Sep 17 00:00:00 2001 From: Fuheng Zhao Date: Tue, 26 Jul 2022 11:20:09 -0700 Subject: [PATCH 015/216] fix minor issue --- fdbserver/StorageCache.actor.cpp | 1 - fdbserver/storageserver.actor.cpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fdbserver/StorageCache.actor.cpp b/fdbserver/StorageCache.actor.cpp index 604b9f8f83..f00cb77cd2 100644 --- a/fdbserver/StorageCache.actor.cpp +++ b/fdbserver/StorageCache.actor.cpp @@ -1192,7 +1192,6 @@ ACTOR Future tryFetchRange(Database cx, ASSERT(!cx->switchable); tr.setVersion(version); - // tr.trState->taskID = TaskPriority::FetchKeys; tr.trState->readType = ReadType::FETCH; limits.minRows = 0; diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 88639da973..3edb57e57e 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -3570,7 +3570,7 @@ ACTOR Future quickGetKeyValues( // TODO: Use remainingLimit, remainingLimitBytes rather than separate knobs. 
req.limit = SERVER_KNOBS->QUICK_GET_KEY_VALUES_LIMIT; req.limitBytes = SERVER_KNOBS->QUICK_GET_KEY_VALUES_LIMIT_BYTES; - req.readType = ReadType::NORMAL; + req.readType = pOriginalReq->readType; req.tags = pOriginalReq->tags; req.ssLatestCommitVersions = VersionVector(); req.debugID = pOriginalReq->debugID; From dccf0c5fd775ebcee96326945c1691696859bcec Mon Sep 17 00:00:00 2001 From: Fuheng Zhao Date: Tue, 2 Aug 2022 15:47:15 -0700 Subject: [PATCH 016/216] add readType to other requests --- fdbclient/NativeAPI.actor.cpp | 3 +++ fdbclient/include/fdbclient/NativeAPI.actor.h | 1 + .../fdbclient/StorageServerInterface.h | 19 ++++++++----- fdbserver/storageserver.actor.cpp | 27 +++++++++++++++---- fdbserver/workloads/ReadWrite.actor.cpp | 4 ++- 5 files changed, 41 insertions(+), 13 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 126a509ad0..6aead7870d 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -3313,6 +3313,7 @@ ACTOR Future> getValue(Reference trState, useTenant ? trState->getTenantInfo() : TenantInfo(), key, ver, + trState->readType, trState->cx->sampleReadTags() ? trState->options.readTags : Optional(), getValueID, @@ -3438,6 +3439,7 @@ ACTOR Future getKey(Reference trState, useTenant ? trState->getTenantInfo() : TenantInfo(), k, version.get(), + trState->readType, trState->cx->sampleReadTags() ? 
trState->options.readTags : Optional(), getKeyID, ssLatestCommitVersions); @@ -3884,6 +3886,7 @@ Future getExactRange(Reference trState, req.version = version; req.begin = firstGreaterOrEqual(range.begin); req.end = firstGreaterOrEqual(range.end); + req.readType = trState->readType; setMatchIndex(req, matchIndex); req.spanContext = span.context; trState->cx->getLatestCommitVersions( diff --git a/fdbclient/include/fdbclient/NativeAPI.actor.h b/fdbclient/include/fdbclient/NativeAPI.actor.h index 7903b4481f..3601343379 100644 --- a/fdbclient/include/fdbclient/NativeAPI.actor.h +++ b/fdbclient/include/fdbclient/NativeAPI.actor.h @@ -451,6 +451,7 @@ public: Database getDatabase() const { return trState->cx; } static Reference createTrLogInfoProbabilistically(const Database& cx); + Transaction& getTransaction() { return *this; } void setTransactionID(UID id); void setToken(uint64_t token); diff --git a/fdbclient/include/fdbclient/StorageServerInterface.h b/fdbclient/include/fdbclient/StorageServerInterface.h index 6db0ef6bd7..b275aac489 100644 --- a/fdbclient/include/fdbclient/StorageServerInterface.h +++ b/fdbclient/include/fdbclient/StorageServerInterface.h @@ -276,6 +276,7 @@ struct GetValueRequest : TimedRequest { TenantInfo tenantInfo; Key key; Version version; + ReadType readType; Optional tags; Optional debugID; ReplyPromise reply; @@ -283,20 +284,21 @@ struct GetValueRequest : TimedRequest { // to this client, of all storage replicas that // serve the given key - GetValueRequest() {} + GetValueRequest() : readType(ReadType::NORMAL) {} GetValueRequest(SpanContext spanContext, const TenantInfo& tenantInfo, const Key& key, Version ver, + ReadType type, Optional tags, Optional debugID, VersionVector latestCommitVersions) - : spanContext(spanContext), tenantInfo(tenantInfo), key(key), version(ver), tags(tags), debugID(debugID), - ssLatestCommitVersions(latestCommitVersions) {} + : spanContext(spanContext), tenantInfo(tenantInfo), key(key), version(ver), 
readType(type), tags(tags), + debugID(debugID), ssLatestCommitVersions(latestCommitVersions) {} template void serialize(Ar& ar) { - serializer(ar, key, version, tags, debugID, reply, spanContext, tenantInfo, ssLatestCommitVersions); + serializer(ar, key, version, readType, tags, debugID, reply, spanContext, tenantInfo, ssLatestCommitVersions); } }; @@ -542,6 +544,7 @@ struct GetKeyRequest : TimedRequest { TenantInfo tenantInfo; KeySelectorRef sel; Version version; // or latestVersion + ReadType readType; Optional tags; Optional debugID; ReplyPromise reply; @@ -549,21 +552,23 @@ struct GetKeyRequest : TimedRequest { // to this client, of all storage replicas that // serve the given key - GetKeyRequest() {} + GetKeyRequest() : readType(ReadType::NORMAL) {} GetKeyRequest(SpanContext spanContext, TenantInfo tenantInfo, KeySelectorRef const& sel, Version version, + ReadType type, Optional tags, Optional debugID, VersionVector latestCommitVersions) - : spanContext(spanContext), tenantInfo(tenantInfo), sel(sel), version(version), debugID(debugID), + : spanContext(spanContext), tenantInfo(tenantInfo), sel(sel), version(version), readType(type), debugID(debugID), ssLatestCommitVersions(latestCommitVersions) {} template void serialize(Ar& ar) { - serializer(ar, sel, version, tags, debugID, reply, spanContext, tenantInfo, arena, ssLatestCommitVersions); + serializer( + ar, sel, version, readType, tags, debugID, reply, spanContext, tenantInfo, arena, ssLatestCommitVersions); } }; diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 3edb57e57e..f6ca78b80e 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -1572,7 +1572,7 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here wait(data->getQueryDelay()); - wait(store(lock, 
data->ssLock.lock(data->readPriorityRanks[(int)ReadType::NORMAL]))); + wait(store(lock, data->ssLock.lock(data->readPriorityRanks[(int)req.readType]))); if (req.debugID.present()) g_traceBatch.addEvent("GetValueDebug", @@ -1716,8 +1716,14 @@ ACTOR Future watchWaitForValueChange(StorageServer* data, SpanContext p state Version latest = data->version.get(); TEST(latest >= minVersion && latest < data->data().latestVersion); // Starting watch loop with latestVersion > data->version - GetValueRequest getReq( - span.context, TenantInfo(), metadata->key, latest, metadata->tags, metadata->debugID, VersionVector()); + GetValueRequest getReq(span.context, + TenantInfo(), + metadata->key, + latest, + ReadType::NORMAL, + metadata->tags, + metadata->debugID, + VersionVector()); state Future getValue = getValueQ( data, getReq); // we are relying on the delay zero at the top of getValueQ, if removed we need one here GetValueReply reply = wait(getReq.reply.getFuture()); @@ -2964,6 +2970,7 @@ ACTOR Future quickGetValue(StorageServer* data, pOriginalReq->tenantInfo, key, version, + ReadType::HIGH, pOriginalReq->tags, pOriginalReq->debugID, VersionVector()); @@ -3384,6 +3391,9 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here wait(data->getQueryDelay()); + if (!SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && type == ReadType::FETCH) { + type = ReadType::NORMAL; + } state PriorityMultiLock::Lock lock = wait(data->ssLock.lock(data->readPriorityRanks[(int)type])); try { @@ -4100,6 +4110,9 @@ ACTOR Future getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here wait(data->getQueryDelay()); + if (!SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && type == ReadType::FETCH) { + type = ReadType::NORMAL; + } state PriorityMultiLock::Lock 
lock = wait(data->ssLock.lock(data->readPriorityRanks[(int)type])); try { @@ -4288,7 +4301,6 @@ ACTOR Future getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe state Span span("SS:getKeyValuesStream"_loc, req.spanContext); state int64_t resultSize = 0; state ReadType type = req.readType; - state int readPriority = data->readPriorityRanks[(int)type]; if (req.tenantInfo.name.present()) { span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); @@ -4304,6 +4316,10 @@ ACTOR Future getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here wait(delay(0, TaskPriority::DefaultEndpoint)); + if (!SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && type == ReadType::FETCH) { + type = ReadType::NORMAL; + } + state int readPriority = data->readPriorityRanks[(int)type]; state PriorityMultiLock::Lock lock = wait(data->ssLock.lock(readPriority)); try { @@ -4500,7 +4516,7 @@ ACTOR Future getKeyQ(StorageServer* data, GetKeyRequest req) { // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here wait(data->getQueryDelay()); - wait(store(lock, data->ssLock.lock(data->readPriorityRanks[(int)ReadType::NORMAL]))); + wait(store(lock, data->ssLock.lock(data->readPriorityRanks[(int)req.readType]))); try { Version commitVersion = getLatestCommitVersion(req.ssLatestCommitVersions, data->tag); @@ -8925,6 +8941,7 @@ ACTOR Future serveWatchValueRequestsImpl(StorageServer* self, FutureStream TenantInfo(), metadata->key, latest, + ReadType::NORMAL, metadata->tags, metadata->debugID, VersionVector()); diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp index ef77eaae76..4afeaf369b 100644 --- a/fdbserver/workloads/ReadWrite.actor.cpp +++ b/fdbserver/workloads/ReadWrite.actor.cpp @@ -377,6 +377,7 @@ struct ReadWriteWorkload : ReadWriteCommon { bool adjacentReads; // 
keys are adjacent within a transaction bool adjacentWrites; int extraReadConflictRangesPerTransaction, extraWriteConflictRangesPerTransaction; + int readType; Optional transactionTag; int transactionsTagThrottled{ 0 }; @@ -401,6 +402,7 @@ struct ReadWriteWorkload : ReadWriteCommon { rampUpConcurrency = getOption(options, LiteralStringRef("rampUpConcurrency"), false); batchPriority = getOption(options, LiteralStringRef("batchPriority"), false); descriptionString = getOption(options, LiteralStringRef("description"), LiteralStringRef("ReadWrite")); + readType = getOption(options, LiteralStringRef("readType"), 3); if (hasOption(options, LiteralStringRef("transactionTag"))) { transactionTag = getOption(options, LiteralStringRef("transactionTag"), ""_sr); } @@ -430,6 +432,7 @@ struct ReadWriteWorkload : ReadWriteCommon { if (transactionTag.present() && tr.getTags().size() == 0) { tr.setOption(FDBTransactionOptions::AUTO_THROTTLE_TAG, transactionTag.get()); } + tr.getTransaction().trState->readType = static_cast(readType); } std::string description() const override { return descriptionString.toString(); } @@ -505,7 +508,6 @@ struct ReadWriteWorkload : ReadWriteCommon { state double startTime = now(); loop { state Transaction tr(cx); - try { self->setupTransaction(tr); wait(self->readOp(&tr, keys, self, false)); From aca76671ab1551ed10d399049ecc9c4c612103a7 Mon Sep 17 00:00:00 2001 From: Fuheng Zhao Date: Thu, 4 Aug 2022 13:17:44 -0700 Subject: [PATCH 017/216] pass req readType to storage --- fdbserver/storageserver.actor.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 8d90a1e9f4..3aa113bc17 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -1851,7 +1851,7 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { path = 1; } else if (!i || !i->isClearTo() || i->getEndKey() <= req.key) { path = 2; - Optional vv = 
wait(data->storage.readValue(req.key, ReadType::NORMAL, req.debugID)); + Optional vv = wait(data->storage.readValue(req.key, req.readType, req.debugID)); data->counters.kvGetBytes += vv.expectedSize(); // Validate that while we were reading the data we didn't lose the version or shard if (version < data->storageVersion()) { @@ -1961,6 +1961,7 @@ ACTOR Future watchWaitForValueChange(StorageServer* data, SpanContext p state Version latest = data->version.get(); CODE_PROBE(latest >= minVersion && latest < data->data().latestVersion, "Starting watch loop with latestVersion > data->version"); + // perhaps use a lower priority? GetValueRequest getReq(span.context, TenantInfo(), metadata->key, @@ -4845,8 +4846,7 @@ ACTOR Future getKeyQ(StorageServer* data, GetKeyRequest req) { KeyRangeRef searchRange = data->clampRangeToTenant(shard, tenantEntry, req.arena); state int offset; - Key absoluteKey = - wait(findKey(data, req.sel, version, searchRange, &offset, req.spanContext, ReadType::NORMAL)); + Key absoluteKey = wait(findKey(data, req.sel, version, searchRange, &offset, req.spanContext, req.readType)); data->checkChangeCounter(changeCounter, KeyRangeRef(std::min(req.sel.getKey(), absoluteKey), From 70ae25574475883708b21a648fa7a64c95bad345 Mon Sep 17 00:00:00 2001 From: Fuheng Zhao Date: Mon, 15 Aug 2022 11:34:09 -0700 Subject: [PATCH 018/216] add Redwood PML metrics --- fdbserver/VersionedBTree.actor.cpp | 46 +++++++++++++++++++++++++ flow/include/flow/genericactors.actor.h | 9 ++--- 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index d23dc01623..5653a6e869 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -1472,6 +1472,8 @@ struct RedwoodMetrics { kvSizeReadByGetRange = Reference( new Histogram(Reference(), "kvSize", "ReadByGetRange", Histogram::Unit::bytes)); + ioLock = nullptr; + // These histograms are used for Btree events, hence level > 0 
unsigned int levelCounter = 0; for (RedwoodMetrics::Level& level : levels) { @@ -1514,6 +1516,8 @@ struct RedwoodMetrics { // btree levels and one extra level for non btree level. Level levels[btreeLevels + 1]; metrics metric; + // pointer to the priority multi lock used in pager + PriorityMultiLock* ioLock; Reference kvSizeWritten; Reference kvSizeReadByGet; @@ -1568,9 +1572,12 @@ struct RedwoodMetrics { // The string is a reasonably well formatted page of information void getFields(TraceEvent* e, std::string* s = nullptr, bool skipZeroes = false); + void getIOLockFields(TraceEvent* e, std::string* s = nullptr); + std::string toString(bool clearAfter) { std::string s; getFields(nullptr, &s); + getIOLockFields(nullptr, &s); if (clearAfter) { clear(); @@ -1605,6 +1612,7 @@ ACTOR Future redwoodMetricsLogger() { double elapsed = now() - g_redwoodMetrics.startTime; e.detail("Elapsed", elapsed); g_redwoodMetrics.getFields(&e); + g_redwoodMetrics.getIOLockFields(&e); g_redwoodMetrics.clear(); } } @@ -2018,6 +2026,7 @@ public: if (!g_redwoodMetricsActor.isValid()) { g_redwoodMetricsActor = redwoodMetricsLogger(); } + g_redwoodMetrics.ioLock = &ioLock; commitFuture = Void(); recoverFuture = forwardError(recover(this), errorPromise); @@ -8427,6 +8436,43 @@ void RedwoodMetrics::getFields(TraceEvent* e, std::string* s, bool skipZeroes) { } } +void RedwoodMetrics::getIOLockFields(TraceEvent* e, std::string* s) { + if (ioLock == nullptr) + return; + + int maxPriority = ioLock->maxPriority(); + + if (e != nullptr) { + e->detail("ActiveReads", ioLock->totalWorkers()); + e->detail("AwaitReads", ioLock->totalWaiters()); + + for (int priority = 0; priority <= maxPriority; ++priority) { + e->detail(format("ActiveP%d", priority), ioLock->numWorkers(priority)); + e->detail(format("AwaitP%d", priority), ioLock->numWaiters(priority)); + } + } + + if (s != nullptr) { + std::string active = "Active"; + std::string await = "Await"; + + *s += "\n"; + *s += format("%-15s %-8u ", 
"ActiveReads", ioLock->totalWorkers()); + *s += format("%-15s %-8u ", "AwaitReads", ioLock->totalWaiters()); + *s += "\n"; + + for (int priority = 0; priority <= maxPriority; ++priority) { + *s += + format("%-15s %-8u ", (active + 'P' + std::to_string(priority)).c_str(), ioLock->numWorkers(priority)); + } + *s += "\n"; + for (int priority = 0; priority <= maxPriority; ++priority) { + *s += + format("%-15s %-8u ", (await + 'P' + std::to_string(priority)).c_str(), ioLock->numWaiters(priority)); + } + } +} + TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") { ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[0] == 3); ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[1] == 4); diff --git a/flow/include/flow/genericactors.actor.h b/flow/include/flow/genericactors.actor.h index c65d09d207..385f0e2365 100644 --- a/flow/include/flow/genericactors.actor.h +++ b/flow/include/flow/genericactors.actor.h @@ -2245,17 +2245,18 @@ public: s += "}"; return s; } + int maxPriority() const { return launchLimit.size() - 1; } - int totalWaiters() { return waiting; } + int totalWaiters() const { return waiting; } - int numWaiters(const unsigned int priority) { + int numWaiters(const unsigned int priority) const { ASSERT(priority < waiters.size()); return waiters[priority].size(); } - int totalWorkers() { return concurrency - available; } + int totalWorkers() const { return concurrency - available; } - int numWorkers(const unsigned int priority) { + int numWorkers(const unsigned int priority) const { ASSERT(priority < waiters.size()); return workerCounts[priority]; } From 8eae717fafb50194473f8d6d0384313b0d71388a Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 15 Aug 2022 13:59:49 -0700 Subject: [PATCH 019/216] provide default pollMoveKeysLock implementation --- fdbserver/include/fdbserver/DDTxnProcessor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/include/fdbserver/DDTxnProcessor.h b/fdbserver/include/fdbserver/DDTxnProcessor.h index 
3166a6122e..3388188074 100644 --- a/fdbserver/include/fdbserver/DDTxnProcessor.h +++ b/fdbserver/include/fdbserver/DDTxnProcessor.h @@ -64,7 +64,7 @@ public: virtual Future waitForDataDistributionEnabled(const DDEnabledState* ddEnabledState) const { return Void(); }; - virtual Future pollMoveKeysLock(MoveKeysLock lock, const DDEnabledState* ddEnabledState) const = 0; + virtual Future pollMoveKeysLock(MoveKeysLock lock, const DDEnabledState* ddEnabledState) const { return Never();}; }; class DDTxnProcessorImpl; From 336f92ff2ad87c30324bc0ade96ff66d45f50cee Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 19 Aug 2022 15:32:10 -0700 Subject: [PATCH 020/216] add default nullptr to trackerCancelled and shards --- fdbserver/DataDistributionTracker.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index 586304643d..8610055530 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -74,7 +74,7 @@ struct DataDistributionTracker { UID distributorId; // At now, the lifetime of shards is guaranteed longer than DataDistributionTracker. 
- KeyRangeMap* shards; + KeyRangeMap* shards = nullptr; ActorCollection sizeChanges; int64_t systemSizeEstimate = 0; @@ -95,7 +95,7 @@ struct DataDistributionTracker { // The reference to trackerCancelled must be extracted by actors, // because by the time (trackerCancelled == true) this memory cannot // be accessed - bool* trackerCancelled; + bool* trackerCancelled = nullptr; // This class extracts the trackerCancelled reference from a DataDistributionTracker object // Because some actors spawned by the dataDistributionTracker outlive the DataDistributionTracker From 6db625cd3d7bf59256f54ad9c5764b5c4f0ba186 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 19 Aug 2022 16:27:46 -0700 Subject: [PATCH 021/216] use ShardsAffectedByTeamFailure as shardMapping --- fdbserver/MockGlobalState.cpp | 23 +++++++------------ fdbserver/include/fdbserver/MockGlobalState.h | 23 ++++--------------- 2 files changed, 13 insertions(+), 33 deletions(-) diff --git a/fdbserver/MockGlobalState.cpp b/fdbserver/MockGlobalState.cpp index d7364cbcb5..e6b420c964 100644 --- a/fdbserver/MockGlobalState.cpp +++ b/fdbserver/MockGlobalState.cpp @@ -23,21 +23,14 @@ void MockGlobalState::initialAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace) { ASSERT(conf.storageTeamSize > 0); configuration = conf; - Team seedTeam; - seedTeam.teamIdx = 1; - for (int i = 1; i <= conf.storageTeamSize; ++i) { - seedTeam.serverIdx.emplace_back(i); - servers[i] = MockStorageServer(indexToUID(i), defaultDiskSpace); + std::vector serverIds; + for(int i = 1; i <= conf.storageTeamSize; ++ i) { + UID id = indexToUID(i); + serverIds.push_back(id); + allServers[id] = MockStorageServer(id, defaultDiskSpace); } - teams[seedTeam.teamIdx] = seedTeam; - keyServers[allKeys.begin] = { seedTeam.teamIdx, Optional() }; - keyServers[allKeys.end] = { 0, Optional() }; -} -std::vector MockGlobalState::Team::getServerIds() const { - std::vector res(serverIdx.size()); - for (int i = 0; i < serverIdx.size(); 
++i) { - res[i] = indexToUID(serverIdx.at(i)); - } - return res; + shardMapping->defineShard(allKeys); + shardMapping->moveShard(allKeys, {ShardsAffectedByTeamFailure::Team(serverIds, true)}); + shardMapping->finishMove(allKeys); } diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index 1d53e74741..69f3535d3b 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -26,6 +26,7 @@ #include "fdbclient/StorageServerInterface.h" #include "fdbclient/DatabaseConfiguration.h" #include "SimulatedCluster.h" +#include "ShardsAffectedByTeamFailure.h" class MockStorageServer { public: @@ -49,30 +50,16 @@ public: class MockGlobalState { public: - // Index starting from 1. 0 indicates invalid index; - typedef uint32_t TeamIndex; - typedef uint64_t ServerIndex; - struct Team { - TeamIndex teamIdx; - std::vector serverIdx; - - std::vector getServerIds() const; - }; - - struct ShardTeamValue { - TeamIndex srcIdx; - Optional destIdx; - }; - - std::map keyServers; // a shard belongs to which teams, key is the beginning key of a shard - std::map servers; // all mock servers - std::map teams; + Reference shardMapping; + std::map allServers; DatabaseConfiguration configuration; // user defined parameters for mock workload purpose double emptyProb; // probability of doing an empty read uint32_t minByteSize, maxByteSize; // the size band of a point data operation + MockGlobalState() : shardMapping(new ShardsAffectedByTeamFailure) {} + static UID indexToUID(uint64_t a) { return UID(a, a); } void initialAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace = 1000LL * 1024 * 1024 * 1024); From 8c6af958f06209ca669621ae8d54f319557e6375 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 19 Aug 2022 16:44:10 -0700 Subject: [PATCH 022/216] change the DDTracker parameters back --- fdbserver/DDShardTracker.actor.cpp | 6 +++--- 
fdbserver/DataDistribution.actor.cpp | 4 ++-- fdbserver/include/fdbserver/DataDistribution.actor.h | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fdbserver/DDShardTracker.actor.cpp b/fdbserver/DDShardTracker.actor.cpp index ab7483f02a..8c90896539 100644 --- a/fdbserver/DDShardTracker.actor.cpp +++ b/fdbserver/DDShardTracker.actor.cpp @@ -1046,8 +1046,8 @@ ACTOR Future dataDistributionTracker(Reference in Promise readyToStart, Reference> anyZeroHealthyTeams, UID distributorId, - std::shared_ptr> shards, - std::shared_ptr trackerCancelled) { + KeyRangeMap* shards, + bool* trackerCancelled) { state DataDistributionTracker self(cx, distributorId, readyToStart, @@ -1646,7 +1646,7 @@ TEST_CASE("/DataDistributor/Tracker/FetchTopK") { req.minBytesReadPerKSecond = 1000; req.minBytesReadPerKSecond = 10000; - self.shards = std::make_shared>(); + // self.shards = std::make_shared>(); double targetDensities[10] = { 2, 1, 3, 5, 4, 10, 6, 8, 7, 0 }; for (int i = 0; i <= 5; ++i) { } diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 5e202a4ebd..50398e7356 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -642,8 +642,8 @@ ACTOR Future dataDistribution(Reference self, readyToStart, anyZeroHealthyTeams, self->ddId, - shards, - trackerCancelled), + shards.get(), + trackerCancelled.get()), "DDTracker", self->ddId, &normalDDQueueErrors())); diff --git a/fdbserver/include/fdbserver/DataDistribution.actor.h b/fdbserver/include/fdbserver/DataDistribution.actor.h index 2a7d1c7030..675e3a4523 100644 --- a/fdbserver/include/fdbserver/DataDistribution.actor.h +++ b/fdbserver/include/fdbserver/DataDistribution.actor.h @@ -614,8 +614,8 @@ ACTOR Future dataDistributionTracker(Reference in Promise readyToStart, Reference> zeroHealthyTeams, UID distributorId, - std::shared_ptr> shards, - std::shared_ptr trackerCancelled); + KeyRangeMap* shards, + bool* trackerCancelled); ACTOR Future 
dataDistributionQueue(Database cx, PromiseStream output, From 8544cf48d452d1c6e288bc8054c222df53d27b5b Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 29 Aug 2022 14:59:40 -0700 Subject: [PATCH 023/216] implement DDMockTxnProcessor::getInitialDataDistribution (compiled) --- fdbserver/DDTxnProcessor.actor.cpp | 48 +++++++++++++------ fdbserver/ShardsAffectedByTeamFailure.cpp | 18 ++++++- fdbserver/include/fdbserver/DDTxnProcessor.h | 2 +- .../fdbserver/ShardsAffectedByTeamFailure.h | 7 +++ 4 files changed, 59 insertions(+), 16 deletions(-) diff --git a/fdbserver/DDTxnProcessor.actor.cpp b/fdbserver/DDTxnProcessor.actor.cpp index 272b661291..da764d22dd 100644 --- a/fdbserver/DDTxnProcessor.actor.cpp +++ b/fdbserver/DDTxnProcessor.actor.cpp @@ -498,36 +498,55 @@ Future DDTxnProcessor::pollMoveKeysLock(const MoveKeysLock& lock, const DD Future>> DDMockTxnProcessor::getServerListAndProcessClasses() { std::vector> res; - for (auto& [_, mss] : mgs->servers) { + for (auto& [_, mss] : mgs->allServers) { res.emplace_back(mss.ssi, ProcessClass(ProcessClass::StorageClass, ProcessClass::DBSource)); } return res; } -std::set> DDMockTxnProcessor::getPrimaryTeams() const { +std::set> DDMockTxnProcessor::getAllTeamsInRegion(bool primary) const { + auto teams = mgs->shardMapping->getAllTeams(); std::set> res; - for (auto& [idx, team] : mgs->teams) { - res.emplace(team.getServerIds()); + for (auto& team : teams) { + if (primary == team.primary) { + res.emplace(team.servers); + } } return res; } +inline void transformTeamsToServerIds(std::vector& teams, + std::vector& primaryIds, std::vector& remoteIds) { + std::set primary, remote; + for (auto& team : teams) { + team.primary ? 
primary.insert(team.servers.begin(), team.servers.end()) + : remote.insert(team.servers.begin(), team.servers.end()); + } + primaryIds = std::vector(primary.begin(), primary.end()); + remoteIds = std::vector(remote.begin(), remote.end()); +} + std::vector DDMockTxnProcessor::getDDShardInfos() const { std::vector res; - res.reserve(mgs->keyServers.size() - 1); - for (auto& [beginK, value] : mgs->keyServers) { - if (beginK == allKeys.end) - break; - + res.reserve(mgs->shardMapping->getNumberOfShards()); + auto allRange = mgs->shardMapping->getAllRanges(); + ASSERT(allRange.end().end() == allKeys.end); + for (auto it = allRange.begin(); it != allRange.end(); ++it) { // FIXME: now just use anonymousShardId - DDShardInfo info(beginK, anonymousShardId, anonymousShardId); - info.primarySrc = mgs->teams.at(value.srcIdx).getServerIds(); - if (value.destIdx.present()) { - info.primaryDest = mgs->teams.at(value.destIdx.get()).getServerIds(); + KeyRangeRef curRange = it->range(); + DDShardInfo info(curRange.begin, anonymousShardId, anonymousShardId); + auto teams = mgs->shardMapping->getTeamsFor(curRange); + if (!teams.second.empty()) { + // in-flight shard info.hasDest = true; + transformTeamsToServerIds(teams.second, info.primarySrc, info.remoteSrc); + transformTeamsToServerIds(teams.first, info.primaryDest, info.remoteDest); + } else { + transformTeamsToServerIds(teams.first, info.primarySrc, info.remoteSrc); } } res.emplace_back(allKeys.end); + return res; } Future> DDMockTxnProcessor::getInitialDataDistribution( @@ -542,6 +561,7 @@ Future> DDMockTxnProcessor::getInitialDataDis res->allServers = getServerListAndProcessClasses().get(); // TODO: consider remote region setting. 
For now assume all server is in primary dc res->shards = getDDShardInfos(); - res->primaryTeams = getPrimaryTeams(); + res->primaryTeams = getAllTeamsInRegion(true); + res->remoteTeams = getAllTeamsInRegion(false); return res; } diff --git a/fdbserver/ShardsAffectedByTeamFailure.cpp b/fdbserver/ShardsAffectedByTeamFailure.cpp index 61f1dc174b..8ea36d6080 100644 --- a/fdbserver/ShardsAffectedByTeamFailure.cpp +++ b/fdbserver/ShardsAffectedByTeamFailure.cpp @@ -194,4 +194,20 @@ void ShardsAffectedByTeamFailure::check() const { } } } -} \ No newline at end of file +} + +std::set ShardsAffectedByTeamFailure::getAllTeams() const { + std::set res; + for(const auto& teamKeys: team_shards) { + res.insert(res.end(), teamKeys.first); + } + return res; +} + +size_t ShardsAffectedByTeamFailure::getNumberOfShards() const { + return shard_teams.size(); +} + +auto ShardsAffectedByTeamFailure::getAllRanges() const -> decltype(shard_teams)::ConstRanges { + return shard_teams.ranges(); +} diff --git a/fdbserver/include/fdbserver/DDTxnProcessor.h b/fdbserver/include/fdbserver/DDTxnProcessor.h index 12588eecea..e8c1c9da73 100644 --- a/fdbserver/include/fdbserver/DDTxnProcessor.h +++ b/fdbserver/include/fdbserver/DDTxnProcessor.h @@ -136,7 +136,7 @@ class DDMockTxnProcessor : public IDDTxnProcessor { std::shared_ptr mgs; std::vector getDDShardInfos() const; - std::set> getPrimaryTeams() const; + std::set> getAllTeamsInRegion(bool primary) const; public: explicit DDMockTxnProcessor(std::shared_ptr mgs = nullptr) : mgs(mgs){}; diff --git a/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h b/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h index 0109921a6b..93e2861640 100644 --- a/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h +++ b/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h @@ -67,6 +67,7 @@ public: // no longer in the map), the servers will be set for all contained shards and added to all // intersecting shards. 
+ std::set getAllTeams() const; int getNumberOfShards(UID ssID) const; std::vector getShardsFor(Team team) const; bool hasShards(Team team) const; @@ -103,6 +104,12 @@ private: void erase(Team team, KeyRange const& range); void insert(Team team, KeyRange const& range); + +public: + // return the iterator that can traverse all ranges + auto getAllRanges() const -> decltype(shard_teams)::ConstRanges; + // get total shards count + size_t getNumberOfShards() const; }; #endif // FOUNDATIONDB_SHARDSAFFECTEDBYTEAMFAILURE_H From 0ba37ca0c4a954a20c1683375da0daee46dbe285 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 30 Aug 2022 10:59:14 -0700 Subject: [PATCH 024/216] add counterpart of system keyspace --- fdbserver/DDTxnProcessor.actor.cpp | 20 +++++++++++++++++++ fdbserver/MockGlobalState.cpp | 6 ++---- fdbserver/ShardsAffectedByTeamFailure.cpp | 6 ++++++ fdbserver/include/fdbserver/DDTxnProcessor.h | 20 ++++++++++++++++--- fdbserver/include/fdbserver/MockGlobalState.h | 9 ++++++++- .../fdbserver/ShardsAffectedByTeamFailure.h | 2 ++ 6 files changed, 55 insertions(+), 8 deletions(-) diff --git a/fdbserver/DDTxnProcessor.actor.cpp b/fdbserver/DDTxnProcessor.actor.cpp index da764d22dd..cb73a9e851 100644 --- a/fdbserver/DDTxnProcessor.actor.cpp +++ b/fdbserver/DDTxnProcessor.actor.cpp @@ -565,3 +565,23 @@ Future> DDMockTxnProcessor::getInitialDataDis res->remoteTeams = getAllTeamsInRegion(false); return res; } + +Future DDMockTxnProcessor::removeKeysFromFailedServer(const UID& serverID, + const std::vector& teamForDroppedRange, + const MoveKeysLock& lock, + const DDEnabledState* ddEnabledState) const { + ShardsAffectedByTeamFailure::Team team(teamForDroppedRange, true); + + team.primary = false; + + return Void(); +} + +Future DDMockTxnProcessor::removeStorageServer(const UID& serverID, + const Optional& tssPairID, + const MoveKeysLock& lock, + const DDEnabledState* ddEnabledState) const { + + mgs->allServers.erase(serverID); + return Void(); +} diff --git 
a/fdbserver/MockGlobalState.cpp b/fdbserver/MockGlobalState.cpp index e6b420c964..3a852411f7 100644 --- a/fdbserver/MockGlobalState.cpp +++ b/fdbserver/MockGlobalState.cpp @@ -28,9 +28,7 @@ void MockGlobalState::initialAsEmptyDatabaseMGS(const DatabaseConfiguration& con UID id = indexToUID(i); serverIds.push_back(id); allServers[id] = MockStorageServer(id, defaultDiskSpace); + allServers[id].serverKeys.insert(allKeys, true); } - - shardMapping->defineShard(allKeys); - shardMapping->moveShard(allKeys, {ShardsAffectedByTeamFailure::Team(serverIds, true)}); - shardMapping->finishMove(allKeys); + shardMapping->assignRangeToTeams(allKeys, {Team(serverIds, true)}); } diff --git a/fdbserver/ShardsAffectedByTeamFailure.cpp b/fdbserver/ShardsAffectedByTeamFailure.cpp index 8ea36d6080..4468270e64 100644 --- a/fdbserver/ShardsAffectedByTeamFailure.cpp +++ b/fdbserver/ShardsAffectedByTeamFailure.cpp @@ -211,3 +211,9 @@ size_t ShardsAffectedByTeamFailure::getNumberOfShards() const { auto ShardsAffectedByTeamFailure::getAllRanges() const -> decltype(shard_teams)::ConstRanges { return shard_teams.ranges(); } + +void ShardsAffectedByTeamFailure::assignRangeToTeams(KeyRangeRef keys, const std::vector& destinationTeam) { + defineShard(keys); + moveShard(keys, destinationTeam); + finishMove(keys); +} diff --git a/fdbserver/include/fdbserver/DDTxnProcessor.h b/fdbserver/include/fdbserver/DDTxnProcessor.h index e8c1c9da73..be9fc56327 100644 --- a/fdbserver/include/fdbserver/DDTxnProcessor.h +++ b/fdbserver/include/fdbserver/DDTxnProcessor.h @@ -64,9 +64,13 @@ public: virtual Future waitForDataDistributionEnabled(const DDEnabledState* ddEnabledState) const { return Void(); }; - virtual Future isDataDistributionEnabled(const DDEnabledState* ddEnabledState) const = 0; + virtual Future isDataDistributionEnabled(const DDEnabledState* ddEnabledState) const { + return ddEnabledState->isDDEnabled(); + }; - virtual Future pollMoveKeysLock(const MoveKeysLock& lock, const DDEnabledState* 
ddEnabledState) const = 0; + virtual Future pollMoveKeysLock(const MoveKeysLock& lock, const DDEnabledState* ddEnabledState) const { + return Never(); + }; virtual Future removeKeysFromFailedServer(const UID& serverID, const std::vector& teamForDroppedRange, @@ -139,7 +143,7 @@ class DDMockTxnProcessor : public IDDTxnProcessor { std::set> getAllTeamsInRegion(bool primary) const; public: - explicit DDMockTxnProcessor(std::shared_ptr mgs = nullptr) : mgs(mgs){}; + explicit DDMockTxnProcessor(std::shared_ptr mgs = nullptr) : mgs(std::move(mgs)){}; Future>> getServerListAndProcessClasses() override; @@ -148,6 +152,16 @@ public: const MoveKeysLock& moveKeysLock, const std::vector>& remoteDcIds, const DDEnabledState* ddEnabledState) override; + + Future removeKeysFromFailedServer(const UID& serverID, + const std::vector& teamForDroppedRange, + const MoveKeysLock& lock, + const DDEnabledState* ddEnabledState) const override; + + Future removeStorageServer(const UID& serverID, + const Optional& tssPairID, + const MoveKeysLock& lock, + const DDEnabledState* ddEnabledState) const override; }; #endif // FOUNDATIONDB_DDTXNPROCESSOR_H diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index 69f3535d3b..6b4dcc6844 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -32,7 +32,10 @@ class MockStorageServer { public: // control plane statistics associated with a real storage server uint64_t usedDiskSpace = 0, availableDiskSpace; - KeyRangeMap shardTotalBytes; // randomly generated in setup phase + + // In-memory counterpart of the `serverKeys` in system keyspace + // the value bool is equal to "[[serverKeysTrue]]" |" [[serverKeysFalse]]" and metrics uint64_t is the shard size + KeyRangeMap serverKeys; // sampled metrics StorageServerMetrics metrics; @@ -40,6 +43,7 @@ public: StorageServerInterface ssi; // serve RPC requests UID id; + bool primary = true; // Only 
support single region MGS for now MockStorageServer() = default; MockStorageServer(const UID& id, uint64_t availableDiskSpace, uint64_t usedDiskSpace = 0) @@ -50,7 +54,10 @@ public: class MockGlobalState { public: + typedef ShardsAffectedByTeamFailure::Team Team; + // In-memory counterpart of the `keyServers` in system keyspace Reference shardMapping; + // In-memory counterpart of the `serverListKeys` in system keyspace std::map allServers; DatabaseConfiguration configuration; diff --git a/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h b/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h index 93e2861640..ac097b99ca 100644 --- a/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h +++ b/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h @@ -79,6 +79,8 @@ public: void defineShard(KeyRangeRef keys); void moveShard(KeyRangeRef keys, std::vector destinationTeam); void finishMove(KeyRangeRef keys); + // a convenient function for (defineShard, moveShard, finishMove) pipeline + void assignRangeToTeams(KeyRangeRef keys, const std::vector& destinationTeam); void check() const; void setCheckMode(CheckMode); From 620c119e9a2feeb7b65b3d11bf7bd749bc6f7aa9 Mon Sep 17 00:00:00 2001 From: Fuheng Zhao Date: Tue, 30 Aug 2022 12:07:45 -0700 Subject: [PATCH 025/216] update storage server priorities --- fdbclient/ServerKnobs.cpp | 6 +++--- fdbserver/VersionedBTree.actor.cpp | 4 +++- fdbserver/storageserver.actor.cpp | 1 + 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 58680339fa..948b3a1a9a 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -746,9 +746,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( QUICK_GET_KEY_VALUES_LIMIT, 2000 ); init( QUICK_GET_KEY_VALUES_LIMIT_BYTES, 1e7 ); init( STORAGE_SERVER_SHARD_AWARE, true ); - init( STORAGESERVER_MAX_RANK, 4 ); - init( STORAGESERVER_READ_RANKS, "0,1,2,3,4" ); - init( 
STORAGESERVER_READ_PRIORITIES, "32,8,12,32,48" ); + init( STORAGESERVER_MAX_RANK, 2 ); + init( STORAGESERVER_READ_RANKS, "0,2,1,1,1" ); + init( STORAGESERVER_READ_PRIORITIES, "32,24,6" ); //Wait Failure init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 5653a6e869..30328d2887 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -2023,10 +2023,10 @@ public: // This sets the page cache size for all PageCacheT instances using the same evictor pageCache.evictor().sizeLimit = pageCacheBytes; + g_redwoodMetrics.ioLock = &ioLock; if (!g_redwoodMetricsActor.isValid()) { g_redwoodMetricsActor = redwoodMetricsLogger(); } - g_redwoodMetrics.ioLock = &ioLock; commitFuture = Void(); recoverFuture = forwardError(recover(this), errorPromise); @@ -7551,6 +7551,8 @@ public: ACTOR void shutdown(KeyValueStoreRedwood* self, bool dispose) { TraceEvent(SevInfo, "RedwoodShutdown").detail("Filename", self->m_filename).detail("Dispose", dispose); + g_redwoodMetrics.ioLock = nullptr; + // In simulation, if the instance is being disposed of then sometimes run destructive sanity check. 
if (g_network->isSimulated() && dispose && BUGGIFY) { // Only proceed if the last commit is a success, but don't throw if it's not because shutdown diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 3aa113bc17..d3f9ad0991 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -1265,6 +1265,7 @@ public: storageServerSourceTLogIDEventHolder( makeReference(ssi.id().toString() + "/StorageServerSourceTLogID")) { readPriorityRanks = parseStringToVector(SERVER_KNOBS->STORAGESERVER_READ_RANKS, ','); + ASSERT(readPriorityRanks.size() > (int)ReadType::MAX); version.initMetric(LiteralStringRef("StorageServer.Version"), counters.cc.id); oldestVersion.initMetric(LiteralStringRef("StorageServer.OldestVersion"), counters.cc.id); durableVersion.initMetric(LiteralStringRef("StorageServer.DurableVersion"), counters.cc.id); From d192a5630c3768397129ce11b05ef2b58df4fe84 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 30 Aug 2022 14:36:14 -0700 Subject: [PATCH 026/216] add comment to ShardsAffectedByTeamFailure --- fdbserver/DDTxnProcessor.actor.cpp | 8 +++++--- .../include/fdbserver/ShardsAffectedByTeamFailure.h | 9 +++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/fdbserver/DDTxnProcessor.actor.cpp b/fdbserver/DDTxnProcessor.actor.cpp index cb73a9e851..ada695584f 100644 --- a/fdbserver/DDTxnProcessor.actor.cpp +++ b/fdbserver/DDTxnProcessor.actor.cpp @@ -566,13 +566,15 @@ Future> DDMockTxnProcessor::getInitialDataDis return res; } +// Remove the server from shardMapping and set serverKeysFalse to the server's serverKeys list. +// Changes to keyServer and serverKey must happen symmetrically in this function. +// If serverID is the last source server for a shard, the shard will be erased, and then be assigned +// to teamForDroppedRange. 
Future DDMockTxnProcessor::removeKeysFromFailedServer(const UID& serverID, const std::vector& teamForDroppedRange, const MoveKeysLock& lock, const DDEnabledState* ddEnabledState) const { - ShardsAffectedByTeamFailure::Team team(teamForDroppedRange, true); - - team.primary = false; + auto& mss = mgs->allServers.at(serverID); return Void(); } diff --git a/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h b/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h index ac097b99ca..efd281734f 100644 --- a/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h +++ b/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h @@ -75,9 +75,12 @@ public: // The first element of the pair is either the source for non-moving shards or the destination team for in-flight // shards The second element of the pair is all previous sources for in-flight shards std::pair, std::vector> getTeamsFor(KeyRangeRef keys); - + // Shard boundaries are modified in defineShard and the content of what servers correspond to each shard is a copy + // or union of the shards already there void defineShard(KeyRangeRef keys); + // moveShard never change the shard boundary but just change the team value void moveShard(KeyRangeRef keys, std::vector destinationTeam); + // finishMove never change the shard boundary but just clear the old source team value void finishMove(KeyRangeRef keys); // a convenient function for (defineShard, moveShard, finishMove) pipeline void assignRangeToTeams(KeyRangeRef keys, const std::vector& destinationTeam); @@ -104,11 +107,13 @@ private: std::set, OrderByTeamKey> team_shards; std::map storageServerShards; + // only erase from team_shards void erase(Team team, KeyRange const& range); + // only insert into team_shards void insert(Team team, KeyRange const& range); public: - // return the iterator that can traverse all ranges + // return the iterator that traversing all ranges auto getAllRanges() const -> decltype(shard_teams)::ConstRanges; // get total shards 
count size_t getNumberOfShards() const; From 0aa096dc17ecf5267ec0691942c461bfa33e1e0a Mon Sep 17 00:00:00 2001 From: Fuheng Zhao Date: Wed, 31 Aug 2022 15:46:39 -0700 Subject: [PATCH 027/216] sync with upstream main --- CONTRIBUTING.md | 10 +- bindings/c/CMakeLists.txt | 30 +- bindings/c/fdb_c.cpp | 119 +- bindings/c/foundationdb/fdb_c.h | 31 +- bindings/c/foundationdb/fdb_c_internal.h | 23 + .../TesterBlobGranuleCorrectnessWorkload.cpp | 85 +- .../TesterBlobGranuleErrorsWorkload.cpp | 145 ++ .../test/apitester/TesterBlobGranuleUtil.cpp | 80 + .../c/test/apitester/TesterBlobGranuleUtil.h | 49 + .../test/apitester/TesterExampleWorkload.cpp | 65 + bindings/c/test/apitester/TesterOptions.h | 1 + bindings/c/test/apitester/TesterWorkload.cpp | 9 +- .../CApiBlobGranuleErrorsMultiThr.toml | 22 + ...CApiBlobGranuleErrorsOnExternalThread.toml | 22 + .../CApiBlobGranuleErrorsSingleThr.toml | 15 + .../c/test/apitester/fdb_c_api_tester.cpp | 12 + .../upgrade/ApiBlobGranulesCorrectness.toml | 23 + .../upgrade/MixedApiWorkloadMultiThr.toml | 12 +- .../upgrade/MixedApiWorkloadSingleThr.toml | 12 +- bindings/c/test/fdb_api.hpp | 4 +- bindings/c/test/mako/blob_granules.cpp | 3 + bindings/c/test/unit/fdb_api.cpp | 12 +- bindings/c/test/unit/fdb_api.hpp | 2 +- bindings/c/test/unit/unit_tests.cpp | 2 +- bindings/go/src/fdb/generated.go | 14 + bindings/java/CMakeLists.txt | 3 + bindings/java/fdbJNI.cpp | 264 +++ .../main/com/apple/foundationdb/Database.java | 125 +- .../com/apple/foundationdb/FDBDatabase.java | 52 +- .../apple/foundationdb/FDBTransaction.java | 16 + .../com/apple/foundationdb/FutureBool.java | 37 + .../foundationdb/FutureKeyRangeArray.java | 37 + .../foundationdb/KeyRangeArrayResult.java | 36 + .../apple/foundationdb/ReadTransaction.java | 11 + .../com/apple/foundationdb/test/Context.java | 3 +- bindings/python/tests/size_limit_tests.py | 3 + cmake/AddFdbTest.cmake | 48 +- cmake/ConfigureCompiler.cmake | 4 + cmake/awssdk.cmake | 9 +- 
contrib/Joshua/scripts/correctnessTest.sh | 4 +- contrib/Joshua/scripts/correctnessTimeout.sh | 6 +- contrib/Joshua/scripts/valgrindTest.sh | 2 +- contrib/Joshua/scripts/valgrindTimeout.sh | 6 +- contrib/TestHarness/Program.cs | 74 +- contrib/TestHarness2/.gitignore | 2 + contrib/TestHarness2/test_harness/__init__.py | 2 + contrib/TestHarness2/test_harness/app.py | 25 + contrib/TestHarness2/test_harness/config.py | 263 +++ contrib/TestHarness2/test_harness/fdb.py | 144 ++ contrib/TestHarness2/test_harness/joshua.py | 161 ++ contrib/TestHarness2/test_harness/results.py | 144 ++ contrib/TestHarness2/test_harness/run.py | 465 ++++ .../TestHarness2/test_harness/summarize.py | 620 ++++++ .../test_harness/test_valgrind_parser.py | 16 + contrib/TestHarness2/test_harness/timeout.py | 60 + contrib/TestHarness2/test_harness/valgrind.py | 141 ++ contrib/TestHarness2/test_harness/version.py | 66 + .../details.xml | 431 ++++ .../performance_overview.xml | 323 +++ .../ratekeeper.xml | 928 ++++++++ .../recovery.xml | 873 ++++++++ .../transaction_latency.xml | 247 +++ contrib/pkg_tester/test_fdb_pkgs.py | 3 +- .../transaction_profiling_analyzer.py | 6 + contrib/tsan.suppressions | 5 + design/data-distributor-internals.md | 24 +- design/dynamic-knobs.md | 420 ++++ design/global-tag-throttling.md | 3 - documentation/sphinx/source/architecture.rst | 8 +- .../sphinx/source/client-testing.rst | 299 +++ documentation/sphinx/source/configuration.rst | 3 + .../source/mr-status-json-schemas.rst.inc | 10 +- documentation/sphinx/source/mr-status.rst | 3 + .../release-notes/release-notes-710.rst | 28 +- documentation/sphinx/source/special-keys.rst | 3 + documentation/sphinx/source/tenants.rst | 2 +- fdbbackup/backup.actor.cpp | 2 +- fdbcli/BlobRangeCommand.actor.cpp | 124 +- fdbcli/ConfigureCommand.actor.cpp | 4 + fdbcli/ExpensiveDataCheckCommand.actor.cpp | 2 +- fdbcli/KillCommand.actor.cpp | 2 +- fdbcli/MetaclusterCommands.actor.cpp | 432 ++++ fdbcli/StatusCommand.actor.cpp | 11 +- 
fdbcli/SuspendCommand.actor.cpp | 2 +- fdbcli/TenantCommands.actor.cpp | 233 +- fdbcli/Util.actor.cpp | 74 +- fdbcli/fdbcli.actor.cpp | 52 +- fdbcli/include/fdbcli/fdbcli.actor.h | 18 +- fdbclient/BackupContainer.actor.cpp | 45 +- fdbclient/BackupContainerFileSystem.actor.cpp | 45 +- fdbclient/BlobGranuleCommon.cpp | 45 + fdbclient/BlobGranuleFiles.cpp | 100 +- fdbclient/BlobGranuleReader.actor.cpp | 70 +- fdbclient/CMakeLists.txt | 4 +- fdbclient/ClientKnobs.cpp | 21 +- fdbclient/KeyRangeMap.actor.cpp | 160 +- fdbclient/ManagementAPI.actor.cpp | 2 +- fdbclient/Metacluster.cpp | 71 + fdbclient/MetaclusterManagement.actor.cpp | 67 + fdbclient/MonitorLeader.actor.cpp | 84 +- fdbclient/MultiVersionTransaction.actor.cpp | 245 ++- fdbclient/NativeAPI.actor.cpp | 748 +++++-- fdbclient/ReadYourWrites.actor.cpp | 8 +- fdbclient/S3BlobStore.actor.cpp | 21 +- fdbclient/Schemas.cpp | 8 +- fdbclient/ServerKnobs.cpp | 105 +- fdbclient/SpecialKeySpace.actor.cpp | 83 +- fdbclient/SystemData.cpp | 24 +- fdbclient/Tenant.cpp | 53 +- fdbclient/TenantManagement.actor.cpp | 40 + fdbclient/ThreadSafeTransaction.cpp | 105 +- fdbclient/Tuple.cpp | 2 +- .../BackupContainerAzureBlobStore.actor.cpp | 7 +- fdbclient/azure_backup/README.md | 33 + fdbclient/azurestorage.cmake | 2 + .../fdbclient/BackupContainerAzureBlobStore.h | 2 - .../include/fdbclient/BlobGranuleCommon.h | 18 +- .../fdbclient/BlobGranuleReader.actor.h | 2 + .../include/fdbclient/BlobWorkerCommon.h | 42 +- .../include/fdbclient/BlobWorkerInterface.h | 55 +- fdbclient/include/fdbclient/ClientKnobs.h | 19 +- .../include/fdbclient/ClusterInterface.h | 52 +- .../include/fdbclient/CommitProxyInterface.h | 17 +- .../include/fdbclient/CommitTransaction.h | 61 +- fdbclient/include/fdbclient/DatabaseContext.h | 23 +- .../fdbclient}/EncryptKeyProxyInterface.h | 0 fdbclient/include/fdbclient/FDBTypes.h | 65 +- .../fdbclient/GenericManagementAPI.actor.h | 10 + .../fdbclient/GetEncryptCipherKeys.actor.h | 112 +- 
.../include/fdbclient/GrvProxyInterface.h | 1 - fdbclient/include/fdbclient/IClientApi.h | 25 +- .../include/fdbclient/IConfigTransaction.h | 2 +- .../fdbclient/ISingleThreadTransaction.h | 2 +- fdbclient/include/fdbclient/KeyBackedTypes.h | 24 + fdbclient/include/fdbclient/KeyRangeMap.h | 12 +- fdbclient/include/fdbclient/Metacluster.h | 183 ++ .../fdbclient/MetaclusterManagement.actor.h | 1926 +++++++++++++++++ .../fdbclient/MultiVersionTransaction.h | 118 +- fdbclient/include/fdbclient/NativeAPI.actor.h | 28 +- fdbclient/include/fdbclient/ReadYourWrites.h | 15 +- fdbclient/include/fdbclient/ServerKnobs.h | 77 +- .../include/fdbclient/SpecialKeySpace.actor.h | 9 + .../fdbclient/StorageServerInterface.h | 52 +- fdbclient/include/fdbclient/SystemData.h | 5 +- fdbclient/include/fdbclient/Tenant.h | 116 +- .../fdbclient/TenantManagement.actor.h | 235 +- .../fdbclient/TenantSpecialKeys.actor.h | 16 +- .../include/fdbclient/ThreadSafeTransaction.h | 28 +- fdbclient/vexillographer/fdb.options | 8 +- fdbrpc/CMakeLists.txt | 2 + fdbrpc/FlowTransport.actor.cpp | 92 +- fdbrpc/HTTP.actor.cpp | 133 +- fdbrpc/JsonWebKeySet.cpp | 11 +- fdbrpc/SimExternalConnection.actor.cpp | 4 + fdbrpc/TokenCache.actor.cpp | 17 +- fdbrpc/TokenSign.cpp | 188 +- fdbrpc/include/fdbrpc/AsyncFileEIO.actor.h | 4 +- fdbrpc/include/fdbrpc/AsyncFileKAIO.actor.h | 4 +- fdbrpc/include/fdbrpc/FlowTransport.h | 6 + fdbrpc/include/fdbrpc/HTTP.h | 6 + fdbrpc/include/fdbrpc/LoadBalance.actor.h | 33 + fdbrpc/include/fdbrpc/MultiInterface.h | 1 + fdbrpc/include/fdbrpc/SimExternalConnection.h | 1 + fdbrpc/include/fdbrpc/simulator.h | 23 +- fdbrpc/sim2.actor.cpp | 10 +- fdbrpc/sim_validation.cpp | 20 +- fdbrpc/tests/AuthzTlsTest.actor.cpp | 357 +++ fdbrpc/tests/CMakeLists.txt | 6 + fdbserver/ApplyMetadataMutation.cpp | 73 +- fdbserver/BackupWorker.actor.cpp | 62 +- fdbserver/BlobGranuleServerCommon.actor.cpp | 84 +- fdbserver/BlobGranuleValidation.actor.cpp | 133 +- fdbserver/BlobManager.actor.cpp | 1408 
+++++++++--- fdbserver/BlobWorker.actor.cpp | 767 +++++-- fdbserver/ClusterController.actor.cpp | 57 +- fdbserver/ClusterRecovery.actor.cpp | 39 +- fdbserver/CommitProxyServer.actor.cpp | 39 +- ....actor.cpp => DDRelocationQueue.actor.cpp} | 570 +++-- ...ker.actor.cpp => DDShardTracker.actor.cpp} | 656 +++++- fdbserver/DDTeamCollection.actor.cpp | 188 +- fdbserver/DDTxnProcessor.actor.cpp | 65 +- fdbserver/DataDistribution.actor.cpp | 265 ++- fdbserver/EncryptKeyProxy.actor.cpp | 37 +- fdbserver/GlobalTagThrottler.actor.cpp | 257 ++- fdbserver/GrvProxyServer.actor.cpp | 93 +- .../KeyValueStoreCompressTestData.actor.cpp | 31 +- fdbserver/KeyValueStoreMemory.actor.cpp | 35 +- fdbserver/KeyValueStoreRocksDB.actor.cpp | 324 +-- fdbserver/KeyValueStoreSQLite.actor.cpp | 29 +- .../KeyValueStoreShardedRocksDB.actor.cpp | 485 +++-- fdbserver/MutationTracking.cpp | 3 - fdbserver/QuietDatabase.actor.cpp | 7 +- fdbserver/Ratekeeper.actor.cpp | 382 +++- fdbserver/RemoteIKeyValueStore.actor.cpp | 26 +- fdbserver/Resolver.actor.cpp | 33 +- fdbserver/RestoreLoader.actor.cpp | 7 +- fdbserver/RkTagThrottleCollection.cpp | 3 +- fdbserver/RocksDBCheckpointUtils.actor.cpp | 4 +- fdbserver/ServerCheckpoint.actor.cpp | 2 +- fdbserver/SimKmsConnector.actor.cpp | 27 +- fdbserver/SimulatedCluster.actor.cpp | 220 +- fdbserver/Status.actor.cpp | 162 +- fdbserver/StorageCache.actor.cpp | 83 +- fdbserver/TLogServer.actor.cpp | 108 +- fdbserver/TenantCache.actor.cpp | 2 +- fdbserver/VersionedBTree.actor.cpp | 186 +- fdbserver/fdbserver.actor.cpp | 134 +- .../include/fdbserver/ApplyMetadataMutation.h | 6 +- .../fdbserver/BlobGranuleServerCommon.actor.h | 7 +- .../fdbserver/BlobGranuleValidation.actor.h | 5 + .../include/fdbserver/BlobManagerInterface.h | 41 +- .../fdbserver/ClusterController.actor.h | 3 + .../include/fdbserver/CoordinationInterface.h | 2 +- .../include/fdbserver/DDRelocationQueue.h | 35 + fdbserver/include/fdbserver/DDShardTracker.h | 40 + 
fdbserver/include/fdbserver/DDSharedContext.h | 70 + .../include/fdbserver/DDTeamCollection.h | 15 +- fdbserver/include/fdbserver/DDTxnProcessor.h | 35 +- .../fdbserver/DataDistribution.actor.h | 331 ++- .../fdbserver/EncryptedMutationMessage.h | 119 - .../include/fdbserver/EncryptionOpsUtils.h | 48 + .../include/fdbserver/GetEncryptCipherKeys.h | 58 - .../fdbserver/IEncryptionKeyProvider.actor.h | 284 +++ fdbserver/include/fdbserver/IKeyValueStore.h | 24 +- fdbserver/include/fdbserver/IPager.h | 185 +- .../include/fdbserver/ProxyCommitData.actor.h | 8 +- fdbserver/include/fdbserver/QuietDatabase.h | 4 +- fdbserver/include/fdbserver/Ratekeeper.h | 40 +- .../include/fdbserver/RatekeeperInterface.h | 27 +- .../fdbserver/RemoteIKeyValueStore.actor.h | 39 +- fdbserver/include/fdbserver/TagThrottler.h | 6 + .../fdbserver/TenantEntryCache.actor.h | 390 ++++ .../include/fdbserver/WorkerInterface.actor.h | 9 +- .../include/fdbserver/workloads/ApiWorkload.h | 6 +- .../fdbserver/workloads/BulkSetup.actor.h | 31 +- .../workloads/MetaclusterConsistency.actor.h | 363 ++++ .../workloads/TenantConsistency.actor.h | 229 ++ .../fdbserver/workloads/workloads.actor.h | 2 + fdbserver/masterserver.actor.cpp | 7 - fdbserver/storageserver.actor.cpp | 683 +++--- fdbserver/tester.actor.cpp | 8 +- fdbserver/worker.actor.cpp | 192 +- .../workloads/AtomicSwitchover.actor.cpp | 4 +- .../workloads/BackupCorrectness.actor.cpp | 6 +- fdbserver/workloads/BackupToDBAbort.actor.cpp | 4 +- .../workloads/BackupToDBCorrectness.actor.cpp | 4 +- .../workloads/BackupToDBUpgrade.actor.cpp | 4 +- .../BlobGranuleCorrectnessWorkload.actor.cpp | 26 +- .../BlobGranuleRangesWorkload.actor.cpp | 636 ++++++ .../workloads/BlobGranuleVerifier.actor.cpp | 758 ++++++- fdbserver/workloads/ChangeConfig.actor.cpp | 64 +- .../workloads/ConfigureDatabase.actor.cpp | 1 + .../workloads/ConsistencyCheck.actor.cpp | 3 +- .../DifferentClustersSameRV.actor.cpp | 5 +- .../workloads/EncryptKeyProxyTest.actor.cpp | 2 +- 
fdbserver/workloads/EncryptionOps.actor.cpp | 156 +- .../workloads/FuzzApiCorrectness.actor.cpp | 2 +- fdbserver/workloads/LocalRatekeeper.actor.cpp | 12 +- .../MetaclusterManagementWorkload.actor.cpp | 643 ++++++ fdbserver/workloads/ProtocolVersion.actor.cpp | 2 +- fdbserver/workloads/ReadWrite.actor.cpp | 8 +- fdbserver/workloads/SaveAndKill.actor.cpp | 2 + fdbserver/workloads/SkewedReadWrite.actor.cpp | 7 +- .../SpecialKeySpaceCorrectness.actor.cpp | 45 +- .../TenantEntryCacheWorkload.actor.cpp | 312 +++ ...antManagementConcurrencyWorkload.actor.cpp | 345 +++ .../TenantManagementWorkload.actor.cpp | 734 +++++-- fdbserver/workloads/UnitTests.actor.cpp | 9 + fdbserver/workloads/VersionStamp.actor.cpp | 12 +- fdbserver/workloads/WriteDuringRead.actor.cpp | 6 +- flow/BlobCipher.cpp | 277 ++- flow/CMakeLists.txt | 8 +- flow/DeterministicRandom.cpp | 11 +- flow/EncryptUtils.cpp | 6 +- flow/Knobs.cpp | 97 +- flow/MkCert.cpp | 4 +- flow/Net2.actor.cpp | 296 ++- flow/Platform.actor.cpp | 57 +- flow/ProtocolVersion.cpp | 37 + ...tocolVersion.h => ProtocolVersion.h.cmake} | 175 +- flow/ProtocolVersions.cmake | 90 + flow/TLSConfig.actor.cpp | 74 +- flow/Trace.cpp | 31 +- flow/actorcompiler/ActorCompiler.cs | 2 +- flow/actorcompiler/ParseTree.cs | 2 +- flow/actorcompiler/Program.cs | 2 +- flow/include/flow/BlobCipher.h | 57 +- flow/include/flow/DebugTrace.h | 3 + flow/include/flow/Knobs.h | 3 + flow/include/flow/MkCert.h | 2 +- flow/include/flow/ObjectSerializer.h | 2 +- flow/include/flow/Platform.h | 6 +- flow/include/flow/TLSConfig.actor.h | 30 +- flow/include/flow/Trace.h | 8 +- flow/include/flow/WatchFile.actor.h | 77 + flow/include/flow/error_definitions.h | 21 +- flow/include/flow/genericactors.actor.h | 52 +- flow/include/flow/network.h | 9 +- flow/include/flow/serialize.h | 2 +- packaging/docker/build-images.sh | 53 +- pull_request_template.md | 4 +- tests/CMakeLists.txt | 57 +- tests/TestRunner/binary_download.py | 56 +- tests/TestRunner/local_cluster.py | 118 +- 
tests/TestRunner/tmp_cluster.py | 38 +- tests/TestRunner/upgrade_test.py | 105 +- tests/authorization/admin_server.py | 135 ++ tests/authorization/authz_test.py | 297 +++ tests/authorization/conftest.py | 173 ++ tests/authorization/requirements.txt | 12 + tests/authorization/util.py | 124 ++ tests/fast/AtomicBackupToDBCorrectness.toml | 2 +- tests/fast/BackupAzureBlobCorrectness.toml | 8 +- tests/fast/BackupCorrectness.toml | 2 + tests/fast/BackupCorrectnessClean.toml | 2 + tests/fast/BackupS3BlobCorrectness.toml | 2 + tests/fast/BackupToDBCorrectness.toml | 4 +- tests/fast/BackupToDBCorrectnessClean.toml | 4 +- tests/fast/BlobGranuleMoveVerifyCycle.toml | 6 +- tests/fast/BlobGranuleVerifyAtomicOps.toml | 4 +- tests/fast/BlobGranuleVerifyCycle.toml | 6 +- tests/fast/BlobGranuleVerifySmall.toml | 7 +- tests/fast/BlobGranuleVerifySmallClean.toml | 8 +- tests/fast/ChangeFeedOperations.toml | 3 +- tests/fast/ChangeFeedOperationsMove.toml | 17 +- tests/fast/ChangeFeeds.toml | 1 + tests/fast/EncryptKeyProxyTest.toml | 3 + tests/fast/EncryptionOps.toml | 6 + tests/fast/PhysicalShardMove.toml | 1 + tests/fast/TenantEntryCache.toml | 10 + tests/loopback_cluster/run_custom_cluster.sh | 49 +- tests/rare/BlobGranuleRanges.toml | 38 + .../DrUpgradeRestart-2.txt | 2 +- .../DrUpgradeRestart-1.txt | 0 .../DrUpgradeRestart-2.txt | 2 +- .../from_7.0.0/UpgradeAndBackupRestore-1.toml | 1 + .../from_7.0.0/UpgradeAndBackupRestore-2.toml | 2 + .../from_7.1.0/SnapCycleRestart-1.txt | 1 + .../from_7.1.0/SnapCycleRestart-2.txt | 1 + .../from_7.1.0/SnapIncrementalRestore-1.txt | 1 + .../from_7.1.0/SnapIncrementalRestore-2.txt | 1 + .../from_7.1.0/SnapTestAttrition-1.txt | 1 + .../from_7.1.0/SnapTestAttrition-2.txt | 1 + .../from_7.1.0/SnapTestRestart-1.txt | 1 + .../from_7.1.0/SnapTestRestart-2.txt | 1 + .../from_7.1.0/SnapTestSimpleRestart-1.txt | 1 + .../from_7.1.0/SnapTestSimpleRestart-2.txt | 1 + .../from_7.2.0/DrUpgradeRestart-1.txt | 21 + .../from_7.2.0/DrUpgradeRestart-2.txt | 18 + 
tests/slow/ApiCorrectnessSwitchover.toml | 2 +- tests/slow/BlobGranuleCorrectness.toml | 1 - tests/slow/BlobGranuleCorrectnessClean.toml | 1 - tests/slow/BlobGranuleVerifyBalance.toml | 3 - tests/slow/BlobGranuleVerifyBalanceClean.toml | 4 +- tests/slow/BlobGranuleVerifyLarge.toml | 3 - tests/slow/BlobGranuleVerifyLargeClean.toml | 4 +- tests/slow/DifferentClustersSameRV.toml | 2 +- tests/slow/MetaclusterManagement.toml | 18 + tests/slow/SharedBackupCorrectness.toml | 2 +- tests/slow/SharedBackupToDBCorrectness.toml | 2 +- tests/slow/SwizzledTenantManagement.toml | 12 +- .../SwizzledTenantManagementMetacluster.toml | 40 + tests/slow/TenantManagement.toml | 4 +- tests/slow/TenantManagementConcurrency.toml | 16 + tests/slow/VersionStampBackupToDB.toml | 2 +- tests/slow/VersionStampSwitchover.toml | 2 +- tests/slow/WriteDuringReadSwitchover.toml | 2 +- 367 files changed, 27212 insertions(+), 4712 deletions(-) create mode 100644 bindings/c/test/apitester/TesterBlobGranuleErrorsWorkload.cpp create mode 100644 bindings/c/test/apitester/TesterBlobGranuleUtil.cpp create mode 100644 bindings/c/test/apitester/TesterBlobGranuleUtil.h create mode 100644 bindings/c/test/apitester/TesterExampleWorkload.cpp create mode 100644 bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsMultiThr.toml create mode 100644 bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsOnExternalThread.toml create mode 100644 bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsSingleThr.toml create mode 100644 bindings/c/test/apitester/tests/upgrade/ApiBlobGranulesCorrectness.toml create mode 100644 bindings/java/src/main/com/apple/foundationdb/FutureBool.java create mode 100644 bindings/java/src/main/com/apple/foundationdb/FutureKeyRangeArray.java create mode 100644 bindings/java/src/main/com/apple/foundationdb/KeyRangeArrayResult.java create mode 100644 contrib/TestHarness2/.gitignore create mode 100644 contrib/TestHarness2/test_harness/__init__.py create mode 100644 
contrib/TestHarness2/test_harness/app.py create mode 100644 contrib/TestHarness2/test_harness/config.py create mode 100644 contrib/TestHarness2/test_harness/fdb.py create mode 100644 contrib/TestHarness2/test_harness/joshua.py create mode 100644 contrib/TestHarness2/test_harness/results.py create mode 100644 contrib/TestHarness2/test_harness/run.py create mode 100644 contrib/TestHarness2/test_harness/summarize.py create mode 100644 contrib/TestHarness2/test_harness/test_valgrind_parser.py create mode 100644 contrib/TestHarness2/test_harness/timeout.py create mode 100644 contrib/TestHarness2/test_harness/valgrind.py create mode 100644 contrib/TestHarness2/test_harness/version.py create mode 100644 contrib/observability_splunk_dashboard/details.xml create mode 100644 contrib/observability_splunk_dashboard/performance_overview.xml create mode 100644 contrib/observability_splunk_dashboard/ratekeeper.xml create mode 100644 contrib/observability_splunk_dashboard/recovery.xml create mode 100644 contrib/observability_splunk_dashboard/transaction_latency.xml create mode 100644 contrib/tsan.suppressions create mode 100644 design/dynamic-knobs.md create mode 100644 fdbcli/MetaclusterCommands.actor.cpp create mode 100644 fdbclient/BlobGranuleCommon.cpp create mode 100644 fdbclient/Metacluster.cpp create mode 100644 fdbclient/MetaclusterManagement.actor.cpp create mode 100644 fdbclient/TenantManagement.actor.cpp create mode 100644 fdbclient/azure_backup/README.md rename {fdbserver/include/fdbserver => fdbclient/include/fdbclient}/EncryptKeyProxyInterface.h (100%) rename fdbserver/GetEncryptCipherKeys.actor.cpp => fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h (66%) create mode 100644 fdbclient/include/fdbclient/Metacluster.h create mode 100644 fdbclient/include/fdbclient/MetaclusterManagement.actor.h create mode 100644 fdbrpc/tests/AuthzTlsTest.actor.cpp create mode 100644 fdbrpc/tests/CMakeLists.txt rename fdbserver/{DataDistributionQueue.actor.cpp => 
DDRelocationQueue.actor.cpp} (81%) rename fdbserver/{DataDistributionTracker.actor.cpp => DDShardTracker.actor.cpp} (59%) create mode 100644 fdbserver/include/fdbserver/DDRelocationQueue.h create mode 100644 fdbserver/include/fdbserver/DDShardTracker.h create mode 100644 fdbserver/include/fdbserver/DDSharedContext.h delete mode 100644 fdbserver/include/fdbserver/EncryptedMutationMessage.h create mode 100644 fdbserver/include/fdbserver/EncryptionOpsUtils.h delete mode 100644 fdbserver/include/fdbserver/GetEncryptCipherKeys.h create mode 100644 fdbserver/include/fdbserver/IEncryptionKeyProvider.actor.h create mode 100644 fdbserver/include/fdbserver/TenantEntryCache.actor.h create mode 100644 fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h create mode 100644 fdbserver/include/fdbserver/workloads/TenantConsistency.actor.h create mode 100644 fdbserver/workloads/BlobGranuleRangesWorkload.actor.cpp create mode 100644 fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp create mode 100644 fdbserver/workloads/TenantEntryCacheWorkload.actor.cpp create mode 100644 fdbserver/workloads/TenantManagementConcurrencyWorkload.actor.cpp create mode 100644 flow/ProtocolVersion.cpp rename flow/{include/flow/ProtocolVersion.h => ProtocolVersion.h.cmake} (55%) create mode 100644 flow/ProtocolVersions.cmake create mode 100644 flow/include/flow/WatchFile.actor.h create mode 100644 tests/authorization/admin_server.py create mode 100644 tests/authorization/authz_test.py create mode 100644 tests/authorization/conftest.py create mode 100644 tests/authorization/requirements.txt create mode 100644 tests/authorization/util.py create mode 100644 tests/fast/TenantEntryCache.toml create mode 100644 tests/rare/BlobGranuleRanges.toml rename tests/restarting/{from_6.3.13 => from_6.3.13_until_7.2.0}/DrUpgradeRestart-1.txt (100%) rename tests/restarting/{from_6.3.13 => from_6.3.13_until_7.2.0}/DrUpgradeRestart-2.txt (93%) create mode 100644 
tests/restarting/from_7.2.0/DrUpgradeRestart-1.txt create mode 100644 tests/restarting/from_7.2.0/DrUpgradeRestart-2.txt create mode 100644 tests/slow/MetaclusterManagement.toml create mode 100644 tests/slow/SwizzledTenantManagementMetacluster.toml create mode 100644 tests/slow/TenantManagementConcurrency.toml diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9549297305..35845dbb08 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -20,7 +20,7 @@ If you have questions, we encourage you to engage in discussion on the [communit ## Before you get started ### Community Guidelines -We want the FoundationDB community to be as welcoming and inclusive as possible, and have adopted a [Code of Conduct](CODE_OF_CONDUCT.md) that we ask all community members to read and observe. +We want the FoundationDB community to be as welcoming and inclusive as possible, and have adopted a [Code of Conduct](CODE_OF_CONDUCT.md) that we ask all community members to read and abide by. ### Project Licensing By submitting a pull request, you represent that you have the right to license your contribution to Apple and the community, and agree by submitting the patch that your contributions are licensed under the Apache 2.0 license. @@ -34,7 +34,7 @@ Members of the Apple FoundationDB team are part of the core committers helping r ## Contributing ### Opening a Pull Request -We love pull requests! For minor changes, feel free to open up a PR directly. For larger feature development and any changes that may require community discussion, we ask that you discuss your ideas on the [community forums](https://forums.foundationdb.org) prior to opening a PR, and then reference that thread within your PR comment. Please refer to [FoundationDB Commit Process](https://github.com/apple/foundationdb/wiki/FoundationDB-Commit-Process) for more detailed guidelines. +We love pull requests! For minor changes, feel free to open up a PR directly. 
For larger feature development and any changes that may require community discussion, we ask that you discuss your ideas on the [community forums](https://forums.foundationdb.org) prior to opening a PR, and then reference that thread within your PR comment. Please refer to the [FoundationDB Commit Process](https://github.com/apple/foundationdb/wiki/FoundationDB-Commit-Process) for more detailed guidelines. CI will be run automatically for core committers, and for community PRs it will be initiated by the request of a core committer. Tests can also be run locally via `ctest`, and core committers can run additional validation on pull requests prior to merging them. @@ -46,10 +46,10 @@ To report a security issue, please **DO NOT** start by filing a public issue or ## Project Communication ### Community Forums -We encourage your participation asking questions and helping improve the FoundationDB project. Check out the [FoundationDB community forums](https://forums.foundationdb.org), which serve a similar function as mailing lists in many open source projects. The forums are organized into three sections: +We encourage your participation asking questions and helping improve the FoundationDB project. Check out the [FoundationDB community forums](https://forums.foundationdb.org), which serve a similar function as mailing lists in many open source projects. The forums are organized into three categories: * [Development](https://forums.foundationdb.org/c/development): For discussing the internals and development of the FoundationDB core, as well as layers. -* [Using FoundationDB](https://forums.foundationdb.org/c/using-foundationdb): For discussing user-facing topics. Getting started and have a question? This is the place for you. +* [Using FoundationDB](https://forums.foundationdb.org/c/using-foundationdb): For discussing user-facing topics. Getting started and have a question? This is the category for you. 
* [Site Feedback](https://forums.foundationdb.org/c/site-feedback): A category for discussing the forums and the OSS project, its organization, how it works, and how we can improve it. ### Using GitHub Issues and Community Forums @@ -63,4 +63,4 @@ GitHub Issues should be used for tracking tasks. If you know the specific code t * Implementing an agreed upon feature: *GitHub Issues* ### Project and Development Updates -Stay connected to the project and the community! For project and community updates, follow the [FoundationDB project blog](https://www.foundationdb.org/blog/). Development announcements will be made via the community forums' [dev-announce](https://forums.foundationdb.org/c/development/dev-announce) section. +Stay connected to the project and the community! For project and community updates, follow the [FoundationDB project blog](https://www.foundationdb.org/blog/). Development announcements will be made via the community forums' [dev-announce](https://forums.foundationdb.org/c/development/dev-announce) category. 
diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index 9e864aa509..1f31a8739c 100644 --- a/bindings/c/CMakeLists.txt +++ b/bindings/c/CMakeLists.txt @@ -139,8 +139,12 @@ if(NOT WIN32) test/apitester/TesterTestSpec.cpp test/apitester/TesterTestSpec.h test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp + test/apitester/TesterBlobGranuleErrorsWorkload.cpp + test/apitester/TesterBlobGranuleUtil.cpp + test/apitester/TesterBlobGranuleUtil.h test/apitester/TesterCancelTransactionWorkload.cpp test/apitester/TesterCorrectnessWorkload.cpp + test/apitester/TesterExampleWorkload.cpp test/apitester/TesterKeyValueStore.cpp test/apitester/TesterKeyValueStore.h test/apitester/TesterOptions.h @@ -332,6 +336,24 @@ if(NOT WIN32) @SERVER_CA_FILE@ ) + add_test(NAME fdb_c_upgrade_to_future_version + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml + --upgrade-path "7.2.0" "7.3.0" "7.2.0" + --process-number 3 + ) + set_tests_properties("fdb_c_upgrade_to_future_version" PROPERTIES ENVIRONMENT "${SANITIZER_OPTIONS}") + + add_test(NAME fdb_c_upgrade_to_future_version_blob_granules + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/ApiBlobGranulesCorrectness.toml + --upgrade-path "7.2.0" "7.3.0" "7.2.0" + --blob-granules-enabled + --process-number 3 + ) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT USE_SANITIZER) add_test(NAME fdb_c_upgrade_single_threaded_630api COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py @@ -439,7 +461,7 @@ if (OPEN_FOR_IDE) target_link_libraries(fdb_c_shim_lib_tester PRIVATE fdb_c_shim SimpleOpt fdb_cpp Threads::Threads) target_include_directories(fdb_c_shim_lib_tester PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/ 
${CMAKE_SOURCE_DIR}/flow/include) -elseif(NOT WIN32 AND NOT APPLE AND NOT USE_UBSAN) # Linux Only, non-ubsan only +elseif(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-santizer only set(SHIM_LIB_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}) @@ -465,7 +487,7 @@ elseif(NOT WIN32 AND NOT APPLE AND NOT USE_UBSAN) # Linux Only, non-ubsan only DEPENDS ${IMPLIBSO_SRC} COMMENT "Generating source code for C shim library") - add_library(fdb_c_shim SHARED ${SHIM_LIB_GEN_SRC} foundationdb/fdb_c_shim.h fdb_c_shim.cpp) + add_library(fdb_c_shim STATIC ${SHIM_LIB_GEN_SRC} foundationdb/fdb_c_shim.h fdb_c_shim.cpp) target_link_options(fdb_c_shim PRIVATE "LINKER:--version-script=${CMAKE_CURRENT_SOURCE_DIR}/fdb_c.map,-z,nodelete,-z,noexecstack") target_link_libraries(fdb_c_shim PUBLIC dl) target_include_directories(fdb_c_shim PUBLIC @@ -492,7 +514,7 @@ elseif(NOT WIN32 AND NOT APPLE AND NOT USE_UBSAN) # Linux Only, non-ubsan only --api-test-dir ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests ) -endif() # End Linux only, non-ubsan only +endif() # End Linux only, non-sanitizer only # TODO: re-enable once the old vcxproj-based build system is removed. 
#generate_export_header(fdb_c EXPORT_MACRO_NAME "DLLEXPORT" @@ -537,7 +559,7 @@ fdb_install( DESTINATION_SUFFIX "/cmake/${targets_export_name}" COMPONENT clients) -if(NOT WIN32 AND NOT APPLE AND NOT USE_UBSAN) # Linux Only, non-ubsan only +if(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-sanitizer only fdb_install( FILES foundationdb/fdb_c_shim.h diff --git a/bindings/c/fdb_c.cpp b/bindings/c/fdb_c.cpp index c97604b98c..bc16cbf1a1 100644 --- a/bindings/c/fdb_c.cpp +++ b/bindings/c/fdb_c.cpp @@ -79,9 +79,10 @@ extern "C" DLLEXPORT fdb_bool_t fdb_error_predicate(int predicate_test, fdb_erro if (predicate_test == FDBErrorPredicates::RETRYABLE_NOT_COMMITTED) { return code == error_code_not_committed || code == error_code_transaction_too_old || code == error_code_future_version || code == error_code_database_locked || - code == error_code_proxy_memory_limit_exceeded || code == error_code_batch_transaction_throttled || - code == error_code_process_behind || code == error_code_tag_throttled || - code == error_code_unknown_tenant; + code == error_code_grv_proxy_memory_limit_exceeded || + code == error_code_commit_proxy_memory_limit_exceeded || + code == error_code_batch_transaction_throttled || code == error_code_process_behind || + code == error_code_tag_throttled || code == error_code_unknown_tenant; } return false; } @@ -238,6 +239,10 @@ fdb_error_t fdb_future_get_version_v619(FDBFuture* f, int64_t* out_version) { CATCH_AND_RETURN(*out_version = TSAV(Version, f)->get();); } +extern "C" DLLEXPORT fdb_error_t fdb_future_get_bool(FDBFuture* f, fdb_bool_t* out_value) { + CATCH_AND_RETURN(*out_value = TSAV(bool, f)->get();); +} + extern "C" DLLEXPORT fdb_error_t fdb_future_get_int64(FDBFuture* f, int64_t* out_value) { CATCH_AND_RETURN(*out_value = TSAV(int64_t, f)->get();); } @@ -493,6 +498,54 @@ extern "C" DLLEXPORT FDBFuture* fdb_database_wait_purge_granules_complete(FDBDat FDBFuture*)(DB(db)->waitPurgeGranulesComplete(StringRef(purge_key_name, 
purge_key_name_length)).extractPtr()); } +extern "C" DLLEXPORT FDBFuture* fdb_database_blobbify_range(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length) { + return (FDBFuture*)(DB(db) + ->blobbifyRange(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length), + StringRef(end_key_name, end_key_name_length))) + .extractPtr()); +} + +extern "C" DLLEXPORT FDBFuture* fdb_database_unblobbify_range(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length) { + return (FDBFuture*)(DB(db) + ->unblobbifyRange(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length), + StringRef(end_key_name, end_key_name_length))) + .extractPtr()); +} + +extern "C" DLLEXPORT FDBFuture* fdb_database_list_blobbified_ranges(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int rangeLimit) { + return (FDBFuture*)(DB(db) + ->listBlobbifiedRanges(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length), + StringRef(end_key_name, end_key_name_length)), + rangeLimit) + .extractPtr()); +} + +extern "C" DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_verify_blob_range(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t version) { + return (FDBFuture*)(DB(db) + ->verifyBlobRange(KeyRangeRef(StringRef(begin_key_name, begin_key_name_length), + StringRef(end_key_name, end_key_name_length)), + version) + .extractPtr()); +} + extern "C" DLLEXPORT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant, FDBTransaction** out_transaction) { CATCH_AND_RETURN(*out_transaction = (FDBTransaction*)TENANT(tenant)->createTransaction().extractPtr();); } @@ -855,11 +908,12 @@ extern "C" DLLEXPORT FDBFuture* fdb_transaction_get_blob_granule_ranges(FDBTrans 
uint8_t const* begin_key_name, int begin_key_name_length, uint8_t const* end_key_name, - int end_key_name_length) { + int end_key_name_length, + int rangeLimit) { RETURN_FUTURE_ON_ERROR( Standalone>, KeyRangeRef range(KeyRef(begin_key_name, begin_key_name_length), KeyRef(end_key_name, end_key_name_length)); - return (FDBFuture*)(TXN(tr)->getBlobGranuleRanges(range).extractPtr());); + return (FDBFuture*)(TXN(tr)->getBlobGranuleRanges(range, rangeLimit).extractPtr());); } extern "C" DLLEXPORT FDBResult* fdb_transaction_read_blob_granules(FDBTransaction* tr, @@ -889,6 +943,57 @@ extern "C" DLLEXPORT FDBResult* fdb_transaction_read_blob_granules(FDBTransactio return (FDBResult*)(TXN(tr)->readBlobGranules(range, beginVersion, rv, context).extractPtr());); } +extern "C" DLLEXPORT FDBFuture* fdb_transaction_read_blob_granules_start(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + int64_t* readVersionOut) { + Optional rv; + if (readVersion != latestVersion) { + rv = readVersion; + } + return (FDBFuture*)(TXN(tr) + ->readBlobGranulesStart(KeyRangeRef(KeyRef(begin_key_name, begin_key_name_length), + KeyRef(end_key_name, end_key_name_length)), + beginVersion, + rv, + readVersionOut) + .extractPtr()); +} + +extern "C" DLLEXPORT FDBResult* fdb_transaction_read_blob_granules_finish(FDBTransaction* tr, + FDBFuture* f, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + FDBReadBlobGranuleContext* granule_context) { + // FIXME: better way to convert? 
+ ReadBlobGranuleContext context; + context.userContext = granule_context->userContext; + context.start_load_f = granule_context->start_load_f; + context.get_load_f = granule_context->get_load_f; + context.free_load_f = granule_context->free_load_f; + context.debugNoMaterialize = granule_context->debugNoMaterialize; + context.granuleParallelism = granule_context->granuleParallelism; + ThreadFuture>> startFuture( + TSAV(Standalone>, f)); + + return (FDBResult*)(TXN(tr) + ->readBlobGranulesFinish(startFuture, + KeyRangeRef(KeyRef(begin_key_name, begin_key_name_length), + KeyRef(end_key_name, end_key_name_length)), + beginVersion, + readVersion, + context) + .extractPtr()); +} + #include "fdb_c_function_pointers.g.h" #define FDB_API_CHANGED(func, ver) \ @@ -964,6 +1069,10 @@ extern "C" DLLEXPORT const char* fdb_get_client_version() { return API->getClientVersion(); } +extern "C" DLLEXPORT void fdb_use_future_protocol_version() { + API->useFutureProtocolVersion(); +} + #if defined(__APPLE__) #include __attribute__((constructor)) static void initialize() { diff --git a/bindings/c/foundationdb/fdb_c.h b/bindings/c/foundationdb/fdb_c.h index 409fd8ef55..10534a94dc 100644 --- a/bindings/c/foundationdb/fdb_c.h +++ b/bindings/c/foundationdb/fdb_c.h @@ -227,6 +227,8 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_set_callback(FDBFuture* f, DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_error(FDBFuture* f); #endif +DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_bool(FDBFuture* f, fdb_bool_t* out); + DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_int64(FDBFuture* f, int64_t* out); DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_uint64(FDBFuture* f, uint64_t* out); @@ -321,6 +323,32 @@ DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_wait_purge_granules_complet uint8_t const* purge_key_name, int purge_key_name_length); +DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_blobbify_range(FDBDatabase* db, + uint8_t const* 
begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length); + +DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_unblobbify_range(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length); + +DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_list_blobbified_ranges(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int rangeLimit); + +DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_verify_blob_range(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t version); + DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant, FDBTransaction** out_transaction); @@ -479,7 +507,8 @@ DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_blob_granule_ranges( uint8_t const* begin_key_name, int begin_key_name_length, uint8_t const* end_key_name, - int end_key_name_length); + int end_key_name_length, + int rangeLimit); /* LatestVersion (-2) for readVersion means get read version from transaction Separated out as optional because BG reads can support longer-lived reads than normal FDB transactions */ diff --git a/bindings/c/foundationdb/fdb_c_internal.h b/bindings/c/foundationdb/fdb_c_internal.h index 2b1a2163c7..62b77f354e 100644 --- a/bindings/c/foundationdb/fdb_c_internal.h +++ b/bindings/c/foundationdb/fdb_c_internal.h @@ -49,6 +49,29 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_shared_state(FDBFuture* DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_create_database_from_connection_string(const char* connection_string, FDBDatabase** out_database); +DLLEXPORT void fdb_use_future_protocol_version(); + +// the logical read_blob_granules is broken out (at different points depending on the client type) into the asynchronous 
+// start() that happens on the fdb network thread, and synchronous finish() that happens off it +DLLEXPORT FDBFuture* fdb_transaction_read_blob_granules_start(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + int64_t* readVersionOut); + +DLLEXPORT FDBResult* fdb_transaction_read_blob_granules_finish(FDBTransaction* tr, + FDBFuture* f, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + FDBReadBlobGranuleContext* granuleContext); + #ifdef __cplusplus } #endif diff --git a/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp index 52d8ddc651..e1af440f09 100644 --- a/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp +++ b/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp @@ -18,61 +18,13 @@ * limitations under the License. 
*/ #include "TesterApiWorkload.h" +#include "TesterBlobGranuleUtil.h" #include "TesterUtil.h" #include #include namespace FdbApiTester { -class TesterGranuleContext { -public: - std::unordered_map loadsInProgress; - int64_t nextId = 0; - std::string basePath; - - ~TesterGranuleContext() { - // if there was an error or not all loads finished, delete data - for (auto& it : loadsInProgress) { - uint8_t* dataToFree = it.second; - delete[] dataToFree; - } - } -}; - -static int64_t granule_start_load(const char* filename, - int filenameLength, - int64_t offset, - int64_t length, - int64_t fullFileLength, - void* context) { - - TesterGranuleContext* ctx = (TesterGranuleContext*)context; - int64_t loadId = ctx->nextId++; - - uint8_t* buffer = new uint8_t[length]; - std::ifstream fin(ctx->basePath + std::string(filename, filenameLength), std::ios::in | std::ios::binary); - fin.seekg(offset); - fin.read((char*)buffer, length); - - ctx->loadsInProgress.insert({ loadId, buffer }); - - return loadId; -} - -static uint8_t* granule_get_load(int64_t loadId, void* context) { - TesterGranuleContext* ctx = (TesterGranuleContext*)context; - return ctx->loadsInProgress.at(loadId); -} - -static void granule_free_load(int64_t loadId, void* context) { - TesterGranuleContext* ctx = (TesterGranuleContext*)context; - auto it = ctx->loadsInProgress.find(loadId); - uint8_t* dataToFree = it->second; - delete[] dataToFree; - - ctx->loadsInProgress.erase(it); -} - class ApiBlobGranuleCorrectnessWorkload : public ApiWorkload { public: ApiBlobGranuleCorrectnessWorkload(const WorkloadConfig& config) : ApiWorkload(config) { @@ -80,9 +32,12 @@ public: if (Random::get().randomInt(0, 1) == 0) { excludedOpTypes.push_back(OP_CLEAR_RANGE); } + // FIXME: remove! this bug is fixed in another PR + excludedOpTypes.push_back(OP_GET_RANGES); } private: + // FIXME: use other new blob granule apis! 
enum OpType { OP_INSERT, OP_CLEAR, OP_CLEAR_RANGE, OP_READ, OP_GET_RANGES, OP_LAST = OP_GET_RANGES }; std::vector excludedOpTypes; @@ -101,16 +56,8 @@ private: execTransaction( [this, begin, end, results, tooOld](auto ctx) { ctx->tx().setOption(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE); - TesterGranuleContext testerContext; - testerContext.basePath = ctx->getBGBasePath(); - - fdb::native::FDBReadBlobGranuleContext granuleContext; - granuleContext.userContext = &testerContext; - granuleContext.debugNoMaterialize = false; - granuleContext.granuleParallelism = 1; - granuleContext.start_load_f = &granule_start_load; - granuleContext.get_load_f = &granule_get_load; - granuleContext.free_load_f = &granule_free_load; + TesterGranuleContext testerContext(ctx->getBGBasePath()); + fdb::native::FDBReadBlobGranuleContext granuleContext = createGranuleContext(&testerContext); fdb::Result res = ctx->tx().readBlobGranules( begin, end, 0 /* beginVersion */, -2 /* latest read version */, granuleContext); @@ -124,8 +71,10 @@ private: } else if (err.code() != error_code_success) { ctx->onError(err); } else { - auto& [out_kv, out_count, out_more] = out; + auto resCopy = copyKeyValueArray(out); + auto& [resVector, out_more] = resCopy; ASSERT(!out_more); + results.get()->assign(resVector.begin(), resVector.end()); if (!seenReadSuccess) { info("BlobGranuleCorrectness::randomReadOp first success\n"); } @@ -178,7 +127,7 @@ private: } execTransaction( [begin, end, results](auto ctx) { - fdb::Future f = ctx->tx().getBlobGranuleRanges(begin, end).eraseType(); + fdb::Future f = ctx->tx().getBlobGranuleRanges(begin, end, 1000).eraseType(); ctx->continueAfter( f, [ctx, f, results]() { @@ -196,11 +145,25 @@ private: for (int i = 0; i < results->size(); i++) { // no empty or inverted ranges + if ((*results)[i].beginKey >= (*results)[i].endKey) { + error(fmt::format("Empty/inverted range [{0} - {1}) for getBlobGranuleRanges({2} - {3})", + fdb::toCharsRef((*results)[i].beginKey), + 
fdb::toCharsRef((*results)[i].endKey), + fdb::toCharsRef(begin), + fdb::toCharsRef(end))); + } ASSERT((*results)[i].beginKey < (*results)[i].endKey); } for (int i = 1; i < results->size(); i++) { // ranges contain entire requested key range + if ((*results)[i].beginKey != (*results)[i - 1].endKey) { + error(fmt::format("Non-contiguous range [{0} - {1}) for getBlobGranuleRanges({2} - {3})", + fdb::toCharsRef((*results)[i].beginKey), + fdb::toCharsRef((*results)[i].endKey), + fdb::toCharsRef(begin), + fdb::toCharsRef(end))); + } ASSERT((*results)[i].beginKey == (*results)[i - 1].endKey); } diff --git a/bindings/c/test/apitester/TesterBlobGranuleErrorsWorkload.cpp b/bindings/c/test/apitester/TesterBlobGranuleErrorsWorkload.cpp new file mode 100644 index 0000000000..7bb879a185 --- /dev/null +++ b/bindings/c/test/apitester/TesterBlobGranuleErrorsWorkload.cpp @@ -0,0 +1,145 @@ +/* + * TesterBlobGranuleErrorsWorkload.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +#include "TesterApiWorkload.h" +#include "TesterBlobGranuleUtil.h" +#include "TesterUtil.h" +#include +#include + +namespace FdbApiTester { + +class BlobGranuleErrorsWorkload : public ApiWorkload { +public: + BlobGranuleErrorsWorkload(const WorkloadConfig& config) : ApiWorkload(config) {} + +private: + enum OpType { + OP_READ_NO_MATERIALIZE, + OP_READ_FILE_LOAD_ERROR, + OP_READ_TOO_OLD, + OP_CANCEL_RANGES, + OP_LAST = OP_CANCEL_RANGES + }; + + // Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet + // FIXME: should still guarantee a read succeeds eventually somehow + bool seenReadSuccess = false; + + void doErrorOp(TTaskFct cont, + std::string basePathAddition, + bool doMaterialize, + int64_t readVersion, + fdb::native::fdb_error_t expectedError) { + fdb::Key begin = randomKeyName(); + fdb::Key end = begin; + // [K - K) empty range will succeed read because there is trivially nothing to do, so don't do it + while (end == begin) { + end = randomKeyName(); + } + if (begin > end) { + std::swap(begin, end); + } + + execTransaction( + [this, begin, end, basePathAddition, doMaterialize, readVersion, expectedError](auto ctx) { + ctx->tx().setOption(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE); + + TesterGranuleContext testerContext(ctx->getBGBasePath() + basePathAddition); + fdb::native::FDBReadBlobGranuleContext granuleContext = createGranuleContext(&testerContext); + granuleContext.debugNoMaterialize = !doMaterialize; + + fdb::Result res = + ctx->tx().readBlobGranules(begin, end, 0 /* beginVersion */, readVersion, granuleContext); + auto out = fdb::Result::KeyValueRefArray{}; + fdb::Error err = res.getKeyValueArrayNothrow(out); + + if (err.code() == error_code_success) { + error(fmt::format("Operation succeeded in error test!")); + } + ASSERT(err.code() != error_code_success); + if (err.code() != error_code_blob_granule_transaction_too_old) { + seenReadSuccess = true; + } + if (err.code() != expectedError) { + 
info(fmt::format("incorrect error. Expected {}, Got {}", expectedError, err.code())); + if (err.code() == error_code_blob_granule_transaction_too_old) { + ASSERT(!seenReadSuccess); + ctx->done(); + } else { + ctx->onError(err); + } + } else { + ctx->done(); + } + }, + [this, cont]() { schedule(cont); }); + } + + void randomOpReadNoMaterialize(TTaskFct cont) { + // ensure setting noMaterialize flag produces blob_granule_not_materialized + doErrorOp(cont, "", false, -2 /*latest read version */, error_code_blob_granule_not_materialized); + } + + void randomOpReadFileLoadError(TTaskFct cont) { + // point to a file path that doesn't exist by adding an extra suffix + doErrorOp(cont, "extrapath/", true, -2 /*latest read version */, error_code_blob_granule_file_load_error); + } + + void randomOpReadTooOld(TTaskFct cont) { + // read at a version (1) that should predate granule data + doErrorOp(cont, "", true, 1, error_code_blob_granule_transaction_too_old); + } + + void randomCancelGetRangesOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execTransaction( + [begin, end](auto ctx) { + fdb::Future f = ctx->tx().getBlobGranuleRanges(begin, end, 1000).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomOperation(TTaskFct cont) override { + OpType txType = (OpType)Random::get().randomInt(0, OP_LAST); + switch (txType) { + case OP_READ_NO_MATERIALIZE: + randomOpReadNoMaterialize(cont); + break; + case OP_READ_FILE_LOAD_ERROR: + randomOpReadFileLoadError(cont); + break; + case OP_READ_TOO_OLD: + randomOpReadTooOld(cont); + break; + case OP_CANCEL_RANGES: + randomCancelGetRangesOp(cont); + break; + } + } +}; + +WorkloadFactory BlobGranuleErrorsWorkloadFactory("BlobGranuleErrors"); + +} // namespace FdbApiTester diff --git a/bindings/c/test/apitester/TesterBlobGranuleUtil.cpp b/bindings/c/test/apitester/TesterBlobGranuleUtil.cpp new file mode 100644
index 0000000000..a908a9c0bf --- /dev/null +++ b/bindings/c/test/apitester/TesterBlobGranuleUtil.cpp @@ -0,0 +1,80 @@ +/* + * TesterBlobGranuleUtil.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TesterBlobGranuleUtil.h" +#include "TesterUtil.h" +#include + +namespace FdbApiTester { + +// FIXME: avoid duplicating this between files! 
+static int64_t granule_start_load(const char* filename, + int filenameLength, + int64_t offset, + int64_t length, + int64_t fullFileLength, + void* context) { + + TesterGranuleContext* ctx = (TesterGranuleContext*)context; + int64_t loadId = ctx->nextId++; + + uint8_t* buffer = new uint8_t[length]; + std::ifstream fin(ctx->basePath + std::string(filename, filenameLength), std::ios::in | std::ios::binary); + if (fin.fail()) { + delete[] buffer; + buffer = nullptr; + } else { + fin.seekg(offset); + fin.read((char*)buffer, length); + } + + ctx->loadsInProgress.insert({ loadId, buffer }); + + return loadId; +} + +static uint8_t* granule_get_load(int64_t loadId, void* context) { + TesterGranuleContext* ctx = (TesterGranuleContext*)context; + return ctx->loadsInProgress.at(loadId); +} + +static void granule_free_load(int64_t loadId, void* context) { + TesterGranuleContext* ctx = (TesterGranuleContext*)context; + auto it = ctx->loadsInProgress.find(loadId); + uint8_t* dataToFree = it->second; + delete[] dataToFree; + + ctx->loadsInProgress.erase(it); +} + +fdb::native::FDBReadBlobGranuleContext createGranuleContext(const TesterGranuleContext* testerContext) { + fdb::native::FDBReadBlobGranuleContext granuleContext; + + granuleContext.userContext = (void*)testerContext; + granuleContext.debugNoMaterialize = false; + granuleContext.granuleParallelism = 1 + Random::get().randomInt(0, 3); + granuleContext.start_load_f = &granule_start_load; + granuleContext.get_load_f = &granule_get_load; + granuleContext.free_load_f = &granule_free_load; + + return granuleContext; +} + +} // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterBlobGranuleUtil.h b/bindings/c/test/apitester/TesterBlobGranuleUtil.h new file mode 100644 index 0000000000..7b4b0dba81 --- /dev/null +++ b/bindings/c/test/apitester/TesterBlobGranuleUtil.h @@ -0,0 +1,49 @@ +/* + * TesterBlobGranuleUtil.h + * + * This source file is part of the FoundationDB open source 
project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifndef APITESTER_BLOBGRANULE_UTIL_H +#define APITESTER_BLOBGRANULE_UTIL_H +#include "TesterUtil.h" +#include "test/fdb_api.hpp" +#include + +namespace FdbApiTester { + +class TesterGranuleContext { +public: + std::unordered_map loadsInProgress; + std::string basePath; + int64_t nextId; + + TesterGranuleContext(const std::string& basePath) : basePath(basePath), nextId(0) {} + + ~TesterGranuleContext() { + // this should now never happen with proper memory management + ASSERT(loadsInProgress.empty()); + } +}; + +fdb::native::FDBReadBlobGranuleContext createGranuleContext(const TesterGranuleContext* testerContext); + +} // namespace FdbApiTester + +#endif diff --git a/bindings/c/test/apitester/TesterExampleWorkload.cpp b/bindings/c/test/apitester/TesterExampleWorkload.cpp new file mode 100644 index 0000000000..3765dc50fb --- /dev/null +++ b/bindings/c/test/apitester/TesterExampleWorkload.cpp @@ -0,0 +1,65 @@ +/* + * TesterExampleWorkload.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TesterWorkload.h" +#include "TesterUtil.h" + +namespace FdbApiTester { + +class SetAndGetWorkload : public WorkloadBase { +public: + fdb::Key keyPrefix; + Random random; + + SetAndGetWorkload(const WorkloadConfig& config) : WorkloadBase(config) { + keyPrefix = fdb::toBytesRef(fmt::format("{}/", workloadId)); + } + + void start() override { setAndGet(NO_OP_TASK); } + + void setAndGet(TTaskFct cont) { + fdb::Key key = keyPrefix + random.randomStringLowerCase(10, 100); + fdb::Value value = random.randomStringLowerCase(10, 1000); + execTransaction( + [key, value](auto ctx) { + ctx->tx().set(key, value); + ctx->commit(); + }, + [this, key, value, cont]() { + execTransaction( + [this, key, value](auto ctx) { + auto future = ctx->tx().get(key, false); + ctx->continueAfter(future, [this, ctx, future, value]() { + std::optional res = copyValueRef(future.get()); + if (res != value) { + error(fmt::format( + "expected: {} actual: {}", fdb::toCharsRef(value), fdb::toCharsRef(res.value_or(fdb::Value())))); + } + ctx->done(); + }); + }, + cont); + }); + } +}; + +WorkloadFactory SetAndGetWorkloadFactory("SetAndGet"); + +} // namespace FdbApiTester diff --git a/bindings/c/test/apitester/TesterOptions.h b/bindings/c/test/apitester/TesterOptions.h index 3ff57ec183..1160b696b0 100644 --- a/bindings/c/test/apitester/TesterOptions.h +++ b/bindings/c/test/apitester/TesterOptions.h @@ -38,6 +38,7 @@ public: std::string logGroup; std::string externalClientLibrary; std::string externalClientDir; + std::string futureVersionClientLibrary; std::string tmpDir; bool
disableLocalClient = false; std::string testFile; diff --git a/bindings/c/test/apitester/TesterWorkload.cpp b/bindings/c/test/apitester/TesterWorkload.cpp index cbce118f10..6cdfacc423 100644 --- a/bindings/c/test/apitester/TesterWorkload.cpp +++ b/bindings/c/test/apitester/TesterWorkload.cpp @@ -165,8 +165,11 @@ void WorkloadManager::add(std::shared_ptr workload, TTaskFct cont) { void WorkloadManager::run() { std::vector> initialWorkloads; - for (auto iter : workloads) { - initialWorkloads.push_back(iter.second.ref); + { + std::unique_lock lock(mutex); + for (auto iter : workloads) { + initialWorkloads.push_back(iter.second.ref); + } } for (auto iter : initialWorkloads) { iter->init(this); @@ -324,4 +327,4 @@ std::unordered_map& IWorkloadFactory::factories( return theFactories; } -} // namespace FdbApiTester \ No newline at end of file +} // namespace FdbApiTester diff --git a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsMultiThr.toml b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsMultiThr.toml new file mode 100644 index 0000000000..788bd04d85 --- /dev/null +++ b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsMultiThr.toml @@ -0,0 +1,22 @@ +[[test]] +title = 'Blob Granule Errors Multi Threaded' +multiThreaded = true +buggify = true +minFdbThreads = 2 +maxFdbThreads = 8 +minDatabases = 2 +maxDatabases = 8 +minClientThreads = 2 +maxClientThreads = 8 +minClients = 2 +maxClients = 8 + + [[test.workload]] + name = 'BlobGranuleErrors' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 \ No newline at end of file diff --git a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsOnExternalThread.toml b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsOnExternalThread.toml new file mode 100644 index 0000000000..788bd04d85 --- /dev/null +++ 
b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsOnExternalThread.toml @@ -0,0 +1,22 @@ +[[test]] +title = 'Blob Granule Errors Multi Threaded' +multiThreaded = true +buggify = true +minFdbThreads = 2 +maxFdbThreads = 8 +minDatabases = 2 +maxDatabases = 8 +minClientThreads = 2 +maxClientThreads = 8 +minClients = 2 +maxClients = 8 + + [[test.workload]] + name = 'BlobGranuleErrors' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 \ No newline at end of file diff --git a/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsSingleThr.toml b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsSingleThr.toml new file mode 100644 index 0000000000..85e78975f6 --- /dev/null +++ b/bindings/c/test/apitester/blobgranuletests/CApiBlobGranuleErrorsSingleThr.toml @@ -0,0 +1,15 @@ +[[test]] +title = 'Blob Granule Errors Single Threaded' +minClients = 1 +maxClients = 3 +multiThreaded = false + + [[test.workload]] + name = 'BlobGranuleErrors' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 \ No newline at end of file diff --git a/bindings/c/test/apitester/fdb_c_api_tester.cpp b/bindings/c/test/apitester/fdb_c_api_tester.cpp index 62b6af6dd4..310ebd9b83 100644 --- a/bindings/c/test/apitester/fdb_c_api_tester.cpp +++ b/bindings/c/test/apitester/fdb_c_api_tester.cpp @@ -46,6 +46,7 @@ enum TesterOptionId { OPT_KNOB, OPT_EXTERNAL_CLIENT_LIBRARY, OPT_EXTERNAL_CLIENT_DIRECTORY, + OPT_FUTURE_VERSION_CLIENT_LIBRARY, OPT_TMP_DIR, OPT_DISABLE_LOCAL_CLIENT, OPT_TEST_FILE, @@ -72,6 +73,7 @@ CSimpleOpt::SOption TesterOptionDefs[] = // { OPT_KNOB, "--knob-", SO_REQ_SEP }, { OPT_EXTERNAL_CLIENT_LIBRARY, "--external-client-library", SO_REQ_SEP }, { OPT_EXTERNAL_CLIENT_DIRECTORY, "--external-client-dir", SO_REQ_SEP }, + { 
OPT_FUTURE_VERSION_CLIENT_LIBRARY, "--future-version-client-library", SO_REQ_SEP }, { OPT_TMP_DIR, "--tmp-dir", SO_REQ_SEP }, { OPT_DISABLE_LOCAL_CLIENT, "--disable-local-client", SO_NONE }, { OPT_TEST_FILE, "-f", SO_REQ_SEP }, @@ -110,6 +112,8 @@ void printProgramUsage(const char* execName) { " Path to the external client library.\n" " --external-client-dir DIR\n" " Directory containing external client libraries.\n" + " --future-version-client-library FILE\n" + " Path to a client library to be used with a future protocol version.\n" " --tmp-dir DIR\n" " Directory for temporary files of the client.\n" " --disable-local-client DIR\n" @@ -204,6 +208,9 @@ bool processArg(TesterOptions& options, const CSimpleOpt& args) { case OPT_EXTERNAL_CLIENT_DIRECTORY: options.externalClientDir = args.OptionArg(); break; + case OPT_FUTURE_VERSION_CLIENT_LIBRARY: + options.futureVersionClientLibrary = args.OptionArg(); + break; case OPT_TMP_DIR: options.tmpDir = args.OptionArg(); break; @@ -296,6 +303,11 @@ void applyNetworkOptions(TesterOptions& options) { } } + if (!options.futureVersionClientLibrary.empty()) { + fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_FUTURE_VERSION_CLIENT_LIBRARY, + options.futureVersionClientLibrary); + } + if (options.testSpec.multiThreaded) { fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_THREADS_PER_VERSION, options.numFdbThreads); } diff --git a/bindings/c/test/apitester/tests/upgrade/ApiBlobGranulesCorrectness.toml b/bindings/c/test/apitester/tests/upgrade/ApiBlobGranulesCorrectness.toml new file mode 100644 index 0000000000..84531ea9c8 --- /dev/null +++ b/bindings/c/test/apitester/tests/upgrade/ApiBlobGranulesCorrectness.toml @@ -0,0 +1,23 @@ +[[test]] +title = 'Mixed Workload for Upgrade Tests with a Multi-Threaded Client' +multiThreaded = true +buggify = true +databasePerTransaction = false +minFdbThreads = 2 +maxFdbThreads = 8 +minDatabases = 2 +maxDatabases = 8 +minClientThreads = 2 +maxClientThreads = 8 +minClients = 
2 +maxClients = 8 + + [[test.workload]] + name = 'ApiBlobGranuleCorrectness' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + runUntilStop = true \ No newline at end of file diff --git a/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml b/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml index 86e65c5918..94bf4e0509 100644 --- a/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml +++ b/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml @@ -32,4 +32,14 @@ maxClients = 8 maxKeysPerTransaction = 50 initialSize = 100 runUntilStop = true - readExistingKeysRatio = 0.9 \ No newline at end of file + readExistingKeysRatio = 0.9 + + [[test.workload]] + name = 'AtomicOpsCorrectness' + initialSize = 0 + runUntilStop = true + + [[test.workload]] + name = 'WatchAndWait' + initialSize = 0 + runUntilStop = true \ No newline at end of file diff --git a/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml b/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml index 42df76521b..daf070b31b 100644 --- a/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml +++ b/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml @@ -30,4 +30,14 @@ maxClients = 8 maxKeysPerTransaction = 50 initialSize = 100 runUntilStop = true - readExistingKeysRatio = 0.9 \ No newline at end of file + readExistingKeysRatio = 0.9 + + [[test.workload]] + name = 'AtomicOpsCorrectness' + initialSize = 0 + runUntilStop = true + + [[test.workload]] + name = 'WatchAndWait' + initialSize = 0 + runUntilStop = true \ No newline at end of file diff --git a/bindings/c/test/fdb_api.hpp b/bindings/c/test/fdb_api.hpp index bee40981c3..6d0db008a2 100644 --- a/bindings/c/test/fdb_api.hpp +++ b/bindings/c/test/fdb_api.hpp @@ -559,9 +559,9 @@ public: reverse); } - TypedFuture getBlobGranuleRanges(KeyRef 
begin, KeyRef end) { + TypedFuture getBlobGranuleRanges(KeyRef begin, KeyRef end, int rangeLimit) { return native::fdb_transaction_get_blob_granule_ranges( - tr.get(), begin.data(), intSize(begin), end.data(), intSize(end)); + tr.get(), begin.data(), intSize(begin), end.data(), intSize(end), rangeLimit); } Result readBlobGranules(KeyRef begin, diff --git a/bindings/c/test/mako/blob_granules.cpp b/bindings/c/test/mako/blob_granules.cpp index af805f2e56..1071737211 100644 --- a/bindings/c/test/mako/blob_granules.cpp +++ b/bindings/c/test/mako/blob_granules.cpp @@ -26,6 +26,9 @@ extern thread_local mako::Logger logr; +// FIXME: use the same implementation as the api tester! this implementation was from back when mako was written in C +// and is inferior. + namespace mako::blob_granules::local_file { int64_t startLoad(const char* filename, diff --git a/bindings/c/test/unit/fdb_api.cpp b/bindings/c/test/unit/fdb_api.cpp index d3c1dec30d..d454082af3 100644 --- a/bindings/c/test/unit/fdb_api.cpp +++ b/bindings/c/test/unit/fdb_api.cpp @@ -356,9 +356,15 @@ fdb_error_t Transaction::add_conflict_range(std::string_view begin_key, tr_, (const uint8_t*)begin_key.data(), begin_key.size(), (const uint8_t*)end_key.data(), end_key.size(), type); } -KeyRangeArrayFuture Transaction::get_blob_granule_ranges(std::string_view begin_key, std::string_view end_key) { - return KeyRangeArrayFuture(fdb_transaction_get_blob_granule_ranges( - tr_, (const uint8_t*)begin_key.data(), begin_key.size(), (const uint8_t*)end_key.data(), end_key.size())); +KeyRangeArrayFuture Transaction::get_blob_granule_ranges(std::string_view begin_key, + std::string_view end_key, + int rangeLimit) { + return KeyRangeArrayFuture(fdb_transaction_get_blob_granule_ranges(tr_, + (const uint8_t*)begin_key.data(), + begin_key.size(), + (const uint8_t*)end_key.data(), + end_key.size(), + rangeLimit)); } KeyValueArrayResult Transaction::read_blob_granules(std::string_view begin_key, std::string_view end_key, diff --git 
a/bindings/c/test/unit/fdb_api.hpp b/bindings/c/test/unit/fdb_api.hpp index 7d44a30a9a..d0c4abd8db 100644 --- a/bindings/c/test/unit/fdb_api.hpp +++ b/bindings/c/test/unit/fdb_api.hpp @@ -348,7 +348,7 @@ public: // Wrapper around fdb_transaction_add_conflict_range. fdb_error_t add_conflict_range(std::string_view begin_key, std::string_view end_key, FDBConflictRangeType type); - KeyRangeArrayFuture get_blob_granule_ranges(std::string_view begin_key, std::string_view end_key); + KeyRangeArrayFuture get_blob_granule_ranges(std::string_view begin_key, std::string_view end_key, int rangeLimit); KeyValueArrayResult read_blob_granules(std::string_view begin_key, std::string_view end_key, int64_t beginVersion, diff --git a/bindings/c/test/unit/unit_tests.cpp b/bindings/c/test/unit/unit_tests.cpp index 9f5c015bfb..2ab80cf90c 100644 --- a/bindings/c/test/unit/unit_tests.cpp +++ b/bindings/c/test/unit/unit_tests.cpp @@ -2853,7 +2853,7 @@ TEST_CASE("Blob Granule Functions") { // test ranges while (1) { - fdb::KeyRangeArrayFuture f = tr.get_blob_granule_ranges(key("bg"), key("bh")); + fdb::KeyRangeArrayFuture f = tr.get_blob_granule_ranges(key("bg"), key("bh"), 1000); fdb_error_t err = wait_future(f); if (err) { fdb::EmptyFuture f2 = tr.on_error(err); diff --git a/bindings/go/src/fdb/generated.go b/bindings/go/src/fdb/generated.go index a3c0674e64..a58cea3f1f 100644 --- a/bindings/go/src/fdb/generated.go +++ b/bindings/go/src/fdb/generated.go @@ -239,6 +239,13 @@ func (o NetworkOptions) SetClientThreadsPerVersion(param int64) error { return o.setOpt(65, int64ToBytes(param)) } +// Adds an external client library to be used with a future version protocol. This option can be used testing purposes only! +// +// Parameter: path to client library +func (o NetworkOptions) SetFutureVersionClientLibrary(param string) error { + return o.setOpt(66, []byte(param)) +} + // Disables logging of client statistics, such as sampled transaction activity. 
func (o NetworkOptions) SetDisableClientStatisticsLogging() error { return o.setOpt(70, nil) @@ -615,6 +622,13 @@ func (o TransactionOptions) SetUseGrvCache() error { return o.setOpt(1101, nil) } +// Attach given authorization token to the transaction such that subsequent tenant-aware requests are authorized +// +// Parameter: A JSON Web Token authorized to access data belonging to one or more tenants, indicated by 'tenants' claim of the token's payload. +func (o TransactionOptions) SetAuthorizationToken(param string) error { + return o.setOpt(2000, []byte(param)) +} + type StreamingMode int const ( diff --git a/bindings/java/CMakeLists.txt b/bindings/java/CMakeLists.txt index 22564dccc8..7057f22384 100644 --- a/bindings/java/CMakeLists.txt +++ b/bindings/java/CMakeLists.txt @@ -34,9 +34,11 @@ set(JAVA_BINDING_SRCS src/main/com/apple/foundationdb/FDBDatabase.java src/main/com/apple/foundationdb/FDBTenant.java src/main/com/apple/foundationdb/FDBTransaction.java + src/main/com/apple/foundationdb/FutureBool.java src/main/com/apple/foundationdb/FutureInt64.java src/main/com/apple/foundationdb/FutureKey.java src/main/com/apple/foundationdb/FutureKeyArray.java + src/main/com/apple/foundationdb/FutureKeyRangeArray.java src/main/com/apple/foundationdb/FutureResult.java src/main/com/apple/foundationdb/FutureResults.java src/main/com/apple/foundationdb/FutureMappedResults.java @@ -56,6 +58,7 @@ set(JAVA_BINDING_SRCS src/main/com/apple/foundationdb/RangeQuery.java src/main/com/apple/foundationdb/MappedRangeQuery.java src/main/com/apple/foundationdb/KeyArrayResult.java + src/main/com/apple/foundationdb/KeyRangeArrayResult.java src/main/com/apple/foundationdb/RangeResult.java src/main/com/apple/foundationdb/MappedRangeResult.java src/main/com/apple/foundationdb/RangeResultInfo.java diff --git a/bindings/java/fdbJNI.cpp b/bindings/java/fdbJNI.cpp index e685d3ee53..c2b5ea90cc 100644 --- a/bindings/java/fdbJNI.cpp +++ b/bindings/java/fdbJNI.cpp @@ -25,9 +25,11 @@ #include 
"com_apple_foundationdb_FDB.h" #include "com_apple_foundationdb_FDBDatabase.h" #include "com_apple_foundationdb_FDBTransaction.h" +#include "com_apple_foundationdb_FutureBool.h" #include "com_apple_foundationdb_FutureInt64.h" #include "com_apple_foundationdb_FutureKey.h" #include "com_apple_foundationdb_FutureKeyArray.h" +#include "com_apple_foundationdb_FutureKeyRangeArray.h" #include "com_apple_foundationdb_FutureResult.h" #include "com_apple_foundationdb_FutureResults.h" #include "com_apple_foundationdb_FutureStrings.h" @@ -55,7 +57,11 @@ static jclass mapped_range_result_class; static jclass mapped_key_value_class; static jclass string_class; static jclass key_array_result_class; +static jclass keyrange_class; +static jclass keyrange_array_result_class; static jmethodID key_array_result_init; +static jmethodID keyrange_init; +static jmethodID keyrange_array_result_init; static jmethodID range_result_init; static jmethodID mapped_range_result_init; static jmethodID mapped_key_value_from_bytes; @@ -278,6 +284,23 @@ JNIEXPORT void JNICALL Java_com_apple_foundationdb_NativeFuture_Future_1releaseM fdb_future_release_memory(var); } +JNIEXPORT jboolean JNICALL Java_com_apple_foundationdb_FutureBool_FutureBool_1get(JNIEnv* jenv, jobject, jlong future) { + if (!future) { + throwParamNotNull(jenv); + return 0; + } + FDBFuture* f = (FDBFuture*)future; + + fdb_bool_t value = false; + fdb_error_t err = fdb_future_get_bool(f, &value); + if (err) { + safeThrow(jenv, getThrowable(jenv, err)); + return 0; + } + + return (jboolean)value; +} + JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FutureInt64_FutureInt64_1get(JNIEnv* jenv, jobject, jlong future) { if (!future) { throwParamNotNull(jenv); @@ -407,6 +430,61 @@ JNIEXPORT jobject JNICALL Java_com_apple_foundationdb_FutureKeyArray_FutureKeyAr return result; } +JNIEXPORT jobject JNICALL Java_com_apple_foundationdb_FutureKeyRangeArray_FutureKeyRangeArray_1get(JNIEnv* jenv, + jobject, + jlong future) { + if (!future) { + 
throwParamNotNull(jenv); + return JNI_NULL; + } + + FDBFuture* f = (FDBFuture*)future; + + const FDBKeyRange* fdbKr; + int count; + fdb_error_t err = fdb_future_get_keyrange_array(f, &fdbKr, &count); + if (err) { + safeThrow(jenv, getThrowable(jenv, err)); + return JNI_NULL; + } + + jobjectArray kr_values = jenv->NewObjectArray(count, keyrange_class, NULL); + if (!kr_values) { + if (!jenv->ExceptionOccurred()) + throwOutOfMem(jenv); + return JNI_NULL; + } + + for (int i = 0; i < count; i++) { + jbyteArray beginArr = jenv->NewByteArray(fdbKr[i].begin_key_length); + if (!beginArr) { + if (!jenv->ExceptionOccurred()) + throwOutOfMem(jenv); + return JNI_NULL; + } + jbyteArray endArr = jenv->NewByteArray(fdbKr[i].end_key_length); + if (!endArr) { + if (!jenv->ExceptionOccurred()) + throwOutOfMem(jenv); + return JNI_NULL; + } + jenv->SetByteArrayRegion(beginArr, 0, fdbKr[i].begin_key_length, (const jbyte*)fdbKr[i].begin_key); + jenv->SetByteArrayRegion(endArr, 0, fdbKr[i].end_key_length, (const jbyte*)fdbKr[i].end_key); + + jobject kr = jenv->NewObject(keyrange_class, keyrange_init, beginArr, endArr); + if (jenv->ExceptionOccurred()) + return JNI_NULL; + jenv->SetObjectArrayElement(kr_values, i, kr); + if (jenv->ExceptionOccurred()) + return JNI_NULL; + } + jobject krarr = jenv->NewObject(keyrange_array_result_class, keyrange_array_result_init, kr_values); + if (jenv->ExceptionOccurred()) + return JNI_NULL; + + return krarr; +} + // SOMEDAY: explore doing this more efficiently with Direct ByteBuffers JNIEXPORT jobject JNICALL Java_com_apple_foundationdb_FutureResults_FutureResults_1get(JNIEnv* jenv, jobject, @@ -830,6 +908,142 @@ Java_com_apple_foundationdb_FDBDatabase_Database_1waitPurgeGranulesComplete(JNIE return (jlong)f; } +JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1blobbifyRange(JNIEnv* jenv, + jobject, + jlong dbPtr, + jbyteArray beginKeyBytes, + jbyteArray endKeyBytes) { + if (!dbPtr || !beginKeyBytes || !endKeyBytes) { + 
throwParamNotNull(jenv); + return 0; + } + + FDBDatabase* database = (FDBDatabase*)dbPtr; + + uint8_t* beginKeyArr = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL); + if (!beginKeyArr) { + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + uint8_t* endKeyArr = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL); + if (!endKeyArr) { + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT); + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + FDBFuture* f = fdb_database_blobbify_range( + database, beginKeyArr, jenv->GetArrayLength(beginKeyBytes), endKeyArr, jenv->GetArrayLength(endKeyBytes)); + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT); + jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKeyArr, JNI_ABORT); + return (jlong)f; +} + +JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1unblobbifyRange(JNIEnv* jenv, + jobject, + jlong dbPtr, + jbyteArray beginKeyBytes, + jbyteArray endKeyBytes) { + if (!dbPtr || !beginKeyBytes || !endKeyBytes) { + throwParamNotNull(jenv); + return 0; + } + + FDBDatabase* database = (FDBDatabase*)dbPtr; + + uint8_t* beginKeyArr = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL); + if (!beginKeyArr) { + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + uint8_t* endKeyArr = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL); + if (!endKeyArr) { + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT); + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + FDBFuture* f = fdb_database_unblobbify_range( + database, beginKeyArr, jenv->GetArrayLength(beginKeyBytes), endKeyArr, jenv->GetArrayLength(endKeyBytes)); + 
jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)beginKeyArr, JNI_ABORT); + jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKeyArr, JNI_ABORT); + return (jlong)f; +} + +JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1listBlobbifiedRanges(JNIEnv* jenv, + jobject, + jlong dbPtr, + jbyteArray beginKeyBytes, + jbyteArray endKeyBytes, + jint rangeLimit) { + if (!dbPtr || !beginKeyBytes || !endKeyBytes) { + throwParamNotNull(jenv); + return 0; + } + FDBDatabase* tr = (FDBDatabase*)dbPtr; + + uint8_t* startKey = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL); + if (!startKey) { + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + uint8_t* endKey = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL); + if (!endKey) { + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT); + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + FDBFuture* f = fdb_database_list_blobbified_ranges( + tr, startKey, jenv->GetArrayLength(beginKeyBytes), endKey, jenv->GetArrayLength(endKeyBytes), rangeLimit); + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT); + jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKey, JNI_ABORT); + return (jlong)f; +} + +JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1verifyBlobRange(JNIEnv* jenv, + jobject, + jlong dbPtr, + jbyteArray beginKeyBytes, + jbyteArray endKeyBytes, + jlong version) { + if (!dbPtr || !beginKeyBytes || !endKeyBytes) { + throwParamNotNull(jenv); + return 0; + } + FDBDatabase* tr = (FDBDatabase*)dbPtr; + + uint8_t* startKey = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL); + if (!startKey) { + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + uint8_t* endKey = 
(uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL); + if (!endKey) { + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT); + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + FDBFuture* f = fdb_database_list_blobbified_ranges( + tr, startKey, jenv->GetArrayLength(beginKeyBytes), endKey, jenv->GetArrayLength(endKeyBytes), version); + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT); + jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKey, JNI_ABORT); + return (jlong)f; +} + JNIEXPORT jboolean JNICALL Java_com_apple_foundationdb_FDB_Error_1predicate(JNIEnv* jenv, jobject, jint predicate, @@ -1307,6 +1521,41 @@ Java_com_apple_foundationdb_FDBTransaction_Transaction_1getRangeSplitPoints(JNIE return (jlong)f; } +JNIEXPORT jlong JNICALL +Java_com_apple_foundationdb_FDBTransaction_Transaction_1getBlobGranuleRanges(JNIEnv* jenv, + jobject, + jlong tPtr, + jbyteArray beginKeyBytes, + jbyteArray endKeyBytes, + jint rowLimit) { + if (!tPtr || !beginKeyBytes || !endKeyBytes || !rowLimit) { + throwParamNotNull(jenv); + return 0; + } + FDBTransaction* tr = (FDBTransaction*)tPtr; + + uint8_t* startKey = (uint8_t*)jenv->GetByteArrayElements(beginKeyBytes, JNI_NULL); + if (!startKey) { + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + uint8_t* endKey = (uint8_t*)jenv->GetByteArrayElements(endKeyBytes, JNI_NULL); + if (!endKey) { + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT); + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return 0; + } + + FDBFuture* f = fdb_transaction_get_blob_granule_ranges( + tr, startKey, jenv->GetArrayLength(beginKeyBytes), endKey, jenv->GetArrayLength(endKeyBytes), rowLimit); + jenv->ReleaseByteArrayElements(beginKeyBytes, (jbyte*)startKey, JNI_ABORT); + 
jenv->ReleaseByteArrayElements(endKeyBytes, (jbyte*)endKey, JNI_ABORT); + return (jlong)f; +} + JNIEXPORT void JNICALL Java_com_apple_foundationdb_FDBTransaction_Transaction_1set(JNIEnv* jenv, jobject, jlong tPtr, @@ -1746,6 +1995,15 @@ jint JNI_OnLoad(JavaVM* vm, void* reserved) { key_array_result_init = env->GetMethodID(local_key_array_result_class, "", "([B[I)V"); key_array_result_class = (jclass)(env)->NewGlobalRef(local_key_array_result_class); + jclass local_keyrange_class = env->FindClass("com/apple/foundationdb/Range"); + keyrange_init = env->GetMethodID(local_keyrange_class, "", "([B[B)V"); + keyrange_class = (jclass)(env)->NewGlobalRef(local_keyrange_class); + + jclass local_keyrange_array_result_class = env->FindClass("com/apple/foundationdb/KeyRangeArrayResult"); + keyrange_array_result_init = + env->GetMethodID(local_keyrange_array_result_class, "", "([Lcom/apple/foundationdb/Range;)V"); + keyrange_array_result_class = (jclass)(env)->NewGlobalRef(local_keyrange_array_result_class); + jclass local_range_result_summary_class = env->FindClass("com/apple/foundationdb/RangeResultSummary"); range_result_summary_init = env->GetMethodID(local_range_result_summary_class, "", "([BIZ)V"); range_result_summary_class = (jclass)(env)->NewGlobalRef(local_range_result_summary_class); @@ -1770,6 +2028,12 @@ void JNI_OnUnload(JavaVM* vm, void* reserved) { if (range_result_class != JNI_NULL) { env->DeleteGlobalRef(range_result_class); } + if (keyrange_array_result_class != JNI_NULL) { + env->DeleteGlobalRef(keyrange_array_result_class); + } + if (keyrange_class != JNI_NULL) { + env->DeleteGlobalRef(keyrange_class); + } if (mapped_range_result_class != JNI_NULL) { env->DeleteGlobalRef(mapped_range_result_class); } diff --git a/bindings/java/src/main/com/apple/foundationdb/Database.java b/bindings/java/src/main/com/apple/foundationdb/Database.java index 8608effe53..a3f012ba7c 100644 --- a/bindings/java/src/main/com/apple/foundationdb/Database.java +++ 
b/bindings/java/src/main/com/apple/foundationdb/Database.java @@ -161,6 +161,20 @@ public interface Database extends AutoCloseable, TransactionContext { */ double getMainThreadBusyness(); + /** + * Runs {@link #purgeBlobGranules(byte[], byte[], long, boolean, Executor)} on the default executor. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param purgeVersion version to purge at + * @param force if true delete all data, if not keep data >= purgeVersion + * + * @return the key to watch for purge complete + */ + default CompletableFuture purgeBlobGranules(byte[] beginKey, byte[] endKey, long purgeVersion, boolean force) { + return purgeBlobGranules(beginKey, endKey, purgeVersion, force, getExecutor()); + } + /** * Queues a purge of blob granules for the specified key range, at the specified version. * @@ -168,17 +182,126 @@ public interface Database extends AutoCloseable, TransactionContext { * @param endKey end of the key range * @param purgeVersion version to purge at * @param force if true delete all data, if not keep data >= purgeVersion + * @param e the {@link Executor} to use for asynchronous callbacks + * @return the key to watch for purge complete */ CompletableFuture purgeBlobGranules(byte[] beginKey, byte[] endKey, long purgeVersion, boolean force, Executor e); + /** - * Wait for a previous call to purgeBlobGranules to complete + * Runs {@link #waitPurgeGranulesComplete(byte[], Executor)} on the default executor. * * @param purgeKey key to watch */ + default CompletableFuture waitPurgeGranulesComplete(byte[] purgeKey) { + return waitPurgeGranulesComplete(purgeKey, getExecutor()); + } + + /** + * Wait for a previous call to purgeBlobGranules to complete. + * + * @param purgeKey key to watch + * @param e the {@link Executor} to use for asynchronous callbacks + */ CompletableFuture waitPurgeGranulesComplete(byte[] purgeKey, Executor e); + /** + * Runs {@link #blobbifyRange(byte[], byte[], Executor)} on the default executor.
+ * + * @param beginKey start of the key range + * @param endKey end of the key range + + * @return if the recording of the range was successful + */ + default CompletableFuture blobbifyRange(byte[] beginKey, byte[] endKey) { + return blobbifyRange(beginKey, endKey, getExecutor()); + } + + /** + * Sets a range to be blobbified in the database. Must be a completely unblobbified range. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param e the {@link Executor} to use for asynchronous callbacks + + * @return if the recording of the range was successful + */ + CompletableFuture blobbifyRange(byte[] beginKey, byte[] endKey, Executor e); + + /** + * Runs {@link #unblobbifyRange(byte[], byte[], Executor)} on the default executor. + * + * @param beginKey start of the key range + * @param endKey end of the key range + + * @return if the recording of the range was successful + */ + default CompletableFuture unblobbifyRange(byte[] beginKey, byte[] endKey) { + return unblobbifyRange(beginKey, endKey, getExecutor()); + } + + /** + * Unsets a blobbified range in the database. The range must be aligned to known blob ranges. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param e the {@link Executor} to use for asynchronous callbacks + + * @return if the recording of the range was successful + */ + CompletableFuture unblobbifyRange(byte[] beginKey, byte[] endKey, Executor e); + + /** + * Runs {@link #listBlobbifiedRanges(byte[], byte[], int, Executor)} on the default executor.
+ * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param rangeLimit batch size + * + + * @return a future with the list of blobbified ranges: [lastLessThan(beginKey), firstGreaterThanOrEqual(endKey)] + */ + default CompletableFuture listBlobbifiedRanges(byte[] beginKey, byte[] endKey, int rangeLimit) { + return listBlobbifiedRanges(beginKey, endKey, rangeLimit, getExecutor()); + } + + /** + * Lists blobbified ranges in the database. There may be more if result.size() == rangeLimit. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param rangeLimit batch size + * @param e the {@link Executor} to use for asynchronous callbacks + + * @return a future with the list of blobbified ranges: [lastLessThan(beginKey), firstGreaterThanOrEqual(endKey)] + */ + CompletableFuture listBlobbifiedRanges(byte[] beginKey, byte[] endKey, int rangeLimit, Executor e); + + /** + * Runs {@link #verifyBlobRange(byte[], byte[], long, Executor)} on the default executor. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param version version to read at + * + * @return a future with the version of the last blob granule. + */ + default CompletableFuture verifyBlobRange(byte[] beginKey, byte[] endKey, long version) { + return verifyBlobRange(beginKey, endKey, version, getExecutor()); + } + + /** + * Checks if a blob range is blobbified. + * + * @param beginKey start of the key range + * @param endKey end of the key range + * @param version version to read at + * @param e the {@link Executor} to use for asynchronous callbacks + * @return a future with the version of the last blob granule. + */ + CompletableFuture verifyBlobRange(byte[] beginKey, byte[] endKey, long version, Executor e); + /** * Runs a read-only transactional function against this {@code Database} with retry logic.
* {@link Function#apply(Object) apply(ReadTransaction)} will be called on the diff --git a/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java b/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java index 50a63cc910..98c001a1b0 100644 --- a/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java +++ b/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java @@ -201,20 +201,60 @@ class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsume } @Override - public CompletableFuture purgeBlobGranules(byte[] beginKey, byte[] endKey, long purgeVersion, boolean force, Executor executor) { + public CompletableFuture purgeBlobGranules(byte[] beginKey, byte[] endKey, long purgeVersion, boolean force, Executor e) { pointerReadLock.lock(); try { - return new FutureKey(Database_purgeBlobGranules(getPtr(), beginKey, endKey, purgeVersion, force), executor, eventKeeper); + return new FutureKey(Database_purgeBlobGranules(getPtr(), beginKey, endKey, purgeVersion, force), e, eventKeeper); } finally { pointerReadLock.unlock(); } } @Override - public CompletableFuture waitPurgeGranulesComplete(byte[] purgeKey, Executor executor) { + public CompletableFuture waitPurgeGranulesComplete(byte[] purgeKey, Executor e) { pointerReadLock.lock(); try { - return new FutureVoid(Database_waitPurgeGranulesComplete(getPtr(), purgeKey), executor); + return new FutureVoid(Database_waitPurgeGranulesComplete(getPtr(), purgeKey), e); + } finally { + pointerReadLock.unlock(); + } + } + + @Override + public CompletableFuture blobbifyRange(byte[] beginKey, byte[] endKey, Executor e) { + pointerReadLock.lock(); + try { + return new FutureBool(Database_blobbifyRange(getPtr(), beginKey, endKey), e); + } finally { + pointerReadLock.unlock(); + } + } + + @Override + public CompletableFuture unblobbifyRange(byte[] beginKey, byte[] endKey, Executor e) { + pointerReadLock.lock(); + try { + return new FutureBool(Database_unblobbifyRange(getPtr(), beginKey, endKey), 
e); + } finally { + pointerReadLock.unlock(); + } + } + + @Override + public CompletableFuture listBlobbifiedRanges(byte[] beginKey, byte[] endKey, int rangeLimit, Executor e) { + pointerReadLock.lock(); + try { + return new FutureKeyRangeArray(Database_listBlobbifiedRanges(getPtr(), beginKey, endKey, rangeLimit), e); + } finally { + pointerReadLock.unlock(); + } + } + + @Override + public CompletableFuture verifyBlobRange(byte[] beginKey, byte[] endKey, long version, Executor e) { + pointerReadLock.lock(); + try { + return new FutureInt64(Database_verifyBlobRange(getPtr(), beginKey, endKey, version), e); } finally { pointerReadLock.unlock(); } @@ -237,4 +277,8 @@ class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsume private native double Database_getMainThreadBusyness(long cPtr); private native long Database_purgeBlobGranules(long cPtr, byte[] beginKey, byte[] endKey, long purgeVersion, boolean force); private native long Database_waitPurgeGranulesComplete(long cPtr, byte[] purgeKey); + private native long Database_blobbifyRange(long cPtr, byte[] beginKey, byte[] endKey); + private native long Database_unblobbifyRange(long cPtr, byte[] beginKey, byte[] endKey); + private native long Database_listBlobbifiedRanges(long cPtr, byte[] beginKey, byte[] endKey, int rangeLimit); + private native long Database_verifyBlobRange(long cPtr, byte[] beginKey, byte[] endKey, long version); } \ No newline at end of file diff --git a/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java b/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java index b35196c146..7943c5e9d1 100644 --- a/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java +++ b/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java @@ -97,6 +97,11 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC return FDBTransaction.this.getRangeSplitPoints(range, chunkSize); } + @Override + public CompletableFuture 
getBlobGranuleRanges(byte[] begin, byte[] end, int rowLimit) { + return FDBTransaction.this.getBlobGranuleRanges(begin, end, rowLimit); + } + @Override public AsyncIterable getMappedRange(KeySelector begin, KeySelector end, byte[] mapper, int limit, int matchIndex, boolean reverse, @@ -352,6 +357,16 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC return this.getRangeSplitPoints(range.begin, range.end, chunkSize); } + @Override + public CompletableFuture getBlobGranuleRanges(byte[] begin, byte[] end, int rowLimit) { + pointerReadLock.lock(); + try { + return new FutureKeyRangeArray(Transaction_getBlobGranuleRanges(getPtr(), begin, end, rowLimit), executor); + } finally { + pointerReadLock.unlock(); + } + } + @Override public AsyncIterable getMappedRange(KeySelector begin, KeySelector end, byte[] mapper, int limit, int matchIndex, boolean reverse, StreamingMode mode) { @@ -842,4 +857,5 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC private native long Transaction_getKeyLocations(long cPtr, byte[] key); private native long Transaction_getEstimatedRangeSizeBytes(long cPtr, byte[] keyBegin, byte[] keyEnd); private native long Transaction_getRangeSplitPoints(long cPtr, byte[] keyBegin, byte[] keyEnd, long chunkSize); + private native long Transaction_getBlobGranuleRanges(long cPtr, byte[] keyBegin, byte[] keyEnd, int rowLimit); } diff --git a/bindings/java/src/main/com/apple/foundationdb/FutureBool.java b/bindings/java/src/main/com/apple/foundationdb/FutureBool.java new file mode 100644 index 0000000000..ddbbd02649 --- /dev/null +++ b/bindings/java/src/main/com/apple/foundationdb/FutureBool.java @@ -0,0 +1,37 @@ +/* + * FutureBool.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2019 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.apple.foundationdb; + +import java.util.concurrent.Executor; + +class FutureBool extends NativeFuture { + FutureBool(long cPtr, Executor executor) { + super(cPtr); + registerMarshalCallback(executor); + } + + @Override + protected Boolean getIfDone_internal(long cPtr) throws FDBException { + return FutureBool_get(cPtr); + } + + private native boolean FutureBool_get(long cPtr) throws FDBException; +} diff --git a/bindings/java/src/main/com/apple/foundationdb/FutureKeyRangeArray.java b/bindings/java/src/main/com/apple/foundationdb/FutureKeyRangeArray.java new file mode 100644 index 0000000000..d866e9fca4 --- /dev/null +++ b/bindings/java/src/main/com/apple/foundationdb/FutureKeyRangeArray.java @@ -0,0 +1,37 @@ +/* + * FutureKeyRangeArray.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2019 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.apple.foundationdb; + +import java.util.concurrent.Executor; + +class FutureKeyRangeArray extends NativeFuture { + FutureKeyRangeArray(long cPtr, Executor executor) { + super(cPtr); + registerMarshalCallback(executor); + } + + @Override + protected KeyRangeArrayResult getIfDone_internal(long cPtr) throws FDBException { + return FutureKeyRangeArray_get(cPtr); + } + + private native KeyRangeArrayResult FutureKeyRangeArray_get(long cPtr) throws FDBException; +} diff --git a/bindings/java/src/main/com/apple/foundationdb/KeyRangeArrayResult.java b/bindings/java/src/main/com/apple/foundationdb/KeyRangeArrayResult.java new file mode 100644 index 0000000000..7385b8fe0a --- /dev/null +++ b/bindings/java/src/main/com/apple/foundationdb/KeyRangeArrayResult.java @@ -0,0 +1,36 @@ +/* + * KeyRangeArrayResult.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.apple.foundationdb; + +import java.util.Arrays; +import java.util.List; + +public class KeyRangeArrayResult { + final List keyRanges; + + public KeyRangeArrayResult(Range[] keyRangeArr) { + this.keyRanges = Arrays.asList(keyRangeArr); + } + + public List getKeyRanges() { + return keyRanges; + } +} diff --git a/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java b/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java index 11ed7e900c..04050de6fb 100644 --- a/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java +++ b/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java @@ -513,6 +513,17 @@ public interface ReadTransaction extends ReadTransactionContext { */ CompletableFuture getRangeSplitPoints(Range range, long chunkSize); + /** + * Gets the blob granule ranges for a given region. + * Returned in batches, requires calling again moving the begin key up. + * + * @param begin beginning of the range (inclusive) + * @param end end of the range (exclusive) + + * @return list of blob granules in the given range. May not be all. 
+ */ + CompletableFuture getBlobGranuleRanges(byte[] begin, byte[] end, int rowLimit); + /** * Returns a set of options that can be set on a {@code Transaction} diff --git a/bindings/java/src/test/com/apple/foundationdb/test/Context.java b/bindings/java/src/test/com/apple/foundationdb/test/Context.java index a594e088a1..151a4ba599 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/Context.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/Context.java @@ -29,6 +29,7 @@ import java.util.Optional; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ConcurrentHashMap; import com.apple.foundationdb.Database; import com.apple.foundationdb.FDB; @@ -64,7 +65,7 @@ abstract class Context implements Runnable, AutoCloseable { private List children = new LinkedList<>(); private static Map transactionMap = new HashMap<>(); private static Map transactionRefCounts = new HashMap<>(); - private static Map tenantMap = new HashMap<>(); + private static Map tenantMap = new ConcurrentHashMap<>(); Context(Database db, byte[] prefix) { this.db = db; diff --git a/bindings/python/tests/size_limit_tests.py b/bindings/python/tests/size_limit_tests.py index cd27f985b0..b94d7ea8e4 100644 --- a/bindings/python/tests/size_limit_tests.py +++ b/bindings/python/tests/size_limit_tests.py @@ -66,6 +66,9 @@ def test_size_limit_option(db): except fdb.FDBError as e: assert(e.code == 2101) # Transaction exceeds byte limit (2101) + # Reset the size limit for future tests + db.options.set_transaction_size_limit(10000000) + @fdb.transactional def test_get_approximate_size(tr): tr[b'key1'] = b'value1' diff --git a/cmake/AddFdbTest.cmake b/cmake/AddFdbTest.cmake index 066baf7100..786126359b 100644 --- a/cmake/AddFdbTest.cmake +++ b/cmake/AddFdbTest.cmake @@ -142,7 +142,7 @@ function(add_fdb_test) ${VALGRIND_OPTION} ${ADD_FDB_TEST_TEST_FILES} WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) - 
set_tests_properties("${test_name}" PROPERTIES ENVIRONMENT UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=1) + set_tests_properties("${test_name}" PROPERTIES ENVIRONMENT "${SANITIZER_OPTIONS}") get_filename_component(test_dir_full ${first_file} DIRECTORY) if(NOT ${test_dir_full} STREQUAL "") get_filename_component(test_dir ${test_dir_full} NAME) @@ -172,8 +172,7 @@ function(stage_correctness_package) file(MAKE_DIRECTORY ${STAGE_OUT_DIR}/bin) string(LENGTH "${CMAKE_SOURCE_DIR}/tests/" base_length) foreach(test IN LISTS TEST_NAMES) - if(("${TEST_TYPE_${test}}" STREQUAL "simulation") AND - (${test} MATCHES ${TEST_PACKAGE_INCLUDE}) AND + if((${test} MATCHES ${TEST_PACKAGE_INCLUDE}) AND (NOT ${test} MATCHES ${TEST_PACKAGE_EXCLUDE})) foreach(file IN LISTS TEST_FILES_${test}) string(SUBSTRING ${file} ${base_length} -1 rel_out_file) @@ -199,16 +198,17 @@ function(stage_correctness_package) set(src_dir "${src_dir}/") string(SUBSTRING ${src_dir} ${dir_len} -1 dest_dir) string(SUBSTRING ${file} ${dir_len} -1 rel_out_file) - set(out_file ${STAGE_OUT_DIR}/${rel_out_file}) + set(out_file ${STAGE_OUT_DIR}/${rel_out_file}) list(APPEND external_files ${out_file}) - add_custom_command( + add_custom_command( OUTPUT ${out_file} - DEPENDS ${file} - COMMAND ${CMAKE_COMMAND} -E copy ${file} ${out_file} - COMMENT "Copying ${STAGE_CONTEXT} external file ${file}" - ) + DEPENDS ${file} + COMMAND ${CMAKE_COMMAND} -E copy ${file} ${out_file} + COMMENT "Copying ${STAGE_CONTEXT} external file ${file}" + ) endforeach() endforeach() + list(APPEND package_files ${STAGE_OUT_DIR}/bin/fdbserver ${STAGE_OUT_DIR}/bin/coverage.fdbserver.xml ${STAGE_OUT_DIR}/bin/coverage.fdbclient.xml @@ -218,6 +218,7 @@ function(stage_correctness_package) ${STAGE_OUT_DIR}/bin/TraceLogHelper.dll ${STAGE_OUT_DIR}/CMakeCache.txt ) + add_custom_command( OUTPUT ${package_files} DEPENDS ${CMAKE_BINARY_DIR}/CMakeCache.txt @@ -239,6 +240,20 @@ function(stage_correctness_package) ${STAGE_OUT_DIR}/bin COMMENT "Copying files for 
${STAGE_CONTEXT} package" ) + + set(test_harness_dir "${CMAKE_SOURCE_DIR}/contrib/TestHarness2") + file(GLOB_RECURSE test_harness2_files RELATIVE "${test_harness_dir}" CONFIGURE_DEPENDS "${test_harness_dir}/*.py") + foreach(file IN LISTS test_harness2_files) + set(src_file "${test_harness_dir}/${file}") + set(out_file "${STAGE_OUT_DIR}/${file}") + get_filename_component(dir "${out_file}" DIRECTORY) + file(MAKE_DIRECTORY "${dir}") + add_custom_command(OUTPUT ${out_file} + COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${out_file}" + DEPENDS "${src_file}") + list(APPEND package_files "${out_file}") + endforeach() + list(APPEND package_files ${test_files} ${external_files}) if(STAGE_OUT_FILES) set(${STAGE_OUT_FILES} ${package_files} PARENT_SCOPE) @@ -404,7 +419,7 @@ endfunction() # Creates a single cluster before running the specified command (usually a ctest test) function(add_fdbclient_test) - set(options DISABLED ENABLED DISABLE_LOG_DUMP API_TEST_BLOB_GRANULES_ENABLED TLS_ENABLED) + set(options DISABLED ENABLED DISABLE_TENANTS DISABLE_LOG_DUMP API_TEST_BLOB_GRANULES_ENABLED TLS_ENABLED) set(oneValueArgs NAME PROCESS_NUMBER TEST_TIMEOUT WORKING_DIRECTORY) set(multiValueArgs COMMAND) cmake_parse_arguments(T "${options}" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}") @@ -431,6 +446,9 @@ function(add_fdbclient_test) if(T_DISABLE_LOG_DUMP) list(APPEND TMP_CLUSTER_CMD --disable-log-dump) endif() + if(T_DISABLE_TENANTS) + list(APPEND TMP_CLUSTER_CMD --disable-tenants) + endif() if(T_API_TEST_BLOB_GRANULES_ENABLED) list(APPEND TMP_CLUSTER_CMD --blob-granules-enabled) endif() @@ -447,9 +465,13 @@ function(add_fdbclient_test) set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT ${T_TEST_TIMEOUT}) else() # default timeout - set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 300) + if(USE_SANITIZER) + set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 1200) + else() + set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 300) + endif() endif() - 
set_tests_properties("${T_NAME}" PROPERTIES ENVIRONMENT UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=1) + set_tests_properties("${T_NAME}" PROPERTIES ENVIRONMENT "${SANITIZER_OPTIONS}") endfunction() # Creates a cluster file for a nonexistent cluster before running the specified command @@ -483,7 +505,7 @@ function(add_unavailable_fdbclient_test) # default timeout set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT 60) endif() - set_tests_properties("${T_NAME}" PROPERTIES ENVIRONMENT UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=1) + set_tests_properties("${T_NAME}" PROPERTIES ENVIRONMENT "${SANITIZER_OPTIONS}") endfunction() # Creates 3 distinct clusters before running the specified command. diff --git a/cmake/ConfigureCompiler.cmake b/cmake/ConfigureCompiler.cmake index 79e20420af..d753cf394d 100644 --- a/cmake/ConfigureCompiler.cmake +++ b/cmake/ConfigureCompiler.cmake @@ -69,6 +69,7 @@ if(WIN32) add_definitions(-DWIN32_LEAN_AND_MEAN) add_definitions(-D_ITERATOR_DEBUG_LEVEL=0) add_definitions(-DNOGDI) # WinGDI.h defines macro ERROR + add_definitions(-D_USE_MATH_DEFINES) # Math constants endif() if (USE_CCACHE) @@ -191,6 +192,7 @@ else() endif() if(USE_GCOV) + add_compile_options(--coverage) add_link_options(--coverage) endif() @@ -199,6 +201,8 @@ else() -fsanitize=undefined # TODO(atn34) Re-enable -fsanitize=alignment once https://github.com/apple/foundationdb/issues/1434 is resolved -fno-sanitize=alignment + # https://github.com/apple/foundationdb/issues/7955 + -fno-sanitize=function -DBOOST_USE_UCONTEXT) list(APPEND SANITIZER_LINK_OPTIONS -fsanitize=undefined) endif() diff --git a/cmake/awssdk.cmake b/cmake/awssdk.cmake index 88cb7c78e9..0fef54338d 100644 --- a/cmake/awssdk.cmake +++ b/cmake/awssdk.cmake @@ -11,7 +11,7 @@ endif() include(ExternalProject) ExternalProject_Add(awssdk_project GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git - GIT_TAG 2af3ce543c322cb259471b3b090829464f825972 # v1.9.200 + GIT_TAG e4b4b310d8631bc7e9a797b6ac03a73c6f210bf6 # 
v1.9.331 SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src" BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build" GIT_CONFIG advice.detachedHead=false @@ -35,6 +35,7 @@ ExternalProject_Add(awssdk_project "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a" "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a" @@ -75,6 +76,10 @@ add_library(awssdk_c_io STATIC IMPORTED) add_dependencies(awssdk_c_io awssdk_project) set_target_properties(awssdk_c_io PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a") +add_library(awssdk_c_sdkutils STATIC IMPORTED) +add_dependencies(awssdk_c_sdkutils awssdk_project) +set_target_properties(awssdk_c_sdkutils PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a") + add_library(awssdk_checksums STATIC IMPORTED) add_dependencies(awssdk_checksums awssdk_project) set_target_properties(awssdk_checksums PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a") @@ -94,4 +99,4 @@ set_target_properties(awssdk_c_common PROPERTIES IMPORTED_LOCATION "${CMAKE_CURR # link them all together in one interface target add_library(awssdk_target INTERFACE) target_include_directories(awssdk_target SYSTEM INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/include) -target_link_libraries(awssdk_target INTERFACE awssdk_core awssdk_crt awssdk_c_s3 awssdk_c_auth awssdk_c_eventstream awssdk_c_http awssdk_c_mqtt awssdk_c_io awssdk_checksums awssdk_c_compression awssdk_c_cal awssdk_c_common curl) 
\ No newline at end of file +target_link_libraries(awssdk_target INTERFACE awssdk_core awssdk_crt awssdk_c_s3 awssdk_c_auth awssdk_c_eventstream awssdk_c_http awssdk_c_mqtt awssdk_c_sdkutils awssdk_c_io awssdk_checksums awssdk_c_compression awssdk_c_cal awssdk_c_common curl) diff --git a/contrib/Joshua/scripts/correctnessTest.sh b/contrib/Joshua/scripts/correctnessTest.sh index a617d81088..bee09acf25 100755 --- a/contrib/Joshua/scripts/correctnessTest.sh +++ b/contrib/Joshua/scripts/correctnessTest.sh @@ -4,4 +4,6 @@ export ASAN_OPTIONS="detect_leaks=0" OLDBINDIR="${OLDBINDIR:-/app/deploy/global_data/oldBinaries}" -mono bin/TestHarness.exe joshua-run "${OLDBINDIR}" false +#mono bin/TestHarness.exe joshua-run "${OLDBINDIR}" false + +python3 -m test_harness.app -s ${JOSHUA_SEED} --old-binaries-path ${OLDBINDIR} diff --git a/contrib/Joshua/scripts/correctnessTimeout.sh b/contrib/Joshua/scripts/correctnessTimeout.sh index 7917aae591..6bd0bfeee0 100755 --- a/contrib/Joshua/scripts/correctnessTimeout.sh +++ b/contrib/Joshua/scripts/correctnessTimeout.sh @@ -1,4 +1,4 @@ #!/bin/bash -u -for file in `find . 
-name 'trace*.xml'` ; do - mono ./bin/TestHarness.exe summarize "${file}" summary.xml "" JoshuaTimeout true -done + + +python3 -m test_harness.timeout diff --git a/contrib/Joshua/scripts/valgrindTest.sh b/contrib/Joshua/scripts/valgrindTest.sh index 5409429691..820750f3b2 100755 --- a/contrib/Joshua/scripts/valgrindTest.sh +++ b/contrib/Joshua/scripts/valgrindTest.sh @@ -1,3 +1,3 @@ #!/bin/sh OLDBINDIR="${OLDBINDIR:-/app/deploy/global_data/oldBinaries}" -mono bin/TestHarness.exe joshua-run "${OLDBINDIR}" true +python3 -m test_harness.app -s ${JOSHUA_SEED} --old-binaries-path ${OLDBINDIR} --use-valgrind diff --git a/contrib/Joshua/scripts/valgrindTimeout.sh b/contrib/Joshua/scripts/valgrindTimeout.sh index b9d9e7ebad..2224598e43 100755 --- a/contrib/Joshua/scripts/valgrindTimeout.sh +++ b/contrib/Joshua/scripts/valgrindTimeout.sh @@ -1,6 +1,2 @@ #!/bin/bash -u -for file in `find . -name 'trace*.xml'` ; do - for valgrindFile in `find . -name 'valgrind*.xml'` ; do - mono ./bin/TestHarness.exe summarize "${file}" summary.xml "${valgrindFile}" JoshuaTimeout true - done -done +python3 -m test_harness.timeout --use-valgrind diff --git a/contrib/TestHarness/Program.cs b/contrib/TestHarness/Program.cs index a31a6d6382..b3e003dee5 100644 --- a/contrib/TestHarness/Program.cs +++ b/contrib/TestHarness/Program.cs @@ -19,6 +19,7 @@ */ using System; +using System.Collections; using System.Collections.Generic; using System.Linq; using System.Text; @@ -302,6 +303,7 @@ namespace SummarizeTest uniqueFileSet.Add(file.Substring(0, file.LastIndexOf("-"))); // all restarting tests end with -1.txt or -2.txt } uniqueFiles = uniqueFileSet.ToArray(); + Array.Sort(uniqueFiles); testFile = random.Choice(uniqueFiles); // The on-disk format changed in 4.0.0, and 5.x can't load files from 3.x. 
string oldBinaryVersionLowerBound = "4.0.0"; @@ -334,8 +336,9 @@ namespace SummarizeTest // thus, by definition, if "until_" appears, we do not want to run with the current binary version oldBinaries = oldBinaries.Concat(currentBinary); } - List oldBinariesList = oldBinaries.ToList(); - if (oldBinariesList.Count == 0) { + string[] oldBinariesList = oldBinaries.ToArray(); + Array.Sort(oldBinariesList); + if (oldBinariesList.Count() == 0) { // In theory, restarting tests are named to have at least one old binary version to run // But if none of the provided old binaries fall in the range, we just skip the test Console.WriteLine("No available old binary version from {0} to {1}", oldBinaryVersionLowerBound, oldBinaryVersionUpperBound); @@ -347,6 +350,7 @@ namespace SummarizeTest else { uniqueFiles = Directory.GetFiles(testDir); + Array.Sort(uniqueFiles); testFile = random.Choice(uniqueFiles); } } @@ -487,6 +491,16 @@ namespace SummarizeTest useValgrind ? "on" : "off"); } + IDictionary data = Environment.GetEnvironmentVariables(); + foreach (DictionaryEntry i in data) + { + string k=(string)i.Key; + string v=(string)i.Value; + if (k.StartsWith("FDB_KNOB")) { + process.StartInfo.EnvironmentVariables[k]=v; + } + } + process.Start(); // SOMEDAY: Do we want to actually do anything with standard output or error? @@ -718,7 +732,7 @@ namespace SummarizeTest process.Refresh(); if (process.HasExited) return; - long mem = process.PrivateMemorySize64; + long mem = process.PagedMemorySize64; MaxMem = Math.Max(MaxMem, mem); //Console.WriteLine(string.Format("Process used {0} bytes", MaxMem)); Thread.Sleep(1000); @@ -744,16 +758,28 @@ namespace SummarizeTest AppendToSummary(summaryFileName, xout); } - // Parses the valgrind XML file and returns a list of "what" tags for each error. 
+ static string ParseValgrindStack(XElement stackElement) { + string backtrace = ""; + foreach (XElement frame in stackElement.Elements()) { + backtrace += " " + frame.Element("ip").Value.ToLower(); + } + if (backtrace.Length > 0) { + backtrace = "addr2line -e fdbserver.debug -p -C -f -i" + backtrace; + } + + return backtrace; + } + + // Parses the valgrind XML file and returns a list of error elements. // All errors for which the "kind" tag starts with "Leak" are ignored - static string[] ParseValgrindOutput(string valgrindOutputFileName, bool traceToStdout) + static XElement[] ParseValgrindOutput(string valgrindOutputFileName, bool traceToStdout) { if (!traceToStdout) { Console.WriteLine("Reading vXML file: " + valgrindOutputFileName); } - ISet whats = new HashSet(); + IList errors = new List(); XElement xdoc = XDocument.Load(valgrindOutputFileName).Element("valgrindoutput"); foreach(var elem in xdoc.Elements()) { if (elem.Name != "error") @@ -761,9 +787,29 @@ namespace SummarizeTest string kind = elem.Element("kind").Value; if(kind.StartsWith("Leak")) continue; - whats.Add(elem.Element("what").Value); + + XElement errorElement = new XElement("ValgrindError", + new XAttribute("Severity", (int)Magnesium.Severity.SevError)); + + int num = 1; + string suffix = ""; + foreach (XElement sub in elem.Elements()) { + if (sub.Name == "what") { + errorElement.SetAttributeValue("What", sub.Value); + } else if (sub.Name == "auxwhat") { + suffix = "Aux" + num++; + errorElement.SetAttributeValue("What" + suffix, sub.Value); + } else if (sub.Name == "stack") { + errorElement.SetAttributeValue("Backtrace" + suffix, ParseValgrindStack(sub)); + } else if (sub.Name == "origin") { + errorElement.SetAttributeValue("WhatOrigin", sub.Element("what").Value); + errorElement.SetAttributeValue("BacktraceOrigin", ParseValgrindStack(sub.Element("stack"))); + } + } + + errors.Add(errorElement); } - return whats.ToArray(); + return errors.ToArray(); } delegate IEnumerable 
parseDelegate(System.IO.Stream stream, string file, @@ -927,6 +973,10 @@ namespace SummarizeTest { xout.Add(new XElement(ev.Type, new XAttribute("File", ev.Details.File), new XAttribute("Line", ev.Details.Line))); } + if (ev.Type == "RunningUnitTest") + { + xout.Add(new XElement(ev.Type, new XAttribute("Name", ev.Details.Name), new XAttribute("File", ev.Details.File), new XAttribute("Line", ev.Details.Line))); + } if (ev.Type == "TestsExpectedToPass") testCount = int.Parse(ev.Details.Count); if (ev.Type == "TestResults" && ev.Details.Passed == "1") @@ -1065,12 +1115,10 @@ namespace SummarizeTest try { // If there are any errors reported "ok" will be set to false - var whats = ParseValgrindOutput(valgrindOutputFileName, traceToStdout); - foreach (var what in whats) + var valgrindErrors = ParseValgrindOutput(valgrindOutputFileName, traceToStdout); + foreach (var vError in valgrindErrors) { - xout.Add(new XElement("ValgrindError", - new XAttribute("Severity", (int)Magnesium.Severity.SevError), - new XAttribute("What", what))); + xout.Add(vError); ok = false; error = true; } diff --git a/contrib/TestHarness2/.gitignore b/contrib/TestHarness2/.gitignore new file mode 100644 index 0000000000..80682f9552 --- /dev/null +++ b/contrib/TestHarness2/.gitignore @@ -0,0 +1,2 @@ +/tmp/ +/venv diff --git a/contrib/TestHarness2/test_harness/__init__.py b/contrib/TestHarness2/test_harness/__init__.py new file mode 100644 index 0000000000..3cb95520ec --- /dev/null +++ b/contrib/TestHarness2/test_harness/__init__.py @@ -0,0 +1,2 @@ +# Currently this file is left intentionally empty. It's main job for now is to indicate that this directory +# should be used as a module. 
diff --git a/contrib/TestHarness2/test_harness/app.py b/contrib/TestHarness2/test_harness/app.py new file mode 100644 index 0000000000..3e300c6bf4 --- /dev/null +++ b/contrib/TestHarness2/test_harness/app.py @@ -0,0 +1,25 @@ +import argparse +import sys +import traceback + +from test_harness.config import config +from test_harness.run import TestRunner +from test_harness.summarize import SummaryTree + +if __name__ == '__main__': + try: + parser = argparse.ArgumentParser('TestHarness', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + config.build_arguments(parser) + args = parser.parse_args() + config.extract_args(args) + test_runner = TestRunner() + if not test_runner.run(): + exit(1) + except Exception as e: + _, _, exc_traceback = sys.exc_info() + error = SummaryTree('TestHarnessError') + error.attributes['Severity'] = '40' + error.attributes['ErrorMessage'] = str(e) + error.attributes['Trace'] = repr(traceback.format_tb(exc_traceback)) + error.dump(sys.stdout) + exit(1) diff --git a/contrib/TestHarness2/test_harness/config.py b/contrib/TestHarness2/test_harness/config.py new file mode 100644 index 0000000000..d0a11cf85c --- /dev/null +++ b/contrib/TestHarness2/test_harness/config.py @@ -0,0 +1,263 @@ +from __future__ import annotations + +import argparse +import collections +import copy +import os +import random +from enum import Enum +from pathlib import Path +from typing import List, Any, OrderedDict, Dict + + +class BuggifyOptionValue(Enum): + ON = 1 + OFF = 2 + RANDOM = 3 + + +class BuggifyOption: + def __init__(self, val: str | None = None): + self.value = BuggifyOptionValue.RANDOM + if val is not None: + v = val.lower() + if v in ['on', '1', 'true']: + self.value = BuggifyOptionValue.ON + elif v in ['off', '0', 'false']: + self.value = BuggifyOptionValue.OFF + elif v in ['random', 'rnd', 'r']: + pass + else: + assert False, 'Invalid value {} -- use true, false, or random'.format(v) + + +class ConfigValue: + def __init__(self, name: str, **kwargs): 
+ self.name = name + self.value = None + self.kwargs = kwargs + if 'default' in self.kwargs: + self.value = self.kwargs['default'] + + def get_arg_name(self) -> str: + if 'long_name' in self.kwargs: + return self.kwargs['long_name'] + else: + return self.name + + def add_to_args(self, parser: argparse.ArgumentParser): + kwargs = copy.copy(self.kwargs) + long_name = self.name + short_name = None + if 'long_name' in kwargs: + long_name = kwargs['long_name'] + del kwargs['long_name'] + if 'short_name' in kwargs: + short_name = kwargs['short_name'] + del kwargs['short_name'] + if 'action' in kwargs and kwargs['action'] in ['store_true', 'store_false']: + del kwargs['type'] + long_name = long_name.replace('_', '-') + if short_name is None: + # line below is useful for debugging + # print('add_argument(\'--{}\', [{{{}}}])'.format(long_name, ', '.join(['\'{}\': \'{}\''.format(k, v) + # for k, v in kwargs.items()]))) + parser.add_argument('--{}'.format(long_name), **kwargs) + else: + # line below is useful for debugging + # print('add_argument(\'-{}\', \'--{}\', [{{{}}}])'.format(short_name, long_name, + # ', '.join(['\'{}\': \'{}\''.format(k, v) + # for k, v in kwargs.items()]))) + parser.add_argument('-{}'.format(short_name), '--{}'.format(long_name), **kwargs) + + def get_value(self, args: argparse.Namespace) -> tuple[str, Any]: + return self.name, args.__getattribute__(self.get_arg_name()) + + +class Config: + """ + This is the central configuration class for test harness. The values in this class are exposed globally through + a global variable test_harness.config.config. This class provides some "magic" to keep test harness flexible. + Each parameter can further be configured using an `_args` member variable which is expected to be a dictionary. + * The value of any variable can be set through the command line. For a variable named `variable_name` we will + by default create a new command line option `--variable-name` (`_` is automatically changed to `-`). 
This + default can be changed by setting the `'long_name'` property in the `_arg` dict. + * In addition the user can also optionally set a short-name. This can be achieved by setting the `'short_name'` + property in the `_arg` dictionary. + * All additional properties in `_args` are passed to `argparse.add_argument`. + * If the default of a variable is `None` the user should explicitly set the `'type'` property to an appropriate + type. + * In addition to command line flags, all configuration options can also be controlled through environment variables. + By default, `variable-name` can be changed by setting the environment variable `TH_VARIABLE_NAME`. This default + can be changed by setting the `'env_name'` property. + * Test harness comes with multiple executables. Each of these should use the config facility. For this, + `Config.build_arguments` should be called first with the `argparse` parser. Then `Config.extract_args` needs + to be called with the result of `argparse.ArgumentParser.parse_args`. A sample example could look like this: + ``` + parser = argparse.ArgumentParser('TestHarness', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + config.build_arguments(parser) + args = parser.parse_args() + config.extract_args(args) + ``` + * Changing the default value for all executables might not always be desirable. If it should be only changed for + one executable Config.change_default should be used. 
+ """ + def __init__(self): + self.random = random.Random() + self.cluster_file: str | None = None + self.cluster_file_args = {'short_name': 'C', 'type': str, 'help': 'Path to fdb cluster file', 'required': False, + 'env_name': 'JOSHUA_CLUSTER_FILE'} + self.joshua_dir: str | None = None + self.joshua_dir_args = {'type': str, 'help': 'Where to write FDB data to', 'required': False, + 'env_name': 'JOSHUA_APP_DIR'} + self.stats: str | None = None + self.stats_args = {'type': str, 'help': 'A base64 encoded list of statistics (used to reproduce runs)', + 'required': False} + self.random_seed: int | None = None + self.random_seed_args = {'type': int, + 'help': 'Force given seed given to fdbserver -- mostly useful for debugging', + 'required': False} + self.kill_seconds: int = 30 * 60 + self.kill_seconds_args = {'help': 'Timeout for individual test'} + self.buggify_on_ratio: float = 0.8 + self.buggify_on_ratio_args = {'help': 'Probability that buggify is turned on'} + self.write_run_times = False + self.write_run_times_args = {'help': 'Write back probabilities after each test run', + 'action': 'store_true'} + self.unseed_check_ratio: float = 0.05 + self.unseed_check_ratio_args = {'help': 'Probability for doing determinism check'} + self.test_dirs: List[str] = ['slow', 'fast', 'restarting', 'rare', 'noSim'] + self.test_dirs_args: dict = {'nargs': '*', 'help': 'test_directories to look for files in'} + self.trace_format: str = 'json' + self.trace_format_args = {'choices': ['json', 'xml'], 'help': 'What format fdb should produce'} + self.crash_on_error: bool = True + self.crash_on_error_args = {'long_name': 'no_crash', 'action': 'store_false', + 'help': 'Don\'t crash on first error'} + self.max_warnings: int = 10 + self.max_warnings_args = {'short_name': 'W'} + self.max_errors: int = 10 + self.max_errors_args = {'short_name': 'E'} + self.old_binaries_path: Path = Path('/app/deploy/global_data/oldBinaries/') + self.old_binaries_path_args = {'help': 'Path to the directory 
containing the old fdb binaries'} + self.use_valgrind: bool = False + self.use_valgrind_args = {'action': 'store_true'} + self.buggify = BuggifyOption('random') + self.buggify_args = {'short_name': 'b', 'choices': ['on', 'off', 'random']} + self.pretty_print: bool = False + self.pretty_print_args = {'short_name': 'P', 'action': 'store_true'} + self.clean_up: bool = True + self.clean_up_args = {'long_name': 'no_clean_up', 'action': 'store_false'} + self.run_dir: Path = Path('tmp') + self.joshua_seed: int = random.randint(0, 2 ** 32 - 1) + self.joshua_seed_args = {'short_name': 's', 'help': 'A random seed', 'env_name': 'JOSHUA_SEED'} + self.print_coverage = False + self.print_coverage_args = {'action': 'store_true'} + self.binary = Path('bin') / ('fdbserver.exe' if os.name == 'nt' else 'fdbserver') + self.binary_args = {'help': 'Path to executable'} + self.hit_per_runs_ratio: int = 20000 + self.hit_per_runs_ratio_args = {'help': 'Maximum test runs before each code probe hit at least once'} + self.output_format: str = 'xml' + self.output_format_args = {'short_name': 'O', 'choices': ['json', 'xml'], + 'help': 'What format TestHarness should produce'} + self.include_test_files: str = r'.*' + self.include_test_files_args = {'help': 'Only consider test files whose path match against the given regex'} + self.exclude_test_files: str = r'.^' + self.exclude_test_files_args = {'help': 'Don\'t consider test files whose path match against the given regex'} + self.include_test_classes: str = r'.*' + self.include_test_classes_args = {'help': 'Only consider tests whose names match against the given regex'} + self.exclude_test_names: str = r'.^' + self.exclude_test_names_args = {'help': 'Don\'t consider tests whose names match against the given regex'} + self.details: bool = False + self.details_args = {'help': 'Print detailed results', 'short_name': 'c', 'action': 'store_true'} + self.success: bool = False + self.success_args = {'help': 'Print successful results', 'action': 
'store_true'} + self.cov_include_files: str = r'.*' + self.cov_include_files_args = {'help': 'Only consider coverage traces that originated in files matching regex'} + self.cov_exclude_files: str = r'.^' + self.cov_exclude_files_args = {'help': 'Ignore coverage traces that originated in files matching regex'} + self.max_stderr_bytes: int = 1000 + self.write_stats: bool = True + self.read_stats: bool = True + self.reproduce_prefix: str | None = None + self.reproduce_prefix_args = {'type': str, 'required': False, + 'help': 'When printing the results, prepend this string to the command'} + self._env_names: Dict[str, str] = {} + self._config_map = self._build_map() + self._read_env() + self.random.seed(self.joshua_seed, version=2) + + def change_default(self, attr: str, default_val): + assert attr in self._config_map, 'Unknown config attribute {}'.format(attr) + self.__setattr__(attr, default_val) + self._config_map[attr].kwargs['default'] = default_val + + def _get_env_name(self, var_name: str) -> str: + return self._env_names.get(var_name, 'TH_{}'.format(var_name.upper())) + + def dump(self): + for attr in dir(self): + obj = getattr(self, attr) + if attr == 'random' or attr.startswith('_') or callable(obj) or attr.endswith('_args'): + continue + print('config.{}: {} = {}'.format(attr, type(obj), obj)) + + def _build_map(self) -> OrderedDict[str, ConfigValue]: + config_map: OrderedDict[str, ConfigValue] = collections.OrderedDict() + for attr in dir(self): + obj = getattr(self, attr) + if attr == 'random' or attr.startswith('_') or callable(obj): + continue + if attr.endswith('_args'): + name = attr[0:-len('_args')] + assert name in config_map + assert isinstance(obj, dict) + for k, v in obj.items(): + if k == 'env_name': + self._env_names[name] = v + else: + config_map[name].kwargs[k] = v + else: + # attribute_args has to be declared after the attribute + assert attr not in config_map + val_type = type(obj) + kwargs = {'type': val_type, 'default': obj} + 
config_map[attr] = ConfigValue(attr, **kwargs) + return config_map + + def _read_env(self): + for attr in dir(self): + obj = getattr(self, attr) + if attr == 'random' or attr.startswith('_') or attr.endswith('_args') or callable(obj): + continue + env_name = self._get_env_name(attr) + attr_type = self._config_map[attr].kwargs['type'] + assert type(None) != attr_type + e = os.getenv(env_name) + if e is not None: + # Use the env var to supply the default value, so that if the + # environment variable is set and the corresponding command line + # flag is not, the environment variable has an effect. + self._config_map[attr].kwargs['default'] = attr_type(e) + + def build_arguments(self, parser: argparse.ArgumentParser): + for val in self._config_map.values(): + val.add_to_args(parser) + + def extract_args(self, args: argparse.Namespace): + for val in self._config_map.values(): + k, v = val.get_value(args) + if v is not None: + config.__setattr__(k, v) + self.random.seed(self.joshua_seed, version=2) + + +config = Config() + +if __name__ == '__main__': + # test the config setup + parser = argparse.ArgumentParser('TestHarness Config Tester', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + config.build_arguments(parser) + args = parser.parse_args() + config.extract_args(args) + config.dump() diff --git a/contrib/TestHarness2/test_harness/fdb.py b/contrib/TestHarness2/test_harness/fdb.py new file mode 100644 index 0000000000..1e6afa3906 --- /dev/null +++ b/contrib/TestHarness2/test_harness/fdb.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +from typing import OrderedDict, Tuple, List + +import collections +import fdb +import fdb.tuple +import struct + +from test_harness.run import StatFetcher, TestDescription +from test_harness.config import config +from test_harness.summarize import SummaryTree, Coverage + +# Before increasing this, make sure that all Joshua clusters (at Apple and Snowflake) have been upgraded. 
+# This version needs to be changed if we either need newer features from FDB or the current API version is +# getting retired. +fdb.api_version(630) + + +def str_to_tuple(s: str | None): + if s is None: + return s + return tuple(s.split(',')) + + +fdb_db = None + + +def open_db(cluster_file: str | None): + global fdb_db + if fdb_db is None: + fdb_db = fdb.open(cluster_file) + return fdb_db + + +def chunkify(iterable, sz: int): + res = [] + for item in iterable: + res.append(item) + if len(res) >= sz: + yield res + res = [] + if len(res) > 0: + yield res + + +@fdb.transactional +def write_coverage_chunk(tr, path: Tuple[str, ...], metadata: Tuple[str, ...], + coverage: List[Tuple[Coverage, bool]], initialized: bool) -> bool: + cov_dir = fdb.directory.create_or_open(tr, path) + if not initialized: + metadata_dir = fdb.directory.create_or_open(tr, metadata) + v = tr[metadata_dir['initialized']] + initialized = v.present() + for cov, covered in coverage: + if not initialized or covered: + tr.add(cov_dir.pack((cov.file, cov.line, cov.comment)), struct.pack(' OrderedDict[Coverage, int]: + res = collections.OrderedDict() + cov_dir = fdb.directory.create_or_open(tr, cov_path) + for k, v in tr[cov_dir.range()]: + file, line, comment = cov_dir.unpack(k) + count = struct.unpack(' OrderedDict[Coverage, int]: + db = open_db(cluster_file) + return _read_coverage(db, cov_path) + + +class TestStatistics: + def __init__(self, runtime: int, run_count: int): + self.runtime: int = runtime + self.run_count: int = run_count + + +class Statistics: + def __init__(self, cluster_file: str | None, joshua_dir: Tuple[str, ...]): + self.db = open_db(cluster_file) + self.stats_dir = self.open_stats_dir(self.db, joshua_dir) + self.stats: OrderedDict[str, TestStatistics] = self.read_stats_from_db(self.db) + + @fdb.transactional + def open_stats_dir(self, tr, app_dir: Tuple[str]): + stats_dir = app_dir + ('runtime_stats',) + return fdb.directory.create_or_open(tr, stats_dir) + + @fdb.transactional 
+ def read_stats_from_db(self, tr) -> OrderedDict[str, TestStatistics]: + result = collections.OrderedDict() + for k, v in tr[self.stats_dir.range()]: + test_name = self.stats_dir.unpack(k)[0] + runtime, run_count = struct.unpack(' None: + key = self.stats_dir.pack((test_name,)) + tr.add(key, struct.pack(' None: + assert self.db is not None + self._write_runtime(self.db, test_name, time) + + +class FDBStatFetcher(StatFetcher): + def __init__(self, tests: OrderedDict[str, TestDescription], + joshua_dir: Tuple[str] = str_to_tuple(config.joshua_dir)): + super().__init__(tests) + self.statistics = Statistics(config.cluster_file, joshua_dir) + + def read_stats(self): + for k, v in self.statistics.stats.items(): + if k in self.tests.keys(): + self.tests[k].total_runtime = v.runtime + self.tests[k].num_runs = v.run_count + + def add_run_time(self, test_name: str, runtime: int, out: SummaryTree): + self.statistics.write_runtime(test_name, runtime) + super().add_run_time(test_name, runtime, out) diff --git a/contrib/TestHarness2/test_harness/joshua.py b/contrib/TestHarness2/test_harness/joshua.py new file mode 100644 index 0000000000..33c5881dcc --- /dev/null +++ b/contrib/TestHarness2/test_harness/joshua.py @@ -0,0 +1,161 @@ +from __future__ import annotations + +import collections +import io +import sys +import xml.sax +import xml.sax.handler +from pathlib import Path +from typing import List, OrderedDict, Set + +from joshua import joshua_model + +import test_harness.run +from test_harness.config import config +from test_harness.summarize import SummaryTree + + +class ToSummaryTree(xml.sax.handler.ContentHandler): + def __init__(self): + super().__init__() + self.root: SummaryTree | None = None + self.stack: List[SummaryTree] = [] + + def result(self) -> SummaryTree: + assert len(self.stack) == 0 and self.root is not None, 'Parse Error' + return self.root + + def startElement(self, name, attrs): + new_child = SummaryTree(name) + for k, v in attrs.items(): + 
new_child.attributes[k] = v + self.stack.append(new_child) + + def endElement(self, name): + closed = self.stack.pop() + assert closed.name == name + if len(self.stack) == 0: + self.root = closed + else: + self.stack[-1].children.append(closed) + + +def _print_summary(summary: SummaryTree, commands: Set[str]): + cmd = [] + if config.reproduce_prefix is not None: + cmd.append(config.reproduce_prefix) + cmd.append('fdbserver') + if 'TestFile' in summary.attributes: + file_name = summary.attributes['TestFile'] + role = 'test' if test_harness.run.is_no_sim(Path(file_name)) else 'simulation' + cmd += ['-r', role, '-f', file_name] + else: + cmd += ['-r', 'simulation', '-f', ''] + if 'RandomSeed' in summary.attributes: + cmd += ['-s', summary.attributes['RandomSeed']] + else: + cmd += ['-s', ''] + if 'BuggifyEnabled' in summary.attributes: + arg = 'on' + if summary.attributes['BuggifyEnabled'].lower() in ['0', 'off', 'false']: + arg = 'off' + cmd += ['-b', arg] + else: + cmd += ['b', ''] + cmd += ['--crash', '--trace_format', config.trace_format] + key = ' '.join(cmd) + count = 1 + while key in commands: + key = '{} # {}'.format(' '.join(cmd), count) + count += 1 + # we want the command as the first attribute + attributes = {'Command': ' '.join(cmd)} + for k, v in summary.attributes.items(): + if k == 'Errors': + attributes['ErrorCount'] = v + else: + attributes[k] = v + summary.attributes = attributes + if config.details: + key = str(len(commands)) + str_io = io.StringIO() + summary.dump(str_io, prefix=(' ' if config.pretty_print else '')) + if config.output_format == 'json': + sys.stdout.write('{}"Test{}": {}'.format(' ' if config.pretty_print else '', + key, str_io.getvalue())) + else: + sys.stdout.write(str_io.getvalue()) + if config.pretty_print: + sys.stdout.write('\n' if config.output_format == 'xml' else ',\n') + return key + error_count = 0 + warning_count = 0 + small_summary = SummaryTree('Test') + small_summary.attributes = attributes + errors = 
SummaryTree('Errors') + warnings = SummaryTree('Warnings') + buggifies: OrderedDict[str, List[int]] = collections.OrderedDict() + for child in summary.children: + if 'Severity' in child.attributes and child.attributes['Severity'] == '40' and error_count < config.max_errors: + error_count += 1 + errors.append(child) + if 'Severity' in child.attributes and child.attributes[ + 'Severity'] == '30' and warning_count < config.max_warnings: + warning_count += 1 + warnings.append(child) + if child.name == 'BuggifySection': + file = child.attributes['File'] + line = int(child.attributes['Line']) + buggifies.setdefault(file, []).append(line) + buggifies_elem = SummaryTree('Buggifies') + for file, lines in buggifies.items(): + lines.sort() + if config.output_format == 'json': + buggifies_elem.attributes[file] = ' '.join(str(line) for line in lines) + else: + child = SummaryTree('Buggify') + child.attributes['File'] = file + child.attributes['Lines'] = ' '.join(str(line) for line in lines) + small_summary.append(child) + small_summary.children.append(buggifies_elem) + if len(errors.children) > 0: + small_summary.children.append(errors) + if len(warnings.children) > 0: + small_summary.children.append(warnings) + output = io.StringIO() + small_summary.dump(output, prefix=(' ' if config.pretty_print else '')) + if config.output_format == 'json': + sys.stdout.write('{}"{}": {}'.format(' ' if config.pretty_print else '', key, output.getvalue().strip())) + else: + sys.stdout.write('{}{}'.format(' ' if config.pretty_print else '', output.getvalue().strip())) + sys.stdout.write('\n' if config.output_format == 'xml' else ',\n') + + +def print_errors(ensemble_id: str): + joshua_model.open(config.cluster_file) + properties = joshua_model.get_ensemble_properties(ensemble_id) + compressed = properties["compressed"] if "compressed" in properties else False + for rec in joshua_model.tail_results(ensemble_id, errors_only=(not config.success), compressed=compressed): + if len(rec) == 5: + 
version_stamp, result_code, host, seed, output = rec + elif len(rec) == 4: + version_stamp, result_code, host, output = rec + seed = None + elif len(rec) == 3: + version_stamp, result_code, output = rec + host = None + seed = None + elif len(rec) == 2: + version_stamp, seed = rec + output = str(joshua_model.fdb.tuple.unpack(seed)[0]) + "\n" + result_code = None + host = None + seed = None + else: + raise Exception("Unknown result format") + lines = output.splitlines() + commands: Set[str] = set() + for line in lines: + summary = ToSummaryTree() + xml.sax.parseString(line, summary) + commands.add(_print_summary(summary.result(), commands)) diff --git a/contrib/TestHarness2/test_harness/results.py b/contrib/TestHarness2/test_harness/results.py new file mode 100644 index 0000000000..486c497d35 --- /dev/null +++ b/contrib/TestHarness2/test_harness/results.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +import argparse +import io +import json +import re +import sys +import test_harness.fdb + +from typing import List, Tuple, OrderedDict +from test_harness.summarize import SummaryTree, Coverage +from test_harness.config import config +from xml.sax.saxutils import quoteattr + + +class GlobalStatistics: + def __init__(self): + self.total_probes_hit: int = 0 + self.total_cpu_time: int = 0 + self.total_test_runs: int = 0 + self.total_missed_probes: int = 0 + + +class EnsembleResults: + def __init__(self, cluster_file: str | None, ensemble_id: str): + self.global_statistics = GlobalStatistics() + self.fdb_path = ('joshua', 'ensembles', 'results', 'application', ensemble_id) + self.coverage_path = self.fdb_path + ('coverage',) + self.statistics = test_harness.fdb.Statistics(cluster_file, self.fdb_path) + coverage_dict: OrderedDict[Coverage, int] = test_harness.fdb.read_coverage(cluster_file, self.coverage_path) + self.coverage: List[Tuple[Coverage, int]] = [] + self.min_coverage_hit: int | None = None + self.ratio = self.global_statistics.total_test_runs / 
config.hit_per_runs_ratio + for cov, count in coverage_dict.items(): + if re.search(config.cov_include_files, cov.file) is None: + continue + if re.search(config.cov_exclude_files, cov.file) is not None: + continue + self.global_statistics.total_probes_hit += count + self.coverage.append((cov, count)) + if count <= self.ratio: + self.global_statistics.total_missed_probes += 1 + if self.min_coverage_hit is None or self.min_coverage_hit > count: + self.min_coverage_hit = count + self.coverage.sort(key=lambda x: (x[1], x[0].file, x[0].line)) + self.stats: List[Tuple[str, int, int]] = [] + for k, v in self.statistics.stats.items(): + self.global_statistics.total_test_runs += v.run_count + self.global_statistics.total_cpu_time += v.runtime + self.stats.append((k, v.runtime, v.run_count)) + self.stats.sort(key=lambda x: x[1], reverse=True) + if self.min_coverage_hit is not None: + self.coverage_ok = self.min_coverage_hit > self.ratio + else: + self.coverage_ok = False + + def dump(self, prefix: str): + errors = 0 + out = SummaryTree('EnsembleResults') + out.attributes['TotalRuntime'] = str(self.global_statistics.total_cpu_time) + out.attributes['TotalTestRuns'] = str(self.global_statistics.total_test_runs) + out.attributes['TotalProbesHit'] = str(self.global_statistics.total_probes_hit) + out.attributes['MinProbeHit'] = str(self.min_coverage_hit) + out.attributes['TotalProbes'] = str(len(self.coverage)) + out.attributes['MissedProbes'] = str(self.global_statistics.total_missed_probes) + + for cov, count in self.coverage: + severity = 10 if count > self.ratio else 40 + if severity == 40: + errors += 1 + if (severity == 40 and errors <= config.max_errors) or config.details: + child = SummaryTree('CodeProbe') + child.attributes['Severity'] = str(severity) + child.attributes['File'] = cov.file + child.attributes['Line'] = str(cov.line) + child.attributes['Comment'] = '' if cov.comment is None else cov.comment + child.attributes['HitCount'] = str(count) + out.append(child) + 
+ if config.details: + for k, runtime, run_count in self.stats: + child = SummaryTree('Test') + child.attributes['Name'] = k + child.attributes['Runtime'] = str(runtime) + child.attributes['RunCount'] = str(run_count) + out.append(child) + if errors > 0: + out.attributes['Errors'] = str(errors) + str_io = io.StringIO() + out.dump(str_io, prefix=prefix, new_line=config.pretty_print) + if config.output_format == 'xml': + sys.stdout.write(str_io.getvalue()) + else: + sys.stdout.write('{}"EnsembleResults":{}{}'.format(' ' if config.pretty_print else '', + '\n' if config.pretty_print else ' ', + str_io.getvalue())) + + +def write_header(ensemble_id: str): + if config.output_format == 'json': + if config.pretty_print: + print('{') + print(' "{}": {},\n'.format('ID', json.dumps(ensemble_id.strip()))) + else: + sys.stdout.write('{{{}: {},'.format('ID', json.dumps(ensemble_id.strip()))) + elif config.output_format == 'xml': + sys.stdout.write(''.format(quoteattr(ensemble_id.strip()))) + if config.pretty_print: + sys.stdout.write('\n') + else: + assert False, 'unknown output format {}'.format(config.output_format) + + +def write_footer(): + if config.output_format == 'xml': + sys.stdout.write('\n') + elif config.output_format == 'json': + sys.stdout.write('}\n') + else: + assert False, 'unknown output format {}'.format(config.output_format) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('TestHarness Results', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + config.change_default('pretty_print', True) + config.change_default('max_warnings', 0) + config.build_arguments(parser) + parser.add_argument('ensemble_id', type=str, help='The ensemble to fetch the result for') + args = parser.parse_args() + config.extract_args(args) + config.output_format = args.output_format + write_header(args.ensemble_id) + try: + import test_harness.joshua + test_harness.joshua.print_errors(args.ensemble_id) + except ModuleNotFoundError: + child = 
SummaryTree('JoshuaNotFound') + child.attributes['Severity'] = '30' + child.attributes['Message'] = 'Could not import Joshua -- set PYTHONPATH to joshua checkout dir' + child.dump(sys.stdout, prefix=(' ' if config.pretty_print else ''), new_line=config.pretty_print) + results = EnsembleResults(config.cluster_file, args.ensemble_id) + results.dump(' ' if config.pretty_print else '') + write_footer() + exit(0 if results.coverage_ok else 1) diff --git a/contrib/TestHarness2/test_harness/run.py b/contrib/TestHarness2/test_harness/run.py new file mode 100644 index 0000000000..c5e948eb6d --- /dev/null +++ b/contrib/TestHarness2/test_harness/run.py @@ -0,0 +1,465 @@ +from __future__ import annotations + +import array +import base64 +import collections +import math +import os +import resource +import shutil +import subprocess +import re +import sys +import threading +import time +import uuid + +from functools import total_ordering +from pathlib import Path +from test_harness.version import Version +from test_harness.config import config +from typing import List, Pattern, OrderedDict + +from test_harness.summarize import Summary, SummaryTree + + +@total_ordering +class TestDescription: + def __init__(self, path: Path, name: str, priority: float): + self.paths: List[Path] = [path] + self.name = name + self.priority: float = priority + # we only measure in seconds. 
Otherwise, keeping determinism will be difficult + self.total_runtime: int = 0 + self.num_runs: int = 0 + + def __lt__(self, other): + if isinstance(other, TestDescription): + return self.name < other.name + else: + return self.name < str(other) + + def __eq__(self, other): + if isinstance(other, TestDescription): + return self.name < other.name + else: + return self.name < str(other.name) + + +class StatFetcher: + def __init__(self, tests: OrderedDict[str, TestDescription]): + self.tests = tests + + def read_stats(self): + pass + + def add_run_time(self, test_name: str, runtime: int, out: SummaryTree): + self.tests[test_name].total_runtime += runtime + + +class TestPicker: + def __init__(self, test_dir: Path): + if not test_dir.exists(): + raise RuntimeError('{} is neither a directory nor a file'.format(test_dir)) + self.include_files_regex = re.compile(config.include_test_files) + self.exclude_files_regex = re.compile(config.exclude_test_files) + self.include_tests_regex = re.compile(config.include_test_classes) + self.exclude_tests_regex = re.compile(config.exclude_test_names) + self.test_dir: Path = test_dir + self.tests: OrderedDict[str, TestDescription] = collections.OrderedDict() + self.restart_test: Pattern = re.compile(r".*-\d+\.(txt|toml)") + self.follow_test: Pattern = re.compile(r".*-[2-9]\d*\.(txt|toml)") + + for subdir in self.test_dir.iterdir(): + if subdir.is_dir() and subdir.name in config.test_dirs: + self.walk_test_dir(subdir) + self.stat_fetcher: StatFetcher + if config.stats is not None or config.joshua_dir is None: + self.stat_fetcher = StatFetcher(self.tests) + else: + from test_harness.fdb import FDBStatFetcher + self.stat_fetcher = FDBStatFetcher(self.tests) + if config.stats is not None: + self.load_stats(config.stats) + else: + self.fetch_stats() + + def add_time(self, test_file: Path, run_time: int, out: SummaryTree) -> None: + # getting the test name is fairly inefficient. 
But since we only have 100s of tests, I won't bother + test_name: str | None = None + test_desc: TestDescription | None = None + for name, test in self.tests.items(): + for p in test.paths: + test_files: List[Path] + if self.restart_test.match(p.name): + test_files = self.list_restart_files(p) + else: + test_files = [p] + for file in test_files: + if file.absolute() == test_file.absolute(): + test_name = name + test_desc = test + break + if test_name is not None: + break + if test_name is not None: + break + assert test_name is not None and test_desc is not None + self.stat_fetcher.add_run_time(test_name, run_time, out) + out.attributes['TotalTestTime'] = str(test_desc.total_runtime) + out.attributes['TestRunCount'] = str(test_desc.num_runs) + + def dump_stats(self) -> str: + res = array.array('I') + for _, spec in self.tests.items(): + res.append(spec.total_runtime) + return base64.standard_b64encode(res.tobytes()).decode('utf-8') + + def fetch_stats(self): + self.stat_fetcher.read_stats() + + def load_stats(self, serialized: str): + times = array.array('I') + times.frombytes(base64.standard_b64decode(serialized)) + assert len(times) == len(self.tests.items()) + for idx, (_, spec) in enumerate(self.tests.items()): + spec.total_runtime = times[idx] + + def parse_txt(self, path: Path): + if self.include_files_regex.search(str(path)) is None or self.exclude_files_regex.search(str(path)) is not None: + return + with path.open('r') as f: + test_name: str | None = None + test_class: str | None = None + priority: float | None = None + for line in f: + line = line.strip() + kv = line.split('=') + if len(kv) != 2: + continue + kv[0] = kv[0].strip() + kv[1] = kv[1].strip(' \r\n\t\'"') + if kv[0] == 'testTitle' and test_name is None: + test_name = kv[1] + if kv[0] == 'testClass' and test_class is None: + test_class = kv[1] + if kv[0] == 'testPriority' and priority is None: + try: + priority = float(kv[1]) + except ValueError: + raise RuntimeError("Can't parse {} -- 
testPriority in {} should be set to a float".format(kv[1], + path)) + if test_name is not None and test_class is not None and priority is not None: + break + if test_name is None: + return + if test_class is None: + test_class = test_name + if priority is None: + priority = 1.0 + if self.include_tests_regex.search(test_class) is None \ + or self.exclude_tests_regex.search(test_class) is not None: + return + if test_class not in self.tests: + self.tests[test_class] = TestDescription(path, test_class, priority) + else: + self.tests[test_class].paths.append(path) + + def walk_test_dir(self, test: Path): + if test.is_dir(): + for file in test.iterdir(): + self.walk_test_dir(file) + else: + # check whether we're looking at a restart test + if self.follow_test.match(test.name) is not None: + return + if test.suffix == '.txt' or test.suffix == '.toml': + self.parse_txt(test) + + @staticmethod + def list_restart_files(start_file: Path) -> List[Path]: + name = re.sub(r'-\d+.(txt|toml)', '', start_file.name) + res: List[Path] = [] + for test_file in start_file.parent.iterdir(): + if test_file.name.startswith(name): + res.append(test_file) + assert len(res) > 1 + res.sort() + return res + + def choose_test(self) -> List[Path]: + min_runtime: float | None = None + candidates: List[TestDescription] = [] + for _, v in self.tests.items(): + this_time = v.total_runtime * v.priority + if min_runtime is None or this_time < min_runtime: + min_runtime = this_time + candidates = [v] + elif this_time == min_runtime: + candidates.append(v) + candidates.sort() + choice = config.random.randint(0, len(candidates) - 1) + test = candidates[choice] + result = test.paths[config.random.randint(0, len(test.paths) - 1)] + if self.restart_test.match(result.name): + return self.list_restart_files(result) + else: + return [result] + + +class OldBinaries: + def __init__(self): + self.first_file_expr = re.compile(r'.*-1\.(txt|toml)') + self.old_binaries_path: Path = config.old_binaries_path + 
self.binaries: OrderedDict[Version, Path] = collections.OrderedDict() + if not self.old_binaries_path.exists() or not self.old_binaries_path.is_dir(): + return + exec_pattern = re.compile(r'fdbserver-\d+\.\d+\.\d+(\.exe)?') + for file in self.old_binaries_path.iterdir(): + if not file.is_file() or not os.access(file, os.X_OK): + continue + if exec_pattern.fullmatch(file.name) is not None: + self._add_file(file) + + def _add_file(self, file: Path): + version_str = file.name.split('-')[1] + if version_str.endswith('.exe'): + version_str = version_str[0:-len('.exe')] + ver = Version.parse(version_str) + self.binaries[ver] = file + + def choose_binary(self, test_file: Path) -> Path: + if len(self.binaries) == 0: + return config.binary + max_version = Version.max_version() + min_version = Version.parse('5.0.0') + dirs = test_file.parent.parts + if 'restarting' not in dirs: + return config.binary + version_expr = dirs[-1].split('_') + first_file = self.first_file_expr.match(test_file.name) is not None + if first_file and version_expr[0] == 'to': + # downgrade test -- first binary should be current one + return config.binary + if not first_file and version_expr[0] == 'from': + # upgrade test -- we only return an old version for the first test file + return config.binary + if version_expr[0] == 'from' or version_expr[0] == 'to': + min_version = Version.parse(version_expr[1]) + if len(version_expr) == 4 and version_expr[2] == 'until': + max_version = Version.parse(version_expr[3]) + candidates: List[Path] = [] + for ver, binary in self.binaries.items(): + if min_version <= ver <= max_version: + candidates.append(binary) + if len(candidates) == 0: + return config.binary + return config.random.choice(candidates) + + +def is_restarting_test(test_file: Path): + for p in test_file.parts: + if p == 'restarting': + return True + return False + + +def is_no_sim(test_file: Path): + return test_file.parts[-2] == 'noSim' + + +class ResourceMonitor(threading.Thread): + def 
__init__(self): + super().__init__() + self.start_time = time.time() + self.end_time: float | None = None + self._stop_monitor = False + self.max_rss = 0 + + def run(self) -> None: + while not self._stop_monitor: + time.sleep(1) + resources = resource.getrusage(resource.RUSAGE_CHILDREN) + self.max_rss = max(resources.ru_maxrss, self.max_rss) + + def stop(self): + self.end_time = time.time() + self._stop_monitor = True + + def time(self): + return self.end_time - self.start_time + + +class TestRun: + def __init__(self, binary: Path, test_file: Path, random_seed: int, uid: uuid.UUID, + restarting: bool = False, test_determinism: bool = False, buggify_enabled: bool = False, + stats: str | None = None, expected_unseed: int | None = None, will_restart: bool = False): + self.binary = binary + self.test_file = test_file + self.random_seed = random_seed + self.uid = uid + self.restarting = restarting + self.test_determinism = test_determinism + self.stats: str | None = stats + self.expected_unseed: int | None = expected_unseed + self.use_valgrind: bool = config.use_valgrind + self.old_binary_path: Path = config.old_binaries_path + self.buggify_enabled: bool = buggify_enabled + self.fault_injection_enabled: bool = True + self.trace_format: str | None = config.trace_format + if Version.of_binary(self.binary) < "6.1.0": + self.trace_format = None + self.temp_path = config.run_dir / str(self.uid) + # state for the run + self.retryable_error: bool = False + self.summary: Summary = Summary(binary, uid=self.uid, stats=self.stats, expected_unseed=self.expected_unseed, + will_restart=will_restart) + self.run_time: int = 0 + self.success = self.run() + + def log_test_plan(self, out: SummaryTree): + test_plan: SummaryTree = SummaryTree('TestPlan') + test_plan.attributes['TestUID'] = str(self.uid) + test_plan.attributes['RandomSeed'] = str(self.random_seed) + test_plan.attributes['TestFile'] = str(self.test_file) + test_plan.attributes['Buggify'] = '1' if self.buggify_enabled else '0' 
+ test_plan.attributes['FaultInjectionEnabled'] = '1' if self.fault_injection_enabled else '0' + test_plan.attributes['DeterminismCheck'] = '1' if self.test_determinism else '0' + out.append(test_plan) + + def delete_simdir(self): + shutil.rmtree(self.temp_path / Path('simfdb')) + + def run(self): + command: List[str] = [] + valgrind_file: Path | None = None + if self.use_valgrind: + command.append('valgrind') + valgrind_file = self.temp_path / Path('valgrind-{}.xml'.format(self.random_seed)) + dbg_path = os.getenv('FDB_VALGRIND_DBGPATH') + if dbg_path is not None: + command.append('--extra-debuginfo-path={}'.format(dbg_path)) + command += ['--xml=yes', '--xml-file={}'.format(valgrind_file.absolute()), '-q'] + command += [str(self.binary.absolute()), + '-r', 'test' if is_no_sim(self.test_file) else 'simulation', + '-f', str(self.test_file), + '-s', str(self.random_seed)] + if self.trace_format is not None: + command += ['--trace_format', self.trace_format] + if Version.of_binary(self.binary) >= '7.1.0': + command += ['-fi', 'on' if self.fault_injection_enabled else 'off'] + if self.restarting: + command.append('--restarting') + if self.buggify_enabled: + command += ['-b', 'on'] + if config.crash_on_error: + command.append('--crash') + + self.temp_path.mkdir(parents=True, exist_ok=True) + + # self.log_test_plan(out) + resources = ResourceMonitor() + resources.start() + process = subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, cwd=self.temp_path, + text=True) + did_kill = False + timeout = 20 * config.kill_seconds if self.use_valgrind else config.kill_seconds + err_out: str + try: + _, err_out = process.communicate(timeout=timeout) + except subprocess.TimeoutExpired: + process.kill() + _, err_out = process.communicate() + did_kill = True + resources.stop() + resources.join() + # we're rounding times up, otherwise we will prefer running very short tests (<1s) + self.run_time = math.ceil(resources.time()) + self.summary.runtime = 
resources.time() + self.summary.max_rss = resources.max_rss + self.summary.was_killed = did_kill + self.summary.valgrind_out_file = valgrind_file + self.summary.error_out = err_out + self.summary.summarize(self.temp_path, ' '.join(command)) + return self.summary.ok() + + +def decorate_summary(out: SummaryTree, test_file: Path, seed: int, buggify: bool): + """Sometimes a test can crash before ProgramStart is written to the traces. These + tests are then hard to reproduce (they can be reproduced through TestHarness but + require the user to run in the joshua docker container). To account for this we + will write the necessary information into the attributes if it is missing.""" + if 'TestFile' not in out.attributes: + out.attributes['TestFile'] = str(test_file) + if 'RandomSeed' not in out.attributes: + out.attributes['RandomSeed'] = str(seed) + if 'BuggifyEnabled' not in out.attributes: + out.attributes['BuggifyEnabled'] = '1' if buggify else '0' + + +class TestRunner: + def __init__(self): + self.uid = uuid.uuid4() + self.test_path: Path = Path('tests') + self.cluster_file: str | None = None + self.fdb_app_dir: str | None = None + self.binary_chooser = OldBinaries() + self.test_picker = TestPicker(self.test_path) + + def backup_sim_dir(self, seed: int): + temp_dir = config.run_dir / str(self.uid) + src_dir = temp_dir / 'simfdb' + assert src_dir.is_dir() + dest_dir = temp_dir / 'simfdb.{}'.format(seed) + assert not dest_dir.exists() + shutil.copytree(src_dir, dest_dir) + + def restore_sim_dir(self, seed: int): + temp_dir = config.run_dir / str(self.uid) + src_dir = temp_dir / 'simfdb.{}'.format(seed) + assert src_dir.exists() + dest_dir = temp_dir / 'simfdb' + shutil.rmtree(dest_dir) + shutil.move(src_dir, dest_dir) + + def run_tests(self, test_files: List[Path], seed: int, test_picker: TestPicker) -> bool: + result: bool = True + for count, file in enumerate(test_files): + will_restart = count + 1 < len(test_files) + binary = self.binary_chooser.choose_binary(file) 
+ unseed_check = not is_no_sim(file) and config.random.random() < config.unseed_check_ratio + buggify_enabled: bool = config.random.random() < config.buggify_on_ratio + if unseed_check and count != 0: + # for restarting tests we will need to restore the sim2 after the first run + self.backup_sim_dir(seed + count - 1) + run = TestRun(binary, file.absolute(), seed + count, self.uid, restarting=count != 0, + stats=test_picker.dump_stats(), will_restart=will_restart, buggify_enabled=buggify_enabled) + result = result and run.success + test_picker.add_time(test_files[0], run.run_time, run.summary.out) + decorate_summary(run.summary.out, file, seed + count, run.buggify_enabled) + if unseed_check and run.summary.unseed: + run.summary.out.append(run.summary.list_simfdb()) + run.summary.out.dump(sys.stdout) + if not result: + return False + if unseed_check and run.summary.unseed is not None: + if count != 0: + self.restore_sim_dir(seed + count - 1) + run2 = TestRun(binary, file.absolute(), seed + count, self.uid, restarting=count != 0, + stats=test_picker.dump_stats(), expected_unseed=run.summary.unseed, + will_restart=will_restart, buggify_enabled=buggify_enabled) + test_picker.add_time(file, run2.run_time, run.summary.out) + decorate_summary(run2.summary.out, file, seed + count, run.buggify_enabled) + run2.summary.out.dump(sys.stdout) + result = result and run2.success + if not result: + return False + return result + + def run(self) -> bool: + seed = config.random_seed if config.random_seed is not None else config.random.randint(0, 2 ** 32 - 1) + test_files = self.test_picker.choose_test() + success = self.run_tests(test_files, seed, self.test_picker) + if config.clean_up: + shutil.rmtree(config.run_dir / str(self.uid)) + return success diff --git a/contrib/TestHarness2/test_harness/summarize.py b/contrib/TestHarness2/test_harness/summarize.py new file mode 100644 index 0000000000..8be5d2b507 --- /dev/null +++ b/contrib/TestHarness2/test_harness/summarize.py @@ -0,0 
class SummaryTree:
    """A named node with string attributes and child nodes.

    The tree is the in-memory form of a test summary and can be serialised
    either as JSON or as a hand-written XML element.
    """

    def __init__(self, name: str):
        self.name = name
        self.children: List[SummaryTree] = []
        self.attributes: Dict[str, str] = {}

    def append(self, element: SummaryTree):
        """Add ``element`` as the last child of this node."""
        self.children.append(element)

    def to_dict(self, add_name: bool = True) -> Dict[str, Any] | List[Any]:
        """Convert the subtree into JSON-serialisable containers.

        :param add_name: include this node's name in the result
        :return: a list (or ``{name: list}``) for a node that only has children,
                 otherwise a dict of the attributes plus the converted children
        """
        if len(self.children) > 0 and len(self.attributes) == 0:
            # Pure container node: represent it by the list of its children.
            children = []
            for child in self.children:
                children.append(child.to_dict())
            if add_name:
                return {self.name: children}
            else:
                return children
        res: Dict[str, Any] = {}
        if add_name:
            res['Type'] = self.name
        for k, v in self.attributes.items():
            res[k] = v
        children = []
        child_keys: Dict[str, int] = {}
        for child in self.children:
            if child.name in child_keys:
                child_keys[child.name] += 1
            else:
                child_keys[child.name] = 1
        for child in self.children:
            # A child whose name is unique and does not clash with an attribute
            # becomes a key of its own; all others are collected under 'children'.
            if child_keys[child.name] == 1 and child.name not in self.attributes:
                res[child.name] = child.to_dict(add_name=False)
            else:
                children.append(child.to_dict())
        if len(children) > 0:
            res['children'] = children
        return res

    def to_json(self, out: TextIO, prefix: str = ''):
        """Write the subtree as JSON, putting ``prefix`` in front of every line."""
        res = json.dumps(self.to_dict(), indent=(' ' if config.pretty_print else None))
        for line in res.splitlines(False):
            out.write('{}{}\n'.format(prefix, line))

    def to_xml(self, out: TextIO, prefix: str = ''):
        """Write the subtree as an XML element without an XML declaration.

        :param out: stream to write to
        :param prefix: indentation written in front of the element
        """
        # minidom doesn't support omitting the xml declaration which is a problem for joshua
        # However, our xml is very simple and therefore serializing manually is easy enough
        attrs = []
        print_width = 120
        try:
            print_width, _ = os.get_terminal_size()
        except OSError:
            # not attached to a terminal: keep the default width
            pass
        for k, v in self.attributes.items():
            # quoteattr only accepts strings; str() keeps a stray non-string
            # attribute value from crashing the whole dump.
            attrs.append('{}={}'.format(k, xml.sax.saxutils.quoteattr(str(v))))
        elem = '{}<{}{}'.format(prefix, self.name, ('' if len(attrs) == 0 else ' '))
        out.write(elem)
        if config.pretty_print:
            curr_line_len = len(elem)
            for i in range(len(attrs)):
                attr_len = len(attrs[i])
                if i == 0 or attr_len + curr_line_len + 1 <= print_width:
                    if i != 0:
                        out.write(' ')
                    out.write(attrs[i])
                    curr_line_len += attr_len
                else:
                    # wrap and align the attribute under the first one
                    out.write('\n')
                    out.write(' ' * len(elem))
                    out.write(attrs[i])
                    curr_line_len = len(elem) + attr_len
        else:
            out.write(' '.join(attrs))
        if len(self.children) == 0:
            out.write('/>')
        else:
            out.write('>')
        for child in self.children:
            if config.pretty_print:
                out.write('\n')
            child.to_xml(out, prefix=(' {}'.format(prefix) if config.pretty_print else prefix))
        if len(self.children) > 0:
            # An element that was opened with '>' needs its closing tag; without it
            # the output is not well-formed XML.
            out.write('{}{}</{}>'.format(('\n' if config.pretty_print else ''), prefix, self.name))

    def dump(self, out: TextIO, prefix: str = '', new_line: bool = True):
        """Serialise in the format selected by ``config.output_format``.

        :param out: stream to write to
        :param prefix: passed through to the serialiser
        :param new_line: terminate the output with a newline
        """
        if config.output_format == 'json':
            self.to_json(out, prefix=prefix)
        else:
            self.to_xml(out, prefix=prefix)
        if new_line:
            out.write('\n')
class Parser:
    """Interface of the trace file parsers."""

    def parse(self, file: TextIO, handler: ParseHandler) -> None:
        """Read ``file`` and pass every event found to ``handler.handle``."""
        pass


class XmlParser(Parser, xml.sax.handler.ContentHandler):
    """Parses an XML trace file; every element is reported as one event."""

    def __init__(self):
        super().__init__()
        self.handler: ParseHandler | None = None

    def parse(self, file: TextIO, handler: ParseHandler) -> None:
        """Parse ``file`` with SAX and report each element to ``handler``.

        :param file: open XML trace file
        :param handler: receives one attribute dict per element
        """
        # startElement reports through self.handler, so it has to be set before
        # the SAX parser starts calling back.
        self.handler = handler
        xml.sax.parse(file, self)

    def startElement(self, name, attrs) -> None:
        """SAX callback: copy the element's attributes and hand them on."""
        attributes: Dict[str, str] = {}
        for attr_name in attrs.getNames():
            attributes[attr_name] = attrs.getValue(attr_name)
        assert self.handler is not None
        self.handler.handle(attributes)


class JsonParser(Parser):
    """Parses a trace file that holds one JSON object per line."""

    def __init__(self):
        super().__init__()

    def parse(self, file: TextIO, handler: ParseHandler):
        """Decode every line of ``file`` and report it to ``handler``.

        :raises json.JSONDecodeError: if a line is not valid JSON
        """
        for line in file:
            obj = json.loads(line)
            handler.handle(obj)
class TraceFiles:
    """Index of the trace files inside one directory, grouped per run.

    A file is indexed when its name starts with 'trace' and contains '.json'
    or '.xml'. The seventh dot-separated field of the file name is parsed as an
    integer and used as the grouping key (the code calls it the timestamp).
    Index 0 is the run with the largest timestamp.
    """

    def __init__(self, path: Path):
        self.path: Path = path
        self.timestamps: List[int] = []
        self.runs: OrderedDict[int, List[Path]] = collections.OrderedDict()
        trace_expr = re.compile(r'trace.*\.(json|xml)')
        for file in self.path.iterdir():
            if file.is_file() and trace_expr.match(file.name) is not None:
                ts = int(file.name.split('.')[6])
                if ts in self.runs:
                    self.runs[ts].append(file)
                else:
                    self.timestamps.append(ts)
                    self.runs[ts] = [file]
        self.timestamps.sort(reverse=True)

    def __getitem__(self, idx: int) -> List[Path]:
        """Return the sorted files of the ``idx``-th run (0 = largest timestamp)."""
        res = self.runs[self.timestamps[idx]]
        res.sort()
        return res

    def __len__(self) -> int:
        """Number of runs found."""
        return len(self.runs)

    def items(self) -> Iterator[List[Path]]:
        """Iterate over the file lists of all runs, largest timestamp first."""
        return (self[index] for index in range(len(self)))


class Summary:
    """Condenses the traces and process results of one test run into a SummaryTree.

    The result tree is available as ``out``; ``ok()`` tells whether the run is
    considered successful.
    """

    def __init__(self, binary: Path, runtime: float = 0, max_rss: int | None = None,
                 was_killed: bool = False, uid: uuid.UUID | None = None, expected_unseed: int | None = None,
                 exit_code: int = 0, valgrind_out_file: Path | None = None, stats: str | None = None,
                 error_out: str | None = None, will_restart: bool = False):
        self.binary = binary
        self.runtime: float = runtime
        self.max_rss: int | None = max_rss
        self.was_killed: bool = was_killed
        self.expected_unseed: int | None = expected_unseed
        self.exit_code: int = exit_code
        self.out: SummaryTree = SummaryTree('Test')
        self.test_begin_found: bool = False
        self.test_end_found: bool = False
        self.unseed: int | None = None
        self.valgrind_out_file: Path | None = valgrind_out_file
        # (event type, original severity) -> severity to report instead
        self.severity_map: OrderedDict[tuple[str, int], int] = collections.OrderedDict()
        self.error: bool = False
        self.errors: int = 0
        self.warnings: int = 0
        self.coverage: OrderedDict[Coverage, bool] = collections.OrderedDict()
        self.test_count: int = 0
        self.tests_passed: int = 0
        self.error_out = error_out
        self.stderr_severity: str = '40'
        self.will_restart: bool = will_restart
        self.test_dir: Path | None = None

        if uid is not None:
            self.out.attributes['TestUID'] = str(uid)
        if stats is not None:
            self.out.attributes['Statistics'] = stats
        self.out.attributes['JoshuaSeed'] = str(config.joshua_seed)
        self.out.attributes['WillRestart'] = '1' if self.will_restart else '0'

        self.handler = ParseHandler(self.out)
        self.register_handlers()

    def summarize_files(self, trace_files: List[Path]):
        """Parse the given trace files in order and finish the summary."""
        assert len(trace_files) > 0
        for f in trace_files:
            self.parse_file(f)
        self.done()

    def summarize(self, trace_dir: Path, command: str):
        """Summarise the newest run found in ``trace_dir``.

        :param trace_dir: directory that holds the trace files
        :param command: command line of the test, reported if no traces exist
        """
        self.test_dir = trace_dir
        trace_files = TraceFiles(trace_dir)
        if len(trace_files) == 0:
            self.error = True
            child = SummaryTree('NoTracesFound')
            child.attributes['Severity'] = '40'
            child.attributes['Path'] = str(trace_dir.absolute())
            child.attributes['Command'] = command
            self.out.append(child)
            return
        self.summarize_files(trace_files[0])
        if config.joshua_dir is not None:
            # imported lazily: only needed when coverage is written to the cluster
            import test_harness.fdb
            test_harness.fdb.write_coverage(config.cluster_file,
                                            test_harness.fdb.str_to_tuple(config.joshua_dir) + ('coverage',),
                                            test_harness.fdb.str_to_tuple(config.joshua_dir) + ('coverage-metadata',),
                                            self.coverage)

    def list_simfdb(self) -> SummaryTree:
        """Return a 'SimFDB' tree listing the entries of ``<test_dir>/simfdb``."""
        res = SummaryTree('SimFDB')
        res.attributes['TestDir'] = str(self.test_dir)
        if self.test_dir is None:
            return res
        simfdb = self.test_dir / Path('simfdb')
        if not simfdb.exists():
            res.attributes['NoSimDir'] = "simfdb doesn't exist"
            return res
        elif not simfdb.is_dir():
            res.attributes['NoSimDir'] = 'simfdb is not a directory'
            return res
        for file in simfdb.iterdir():
            child = SummaryTree('Directory' if file.is_dir() else 'File')
            child.attributes['Name'] = file.name
            res.append(child)
        return res

    def ok(self):
        """True if no error has been recorded for this run."""
        # NOTE(review): done() derives FailReason values such as 'TestDidNotFinish'
        # and 'NoTestsPassed' that can never be chosen while this only looks at
        # self.error - confirm whether those conditions should fail the run.
        return not self.error

    def done(self):
        """Append everything that is known after parsing and set 'Ok'/'FailReason'."""
        if config.print_coverage:
            for k, v in self.coverage.items():
                child = SummaryTree('CodeCoverage')
                child.attributes['File'] = k.file
                child.attributes['Line'] = str(k.line)
                if not v:
                    child.attributes['Covered'] = '0'
                if k.comment is not None and len(k.comment):
                    child.attributes['Comment'] = k.comment
                self.out.append(child)
        if self.warnings > config.max_warnings:
            child = SummaryTree('WarningLimitExceeded')
            child.attributes['Severity'] = '30'
            child.attributes['WarningCount'] = str(self.warnings)
            self.out.append(child)
        if self.errors > config.max_errors:
            child = SummaryTree('ErrorLimitExceeded')
            child.attributes['Severity'] = '40'
            child.attributes['ErrorCount'] = str(self.errors)
            self.out.append(child)
        if self.was_killed:
            child = SummaryTree('ExternalTimeout')
            child.attributes['Severity'] = '40'
            self.out.append(child)
            self.error = True
        if self.max_rss is not None:
            self.out.attributes['PeakMemory'] = str(self.max_rss)
        if self.valgrind_out_file is not None:
            try:
                valgrind_errors = parse_valgrind_output(self.valgrind_out_file)
                for valgrind_error in valgrind_errors:
                    # errors whose kind starts with 'Leak' do not fail the test
                    if valgrind_error.kind.startswith('Leak'):
                        continue
                    self.error = True
                    child = SummaryTree('ValgrindError')
                    child.attributes['Severity'] = '40'
                    child.attributes['What'] = valgrind_error.what.what
                    child.attributes['Backtrace'] = valgrind_error.what.backtrace
                    aux_count = 0
                    for aux in valgrind_error.aux:
                        child.attributes['WhatAux{}'.format(aux_count)] = aux.what
                        child.attributes['BacktraceAux{}'.format(aux_count)] = aux.backtrace
                        aux_count += 1
                    self.out.append(child)
            except Exception as e:
                self.error = True
                child = SummaryTree('ValgrindParseError')
                child.attributes['Severity'] = '40'
                child.attributes['ErrorMessage'] = str(e)
                _, _, exc_traceback = sys.exc_info()
                child.attributes['Trace'] = repr(traceback.format_tb(exc_traceback))
                self.out.append(child)
        if not self.test_end_found:
            child = SummaryTree('TestUnexpectedlyNotFinished')
            child.attributes['Severity'] = '40'
            self.out.append(child)
        if self.error_out is not None and len(self.error_out) > 0:
            lines = self.error_out.splitlines()
            stderr_bytes = 0
            for line in lines:
                if line.endswith("WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!"):
                    # When running ASAN we expect to see this message. Boost coroutine should be using the correct asan annotations so that it shouldn't produce any false positives.
                    continue
                if line.endswith("Warning: unimplemented fcntl command: 1036"):
                    # Valgrind produces this warning when F_SET_RW_HINT is used
                    continue
                if self.stderr_severity == '40':
                    self.error = True
                remaining_bytes = config.max_stderr_bytes - stderr_bytes
                if remaining_bytes > 0:
                    out_err = line[0:remaining_bytes] + ('...' if len(line) > remaining_bytes else '')
                    child = SummaryTree('StdErrOutput')
                    child.attributes['Severity'] = self.stderr_severity
                    child.attributes['Output'] = out_err
                    self.out.append(child)
                stderr_bytes += len(line)
            if stderr_bytes > config.max_stderr_bytes:
                child = SummaryTree('StdErrOutputTruncated')
                child.attributes['Severity'] = self.stderr_severity
                # attribute values are strings; an int here breaks the XML output
                child.attributes['BytesRemaining'] = str(stderr_bytes - config.max_stderr_bytes)
                self.out.append(child)

        self.out.attributes['Ok'] = '1' if self.ok() else '0'
        if not self.ok():
            reason = 'Unknown'
            if self.error:
                reason = 'ProducedErrors'
            elif not self.test_end_found:
                reason = 'TestDidNotFinish'
            elif self.tests_passed == 0:
                reason = 'NoTestsPassed'
            elif self.test_count != self.tests_passed:
                reason = 'Expected {} tests to pass, but only {} did'.format(self.test_count, self.tests_passed)
            self.out.attributes['FailReason'] = reason

    def parse_file(self, file: Path):
        """Parse one trace file; the parser is chosen by the file suffix.

        An unknown suffix is recorded as 'TestHarnessBug' and marks the run as
        failed. An exception raised while parsing is recorded as
        'SummarizationError' and ends the parsing of this file.
        """
        parser: Parser
        if file.suffix == '.json':
            parser = JsonParser()
        elif file.suffix == '.xml':
            parser = XmlParser()
        else:
            child = SummaryTree('TestHarnessBug')
            child.attributes['File'] = __file__
            frame = inspect.currentframe()
            if frame is not None:
                child.attributes['Line'] = str(inspect.getframeinfo(frame).lineno)
            child.attributes['Details'] = 'Unexpected suffix {} for file {}'.format(file.suffix, file.name)
            self.error = True
            self.out.append(child)
            return
        with file.open('r') as f:
            try:
                parser.parse(f, self.handler)
            except Exception as e:
                child = SummaryTree('SummarizationError')
                child.attributes['Severity'] = '40'
                child.attributes['ErrorMessage'] = str(e)
                self.out.append(child)

    def register_handlers(self):
        """Register the callbacks that turn trace events into summary content."""

        def remap_event_severity(attrs):
            # Returns the replacement severity, or None to keep the original one.
            if 'Type' not in attrs or 'Severity' not in attrs:
                return None
            k = (attrs['Type'], int(attrs['Severity']))
            if k in self.severity_map:
                return str(self.severity_map[k])

        self.handler.add_handler(('Severity', None), remap_event_severity)

        def program_start(attrs: Dict[str, str]):
            # only the first ProgramStart event is used
            if self.test_begin_found:
                return
            self.test_begin_found = True
            self.out.attributes['RandomSeed'] = attrs['RandomSeed']
            self.out.attributes['SourceVersion'] = attrs['SourceVersion']
            self.out.attributes['Time'] = attrs['ActualTime']
            self.out.attributes['BuggifyEnabled'] = attrs['BuggifyEnabled']
            self.out.attributes['DeterminismCheck'] = '0' if self.expected_unseed is None else '1'
            if self.binary.name != 'fdbserver':
                self.out.attributes['OldBinary'] = self.binary.name
            if 'FaultInjectionEnabled' in attrs:
                self.out.attributes['FaultInjectionEnabled'] = attrs['FaultInjectionEnabled']

        self.handler.add_handler(('Type', 'ProgramStart'), program_start)

        def set_test_file(attrs: Dict[str, str]):
            test_file = Path(attrs['TestFile'])
            cwd = Path('.').absolute()
            try:
                test_file = test_file.relative_to(cwd)
            except ValueError:
                # not below the working directory: report the path unchanged
                pass
            self.out.attributes['TestFile'] = str(test_file)

        self.handler.add_handler(('Type', 'Simulation'), set_test_file)
        self.handler.add_handler(('Type', 'NonSimulationTest'), set_test_file)

        def set_elapsed_time(attrs: Dict[str, str]):
            # only the first ElapsedTime event is used
            if self.test_end_found:
                return
            self.test_end_found = True
            self.unseed = int(attrs['RandomUnseed'])
            if self.expected_unseed is not None and self.unseed != self.expected_unseed:
                severity = 40 if ('UnseedMismatch', 40) not in self.severity_map \
                    else self.severity_map[('UnseedMismatch', 40)]
                if severity >= 30:
                    child = SummaryTree('UnseedMismatch')
                    child.attributes['Unseed'] = str(self.unseed)
                    child.attributes['ExpectedUnseed'] = str(self.expected_unseed)
                    child.attributes['Severity'] = str(severity)
                    if severity >= 40:
                        self.error = True
                    self.out.append(child)
            self.out.attributes['SimElapsedTime'] = attrs['SimTime']
            self.out.attributes['RealElapsedTime'] = attrs['RealTime']
            if self.unseed is not None:
                self.out.attributes['RandomUnseed'] = str(self.unseed)

        self.handler.add_handler(('Type', 'ElapsedTime'), set_elapsed_time)

        def parse_warning(attrs: Dict[str, str]):
            self.warnings += 1
            if self.warnings > config.max_warnings:
                return
            child = SummaryTree(attrs['Type'])
            for k, v in attrs.items():
                if k != 'Type':
                    child.attributes[k] = v
            self.out.append(child)

        self.handler.add_handler(('Severity', '30'), parse_warning)

        def parse_error(attrs: Dict[str, str]):
            self.errors += 1
            self.error = True
            if self.errors > config.max_errors:
                return
            child = SummaryTree(attrs['Type'])
            for k, v in attrs.items():
                child.attributes[k] = v
            self.out.append(child)

        self.handler.add_handler(('Severity', '40'), parse_error)

        def coverage(attrs: Dict[str, str]):
            covered = True
            if 'Covered' in attrs:
                covered = int(attrs['Covered']) != 0
            comment = ''
            if 'Comment' in attrs:
                comment = attrs['Comment']
            c = Coverage(attrs['File'], attrs['Line'], comment)
            # a covered report always wins over an uncovered one
            if covered or c not in self.coverage:
                self.coverage[c] = covered

        self.handler.add_handler(('Type', 'CodeCoverage'), coverage)

        def expected_test_pass(attrs: Dict[str, str]):
            self.test_count = int(attrs['Count'])

        self.handler.add_handler(('Type', 'TestsExpectedToPass'), expected_test_pass)

        def test_passed(attrs: Dict[str, str]):
            if attrs['Passed'] == '1':
                self.tests_passed += 1

        self.handler.add_handler(('Type', 'TestResults'), test_passed)

        def register_severity_remap(attrs: Dict[str, str]):
            # named differently from remap_event_severity above, which applies
            # the mapping that is recorded here
            self.severity_map[(attrs['TargetEvent'], int(attrs['OriginalSeverity']))] = int(attrs['NewSeverity'])

        self.handler.add_handler(('Type', 'RemapEventSeverity'), register_severity_remap)

        def buggify_section(attrs: Dict[str, str]):
            if attrs['Type'] == 'FaultInjected' or attrs.get('Activated', '0') == '1':
                child = SummaryTree(attrs['Type'])
                child.attributes['File'] = attrs['File']
                child.attributes['Line'] = attrs['Line']
                self.out.append(child)

        self.handler.add_handler(('Type', 'BuggifySection'), buggify_section)
        self.handler.add_handler(('Type', 'FaultInjected'), buggify_section)

        def running_unit_test(attrs: Dict[str, str]):
            child = SummaryTree('RunningUnitTest')
            child.attributes['Name'] = attrs['Name']
            child.attributes['File'] = attrs['File']
            child.attributes['Line'] = attrs['Line']
            # without this the element is built and then thrown away
            self.out.append(child)

        self.handler.add_handler(('Type', 'RunningUnitTest'), running_unit_test)

        def stderr_severity(attrs: Dict[str, str]):
            if 'NewSeverity' in attrs:
                self.stderr_severity = attrs['NewSeverity']

        self.handler.add_handler(('Type', 'StderrSeverity'), stderr_severity)
def dirs_with_files_matching(path: Path, pattern: Pattern, recurse: bool = True) -> List[Path]:
    """Collect the directories that directly contain a file matching ``pattern``.

    :param path: directory to inspect
    :param pattern: compiled expression matched against each file name
    :param recurse: also inspect all sub-directories
    :return: sorted list of directories; ``path`` itself is included if it
             holds a matching file
    """
    entries = list(path.iterdir())
    holds_match = any(entry.is_file() and pattern.match(entry.name) is not None for entry in entries)
    found: List[Path] = [path] if holds_match else []
    if recurse:
        for entry in entries:
            if entry.is_dir():
                found.extend(dirs_with_files_matching(entry, pattern, recurse=True))
    return sorted(found)
class ValgrindWhat:
    """A description text together with the backtrace that belongs to it."""

    def __init__(self):
        self.what: str = ''
        self.backtrace: str = ''


class ValgrindError:
    """One ``error`` element: its kind, main description and auxiliary ones."""

    def __init__(self):
        self.what: ValgrindWhat = ValgrindWhat()
        self.kind: str = ''
        self.aux: List[ValgrindWhat] = []


# noinspection PyArgumentList
class ValgrindParseState(enum.Enum):
    """Positions inside the document that the handler distinguishes."""
    ROOT = enum.auto()
    ERROR = enum.auto()
    ERROR_AUX = enum.auto()
    KIND = enum.auto()
    WHAT = enum.auto()
    TRACE = enum.auto()
    AUX_WHAT = enum.auto()
    STACK = enum.auto()
    STACK_AUX = enum.auto()
    STACK_IP = enum.auto()
    STACK_IP_AUX = enum.auto()


class ValgrindHandler(xml.sax.handler.ContentHandler):
    """SAX handler that collects the ``error`` elements of a valgrind XML file.

    Finished errors are appended to ``result``. Each backtrace is stored as an
    ``addr2line`` command line followed by the instruction pointers.
    """

    # start of every backtrace; the instruction pointers are appended to it
    _BACKTRACE_COMMAND = 'addr2line -e fdbserver.debug -p -C -f -i '

    def __init__(self):
        super().__init__()
        self.stack: List[ValgrindError] = []
        self.result: List[ValgrindError] = []
        self.state_stack: List[ValgrindParseState] = []

    def state(self) -> ValgrindParseState:
        """Current parse state; ROOT while no state has been pushed."""
        return self.state_stack[-1] if self.state_stack else ValgrindParseState.ROOT

    @staticmethod
    def from_content(content):
        """Return ``content`` as str, decoding it first if it is bytes."""
        if isinstance(content, bytes):
            return content.decode()
        assert isinstance(content, str)
        return content

    def characters(self, content):
        """Append text to the field that corresponds to the current state."""
        if not self.state_stack:
            return
        assert len(self.stack) > 0
        current = self.state_stack[-1]
        error = self.stack[-1]
        if current is ValgrindParseState.KIND:
            error.kind += self.from_content(content)
        elif current is ValgrindParseState.WHAT:
            error.what.what += self.from_content(content)
        elif current is ValgrindParseState.AUX_WHAT:
            error.aux[-1].what += self.from_content(content)
        elif current is ValgrindParseState.STACK_IP:
            error.what.backtrace += self.from_content(content)
        elif current is ValgrindParseState.STACK_IP_AUX:
            error.aux[-1].backtrace += self.from_content(content)

    def startElement(self, name, attrs):
        """Push the state for ``name``; elements outside an error are ignored."""
        if name == 'error':
            self.stack.append(ValgrindError())
            self.state_stack.append(ValgrindParseState.ERROR)
        if not self.stack:
            return
        error_states = [ValgrindParseState.ERROR, ValgrindParseState.ERROR_AUX]
        stack_states = [ValgrindParseState.STACK, ValgrindParseState.STACK_AUX]
        if name == 'kind':
            self.state_stack.append(ValgrindParseState.KIND)
        elif name == 'what':
            self.state_stack.append(ValgrindParseState.WHAT)
        elif name == 'auxwhat':
            assert self.state() in error_states
            # from here on the error is in its auxiliary part
            self.state_stack.pop()
            self.state_stack.append(ValgrindParseState.ERROR_AUX)
            self.state_stack.append(ValgrindParseState.AUX_WHAT)
            self.stack[-1].aux.append(ValgrindWhat())
        elif name == 'stack':
            enclosing = self.state()
            assert enclosing in error_states
            in_main_part = enclosing == ValgrindParseState.ERROR
            self.state_stack.append(ValgrindParseState.STACK if in_main_part else ValgrindParseState.STACK_AUX)
        elif name == 'ip':
            enclosing = self.state()
            assert enclosing in stack_states
            if enclosing == ValgrindParseState.STACK:
                self.state_stack.append(ValgrindParseState.STACK_IP)
                target = self.stack[-1].what
            else:
                self.state_stack.append(ValgrindParseState.STACK_IP_AUX)
                target = self.stack[-1].aux[-1]
            # the first address follows the command, later ones a separating blank
            if len(target.backtrace) == 0:
                target.backtrace = self._BACKTRACE_COMMAND
            else:
                target.backtrace += ' '

    def endElement(self, name):
        """Pop the state that was pushed for ``name``."""
        stack_states = [ValgrindParseState.STACK, ValgrindParseState.STACK_AUX]
        if name == 'error':
            self.result.append(self.stack.pop())
            self.state_stack.pop()
        elif name == 'kind':
            assert self.state() == ValgrindParseState.KIND
            self.state_stack.pop()
        elif name == 'what':
            assert self.state() == ValgrindParseState.WHAT
            self.state_stack.pop()
        elif name == 'auxwhat':
            assert self.state() == ValgrindParseState.AUX_WHAT
            self.state_stack.pop()
        elif name == 'stack':
            assert self.state() in stack_states
            self.state_stack.pop()
        elif name == 'ip':
            self.state_stack.pop()
            assert self.state() in stack_states


def parse_valgrind_output(valgrind_out_file: Path) -> List[ValgrindError]:
    """Parse a valgrind XML file and return the errors it contains."""
    collector = ValgrindHandler()
    with valgrind_out_file.open('r') as stream:
        xml.sax.parse(stream, collector)
    return collector.result


@total_ordering
class Version:
    """A ``major.minor.patch`` version that compares by its numeric parts."""

    def __init__(self):
        self.major: int = 0
        self.minor: int = 0
        self.patch: int = 0

    def version_tuple(self):
        """Return ``(major, minor, patch)``."""
        return self.major, self.minor, self.patch

    def _compare(self, other) -> int:
        """Return -1, 0 or 1; a non-Version ``other`` is parsed from ``str(other)``."""
        mine: Tuple[int, int, int] = self.version_tuple()
        theirs: Tuple[int, int, int]
        if isinstance(other, Version):
            theirs = other.version_tuple()
        else:
            theirs = Version.parse(str(other)).version_tuple()
        if mine < theirs:
            return -1
        if mine > theirs:
            return 1
        return 0

    def __eq__(self, other) -> bool:
        return self._compare(other) == 0

    def __lt__(self, other) -> bool:
        return self._compare(other) < 0

    def __hash__(self):
        return hash(self.version_tuple())

    def __str__(self):
        return '{}.{}.{}'.format(self.major, self.minor, self.patch)

    @staticmethod
    def of_binary(binary: Path):
        """Version taken from a binary named ``<name>-<version>``.

        Any name that does not consist of exactly two dash-separated parts
        yields ``max_version()``.
        """
        pieces = binary.name.split('-')
        if len(pieces) == 2:
            return Version.parse(pieces[1])
        return Version.max_version()

    @staticmethod
    def parse(version: str):
        """Build a Version from up to three dot-separated integers.

        Missing parts stay 0 and parts after the third are ignored.
        """
        parsed = Version()
        for field, text in zip(('major', 'minor', 'patch'), version.split('.')):
            setattr(parsed, field, int(text))
        return parsed

    @staticmethod
    def max_version():
        """Version whose three parts are all ``2**32 - 1``."""
        largest = Version()
        largest.major = largest.minor = largest.patch = 2**32 - 1
        return largest
--git a/contrib/observability_splunk_dashboard/details.xml b/contrib/observability_splunk_dashboard/details.xml new file mode 100644 index 0000000000..70ff15883b --- /dev/null +++ b/contrib/observability_splunk_dashboard/details.xml @@ -0,0 +1,431 @@ +
+ + Details for FoundationDB Cluster +
+ + + * + + + + * + + + + + -60m@m + now + + + + + Default + 5 seconds + 1 minute + 10 minutes + 1 hour + 1 day + bins=100 + bins=100 + + + + All + Storage Server + Transaction Log + Proxy + Resolver + Master + Cluster Controller + Log Router + Data Distributor + Ratekeeper + Tester + + + + + * + + + + * + +
+ + + + Storage Queue Size + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesInput "(?<InputRate>.*) (?<InputRoughness>.*) (?<InputCounter>.*)" | rex field=BytesDurable "(?<DurableRate>.*) (?<DurableRoughness>.*) (?<DurableCounter>.*)" | eval QueueSize=InputCounter-DurableCounter | timechart $Span$ avg(QueueSize) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Storage Input Rate + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesInput "(?<InputRate>.*) (?<InputRoughness>.*) (?<InputCounter>.*)" | timechart $Span$ avg(InputRate) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Storage Bytes Queried + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | rex field=BytesQueried "(?<Rate>.*) (?<Roughness>.*) (?<Counter>.*)" | timechart $Span$ avg(Rate) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + Average Process CPU by Role (capped at 2; beware kernel bug) + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval Cpu=CPUSeconds/Elapsed | timechart $Span$ avg(Cpu) by Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + Max Process CPU by Role (capped at 2; beware kernel bug) + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval Cpu=CPUSeconds/Elapsed | timechart $Span$ max(Cpu) by Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + Disk Busyness + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=ProcessMetrics TrackLatestType=Original | eval DiskBusyPercentage=(Elapsed-DiskIdleSeconds)/Elapsed | timechart $Span$ avg(DiskBusyPercentage) by Machine + 
$TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + Max Run Loop Busyness by Role (for <=6.1, S2Pri1) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=NetworkMetrics NOT TrackLatestType=Rolled | eval Busyness=if(isnull(PriorityStarvedBelow1), if(isnull(PriorityBusy1), S2Pri1, PriorityBusy1/Elapsed), PriorityStarvedBelow1/Elapsed) | timechart $Span$ max(Busyness) by Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Max Run Loop Busyness by Priority (6.2+ only) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=NetworkMetrics TrackLatestType=Original | foreach PriorityBusy* [eval Busyness<<MATCHSTR>>=PriorityBusy<<MATCHSTR>>/Elapsed] | timechart $Span$ max(Busyness*) + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + TLog Queue Size + + index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval QueueSize=SharedBytesInput-SharedBytesDurable | timechart $Span$ avg(QueueSize) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + Connection Timeouts (counted on both sides of connection) + + index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut) $Roles$ host=$Host$ | eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr) | rex field=WithAddr "(?<OtherAddr>[^:]*:[^:]*).*" | eval Machine=Machine+","+OtherAddr | makemv delim="," Machine | search Machine=$Machine$ | eval Count=1+SuppressedEventCount | timechart sum(Count) by Machine useother=f + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + Pairwise Connection Timeouts Between Datacenters + + index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut) host=* Machine=* NOT TrackLatestType=Rolled +| eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr) +| rex field=host "(?<Datacenter>..).*" +| eval Datacenter=if(isnotnull(pie_work_unit), 
pie_work_unit, Datacenter) +| rex field=WithAddr "(?<OtherIP>[^:]*):.*" +| join OtherIP + [search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics NOT TrackLatestType=Rolled + | rex field=Machine "(?<OtherIP>[^:]*):.*" + | rex field=host "(?<OtherDatacenter>..).*" + | eval OtherDatacenter=if(isnotnull(pie_work_unit), pie_work_unit, OtherDatacenter)] +| eval DC1=if(Datacenter>OtherDatacenter, Datacenter, OtherDatacenter), DC2=if(Datacenter>OtherDatacenter, OtherDatacenter, Datacenter) +| eval Connection=DC1+" <-> " + DC2 +| eval Count=1+SuppressedEventCount +| timechart count by Connection + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Pairwise Connection Timeouts Between Known Server Processes (Sorted by Count, descending) + + index=$Index$ LogGroup=$LogGroup$ (Type=ConnectionTimeout OR Type=ConnectionTimedOut OR Type=ProcessMetrics) $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | eval WithAddr=if(Type=="ConnectionTimedOut", PeerAddr, WithAddr), Reason=if(Type=="ConnectionTimedOut", "Timed out trying to connect", "Established connection timed out") | rex field=Machine "(?<IP>[^:]*):.*" | rex field=host "(?<Datacenter>..).*" | rex field=WithAddr "(?<OtherIP>[^:]*):.*" | eventstats values(Roles) as Roles by IP | join OtherIP [search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics NOT TrackLatestType=Rolled | rex field=Machine "(?<OtherIP>[^:]*):.*" | rex field=host "(?<OtherDatacenter>..).*" | stats values(Roles) as OtherRoles by OtherIP, OtherDatacenter | eval OtherRoles="("+mvjoin(OtherRoles,",")+")"] | eval Roles="("+mvjoin(Roles,",")+")" | eval IP=Datacenter+": "+IP+" "+Roles, OtherIP=OtherDatacenter+": "+OtherIP+" "+OtherRoles | eval Addr1=if(IP>OtherIP, IP, OtherIP), Addr2=if(IP>OtherIP, OtherIP, IP) | eval Connection=Addr1+" <-> " + Addr2 | eval Count=1+SuppressedEventCount | stats sum(Count) as Count, values(Reason) as Reasons by Connection | sort -Count + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+
+ + + + Lazy Deletion Rate (making space available for reuse) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=SpringCleaningMetrics | eval Metric=LazyDeletePages | streamstats current=f global=f window=1 first(Metric) as NextMetric, first(Time) as NextTime by ID | eval Rate=4096*(NextMetric-Metric)/(NextTime-Time) | timechart $Span$ avg(Rate) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Vacuuming Rate (shrinking file) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ Type=SpringCleaningMetrics | eval Metric=VacuumedPages | streamstats current=f global=f window=1 first(Metric) as NextMetric, first(Time) as NextTime by ID | eval Rate=4096*(NextMetric-Metric)/(NextTime-Time) | timechart $Span$ avg(Rate) by Machine + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + Roles + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | makemv delim="," Roles | mvexpand Roles | timechart $Span$ distinct_count(Machine) by Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + + Slow Tasks (Sorted by Duration, Descending) + + index=$Index$ LogGroup=$LogGroup$ Type=SlowTask $Roles$ host=$Host$ Machine=$Machine$ | sort -Duration | table _time, Duration, Machine, TaskID, Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + +
+
+ + + Event Counts (Sorted by Severity and Count, Descending) + + index=$Index$ LogGroup=$LogGroup$ $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | stats count as Count by Type, Severity | sort -Severity, -Count + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+ + + Errors + + index=$Index$ LogGroup=$LogGroup$ Severity=40 $Roles$ host=$Host$ Machine=$Machine$ NOT TrackLatestType=Rolled | table _time, Type, Machine, Roles + $TimeRange.earliest$ + $TimeRange.latest$ + + + +
+
+
+ + + + Recoveries (Ignores Filters) + + index=$Index$ LogGroup=$LogGroup$ Type=MasterRecoveryState TrackLatestType=Original (StatusCode=0 OR StatusCode=11) | eval RecoveryResetInterval=10 | sort _time | streamstats earliest(_time) as RecoveryStart, count as EventCount reset_after="(StatusCode=11)" | where StatusCode=11 | eval EventCount=if(EventCount==1, 2, EventCount), RecoveryStart=if(RecoveryStart==_time, _time-RecoveryDuration, RecoveryStart) | sort -_time | streamstats current=f global=f window=1 first(RecoveryStart) as NextRecoveryStart | eval RecoverySpan=NextRecoveryStart-_time, FailedRecoveries=EventCount-2, SuccessfulRecoveries=1 | eval AvailableSeconds=if(RecoverySpan<RecoveryResetInterval, RecoverySpan, 0) | sort _time | streamstats earliest(RecoveryStart) as RecoveryStart, sum(FailedRecoveries) as FailedRecoveryCount, sum(SuccessfulRecoveries) as SuccessfulRecoveryCount, sum(AvailableSeconds) as AvailableSeconds reset_after="(NOT RecoverySpan < RecoveryResetInterval)" | where NOT RecoverySpan < RecoveryResetInterval | eval Duration=_time-RecoveryStart, StartTime=strftime(RecoveryStart, "%F %X.%Q"), ShortLivedRecoveryCount=SuccessfulRecoveryCount-1 | table StartTime, Duration, FailedRecoveryCount, ShortLivedRecoveryCount, AvailableSeconds | sort -StartTime + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+ + + Process (Re)starts + + index=$Index$ LogGroup=$LogGroup$ Type=ProgramStart TrackLatestType=Original $Roles$ host=$Host$ Machine=$Machine$ | table _time, Machine | sort -_time + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+ + + Failure Detection (Machine Filter Only) + + index=$Index$ LogGroup=$LogGroup$ Type=FailureDetectionStatus System=$Machine$ | sort _time | eval Failed=if(Status=="Failed", 1, 0) | streamstats current=t global=f window=2 first(Failed) as PrevFailed by System | where PrevFailed=1 OR Failed=1 | eval Failed=PrevFailed + "," + Failed | makemv delim="," Failed | mvexpand Failed | timechart $Span$ max(Failed) by System + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + +
+ + + + Storage Server Space Usage (Sorted by Available Space Percentage, Ascending) + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval AvailableSpacePercent=KvstoreBytesAvailable/KvstoreBytesTotal, FreeSpacePercent=KvstoreBytesFree/KvstoreBytesTotal, GBUsed=KvstoreBytesUsed/1e9, GBStored=BytesStored/1e9, Overhead=KvstoreBytesUsed/BytesStored, GBTotalSpace=KvstoreBytesTotal/1e9 | stats latest(AvailableSpacePercent) as AvailableSpacePercent, latest(FreeSpacePercent) as FreeSpacePercent, latest(GBStored) as GBStored, latest(GBUsed) as GBUsed, latest(Overhead) as OverheadFactor, latest(GBTotalSpace) as GBTotalSpace by Machine | sort AvailableSpacePercent + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+
+ + + + TLog Server Space Usage (Sorted by Available Space Percentage, Ascending) + + index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics host=* Machine=* TrackLatestType=Original Roles=TL | eval AvailableSpacePercent=KvstoreBytesAvailable/KvstoreBytesTotal, FreeDiskSpacePercent=KvstoreBytesFree/KvstoreBytesTotal, GBUsed=KvstoreBytesUsed/1e9, GBTotalSpace=KvstoreBytesTotal/1e9 | stats latest(AvailableSpacePercent) as AvailableSpacePercent, latest(FreeDiskSpacePercent) as FreeDiskSpacePercent, latest(GBUsed) as GBUsed, latest(GBTotalSpace) as GBTotalSpace by Machine | sort AvailableSpacePercent + $TimeRange.earliest$ + $TimeRange.latest$ + + + + +
+
+
+ + + + Data Movement by Type (Log Scale, Ignores Filters) + + index=$Index$ LogGroup=$LogGroup$ Type=MovingData TrackLatestType=Original | timechart avg(Priority*) as * + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + Storage Server Max Bytes Stored by Host + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics $Roles$ host=$Host$ Machine=$Machine$ TrackLatestType=Original | eval GBStored=BytesStored/1e9 | timechart max(GBStored) by host limit=100 + $TimeRange.earliest$ + $TimeRange.latest$ + + + + + + + + + + + Master Failed Clients + + index=$Index$ LogGroup=$LogGroup$ Type=WaitFailureClient +| stats count by FailedEndpoint + $TimeRange.earliest$ + $TimeRange.latest$ + + +
+
+
+
\ No newline at end of file diff --git a/contrib/observability_splunk_dashboard/performance_overview.xml b/contrib/observability_splunk_dashboard/performance_overview.xml new file mode 100644 index 0000000000..0719e2bbab --- /dev/null +++ b/contrib/observability_splunk_dashboard/performance_overview.xml @@ -0,0 +1,323 @@ +
+ +
+ + + * + + + + + + + + + -60m@m + now + + + + + Normal + Batch + + + + + 60s + +
+ + + Transaction Rate measured on Proxies + + Sum in $ChartBinSizeToken$ seconds + + index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| makemv delim=" " TxnRequestIn | makemv delim=" " TxnRequestOut | makemv delim=" " TxnStartIn | makemv delim=" " TxnStartOut | makemv delim=" " TxnThrottled +| eval TxnRequestInRate=mvindex(TxnRequestIn, 0), TxnRequestOutRate=mvindex(TxnRequestOut, 0), TxnStartInRate=mvindex(TxnStartIn, 0), TxnStartOutRate=mvindex(TxnStartOut, 0), TxnThrottledRate=mvindex(TxnThrottled, 0) +| timechart span=$ChartBinSizeToken$ sum(TxnRequestInRate) as StartedTxnBatchRate, sum(TxnRequestOutRate) as FinishedTxnBatchRate, sum(TxnStartInRate) as StartedTxnRate, sum(TxnStartOutRate) as FinishedTxnRate, sum(TxnThrottledRate) as ThrottledTxnRate + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Read Rate measured on Storage Servers + + Average in $ChartBinSizeToken$ seconds + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original" +| rex field=BytesQueried "(?<RRate>.*) (?<RRoughness>.*) (?<RCounter>.*)" +| rex field=RowsQueried "(?<KRate>.*) (?<KRoughness>.*) (?<KCounter>.*)" +| rex field=BytesInput "(?<WRate>.*) (?<WRoughness>.*) (?<WCounter>.*)" +| rex field=BytesFetched "(?<FRate>.*) (?<FRoughness>.*) (?<FCounter>.*)" +| timechart span=$ChartBinSizeToken$ avg(RRate) as BytesReadPerSecond, avg(KRate) as RowsReadPerSecond, avg(FRate) as DDReadPerSecond + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Write Rate measured on Proxies + + 1min Average + + index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| makemv delim=" " MutationBytes +| makemv delim=" " Mutations +| eval MutationBytesRate=mvindex(MutationBytes, 0), MutationsRate=mvindex(Mutations,0) +| bucket span=5s _time +| stats sum(MutationBytesRate) as MutationBytes, sum(MutationsRate) as 
Mutations by _time +|eval MutationMB=MutationBytes/1024/1024, MutationsK=Mutations/1000 +| timechart span=$ChartBinSizeToken$ avg(MutationMB) as MutationMB, avg(MutationsK) as MutationsK + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Write Rate measured on Storage Servers + + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original" +| rex field=BytesInput "(?<WRate>.*) (?<WRoughness>.*) (?<WCounter>.*)" +| rex field=BytesFetched "(?<FRate>.*) (?<FRoughness>.*) (?<FCounter>.*)" +| timechart span=$ChartBinSizeToken$ avg(WRate) as BytesPerSecond, avg(FRate) as DDBytesWrittenPerSecond + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + GRV Latency measured on all Proxies + + Seconds + + index=$Index$ LogGroup=$LogGroup$ Type=GRVLatencyMetrics AND TrackLatestType="Original" +| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Commit Latency measured on all Proxies + + Seconds + + index=$Index$ LogGroup=$LogGroup$ Type=CommitLatencyMetrics AND TrackLatestType="Original" +| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Read Latency measured on all Storage Servers + + Seconds + + index=$Index$ LogGroup=$LogGroup$ Type=ReadLatencyMetrics AND TrackLatestType="Original" +| timechart span=$ChartBinSizeToken$ avg(Max) as maxLatency, avg(Mean) as meanLatency, avg(P99) as P99Latency, avg(P99.9) as P999Latency, avg(P95) as P95Latency + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + RateKeeper: ReleasedTPS vs LimitTPS + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| eval _time=Time +| 
table _time ReleasedTPS TPSLimit +| timechart span=$ChartBinSizeToken$ avg(ReleasedTPS) avg(TPSLimit) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + RateKeeper: Throttling Reason + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| eval _time=Time +| table _time Reason + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + + + RateKeeper: Throttling Server + + Ratekeeper: Limit Reason: ReasonServerID (Most recent 10 records) + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate AND TrackLatestType="Original" +| streamstats count as numOfEvents +| where numOfEvents < 10 +| eval DateTime=strftime(Time, "%Y-%m-%dT%H:%M:%S") +| table DateTime, ReasonServerID + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + +
+
+
+ + + Disk Overhead = Disk Usage / Logical KV Size + + Y-axis is capped at 10 + + index=$Index$ LogGroup=$LogGroup$ host=* Machine=* (Type=StorageMetrics OR Type=DDTrackerStats) TrackLatestType=Original +| bucket _time span=5s +| stats sum(KvstoreBytesUsed) as StorageDiskUsedBytes, sum(KvstoreBytesTotal) as StorageDiskTotalBytes, avg(TotalSizeBytes) as LogicalKVBytes by _time +| eval overhead=StorageDiskUsedBytes/LogicalKVBytes +| timechart avg(overhead) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + KV Data Size + + + index=$Index$ LogGroup=$LogGroup$ +Roles=*DD* host=* Machine=* Type=DDTrackerStats TrackLatestType=Original +| eval TotalKVGB=TotalSizeBytes/1024/1024/1024, SystemKVGB=SystemSizeBytes/1024/1024/1024 +|timechart avg(TotalKVGB), avg(SystemKVGB), avg(Shards) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Disk Usage + + + index=$Index$ LogGroup=$LogGroup$ host=* Machine=* Type=StorageMetrics TrackLatestType=Original +| bucket _time span=5s +| stats sum(KvstoreBytesUsed) as StorageDiskUsedBytes, sum(KvstoreBytesTotal) as StorageDiskTotalBytes by _time +|eval StorageDiskTotalMB = StorageDiskTotalBytes/1024/1024, StorageDiskUsedMB=StorageDiskUsedBytes/1024/1024 +| timechart avg(StorageDiskTotalMB) as StorageDiskTotalMB, avg(StorageDiskUsedMB) as StorageDiskUsedMB + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Cluster Roles + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics TrackLatestType="Original" +| rex field=host "(?<HostDC>..).*-..(?<HostConfig>..).*" +| eval HostDC=if(isnotnull(pie_work_unit), pie_work_unit, HostDC) +| makemv delim="," Roles +| stats dc(Machine) as MachineCount by Roles, HostDC +| stats list(HostDC), list(MachineCount) by Roles +| sort Roles + $TimeSpan.earliest$ + $TimeSpan.latest$ + + +
+
+
+ + + Storage Engine + + + index=$Index$ LogGroup=$LogGroup$ Type=Role Origination=Recruited As=StorageServer | table StorageEngine, OriginalDateTime, DateTime |head 2 + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + +
+
+ + Cluster Generations + + Indicate FDB recoveries + + index=$Index$ LogGroup=$LogGroup$ Type=TLogMetrics |timechart max(Generation) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + +
+
\ No newline at end of file diff --git a/contrib/observability_splunk_dashboard/ratekeeper.xml b/contrib/observability_splunk_dashboard/ratekeeper.xml new file mode 100644 index 0000000000..c4a31a8fbc --- /dev/null +++ b/contrib/observability_splunk_dashboard/ratekeeper.xml @@ -0,0 +1,928 @@ +
+ +
+ + + * + + + + + + + + + -60m@m + now + + + + + Normal + Batch + + + + + 30s + + + + Yes + No + + + + + MasterServer + MasterProxyServer + StorageServer + TLog + Resolver + GrvProxyServer + CommitProxyServer + + + + MasterServer + MasterProxyServer + Resolver + TLog + StorageServer + GrvProxyServer + CommitProxyServer + + + + MasterServer + MasterProxyServer + Resolver + TLog + StorageServer + GrvProxyServer + CommitProxyServer + +
+ + + Aggregated Storage Server Bandwidth + + + index=$Index$ LogGroup=$LogGroup$ Type=StorageMetrics TrackLatestType="Original" + | rex field=BytesQueried "(?<RRate>.*) (?<RRoughness>.*) (?<RCounter>.*)" + | rex field=BytesInput "(?<WRate>.*) (?<WRoughness>.*) (?<WCounter>.*)" + | rex field=BytesFetched "(?<FRate>.*) (?<FRoughness>.*) (?<FCounter>.*)" + | bin span=5s _time + | stats sum(RRate) as ReadSum, sum(WRate) as WriteSum, sum(FRate) as FetchedKeyRate by _time + | eval ReadSpeedMB=ReadSum/1024/1024, WriteSpeedMB=WriteSum/1024/1024, FetchedKeyRateMB=FetchedKeyRate/1024/1024 + |timechart avg(ReadSpeedMB), avg(WriteSpeedMB), avg(FetchedKeyRateMB) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Aggregated Proxy Bandwidth + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| makemv delim=" " TxnRequestIn | makemv delim=" " TxnRequestOut | makemv delim=" " TxnStartIn | makemv delim=" " TxnStartOut | makemv delim=" " MutationBytes +| eval TxnRequestInRate=mvindex(TxnRequestIn, 0), TxnRequestOutRate=mvindex(TxnRequestOut, 0), TxnStartInRate=mvindex(TxnStartIn, 0), TxnStartOutRate=mvindex(TxnStartOut, 0), MutationBytesRate=mvindex(MutationBytes, 0) +| bin span=60s _time +| stats avg(TxnRequestInRate) as TxnRequestInRatePerHost, avg(TxnRequestOutRate) as TxnRequestOutRatePerHost, avg(TxnStartInRate) as TxnStartInRatePerHost, avg(TxnStartOutRate) as TxnStartOutRatePerHost, avg(MutationBytesRate) as MutationBytesRatePerHost by Machine,_time +| eval WriteThroughputKB=sum(MutationBytesRatePerHost)/1000 +| timechart span=1m sum(TxnRequestInRatePerHost), sum(TxnRequestOutRatePerHost), sum(TxnStartInRatePerHost), sum(TxnStartOutRatePerHost), sum(WriteThroughputKB) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 1: Overview - GRV Arrivals and Leaves per Second Seen by Proxies + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND 
TrackLatestType="Original" +| eval TxnRequestIn=mvindex(TxnRequestIn, 0), TxnRequestOut=mvindex(TxnRequestOut, 0), TxnStartIn=mvindex(TxnStartIn, 0), TxnStartOut=mvindex(TxnStartOut, 0) +| timechart span=30s avg(TxnRequestIn) avg(TxnRequestOut) avg(TxnStartIn) avg(TxnStartOut) by Machine + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + + Chart 2: RKOverview - Input ReleasedTPS and Output TPSLimit + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| eval _time=Time +| table _time ReleasedTPS TPSLimit +| timechart span=$ChartBinSizeToken$ avg(ReleasedTPS) avg(TPSLimit) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 3: RKOverview - RKLimitReason + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| eval _time=Time +| table _time Reason + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + + + + Chart 4: Don't Process Transactions - RkSSListFetchTimeout (TpsLimit = 0) + + + index=$Index$ LogGroup=$LogGroup$ +Type="RkSSListFetchTimeout" +| timechart span=1s count + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 5: Don't Process Transactions - RkTlogMinFreeSpaceZero (TpsLimit = 0) + + + index=$Index$ LogGroup=$LogGroup$ +Type="RkTlogMinFreeSpaceZero" +| timechart span=1s count + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 6: Don't Process Transactions - ProxyGRVThresholdExceeded + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyGRVThresholdExceeded*") AND TrackLatestType="Original" +| timechart span=1s count by Type + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 7: RKLimitReasonCandidate - LimitingStorageServerDurabilityLag (MVCCVersionInMemory) + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| timechart 
span=$ChartBinSizeToken$ avg(LimitingStorageServerDurabilityLag) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 8: RKLimitReasonCandidate - LimitingStorageServerVersionLag (TLogVer-SSVer) + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| timechart span=$ChartBinSizeToken$ avg(LimitingStorageServerVersionLag) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 9: RKLimitReasonCandidate - LimitingStorageServerQueue + + + index=$Index$ LogGroup=$LogGroup$ Type=RkUpdate$UpdateRateTypeToken$ AND TrackLatestType="Original" +| replace inf with 100000000000 +| timechart span=$ChartBinSizeToken$ avg(LimitingStorageServerQueue) + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 10: Runtime Monitoring - StorageServer MVCCVersionInMemory (storage_server_durability_lag) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| eval NonDurableVersions=Version-DurableVersion +| timechart span=$ChartBinSizeToken$ limit=0 avg(NonDurableVersions) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 11: Runtime Monitoring - StorageServer LocalRate (higher MVCCVersionInMemory -> lower LocalRate) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" +| timechart limit=0 avg(LocalRate) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 12: Runtime Monitoring - StorageServer ReadsRejected (lower LocalRate -> higher probability of rejecting read)) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" +| timechart limit=0 avg(ReadsRejected) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 13: Runtime Monitoring - Version Lag between StorageServer and Tlog (storage_server_readable_behind) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| eval 
SSFallBehindVersions=VersionLag +| timechart span=$ChartBinSizeToken$ limit=0 avg(SSFallBehindVersions) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 14: Runtime Monitoring - StorageServerBytes (storage_server_write_queue_size) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| makemv delim=" " BytesInput | makemv delim=" " BytesDurable | makemv delim=" " BytesFetched | makemv delim=" " MutationBytes +| eval BytesInput=mvindex(BytesInput, 2), BytesDurable=mvindex(BytesDurable, 2), BytesFetched=mvindex(BytesFetched, 2), MutationBytes=mvindex(MutationBytes, 2), BytesInMemoryQueue=BytesInput-BytesDurable +| timechart span=$ChartBinSizeToken$ limit=0 avg(BytesInMemoryQueue) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 15: Runtime Monitoring - StorageServer KVStore Free Space Ratio (storage_server_min_free_space) + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| eval KvstoreBytesFreeRatio=KvstoreBytesFree/KvstoreBytesTotal +| timechart span=$ChartBinSizeToken$ limit=0 avg(KvstoreBytesFreeRatio) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 16: Runtime Monitoring - TLog Queue Free Space Ratio (log_server_min_free_space) + + + index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original" +| eval QueueBytesFreeRatio=QueueDiskBytesFree/QueueDiskBytesTotal +| timechart span=$ChartBinSizeToken$ limit=0 avg(QueueBytesFreeRatio) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 17: Runtime Monitoring - TLog KVStore Free Space Ratio (log_server_min_free_space) + + + index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original" +| eval KvstoreBytesFreeRatio=KvstoreBytesFree/KvstoreBytesTotal +| timechart span=$ChartBinSizeToken$ limit=0 avg(KvstoreBytesFreeRatio) $ChartByMachineToken$ 
+ $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 18: Runtime Monitoring - TLogBytes (log_server_write_queue) + + + index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original" +| makemv delim=" " BytesInput +| makemv delim=" " BytesDurable +| eval BytesInput=mvindex(BytesInput, 2), BytesDurable=mvindex(BytesDurable, 2), BytesInMemoryQueue=BytesInput-BytesDurable | timechart span=$ChartBinSizeToken$ limit=0 avg(BytesInMemoryQueue) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 19: Runtime Monitoring - Proxy Throughput + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" +| timechart span=$ChartBinSizeToken$ limit=0 avg(TxnRequestIn) avg(TxnRequestOut) avg(TxnStartIn) avg(TxnStartOut) avg(TxnStartBatch) avg(TxnStartErrors) avg(TxnCommitIn) avg(TxnCommitVersionAssigned) avg(TxnCommitResolving) avg(TxnCommitResolved) avg(TxnCommitOut) avg(TxnCommitOutSuccess) avg(TxnCommitErrors) avg(TxnThrottled) avg(TxnConflicts) avg(CommitBatchIn) avg(CommitBatchOut) avg(TxnRejectedForQueuedTooLong) avg(Mutations) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 20: Runtime Monitoring - Proxy Queue Length + + + index=$Index$ LogGroup=$LogGroup$ (Type="ProxyMetrics" OR Type="GrvProxyMetrics") AND TrackLatestType="Original" | timechart span=$ChartBinSizeToken$ limit=0 avg(*QueueSize*) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 21: Runtime Monitoring - TLog UnpoppedVersion + + + index=$Index$ LogGroup=$LogGroup$ Type="TLogMetrics" AND TrackLatestType="Original" +| eval UnpoppedVersion=PersistentDataDurableVersion-QueuePoppedVersion +| timechart span=$ChartBinSizeToken$ limit=0 avg(UnpoppedVersion) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 22: Runtime Monitoring - Storage Server Disk (AIODiskStall) + + + 
index=$Index$ LogGroup=$LogGroup$ Type="ProcessMetrics" +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As="StorageServer" + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| timechart span=$ChartBinSizeToken$ limit=0 avg(AIODiskStall) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 23: Runtime Monitoring - StorageServer Query Queue Length + + + index=$Index$ LogGroup=$LogGroup$ Type="StorageMetrics" AND TrackLatestType="Original" +| makemv QueryQueue | eval QueryQueue=mvindex(QueryQueue, 1) | table _time QueryQueue Machine +| timechart span=$ChartBinSizeToken$ limit=0 avg(QueryQueue) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 24: Transaction Trace Stats - GRV Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + 500ms + + + + index=$Index$ LogGroup=$LogGroup$ + Type="TransactionDebug" AND (*ProxyServer.masterProxyServerCore.Broadcast OR *ProxyServer.getLiveCommittedVersion.confirmEpochLive OR *ProxyServer.getLiveCommittedVersion.After) +| table Time Type ID Location Machine Roles +| append + [ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionDebug" AND (*ProxyServer.queueTransactionStartRequests.Before) + | rename ID as ParentID + | table Time Type ParentID Location Machine Roles + | join ParentID + [ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionAttachID" + | rename ID as ParentID + | rename To as ID + | table ParentID ID] + | table Time Type ID Location Machine Roles] +| table Time Type ID Location Machine Roles +| sort 0 Time +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval TBegin=mvindex(Time, 0), TEnd=mvindex(Time, -1), TimeSpan=TEnd-TBegin, _time=TBegin +| bin bins=20 span=$StatsGRVSpanToken$ TimeSpan +| chart limit=0 count by 
TimeSpan $GRVByMachineStatsToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 25: Transaction Trace Stats - GetValue Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + 500ms + + + + index=$Index$ LogGroup=$LogGroup$ + (storageServer.received OR getValueQ.DoRead OR getValueQ.AfterVersion OR Reader.Before OR Reader.After OR getValueQ.AfterRead OR NativeAPI.getKeyLocation.Before OR NativeAPI.getKeyLocation.After) +| table Machine Location Time Roles ID Type +| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10) +| sort 0 Time Order +| stats list(*) by ID +| rename list(*) as * +| table Machine Location Time Roles ID Type +| eval count = mvcount(Location) +| search count>2 +| eval TEnd=mvindex(Time, -1), TBegin=mvindex(Time, 0), TimeSpan=TEnd-TBegin, _time=TBegin +| table _time ID TimeSpan Machine Location Time +| bin bins=20 span=$StatsReadSpanToken$ TimeSpan +| chart limit=0 count by TimeSpan $GetValueByMachineStatsToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 26: Transaction Trace Stats - Commit Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + Machine + + + + 500ms + + + + index=$Index$ LogGroup=$LogGroup$ + Type="CommitDebug" AND (*ProxyServer.commitBatch.Before OR *ProxyServer.commitBatch.GettingCommitVersion OR *ProxyServer.commitBatch.GotCommitVersion OR *ProxyServer.commitBatch.ProcessingMutations OR 
*ProxyServer.commitBatch.AfterStoreCommits OR *ProxyServer.commitBatch.AfterLogPush OR *ProxyServer.commitBatch.AfterResolution) +| table Time Type ID Location Machine Roles +| sort 0 Time +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval Count=mvcount(Location) +| search Count>=2 +| eval TBegin=mvindex(Time, 0), TEnd=mvindex(Time, -1), TimeSpan=TEnd-TBegin, _time=TBegin +| table _time TimeSpan Machine +| bin bins=20 span=$StatsCommitSpanToken$ TimeSpan +| chart limit=0 count by TimeSpan $CommitByMachineStatsToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + + Chart 27: Transaction Tracing - GRV Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + Type="TransactionDebug" AND (*ProxyServer.*ProxyServerCore.Broadcast OR *ProxyServer.getLiveCommittedVersion.confirmEpochLive OR *ProxyServer.getLiveCommittedVersion.After) +| table Time Type ID Location Machine Roles +| append + [ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionDebug" AND (*ProxyServer.queueTransactionStartRequests.Before) + | rename ID as ParentID + | table Time Type ParentID Location Machine Roles + | join ParentID + [ search index=$Index$ LogGroup=$LogGroup$ Type="TransactionAttachID" + | rename ID as ParentID + | rename To as ID + | table ParentID ID] + | table Time Type ID Location Machine Roles] +| table Time Type ID Location Machine Roles +| eval Order = case(Location=="NativeAPI.getConsistentReadVersion.Before", 0, Location like "%ProxyServer.queueTransactionStartRequests.Before", 1, Location="MasterProxyServer.masterProxyServerCore.Broadcast", 2, Location like "%ProxyServer.getLiveCommittedVersion.confirmEpochLive", 3, Location like "%ProxyServer.getLiveCommittedVersion.After", 5, Location=="NativeAPI.getConsistentReadVersion.After", 6) +| table Time Order Type ID Location Machine Roles
+| sort 0 Order Time +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval T1=mvindex(Time, 0), T2=mvindex(Time, 1), T3=mvindex(Time, 2), T4=mvindex(Time, 3), TimeInQueue = T2-T1, TimeGetVersionFromProxies = if(mvcount(Time)==4, T3-T2, -0.0000001), TimeConfirmLivenessFromTLogs = if(mvcount(Time)==4, T4-T3, T3-T2), TimeSpan=if(mvcount(Time)==4,T4-T1,T3-T1), _time=T1 +| table _time TimeSpan TimeInQueue TimeGetVersionFromProxies TimeConfirmLivenessFromTLogs Machine +| timechart span=$ChartBinSizeToken$ limit=0 avg(TimeSpan), avg(TimeInQueue), avg(TimeGetVersionFromProxies), avg(TimeConfirmLivenessFromTLogs) $GRVLatencyByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 28: Transaction Tracing - GetValue Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + (storageServer.received OR getValueQ.DoRead OR getValueQ.AfterVersion OR Reader.Before OR Reader.After OR getValueQ.AfterRead OR NativeAPI.getKeyLocation.Before OR NativeAPI.getKeyLocation.After) +| table Machine Location Time Roles ID Type +| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10) +| sort 0 Time Order +| stats list(*) by ID +| rename list(*) as * +| table Machine Location Time Roles ID Type +| eval count = mvcount(Location) +| search count>2 +| eval TEnd=mvindex(Time, -1), TBegin=mvindex(Time, 0), TimeSpan=TEnd-TBegin, _time=TBegin +| table _time TimeSpan +| timechart span=30s limit=0 avg(TimeSpan) $GetValueLatencyByMachineToken$ +
$TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 29: Transaction Tracing - Commit Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + Type="CommitDebug" AND (*ProxyServer.commitBatch.Before OR *ProxyServer.commitBatch.GettingCommitVersion OR *ProxyServer.commitBatch.GotCommitVersion OR *ProxyServer.commitBatch.ProcessingMutations OR *ProxyServer.commitBatch.AfterStoreCommits OR *ProxyServer.commitBatch.AfterLogPush OR *ProxyServer.commitBatch.AfterResolution) +| table Time Type ID Location Machine Roles +| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location like "%ProxyServer.batcher", 1, Location like "%ProxyServer.commitBatch.Before", 2, Location like "%ProxyServer.commitBatch.GettingCommitVersion", 3, Location like "%ProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location like "%ProxyServer.commitBatch.AfterResolution", 8.5, Location like "%ProxyServer.commitBatch.ProcessingMutations", 9, Location like "%ProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLog.tLogCommit.BeforeWaitForVersion", 11, Location=="TLog.tLogCommit.Before", 12, Location=="TLog.tLogCommit.AfterTLogCommit", 13, Location=="TLog.tLogCommit.After", 14, Location like "%ProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 16) +| table Time Order Type ID Location Machine Roles +| sort 0 Time Order +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval Count=mvcount(Location) +| search Count=7 +| eval T1=mvindex(Time, 0), T2=mvindex(Time, 1), T3=mvindex(Time, 2), T4=mvindex(Time, 3), T5=mvindex(Time, 4), T6=mvindex(Time, 5), T7=mvindex(Time, 6), TimeSpan=T7-T1, 
TimeResolution=T4-T3, TimePostResolution=T5-T4, TimeProcessingMutation=T6-T5, TimeTLogPush=T7-T6, _time=T1 +| table _time TimeSpan TimeResolution TimePostResolution TimeProcessingMutation TimeTLogPush Machine +| timechart span=$ChartBinSizeToken$ limit=0 avg(TimeSpan), avg(TimeResolution), avg(TimePostResolution), avg(TimeProcessingMutation), avg(TimeTLogPush) $CommitByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 30: Transaction Tracing - Commit - TLogPush and Resolver Latency (only show CC transactions by default; it shows client transactions only when you manually open client transaction trace) + + + Yes + No + Step + + + + index=$Index$ LogGroup=$LogGroup$ + Type="CommitDebug" AND (Resolver.resolveBatch.Before OR Resolver.resolveBatch.AfterQueueSizeCheck OR Resolver.resolveBatch.AfterOrderer OR Resolver.resolveBatch.After OR TLog.tLogCommit.BeforeWaitForVersion OR TLog.tLogCommit.Before OR TLog.tLogCommit.AfterTLogCommit OR TLog.tLogCommit.After) +| table Time Type ID Location Machine Roles +| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location=="MasterProxyServer.batcher", 1, Location=="MasterProxyServer.commitBatch.Before", 2, Location=="MasterProxyServer.commitBatch.GettingCommitVersion", 3, Location=="MasterProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location=="MasterProxyServer.commitBatch.AfterResolution", 8.5, Location=="MasterProxyServer.commitBatch.ProcessingMutations", 9, Location=="MasterProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLog.tLogCommit.BeforeWaitForVersion", 11, Location=="TLog.tLogCommit.Before", 12, Location=="TLog.tLogCommit.AfterTLogCommit", 13, Location=="TLog.tLogCommit.After", 14, Location=="MasterProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 
16) +| table Time Order Type ID Location Machine Roles +| sort 0 Time Order +| table Machine Location Time Roles Type ID +| stats list(*) by ID +| rename list(*) as * +| eval Count=mvcount(Location), Step=case(Count=4 and (mvindex(Location, 0) like "TLog%"), "TimeTLogCommit", Count=4 and (mvindex(Location, 0) like "Resolver%"), "TimeResolver", Count=10, "TimeSpan"), BeginTime=mvindex(Time, 0), EndTime=mvindex(Time, -1), Duration=EndTime-BeginTime, _time=BeginTime +| search Count=4 +| eval Machinei=mvindex(Machine, 0), MachineStep = Step."-".Machinei +| table _time Step Duration Machinei Location Machine MachineStep +| timechart span=$ChartBinSizeToken$ limit=0 avg(Duration) by $TLogResolverByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 31: Machine Performance - CPU Utilization (CPU Time divided by Elapsed) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory Elapsed +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| eval Utilization=CPUSeconds/Elapsed +| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 32: Machine Performance - Memory Utilization (ResidentMemory divided by Memory) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory +| join Machine + [ search index=$Index$ 
LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| eval Utilization = ResidentMemory/Memory +| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 33: Machine Performance - Disk Utilization ((DiskTotalBytes-DiskFreeBytes)/DiskTotalBytes) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| eval Utilization = (DiskTotalBytes-DiskFreeBytes)/DiskTotalBytes +| timechart span=$ChartBinSizeToken$ avg(Utilization) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 34: Machine Performance - Network (Mbps Received and Mbps Sent) + + + index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| timechart span=$ChartBinSizeToken$ avg(MbpsReceived) avg(MbpsSent) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + Chart 35: Machine Performance - Disk (Reads Count and Writes Count) + + + index=$Index$ LogGroup=$LogGroup$ 
Type=ProcessMetrics AND TrackLatestType="Original" +| table _time Machine CPUSeconds DiskFreeBytes DiskIdleSeconds DiskQueueDepth DiskReadsCount DiskWriteSectors DiskTotalBytes DiskWritesCount FileReads MbpsReceived MbpsSent Memory ResidentMemory UnusedAllocatedMemory +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND As=$RolePerformanceChartToken$ + | stats first(Machine) by Machine + | rename first(Machine) as Machine + | table Machine] +| timechart span=$ChartBinSizeToken$ avg(DiskReadsCount) avg(DiskWritesCount) $ChartByMachineToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + + + Chart 36: Network Performance - Timeout + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type=ConnectionTimedOut OR Type=ConnectionTimeout) +| replace *:tls with * in PeerAddr +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($SourcePerfConnectionToken$)) + | dedup ID] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($DestinationPerfConnectionToken$)) + | dedup ID + | rename Machine as PeerAddr] +| eval Connection=Machine."-".PeerAddr +| timechart useother=0 span=$ChartBinSizeToken$ count $TimeoutByConnectionToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + + Chart 37: Network Performance - PingLatency + + + Yes + No + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type=PingLatency) +| replace *:tls with * in PeerAddr +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($SourcePerfConnectionToken$)) + | dedup ID] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($DestinationPerfConnectionToken$)) + | dedup ID + | rename Machine as PeerAddr] +| eval Connection=Machine."-".PeerAddr +| timechart useother=0 span=$ChartBinSizeToken$ avg(MeanLatency) avg(MaxLatency) $PingLatencyByConnectionToken$ + $TimeSpan.earliest$ + $TimeSpan.latest$ + + + + + + + +
\ No newline at end of file diff --git a/contrib/observability_splunk_dashboard/recovery.xml b/contrib/observability_splunk_dashboard/recovery.xml new file mode 100644 index 0000000000..6ba6b9a63b --- /dev/null +++ b/contrib/observability_splunk_dashboard/recovery.xml @@ -0,0 +1,873 @@ +
+ +
+ + + Table 1: Find long recovery (Input Index and LogGroup and Select a time span). + + + * + + + + + + + + + -0s + now + + + + + index=$IndexForOverview$ LogGroup=$LogGroupForOverview$ + ((Type="MasterRecoveryState" AND (Status="reading_coordinated_state" OR Status="fully_recovered" OR Status="accepting_commits")) OR (Type="Role" AND As="MasterServer" AND ("Transition"="Begin" OR "Transition"="End")) OR Type="MasterTerminated") AND (NOT TrackLatestType="Rolled") | eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table ID Machine Type Transition As Status DateTime Time ErrorDescription LogGroup +| search NOT ErrorDescription="Success" +| eval EventType=case(Transition="Begin" AND As="MasterServer" AND Type="Role", "MasterStart", Type="MasterRecoveryState" AND Status="fully_recovered", "FullRecovery", Type="MasterRecoveryState" AND Status="reading_coordinated_state", "StartRecoveryAttempt", Transition="End" AND As="MasterServer" AND Type="Role", "MasterTerminated", Type="MasterTerminated", "MasterTerminated", Type="MasterRecoveryState" AND Status="accepting_commits", "AcceptingCommits") +| table ID Machine EventType DateTime Time ErrorDescription LogGroup +| fillnull value="-" +| sort -Time +| eval ifMasterTerminatedEvent=if(EventType="MasterTerminated", 1, 0) +| stats list(*) by ID Machine ifMasterTerminatedEvent +| rename list(*) as * +| table ID Machine EventType DateTime Time ErrorDescription LogGroup +| sort -Time +| eval LastTime=mvindex(Time, 0), FirstTime=mvindex(Time, -1), Duration=LastTime-FirstTime +| table ID Machine Duration EventType DateTime Time ErrorDescription LogGroup + $time_token_for_recoveryhistorytable.earliest$ + $time_token_for_recoveryhistorytable.latest$ + + + + +
+
+
+ + + Table 2: Select timespan containing the long recovery and see all recovery attempts in the time span (The input Index and LogGroup and Timespan are for all following tables and charts) + + + * + + + + + + + + -0s@s + now + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type="MasterRecoveryState" OR (Type="MasterTerminated") OR (Type="Role" AND As="MasterServer" AND "Transition"="End") OR Type="RecoveryInternal" OR Type="ProxyReplies" OR Type="CommitProxyReplies" OR Type="ResolverReplies" OR Type="MasterRecruitedInitialStorageServers") AND (NOT TrackLatestType="Rolled") +| rename ID as MasterID +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table MasterID Machine Status Step Type DateTime Time StatusCode MyRecoveryCount ErrorDescription Reason ErrorCode +| fillnull value="-" ErrorDescription Reason ErrorCode +| eval Status=case(Type=="MasterRecoveryState", Status, Type=="Role", "RoleEnd", Type=="MasterTerminated", "MasterTerminated", Type=="RecoveryInternal", Status."/".Step, Type=="ProxyReplies" OR Type=="CommitProxyReplies", "initializing_transaction_servers/ProxyReplies", Type="ResolverReplies", "initializing_transaction_servers/ResolverReplies", Type=="MasterRecruitedInitialStorageServers", "initializing_transaction_servers/MasterRecruitedInitialStorageServers"), StatusCode=case(Type=="ProxyReplies" OR Type=="CommitProxyReplies" OR Type=="ResolverReplies" OR Type=="MasterRecruitedInitialStorageServers", "8", Type!="ProxyReplies" AND Type!="CommitProxyReplies" AND Type!="ResolverReplies" AND Type!="MasterRecruitedInitialStorageServers", StatusCode) +| fillnull value="-" StatusCode +| sort 0 -Time -StatusCode +| stats list(*) by MasterID Machine +| rename list(*) as * +| eval FirstTime=mvindex(Time, -1), LastTime=mvindex(Time, 0), Duration=LastTime-FirstTime +| table MasterID Machine MyRecoveryCount Duration ErrorDescription Reason ErrorCode StatusCode Status DateTime Time +| sort -MyRecoveryCount +| fillnull value="-" MyRecoveryCount + 
$ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + +
+
+
+ + + Table 3: Why recovery is triggered? Using WaitFailureClient event. Machine A detects Machine B's failure. First column is the time when WaitFailureClient happens. Columns of 2,3,4,5 are for A. Columns of 6,7 are for B. + + + index=$Index$ LogGroup=$LogGroup$ + Type="WaitFailureClient" +| table Type Time Machine FailedEndpoint +| replace *:tls with * in FailedEndpoint +| join Machine type=left + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" AND Transition="End" + | eval EndTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") + | rename As as Role + | table ID EndTime Machine Role] +| join FailedEndpoint type=left + [ search index=$Index$ LogGroup=$LogGroup$ Type="Role" + | stats latest(*) by ID | rename latest(*) as * + | rename Machine as FailedEndpoint + | eval FailedEndpointLatestRoleEventInfo=As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") + | stats list(*) by FailedEndpoint + | rename list(*) as * + | table FailedEndpoint FailedEndpointLatestRoleEventInfo] +| eval FailureDetectedTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| makemv delim=" " FailedEndpointLatestRoleEventInfo +| table FailureDetectedTime Machine ID Role EndTime FailedEndpoint FailedEndpointLatestRoleEventInfo + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Table 4: New Recruitment Configuration (using MasterRecoveredConfig event) + + + index=$Index$ LogGroup=$LogGroup$ + Type="MasterRecoveredConfig" AND TrackLatestType="Original" +| eval Configuration=replace(Conf, "&quot;", "\"") +| rename Configuration as _raw + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + + + + Table 5: Data Centers (using ProcessMetrics event) + + + index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics +| dedup DCID +| rename DCID as DataCenterID +| table DataCenterID pie_work_unit +| fillnull value="-" + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+ + Table 6: New Role (using Role event joined by ProcessMetrics event) + + + index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ((As="ClusterController") OR (As="MasterServer") OR (As="TLog") OR (As="Resolver") OR (As="MasterProxyServer") OR (As="CommitProxyServer") OR (As="GrvProxyServer") OR (As="LogRouter")) AND (NOT TrackLatestType="Rolled") AND (NOT Transition="Refresh")) +| eventstats count by ID +| rename As as Role +| search count=1 AND Transition="Begin" +| table ID Role Machine +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| table ID Role Machine DataCenter +| fillnull value="null" DataCenter +| stats count by Role DataCenter + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Table 7: Role Details + + + MasterServer + TLog + Resolver + MasterProxyServer (for <7.0) + LogRouter + CommitProxyServer (for 7.0+) + GrvProxyServer (for 7.0+) + As=" + " + OR + + + + Begin + End + Begin->End + count=1 AND Transition="Begin" + + + + index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($RolesToken$) AND (NOT TrackLatestType="Rolled") AND (NOT Transition="Refresh")) +| eventstats count by ID +| rename As as Role +| search $RoleDetailTableWhichRoleToken$ +| table ID Role Machine Time +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| table ID Role Machine DataCenter Time +| fillnull value="null" DataCenter +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table ID Role Machine DataCenter DateTime +| sort 0 -DateTime + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + +
+
+
+ + + Table 8: CC Recruitment SevWarn OR SevError (use events in clusterRecruitFromConfiguration and clusterRecruitRemoteFromConfiguration) + + + index=$Index$ LogGroup=$LogGroup$ + Type="RecruitFromConfigurationNotAvailable" OR Type="RecruitFromConfigurationRetry" OR Type="RecruitFromConfigurationError" OR Type="RecruitRemoteFromConfigurationNotAvailable" OR Type="RecruitRemoteFromConfigurationRetry" OR Type="RecruitRemoteFromConfigurationError" + | eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)"), GoodRecruitmentTimeReady=case(Type=="RecruitFromConfigurationNotAvailable" OR Type=="RecruitRemoteFromConfigurationNotAvailable", "True", Type=="RecruitFromConfigurationRetry" OR Type=="RecruitRemoteFromConfigurationRetry", GoodRecruitmentTimeReady, Type=="RecruitFromConfigurationError" OR Type=="RecruitRemoteFromConfigurationError", "-") + | table Type GoodRecruitmentTimeReady Time DateTime + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + +
+
+
+ + + Table 9: RecoveryCount of the selected TLog (in Table 11) + + + index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogStart") OR (LogId=$row.TLogID$ AND Type="TLogPersistentStateRestore") +| eval ID=if(Type="TLogStart", ID, LogId), DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table ID RecoveryCount Type DateTime | fillnull value="Not found. The fdb version is somewhat old." + -7d@h + now + + + +
+
+ + Table 10: Which roles the selected TLog (in Table 11) talks to + + + index=$Index$ LogGroup=$LogGroup$ + ((Type="TLogRejoining" AND ID=$row.TLogID$) OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow") AND TLog=$row.TLogID$) OR ((Type="TLogLockStarted" OR Type="TLogLocked") AND TLog=$row.TLogID$) OR (Type="TLogStop" AND ID=$row.TLogID$) OR (Type="TLogStop2" AND LogId=$row.TLogID$) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND ID=$row.TLogID$)) AND (NOT TrackLatestType="Rolled") +| sort -Time +| eval TLogID=case((Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRejoining"), Time." ".Type." ".Master, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."Null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."Null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."Null") +| stats list(*) by TLogID +| rename list(*) As * +| table TLogID TLogEvents +| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) +| search ignore=0 +| sort TLogID +| table TLogID TLogEvents +| mvexpand TLogEvents +| eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), MasterID=mvindex(temp,2) +| fields - temp - TLogEvents +| sort 0 -Time +| search NOT MasterID="NULL" +| dedup MasterID +| rename MasterID as ID +| join type=left ID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role") + | sort 0 -Time + | dedup ID + | table ID Machine As] +| table ID Machine As | fillnull value="null" Machine As + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Table 11: TLog Events (Collecting all TLogs that produce interesting events during the time span) + + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type="TLogRecover") OR (Type="TLogReady") OR (Type="TLogStart") OR + ((Type="TLogLockStarted") OR (Type="TLogLocked") OR (Type="TLogStop") OR (Type="TLogStop2")) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh") AND (NOT TrackLatestType="Rolled") AND $SeeLogEventDetailTableToken$ +| sort -Time +| eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRecover"), Time." ".Type." "."null", (Type="TLogReady"), Time." ".Type." "."null", (Type="TLogStart"), Time." ".Type." "."null", (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." 
"."null") +| stats list(TLogEvents) by TLogID +| rename list(TLogEvents) As TLogEvents +| eval EarliestEvent=mvindex(TLogEvents, -1) , LatestEvent=mvindex(TLogEvents, 0) +| table TLogID TLogEvents EarliestEvent LatestEvent +| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) +| search ignore=0 +| sort TLogID +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND As="TLog") + | sort 0 -Time + | dedup ID + | rename ID as TLogID + | table TLogID host LogGroup Machine] +| table TLogID Machine LogGroup host EarliestEvent LatestEvent +| fillnull value="null" Machine host LogGroup +| eval temp=split(LatestEvent," "), LatestTime=mvindex(temp,0), LatestEvent=mvindex(temp,1), temp2=split(EarliestEvent," "), EarliestTime=mvindex(temp2,0), EarliestEvent=mvindex(temp2,1), Duration=LatestTime-EarliestTime +| table TLogID Machine EarliestTime Duration LogGroup host +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| fillnull value="null" DataCenter +| table TLogID Machine DataCenter EarliestTime Duration host LogGroup +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ + ((Type="TLogRejoining") OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow")) OR ((Type="TLogLockStarted" OR Type="TLogLocked")) OR (Type="TLogStop") OR (Type="TLogStop2") OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh")) AND (NOT TrackLatestType="Rolled") + | sort -Time + | eval TLogID=case((Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRejoining"), Time." ".Type." 
".Master, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."Null", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."Null", (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."Null") + | stats list(*) by TLogID + | rename list(*) As * + | table TLogID TLogEvents + | eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) + | search ignore=0 + | sort TLogID + | table TLogID TLogEvents + | mvexpand TLogEvents + | eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), RoleID=mvindex(temp,2) + | fields - temp - TLogEvents + | sort 0 -Time + | search NOT RoleID="NULL" + | table TLogID RoleID MasterMachine + | stats list(*) by TLogID + | rename list(*) as * + | streamstats count + | mvexpand RoleID + | dedup count RoleID + | fields - count + | stats count by TLogID + | rename count as Roles + | table TLogID Roles] +| table TLogID Machine DataCenter Roles EarliestTime Duration host LogGroup +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="TLogRecover") OR (Type="TLogReady") OR (Type="TLogStart") OR + ((Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked") OR (Type="TLogStop") OR (Type="TLogStop2") OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh") AND (NOT TrackLatestType="Rolled")) + | sort -Time + | eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=if(Type="Role", Type.Transition, Type) + | sort 0 TLogEvents + | stats list(TLogEvents) by TLogID + | rename list(TLogEvents) As TLogEvents + | table TLogID TLogEvents + | eval ignore = if(mvcount(TLogEvents)==1 AND 
like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) + | search ignore=0 + | mvcombine delim=" " TLogEvents + | table TLogID TLogEvents] +| table TLogID Machine DataCenter Roles Duration TLogEvents EarliestTime host LogGroup +| eval EarliestDateTime=strftime(EarliestTime, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table TLogID Machine DataCenter Roles Duration TLogEvents EarliestDateTime host LogGroup +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="TLogStart") OR (Type="TLogPersistentStateRestore") + | eval TLogID=if(Type="TLogStart", ID, LogId) + | table TLogID RecoveryCount] +| table TLogID RecoveryCount Machine DataCenter Roles Duration TLogEvents EarliestDateTime host LogGroup +| fillnull value="TLog too old, click and see details" RecoveryCount + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + $click.value$ + +
+
+ + Table 12: Event Details (Including rejoining events) of the selected TLog (in Table 11) + + + index=$Index$ LogGroup=$LogGroup$ + (Type="TLogRecover" AND LogId=$row.TLogID$) OR (Type="TLogReady" AND ID=$row.TLogID$) OR (Type="TLogStart" AND ID=$row.TLogID$) OR + ((Type="TLogRejoining" AND ID=$row.TLogID$) OR ((Type="TLogJoinedMe" OR Type="TLogJoinedMeUnknown" OR Type="TLogRejoinSlow") AND TLog=$row.TLogID$) OR ((Type="TLogLockStarted" OR Type="TLogLocked") AND TLog=$row.TLogID$) OR (Type="TLogStop" AND ID=$row.TLogID$) OR (Type="TLogStop2" AND LogId=$row.TLogID$) OR (Type="Role" AND As="TLog" AND NOT Transition="Refresh" AND ID=$row.TLogID$)) AND (NOT TrackLatestType="Rolled") +| sort -Time +| eval TLogID=case((Type="TLogRecover"), LogId, (Type="TLogReady"), ID, (Type="TLogStart"), ID, (Type="TLogRejoining"), ID, (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow"), TLog, (Type="TLogLockStarted") OR (Type="TLogLocked"), TLog, (Type="TLogStop"), ID, (Type="TLogStop2"), LogId, Type="Role", ID), TLogEvents=case((Type="TLogRecover"), Time." ".Type." "."-"." "."-", (Type="TLogReady"), Time." ".Type." "."-"." "."-", (Type="TLogStart"), Time." ".Type." "."-"." "."-", (Type="TLogRejoining"), Time." ".Type." ".Master." "."-", (Type="TLogJoinedMe") OR (Type="TLogJoinedMeUnknown") OR (Type="TLogRejoinSlow") OR (Type="TLogLockStarted") OR (Type="TLogLocked"), Time." ".Type." ".ID." "."-", (Type="TLogStop") OR (Type="TLogStop2"), Time." ".Type." "."-"." "."-", (Type="Role" AND As="TLog" AND Transition="Begin" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."-"." ".Origination, (Type="Role" AND As="TLog" AND Transition="End" AND NOT TrackLatestType="Rolled"), Time." "."Role".Transition." "."-"." 
"."-") +| stats list(*) by TLogID +| rename list(*) As * +| table TLogID TLogEvents +| eval ignore = if(mvcount(TLogEvents)==1 AND like(mvindex(TLogEvents, 0), "% RoleEnd"), 1, 0) +| search ignore=0 +| sort TLogID +| join type=left TLogID + [ search index=$Index$ LogGroup=$LogGroup$ (Type="Role" AND As="TLog" AND ID=$row.TLogID$) + | dedup ID + | rename ID as TLogID + | table TLogID Machine] +| table TLogID Machine TLogEvents +| fillnull value="-" Machine +| mvexpand TLogEvents +| eval temp=split(TLogEvents," "), Time=mvindex(temp,0), Event=mvindex(temp,1), ToID=mvindex(temp,2), Origination= mvindex(temp,3) +| fields - temp - TLogEvents +| join type=left + [ search index=$Index$ LogGroup=$LogGroup$ (Type="Role") + | dedup ID + | rename ID as ToID + | rename As as ToRole + | rename Machine as ToMachine + | table ToID ToRole ToMachine] +| sort 0 -Time +| fillnull value="-" ToRole ToMachine +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table TLogID Machine Event DateTime ToID ToRole ToMachine Time DateTime + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + +
+
+
+ + + Table 13: All Tags of the selected TLog (in Table 11) that have been popped by SSes (using TLogPoppedTag event) + + + index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogPoppedTag") +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| rename ID as TLogID +| rename Tags as UnpoppedRecoveredTagCount +| rename Tag as TagPopped +| rename DurableKCVer as DurableKnownCommittedVersion +| search TagPopped!="-1:2" +| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped DurableKnownCommittedVersion RecoveredAt +| sort 0 -UnpoppedRecoveredTagCount +| join TagPopped type=left + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="StorageMetrics") + | stats latest(*) by Machine + | rename latest(*) as * + | rename Tag as TagPopped + | table TagPopped ID Machine] +| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped DurableKnownCommittedVersion RecoveredAt ID Machine +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| rename ID as SSID +| rename Machine as SSMachine +| rename DataCenter as SSDataCenter +| table TLogID DateTime UnpoppedRecoveredTagCount TagPopped SSID SSMachine SSDataCenter DurableKnownCommittedVersion RecoveredAt +| fillnull value="-" + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + +
+
+ + Table 14: All Tags of the selected TLog (in Table 11) to be popped by SSes (using TLogReady event) + + + index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogReady") +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| rename ID as TLogID +| table TLogID Type AllTags Locality +| makemv delim="," AllTags +| mvexpand AllTags +| rename AllTags as Tag | sort 0 Tag +| join Tag type=left + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="StorageMetrics") + | stats latest(*) by Machine + | rename latest(*) as * + | table Tag ID Machine] +| table TLogID Tag ID Machine +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| fillnull value="-" +| table TLogID Tag ID Machine DataCenter +| rename ID as SSID | rename Machine as SSMachine | rename DataCenter as SSDataCenter +| search Tag!="-1:2" + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + +
+
+
+ + + Table 15: The Tags of the selected TLog (in Table 11) that are not popped by SSes (using set diff tags in Table 13 and Table 14) (if result contains "...", the result of Table 15 is wrong) + + + | set diff + [ search index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogReady") + | table AllTags + | makemv delim="," AllTags + | mvexpand AllTags + | rename AllTags as Tag + | table Tag] + [ search index=$Index$ LogGroup=$LogGroup$ + (ID=$row.TLogID$ AND Type="TLogPoppedTag") + | table Tag] + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+ + Table 16: All Current Storage Servers (assume each machine has at most one SS) + + + + + + index=$Index$ LogGroup=$LogGroup$ + (Type="StorageMetrics") AND $TriggerSSTableToken$ +| stats latest(*) by Machine +| rename latest(*) as * +| table Tag ID Machine +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| table ID Machine DataCenter Tag +| join ID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ((As="StorageServer")) AND (NOT TrackLatestType="Rolled")) + | stats latest(*) by Machine + | rename latest(*) as * + | rename As as Role + | table ID Role Machine + | join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] + | table ID Role Machine DataCenter + | fillnull value="null" DataCenter] +| sort 0 DataCenter +| table Tag ID Machine DataCenter | sort 0 Tag + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Chart 1: Timeout/TimedOut event distribution grouped by source (Machine) + + + 5s + + + + TLog + MasterServer + MasterProxyServer (for version < 7) + Resolver + ClusterController + SharedTLog + LogRouter + Coordinator + StorageServer + CommitProxyServer (for version 7+) + GrvProxyServer (for ver 7+) + As=" + " + OR + + + + TLog + MasterServer + MasterProxyServer (for version <7) + Resolver + ClusterController + SharedTLog + LogRouter + Coordinator + StorageServer + CommitProxyServer (for version 7+) + GrvProxyServer (for version 7+) + As=" + " + OR + + + + index=$Index$ LogGroup=$LogGroup$ + (Type=ConnectionTimedOut OR Type=ConnectionTimeout) +| replace *:tls with * in PeerAddr +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$)) + | dedup ID] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$)) + | dedup ID + | rename Machine as PeerAddr] +| timechart useother=0 span=$TimeoutEventByMachineTableTimeSpanToken$ count by Machine + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + + + + + + Chart 2: Timeout/TimedOut event distribution grouped by destination (PeerAddr) + + + index=$Index$ LogGroup=$LogGroup$ + (Type=ConnectionTimedOut OR Type=ConnectionTimeout) +| replace *:tls with * in PeerAddr +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$)) + | dedup ID] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$)) + | dedup ID + | rename Machine as PeerAddr] +| timechart useother=0 span=$TimeoutEventByMachineTableTimeSpanToken$ count by PeerAddr + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + + + + + + Table 17: Check Type=ConnectionTimedOut OR Type=ConnectionTimeout events between transaction roles in the recovery (including the role that refresh/begin/end in 
the timespan) + + + index=$Index$ LogGroup=$LogGroup$ + (Type=ConnectionTimedOut OR Type=ConnectionTimeout) +| replace *:tls with * in PeerAddr +| stats count as TotalTimeouts by Machine PeerAddr +| table Machine PeerAddr TotalTimeouts +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableSourceRoleToken$)) + | stats latest(*) by ID + | rename latest(*) as * + | eval Role = As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") + | stats list(Role) AS MachineRoleLatestEvent BY Machine + ] +| join PeerAddr + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="Role" AND ($TimeoutbyMachineTableDestinationRoleToken$)) + | stats latest(*) by ID + | rename latest(*) as * + | eval Role = As."/".ID."/".Type.Transition."/".strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") + | stats list(Role) AS PeerRoleLatestEvent BY Machine + | rename Machine AS PeerAddr + ] +| table Machine PeerAddr TotalTimeouts MachineRoleLatestEvent PeerRoleLatestEvent + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + +
+
+
+ + + Table 18: Proxy 0 + + + index=$Index$ LogGroup=$LogGroup$ + (Type="ProxyReplies" OR Type="CommitProxyReplies") AND FirstProxy="True" +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table WorkerID LogGroup FirstProxy Time DateTime +| sort 0 -Time +| join type=left WorkerID + [ search index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND As="Worker" AND Transition="Refresh" + | dedup ID + | rename ID as WorkerID + | stats list(*) by WorkerID + | rename list(*) as * + | table WorkerID Machine Roles] +| table WorkerID Machine Roles LogGroup FirstProxy Time DateTime +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND (As="MasterProxyServer" OR As="CommitProxyServer") AND Transition="Refresh" + | dedup ID + | rename ID as ProxyID + | table Machine ProxyID] +| table ProxyID Machine LogGroup FirstProxy + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Table 19: Latest Role Events on the input Machine (Input Machine, like 172.27.113.121:4500) + + + + + + index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND Machine=$SearchMachineToken$ +| stats latest(*) by ID Transition +| rename latest(*) as * +| eval DateTime=strftime(Time, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table DateTime Machine ID Transition As Roles LogGroup Error ErrorDescription Reason +| sort 0 -DateTime +| fillnull value="-" + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + +
+
+
+ + + Chart 3: severity>=20 event distribution (including roles that refresh/begin/end in the timespan) + + + * + + + + TLog + MasterServer + MasterProxyServer (for version <7) + Resolver + ClusterController + SharedTLog + LogRouter + Coordinator + StorageServer + CommitProxyServer (for version 7+) + GrvProxyServer (for version 7+) + As=" + " + OR + + + + EventType + Machine + Severity + Type + + + + 5s + + + + index=$Index$ LogGroup=$LogGroup$ + Severity>10 AND $BadEvents$ +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND ($BadEventRoleToken$) + | dedup ID | table Machine] +| table Machine Type Severity _time +| timechart useother=0 span=$BadEventChartTimeSpanToken$ count by $BadEventChartBy$ + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + + + + + + + + Table 20: Check severity>=20 events of roles in the recovery (including the role that refresh/begin/end in the timespan) + + + index=$Index$ LogGroup=$LogGroup$ + Severity>10 +| stats count by Machine Type +| rename count as Count +| join Machine + [ search index=$Index$ LogGroup=$LogGroup$ + Type="Role" AND ($BadEventRoleToken$) + | dedup ID + | eval Role=As."-".ID + | stats list(Role) by Machine + | rename list(Role) as Roles + | table Machine Roles] +| table Type Count Roles Machine +| sort -Count + $ReoveryTime.earliest$ + $ReoveryTime.latest$ + + + + +
+
+
+
\ No newline at end of file diff --git a/contrib/observability_splunk_dashboard/transaction_latency.xml b/contrib/observability_splunk_dashboard/transaction_latency.xml new file mode 100644 index 0000000000..99b551f2c9 --- /dev/null +++ b/contrib/observability_splunk_dashboard/transaction_latency.xml @@ -0,0 +1,247 @@ +
+ + Design for ClusterController issued transactions. +
+ + + + + + + * + + + + * + + + + + @d + now + + +
+ + + All Transactions (Currently, this table does not cover getrange operations or transactions that do not commit). + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ ID=$transactionID$ + (Type="TransactionAttachID" OR Type="GetValueAttachID" OR Type="CommitAttachID") +| eval To=case(Type=="TransactionAttachID", "0"."-".To, Type="GetValueAttachID", "1"."-".To, Type=="CommitAttachID", "2"."-".To) +| stats list(To) by ID +| rename list(To) as ToList +| table ID ToList +| eval Count = mvcount(ToList) +| search Count=3 +| eval To0=mvindex(ToList,0), To1=mvindex(ToList,1), To2=mvindex(ToList,2), To0=split(To0,"-"), To1=split(To1,"-"), To2=split(To2,"-"), GrvID=case(mvindex(To0, 0)=="0", mvindex(To0, 1), mvindex(To1, 0)=="0", mvindex(To1, 1), mvindex(To2, 0)=="0", mvindex(To2, 1)), ReadID=case(mvindex(To0, 0)=="1", mvindex(To0, 1), mvindex(To1, 0)=="1", mvindex(To1, 1), mvindex(To2, 0)=="1", mvindex(To2, 1)), CommitID=case(mvindex(To0, 0)=="2", mvindex(To0, 1), mvindex(To1, 0)=="2", mvindex(To1, 1), mvindex(To2, 0)=="2", mvindex(To2, 1)) +| table ID GrvID ReadID CommitID +| join GrvID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="TransactionDebug" AND Location="NativeAPI.getConsistentReadVersion.Before") + | rename ID as GrvID + | rename Time as BeginTime + | table GrvID BeginTime + ] +| join GrvID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="TransactionDebug" AND Location="NativeAPI.getConsistentReadVersion.After") + | rename ID as GrvID + | rename Time as GRVDoneTime + | table GrvID GRVDoneTime + ] +| join ReadID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="GetValueDebug" AND Location="NativeAPI.getValue.After") + | rename ID as ReadID + | rename Time as ReadDoneTime + | table ReadID ReadDoneTime + ] +| join CommitID + [ search index=$Index$ LogGroup=$LogGroup$ + (Type="CommitDebug" AND Location="NativeAPI.commit.After") + | rename ID as CommitID + | rename Time as CommitDoneTime + | table CommitID CommitDoneTime + ] +| 
rename ID as TransactionID +| eval BeginToGRVDone = GRVDoneTime-BeginTime, GRVDoneToReadDone = ReadDoneTime-GRVDoneTime, ReadDoneToCommitDone = CommitDoneTime-ReadDoneTime, Duration=CommitDoneTime-BeginTime, BeginTimeScope=BeginTime-1, EndTimeScope=CommitDoneTime+1, BeginDateTime=strftime(BeginTime, "%Y-%m-%d %H:%M:%S.%Q (%Z)") +| table TransactionID Duration BeginDateTime BeginToGRVDone GRVDoneToReadDone ReadDoneToCommitDone Duration GrvID ReadID CommitID BeginTimeScope EndTimeScope | sort -Duration + $time_token.earliest$ + $time_token.latest$ + + + + $row.BeginTimeScope$ + $row.EndTimeScope$ + $row.ReadID$ + $row.GrvID$ + $row.CommitID$ + +
+
+
+ + + Step1: GRV + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ + Type="TransactionDebug" AND (NOT MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion) +AND (ID=$GrvID$ OR ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="TransactionAttachID" AND ID=$GrvID$ + | return $To]) +| table Time Type ID Location Machine Roles +| eventstats min(Time) as MinTime +| eval Delta = Time - MinTime, Order = case(Location=="NativeAPI.getConsistentReadVersion.Before", 0, Location like "%ProxyServer.queueTransactionStartRequests.Before", 1, Location=="MasterProxyServer.masterProxyServerCore.Broadcast", 2, Location=="GrvProxyServer.transactionStarter.AskLiveCommittedVersionFromMaster", 2.1, Location like "%ProxyServer.getLiveCommittedVersion.confirmEpochLive", 3, Location=="MasterServer.serveLiveCommittedVersion.GetRawCommittedVersion", 4, Location like "%ProxyServer.getLiveCommittedVersion.After", 5, Location=="NativeAPI.getConsistentReadVersion.After", 6) +| table Time Delta Order Type ID Location Machine Roles +| sort 0 Order +| table Machine Location Delta Time Roles ID Type + $BeginTime$ + $EndTime$ + + +
+
+ + Step1: (Only for FDB v6.3): GRV --- Get Committed Version (MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion Events) + + only for FDB 6.3 + + index=$Index$ LogGroup=$LogGroup$ + Type="TransactionDebug" AND Location="MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion" + AND ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="TransactionAttachID" AND ID=$GrvID$ + | return $To] +| table Time Type ID Location Machine Roles +| eventstats min(Time) as MinTime +| eval Delta = Time - MinTime +| sort 0 -Time +| table Machine Delta Time Roles ID Type + $BeginTime$ + $EndTime$ + + +
+
+
+ + + Step2: GetValue + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ Type="GetValueDebug" AND ID=$ReadID$ +| eventstats min(Time) as MinTime +| eval Delta = Time-MinTime +| table Machine Location Delta Time Roles ID Type +| eval Order=case(Location=="NativeAPI.getKeyLocation.Before", 0, Location=="NativeAPI.getKeyLocation.After", 1, Location=="NativeAPI.getValue.Before", 2, Location=="storageServer.received", 3, Location=="getValueQ.DoRead", 4, Location=="getValueQ.AfterVersion", 5, Location=="Reader.Before", 6, Location=="Reader.After", 7, Location=="getValueQ.AfterRead", 8, Location=="NativeAPI.getValue.After", 9, Location=="NativeAPI.getValue.Error", 10) +| sort 0 Order +| table Machine Location Delta Time Roles ID Type + $time_token.earliest$ + $time_token.latest$ + + +
+
+
+ + + Step3: Commit + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ + Type="CommitDebug" AND (ID=$CommitID$ OR ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID=$CommitID$ + | return $To]) + +| table Time Type ID Location Machine Roles +| eventstats min(Time) as MinTime +| eval Delta = Time-MinTime +| table Machine Location Delta Time Roles ID Type +| eval Order=case(Location=="NativeAPI.commit.Before", 0, Location like "%ProxyServer.batcher", 1, Location like "%ProxyServer.commitBatch.Before", 2, Location like "%ProxyServer.commitBatch.GettingCommitVersion", 3, Location like "%ProxyServer.commitBatch.GotCommitVersion", 4, Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8, Location like "%ProxyServer.commitBatch.AfterResolution", 8.5, Location like "%ProxyServer.commitBatch.ProcessingMutations", 9, Location like "%ProxyServer.commitBatch.AfterStoreCommits", 10, Location=="TLogServer.tLogCommit.BeforeWaitForVersion", 11, Location=="TLogServer.tLogCommit.Before", 12, Location=="TLogServer.tLogCommit.AfterTLogCommit", 13, Location=="TLogServer.tLogCommit.After", 14, Location like "%ProxyServer.commitBatch.AfterLogPush", 15, Location=="NativeAPI.commit.After", 16) +| sort 0 Order +| table Machine Location Delta Time Roles ID Type + $BeginTime$ + $EndTime$ + + +
+
+
+ + + Step3: Commit --- Resolver + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ + (Location="Resolver*") +| join ID + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID=$CommitID$ + | return $To] + | rename To as ID + | table ID] +| eventstats min(Time) as MinTime +| eval Delta = Time-MinTime +| eval Order=case(Location=="Resolver.resolveBatch.Before", 5, Location=="Resolver.resolveBatch.AfterQueueSizeCheck", 6, Location=="Resolver.resolveBatch.AfterOrderer", 7, Location=="Resolver.resolveBatch.After", 8) +| sort 0 Time Order +| stats list(*) by Type ID Machine Roles +| rename list(*) as * +| eval T1=mvindex(Time, 0), T2=mvindex(Time, 3), Duration=T2-T1 | sort -Duration +| table Machine Roles Duration Location Delta Time +| join type=left Machine + [ search index=$Index$ LogGroup=$LogGroup$ Type=ProcessMetrics + | dedup Machine, DCID + | rename DCID as DataCenter + | table Machine DataCenter] +| table Machine DataCenter Roles Duration Location Delta Time + $time_token.earliest$ + $time_token.latest$ + + +
+
+
+ + + Step3: Commit --- Commit to TLogs (CommitDebug Events), grouped by Machine and sorted by Duration + + for FDB 6.3 and 7.0+ + + index=$Index$ LogGroup=$LogGroup$ + (Location="TLog*") +| join ID + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID= + [ search index=$Index$ LogGroup=$LogGroup$ + Type="CommitAttachID" AND ID=$CommitID$ + | return $To] + | rename To as ID + | table ID] +| eventstats min(Time) as MinTime +| eval Delta = Time-MinTime +| sort 0 Time +| stats list(*) by Type ID Machine Roles +| rename list(*) as * +| eval T1=mvindex(Time, 0), T2=mvindex(Time, 3), Duration=T2-T1 | sort -Duration +| table Machine Roles Duration Location Delta Time + $BeginTime$ + $EndTime$ + + + +
+
+
+
\ No newline at end of file diff --git a/contrib/pkg_tester/test_fdb_pkgs.py b/contrib/pkg_tester/test_fdb_pkgs.py index 08ccd35aa6..178f84d93c 100644 --- a/contrib/pkg_tester/test_fdb_pkgs.py +++ b/contrib/pkg_tester/test_fdb_pkgs.py @@ -165,7 +165,6 @@ def centos_image_with_fdb_helper(versioned: bool) -> Iterator[Optional[Image]]: container = Container("centos:7", initd=True) for rpm in rpms: container.copy_to(rpm, "/opt") - container.run(["bash", "-c", "yum update -y"]) container.run( ["bash", "-c", "yum install -y prelink"] ) # this is for testing libfdb_c execstack permissions @@ -327,7 +326,7 @@ def test_execstack_permissions_libfdb_c(linux_container: Container, snapshot): [ "bash", "-c", - "execstack -q $(ldconfig -p | grep libfdb_c | awk '{print $(NF)}')", + "execstack -q $(ldconfig -p | grep libfdb_c.so | awk '{print $(NF)}')", ] ) diff --git a/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py b/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py index d0df0708aa..79534596b5 100644 --- a/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py +++ b/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py @@ -284,6 +284,12 @@ class ErrorCommitInfo(BaseInfo): if protocol_version >= PROTOCOL_VERSION_6_3: self.report_conflicting_keys = bb.get_bool() + if protocol_version >= PROTOCOL_VERSION_7_1: + lock_aware = bb.get_bool() + if bb.get_bool(): + spanId = bb.get_bytes(16) + + class UnsupportedProtocolVersionError(Exception): def __init__(self, protocol_version): super().__init__("Unsupported protocol version 0x%0.2X" % protocol_version) diff --git a/contrib/tsan.suppressions b/contrib/tsan.suppressions new file mode 100644 index 0000000000..2078f7e8c6 --- /dev/null +++ b/contrib/tsan.suppressions @@ -0,0 +1,5 @@ +# ThreadSanitizer suppressions file for FDB +# https://github.com/google/sanitizers/wiki/ThreadSanitizerSuppressions + +# FDB signal handler is not async-signal safe 
+signal:crashHandler diff --git a/design/data-distributor-internals.md b/design/data-distributor-internals.md index a935bbced6..ccaba537b6 100644 --- a/design/data-distributor-internals.md +++ b/design/data-distributor-internals.md @@ -20,7 +20,7 @@ Data distribution manages the lifetime of storage servers, decides which storage **RelocateShard (`struct RelocateShard`)**: A `RelocateShard` records the key range that need to be moved among servers and the data movement’s priority. DD always move shards with higher priorities first. -**Data distribution queue (`struct DDQueueData`)**: It receives shards to be relocated (i.e., RelocateShards), decides which shard should be moved to which server team, prioritizes the data movement based on relocate shard’s priority, and controls the progress of data movement based on servers’ workload. +**Data distribution queue (`struct DDQueue`)**: It receives shards to be relocated (i.e., RelocateShards), decides which shard should be moved to which server team, prioritizes the data movement based on relocate shard’s priority, and controls the progress of data movement based on servers’ workload. **Special keys in the system keyspace**: DD saves its state in the system keyspace to recover from failure and to ensure every process (e.g., commit proxies, tLogs and storage servers) has a consistent view of which storage server is responsible for which key range. @@ -153,3 +153,25 @@ CPU utilization. This metric is in a positive relationship with “FinishedQueri * The typical movement size under a read-skew scenario is 100M ~ 600M under default KNOB value `READ_REBALANCE_MAX_SHARD_FRAC=0.2, READ_REBALANCE_SRC_PARALLELISM = 20`. Increasing those knobs may accelerate the converge speed with the risk of data movement churn, which overwhelms the destination and over-cold the source. * The upper bound of `READ_REBALANCE_MAX_SHARD_FRAC` is 0.5. Any value larger than 0.5 can result in hot server switching. 
* When needing a deeper diagnosis of the read aware DD, `BgDDMountainChopper_New`, and `BgDDValleyFiller_New` trace events are where to go. + +## Data Distribution Diagnosis Q&A +* Why hasn't Read-aware DD been triggered when there's a read imbalance? + * Check the `SkipReason` field of `BgDDMountainChopper_New` and `BgDDValleyFiller_New`. +* The Read-aware DD is triggered, and some data movement happened, but it doesn't help the read balance. Why? + * Need to figure out which server is selected as the source and destination. The information is in the `DestTeam` and `SourceTeam` fields of `BgDDMountainChopper*` and `BgDDValleyFiller*`. + * Also, the `DDQueueServerCounter` event tells how many times a server has been a source or destination (defined in + ```c++ + enum CountType : uint8_t { ProposedSource = 0, QueuedSource, LaunchedSource, LaunchedDest }; + ``` + ) for different relocation reasons (`Other`, `RebalanceDisk` and so on) in different phases within `DD_QUEUE_COUNTER_REFRESH_INTERVAL` (default 60) seconds. For example, + ```xml + + ``` + `RebalanceReadPQSD="2 0 0 5"` means server `0000000000000004` has been selected as a source for read balancing twice, but it has not been queued or executed yet. This server has also been a destination for read balancing 5 times in the past 1 min. Note that the field will be skipped if all 4 numbers are 0. To avoid spammy traces, if summarization is enabled with knob `DD_QUEUE_COUNTER_SUMMARIZE = true`, event `DDQueueServerCounterTooMany` will summarize the unreported servers that were involved in launched relocations (i.e., their `LaunchedSource` or `LaunchedDest` counts are non-zero): + ```xml + + ``` +* How to track the lifecycle of a relocation attempt for balancing? + * First find the TraceId fields in `BgDDMountainChopper*`, `BgDDValleyFiller*`, which indicate a relocation is triggered. + * (Only when enabled) Find the `QueuedRelocation` event with the same `BeginPair` and `EndPair` as the original `TraceId`. This means the relocation request is queued. 
+ * Find the `RelocateShard` event whose `BeginPair`, `EndPair` field is the same as `TraceId`. This event means the relocation is ongoing. diff --git a/design/dynamic-knobs.md b/design/dynamic-knobs.md new file mode 100644 index 0000000000..00fe39e725 --- /dev/null +++ b/design/dynamic-knobs.md @@ -0,0 +1,420 @@ +# Dynamic Knobs + +This document is largely adapted from original design documents by Markus +Pilman and Trevor Clinkenbeard. + +## Background + +FoundationDB parameters control the behavior of the database, including whether +certain features are available and the value of internal constants. Parameters +will be referred to as knobs for the remainder of this document. Currently, +these knobs are configured through arguments passed to `fdbserver` processes, +often controlled by `fdbmonitor`. This has a number of problems: + +1. Updating knobs involves updating `foundationdb.conf` files on each host in a + cluster. This has a lot of overhead and typically requires external tooling + for large scale changes. +2. All knob changes require a process restart. +3. We can't easily track the history of knob changes. + +## Overview + +The dynamic knobs project creates a strictly serializable quorum-based +configuration database stored on the coordinators. Each `fdbserver` process +specifies a configuration path and applies knob overrides from the +configuration database for its specified classes. + +### Caveats + +The configuration database explicitly does not support the following: + +1. A high load. The update rate, while not specified, should be relatively low. +2. A large amount of data. The database is meant to be relatively small (under + one megabyte). Data is not sharded and every coordinator stores a complete + copy. +3. Concurrent writes. At most one write can succeed at a time, and clients must + retry their failed writes. 
+ +## Design + +### Configuration Path + +Each `fdbserver` process can now include a `--config_path` argument specifying +its configuration path. A configuration path is a hierarchical list of +configuration classes specifying which knob overrides the `fdbserver` process +should apply from the configuration database. For example: + +```bash +$ fdbserver --config_path classA/classB/classC ... +``` + +Knob overrides follow descending priority: + +1. Manually specified command line knobs. +2. Individual configuration class overrides. + * Subdirectories override parent directories. For example, if the + configuration path is `az-1/storage/gp3`, the `gp3` configuration takes + priority over the `storage` configuration, which takes priority over the + `az-1` configuration. +3. Global configuration knobs. +4. Default knob values. + +#### Example + +For example, imagine an `fdbserver` process run as follows: + +```bash +$ fdbserver --datadir /mnt/fdb/storage/4500 --logdir /var/log/foundationdb --public_address auto:4500 --config_path az-1/storage/gp3 --knob_disable_asserts false +``` + +And the configuration database contains: + +| ConfigClass | KnobName | KnobValue | +|-------------|---------------------|-----------| +| az-2 | page_cache_4k | 8e9 | +| storage | min_trace_severity | 20 | +| az-1 | compaction_interval | 280 | +| storage | compaction_interval | 350 | +| az-1 | disable_asserts | true | +| \<global\> | max_metric_size | 5000 | +| gp3 | max_metric_size | 1000 | + +The final configuration for the process will be: + +| KnobName | KnobValue | Explanation | +|---------------------|-------------|-------------| +| page_cache_4k | \<default\> | The configuration database knob override for `az-2` is ignored, so the compiled default is used | +| min_trace_severity | 20 | Because the `storage` configuration class is part of the process’s configuration path, the corresponding knob override is applied from the configuration database | +| compaction_interval | 350 | The `storage` knob 
override takes precedence over the `az-1` knob override | +| disable_asserts | false | This knob is manually overridden, so all other overrides are ignored | +| max_metric_size | 1000 | Knob overrides for specific configuration classes take precedence over global knob overrides, so the global override is ignored | + +### Clients + +Clients can write to the configuration database using transactions. +Configuration database transactions are differentiated from regular +transactions through specification of the `USE_CONFIG_DATABASE` database +option. + +In configuration transactions, the client uses the tuple layer to interact with +the configuration database. Keys are tuples of size two, where the first item +is the configuration class being written, and the second item is the knob name. +The value should be specified as a string. It will be converted to the +appropriate type based on the declared type of the knob being set. + +Below is a sample Python script to write to the configuration database. + +```python +import fdb + +fdb.api_version(720) + +@fdb.transactional +def set_knob(tr, knob_name, knob_value, config_class, description): + tr['\xff\xff/description'] = description + tr[fdb.tuple.pack((config_class, knob_name,))] = knob_value + +# This function performs two knob changes transactionally. +@fdb.transactional +def set_multiple_knobs(tr): + tr['\xff\xff/description'] = 'description' + tr[fdb.tuple.pack((None, 'min_trace_severity',))] = '10' + tr[fdb.tuple.pack(('az-1', 'min_trace_severity',))] = '20' + +db = fdb.open() +db.options.set_use_config_database() + +set_knob(db, 'min_trace_severity', '10', None, 'description') +set_knob(db, 'min_trace_severity', '20', 'az-1', 'description') +``` + +### Disable the Configuration Database + +The configuration database includes both client and server changes and is +enabled by default. Thus, to disable the configuration database, changes must +be made to both. 
+ +#### Server + +The configuration database can be disabled by specifying the ``fdbserver`` +command line option ``--no-config-db``. Note that this option must be specified +for *every* ``fdbserver`` process. + +#### Client + +The only client change from the configuration database is as part of the change +coordinators command. The change coordinators command is not considered +successful until the configuration database is readable on the new +coordinators. This will cause the change coordinators command to hang if run +against a database with dynamic knobs disabled. To disable the client side +configuration database liveness check, specify the ``--no-config-db`` flag when +changing coordinators. For example: + +``` +fdbcli> coordinators auto --no-config-db +``` + +## Status + +The current state of the configuration database is output as part of `status +json`. The configuration path for each process can be determined from the +``command_line`` key associated with each process. + +Sample from ``status json``: + +``` +"configuration_database" : { + "commits" : [ + { + "description" : "set some knobs", + "timestamp" : 1659570000, + "version" : 1 + }, + { + "description" : "make some other changes", + "timestamp" : 1659570000, + "version" : 2 + } + ], + "last_compacted_version" : 0, + "most_recent_version" : 2, + "mutations" : [ + { + "config_class" : "", + "knob_name" : "min_trace_severity", + "knob_value" : "int:5", + "type" : "set", + "version" : 1 + }, + { + "config_class" : "", + "knob_name" : "compaction_interval", + "knob_value" : "double:30.000000", + "type" : "set", + "version" : 1 + }, + { + "config_class" : "az-1", + "knob_name" : "compaction_interval", + "knob_value" : "double:60.000000", + "type" : "set", + "version" : 1 + }, + { + "config_class" : "", + "knob_name" : "compaction_interval", + "type" : "clear", + "version" : 2 + }, + { + "config_class" : "", + "knob_name" : "update_node_timeout", + "knob_value" : "double:4.000000", + "type" : "set", + 
"version" : 2 + } + ], + "snapshot" : { + "" : { + "min_trace_severity" : "int:5", + "update_node_timeout" : "double:4.000000" + }, + "az-1" : { + "compaction_interval" : "double:60.000000" + } + } +} +``` + +After compaction, ``status json`` would show: + +``` +"configuration_database" : { + "commits" : [ + ], + "last_compacted_version" : 2, + "most_recent_version" : 2, + "mutations" : [ + ], + "snapshot" : { + "" : { + "min_trace_severity" : "int:5", + "update_node_timeout" : "double:4.000000" + }, + "az-1" : { + "compaction_interval" : "double:60.000000" + } + } +} +``` + +## Detailed Implementation + +The configuration database is implemented as a replicated state machine living +on the coordinators. This allows configuration database transactions to +continue to function in the event of a catastrophic loss of the transaction +subsystem. + +To commit a transaction, clients run the two phase Paxos protocol. First, the +client asks for a live version from a quorum of coordinators. When a +coordinator receives a request for its live version, it increments its local +live version by one and returns it to the client. Then, the client submits its +writes at the live version it received in the previous step. A coordinator will +accept the commit if it is still on the same live version. If a majority of +coordinators accept the commit, it is considered committed. + +### Coordinator + +Each coordinator runs a ``ConfigNode`` which serves as a replica storing one +full copy of the configuration database. Coordinators never communicate with +other coordinators while processing configuration database transactions. +Instead, the client runs the transaction and determines when it has quorum +agreement. + +Coordinators serve the following ``ConfigTransactionInterface`` to allow +clients to read from and write to the configuration database. 
+ +#### ``ConfigTransactionInterface`` +| Request | Request fields | Reply fields | Explanation | +|------------------|----------------------------------------------------------------|-----------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------| +| GetGeneration | (coordinatorsHash) | (generation) or (coordinators_changed error) | Get a new read version. This read version is used for all future requests in the transaction | +| Get | (configuration class, knob name, coordinatorsHash, generation) | (knob value or empty) or (coordinators_changed error) or (transaction_too_old error) | Returns the current value stored at the specified configuration class and knob name, or empty if no value exists | +| GetConfigClasses | (coordinatorsHash, generation) | (configuration classes) or (coordinators_changed error) or (transaction_too_old error) | Returns a list of all configuration classes stored in the configuration database | +| GetKnobs | (configuration class, coordinatorsHash, generation) | (knob names) or (coordinators_changed error) or (transaction_too_old error) | Returns a list of all knob names stored for the provided configuration class | +| Commit | (mutation list, coordinatorsHash, generation) | ack or (coordinators_changed error) or (commit_unknown_result error) or (not_committed error) | Commit mutations set by the transaction | + +Coordinators also serve the following ``ConfigFollowerInterface`` to provide +access to (and modification of) their current state. Most interaction through +this interface is done by the cluster controller through its +``IConfigConsumer`` implementation living on the ``ConfigBroadcaster``. 
+ +#### ``ConfigFollowerInterface`` +| Request | Request fields | Reply fields | Explanation | +|-----------------------|----------------------------------------------------------------------|-----------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------| +| GetChanges | (lastSeenVersion, mostRecentVersion) | (mutation list, version) or (version_already_compacted error) or (process_behind error) | Request changes since the last seen version, receive a new most recent version, as well as recent mutations | +| GetSnapshotAndChanges | (mostRecentVersion) | (snapshot, snapshotVersion, changes) | Request the full configuration database, in the form of a base snapshot and changes to apply on top of the snapshot | +| Compact | (version) | ack | Compact mutations up to the provided version | +| Rollforward | (rollbackTo, lastKnownCommitted, target, changes, specialZeroQuorum) | ack or (version_already_compacted error) or (transaction_too_old error) | Rollback/rollforward mutations on a node to catch it up with the majority | +| GetCommittedVersion | () | (registered, lastCompacted, lastLive, lastCommitted) | Request version information from a ``ConfigNode`` | +| Lock | (coordinatorsHash) | ack | Lock a ``ConfigNode`` to prevent it from serving requests during a coordinator change | + +### Cluster Controller + +The cluster controller runs a singleton ``ConfigBroadcaster`` which is +responsible for periodically polling the ``ConfigNode``s for updates, then +broadcasting these updates to workers through the ``ConfigBroadcastInterface``. +When workers join the cluster, they register themselves and their +``ConfigBroadcastInterface`` with the broadcaster. The broadcaster then pushes +new updates to registered workers. + +The ``ConfigBroadcastInterface`` is also used by ``ConfigNode``s to register +with the ``ConfigBroadcaster``. 
``ConfigNode``s need to register with the +broadcaster because the broadcaster decides when the ``ConfigNode`` may begin +serving requests, based on global information about status of other +``ConfigNode``s. For example, if a system with three ``ConfigNode``s suffers a +fault where one ``ConfigNode`` loses data, the faulty ``ConfigNode`` should +not be allowed to begin serving requests again until it has been rolled forward +and is up to date with the latest state of the configuration database. + +#### ``ConfigBroadcastInterface`` + +| Request | Request fields | Reply fields | Explanation | +|------------|------------------------------------------------------------|-------------------------------|---------------------------------------------------------------------------------------------| +| Snapshot | (snapshot, version, restartDelay) | ack | A snapshot of the configuration database sent by the broadcaster to workers | +| Changes | (changes, mostRecentVersion, restartDelay) | ack | A list of changes up to and including mostRecentVersion, sent by the broadcaster to workers | +| Registered | () | (registered, lastSeenVersion) | Sent by the broadcaster to new ``ConfigNode``s to determine their registration status | +| Ready | (snapshot, snapshotVersion, liveVersion, coordinatorsHash) | ack | Sent by the broadcaster to new ``ConfigNode``s to allow them to start serving requests | + +### Worker + +Each worker runs a ``LocalConfiguration`` instance which receives and applies +knob updates from the ``ConfigBroadcaster``. 
The local configuration maintains +a durable ``KeyValueStoreMemory`` containing the following: + +* The latest known configuration version +* The most recently used configuration path +* All knob overrides corresponding to the configuration path at the latest known version + +Once a worker starts, it will: + +* Apply manually set knobs +* Read its local configuration file + * If the stored configuration path does not match the configuration path + specified on the command line, delete the local configuration file + * Otherwise, apply knob updates from the local configuration file. Manually + specified knobs will not be overridden + * Register with the broadcaster to receive new updates for its configuration + classes + * Persist these updates when received and restart if necessary + +### Knob Atomicity + +All knobs are classified as either atomic or non-atomic. Atomic knobs require a +process restart when changed, while non-atomic knobs do not. + +### Compaction + +``ConfigNode``s store individual mutations in order to be able to update other, +out of date ``ConfigNode``s without needing to send a full snapshot. Each +configuration database commit also contains additional metadata such as a +timestamp and a text description of the changes being made. To keep the size of +the configuration database manageable, a compaction process runs periodically +(defaulting to every five minutes) which compacts individual mutations into a +simplified snapshot of key-value pairs. Compaction is controlled by the +``ConfigBroadcaster``, using information it periodically requests from +``ConfigNode``s. Compaction will only compact up to the minimum known version +across *all* ``ConfigNode``s. This means that if one ``ConfigNode`` is +permanently partitioned from the ``ConfigBroadcaster`` or from clients, no +compaction will ever take place.
+ +### Rollback / Rollforward + +It is necessary to be able to roll ``ConfigNode``s backward and forward with +respect to their committed versions due to the nature of quorum logic and +unreliable networks. + +Consider a case where a client commit gets persisted durably on one out of +three ``ConfigNode``s (assume commit messages to the other two nodes are lost). +Since the value is not committed on a majority of ``ConfigNode``s, it cannot be +considered committed. But it is also incorrect to have the value persist on one +out of three nodes as future commits are made. In this case, the most common +result is that the ``ConfigNode`` will be rolled back when the next commit from +a different client is made, and then rolled forward to contain the data from +the commit. ``PaxosConfigConsumer`` contains logic to recognize ``ConfigNode`` +minorities and update them to match the quorum. + +### Changing Coordinators + +Since the configuration database lives on the coordinators and the +[coordinators can be +changed](https://apple.github.io/foundationdb/configuration.html#configuration-changing-coordination-servers), +it is necessary to copy the configuration database from the old to the new +coordinators during such an event. A coordinator change performs the following +steps in regards to the configuration database: + +1. Write ``\xff/coordinatorsKey`` with the new coordinators string. The key + ``\xff/previousCoordinators`` contains the current (old) set of + coordinators. +2. Lock the old ``ConfigNode``s so they can no longer serve client requests. +3. Start a recovery, causing a new cluster controller (and therefore + ``ConfigBroadcaster``) to be selected. +4. Read ``\xff/previousCoordinators`` on the ``ConfigBroadcaster`` and, if + present, read an up-to-date snapshot of the configuration database on the + old coordinators. +5. 
Determine if each registering ``ConfigNode`` needs an up-to-date snapshot of + the configuration database sent to it, based on its reported version and the + snapshot version of the database received from the old coordinators. + * Some new coordinators which were also coordinators in the previous + configuration may not need a snapshot. +6. Send ready requests to new ``ConfigNode``s, including an up-to-date snapshot + if necessary. This allows the new coordinators to begin serving + configuration database requests from clients. + +## Testing + +The ``ConfigDatabaseUnitTests`` class unit tests a number of different +configuration database dimensions. + +The ``ConfigIncrement`` workload tests contention between clients attempting to +write to the configuration database, paired with machine failure and +coordinator changes. diff --git a/design/global-tag-throttling.md b/design/global-tag-throttling.md index 82f5c847d1..fa710b5a8f 100644 --- a/design/global-tag-throttling.md +++ b/design/global-tag-throttling.md @@ -125,6 +125,3 @@ In each test, the `GlobalTagThrottlerTesting::monitor` function is used to perio On the ratekeeper, every `SERVER_KNOBS->TAG_THROTTLE_PUSH_INTERVAL` seconds, the ratekeeper will call `GlobalTagThrottler::getClientRates`. At the end of the rate calculation for each tag, a trace event of type `GlobalTagThrottler_GotClientRate` is produced. This trace event reports the relevant inputs that went in to the rate calculation, and can be used for debugging. On storage servers, every `SERVER_KNOBS->TAG_MEASUREMENT_INTERVAL` seconds, there are `BusyReadTag` events for every tag that has sufficient read cost to be reported to the ratekeeper. Both cost and fractional busyness are reported. - -### Status -For each storage server, the busiest read tag is reported in the full status output, along with its cost and fractional busyness.
diff --git a/documentation/sphinx/source/architecture.rst b/documentation/sphinx/source/architecture.rst index 7c28518d74..f693865430 100644 --- a/documentation/sphinx/source/architecture.rst +++ b/documentation/sphinx/source/architecture.rst @@ -14,8 +14,12 @@ Detailed FoundationDB Architecture The FoundationDB architecture chooses a decoupled design, where processes are assigned different heterogeneous roles (e.g., -Coordinators, Storage Servers, Master). Scaling the database is achieved -by horizontally expanding the number of processes for separate roles: +Coordinators, Storage Servers, Master). The cluster attempts to recruit +different roles as separate processes; however, it is possible that +multiple Stateless roles get colocated (recruited) on a single +process to meet the cluster recruitment goals. Scaling the database +is achieved by horizontally expanding the number of processes for +separate roles: Coordinators ~~~~~~~~~~~~ diff --git a/documentation/sphinx/source/client-testing.rst b/documentation/sphinx/source/client-testing.rst index 433a47ce7d..0eb159e8f4 100644 --- a/documentation/sphinx/source/client-testing.rst +++ b/documentation/sphinx/source/client-testing.rst @@ -373,3 +373,302 @@ with the ``multitest`` role: fdbserver -r multitest -f testfile.txt This command will block until all tests are completed. + +########## +API Tester +########## + +Introduction +============ + +API tester is a framework for implementing end-to-end tests of FDB C API, i.e. testing the API on a real +FDB cluster through all layers of the FDB client. Its executable is ``fdb_c_api_tester``, and the source +code is located in ``bindings/c/test/apitester``. The structure of API Tests is similar to that of the +Simulation Tests. The tests are implemented as workloads using FDB API, which are all built into the +``fdb_c_api_tester``.
A concrete test configuration is defined as a TOML file, which specifies the +combination of workloads to be executed by the test together with their parameters. The test can be then +executed by passing the TOML file as a parameter to ``fdb_c_api_tester``. + +Since simulation tests rely on the actor model to execute the tests deterministically in single-threaded +mode, they are not suitable for testing various multi-threaded aspects of the FDB client. End-to-end API +tests complement the simulation tests by testing the FDB Client layers above the single-threaded Native +Client. + +- The specific testing goals of the end-to-end tests are: +- Check functional correctness of the Multi-Version Client (MVC) and Thread-Safe Client +- Detecting race conditions. They can be caused by accessing the state of the Native Client from wrong + threads or introducing other shared state without proper synchronization +- Detecting memory management errors. Thread-safe reference counting must be used where necessary. MVC + works with multiple client libraries. Memory allocated by one client library must be also deallocated + by the same library. +- Maintaining interoperability with other client versions. The client functionality is made available + depending on the selected API version. The API changes are correctly adapted. +- Client API behaves correctly in case of cluster upgrades. Database and transaction state is correctly + migrated to the upgraded connections. Pending operations are canceled and successfully retried on the + upgraded connections. + +Implementing a Workload +======================= + +Each workload is declared as a direct or indirect subclass of ``WorkloadBase`` implementing a constructor +with ``WorkloadConfig`` as a parameter and the method ``start()``, which defines the entry point of the +workload. + +``WorkloadBase`` provides a set of methods that serve as building blocks for implementation of a workload: + +.. 
function:: execTransaction(start, cont, failOnError = true) + + creates and executes an FDB transaction. Here ``start`` is a function that takes a transaction context + as parameter and implements the starting point of the transaction, and ``cont`` is a function implementing + a continuation to be executed after finishing the transaction execution. Transactions are automatically + retried on retryable errors. Transactions are retried by calling the ``start`` function again. In case + of a fatal error, the entire workload is considered as failed unless ``failOnError`` is set to ``false``. + +.. function:: schedule(task) + + schedules a task for asynchronous execution. It is usually used in the continuations to schedule + the next step of the workload. + +.. function:: info(msg) + error(msg) + + are used for logging a message with a tag identifying the workload. Issuing an error message marks + the workload as failed. + +The transaction context provides methods for implementation of the transaction logic: + +.. function:: tx() + + the reference to the FDB transaction object + +.. function:: continueAfter(future, cont, retryOnError = true) + + set a continuation to be executed when the future is ready. The ``retryOnError`` flag controls whether + the transaction should be automatically retried in case the future results in a retriable error. + +.. function:: continueAfterAll(futures, cont) + + takes a vector of futures and sets a continuation to be executed when all of the futures get ready. + The transaction is retried if at least one of the futures results in an error. This method is useful + for handling multiple concurrent reads. + +.. function:: commit() + + commit and finish the transaction. If the commit is successful, the execution proceeds to the + continuation of ``execTransaction()``. In case of a retriable error the transaction is + automatically retried. A fatal error results in a failure of the workload. + + +..
function:: done() + + finish the transaction without committing. This method should be used to finish read transactions. + The transaction gets destroyed and execution proceeds to the continuation of ``execTransaction()``. + Each transaction must be finished either by ``commit()`` or ``done()``, because otherwise + the framework considers that the transaction is still being executed, so it won't destroy it and + won't call the continuation. + +.. function:: onError(err) + + Handle an error: restart the transaction in case of a retriable error, otherwise fail the workload. + This method is typically used in the continuation of ``continueAfter`` called with + ``retryOnError=false`` as a fallback to the default error handling. + +A workload execution ends automatically when it is marked as failed or its last continuation does not +schedule any new task or transaction. + +The workload class should be defined in the namespace FdbApiTester. The file name convention is +``Tester{Name}Workload.cpp`` so that we distinguish them from the source files of simulation workloads. + +Basic Workload Example +====================== + +The code below implements a workload that consists of only two transactions. The first one sets a +randomly generated key to a randomly generated value, and the second one reads the key and checks if +the returned value matches the written one. + +.. literalinclude:: ../../../bindings/c/test/apitester/TesterExampleWorkload.cpp + :language: C++ + :lines: 21- + +The workload is implemented in the method ``setAndGet``. It generates a random key and a random value +and executes a transaction that writes that key-value pair and commits. In the continuation of the +first ``execTransaction`` call, we execute the second transaction that reads the same key. The read +operation returns a future. So we call ``continueAfter`` to set a continuation for that future. 
In the +continuation we check if the returned value matches the written one and finish the transaction by +calling ``ctx->done()``. After completing the second transaction we execute the continuation passed +as parameter to the ``setAndGet`` method by the start method. In this case it is ``NO_OP_TASK``, which +does nothing and so finishes the workload. + +Finally, we declare an instance ``WorkloadFactory`` to register this workload with the name ``SetAndGet``. + +Note that we use ``workloadId`` as a key prefix. This is necessary for isolating the key space of this +workload, because the framework may be instructed to create multiple instances of the ``SetAndGet`` +workload. If we do not isolate the key space, another workload can write a different value for the +same key and so break the assumption of the test. + +The workload is implemented using the internal C++ API, implemented in ``fdb_api.hpp``. It introduces +a set of classes representing the FDB objects (transactions, futures, etc.). These classes provide C++-style +methods wrapping FDB C API calls and automate memory management by means of reference counting. + +Implementing Control Structures +=============================== + +Our basic workload executes just 2 transactions, but in practice we want to have workloads that generate +multiple transactions. The following code demonstrates how we can modify our basic workload to generate +multiple transactions in a loop. + +.. code-block:: C++ + + class SetAndGetWorkload : public WorkloadBase { + public: + ... 
+ int numIterations; + int iterationsLeft; + + SetAndGetWorkload(const WorkloadConfig& config) : WorkloadBase(config) { + keyPrefix = fdb::toBytesRef(fmt::format("{}/", workloadId)); + numIterations = config.getIntOption("numIterations", 1000); + } + + void start() override { + iterationsLeft = numIterations; + setAndGetLoop(); + } + + void setAndGetLoop() { + if (iterationsLeft == 0) { + return; + } + iterationsLeft--; + setAndGet([this]() { setAndGetLoop(); }); + } + ... + } + +We introduce a workload parameter ``numIterations`` to specify the number of iterations. If not specified +in the test configuration it defaults to 1000. + +The method ``setAndGetLoop`` implements the loop that decrements iterationsLeft counter until it reaches 0 +and each iteration calls setAndGet with a continuation that returns the execution to the loop. As you +can see we don't need any change in ``setAndGet``, just call it with another continuation. + +The pattern of passing a continuation as a parameter also can be used to decompose the workload into a +sequence of steps. For example, we can introduce setup and cleanUp steps to our workload and modify the +``setAndGetLoop`` to make it composable with an arbitrary continuation: + +.. code-block:: C++ + + void start() override { + setup([this](){ + iterationsLeft = numIterations; + setAndGetLoop([this](){ + cleanup(NO_OP_TASK); + }); + }); + } + + void setAndGetLoop(TTaskFct cont) { + if (iterationsLeft == 0) { + schedule(cont); + } + iterationsLeft--; + setAndGet([this, cont]() { setAndGetLoop(cont); }); + } + + void setup(TTaskFct cont) { ... } + + void cleanup(TTaskFct cont) { ... } + +Note that we call ``schedule(cont)`` in ``setAndGetLoop`` instead of calling the continuation directly. +In this way we avoid keeping ``setAndGetLoop`` in the call stack, when executing the next step. 
+ +Subclassing ApiWorkload +======================= + +``ApiWorkload`` is an abstract subclass of ``WorkloadBase`` that provides a framework for a typical +implementation of API test workloads. It implements a workflow consisting of cleaning up the key space +of the workload, populating it with newly generated data and then running a loop consisting of random +database operations. The concrete subclasses of ``ApiWorkload`` are expected to override the method +``randomOperation`` with an implementation of concrete random operations. + +The ``ApiWorkload`` maintains a local key-value store that mirrors the part of the database state +relevant to the workload. A successful database write operation should be followed by a continuation +that performs equivalent changes in the local store, and the results of a database read operation should +be validated against the values from the local store. + +Test Configuration +================== + +A concrete test configuration is specified by a TOML file. The file must contain one ``[[test]]`` section +specifying the general settings for test execution followed by one or more ``[[test.workload]]`` +configuration sections, specifying the workloads to be executed and their parameters. The specified +workloads are started all at once and executed concurrently. + +The ``[[test]]`` section can contain the following options: + +- ``title``: descriptive title of the test +- ``multiThreaded``: enable multi-threading (default: false) +- ``minFdbThreads`` and ``maxFdbThreads``: the number of FDB (network) threads to be randomly selected + from the given range (default: 1-1). Used only if ``multiThreaded=true``. It is also important to use + multiple database instances to make use of the multithreading. +- ``minDatabases`` and ``maxDatabases``: the number of database instances to be randomly selected from + the given range (default 1-1). The transactions of all workloads are randomly load-balanced over the + pool of database instances.
+- ``minClients`` and ``maxClients``: the number of clients, i.e. instances of each workload, to be + randomly selected from the given range (default 1-8). +- ``minClientThreads`` and ``maxClientThreads``: the number of client threads, i.e. the threads used + for execution of the workload, to be randomly selected from the given range (default 1-1). +- ``blockOnFutures``: use blocking waits on futures instead of scheduling future callbacks asynchronously + (default: false) +- ``buggify``: Enable client-side failure injection (default: false) +- ``databasePerTransaction``: Create a separate database instance for each transaction (default: false). + It is a special mode useful for testing bugs related to creation and destruction of database instances. +- ``fdbCallbacksOnExternalThreads``: Enables the option ``FDB_NET_OPTION_CALLBACKS_ON_EXTERNAL_THREADS`` + causing the callbacks of futures to be executed directly on the threads of the external FDB clients + rather than on the thread of the local FDB client. + +The workload section ``[[test.workload]]`` must contain the attribute name matching the registered name +of the workload to be executed. Other options are workload-specific.
+ +The subclasses of the ``ApiWorkload`` inherit the following configuration options: + +- ``minKeyLength`` and ``maxKeyLength``: the size range of randomly generated keys (default: 1-64) +- ``minValueLength`` and ``maxValueLength``: the size range of randomly generated values + (default: 1-1000) +- ``maxKeysPerTransaction``: the maximum number of keys per transaction (default: 50) +- ``initialSize``: the number of key-value pairs in the initially populated database (default: 1000) +- ``readExistingKeysRatio``: the probability of choosing an existing key for read operations + (default: 0.9) +- ``numRandomOperations``: the number of random operations to be executed per workload (default: 1000) +- ``runUntilStop``: run the workload indefinitely until the stop command is received (default: false). + This execution mode is used in upgrade tests and other scripted tests, where the workload needs to + be generated continuously until completion of the scripted test. +- ``numOperationsForProgressCheck``: the number of operations to be performed to confirm a progress + check (default: 10). This option is used in combination with ``runUntilStop``. Progress checks are + initiated by a test script to check if the client workload is successfully progressing after a + cluster change. + +Executing the Tests +=================== + +The ``fdb_c_api_tester`` executable takes a single TOML file as a parameter and executes the test +according to its specification. Before that we must create a FDB cluster and pass its cluster file as +a parameter to ``fdb_c_api_tester``. Note that multithreaded tests also need to be provided with an +external client library. + +For example, we can create a temporary cluster and use it for execution of one of the existing API tests: + +..
code-block:: bash + + ${srcDir}/tests/TestRunner/tmp_cluster.py --build-dir ${buildDir} -- \ + ${buildDir}/bin/fdb_c_api_tester \ + --cluster-file @CLUSTER_FILE@ \ + --external-client-library=${buildDir}/bindings/c/libfdb_c_external.so \ + --test-file ${srcDir}/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml + +The test specifications added to the ``bindings/c/test/apitester/tests/`` directory are executed as a part +of the regression test suite. They can be executed using the ``ctest`` target ``fdb_c_api_tests``: + +.. code-block:: bash + + ctest -R fdb_c_api_tests -VV diff --git a/documentation/sphinx/source/configuration.rst b/documentation/sphinx/source/configuration.rst index 699c811139..5d52d40910 100644 --- a/documentation/sphinx/source/configuration.rst +++ b/documentation/sphinx/source/configuration.rst @@ -416,6 +416,9 @@ FoundationDB will never use processes on the same machine for the replication of ``three_data_hall`` mode FoundationDB stores data in triplicate, with one copy on a storage server in each of three data halls. The transaction logs are replicated four times, with two data halls containing two replicas apiece. Four available machines (two in each of two data halls) are therefore required to make progress. This configuration enables the cluster to remain available after losing a single data hall and one machine in another data hall. +``three_data_hall_fallback`` mode + FoundationDB stores data in duplicate, with one copy each on a storage server in two of three data halls. The transaction logs are replicated four times, with two data halls containing two replicas apiece. Four available machines (two in each of two data halls) are therefore required to make progress. This configuration is similar to ``three_data_hall``, differing only in that data is stored on two instead of three replicas. This configuration is useful to unblock data distribution when a data hall becomes temporarily unavailable. 
Because ``three_data_hall_fallback`` reduces the redundancy level to two, it should only be used as a temporary measure to restore cluster health during a datacenter outage. + Datacenter-aware mode --------------------- diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index 9d68ab36c6..2cca7fb608 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -379,7 +379,9 @@ "log_server_min_free_space", "log_server_min_free_space_ratio", "storage_server_durability_lag", - "storage_server_list_fetch_failed" + "storage_server_list_fetch_failed", + "blob_worker_lag", + "blob_worker_missing" ] }, "description":"The database is not being saturated by the workload." @@ -400,7 +402,9 @@ "log_server_min_free_space", "log_server_min_free_space_ratio", "storage_server_durability_lag", - "storage_server_list_fetch_failed" + "storage_server_list_fetch_failed", + "blob_worker_lag", + "blob_worker_missing" ] }, "description":"The database is not being saturated by the workload." @@ -599,7 +603,7 @@ "counter":0, "roughness":0.0 }, - "memory_errors":{ // measures number of proxy_memory_limit_exceeded errors + "memory_errors":{ // measures number of (commit/grv)_proxy_memory_limit_exceeded errors "hz":0.0, "counter":0, "roughness":0.0 diff --git a/documentation/sphinx/source/mr-status.rst b/documentation/sphinx/source/mr-status.rst index 5eb404bbd4..ed550cbee7 100644 --- a/documentation/sphinx/source/mr-status.rst +++ b/documentation/sphinx/source/mr-status.rst @@ -131,6 +131,9 @@ min_free_space_ratio Running out of space (approaching 5% limit). log_server_min_free_space Log server running out of space (approaching 100MB limit). log_server_min_free_space_ratio Log server running out of space (approaching 5% limit). storage_server_durability_lag Storage server durable version falling behind. 
+storage_server_list_fetch_failed Unable to fetch storage server list. +blob_worker_lag Blob worker granule version falling behind. +blob_worker_missing No blob workers are reporting metrics. =================================== ==================================================== The JSON path ``cluster.qos.throttled_tags``, when it exists, is an Object containing ``"auto"`` , ``"manual"`` and ``"recommended"``. The possible fields for those objects are in the following table: diff --git a/documentation/sphinx/source/release-notes/release-notes-710.rst b/documentation/sphinx/source/release-notes/release-notes-710.rst index 1cd51ad968..05a33625e8 100644 --- a/documentation/sphinx/source/release-notes/release-notes-710.rst +++ b/documentation/sphinx/source/release-notes/release-notes-710.rst @@ -2,6 +2,30 @@ Release Notes ############# +7.1.21 +====== +* Same as 7.1.20 release with AVX enabled. + +7.1.20 +====== +* Released with AVX disabled. +* Fixed missing localities for fdbserver that can cause cross DC calls among storage servers. `(PR #7995) <https://github.com/apple/foundationdb/pull/7995>`_ +* Removed extremely spammy trace event in FetchKeys and fixed transaction_profiling_analyzer.py. `(PR #7934) <https://github.com/apple/foundationdb/pull/7934>`_ +* Fixed bugs when GRV proxy returns an error. `(PR #7860) <https://github.com/apple/foundationdb/pull/7860>`_ + +7.1.19 +====== +* Same as 7.1.18 release with AVX enabled. + +7.1.18 +====== +* Released with AVX disabled. +* Added knobs for the minimum and the maximum of the Ratekeeper's default priority. `(PR #7820) <https://github.com/apple/foundationdb/pull/7820>`_ +* Fixed bugs in ``getRange`` of the special key space. `(PR #7778) <https://github.com/apple/foundationdb/pull/7778>`_, `(PR #7720) <https://github.com/apple/foundationdb/pull/7720>`_ +* Added debug ID for secondary queries in index prefetching. `(PR #7755) <https://github.com/apple/foundationdb/pull/7755>`_ +* Changed hostname resolving to prefer IPv6 addresses. `(PR #7750) <https://github.com/apple/foundationdb/pull/7750>`_ +* Added more transaction debug events for prefetch queries. `(PR #7732) <https://github.com/apple/foundationdb/pull/7732>`_ + 7.1.17 ====== * Same as 7.1.16 release with AVX enabled. @@ -15,7 +39,7 @@ Release Notes * Fixed ScopeEventFieldTypeMismatch error for TLogMetrics. `(PR #7640) <https://github.com/apple/foundationdb/pull/7640>`_ * Added getMappedRange latency metrics.
`(PR #7632) <https://github.com/apple/foundationdb/pull/7632>`_ * Fixed a version vector performance bug due to not updating client side tag cache. `(PR #7616) <https://github.com/apple/foundationdb/pull/7616>`_ -* Fixed DiskReadSeconds and DiskWriteSeconds calculaion in ProcessMetrics. `(PR #7609) <https://github.com/apple/foundationdb/pull/7609>`_ +* Fixed DiskReadSeconds and DiskWriteSeconds calculation in ProcessMetrics. `(PR #7609) <https://github.com/apple/foundationdb/pull/7609>`_ * Added Rocksdb compression and data size stats. `(PR #7596) <https://github.com/apple/foundationdb/pull/7596>`_ 7.1.15 @@ -74,7 +98,7 @@ Release Notes * Added support of the reboot command in go bindings. `(PR #7270) <https://github.com/apple/foundationdb/pull/7270>`_ * Fixed several issues in profiling special keys using GlobalConfig. `(PR #7120) <https://github.com/apple/foundationdb/pull/7120>`_ * Fixed a stuck transaction system bug due to inconsistent recovery transaction version. `(PR #7261) <https://github.com/apple/foundationdb/pull/7261>`_ -* Fixed a unknown_error crash due to not resolving hostnames. `(PR #7254) <https://github.com/apple/foundationdb/pull/7254>`_ +* Fixed an unknown_error crash due to not resolving hostnames. `(PR #7254) <https://github.com/apple/foundationdb/pull/7254>`_ * Fixed a heap-use-after-free bug. `(PR #7250) <https://github.com/apple/foundationdb/pull/7250>`_ * Fixed a performance issue that remote TLogs are sending too many pops to log routers. `(PR #7235) <https://github.com/apple/foundationdb/pull/7235>`_ * Fixed an issue that SharedTLogs are not displaced and leaking disk space. `(PR #7246) <https://github.com/apple/foundationdb/pull/7246>`_ diff --git a/documentation/sphinx/source/special-keys.rst b/documentation/sphinx/source/special-keys.rst index 45b9576f31..aa5eede4af 100644 --- a/documentation/sphinx/source/special-keys.rst +++ b/documentation/sphinx/source/special-keys.rst @@ -22,6 +22,8 @@ Each special key that existed before api version 630 is its own module. These ar #. ``\xff\xff/cluster_file_path`` - See :ref:`cluster file client access <cluster-file-client-access>` #. ``\xff\xff/status/json`` - See :doc:`Machine-readable status <mr-status>` +#. ``\xff\xff/worker_interfaces`` - Each key is a worker's network address and its value is the serialized ClientWorkerInterface; this module is not transactional + Prior to api version 630, it was also possible to read a range starting at ``\xff\xff/worker_interfaces``. This is mostly an implementation detail of fdbcli, but it's available in api version 630 as a module with prefix ``\xff\xff/worker_interfaces/``.
@@ -210,6 +212,7 @@ that process, and wait for necessary data to be moved away. #. ``\xff\xff/management/options/failed_locality/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/failed_locality/``. Setting this key only has an effect in the current transaction and is not persisted on commit. #. ``\xff\xff/management/tenant/map/<tenant>`` Read/write. Setting a key in this range to any value will result in a tenant being created with name ``<tenant>``. Clearing a key in this range will delete the tenant with name ``<tenant>``. Reading all or a portion of this range will return the list of tenants currently present in the cluster, excluding any changes in this transaction. Values read in this range will be JSON objects containing the metadata for the associated tenants. #. ``\xff\xff/management/tenant/rename/<tenant>`` Read/write. Setting a key in this range to an unused tenant name will result in the tenant with the name ``<tenant>`` to be renamed to the value provided. If the rename operation is a transaction retried in a loop, it is possible for the rename to be applied twice, in which case ``tenant_not_found`` or ``tenant_already_exists`` errors may be returned. This can be avoided by checking for the tenant's existence first. +#. ``\xff\xff/management/options/worker_interfaces/verify`` Read/write. Setting this key adds a verification phase when reading ``\xff\xff/worker_interfaces``: the client tries to establish a connection with every worker in the list returned by the Cluster Controller and returns only those workers it can connect to. Setting this key only has an effect in the current transaction and is not persisted on commit. This option is currently only used by the fdbcli commands ``kill``, ``suspend`` and ``expensive_data_check`` to populate the worker list. An exclusion is syntactically either an ip address (e.g. ``127.0.0.1``), or an ip address and port (e.g.
``127.0.0.1:4500``) or any locality (e.g ``locality_dcid:primary-satellite`` or diff --git a/documentation/sphinx/source/tenants.rst b/documentation/sphinx/source/tenants.rst index d22603b20e..b631c55ba2 100644 --- a/documentation/sphinx/source/tenants.rst +++ b/documentation/sphinx/source/tenants.rst @@ -49,7 +49,7 @@ All operations performed within a tenant transaction will occur within the tenan Raw access ---------- -When operating in the tenant mode ``required_experimental``, transactions are not ordinarily permitted to run without using a tenant. In order to access the system keys or perform maintenance operations that span multiple tenants, it is required to use the ``RAW_ACCESS`` transaction option to access the global key-space. It is an error to specify ``RAW_ACCESS`` on a transaction that is configured to use a tenant. +When operating in the tenant mode ``required_experimental`` or using a metacluster, transactions are not ordinarily permitted to run without using a tenant. In order to access the system keys or perform maintenance operations that span multiple tenants, it is required to use the ``RAW_ACCESS`` transaction option to access the global key-space. It is an error to specify ``RAW_ACCESS`` on a transaction that is configured to use a tenant. .. note :: Setting the ``READ_SYSTEM_KEYS`` or ``ACCESS_SYSTEM_KEYS`` options implies ``RAW_ACCESS`` for your transaction. 
diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 3d7889a36c..1a0f2eba14 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -928,7 +928,7 @@ void parentWatcher(void* parentHandle) { static void printVersion() { printf("FoundationDB " FDB_VT_PACKAGE_NAME " (v" FDB_VT_VERSION ")\n"); printf("source version %s\n", getSourceVersion()); - printf("protocol %llx\n", (long long)currentProtocolVersion.version()); + printf("protocol %llx\n", (long long)currentProtocolVersion().version()); } static void printBuildInformation() { diff --git a/fdbcli/BlobRangeCommand.actor.cpp b/fdbcli/BlobRangeCommand.actor.cpp index b5fa48ff0d..4c6bdf9614 100644 --- a/fdbcli/BlobRangeCommand.actor.cpp +++ b/fdbcli/BlobRangeCommand.actor.cpp @@ -23,6 +23,7 @@ #include "fdbclient/FDBOptions.g.h" #include "fdbclient/IClientApi.h" #include "fdbclient/ManagementAPI.actor.h" +#include "fdbclient/NativeAPI.actor.h" #include "flow/Arena.h" #include "flow/FastRef.h" @@ -31,33 +32,6 @@ namespace { -// copy to standalones for krm -ACTOR Future setBlobRange(Database db, Key startKey, Key endKey, Value value) { - state Reference tr = makeReference(db); - - loop { - try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); - - // FIXME: check that the set range is currently inactive, and that a revoked range is currently its own - // range in the map and fully set. - - tr->set(blobRangeChangeKey, deterministicRandom()->randomUniqueID().toString()); - // This is not coalescing because we want to keep each range logically separate. 
- wait(krmSetRange(tr, blobRangeKeys.begin, KeyRange(KeyRangeRef(startKey, endKey)), value)); - wait(tr->commit()); - printf("Successfully updated blob range [%s - %s) to %s\n", - startKey.printable().c_str(), - endKey.printable().c_str(), - value.printable().c_str()); - return Void(); - } catch (Error& e) { - wait(tr->onError(e)); - } - } -} - ACTOR Future getLatestReadVersion(Database db) { state Transaction tr(db); loop { @@ -78,7 +52,7 @@ ACTOR Future printAfterDelay(double delaySeconds, std::string message) { return Void(); } -ACTOR Future doBlobPurge(Database db, Key startKey, Key endKey, Optional version) { +ACTOR Future doBlobPurge(Database db, Key startKey, Key endKey, Optional version, bool force) { state Version purgeVersion; if (version.present()) { purgeVersion = version.get(); @@ -86,7 +60,7 @@ ACTOR Future doBlobPurge(Database db, Key startKey, Key endKey, OptionalpurgeBlobGranules(KeyRange(KeyRangeRef(startKey, endKey)), purgeVersion, {})); + state Key purgeKey = wait(db->purgeBlobGranules(KeyRange(KeyRangeRef(startKey, endKey)), purgeVersion, {}, force)); fmt::print("Blob purge registered for [{0} - {1}) @ {2}\n", startKey.printable(), endKey.printable(), purgeVersion); @@ -99,65 +73,10 @@ ACTOR Future doBlobPurge(Database db, Key startKey, Key endKey, Optional checkBlobSubrange(Database db, KeyRange keyRange, Optional version) { - state Transaction tr(db); - state Version readVersionOut = invalidVersion; - loop { - try { - wait(success(tr.readBlobGranules(keyRange, 0, version, &readVersionOut))); - return readVersionOut; - } catch (Error& e) { - wait(tr.onError(e)); - } - } -} - ACTOR Future doBlobCheck(Database db, Key startKey, Key endKey, Optional version) { - state Transaction tr(db); - state Version readVersionOut = invalidVersion; state double elapsed = -timer_monotonic(); - state KeyRange range = KeyRange(KeyRangeRef(startKey, endKey)); - state Standalone> allRanges; - loop { - try { - wait(store(allRanges, tr.getBlobGranuleRanges(range))); 
- break; - } catch (Error& e) { - wait(tr.onError(e)); - } - } - if (allRanges.empty()) { - fmt::print("ERROR: No blob ranges for [{0} - {1})\n", startKey.printable(), endKey.printable()); - return Void(); - } - fmt::print("Loaded {0} blob ranges to check\n", allRanges.size()); - state std::vector> checkParts; - // chunk up to smaller ranges than max - int maxChunkSize = 1000; - KeyRange currentChunk; - int currentChunkSize = 0; - for (auto& it : allRanges) { - if (currentChunkSize == maxChunkSize) { - checkParts.push_back(checkBlobSubrange(db, currentChunk, version)); - currentChunkSize = 0; - } - if (currentChunkSize == 0) { - currentChunk = it; - } else if (it.begin != currentChunk.end) { - fmt::print("ERROR: Blobrange check failed, gap in blob ranges from [{0} - {1})\n", - currentChunk.end.printable(), - it.begin.printable()); - return Void(); - } else { - currentChunk = KeyRangeRef(currentChunk.begin, it.end); - } - currentChunkSize++; - } - checkParts.push_back(checkBlobSubrange(db, currentChunk, version)); - - wait(waitForAll(checkParts)); - readVersionOut = checkParts.back().get(); + state Version readVersionOut = wait(db->verifyBlobRange(KeyRangeRef(startKey, endKey), version)); elapsed += timer_monotonic(); @@ -201,7 +120,7 @@ ACTOR Future blobRangeCommandActor(Database localDb, fmt::print("Invalid blob range [{0} - {1})\n", tokens[2].printable(), tokens[3].printable()); } else { if (tokencmp(tokens[1], "start") || tokencmp(tokens[1], "stop")) { - bool starting = tokencmp(tokens[1], "start"); + state bool starting = tokencmp(tokens[1], "start"); if (tokens.size() > 4) { printUsage(tokens[0]); return false; @@ -210,9 +129,22 @@ ACTOR Future blobRangeCommandActor(Database localDb, starting ? "Starting" : "Stopping", tokens[2].printable().c_str(), tokens[3].printable().c_str()); - wait(setBlobRange(localDb, begin, end, starting ? 
LiteralStringRef("1") : StringRef())); - } else if (tokencmp(tokens[1], "purge") || tokencmp(tokens[1], "check")) { - bool purge = tokencmp(tokens[1], "purge"); + state bool success = false; + if (starting) { + wait(store(success, localDb->blobbifyRange(KeyRangeRef(begin, end)))); + } else { + wait(store(success, localDb->unblobbifyRange(KeyRangeRef(begin, end)))); + } + if (!success) { + fmt::print("{0} blobbify range for [{1} - {2}) failed\n", + starting ? "Starting" : "Stopping", + tokens[2].printable().c_str(), + tokens[3].printable().c_str()); + } + return success; + } else if (tokencmp(tokens[1], "purge") || tokencmp(tokens[1], "forcepurge") || tokencmp(tokens[1], "check")) { + bool purge = tokencmp(tokens[1], "purge") || tokencmp(tokens[1], "forcepurge"); + bool forcePurge = tokencmp(tokens[1], "forcepurge"); Optional version; if (tokens.size() > 4) { @@ -225,17 +157,18 @@ ACTOR Future blobRangeCommandActor(Database localDb, version = v; } - fmt::print("{0} blob range [{1} - {2})", + fmt::print("{0} blob range [{1} - {2}){3}", purge ? "Purging" : "Checking", tokens[2].printable(), - tokens[3].printable()); + tokens[3].printable(), + forcePurge ? 
" (force)" : ""); if (version.present()) { fmt::print(" @ {0}", version.get()); } fmt::print("\n"); if (purge) { - wait(doBlobPurge(localDb, begin, end, version)); + wait(doBlobPurge(localDb, begin, end, version, forcePurge)); } else { wait(doBlobCheck(localDb, begin, end, version)); } @@ -247,8 +180,7 @@ ACTOR Future blobRangeCommandActor(Database localDb, return true; } -CommandFactory blobRangeFactory("blobrange", - CommandHelp("blobrange [version]", - "", - "")); +CommandFactory blobRangeFactory( + "blobrange", + CommandHelp("blobrange [version]", "", "")); } // namespace fdb_cli diff --git a/fdbcli/ConfigureCommand.actor.cpp b/fdbcli/ConfigureCommand.actor.cpp index 37474242e1..52521ea677 100644 --- a/fdbcli/ConfigureCommand.actor.cpp +++ b/fdbcli/ConfigureCommand.actor.cpp @@ -272,6 +272,10 @@ ACTOR Future configureCommandActor(Reference db, stderr, "WARN: Sharded RocksDB storage engine type is still in experimental stage, not yet production tested.\n"); break; + case ConfigurationResult::DATABASE_IS_REGISTERED: + fprintf(stderr, "ERROR: A cluster cannot change its tenant mode while part of a metacluster.\n"); + ret = false; + break; default: ASSERT(false); ret = false; diff --git a/fdbcli/ExpensiveDataCheckCommand.actor.cpp b/fdbcli/ExpensiveDataCheckCommand.actor.cpp index e9d5c5b989..3be572d3d1 100644 --- a/fdbcli/ExpensiveDataCheckCommand.actor.cpp +++ b/fdbcli/ExpensiveDataCheckCommand.actor.cpp @@ -46,7 +46,7 @@ ACTOR Future expensiveDataCheckCommandActor( if (tokens.size() == 1) { // initialize worker interfaces address_interface->clear(); - wait(getWorkerInterfaces(tr, address_interface)); + wait(getWorkerInterfaces(tr, address_interface, true)); } if (tokens.size() == 1 || tokencmp(tokens[1], "list")) { if (address_interface->size() == 0) { diff --git a/fdbcli/KillCommand.actor.cpp b/fdbcli/KillCommand.actor.cpp index d025b10388..c8fa75bb1c 100644 --- a/fdbcli/KillCommand.actor.cpp +++ b/fdbcli/KillCommand.actor.cpp @@ -44,7 +44,7 @@ ACTOR Future 
killCommandActor(Reference db, if (tokens.size() == 1) { // initialize worker interfaces address_interface->clear(); - wait(getWorkerInterfaces(tr, address_interface)); + wait(getWorkerInterfaces(tr, address_interface, true)); } if (tokens.size() == 1 || tokencmp(tokens[1], "list")) { if (address_interface->size() == 0) { diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp new file mode 100644 index 0000000000..da7c0f79fd --- /dev/null +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -0,0 +1,432 @@ +/* + * MetaclusterCommands.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbcli/fdbcli.actor.h" + +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/IClientApi.h" +#include "fdbclient/Knobs.h" +#include "fdbclient/MetaclusterManagement.actor.h" +#include "fdbclient/Schemas.h" + +#include "flow/Arena.h" +#include "flow/FastRef.h" +#include "flow/ThreadHelper.actor.h" +#include "flow/actorcompiler.h" // This must be the last #include. 
+ +namespace fdb_cli { + +Optional, Optional>> +parseClusterConfiguration(std::vector const& tokens, DataClusterEntry const& defaults, int startIndex) { + Optional entry; + Optional connectionString; + + std::set usedParams; + for (int tokenNum = startIndex; tokenNum < tokens.size(); ++tokenNum) { + StringRef token = tokens[tokenNum]; + bool foundEquals; + StringRef param = token.eat("=", &foundEquals); + if (!foundEquals) { + fmt::print(stderr, + "ERROR: invalid configuration string `{}'. String must specify a value using `='.\n", + param.toString().c_str()); + return {}; + } + std::string value = token.toString(); + if (!usedParams.insert(value).second) { + fmt::print( + stderr, "ERROR: configuration parameter `{}' specified more than once.\n", param.toString().c_str()); + return {}; + } + if (tokencmp(param, "max_tenant_groups")) { + entry = defaults; + + int n; + if (sscanf(value.c_str(), "%d%n", &entry.get().capacity.numTenantGroups, &n) != 1 || n != value.size() || + entry.get().capacity.numTenantGroups < 0) { + fmt::print(stderr, "ERROR: invalid number of tenant groups `{}'.\n", value.c_str()); + return {}; + } + } else if (tokencmp(param, "connection_string")) { + connectionString = ClusterConnectionString(value); + } else { + fmt::print(stderr, "ERROR: unrecognized configuration parameter `{}'.\n", param.toString().c_str()); + return {}; + } + } + + return std::make_pair(connectionString, entry); +} + +void printMetaclusterConfigureOptionsUsage() { + fmt::print("max_tenant_groups sets the maximum number of tenant groups that can be assigned\n" + "to the named data cluster.\n"); + fmt::print("connection_string sets the connection string for the named data cluster.\n"); +} + +// metacluster create command +ACTOR Future metaclusterCreateCommand(Reference db, std::vector tokens) { + if (tokens.size() != 3) { + fmt::print("Usage: metacluster create_experimental \n\n"); + fmt::print("Configures the cluster to be a management cluster in a metacluster.\n"); + 
fmt::print("NAME is an identifier used to distinguish this metacluster from other metaclusters.\n"); + return false; + } + + Optional errorStr = wait(MetaclusterAPI::createMetacluster(db, tokens[2])); + if (errorStr.present()) { + fmt::print("ERROR: {}.\n", errorStr.get()); + } else { + fmt::print("The cluster has been configured as a metacluster.\n"); + } + return true; +} + +// metacluster decommission command +ACTOR Future metaclusterDecommissionCommand(Reference db, std::vector tokens) { + if (tokens.size() != 2) { + fmt::print("Usage: metacluster decommission\n\n"); + fmt::print("Converts the current cluster from a metacluster management cluster back into an\n"); + fmt::print("ordinary cluster. It must be called on a cluster with no registered data clusters.\n"); + return false; + } + + wait(MetaclusterAPI::decommissionMetacluster(db)); + + fmt::print("The cluster is no longer a metacluster.\n"); + return true; +} + +// metacluster register command +ACTOR Future metaclusterRegisterCommand(Reference db, std::vector tokens) { + if (tokens.size() < 4) { + fmt::print("Usage: metacluster register connection_string=\n" + "[max_tenant_groups=]\n\n"); + fmt::print("Adds a data cluster to a metacluster.\n"); + fmt::print("NAME is used to identify the cluster in future commands.\n"); + printMetaclusterConfigureOptionsUsage(); + return false; + } + + DataClusterEntry defaultEntry; + auto config = parseClusterConfiguration(tokens, defaultEntry, 3); + if (!config.present()) { + return false; + } else if (!config.get().first.present()) { + fmt::print(stderr, "ERROR: connection_string must be configured when registering a cluster.\n"); + return false; + } + + wait(MetaclusterAPI::registerCluster( + db, tokens[2], config.get().first.get(), config.get().second.orDefault(defaultEntry))); + + fmt::print("The cluster `{}' has been added\n", printable(tokens[2]).c_str()); + return true; +} + +// metacluster remove command +ACTOR Future metaclusterRemoveCommand(Reference db, 
std::vector tokens) { + if (tokens.size() < 3 || tokens.size() > 4 || (tokens.size() == 4 && tokens[2] != "FORCE"_sr)) { + fmt::print("Usage: metacluster remove [FORCE] \n\n"); + fmt::print("Removes the specified data cluster from a metacluster.\n"); + fmt::print("If FORCE is specified, then the cluster will be detached even if it has\n" + "tenants assigned to it.\n"); + return false; + } + + state ClusterNameRef clusterName = tokens[tokens.size() - 1]; + wait(MetaclusterAPI::removeCluster(db, clusterName, tokens.size() == 4)); + + fmt::print("The cluster `{}' has been removed\n", printable(clusterName).c_str()); + return true; +} + +// metacluster configure command +ACTOR Future metaclusterConfigureCommand(Reference db, std::vector tokens) { + if (tokens.size() < 4) { + fmt::print("Usage: metacluster configure |\n" + "connection_string=> ...\n\n"); + fmt::print("Updates the configuration of the metacluster.\n"); + printMetaclusterConfigureOptionsUsage(); + return false; + } + + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + Optional metadata = wait(MetaclusterAPI::tryGetClusterTransaction(tr, tokens[2])); + if (!metadata.present()) { + throw cluster_not_found(); + } + + auto config = parseClusterConfiguration(tokens, metadata.get().entry, 3); + if (!config.present()) { + return false; + } + + MetaclusterAPI::updateClusterMetadata( + tr, tokens[2], metadata.get(), config.get().first, config.get().second); + + wait(safeThreadFutureToFuture(tr->commit())); + break; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + + return true; +} + +// metacluster list command +ACTOR Future metaclusterListCommand(Reference db, std::vector tokens) { + if (tokens.size() > 5) { + fmt::print("Usage: metacluster list [BEGIN] [END] [LIMIT]\n\n"); + fmt::print("Lists the data clusters in a 
metacluster.\n"); + fmt::print("Only cluster names in the range BEGIN - END will be printed.\n"); + fmt::print("An optional LIMIT can be specified to limit the number of results (default 100).\n"); + return false; + } + + state ClusterNameRef begin = tokens.size() > 2 ? tokens[2] : ""_sr; + state ClusterNameRef end = tokens.size() > 3 ? tokens[3] : "\xff"_sr; + int limit = 100; + + if (tokens.size() > 4) { + int n = 0; + if (sscanf(tokens[3].toString().c_str(), "%d%n", &limit, &n) != 1 || n != tokens[3].size() || limit < 0) { + fmt::print(stderr, "ERROR: invalid limit {}\n", tokens[3].toString().c_str()); + return false; + } + } + + std::map clusters = wait(MetaclusterAPI::listClusters(db, begin, end, limit)); + if (clusters.empty()) { + if (tokens.size() == 2) { + fmt::print("The metacluster has no registered data clusters\n"); + } else { + fmt::print("The metacluster has no registered data clusters in the specified range\n"); + } + } + + int index = 0; + for (auto cluster : clusters) { + fmt::print(" {}. 
{}\n", ++index, printable(cluster.first).c_str()); + } + + return true; +} + +// metacluster get command +ACTOR Future metaclusterGetCommand(Reference db, std::vector tokens) { + if (tokens.size() > 4 || (tokens.size() == 4 && tokens[3] != "JSON"_sr)) { + fmt::print("Usage: metacluster get [JSON]\n\n"); + fmt::print("Prints metadata associated with the given data cluster.\n"); + fmt::print("If JSON is specified, then the output will be in JSON format.\n"); + return false; + } + + state bool useJson = tokens.size() == 4; + + try { + DataClusterMetadata metadata = wait(MetaclusterAPI::getCluster(db, tokens[2])); + + if (useJson) { + json_spirit::mObject obj; + obj["type"] = "success"; + obj["cluster"] = metadata.toJson(); + fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str()); + } else { + fmt::print(" connection string: {}\n", metadata.connectionString.toString().c_str()); + fmt::print(" cluster state: {}\n", DataClusterEntry::clusterStateToString(metadata.entry.clusterState)); + fmt::print(" tenant group capacity: {}\n", metadata.entry.capacity.numTenantGroups); + fmt::print(" allocated tenant groups: {}\n", metadata.entry.allocated.numTenantGroups); + } + } catch (Error& e) { + if (useJson) { + json_spirit::mObject obj; + obj["type"] = "error"; + obj["error"] = e.what(); + fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str()); + return false; + } else { + throw; + } + } + + return true; +} + +// metacluster status command +ACTOR Future metaclusterStatusCommand(Reference db, std::vector tokens) { + if (tokens.size() < 2 || tokens.size() > 3) { + fmt::print("Usage: metacluster status [JSON]\n\n"); + fmt::print("Prints metacluster metadata.\n"); + fmt::print("If JSON is specified, then the output will be in JSON format.\n"); + return false; + } + + state bool useJson = tokens.size() == 3; + + try { + std::map clusters = + wait(MetaclusterAPI::listClusters(db, 
""_sr, "\xff"_sr, CLIENT_KNOBS->MAX_DATA_CLUSTERS)); + + ClusterUsage totalCapacity; + ClusterUsage totalAllocated; + for (auto cluster : clusters) { + totalCapacity.numTenantGroups += + std::max(cluster.second.entry.capacity.numTenantGroups, cluster.second.entry.allocated.numTenantGroups); + totalAllocated.numTenantGroups += cluster.second.entry.allocated.numTenantGroups; + } + + if (useJson) { + json_spirit::mObject obj; + obj["type"] = "success"; + + json_spirit::mObject metaclusterObj; + metaclusterObj["data_clusters"] = (int)clusters.size(); + metaclusterObj["capacity"] = totalCapacity.toJson(); + metaclusterObj["allocated"] = totalAllocated.toJson(); + + obj["metacluster"] = metaclusterObj; + fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str()); + } else { + fmt::print(" number of data clusters: {}\n", clusters.size()); + fmt::print(" tenant group capacity: {}\n", totalCapacity.numTenantGroups); + fmt::print(" allocated tenant groups: {}\n", totalAllocated.numTenantGroups); + } + + return true; + } catch (Error& e) { + if (useJson) { + json_spirit::mObject obj; + obj["type"] = "error"; + obj["error"] = e.what(); + fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str()); + return false; + } else { + throw; + } + } +} + +// metacluster command +Future metaclusterCommand(Reference db, std::vector tokens) { + if (tokens.size() == 1) { + printUsage(tokens[0]); + return true; + } else if (tokencmp(tokens[1], "create_experimental")) { + return metaclusterCreateCommand(db, tokens); + } else if (tokencmp(tokens[1], "decommission")) { + return metaclusterDecommissionCommand(db, tokens); + } else if (tokencmp(tokens[1], "register")) { + return metaclusterRegisterCommand(db, tokens); + } else if (tokencmp(tokens[1], "remove")) { + return metaclusterRemoveCommand(db, tokens); + } else if (tokencmp(tokens[1], "configure")) { + return metaclusterConfigureCommand(db, 
tokens); + } else if (tokencmp(tokens[1], "list")) { + return metaclusterListCommand(db, tokens); + } else if (tokencmp(tokens[1], "get")) { + return metaclusterGetCommand(db, tokens); + } else if (tokencmp(tokens[1], "status")) { + return metaclusterStatusCommand(db, tokens); + } else { + printUsage(tokens[0]); + return true; + } +} + +void metaclusterGenerator(const char* text, + const char* line, + std::vector& lc, + std::vector const& tokens) { + if (tokens.size() == 1) { + const char* opts[] = { + "create_experimental", "decommission", "register", "remove", "configure", "list", "get", "status", nullptr + }; + arrayGenerator(text, line, opts, lc); + } else if (tokens.size() > 1 && (tokencmp(tokens[1], "register") || tokencmp(tokens[1], "configure"))) { + const char* opts[] = { "max_tenant_groups=", "connection_string=", nullptr }; + arrayGenerator(text, line, opts, lc); + } else if ((tokens.size() == 2 && tokencmp(tokens[1], "status")) || + (tokens.size() == 3 && tokencmp(tokens[1], "get"))) { + const char* opts[] = { "JSON", nullptr }; + arrayGenerator(text, line, opts, lc); + } +} + +std::vector metaclusterHintGenerator(std::vector const& tokens, bool inArgument) { + if (tokens.size() == 1) { + return { "", "[ARGS]" }; + } else if (tokencmp(tokens[1], "create_experimental")) { + return { "" }; + } else if (tokencmp(tokens[1], "decommission")) { + return {}; + } else if (tokencmp(tokens[1], "register") && tokens.size() < 5) { + static std::vector opts = { "", + "connection_string=", + "[max_tenant_groups=]" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokencmp(tokens[1], "remove") && tokens.size() < 4) { + static std::vector opts = { "[FORCE]", "" }; + if (tokens.size() == 2) { + return opts; + } else if (tokens.size() == 3 && (inArgument || tokens[2].size() == "FORCE"_sr.size()) && + "FORCE"_sr.startsWith(tokens[2])) { + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else { + return {}; + } + } 
else if (tokencmp(tokens[1], "configure")) { + static std::vector opts = { + "", "|connection_string=>" + }; + return std::vector(opts.begin() + std::min(1, tokens.size() - 2), opts.end()); + } else if (tokencmp(tokens[1], "list") && tokens.size() < 5) { + static std::vector opts = { "[BEGIN]", "[END]", "[LIMIT]" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokencmp(tokens[1], "get") && tokens.size() < 4) { + static std::vector opts = { "", "[JSON]" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokencmp(tokens[1], "status") && tokens.size() == 2) { + return { "[JSON]" }; + } else { + return {}; + } +} + +CommandFactory metaclusterRegisterFactory( + "metacluster", + CommandHelp("metacluster [ARGS]", + "view and manage a metacluster", + "`create_experimental' and `decommission' set up or deconfigure a metacluster.\n" + "`register' and `remove' add and remove data clusters from the metacluster.\n" + "`configure' updates the configuration of a data cluster.\n" + "`list' prints a list of data clusters in the metacluster.\n" + "`get' prints the metadata for a particular data cluster.\n" + "`status' prints metacluster metadata.\n"), + &metaclusterGenerator, + &metaclusterHintGenerator); + +} // namespace fdb_cli diff --git a/fdbcli/StatusCommand.actor.cpp b/fdbcli/StatusCommand.actor.cpp index f8749c0bce..67dcfa946d 100644 --- a/fdbcli/StatusCommand.actor.cpp +++ b/fdbcli/StatusCommand.actor.cpp @@ -411,6 +411,7 @@ void printStatus(StatusObjectReader statusObj, outputString += "\nConfiguration:"; std::string outputStringCache = outputString; bool isOldMemory = false; + bool blobGranuleEnabled{ false }; try { // Configuration section // FIXME: Should we suppress this if there are cluster messages implying that the database has no @@ -434,7 +435,6 @@ void printStatus(StatusObjectReader statusObj, outputString += "unknown"; int intVal = 0; - bool blobGranuleEnabled{ false }; if 
(statusObjConfig.get("blob_granules_enabled", intVal) && intVal) { blobGranuleEnabled = true; } @@ -1110,6 +1110,15 @@ void printStatus(StatusObjectReader statusObj, outputString += "\n\nCoordination servers:"; outputString += getCoordinatorsInfoString(statusObj); } + + if (blobGranuleEnabled) { + outputString += "\n\nBlob Granules:"; + StatusObjectReader statusObjBlobGranules = statusObjCluster["blob_granules"]; + auto numWorkers = statusObjBlobGranules["number_of_blob_workers"].get_int(); + outputString += "\n Number of Workers - " + format("%d", numWorkers); + auto numKeyRanges = statusObjBlobGranules["number_of_key_ranges"].get_int(); + outputString += "\n Number of Key Ranges - " + format("%d", numKeyRanges); + } } // client time diff --git a/fdbcli/SuspendCommand.actor.cpp b/fdbcli/SuspendCommand.actor.cpp index 78a7fa1ed9..483ad4e445 100644 --- a/fdbcli/SuspendCommand.actor.cpp +++ b/fdbcli/SuspendCommand.actor.cpp @@ -43,7 +43,7 @@ ACTOR Future suspendCommandActor(Reference db, if (tokens.size() == 1) { // initialize worker interfaces address_interface->clear(); - wait(getWorkerInterfaces(tr, address_interface)); + wait(getWorkerInterfaces(tr, address_interface, true)); if (address_interface->size() == 0) { printf("\nNo addresses can be suspended.\n"); } else if (address_interface->size() == 1) { diff --git a/fdbcli/TenantCommands.actor.cpp b/fdbcli/TenantCommands.actor.cpp index eebc556133..7648d8dbd8 100644 --- a/fdbcli/TenantCommands.actor.cpp +++ b/fdbcli/TenantCommands.actor.cpp @@ -25,6 +25,7 @@ #include "fdbclient/IClientApi.h" #include "fdbclient/Knobs.h" #include "fdbclient/ManagementAPI.actor.h" +#include "fdbclient/MetaclusterManagement.actor.h" #include "fdbclient/TenantManagement.actor.h" #include "fdbclient/Schemas.h" @@ -100,9 +101,9 @@ Key makeConfigKey(TenantNameRef tenantName, StringRef configName) { return tenantConfigSpecialKeyRange.begin.withSuffix(Tuple().append(tenantName).append(configName).pack()); } -void 
applyConfiguration(Reference tr, - TenantNameRef tenantName, - std::map, Optional> configuration) { +void applyConfigurationToSpecialKeys(Reference tr, + TenantNameRef tenantName, + std::map, Optional> configuration) { for (auto [configName, value] : configuration) { if (value.present()) { tr->set(makeConfigKey(tenantName, configName), value.get()); @@ -136,21 +137,32 @@ ACTOR Future createTenantCommandActor(Reference db, std::vector } loop { - tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); try { - if (!doneExistenceCheck) { - // Hold the reference to the standalone's memory - state ThreadFuture> existingTenantFuture = tr->get(tenantNameKey); - Optional existingTenant = wait(safeThreadFutureToFuture(existingTenantFuture)); - if (existingTenant.present()) { - throw tenant_already_exists(); + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + TenantMapEntry tenantEntry; + for (auto const& [name, value] : configuration.get()) { + tenantEntry.configure(name, value); } - doneExistenceCheck = true; + wait(MetaclusterAPI::createTenant(db, tokens[1], tenantEntry)); + } else { + if (!doneExistenceCheck) { + // Hold the reference to the standalone's memory + state ThreadFuture> existingTenantFuture = tr->get(tenantNameKey); + Optional existingTenant = wait(safeThreadFutureToFuture(existingTenantFuture)); + if (existingTenant.present()) { + throw tenant_already_exists(); + } + doneExistenceCheck = true; + } + + tr->set(tenantNameKey, ValueRef()); + applyConfigurationToSpecialKeys(tr, tokens[1], configuration.get()); + wait(safeThreadFutureToFuture(tr->commit())); } - tr->set(tenantNameKey, ValueRef()); - applyConfiguration(tr, tokens[1], configuration.get()); - wait(safeThreadFutureToFuture(tr->commit())); break; } catch (Error& e) { state Error 
err(e); @@ -167,10 +179,12 @@ ACTOR Future createTenantCommandActor(Reference db, std::vector return true; } -CommandFactory createTenantFactory("createtenant", - CommandHelp("createtenant [tenant_group=]", - "creates a new tenant in the cluster", - "Creates a new tenant in the cluster with the specified name.")); +CommandFactory createTenantFactory( + "createtenant", + CommandHelp("createtenant [tenant_group=]", + "creates a new tenant in the cluster", + "Creates a new tenant in the cluster with the specified name. An optional group can be specified" + "that will require this tenant to be placed on the same cluster as other tenants in the same group.")); // deletetenant command ACTOR Future deleteTenantCommandActor(Reference db, std::vector tokens, int apiVersion) { @@ -184,20 +198,27 @@ ACTOR Future deleteTenantCommandActor(Reference db, std::vector state bool doneExistenceCheck = false; loop { - tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); try { - if (!doneExistenceCheck) { - // Hold the reference to the standalone's memory - state ThreadFuture> existingTenantFuture = tr->get(tenantNameKey); - Optional existingTenant = wait(safeThreadFutureToFuture(existingTenantFuture)); - if (!existingTenant.present()) { - throw tenant_not_found(); + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + wait(MetaclusterAPI::deleteTenant(db, tokens[1])); + } else { + if (!doneExistenceCheck) { + // Hold the reference to the standalone's memory + state ThreadFuture> existingTenantFuture = tr->get(tenantNameKey); + Optional existingTenant = wait(safeThreadFutureToFuture(existingTenantFuture)); + if (!existingTenant.present()) { + throw tenant_not_found(); + } + doneExistenceCheck = true; } - doneExistenceCheck = true; + + tr->clear(tenantNameKey); + 
wait(safeThreadFutureToFuture(tr->commit())); } - tr->clear(tenantNameKey); - wait(safeThreadFutureToFuture(tr->commit())); break; } catch (Error& e) { state Error err(e); @@ -228,8 +249,8 @@ ACTOR Future listTenantsCommandActor(Reference db, std::vector< return false; } - StringRef beginTenant = ""_sr; - StringRef endTenant = "\xff\xff"_sr; + state StringRef beginTenant = ""_sr; + state StringRef endTenant = "\xff\xff"_sr; state int limit = 100; if (tokens.size() >= 2) { @@ -256,12 +277,26 @@ ACTOR Future listTenantsCommandActor(Reference db, std::vector< loop { try { - // Hold the reference to the standalone's memory - state ThreadFuture kvsFuture = - tr->getRange(firstGreaterOrEqual(beginTenantKey), firstGreaterOrEqual(endTenantKey), limit); - RangeResult tenants = wait(safeThreadFutureToFuture(kvsFuture)); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + state std::vector tenantNames; + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + std::vector> tenants = + wait(MetaclusterAPI::listTenantsTransaction(tr, beginTenant, endTenant, limit)); + for (auto tenant : tenants) { + tenantNames.push_back(tenant.first); + } + } else { + // Hold the reference to the standalone's memory + state ThreadFuture kvsFuture = + tr->getRange(firstGreaterOrEqual(beginTenantKey), firstGreaterOrEqual(endTenantKey), limit); + RangeResult tenants = wait(safeThreadFutureToFuture(kvsFuture)); + for (auto tenant : tenants) { + tenantNames.push_back(tenant.key.removePrefix(tenantMapSpecialKeyRange(apiVersion).begin)); + } + } - if (tenants.empty()) { + if (tenantNames.empty()) { if (tokens.size() == 1) { fmt::print("The cluster has no tenants\n"); } else { @@ -270,10 +305,8 @@ ACTOR Future listTenantsCommandActor(Reference db, std::vector< } int index = 0; - for (auto tenant : tenants) { - fmt::print(" {}. 
{}\n", - ++index, - printable(tenant.key.removePrefix(tenantMapSpecialKeyRange(apiVersion).begin)).c_str()); + for (auto tenantName : tenantNames) { + fmt::print(" {}. {}\n", ++index, printable(tenantName).c_str()); } return true; @@ -309,15 +342,24 @@ ACTOR Future getTenantCommandActor(Reference db, std::vector> tenantFuture = tr->get(tenantNameKey); - Optional tenant = wait(safeThreadFutureToFuture(tenantFuture)); - if (!tenant.present()) { - throw tenant_not_found(); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + state std::string tenantJson; + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + TenantMapEntry entry = wait(MetaclusterAPI::getTenantTransaction(tr, tokens[1])); + tenantJson = entry.toJson(apiVersion); + } else { + // Hold the reference to the standalone's memory + state ThreadFuture> tenantFuture = tr->get(tenantNameKey); + Optional tenant = wait(safeThreadFutureToFuture(tenantFuture)); + if (!tenant.present()) { + throw tenant_not_found(); + } + tenantJson = tenant.get().toString(); } json_spirit::mValue jsonObject; - json_spirit::read_string(tenant.get().toString(), jsonObject); + json_spirit::read_string(tenantJson, jsonObject); if (useJson) { json_spirit::mObject resultObj; @@ -333,6 +375,7 @@ ACTOR Future getTenantCommandActor(Reference db, std::vector getTenantCommandActor(Reference db, std::vector getTenantCommandActor(Reference db, std::vector configureTenantCommandActor(Reference db, std::vec state Reference tr = db->createTransaction(); loop { - tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); try { - applyConfiguration(tr, tokens[1], configuration.get()); - wait(safeThreadFutureToFuture(tr->commit())); + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + if (clusterType == 
ClusterType::METACLUSTER_MANAGEMENT) { + TenantMapEntry tenantEntry; + wait(MetaclusterAPI::configureTenant(db, tokens[1], configuration.get())); + } else { + applyConfigurationToSpecialKeys(tr, tokens[1], configuration.get()); + wait(safeThreadFutureToFuture(tr->commit())); + } break; } catch (Error& e) { state Error err(e); @@ -456,50 +509,56 @@ ACTOR Future renameTenantCommandActor(Reference db, std::vector state Key tenantOldNameKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(tokens[1]); state Key tenantNewNameKey = tenantMapSpecialKeyRange(apiVersion).begin.withSuffix(tokens[2]); state bool firstTry = true; - state int64_t id; + state int64_t id = -1; loop { - tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); try { - // Hold the reference to the standalone's memory - state ThreadFuture> oldEntryFuture = tr->get(tenantOldNameKey); - state ThreadFuture> newEntryFuture = tr->get(tenantNewNameKey); - state Optional oldEntry = wait(safeThreadFutureToFuture(oldEntryFuture)); - state Optional newEntry = wait(safeThreadFutureToFuture(newEntryFuture)); - if (firstTry) { - if (!oldEntry.present()) { - throw tenant_not_found(); - } - if (newEntry.present()) { - throw tenant_already_exists(); - } - // Store the id we see when first reading this key - id = getTenantId(oldEntry.get()); - - firstTry = false; + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + state ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + wait(MetaclusterAPI::renameTenant(db, tokens[1], tokens[2])); } else { - // If we got commit_unknown_result, the rename may have already occurred. 
- if (newEntry.present()) { - int64_t checkId = getTenantId(newEntry.get()); - if (id == checkId) { - ASSERT(!oldEntry.present() || getTenantId(oldEntry.get()) != id); - return true; + // Hold the reference to the standalone's memory + state ThreadFuture> oldEntryFuture = tr->get(tenantOldNameKey); + state ThreadFuture> newEntryFuture = tr->get(tenantNewNameKey); + state Optional oldEntry = wait(safeThreadFutureToFuture(oldEntryFuture)); + state Optional newEntry = wait(safeThreadFutureToFuture(newEntryFuture)); + if (firstTry) { + if (!oldEntry.present()) { + throw tenant_not_found(); + } + if (newEntry.present()) { + throw tenant_already_exists(); + } + // Store the id we see when first reading this key + id = getTenantId(oldEntry.get()); + + firstTry = false; + } else { + // If we got commit_unknown_result, the rename may have already occurred. + if (newEntry.present()) { + int64_t checkId = getTenantId(newEntry.get()); + if (id == checkId) { + ASSERT(!oldEntry.present() || getTenantId(oldEntry.get()) != id); + return true; + } + // If the new entry is present but does not match, then + // the rename should fail, so we throw an error. + throw tenant_already_exists(); + } + if (!oldEntry.present()) { + throw tenant_not_found(); + } + int64_t checkId = getTenantId(oldEntry.get()); + // If the id has changed since we made our first attempt, + // then it's possible we've already moved the tenant. Don't move it again. + if (id != checkId) { + throw tenant_not_found(); } - // If the new entry is present but does not match, then - // the rename should fail, so we throw an error. - throw tenant_already_exists(); - } - if (!oldEntry.present()) { - throw tenant_not_found(); - } - int64_t checkId = getTenantId(oldEntry.get()); - // If the id has changed since we made our first attempt, - // then it's possible we've already moved the tenant. Don't move it again. 
- if (id != checkId) { - throw tenant_not_found(); } + tr->set(tenantRenameKey, tokens[2]); + wait(safeThreadFutureToFuture(tr->commit())); } - tr->set(tenantRenameKey, tokens[2]); - wait(safeThreadFutureToFuture(tr->commit())); break; } catch (Error& e) { state Error err(e); diff --git a/fdbcli/Util.actor.cpp b/fdbcli/Util.actor.cpp index d40a5dcaeb..2d0e77d9fe 100644 --- a/fdbcli/Util.actor.cpp +++ b/fdbcli/Util.actor.cpp @@ -62,56 +62,52 @@ ACTOR Future getSpecialKeysFailureErrorMessage(Reference verifyAndAddInterface(std::map>* address_interface, - Reference connectLock, - KeyValue kv) { - wait(connectLock->take()); - state FlowLock::Releaser releaser(*connectLock); - state ClientWorkerInterface workerInterf; - try { - // the interface is back-ward compatible, thus if parsing failed, it needs to upgrade cli version - workerInterf = BinaryReader::fromStringRef(kv.value, IncludeVersion()); - } catch (Error& e) { - fprintf(stderr, "Error: %s; CLI version is too old, please update to use a newer version\n", e.what()); - return Void(); - } - state ClientLeaderRegInterface leaderInterf(workerInterf.address()); - choose { - when(Optional rep = - wait(brokenPromiseToNever(leaderInterf.getLeader.getReply(GetLeaderRequest())))) { - StringRef ip_port = - (kv.key.endsWith(LiteralStringRef(":tls")) ? kv.key.removeSuffix(LiteralStringRef(":tls")) : kv.key) - .removePrefix(LiteralStringRef("\xff\xff/worker_interfaces/")); - (*address_interface)[ip_port] = std::make_pair(kv.value, leaderInterf); - - if (workerInterf.reboot.getEndpoint().addresses.secondaryAddress.present()) { - Key full_ip_port2 = - StringRef(workerInterf.reboot.getEndpoint().addresses.secondaryAddress.get().toString()); - StringRef ip_port2 = full_ip_port2.endsWith(LiteralStringRef(":tls")) - ? 
full_ip_port2.removeSuffix(LiteralStringRef(":tls")) - : full_ip_port2; - (*address_interface)[ip_port2] = std::make_pair(kv.value, leaderInterf); - } +void addInterfacesFromKVs(RangeResult& kvs, + std::map>* address_interface) { + for (const auto& kv : kvs) { + ClientWorkerInterface workerInterf; + try { + // the interface is back-ward compatible, thus if parsing failed, it needs to upgrade cli version + workerInterf = BinaryReader::fromStringRef(kv.value, IncludeVersion()); + } catch (Error& e) { + fprintf(stderr, "Error: %s; CLI version is too old, please update to use a newer version\n", e.what()); + return; + } + ClientLeaderRegInterface leaderInterf(workerInterf.address()); + StringRef ip_port = + (kv.key.endsWith(LiteralStringRef(":tls")) ? kv.key.removeSuffix(LiteralStringRef(":tls")) : kv.key) + .removePrefix(LiteralStringRef("\xff\xff/worker_interfaces/")); + (*address_interface)[ip_port] = std::make_pair(kv.value, leaderInterf); + + if (workerInterf.reboot.getEndpoint().addresses.secondaryAddress.present()) { + Key full_ip_port2 = + StringRef(workerInterf.reboot.getEndpoint().addresses.secondaryAddress.get().toString()); + StringRef ip_port2 = full_ip_port2.endsWith(LiteralStringRef(":tls")) + ? 
full_ip_port2.removeSuffix(LiteralStringRef(":tls")) + : full_ip_port2; + (*address_interface)[ip_port2] = std::make_pair(kv.value, leaderInterf); } - when(wait(delay(CLIENT_KNOBS->CLI_CONNECT_TIMEOUT))) {} } - return Void(); } ACTOR Future getWorkerInterfaces(Reference tr, - std::map>* address_interface) { + std::map>* address_interface, + bool verify) { + if (verify) { + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + tr->set(workerInterfacesVerifyOptionSpecialKey, ValueRef()); + } // Hold the reference to the standalone's memory state ThreadFuture kvsFuture = tr->getRange( KeyRangeRef(LiteralStringRef("\xff\xff/worker_interfaces/"), LiteralStringRef("\xff\xff/worker_interfaces0")), CLIENT_KNOBS->TOO_MANY); - RangeResult kvs = wait(safeThreadFutureToFuture(kvsFuture)); + state RangeResult kvs = wait(safeThreadFutureToFuture(kvsFuture)); ASSERT(!kvs.more); - auto connectLock = makeReference(CLIENT_KNOBS->CLI_CONNECT_PARALLELISM); - std::vector> addInterfs; - for (auto it : kvs) { - addInterfs.push_back(verifyAndAddInterface(address_interface, connectLock, it)); + if (verify) { + // remove the option if set + tr->clear(workerInterfacesVerifyOptionSpecialKey); } - wait(waitForAll(addInterfs)); + addInterfacesFromKVs(kvs, address_interface); return Void(); } diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 072b11fec0..b10ed32a20 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -103,6 +103,7 @@ enum { OPT_DEBUG_TLS, OPT_API_VERSION, OPT_MEMORY, + OPT_USE_FUTURE_PROTOCOL_VERSION }; CSimpleOpt::SOption g_rgOptions[] = { { OPT_CONNFILE, "-C", SO_REQ_SEP }, @@ -127,6 +128,7 @@ CSimpleOpt::SOption g_rgOptions[] = { { OPT_CONNFILE, "-C", SO_REQ_SEP }, { OPT_DEBUG_TLS, "--debug-tls", SO_NONE }, { OPT_API_VERSION, "--api-version", SO_REQ_SEP }, { OPT_MEMORY, "--memory", SO_REQ_SEP }, + { OPT_USE_FUTURE_PROTOCOL_VERSION, "--use-future-protocol-version", SO_NONE }, TLS_OPTION_FLAGS, SO_END_OF_OPTIONS }; @@ -475,6 
+477,9 @@ static void printProgramUsage(const char* name) { " Useful in reporting and diagnosing TLS issues.\n" " --build-flags Print build information and exit.\n" " --memory Resident memory limit of the CLI (defaults to 8GiB).\n" + " --use-future-protocol-version\n" + " Use the simulated future protocol version to connect to the cluster.\n" + " This option can be used testing purposes only!\n" " -v, --version Print FoundationDB CLI version information and exit.\n" " -h, --help Display this help and exit.\n"); } @@ -578,7 +583,7 @@ void initHelp() { void printVersion() { printf("FoundationDB CLI " FDB_VT_PACKAGE_NAME " (v" FDB_VT_VERSION ")\n"); printf("source version %s\n", getSourceVersion()); - printf("protocol %" PRIx64 "\n", currentProtocolVersion.version()); + printf("protocol %" PRIx64 "\n", currentProtocolVersion().version()); } void printBuildInformation() { @@ -872,6 +877,7 @@ struct CLIOptions { Optional exec; bool initialStatusCheck = true; bool cliHints = true; + bool useFutureProtocolVersion = false; bool debugTLS = false; std::string tlsCertPath; std::string tlsKeyPath; @@ -973,6 +979,10 @@ struct CLIOptions { break; case OPT_NO_HINTS: cliHints = false; + break; + case OPT_USE_FUTURE_PROTOCOL_VERSION: + useFutureProtocolVersion = true; + break; // TLS Options case TLSConfig::OPT_TLS_PLUGIN: @@ -1040,36 +1050,6 @@ Future stopNetworkAfter(Future what) { } } -ACTOR Future addInterface(std::map>* address_interface, - Reference connectLock, - KeyValue kv) { - wait(connectLock->take()); - state FlowLock::Releaser releaser(*connectLock); - state ClientWorkerInterface workerInterf = - BinaryReader::fromStringRef(kv.value, IncludeVersion()); - state ClientLeaderRegInterface leaderInterf(workerInterf.address()); - choose { - when(Optional rep = - wait(brokenPromiseToNever(leaderInterf.getLeader.getReply(GetLeaderRequest())))) { - StringRef ip_port = - (kv.key.endsWith(LiteralStringRef(":tls")) ? 
kv.key.removeSuffix(LiteralStringRef(":tls")) : kv.key) - .removePrefix(LiteralStringRef("\xff\xff/worker_interfaces/")); - (*address_interface)[ip_port] = std::make_pair(kv.value, leaderInterf); - - if (workerInterf.reboot.getEndpoint().addresses.secondaryAddress.present()) { - Key full_ip_port2 = - StringRef(workerInterf.reboot.getEndpoint().addresses.secondaryAddress.get().toString()); - StringRef ip_port2 = full_ip_port2.endsWith(LiteralStringRef(":tls")) - ? full_ip_port2.removeSuffix(LiteralStringRef(":tls")) - : full_ip_port2; - (*address_interface)[ip_port2] = std::make_pair(kv.value, leaderInterf); - } - } - when(wait(delay(CLIENT_KNOBS->CLI_CONNECT_TIMEOUT))) {} - } - return Void(); -} - ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { state LineNoise& linenoise = *plinenoise; state bool intrans = false; @@ -1967,6 +1947,13 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { continue; } + if (tokencmp(tokens[0], "metacluster")) { + bool _result = wait(makeInterruptable(metaclusterCommand(db, tokens))); + if (!_result) + is_error = true; + continue; + } + fprintf(stderr, "ERROR: Unknown command `%s'. 
Try `help'?\n", formatStringRef(tokens[0]).c_str()); is_error = true; } @@ -2192,6 +2179,9 @@ int main(int argc, char** argv) { try { API->selectApiVersion(opt.apiVersion); + if (opt.useFutureProtocolVersion) { + API->useFutureProtocolVersion(); + } API->setupNetwork(); opt.setupKnobs(); if (opt.exit_code != -1) { diff --git a/fdbcli/include/fdbcli/fdbcli.actor.h b/fdbcli/include/fdbcli/fdbcli.actor.h index 2b56c216a0..3df51b4677 100644 --- a/fdbcli/include/fdbcli/fdbcli.actor.h +++ b/fdbcli/include/fdbcli/fdbcli.actor.h @@ -120,6 +120,7 @@ extern const KeyRangeRef processClassSourceSpecialKeyRange; extern const KeyRangeRef processClassTypeSpecialKeyRange; // Other special keys inline const KeyRef errorMsgSpecialKey = LiteralStringRef("\xff\xff/error_message"); +inline const KeyRef workerInterfacesVerifyOptionSpecialKey = "\xff\xff/management/options/worker_interfaces/verify"_sr; // help functions (Copied from fdbcli.actor.cpp) // get all workers' info @@ -132,13 +133,14 @@ void printUsage(StringRef command); // Pre: tr failed with special_keys_api_failure error // Read the error message special key and return the message ACTOR Future getSpecialKeysFailureErrorMessage(Reference tr); -// Using \xff\xff/worker_interfaces/ special key, get all worker interfaces +// Using \xff\xff/worker_interfaces/ special key, get all worker interfaces. +// A worker list will be returned from CC. +// If verify, we will try to establish connections to all workers returned. +// In particular, it will deserialize \xff\xff/worker_interfaces/
:= kv pairs and issue RPC +// calls, then only return interfaces(kv pairs) the client can talk to ACTOR Future getWorkerInterfaces(Reference tr, - std::map>* address_interface); -// Deserialize \xff\xff/worker_interfaces/
:= k-v pair and verify by a RPC call -ACTOR Future verifyAndAddInterface(std::map>* address_interface, - Reference connectLock, - KeyValue kv); + std::map>* address_interface, + bool verify = false); // print cluster status info void printStatus(StatusObjectReader statusObj, StatusClient::StatusLevel level, @@ -200,6 +202,10 @@ ACTOR Future listTenantsCommandActor(Reference db, std::vector< // lock/unlock command ACTOR Future lockCommandActor(Reference db, std::vector tokens); ACTOR Future unlockDatabaseActor(Reference db, UID uid); + +// metacluster command +Future metaclusterCommand(Reference db, std::vector tokens); + // changefeed command ACTOR Future changeFeedCommandActor(Database localDb, Optional tenantEntry, diff --git a/fdbclient/BackupContainer.actor.cpp b/fdbclient/BackupContainer.actor.cpp index 4b0c217293..c318a6591d 100644 --- a/fdbclient/BackupContainer.actor.cpp +++ b/fdbclient/BackupContainer.actor.cpp @@ -288,11 +288,46 @@ Reference IBackupContainer::openContainer(const std::string& u #ifdef BUILD_AZURE_BACKUP else if (u.startsWith("azure://"_sr)) { u.eat("azure://"_sr); - auto accountName = u.eat("@"_sr).toString(); - auto endpoint = u.eat("/"_sr).toString(); - auto containerName = u.eat("/"_sr).toString(); - r = makeReference( - endpoint, accountName, containerName, encryptionKeyFileName); + auto address = u.eat("/"_sr); + if (address.endsWith(std::string(azure::storage_lite::constants::default_endpoint_suffix))) { + CODE_PROBE(true, "Azure backup url with standard azure storage account endpoint"); + // ..core.windows.net/ + auto endPoint = address.toString(); + auto accountName = address.eat("."_sr).toString(); + auto containerName = u.eat("/"_sr).toString(); + r = makeReference( + endPoint, accountName, containerName, encryptionKeyFileName); + } else { + // resolve the network address if necessary + std::string endpoint(address.toString()); + Optional parsedAddress = NetworkAddress::parseOptional(endpoint); + if (!parsedAddress.present()) { + 
try { + auto hostname = Hostname::parse(endpoint); + auto resolvedAddress = hostname.resolveBlocking(); + if (resolvedAddress.present()) { + CODE_PROBE(true, "Azure backup url with hostname in the endpoint"); + parsedAddress = resolvedAddress.get(); + } + } catch (Error& e) { + TraceEvent(SevError, "InvalidAzureBackupUrl").error(e).detail("Endpoint", endpoint); + throw backup_invalid_url(); + } + } + if (!parsedAddress.present()) { + TraceEvent(SevError, "InvalidAzureBackupUrl").detail("Endpoint", endpoint); + throw backup_invalid_url(); + } + auto accountName = u.eat("/"_sr).toString(); + // Avoid including ":tls" and "(fromHostname)" + // note: the endpoint needs to contain the account name + // so either ".blob.core.windows.net" or ":/" + endpoint = + fmt::format("{}/{}", formatIpPort(parsedAddress.get().ip, parsedAddress.get().port), accountName); + auto containerName = u.eat("/"_sr).toString(); + r = makeReference( + endpoint, accountName, containerName, encryptionKeyFileName); + } } #endif else { diff --git a/fdbclient/BackupContainerFileSystem.actor.cpp b/fdbclient/BackupContainerFileSystem.actor.cpp index b413bae0c8..b222153517 100644 --- a/fdbclient/BackupContainerFileSystem.actor.cpp +++ b/fdbclient/BackupContainerFileSystem.actor.cpp @@ -1523,11 +1523,46 @@ Reference BackupContainerFileSystem::openContainerFS( #ifdef BUILD_AZURE_BACKUP else if (u.startsWith("azure://"_sr)) { u.eat("azure://"_sr); - auto accountName = u.eat("@"_sr).toString(); - auto endpoint = u.eat("/"_sr).toString(); - auto containerName = u.eat("/"_sr).toString(); - r = makeReference( - endpoint, accountName, containerName, encryptionKeyFileName); + auto address = u.eat("/"_sr); + if (address.endsWith(std::string(azure::storage_lite::constants::default_endpoint_suffix))) { + CODE_PROBE(true, "Azure backup url with standard azure storage account endpoint"); + // ..core.windows.net/ + auto endPoint = address.toString(); + auto accountName = address.eat("."_sr).toString(); + auto 
containerName = u.eat("/"_sr).toString(); + r = makeReference( + endPoint, accountName, containerName, encryptionKeyFileName); + } else { + // resolve the network address if necessary + std::string endpoint(address.toString()); + Optional parsedAddress = NetworkAddress::parseOptional(endpoint); + if (!parsedAddress.present()) { + try { + auto hostname = Hostname::parse(endpoint); + auto resolvedAddress = hostname.resolveBlocking(); + if (resolvedAddress.present()) { + CODE_PROBE(true, "Azure backup url with hostname in the endpoint"); + parsedAddress = resolvedAddress.get(); + } + } catch (Error& e) { + TraceEvent(SevError, "InvalidAzureBackupUrl").error(e).detail("Endpoint", endpoint); + throw backup_invalid_url(); + } + } + if (!parsedAddress.present()) { + TraceEvent(SevError, "InvalidAzureBackupUrl").detail("Endpoint", endpoint); + throw backup_invalid_url(); + } + auto accountName = u.eat("/"_sr).toString(); + // Avoid including ":tls" and "(fromHostname)" + // note: the endpoint needs to contain the account name + // so either ".blob.core.windows.net" or ":/" + endpoint = + fmt::format("{}/{}", formatIpPort(parsedAddress.get().ip, parsedAddress.get().port), accountName); + auto containerName = u.eat("/"_sr).toString(); + r = makeReference( + endpoint, accountName, containerName, encryptionKeyFileName); + } } #endif else { diff --git a/fdbclient/BlobGranuleCommon.cpp b/fdbclient/BlobGranuleCommon.cpp new file mode 100644 index 0000000000..44f32bcb25 --- /dev/null +++ b/fdbclient/BlobGranuleCommon.cpp @@ -0,0 +1,45 @@ +/* + * BlobGranuleCommon.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/BlobGranuleCommon.h" + +BlobGranuleSummaryRef summarizeGranuleChunk(Arena& ar, const BlobGranuleChunkRef& chunk) { + BlobGranuleSummaryRef summary; + ASSERT(chunk.snapshotFile.present()); + ASSERT(chunk.snapshotVersion != invalidVersion); + ASSERT(chunk.includedVersion >= chunk.snapshotVersion); + ASSERT(chunk.newDeltas.empty()); + + if (chunk.tenantPrefix.present()) { + summary.keyRange = KeyRangeRef(ar, chunk.keyRange.removePrefix(chunk.tenantPrefix.get())); + } else { + summary.keyRange = KeyRangeRef(ar, chunk.keyRange); + } + + summary.snapshotVersion = chunk.snapshotVersion; + summary.snapshotSize = chunk.snapshotFile.get().length; + summary.deltaVersion = chunk.includedVersion; + summary.deltaSize = 0; + for (auto& it : chunk.deltaFiles) { + summary.deltaSize += it.length; + } + + return summary; +} \ No newline at end of file diff --git a/fdbclient/BlobGranuleFiles.cpp b/fdbclient/BlobGranuleFiles.cpp index d18c745ce4..0e402cedb1 100644 --- a/fdbclient/BlobGranuleFiles.cpp +++ b/fdbclient/BlobGranuleFiles.cpp @@ -40,6 +40,7 @@ #include #include // for perf microbenchmark +#include #include #define BG_READ_DEBUG false @@ -209,16 +210,21 @@ namespace { BlobGranuleFileEncryptionKeys getEncryptBlobCipherKey(const BlobGranuleCipherKeysCtx cipherKeysCtx) { BlobGranuleFileEncryptionKeys eKeys; + // Cipher key reconstructed is 'never' inserted into BlobCipherKey cache, choose 'neverExpire' eKeys.textCipherKey = makeReference(cipherKeysCtx.textCipherKey.encryptDomainId, cipherKeysCtx.textCipherKey.baseCipherId, 
cipherKeysCtx.textCipherKey.baseCipher.begin(), cipherKeysCtx.textCipherKey.baseCipher.size(), - cipherKeysCtx.textCipherKey.salt); + cipherKeysCtx.textCipherKey.salt, + std::numeric_limits::max(), + std::numeric_limits::max()); eKeys.headerCipherKey = makeReference(cipherKeysCtx.headerCipherKey.encryptDomainId, cipherKeysCtx.headerCipherKey.baseCipherId, cipherKeysCtx.headerCipherKey.baseCipher.begin(), cipherKeysCtx.headerCipherKey.baseCipher.size(), - cipherKeysCtx.headerCipherKey.salt); + cipherKeysCtx.headerCipherKey.salt, + std::numeric_limits::max(), + std::numeric_limits::max()); return eKeys; } @@ -346,7 +352,9 @@ struct IndexBlockRef { decrypt(cipherKeysCtx.get(), *this, arena); } else { - TraceEvent("IndexBlockSize").detail("Sz", buffer.size()); + if (BG_ENCRYPT_COMPRESS_DEBUG) { + TraceEvent("IndexBlockSize").detail("Sz", buffer.size()); + } ObjectReader dataReader(buffer.begin(), IncludeVersion()); dataReader.deserialize(FileIdentifierFor::value, block, arena); @@ -368,7 +376,11 @@ struct IndexBlockRef { arena, ObjectWriter::toValue(block, IncludeVersion(ProtocolVersion::withBlobGranuleFile())).contents()); } - TraceEvent(SevDebug, "IndexBlockSize").detail("Sz", buffer.size()).detail("Encrypted", cipherKeysCtx.present()); + if (BG_ENCRYPT_COMPRESS_DEBUG) { + TraceEvent(SevDebug, "IndexBlockSize") + .detail("Sz", buffer.size()) + .detail("Encrypted", cipherKeysCtx.present()); + } } template @@ -804,10 +816,6 @@ static Standalone> loadSnapshotFile( ASSERT(file.indexBlockRef.block.children.size() >= 2); - // TODO: refactor this out of delta tree - // int commonPrefixLen = commonPrefixLength(index.dataBlockOffsets.front().first, - // index.dataBlockOffsets.back().first); - // find range of blocks needed to read ChildBlockPointerRef* currentBlock = file.findStartBlock(keyRange.begin); @@ -1157,10 +1165,6 @@ Standalone> loadChunkedDeltaFile(const Standal ASSERT(file.indexBlockRef.block.children.size() >= 2); - // TODO: refactor this out of delta tree - // 
int commonPrefixLen = commonPrefixLength(index.dataBlockOffsets.front().first, - // index.dataBlockOffsets.back().first); - // find range of blocks needed to read ChildBlockPointerRef* currentBlock = file.findStartBlock(keyRange.begin); @@ -1169,7 +1173,8 @@ Standalone> loadChunkedDeltaFile(const Standal return deltas; } - // TODO: could cpu optimize first block a bit more by seeking right to start + // FIXME: shared prefix for key comparison + // FIXME: could cpu optimize first block a bit more by seeking right to start bool lastBlock = false; bool prevClearAfter = false; while (!lastBlock) { @@ -1553,12 +1558,23 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk, return mergeDeltaStreams(chunk, streams, startClears); } +struct GranuleLoadFreeHandle : NonCopyable, ReferenceCounted { + const ReadBlobGranuleContext* granuleContext; + int64_t loadId; + + GranuleLoadFreeHandle(const ReadBlobGranuleContext* granuleContext, int64_t loadId) + : granuleContext(granuleContext), loadId(loadId) {} + + ~GranuleLoadFreeHandle() { granuleContext->free_load_f(loadId, granuleContext->userContext); } +}; + struct GranuleLoadIds { Optional snapshotId; std::vector deltaIds; + std::vector> freeHandles; }; -static void startLoad(const ReadBlobGranuleContext granuleContext, +static void startLoad(const ReadBlobGranuleContext* granuleContext, const BlobGranuleChunkRef& chunk, GranuleLoadIds& loadIds) { @@ -1568,12 +1584,13 @@ static void startLoad(const ReadBlobGranuleContext granuleContext, // FIXME: remove when we implement file multiplexing ASSERT(chunk.snapshotFile.get().offset == 0); ASSERT(chunk.snapshotFile.get().length == chunk.snapshotFile.get().fullFileLength); - loadIds.snapshotId = granuleContext.start_load_f(snapshotFname.c_str(), - snapshotFname.size(), - chunk.snapshotFile.get().offset, - chunk.snapshotFile.get().length, - chunk.snapshotFile.get().fullFileLength, - granuleContext.userContext); + loadIds.snapshotId = 
granuleContext->start_load_f(snapshotFname.c_str(), + snapshotFname.size(), + chunk.snapshotFile.get().offset, + chunk.snapshotFile.get().length, + chunk.snapshotFile.get().fullFileLength, + granuleContext->userContext); + loadIds.freeHandles.push_back(makeReference(granuleContext, loadIds.snapshotId.get())); } loadIds.deltaIds.reserve(chunk.deltaFiles.size()); for (int deltaFileIdx = 0; deltaFileIdx < chunk.deltaFiles.size(); deltaFileIdx++) { @@ -1581,13 +1598,14 @@ static void startLoad(const ReadBlobGranuleContext granuleContext, // FIXME: remove when we implement file multiplexing ASSERT(chunk.deltaFiles[deltaFileIdx].offset == 0); ASSERT(chunk.deltaFiles[deltaFileIdx].length == chunk.deltaFiles[deltaFileIdx].fullFileLength); - int64_t deltaLoadId = granuleContext.start_load_f(deltaFName.c_str(), - deltaFName.size(), - chunk.deltaFiles[deltaFileIdx].offset, - chunk.deltaFiles[deltaFileIdx].length, - chunk.deltaFiles[deltaFileIdx].fullFileLength, - granuleContext.userContext); + int64_t deltaLoadId = granuleContext->start_load_f(deltaFName.c_str(), + deltaFName.size(), + chunk.deltaFiles[deltaFileIdx].offset, + chunk.deltaFiles[deltaFileIdx].length, + chunk.deltaFiles[deltaFileIdx].fullFileLength, + granuleContext->userContext); loadIds.deltaIds.push_back(deltaLoadId); + loadIds.freeHandles.push_back(makeReference(granuleContext, deltaLoadId)); } } @@ -1606,17 +1624,16 @@ ErrorOr loadAndMaterializeBlobGranules(const Standalone 1 - for (int i = 0; i < parallelism - 1 && i < files.size(); i++) { - startLoad(granuleContext, files[i], loadIds[i]); - } - try { + // Kick off first file reads if parallelism > 1 + for (int i = 0; i < parallelism - 1 && i < files.size(); i++) { + startLoad(&granuleContext, files[i], loadIds[i]); + } RangeResult results; for (int chunkIdx = 0; chunkIdx < files.size(); chunkIdx++) { // Kick off files for this granule if parallelism == 1, or future granule if parallelism > 1 if (chunkIdx + parallelism - 1 < files.size()) { - 
startLoad(granuleContext, files[chunkIdx + parallelism - 1], loadIds[chunkIdx + parallelism - 1]); + startLoad(&granuleContext, files[chunkIdx + parallelism - 1], loadIds[chunkIdx + parallelism - 1]); } RangeResult chunkRows; @@ -1632,7 +1649,8 @@ ErrorOr loadAndMaterializeBlobGranules(const Standalone loadAndMaterializeBlobGranules(const Standalone(results); } catch (Error& e) { @@ -2372,7 +2386,6 @@ void checkDeltaRead(const KeyValueGen& kvGen, std::string filename = randomBGFilename( deterministicRandom()->randomUniqueID(), deterministicRandom()->randomUniqueID(), readVersion, ".delta"); Standalone chunk; - // TODO need to add cipher keys meta chunk.deltaFiles.emplace_back_deep( chunk.arena(), filename, 0, serialized->size(), serialized->size(), kvGen.cipherKeys); chunk.keyRange = kvGen.allRange; @@ -2429,7 +2442,6 @@ static std::tuple randomizeKeyAndVersions(const KeyV } } - // TODO randomize begin and read version to sometimes +/- 1 and readRange begin and end to keyAfter sometimes return { readRange, beginVersion, readVersion }; } @@ -2653,7 +2665,11 @@ TEST_CASE("/blobgranule/files/granuleReadUnitTest") { serializedDeltaFiles, inMemoryDeltas); - for (int i = 0; i < std::min(100, 5 + snapshotData.size() * deltaData.size()); i++) { + // prevent overflow by doing min before multiply + int maxRuns = 100; + int snapshotAndDeltaSize = 5 + std::min(maxRuns, snapshotData.size()) * std::min(maxRuns, deltaData.size()); + int lim = std::min(maxRuns, snapshotAndDeltaSize); + for (int i = 0; i < lim; i++) { auto params = randomizeKeyAndVersions(kvGen, deltaData); fmt::print("Partial test {0}: [{1} - {2}) @ {3} - {4}\n", i, diff --git a/fdbclient/BlobGranuleReader.actor.cpp b/fdbclient/BlobGranuleReader.actor.cpp index e0f627a9da..9b24380d2c 100644 --- a/fdbclient/BlobGranuleReader.actor.cpp +++ b/fdbclient/BlobGranuleReader.actor.cpp @@ -31,13 +31,6 @@ #include "fdbclient/FDBTypes.h" #include "flow/actorcompiler.h" // This must be the last #include. 
-// TODO more efficient data structure besides std::map? PTree is unnecessary since this isn't versioned, but some other -// sorted thing could work. And if it used arenas it'd probably be more efficient with allocations, since everything -// else is in 1 arena and discarded at the end. - -// TODO could refactor the file reading code from here and the delta file function into another actor, -// then this part would also be testable? but meh - ACTOR Future> readFile(Reference bstoreProvider, BlobFilePointerRef f) { try { state Arena arena; @@ -140,3 +133,66 @@ ACTOR Future readBlobGranules(BlobGranuleFileRequest request, return Void(); } + +// Return true if a given range is fully covered by blob chunks +bool isRangeFullyCovered(KeyRange range, Standalone> blobChunks) { + std::vector blobRanges; + for (const BlobGranuleChunkRef& chunk : blobChunks) { + blobRanges.push_back(chunk.keyRange); + } + + return range.isCovered(blobRanges); +} + +void testAddChunkRange(KeyRef begin, KeyRef end, Standalone>& chunks) { + BlobGranuleChunkRef chunk; + chunk.keyRange = KeyRangeRef(begin, end); + chunks.push_back(chunks.arena(), chunk); +} + +TEST_CASE("/fdbserver/blobgranule/isRangeCoveredByBlob") { + Standalone> chunks; + // chunk1 key_a1 - key_a9 + testAddChunkRange("key_a1"_sr, "key_a9"_sr, chunks); + // chunk2 key_b1 - key_b9 + testAddChunkRange("key_b1"_sr, "key_b9"_sr, chunks); + + // check empty range. not covered + { ASSERT(isRangeFullyCovered(KeyRangeRef(), chunks) == false); } + + // check empty chunks. 
not covered + { + Standalone> empyChunks; + ASSERT(isRangeFullyCovered(KeyRangeRef(), empyChunks) == false); + } + + // check '' to \xff + { ASSERT(isRangeFullyCovered(KeyRangeRef(LiteralStringRef(""), LiteralStringRef("\xff")), chunks) == false); } + + // check {key_a1, key_a9} + { ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_a9"_sr), chunks)); } + + // check {key_a1, key_a3} + { ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_a3"_sr), chunks)); } + + // check {key_a0, key_a3} + { ASSERT(isRangeFullyCovered(KeyRangeRef("key_a0"_sr, "key_a3"_sr), chunks) == false); } + + // check {key_a5, key_b2} + { + auto range = KeyRangeRef("key_a5"_sr, "key_b5"_sr); + ASSERT(isRangeFullyCovered(range, chunks) == false); + ASSERT(range.begin == "key_a5"_sr); + ASSERT(range.end == "key_b5"_sr); + } + + // check continued chunks + { + Standalone> continuedChunks; + testAddChunkRange("key_a1"_sr, "key_a9"_sr, continuedChunks); + testAddChunkRange("key_a9"_sr, "key_b1"_sr, continuedChunks); + testAddChunkRange("key_b1"_sr, "key_b9"_sr, continuedChunks); + ASSERT(isRangeFullyCovered(KeyRangeRef("key_a1"_sr, "key_b9"_sr), continuedChunks) == false); + } + return Void(); +} diff --git a/fdbclient/CMakeLists.txt b/fdbclient/CMakeLists.txt index 2a1713878f..2953a360e7 100644 --- a/fdbclient/CMakeLists.txt +++ b/fdbclient/CMakeLists.txt @@ -90,8 +90,8 @@ add_flow_target(LINK_TEST NAME fdbclientlinktest SRCS LinkTest.cpp) target_link_libraries(fdbclientlinktest PRIVATE fdbclient rapidxml) # re-link rapidxml due to private link interface if(BUILD_AZURE_BACKUP) - target_link_libraries(fdbclient PRIVATE curl uuid azure-storage-lite) - target_link_libraries(fdbclient_sampling PRIVATE curl uuid azure-storage-lite) + target_link_libraries(fdbclient PRIVATE curl azure-storage-lite) + target_link_libraries(fdbclient_sampling PRIVATE curl azure-storage-lite) endif() if(BUILD_AWS_BACKUP) diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index 
537f91f0aa..7c8a69a337 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -42,10 +42,6 @@ void ClientKnobs::initialize(Randomize randomize) { init( FAILURE_MAX_DELAY, 5.0 ); init( FAILURE_MIN_DELAY, 4.0 ); if( randomize && BUGGIFY ) FAILURE_MIN_DELAY = 1.0; - init( FAILURE_TIMEOUT_DELAY, FAILURE_MIN_DELAY ); - init( CLIENT_FAILURE_TIMEOUT_DELAY, FAILURE_MIN_DELAY ); - init( FAILURE_EMERGENCY_DELAY, 30.0 ); - init( FAILURE_MAX_GENERATIONS, 10 ); init( RECOVERY_DELAY_START_GENERATION, 70 ); init( RECOVERY_DELAY_SECONDS_PER_GENERATION, 60.0 ); init( MAX_GENERATIONS, 100 ); @@ -64,6 +60,7 @@ void ClientKnobs::initialize(Randomize randomize) { init( WRONG_SHARD_SERVER_DELAY, .01 ); if( randomize && BUGGIFY ) WRONG_SHARD_SERVER_DELAY = deterministicRandom()->random01(); // FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is mostly wrong (e.g. dumping the database after a test) init( FUTURE_VERSION_RETRY_DELAY, .01 ); if( randomize && BUGGIFY ) FUTURE_VERSION_RETRY_DELAY = deterministicRandom()->random01();// FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; + init( GRV_ERROR_RETRY_DELAY, 5.0 ); if( randomize && BUGGIFY ) GRV_ERROR_RETRY_DELAY = 0.01 + 5 * deterministicRandom()->random01(); init( UNKNOWN_TENANT_RETRY_DELAY, 0.0 ); if( randomize && BUGGIFY ) UNKNOWN_TENANT_RETRY_DELAY = deterministicRandom()->random01(); init( REPLY_BYTE_LIMIT, 80000 ); init( DEFAULT_BACKOFF, .01 ); if( randomize && BUGGIFY ) DEFAULT_BACKOFF = deterministicRandom()->random01(); @@ -84,6 +81,7 @@ void ClientKnobs::initialize(Randomize randomize) { init( CHANGE_FEED_CACHE_SIZE, 100000 ); if( randomize && BUGGIFY ) CHANGE_FEED_CACHE_SIZE = 1; init( CHANGE_FEED_POP_TIMEOUT, 10.0 ); init( CHANGE_FEED_STREAM_MIN_BYTES, 1e4 ); if( randomize && BUGGIFY ) CHANGE_FEED_STREAM_MIN_BYTES = 1; + init( CHANGE_FEED_START_INTERVAL, 10.0 ); init( MAX_BATCH_SIZE, 1000 ); if( randomize && BUGGIFY ) MAX_BATCH_SIZE = 1; init( 
GRV_BATCH_TIMEOUT, 0.005 ); if( randomize && BUGGIFY ) GRV_BATCH_TIMEOUT = 0.1; @@ -159,8 +157,6 @@ void ClientKnobs::initialize(Randomize randomize) { init( BACKUP_AGGREGATE_POLL_RATE_UPDATE_INTERVAL, 60); init( BACKUP_AGGREGATE_POLL_RATE, 2.0 ); // polls per second target for all agents on the cluster init( BACKUP_LOG_WRITE_BATCH_MAX_SIZE, 1e6 ); //Must be much smaller than TRANSACTION_SIZE_LIMIT - init( BACKUP_LOG_ATOMIC_OPS_SIZE, 1000 ); - init( BACKUP_OPERATION_COST_OVERHEAD, 50 ); init( BACKUP_MAX_LOG_RANGES, 21 ); if( randomize && BUGGIFY ) BACKUP_MAX_LOG_RANGES = 4; init( BACKUP_SIM_COPY_LOG_RANGES, 100 ); init( BACKUP_VERSION_DELAY, 5*CORE_VERSIONSPERSECOND ); @@ -279,18 +275,21 @@ void ClientKnobs::initialize(Randomize randomize) { init( BUSYNESS_SPIKE_START_THRESHOLD, 0.100 ); init( BUSYNESS_SPIKE_SATURATED_THRESHOLD, 0.500 ); - // multi-version client control - init( MVC_CLIENTLIB_CHUNK_SIZE, 8*1024 ); - init( MVC_CLIENTLIB_CHUNKS_PER_TRANSACTION, 32 ); - // Blob granules init( BG_MAX_GRANULE_PARALLELISM, 10 ); + init( BG_TOO_MANY_GRANULES, 10000 ); init( CHANGE_QUORUM_BAD_STATE_RETRY_TIMES, 3 ); init( CHANGE_QUORUM_BAD_STATE_RETRY_DELAY, 2.0 ); // Tenants and Metacluster - init( MAX_TENANTS_PER_CLUSTER, 1e6 ); if ( randomize && BUGGIFY ) MAX_TENANTS_PER_CLUSTER = deterministicRandom()->randomInt(20, 100); + init( MAX_TENANTS_PER_CLUSTER, 1e6 ); + init( TENANT_TOMBSTONE_CLEANUP_INTERVAL, 60 ); if ( randomize && BUGGIFY ) TENANT_TOMBSTONE_CLEANUP_INTERVAL = deterministicRandom()->random01() * 30; + init( MAX_DATA_CLUSTERS, 1e5 ); + init( REMOVE_CLUSTER_TENANT_BATCH_SIZE, 1e4 ); if ( randomize && BUGGIFY ) REMOVE_CLUSTER_TENANT_BATCH_SIZE = 1; + init( METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK, 5 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK = 1; + init( METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY, 1.0 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY = deterministicRandom()->random01() * 60; + init( 
METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT, 10.0 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT = 1 + deterministicRandom()->random01() * 59; // clang-format on } diff --git a/fdbclient/KeyRangeMap.actor.cpp b/fdbclient/KeyRangeMap.actor.cpp index c736c714bf..cb1f0558c1 100644 --- a/fdbclient/KeyRangeMap.actor.cpp +++ b/fdbclient/KeyRangeMap.actor.cpp @@ -23,6 +23,7 @@ #include "fdbclient/CommitTransaction.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/ReadYourWrites.h" +#include "flow/UnitTest.h" #include "flow/actorcompiler.h" // has to be last include void KeyRangeActorMap::getRangesAffectedByInsertion(const KeyRangeRef& keys, std::vector& affectedRanges) { @@ -35,32 +36,54 @@ void KeyRangeActorMap::getRangesAffectedByInsertion(const KeyRangeRef& keys, std affectedRanges.push_back(KeyRangeRef(keys.end, e.end())); } -RangeResult krmDecodeRanges(KeyRef mapPrefix, KeyRange keys, RangeResult kv) { +RangeResult krmDecodeRanges(KeyRef mapPrefix, KeyRange keys, RangeResult kv, bool align) { ASSERT(!kv.more || kv.size() > 1); KeyRange withPrefix = KeyRangeRef(mapPrefix.toString() + keys.begin.toString(), mapPrefix.toString() + keys.end.toString()); - ValueRef beginValue, endValue; - if (kv.size() && kv[0].key.startsWith(mapPrefix)) - beginValue = kv[0].value; - if (kv.size() && kv.end()[-1].key.startsWith(mapPrefix)) - endValue = kv.end()[-1].value; - RangeResult result; result.arena().dependsOn(kv.arena()); result.arena().dependsOn(keys.arena()); - result.push_back(result.arena(), KeyValueRef(keys.begin, beginValue)); + // Always push a kv pair <= keys.begin. 
+ KeyRef beginKey = keys.begin; + if (!align && !kv.empty() && kv.front().key.startsWith(mapPrefix) && kv.front().key < withPrefix.begin) { + beginKey = kv[0].key.removePrefix(mapPrefix); + } + ValueRef beginValue; + if (!kv.empty() && kv.front().key.startsWith(mapPrefix) && kv.front().key <= withPrefix.begin) { + beginValue = kv.front().value; + } + result.push_back(result.arena(), KeyValueRef(beginKey, beginValue)); + for (int i = 0; i < kv.size(); i++) { if (kv[i].key > withPrefix.begin && kv[i].key < withPrefix.end) { KeyRef k = kv[i].key.removePrefix(mapPrefix); result.push_back(result.arena(), KeyValueRef(k, kv[i].value)); - } else if (kv[i].key >= withPrefix.end) + } else if (kv[i].key >= withPrefix.end) { kv.more = false; + // There should be at most 1 value past mapPrefix + keys.end. + ASSERT(i == kv.size() - 1); + break; + } } - if (!kv.more) - result.push_back(result.arena(), KeyValueRef(keys.end, endValue)); + if (!kv.more) { + KeyRef endKey = keys.end; + if (!align && !kv.empty() && kv.back().key.startsWith(mapPrefix) && kv.back().key >= withPrefix.end) { + endKey = kv.back().key.removePrefix(mapPrefix); + } + ValueRef endValue; + if (!kv.empty()) { + // In the aligned case, carry the last value to be the end value. 
+ if (align && kv.back().key.startsWith(mapPrefix) && kv.back().key > withPrefix.end) { + endValue = result.back().value; + } else { + endValue = kv.back().value; + } + } + result.push_back(result.arena(), KeyValueRef(endKey, endValue)); + } result.more = kv.more; return result; @@ -93,6 +116,37 @@ ACTOR Future krmGetRanges(Reference tr, return krmDecodeRanges(mapPrefix, keys, kv); } +// Returns keys.begin, all transitional points in keys, and keys.end, and their values +ACTOR Future krmGetRangesUnaligned(Transaction* tr, + Key mapPrefix, + KeyRange keys, + int limit, + int limitBytes) { + KeyRange withPrefix = + KeyRangeRef(mapPrefix.toString() + keys.begin.toString(), mapPrefix.toString() + keys.end.toString()); + + state GetRangeLimits limits(limit, limitBytes); + limits.minRows = 2; + RangeResult kv = wait(tr->getRange(lastLessOrEqual(withPrefix.begin), firstGreaterThan(withPrefix.end), limits)); + + return krmDecodeRanges(mapPrefix, keys, kv, false); +} + +ACTOR Future krmGetRangesUnaligned(Reference tr, + Key mapPrefix, + KeyRange keys, + int limit, + int limitBytes) { + KeyRange withPrefix = + KeyRangeRef(mapPrefix.toString() + keys.begin.toString(), mapPrefix.toString() + keys.end.toString()); + + state GetRangeLimits limits(limit, limitBytes); + limits.minRows = 2; + RangeResult kv = wait(tr->getRange(lastLessOrEqual(withPrefix.begin), firstGreaterThan(withPrefix.end), limits)); + + return krmDecodeRanges(mapPrefix, keys, kv, false); +} + void krmSetPreviouslyEmptyRange(Transaction* tr, const KeyRef& mapPrefix, const KeyRangeRef& keys, @@ -254,3 +308,87 @@ Future krmSetRangeCoalescing(Reference const& t Value const& value) { return holdWhile(tr, krmSetRangeCoalescing_(tr.getPtr(), mapPrefix, range, maxRange, value)); } + +TEST_CASE("/keyrangemap/decoderange/aligned") { + Arena arena; + Key prefix = LiteralStringRef("/prefix/"); + StringRef fullKeyA = StringRef(arena, LiteralStringRef("/prefix/a")); + StringRef fullKeyB = StringRef(arena, 
LiteralStringRef("/prefix/b")); + StringRef fullKeyC = StringRef(arena, LiteralStringRef("/prefix/c")); + StringRef fullKeyD = StringRef(arena, LiteralStringRef("/prefix/d")); + + StringRef keyA = StringRef(arena, LiteralStringRef("a")); + StringRef keyB = StringRef(arena, LiteralStringRef("b")); + StringRef keyC = StringRef(arena, LiteralStringRef("c")); + StringRef keyD = StringRef(arena, LiteralStringRef("d")); + StringRef keyE = StringRef(arena, LiteralStringRef("e")); + StringRef keyAB = StringRef(arena, LiteralStringRef("ab")); + StringRef keyCD = StringRef(arena, LiteralStringRef("cd")); + + // Fake getRange() call. + RangeResult kv; + kv.push_back(arena, KeyValueRef(fullKeyA, keyA)); + kv.push_back(arena, KeyValueRef(fullKeyB, keyB)); + kv.push_back(arena, KeyValueRef(fullKeyC, keyC)); + kv.push_back(arena, KeyValueRef(fullKeyD, keyD)); + + // [A, AB(start), B, C, CD(end), D] + RangeResult decodedRanges = krmDecodeRanges(prefix, KeyRangeRef(keyAB, keyCD), kv); + ASSERT(decodedRanges.size() == 4); + ASSERT(decodedRanges.front().key == keyAB); + ASSERT(decodedRanges.front().value == keyA); + ASSERT(decodedRanges.back().key == keyCD); + ASSERT(decodedRanges.back().value == keyC); + + // [""(start), A, B, C, D, E(end)] + decodedRanges = krmDecodeRanges(prefix, KeyRangeRef(StringRef(), keyE), kv); + ASSERT(decodedRanges.size() == 6); + ASSERT(decodedRanges.front().key == StringRef()); + ASSERT(decodedRanges.front().value == StringRef()); + ASSERT(decodedRanges.back().key == keyE); + ASSERT(decodedRanges.back().value == keyD); + + return Void(); +} + +TEST_CASE("/keyrangemap/decoderange/unaligned") { + Arena arena; + Key prefix = LiteralStringRef("/prefix/"); + StringRef fullKeyA = StringRef(arena, LiteralStringRef("/prefix/a")); + StringRef fullKeyB = StringRef(arena, LiteralStringRef("/prefix/b")); + StringRef fullKeyC = StringRef(arena, LiteralStringRef("/prefix/c")); + StringRef fullKeyD = StringRef(arena, LiteralStringRef("/prefix/d")); + + StringRef keyA = 
StringRef(arena, LiteralStringRef("a")); + StringRef keyB = StringRef(arena, LiteralStringRef("b")); + StringRef keyC = StringRef(arena, LiteralStringRef("c")); + StringRef keyD = StringRef(arena, LiteralStringRef("d")); + StringRef keyE = StringRef(arena, LiteralStringRef("e")); + StringRef keyAB = StringRef(arena, LiteralStringRef("ab")); + StringRef keyCD = StringRef(arena, LiteralStringRef("cd")); + + // Fake getRange() call. + RangeResult kv; + kv.push_back(arena, KeyValueRef(fullKeyA, keyA)); + kv.push_back(arena, KeyValueRef(fullKeyB, keyB)); + kv.push_back(arena, KeyValueRef(fullKeyC, keyC)); + kv.push_back(arena, KeyValueRef(fullKeyD, keyD)); + + // [A, AB(start), B, C, CD(end), D] + RangeResult decodedRanges = krmDecodeRanges(prefix, KeyRangeRef(keyAB, keyCD), kv, false); + ASSERT(decodedRanges.size() == 4); + ASSERT(decodedRanges.front().key == keyA); + ASSERT(decodedRanges.front().value == keyA); + ASSERT(decodedRanges.back().key == keyD); + ASSERT(decodedRanges.back().value == keyD); + + // [""(start), A, B, C, D, E(end)] + decodedRanges = krmDecodeRanges(prefix, KeyRangeRef(StringRef(), keyE), kv, false); + ASSERT(decodedRanges.size() == 6); + ASSERT(decodedRanges.front().key == StringRef()); + ASSERT(decodedRanges.front().value == StringRef()); + ASSERT(decodedRanges.back().key == keyE); + ASSERT(decodedRanges.back().value == keyD); + + return Void(); +} \ No newline at end of file diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 2a5c9ac910..6270cc0b88 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -2559,7 +2559,7 @@ TEST_CASE("/ManagementAPI/AutoQuorumChange/checkLocality") { ProcessClass(ProcessClass::CoordinatorClass, ProcessClass::CommandLineSource), "", "", - currentProtocolVersion); + currentProtocolVersion()); } workers.push_back(data); diff --git a/fdbclient/Metacluster.cpp b/fdbclient/Metacluster.cpp new file mode 100644 index 0000000000..6463033db8 --- 
/dev/null +++ b/fdbclient/Metacluster.cpp @@ -0,0 +1,71 @@ +/* + * Metacluster.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/Metacluster.h" +#include "fdbclient/MetaclusterManagement.actor.h" + +FDB_DEFINE_BOOLEAN_PARAM(AddNewTenants); +FDB_DEFINE_BOOLEAN_PARAM(RemoveMissingTenants); + +std::string DataClusterEntry::clusterStateToString(DataClusterState clusterState) { + switch (clusterState) { + case DataClusterState::READY: + return "ready"; + case DataClusterState::REMOVING: + return "removing"; + case DataClusterState::RESTORING: + return "restoring"; + default: + UNREACHABLE(); + } +} + +DataClusterState DataClusterEntry::stringToClusterState(std::string stateStr) { + if (stateStr == "ready") { + return DataClusterState::READY; + } else if (stateStr == "removing") { + return DataClusterState::REMOVING; + } else if (stateStr == "restoring") { + return DataClusterState::RESTORING; + } + + UNREACHABLE(); +} + +json_spirit::mObject DataClusterEntry::toJson() const { + json_spirit::mObject obj; + obj["capacity"] = capacity.toJson(); + obj["allocated"] = allocated.toJson(); + obj["cluster_state"] = DataClusterEntry::clusterStateToString(clusterState); + return obj; +} + +json_spirit::mObject ClusterUsage::toJson() const { + json_spirit::mObject obj; + obj["num_tenant_groups"] = 
numTenantGroups; + return obj; +} + +KeyBackedObjectProperty& +MetaclusterMetadata::metaclusterRegistration() { + static KeyBackedObjectProperty instance( + "\xff/metacluster/clusterRegistration"_sr, IncludeVersion()); + return instance; +} \ No newline at end of file diff --git a/fdbclient/MetaclusterManagement.actor.cpp b/fdbclient/MetaclusterManagement.actor.cpp new file mode 100644 index 0000000000..33403300bd --- /dev/null +++ b/fdbclient/MetaclusterManagement.actor.cpp @@ -0,0 +1,67 @@ +/* + * MetaclusterManagement.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "fdbclient/ClusterConnectionMemoryRecord.h" +#include "fdbclient/DatabaseContext.h" +#include "fdbclient/FDBTypes.h" +#include "fdbclient/MetaclusterManagement.actor.h" +#include "fdbclient/ThreadSafeTransaction.h" +#include "flow/actorcompiler.h" // has to be last include + +namespace MetaclusterAPI { + +ACTOR Future> openDatabase(ClusterConnectionString connectionString) { + if (g_network->isSimulated()) { + Reference clusterFile = + makeReference(connectionString); + Database nativeDb = Database::createDatabase(clusterFile, -1); + Reference threadSafeDb = + wait(unsafeThreadFutureToFuture(ThreadSafeDatabase::createFromExistingDatabase(nativeDb))); + return MultiVersionDatabase::debugCreateFromExistingDatabase(threadSafeDb); + } else { + return MultiVersionApi::api->createDatabaseFromConnectionString(connectionString.toString().c_str()); + } +} + +KeyBackedObjectMap& +ManagementClusterMetadata::dataClusters() { + static KeyBackedObjectMap instance( + "metacluster/dataCluster/metadata/"_sr, IncludeVersion()); + return instance; +} + +KeyBackedMap, + ManagementClusterMetadata::ConnectionStringCodec> + ManagementClusterMetadata::dataClusterConnectionRecords("metacluster/dataCluster/connectionString/"_sr); + +KeyBackedSet ManagementClusterMetadata::clusterCapacityIndex("metacluster/clusterCapacityIndex/"_sr); +KeyBackedMap, BinaryCodec> + ManagementClusterMetadata::clusterTenantCount("metacluster/clusterTenantCount/"_sr); +KeyBackedSet ManagementClusterMetadata::clusterTenantIndex("metacluster/dataCluster/tenantMap/"_sr); +KeyBackedSet ManagementClusterMetadata::clusterTenantGroupIndex("metacluster/dataCluster/tenantGroupMap/"_sr); + +TenantMetadataSpecification& ManagementClusterMetadata::tenantMetadata() { + static TenantMetadataSpecification instance(""_sr); + return instance; +} + +}; // namespace MetaclusterAPI \ No newline at end of file diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index 
f72913f7ee..977d465908 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -663,69 +663,43 @@ ACTOR Future asyncDeserializeClusterInterface(Reference> s } } -struct ClientStatusStats { - int count; - std::vector> examples; +namespace { - ClientStatusStats() : count(0) { examples.reserve(CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT); } -}; +void tryInsertIntoSamples(OpenDatabaseRequest::Samples& samples, + const NetworkAddress& networkAddress, + const Key& traceLogGroup) { + ++samples.count; + if (samples.samples.size() < static_cast(CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT)) { + samples.samples.insert({ networkAddress, traceLogGroup }); + } +} + +} // namespace OpenDatabaseRequest ClientData::getRequest() { OpenDatabaseRequest req; - std::map issueMap; - std::map versionMap; - std::map maxProtocolMap; - int clientCount = 0; - - // SOMEDAY: add a yield in this loop for (auto& ci : clientStatusInfoMap) { - for (auto& it : ci.second.issues) { - auto& entry = issueMap[it]; - entry.count++; - if (entry.examples.size() < CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT) { - entry.examples.emplace_back(ci.first, ci.second.traceLogGroup); - } - } - if (ci.second.versions.size()) { - clientCount++; - StringRef maxProtocol; - for (auto& it : ci.second.versions) { - maxProtocol = std::max(maxProtocol, it.protocolVersion); - auto& entry = versionMap[it]; - entry.count++; - if (entry.examples.size() < CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT) { - entry.examples.emplace_back(ci.first, ci.second.traceLogGroup); - } - } - auto& maxEntry = maxProtocolMap[maxProtocol]; - maxEntry.count++; - if (maxEntry.examples.size() < CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT) { - maxEntry.examples.emplace_back(ci.first, ci.second.traceLogGroup); - } - } else { - auto& entry = versionMap[ClientVersionRef()]; - entry.count++; - if (entry.examples.size() < CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT) { - entry.examples.emplace_back(ci.first, ci.second.traceLogGroup); - } - } - } + const auto& 
networkAddress = ci.first; + const auto& traceLogGroup = ci.second.traceLogGroup; - req.issues.reserve(issueMap.size()); - for (auto& it : issueMap) { - req.issues.push_back(ItemWithExamples(it.first, it.second.count, it.second.examples)); + for (auto& issue : ci.second.issues) { + tryInsertIntoSamples(req.issues[issue], networkAddress, traceLogGroup); + } + + if (!ci.second.versions.size()) { + tryInsertIntoSamples(req.supportedVersions[ClientVersionRef()], networkAddress, traceLogGroup); + continue; + } + + ++req.clientCount; + StringRef maxProtocol; + for (auto& it : ci.second.versions) { + maxProtocol = std::max(maxProtocol, it.protocolVersion); + tryInsertIntoSamples(req.supportedVersions[it], networkAddress, traceLogGroup); + } + tryInsertIntoSamples(req.maxProtocolSupported[maxProtocol], networkAddress, traceLogGroup); } - req.supportedVersions.reserve(versionMap.size()); - for (auto& it : versionMap) { - req.supportedVersions.push_back( - ItemWithExamples>(it.first, it.second.count, it.second.examples)); - } - req.maxProtocolSupported.reserve(maxProtocolMap.size()); - for (auto& it : maxProtocolMap) { - req.maxProtocolSupported.push_back(ItemWithExamples(it.first, it.second.count, it.second.examples)); - } - req.clientCount = clientCount; return req; } diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index b08457a3ac..51271dd09e 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -257,13 +257,14 @@ ThreadFuture>> DLTransaction::getRangeSplitPoints(c }); } -ThreadFuture>> DLTransaction::getBlobGranuleRanges(const KeyRangeRef& keyRange) { +ThreadFuture>> DLTransaction::getBlobGranuleRanges(const KeyRangeRef& keyRange, + int rangeLimit) { if (!api->transactionGetBlobGranuleRanges) { return unsupported_operation(); } FdbCApi::FDBFuture* f = api->transactionGetBlobGranuleRanges( - tr, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), 
keyRange.end.size()); + tr, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size(), rangeLimit); return toThreadFuture>>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { const FdbCApi::FDBKeyRange* keyRanges; int keyRangesLength; @@ -279,10 +280,46 @@ ThreadResult DLTransaction::readBlobGranules(const KeyRangeRef& key Version beginVersion, Optional readVersion, ReadBlobGranuleContext granuleContext) { - if (!api->transactionReadBlobGranules) { + return unsupported_operation(); +} + +ThreadFuture>> DLTransaction::readBlobGranulesStart( + const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) { + if (!api->transactionReadBlobGranulesStart) { return unsupported_operation(); } + int64_t rv = readVersion.present() ? readVersion.get() : latestVersion; + + FdbCApi::FDBFuture* f = api->transactionReadBlobGranulesStart(tr, + keyRange.begin.begin(), + keyRange.begin.size(), + keyRange.end.begin(), + keyRange.end.size(), + beginVersion, + rv, + readVersionOut); + + return ThreadFuture>>( + (ThreadSingleAssignmentVar>>*)(f)); +}; + +ThreadResult DLTransaction::readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) { + if (!api->transactionReadBlobGranulesFinish) { + return unsupported_operation(); + } + + // convert back to fdb future for API + FdbCApi::FDBFuture* f = (FdbCApi::FDBFuture*)(startFuture.extractPtr()); + // FIXME: better way to convert here? FdbCApi::FDBReadBlobGranuleContext context; context.userContext = granuleContext.userContext; @@ -292,18 +329,18 @@ ThreadResult DLTransaction::readBlobGranules(const KeyRangeRef& key context.debugNoMaterialize = granuleContext.debugNoMaterialize; context.granuleParallelism = granuleContext.granuleParallelism; - int64_t rv = readVersion.present() ? 
readVersion.get() : latestVersion; + FdbCApi::FDBResult* r = api->transactionReadBlobGranulesFinish(tr, + f, + keyRange.begin.begin(), + keyRange.begin.size(), + keyRange.end.begin(), + keyRange.end.size(), + beginVersion, + readVersion, + &context); - FdbCApi::FDBResult* r = api->transactionReadBlobGranules(tr, - keyRange.begin.begin(), - keyRange.begin.size(), - keyRange.end.begin(), - keyRange.end.size(), - beginVersion, - rv, - context); return ThreadResult((ThreadSingleAssignmentVar*)(r)); -} +}; void DLTransaction::addReadConflictRange(const KeyRangeRef& keys) { throwIfError(api->transactionAddConflictRange( @@ -583,6 +620,71 @@ ThreadFuture DLDatabase::waitPurgeGranulesComplete(const KeyRef& purgeKey) return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { return Void(); }); } +ThreadFuture DLDatabase::blobbifyRange(const KeyRangeRef& keyRange) { + if (!api->databaseBlobbifyRange) { + return unsupported_operation(); + } + + FdbCApi::FDBFuture* f = api->databaseBlobbifyRange( + db, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size()); + + return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { + bool ret = false; + ASSERT(!api->futureGetBool(f, &ret)); + return ret; + }); +} + +ThreadFuture DLDatabase::unblobbifyRange(const KeyRangeRef& keyRange) { + if (!api->databaseUnblobbifyRange) { + return unsupported_operation(); + } + + FdbCApi::FDBFuture* f = api->databaseUnblobbifyRange( + db, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size()); + + return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { + bool ret = false; + ASSERT(!api->futureGetBool(f, &ret)); + return ret; + }); +} + +ThreadFuture>> DLDatabase::listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) { + if (!api->databaseListBlobbifiedRanges) { + return unsupported_operation(); + } + + FdbCApi::FDBFuture* f = api->databaseListBlobbifiedRanges( + db, 
keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size(), rangeLimit); + + return toThreadFuture>>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { + const FdbCApi::FDBKeyRange* keyRanges; + int keyRangesLength; + FdbCApi::fdb_error_t error = api->futureGetKeyRangeArray(f, &keyRanges, &keyRangesLength); + ASSERT(!error); + // The memory for this is stored in the FDBFuture and is released when the future gets destroyed. + return Standalone>(VectorRef((KeyRangeRef*)keyRanges, keyRangesLength), + Arena()); + }); +} + +ThreadFuture DLDatabase::verifyBlobRange(const KeyRangeRef& keyRange, Optional version) { + if (!api->databaseVerifyBlobRange) { + return unsupported_operation(); + } + + FdbCApi::FDBFuture* f = api->databaseVerifyBlobRange( + db, keyRange.begin.begin(), keyRange.begin.size(), keyRange.end.begin(), keyRange.end.size(), version); + + return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { + Version version = invalidVersion; + ASSERT(!api->futureGetInt64(f, &version)); + return version; + }); +} + // DLApi // Loads the specified function from a dynamic library @@ -626,6 +728,8 @@ void DLApi::init() { loadClientFunction(&api->selectApiVersion, lib, fdbCPath, "fdb_select_api_version_impl", headerVersion >= 0); loadClientFunction(&api->getClientVersion, lib, fdbCPath, "fdb_get_client_version", headerVersion >= 410); + loadClientFunction( + &api->useFutureProtocolVersion, lib, fdbCPath, "fdb_use_future_protocol_version", headerVersion >= 720); loadClientFunction(&api->setNetworkOption, lib, fdbCPath, "fdb_network_set_option", headerVersion >= 0); loadClientFunction(&api->setupNetwork, lib, fdbCPath, "fdb_setup_network", headerVersion >= 0); loadClientFunction(&api->runNetwork, lib, fdbCPath, "fdb_run_network", headerVersion >= 0); @@ -668,6 +772,13 @@ void DLApi::init() { fdbCPath, "fdb_database_wait_purge_granules_complete", headerVersion >= 710); + loadClientFunction(&api->databaseBlobbifyRange, lib, fdbCPath, 
"fdb_database_blobbify_range", headerVersion >= 720); + loadClientFunction( + &api->databaseUnblobbifyRange, lib, fdbCPath, "fdb_database_unblobbify_range", headerVersion >= 720); + loadClientFunction( + &api->databaseListBlobbifiedRanges, lib, fdbCPath, "fdb_database_list_blobbified_ranges", headerVersion >= 720); + loadClientFunction( + &api->databaseVerifyBlobRange, lib, fdbCPath, "fdb_database_verify_blob_range", headerVersion >= 720); loadClientFunction( &api->tenantCreateTransaction, lib, fdbCPath, "fdb_tenant_create_transaction", headerVersion >= 710); @@ -737,11 +848,22 @@ void DLApi::init() { headerVersion >= 710); loadClientFunction( &api->transactionReadBlobGranules, lib, fdbCPath, "fdb_transaction_read_blob_granules", headerVersion >= 710); + loadClientFunction(&api->transactionReadBlobGranulesStart, + lib, + fdbCPath, + "fdb_transaction_read_blob_granules_start", + headerVersion >= 720); + loadClientFunction(&api->transactionReadBlobGranulesFinish, + lib, + fdbCPath, + "fdb_transaction_read_blob_granules_finish", + headerVersion >= 720); loadClientFunction(&api->futureGetInt64, lib, fdbCPath, headerVersion >= 620 ? 
"fdb_future_get_int64" : "fdb_future_get_version", headerVersion >= 0); + loadClientFunction(&api->futureGetBool, lib, fdbCPath, "fdb_future_get_bool", headerVersion >= 720); loadClientFunction(&api->futureGetUInt64, lib, fdbCPath, "fdb_future_get_uint64", headerVersion >= 700); loadClientFunction(&api->futureGetError, lib, fdbCPath, "fdb_future_get_error", headerVersion >= 0); loadClientFunction(&api->futureGetKey, lib, fdbCPath, "fdb_future_get_key", headerVersion >= 0); @@ -788,6 +910,14 @@ const char* DLApi::getClientVersion() { return api->getClientVersion(); } +void DLApi::useFutureProtocolVersion() { + if (!api->useFutureProtocolVersion) { + return; + } + + api->useFutureProtocolVersion(); +} + void DLApi::setNetworkOption(FDBNetworkOptions::Option option, Optional value) { throwIfError(api->setNetworkOption(static_cast(option), value.present() ? value.get().begin() : nullptr, @@ -1069,9 +1199,10 @@ ThreadFuture>> MultiVersionTransaction::getRangeSpl } ThreadFuture>> MultiVersionTransaction::getBlobGranuleRanges( - const KeyRangeRef& keyRange) { + const KeyRangeRef& keyRange, + int rangeLimit) { auto tr = getTransaction(); - auto f = tr.transaction ? tr.transaction->getBlobGranuleRanges(keyRange) + auto f = tr.transaction ? tr.transaction->getBlobGranuleRanges(keyRange, rangeLimit) : makeTimeout>>(); return abortableFuture(f, tr.onChange); } @@ -1080,14 +1211,45 @@ ThreadResult MultiVersionTransaction::readBlobGranules(const KeyRan Version beginVersion, Optional readVersion, ReadBlobGranuleContext granuleContext) { + // FIXME: prevent from calling this from another main thread? 
auto tr = getTransaction(); if (tr.transaction) { - return tr.transaction->readBlobGranules(keyRange, beginVersion, readVersion, granuleContext); + Version readVersionOut; + auto f = tr.transaction->readBlobGranulesStart(keyRange, beginVersion, readVersion, &readVersionOut); + auto abortableF = abortableFuture(f, tr.onChange); + abortableF.blockUntilReadyCheckOnMainThread(); + if (abortableF.isError()) { + return ThreadResult(abortableF.getError()); + } + if (granuleContext.debugNoMaterialize) { + return ThreadResult(blob_granule_not_materialized()); + } + return tr.transaction->readBlobGranulesFinish( + abortableF, keyRange, beginVersion, readVersionOut, granuleContext); } else { return abortableTimeoutResult(tr.onChange); } } +ThreadFuture>> MultiVersionTransaction::readBlobGranulesStart( + const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) { + // can't call this directly + return ThreadFuture>>(unsupported_operation()); +} + +ThreadResult MultiVersionTransaction::readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) { + // can't call this directly + return ThreadResult(unsupported_operation()); +} + void MultiVersionTransaction::atomicOp(const KeyRef& key, const ValueRef& value, uint32_t operationType) { auto tr = getTransaction(); if (tr.transaction) { @@ -1579,6 +1741,32 @@ ThreadFuture MultiVersionDatabase::waitPurgeGranulesComplete(const KeyRef& return abortableFuture(f, dbState->dbVar->get().onChange); } +ThreadFuture MultiVersionDatabase::blobbifyRange(const KeyRangeRef& keyRange) { + auto dbVar = dbState->dbVar->get(); + auto f = dbVar.value ? 
dbVar.value->blobbifyRange(keyRange) : ThreadFuture(Never()); + return abortableFuture(f, dbVar.onChange); +} + +ThreadFuture MultiVersionDatabase::unblobbifyRange(const KeyRangeRef& keyRange) { + auto dbVar = dbState->dbVar->get(); + auto f = dbVar.value ? dbVar.value->unblobbifyRange(keyRange) : ThreadFuture(Never()); + return abortableFuture(f, dbVar.onChange); +} + +ThreadFuture>> MultiVersionDatabase::listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) { + auto dbVar = dbState->dbVar->get(); + auto f = dbVar.value ? dbVar.value->listBlobbifiedRanges(keyRange, rangeLimit) + : ThreadFuture>>(Never()); + return abortableFuture(f, dbVar.onChange); +} + +ThreadFuture MultiVersionDatabase::verifyBlobRange(const KeyRangeRef& keyRange, Optional version) { + auto dbVar = dbState->dbVar->get(); + auto f = dbVar.value ? dbVar.value->verifyBlobRange(keyRange, version) : ThreadFuture(Never()); + return abortableFuture(f, dbVar.onChange); +} + // Returns the protocol version reported by the coordinator this client is connected to // If an expected version is given, the future won't return until the protocol version is different than expected // Note: this will never return if the server is running a protocol from FDB 5.0 or older @@ -1644,7 +1832,7 @@ ThreadFuture MultiVersionDatabase::DatabaseState::monitorProtocolVersion() } ProtocolVersion clusterVersion = - !cv.isError() ? cv.get() : self->dbProtocolVersion.orDefault(currentProtocolVersion); + !cv.isError() ? 
cv.get() : self->dbProtocolVersion.orDefault(currentProtocolVersion()); onMainThreadVoid([self, clusterVersion]() { self->protocolVersionChanged(clusterVersion); }); return ErrorOr(Void()); }); @@ -1974,6 +2162,10 @@ const char* MultiVersionApi::getClientVersion() { return localClient->api->getClientVersion(); } +void MultiVersionApi::useFutureProtocolVersion() { + localClient->api->useFutureProtocolVersion(); +} + namespace { void validateOption(Optional value, bool canBePresent, bool canBeAbsent, bool canBeEmpty = true) { @@ -2006,7 +2198,7 @@ void MultiVersionApi::setCallbacksOnExternalThreads() { callbackOnMainThread = false; } -void MultiVersionApi::addExternalLibrary(std::string path) { +void MultiVersionApi::addExternalLibrary(std::string path, bool useFutureVersion) { std::string filename = basename(path); if (filename.empty() || !fileExists(path)) { @@ -2023,8 +2215,8 @@ void MultiVersionApi::addExternalLibrary(std::string path) { threadCount = std::max(threadCount, 1); if (externalClientDescriptions.count(filename) == 0) { - TraceEvent("AddingExternalClient").detail("LibraryPath", filename); - externalClientDescriptions.emplace(std::make_pair(filename, ClientDesc(path, true))); + TraceEvent("AddingExternalClient").detail("LibraryPath", filename).detail("UseFutureVersion", useFutureVersion); + externalClientDescriptions.emplace(std::make_pair(filename, ClientDesc(path, true, useFutureVersion))); } } @@ -2044,7 +2236,7 @@ void MultiVersionApi::addExternalLibraryDirectory(std::string path) { std::string lib = abspath(joinPath(path, filename)); if (externalClientDescriptions.count(filename) == 0) { TraceEvent("AddingExternalClient").detail("LibraryPath", filename); - externalClientDescriptions.emplace(std::make_pair(filename, ClientDesc(lib, true))); + externalClientDescriptions.emplace(std::make_pair(filename, ClientDesc(lib, true, false))); } } } @@ -2182,7 +2374,7 @@ void MultiVersionApi::setNetworkOptionInternal(FDBNetworkOptions::Option option, 
setCallbacksOnExternalThreads(); } else if (option == FDBNetworkOptions::EXTERNAL_CLIENT_LIBRARY) { validateOption(value, true, false, false); - addExternalLibrary(abspath(value.get().toString())); + addExternalLibrary(abspath(value.get().toString()), false); } else if (option == FDBNetworkOptions::EXTERNAL_CLIENT_DIRECTORY) { validateOption(value, true, false, false); addExternalLibraryDirectory(value.get().toString()); @@ -2213,6 +2405,9 @@ void MultiVersionApi::setNetworkOptionInternal(FDBNetworkOptions::Option option, } else if (option == FDBNetworkOptions::CLIENT_TMP_DIR) { validateOption(value, true, false, false); tmpDir = abspath(value.get().toString()); + } else if (option == FDBNetworkOptions::FUTURE_VERSION_CLIENT_LIBRARY) { + validateOption(value, true, false, false); + addExternalLibrary(abspath(value.get().toString()), true); } else { forwardOption = true; } @@ -2251,13 +2446,14 @@ void MultiVersionApi::setupNetwork() { for (auto i : externalClientDescriptions) { std::string path = i.second.libPath; std::string filename = basename(path); + bool useFutureVersion = i.second.useFutureVersion; // Copy external lib for each thread if (externalClients.count(filename) == 0) { externalClients[filename] = {}; for (const auto& tmp : copyExternalLibraryPerThread(path)) { externalClients[filename].push_back(Reference( - new ClientInfo(new DLApi(tmp.first, tmp.second /*unlink on load*/), path))); + new ClientInfo(new DLApi(tmp.first, tmp.second /*unlink on load*/), path, useFutureVersion))); } } } @@ -2297,6 +2493,9 @@ void MultiVersionApi::setupNetwork() { runOnExternalClientsAllThreads([this](Reference client) { TraceEvent("InitializingExternalClient").detail("LibraryPath", client->libPath); client->api->selectApiVersion(apiVersion); + if (client->useFutureVersion) { + client->api->useFutureProtocolVersion(); + } client->loadVersion(); }); diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 7c69c628c2..d41e5a0260 100644 --- 
a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -102,6 +103,8 @@ #endif #include "flow/actorcompiler.h" // This must be the last #include. +FDB_DEFINE_BOOLEAN_PARAM(CacheResult); + extern const char* getSourceVersion(); namespace { @@ -230,8 +233,9 @@ void DatabaseContext::getLatestCommitVersions(const Reference& loc VersionVector& latestCommitVersions) { latestCommitVersions.clear(); - if (info->debugID.present()) { - g_traceBatch.addEvent("TransactionDebug", info->debugID.get().first(), "NativeAPI.getLatestCommitVersions"); + if (info->readOptions.present() && info->readOptions.get().debugID.present()) { + g_traceBatch.addEvent( + "TransactionDebug", info->readOptions.get().debugID.get().first(), "NativeAPI.getLatestCommitVersions"); } if (!info->readVersionObtainedFromGrvProxy) { @@ -269,8 +273,8 @@ void DatabaseContext::getLatestCommitVersions(const Reference& loc } } } - // commitVersion == readVersion is common, do not log. - if (!updatedVersionMap && commitVersion != readVersion) { + // Do not log if commitVersion >= readVersion. + if (!updatedVersionMap && commitVersion == invalidVersion) { TraceEvent(SevDebug, "CommitVersionNotFoundForSS") .detail("InSSIDMap", iter != ssidTagMapping.end() ? 
1 : 0) .detail("Tag", tag) @@ -1278,32 +1282,6 @@ void DatabaseContext::registerSpecialKeysImpl(SpecialKeySpace::MODULE module, ACTOR Future getWorkerInterfaces(Reference clusterRecord); ACTOR Future> getJSON(Database db); -struct WorkerInterfacesSpecialKeyImpl : SpecialKeyRangeReadImpl { - Future getRange(ReadYourWritesTransaction* ryw, - KeyRangeRef kr, - GetRangeLimits limitsHint) const override { - if (ryw->getDatabase().getPtr() && ryw->getDatabase()->getConnectionRecord()) { - Key prefix = Key(getKeyRange().begin); - return map(getWorkerInterfaces(ryw->getDatabase()->getConnectionRecord()), - [prefix = prefix, kr = KeyRange(kr)](const RangeResult& in) { - RangeResult result; - for (const auto& [k_, v] : in) { - auto k = k_.withPrefix(prefix); - if (kr.contains(k)) - result.push_back_deep(result.arena(), KeyValueRef(k, v)); - } - - std::sort(result.begin(), result.end(), KeyValueRef::OrderByKey{}); - return result; - }); - } else { - return RangeResult(); - } - } - - explicit WorkerInterfacesSpecialKeyImpl(KeyRangeRef kr) : SpecialKeyRangeReadImpl(kr) {} -}; - struct SingleSpecialKeyImpl : SpecialKeyRangeReadImpl { Future getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr, @@ -1826,6 +1804,12 @@ DatabaseContext::~DatabaseContext() { it->second->notifyContextDestroyed(); ASSERT_ABORT(server_interf.empty()); locationCache.insert(allKeys, Reference()); + for (auto& it : notAtLatestChangeFeeds) { + it.second->context = nullptr; + } + for (auto& it : changeFeedUpdaters) { + it.second->context = nullptr; + } TraceEvent("DatabaseContextDestructed", dbId).backtrace(); } @@ -2987,16 +2971,14 @@ Future getKeyLocation(Reference trState, key, member, trState->spanContext, - trState->debugID, + trState->readOptions.present() ? 
trState->readOptions.get().debugID : Optional(), trState->useProvisionalProxies, isBackward, version); - if (trState->tenant().present() && useTenant && trState->tenantId == TenantInfo::INVALID_TENANT) { + if (trState->tenant().present() && useTenant && trState->tenantId() == TenantInfo::INVALID_TENANT) { return map(f, [trState](const KeyRangeLocationInfo& locationInfo) { - if (trState->tenantId == TenantInfo::INVALID_TENANT) { - trState->tenantId = locationInfo.tenantEntry.id; - } + trState->trySetTenantId(locationInfo.tenantEntry.id); return locationInfo; }); } else { @@ -3130,16 +3112,14 @@ Future> getKeyRangeLocations(ReferencespanContext, - trState->debugID, + trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(), trState->useProvisionalProxies, version); - if (trState->tenant().present() && useTenant && trState->tenantId == TenantInfo::INVALID_TENANT) { + if (trState->tenant().present() && useTenant && trState->tenantId() == TenantInfo::INVALID_TENANT) { return map(f, [trState](const std::vector& locationInfo) { ASSERT(!locationInfo.empty()); - if (trState->tenantId == TenantInfo::INVALID_TENANT) { - trState->tenantId = locationInfo[0].tenantEntry.id; - } + trState->trySetTenantId(locationInfo[0].tenantEntry.id); return locationInfo; }); } else { @@ -3154,16 +3134,16 @@ ACTOR Future warmRange_impl(Reference trState, KeyRange state Version version = wait(fVersion); loop { - std::vector locations = - wait(getKeyRangeLocations_internal(trState->cx, - trState->getTenantInfo(), - keys, - CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT, - Reverse::False, - trState->spanContext, - trState->debugID, - trState->useProvisionalProxies, - version)); + std::vector locations = wait(getKeyRangeLocations_internal( + trState->cx, + trState->getTenantInfo(), + keys, + CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT, + Reverse::False, + trState->spanContext, + trState->readOptions.present() ? 
trState->readOptions.get().debugID : Optional(), + trState->useProvisionalProxies, + version)); totalRanges += CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT; totalRequests++; if (locations.size() == 0 || totalRanges >= trState->cx->locationCacheSize || @@ -3242,6 +3222,8 @@ TenantInfo TransactionState::getTenantInfo(AllowInvalidTenantID allowInvalidId / if (options.rawAccess) { return TenantInfo(); + } else if (!cx->internal && cx->clientInfo->get().clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + throw management_cluster_invalid_access(); } else if (!cx->internal && cx->clientInfo->get().tenantMode == TenantMode::REQUIRED && !t.present()) { throw tenant_name_required(); } else if (!t.present()) { @@ -3257,8 +3239,8 @@ TenantInfo TransactionState::getTenantInfo(AllowInvalidTenantID allowInvalidId / } } - ASSERT(allowInvalidId || tenantId != TenantInfo::INVALID_TENANT); - return TenantInfo(t, authToken, tenantId); + ASSERT(allowInvalidId || tenantId_ != TenantInfo::INVALID_TENANT); + return TenantInfo(t, authToken, tenantId_); } // Returns the tenant used in this transaction. 
If the tenant is unset and raw access isn't specified, then the default @@ -3286,6 +3268,13 @@ bool TransactionState::hasTenant() const { return tenantSet && tenant_.present(); } +Future TransactionState::handleUnknownTenant() { + tenantId_ = TenantInfo::INVALID_TENANT; + ASSERT(tenant().present()); + cx->invalidateCachedTenant(tenant().get()); + return delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, taskID); +} + Future Transaction::warmRange(KeyRange keys) { return warmRange_impl(trState, keys, getReadVersion()); } @@ -3312,12 +3301,16 @@ ACTOR Future> getValue(Reference trState, state uint64_t startTime; state double startTimeD; state VersionVector ssLatestCommitVersions; + state Optional readOptions = trState->readOptions; + trState->cx->getLatestCommitVersions(locationInfo.locations, ver, trState, ssLatestCommitVersions); try { - if (trState->debugID.present()) { + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { getValueID = nondeterministicRandom()->randomUniqueID(); + readOptions.get().debugID = getValueID; - g_traceBatch.addAttach("GetValueAttachID", trState->debugID.get().first(), getValueID.get().first()); + g_traceBatch.addAttach( + "GetValueAttachID", trState->readOptions.get().debugID.get().first(), getValueID.get().first()); g_traceBatch.addEvent("GetValueDebug", getValueID.get().first(), "NativeAPI.getValue.Before"); //.detail("TaskID", g_network->getCurrentTask()); @@ -3345,13 +3338,12 @@ ACTOR Future> getValue(Reference trState, locationInfo.locations, &StorageServerInterface::getValue, GetValueRequest(span.context, - useTenant ? trState->getTenantInfo() : TenantInfo(), + useTenant ? trState->getTenantInfo() : TenantInfo(), key, ver, - trState->readType, trState->cx->sampleReadTags() ? 
trState->options.readTags : Optional(), - getValueID, + readOptions, ssLatestCommitVersions), TaskPriority::DefaultPromiseEndpoint, AtMostOnce::False, @@ -3405,9 +3397,8 @@ ACTOR Future> getValue(Reference trState, trState->cx->invalidateCache(locationInfo.tenantEntry.prefix, key); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else if (e.code() == error_code_unknown_tenant) { - ASSERT(useTenant && trState->tenant().present()); - trState->cx->invalidateCachedTenant(trState->tenant().get()); - wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); + ASSERT(useTenant); + wait(trState->handleUnknownTenant()); } else { if (trState->trLogInfo && recordLogInfo) trState->trLogInfo->addLog(FdbClientLogEvents::EventGetError(startTimeD, @@ -3427,12 +3418,16 @@ ACTOR Future getKey(Reference trState, UseTenant useTenant = UseTenant::True) { wait(success(version)); - state Optional getKeyID = Optional(); - state Span span("NAPI:getKey"_loc, trState->spanContext); - if (trState->debugID.present()) { - getKeyID = nondeterministicRandom()->randomUniqueID(); + state Optional getKeyID; + state Optional readOptions = trState->readOptions; - g_traceBatch.addAttach("GetKeyAttachID", trState->debugID.get().first(), getKeyID.get().first()); + state Span span("NAPI:getKey"_loc, trState->spanContext); + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { + getKeyID = nondeterministicRandom()->randomUniqueID(); + readOptions.get().debugID = getKeyID; + + g_traceBatch.addAttach( + "GetKeyAttachID", trState->readOptions.get().debugID.get().first(), getKeyID.get().first()); g_traceBatch.addEvent( "GetKeyDebug", getKeyID.get().first(), @@ -3474,9 +3469,8 @@ ACTOR Future getKey(Reference trState, useTenant ? trState->getTenantInfo() : TenantInfo(), k, version.get(), - trState->readType, trState->cx->sampleReadTags() ? 
trState->options.readTags : Optional(), - getKeyID, + readOptions, ssLatestCommitVersions); req.arena.dependsOn(k.arena()); @@ -3517,9 +3511,8 @@ ACTOR Future getKey(Reference trState, wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else if (e.code() == error_code_unknown_tenant) { - ASSERT(useTenant && trState->tenant().present()); - trState->cx->invalidateCachedTenant(trState->tenant().get()); - wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); + ASSERT(useTenant); + wait(trState->handleUnknownTenant()); } else { TraceEvent(SevInfo, "GetKeyError").error(e).detail("AtKey", k.getKey()).detail("Offset", k.offset); throw e; @@ -3530,8 +3523,8 @@ ACTOR Future getKey(Reference trState, ACTOR Future waitForCommittedVersion(Database cx, Version version, SpanContext spanContext) { state Span span("NAPI:waitForCommittedVersion"_loc, spanContext); - try { - loop { + loop { + try { choose { when(wait(cx->onProxiesChanged())) {} when(GetReadVersionReply v = wait(basicLoadBalance( @@ -3557,10 +3550,16 @@ ACTOR Future waitForCommittedVersion(Database cx, Version version, Span wait(delay(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, cx->taskID)); } } + } catch (Error& e) { + if (e.code() == error_code_batch_transaction_throttled || + e.code() == error_code_grv_proxy_memory_limit_exceeded) { + // GRV Proxy returns an error + wait(delayJittered(CLIENT_KNOBS->GRV_ERROR_RETRY_DELAY)); + } else { + TraceEvent(SevError, "WaitForCommittedVersionError").error(e); + throw; + } } - } catch (Error& e) { - TraceEvent(SevError, "WaitForCommittedVersionError").error(e); - throw; } } @@ -3753,7 +3752,7 @@ ACTOR Future sameVersionDiffValue(Database cx, Reference } // val_3 == val_2 (storage server value matches value passed into the function -> new watch) - if (valSS == parameters->value && tr.getTransactionState()->tenantId == parameters->tenant.tenantId) { + if (valSS == parameters->value && tr.getTransactionState()->tenantId() == 
parameters->tenant.tenantId) { metadata = makeReference(parameters); cx->setWatchMetadata(metadata); @@ -3923,7 +3922,7 @@ Future getExactRange(Reference trState, req.version = version; req.begin = firstGreaterOrEqual(range.begin); req.end = firstGreaterOrEqual(range.end); - req.readType = trState->readType; + setMatchIndex(req, matchIndex); req.spanContext = span.context; trState->cx->getLatestCommitVersions( @@ -3937,13 +3936,15 @@ Future getExactRange(Reference trState, // FIXME: buggify byte limits on internal functions that use them, instead of globally req.tags = trState->cx->sampleReadTags() ? trState->options.readTags : Optional(); - req.debugID = trState->debugID; + + req.options = trState->readOptions; try { - if (trState->debugID.present()) { - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getExactRange.Before"); - /*TraceEvent("TransactionDebugGetExactRangeInfo", trState->debugID.get()) + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.getExactRange.Before"); + /*TraceEvent("TransactionDebugGetExactRangeInfo", trState->readOptions.debugID.get()) .detail("ReqBeginKey", req.begin.getKey()) .detail("ReqEndKey", req.end.getKey()) .detail("ReqLimit", req.limit) @@ -3973,9 +3974,10 @@ Future getExactRange(Reference trState, ++trState->cx->transactionPhysicalReadsCompleted; throw; } - if (trState->debugID.present()) - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getExactRange.After"); + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.getExactRange.After"); output.arena().dependsOn(rep.arena); output.append(output.arena(), rep.data.begin(), rep.data.size()); @@ -4062,9 +4064,8 @@ Future 
getExactRange(Reference trState, wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); break; } else if (e.code() == error_code_unknown_tenant) { - ASSERT(useTenant && trState->tenant().present()); - trState->cx->invalidateCachedTenant(trState->tenant().get()); - wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); + ASSERT(useTenant); + wait(trState->handleUnknownTenant()); break; } else { TraceEvent(SevInfo, "GetExactRangeError") @@ -4304,7 +4305,7 @@ Future getRange(Reference trState, req.arena.dependsOn(mapper.arena()); setMatchIndex(req, matchIndex); req.tenantInfo = useTenant ? trState->getTenantInfo() : TenantInfo(); - req.readType = trState->readType; + req.options = trState->readOptions; req.version = readVersion; trState->cx->getLatestCommitVersions( @@ -4342,13 +4343,13 @@ Future getRange(Reference trState, ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse); req.tags = trState->cx->sampleReadTags() ? trState->options.readTags : Optional(); - req.debugID = trState->debugID; req.spanContext = span.context; try { - if (trState->debugID.present()) { - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getRange.Before"); - /*TraceEvent("TransactionDebugGetRangeInfo", trState->debugID.get()) + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.getRange.Before"); + /*TraceEvent("TransactionDebugGetRangeInfo", trState->readOptions.debugID.get()) .detail("ReqBeginKey", req.begin.getKey()) .detail("ReqEndKey", req.end.getKey()) .detail("OriginalBegin", originalBegin.toString()) @@ -4387,11 +4388,11 @@ Future getRange(Reference trState, throw; } - if (trState->debugID.present()) { + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { g_traceBatch.addEvent("TransactionDebug", - 
trState->debugID.get().first(), + trState->readOptions.get().debugID.get().first(), "NativeAPI.getRange.After"); //.detail("SizeOf", rep.data.size()); - /*TraceEvent("TransactionDebugGetRangeDone", trState->debugID.get()) + /*TraceEvent("TransactionDebugGetRangeDone", trState->readOptions.debugID.get()) .detail("ReqBeginKey", req.begin.getKey()) .detail("ReqEndKey", req.end.getKey()) .detail("RepIsMore", rep.more) @@ -4503,10 +4504,11 @@ Future getRange(Reference trState, } } catch (Error& e) { - if (trState->debugID.present()) { - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getRange.Error"); - TraceEvent("TransactionDebugError", trState->debugID.get()).error(e); + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.getRange.Error"); + TraceEvent("TransactionDebugError", trState->readOptions.get().debugID.get()).error(e); } if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || (e.code() == error_code_transaction_too_old && readVersion == latestVersion)) { @@ -4533,9 +4535,8 @@ Future getRange(Reference trState, wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else if (e.code() == error_code_unknown_tenant) { - ASSERT(useTenant && trState->tenant().present()); - trState->cx->invalidateCachedTenant(trState->tenant().get()); - wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); + ASSERT(useTenant); + wait(trState->handleUnknownTenant()); } else { if (trState->trLogInfo) trState->trLogInfo->addLog( @@ -4759,9 +4760,8 @@ ACTOR Future getRangeStreamFragment(Reference trState, req.spanContext = spanContext; req.limit = reverse ? 
-CLIENT_KNOBS->REPLY_BYTE_LIMIT : CLIENT_KNOBS->REPLY_BYTE_LIMIT; req.limitBytes = std::numeric_limits::max(); - // it is used to inform the storage that the rangeRead is for Fetch - // req.isFetchKeys = (trState->readType == ReadType::FETCH); - req.readType = trState->readType; + req.options = trState->readOptions; + trState->cx->getLatestCommitVersions( locations[shard].locations, req.version, trState, req.ssLatestCommitVersions); @@ -4772,12 +4772,12 @@ ACTOR Future getRangeStreamFragment(Reference trState, // FIXME: buggify byte limits on internal functions that use them, instead of globally req.tags = trState->cx->sampleReadTags() ? trState->options.readTags : Optional(); - req.debugID = trState->debugID; try { - if (trState->debugID.present()) { - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.RangeStream.Before"); + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.RangeStream.Before"); } ++trState->cx->transactionPhysicalReads; state GetKeyValuesStreamReply rep; @@ -4871,9 +4871,10 @@ ACTOR Future getRangeStreamFragment(Reference trState, } rep = GetKeyValuesStreamReply(); } - if (trState->debugID.present()) - g_traceBatch.addEvent( - "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getExactRange.After"); + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) + g_traceBatch.addEvent("TransactionDebug", + trState->readOptions.get().debugID.get().first(), + "NativeAPI.getExactRange.After"); RangeResult output(RangeResultRef(rep.data, rep.more), rep.arena); if (tssDuplicateStream.present() && !tssDuplicateStream.get().done()) { @@ -4994,9 +4995,7 @@ ACTOR Future getRangeStreamFragment(Reference trState, wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); break; } else if (e.code() == error_code_unknown_tenant) { - 
ASSERT(trState->tenant().present()); - trState->cx->invalidateCachedTenant(trState->tenant().get()); - wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); + wait(trState->handleUnknownTenant()); break; } else { results->sendError(e); @@ -5279,7 +5278,7 @@ ACTOR Future getTenantMetadata(Reference trState, Future populateAndGetTenant(Reference trState, Key const& key, Version version) { if (!trState->tenant().present() || key == metadataVersionKey) { return TenantInfo(); - } else if (trState->tenantId != TenantInfo::INVALID_TENANT) { + } else if (trState->tenantId() != TenantInfo::INVALID_TENANT) { return trState->getTenantInfo(); } else { return getTenantMetadata(trState, key, version); @@ -5354,7 +5353,7 @@ Future Transaction::watch(Reference watch) { trState->options.readTags, trState->spanContext, trState->taskID, - trState->debugID, + trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(), trState->useProvisionalProxies); } @@ -5773,7 +5772,9 @@ double Transaction::getBackoff(int errCode) { returnedBackoff *= deterministicRandom()->random01(); // Set backoff for next time - if (errCode == error_code_proxy_memory_limit_exceeded) { + if (errCode == error_code_commit_proxy_memory_limit_exceeded || + errCode == error_code_grv_proxy_memory_limit_exceeded) { + backoff = std::min(backoff * CLIENT_KNOBS->BACKOFF_GROWTH_RATE, CLIENT_KNOBS->RESOURCE_CONSTRAINED_MAX_BACKOFF); } else { backoff = std::min(backoff * CLIENT_KNOBS->BACKOFF_GROWTH_RATE, trState->options.maxBackoff); @@ -5979,7 +5980,7 @@ ACTOR static Future commitDummyTransaction(Reference trS tr.trState->options = trState->options; tr.trState->taskID = trState->taskID; tr.trState->authToken = trState->authToken; - tr.trState->tenantId = trState->tenantId; + tr.trState->trySetTenantId(trState->tenantId()); if (!trState->hasTenant()) { tr.setOption(FDBTransactionOptions::RAW_ACCESS); } else { @@ -6020,16 +6021,17 @@ void Transaction::setupWatches() { Future 
watchVersion = getCommittedVersion() > 0 ? getCommittedVersion() : getReadVersion(); for (int i = 0; i < watches.size(); ++i) - watches[i]->setWatch(watchValueMap(watchVersion, - trState->getTenantInfo(), - watches[i]->key, - watches[i]->value, - trState->cx, - trState->options.readTags, - trState->spanContext, - trState->taskID, - trState->debugID, - trState->useProvisionalProxies)); + watches[i]->setWatch( + watchValueMap(watchVersion, + trState->getTenantInfo(), + watches[i]->key, + watches[i]->value, + trState->cx, + trState->options.readTags, + trState->spanContext, + trState->taskID, + trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(), + trState->useProvisionalProxies)); watches.clear(); } catch (Error&) { @@ -6150,15 +6152,18 @@ ACTOR static Future tryCommit(Reference trState, state TraceInterval interval("TransactionCommit"); state double startTime = now(); state Span span("NAPI:tryCommit"_loc, trState->spanContext); - state Optional debugID = trState->debugID; + state Optional debugID = trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(); state TenantPrefixPrepended tenantPrefixPrepended = TenantPrefixPrepended::False; if (debugID.present()) { TraceEvent(interval.begin()).detail("Parent", debugID.get()); } try { if (CLIENT_BUGGIFY) { - throw deterministicRandom()->randomChoice(std::vector{ - not_committed(), transaction_too_old(), proxy_memory_limit_exceeded(), commit_unknown_result() }); + throw deterministicRandom()->randomChoice(std::vector{ not_committed(), + transaction_too_old(), + commit_proxy_memory_limit_exceeded(), + grv_proxy_memory_limit_exceeded(), + commit_unknown_result() }); } if (req.tagSet.present() && trState->options.priority < TransactionPriority::IMMEDIATE) { @@ -6317,12 +6322,15 @@ ACTOR static Future tryCommit(Reference trState, // retry it anyway (relying on transaction idempotence) but a client might do something else. 
throw commit_unknown_result(); } else if (e.code() == error_code_unknown_tenant) { + // Rather than reset the tenant and retry just the commit, we need to throw this error to the user and let + // them retry the whole transaction ASSERT(trState->tenant().present()); trState->cx->invalidateCachedTenant(trState->tenant().get()); throw; } else { if (e.code() != error_code_transaction_too_old && e.code() != error_code_not_committed && - e.code() != error_code_database_locked && e.code() != error_code_proxy_memory_limit_exceeded && + e.code() != error_code_database_locked && e.code() != error_code_commit_proxy_memory_limit_exceeded && + e.code() != error_code_grv_proxy_memory_limit_exceeded && e.code() != error_code_batch_transaction_throttled && e.code() != error_code_tag_throttled && e.code() != error_code_process_behind && e.code() != error_code_future_version && e.code() != error_code_tenant_not_found) { @@ -6548,10 +6556,10 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optional(value.get().printable(), TransactionLogInfo::DONT_LOG); trState->trLogInfo->maxFieldLength = trState->options.maxTransactionLoggingFieldLength; } - if (trState->debugID.present()) { + if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { TraceEvent(SevInfo, "TransactionBeingTraced") .detail("DebugTransactionID", trState->trLogInfo->identifier) - .detail("ServerTraceID", trState->debugID.get()); + .detail("ServerTraceID", trState->readOptions.get().debugID.get()); } break; @@ -6583,10 +6591,11 @@ void Transaction::setOption(FDBTransactionOptions::Option option, OptionalrandomUniqueID()); - if (trState->trLogInfo && !trState->trLogInfo->identifier.empty()) { + if (trState->trLogInfo && !trState->trLogInfo->identifier.empty() && trState->readOptions.present() && + trState->readOptions.get().debugID.present()) { TraceEvent(SevInfo, "TransactionBeingTraced") .detail("DebugTransactionID", trState->trLogInfo->identifier) - 
.detail("ServerTraceID", trState->debugID.get()); + .detail("ServerTraceID", trState->readOptions.get().debugID.get()); } break; @@ -6766,9 +6775,12 @@ ACTOR Future getConsistentReadVersion(SpanContext parentSpa } } } catch (Error& e) { - if (e.code() != error_code_broken_promise && e.code() != error_code_batch_transaction_throttled) + if (e.code() != error_code_broken_promise && e.code() != error_code_batch_transaction_throttled && + e.code() != error_code_grv_proxy_memory_limit_exceeded) TraceEvent(SevError, "GetConsistentReadVersionError").error(e); - if (e.code() == error_code_batch_transaction_throttled && !cx->apiVersionAtLeast(630)) { + if ((e.code() == error_code_batch_transaction_throttled || + e.code() == error_code_grv_proxy_memory_limit_exceeded) && + !cx->apiVersionAtLeast(630)) { wait(delayJittered(5.0)); } else { throw; @@ -7054,7 +7066,9 @@ Future Transaction::getReadVersion(uint32_t flags) { Location location = "NAPI:getReadVersion"_loc; SpanContext spanContext = generateSpanID(trState->cx->transactionTracingSample, trState->spanContext); - auto const req = DatabaseContext::VersionRequest(spanContext, trState->options.tags, trState->debugID); + Optional versionDebugID = + trState->readOptions.present() ? 
trState->readOptions.get().debugID : Optional(); + auto const req = DatabaseContext::VersionRequest(spanContext, trState->options.tags, versionDebugID); batcher.stream.send(req); trState->startTime = now(); readVersion = extractReadVersion(trState, location, spanContext, req.reply.getFuture(), metadataVersion); @@ -7212,14 +7226,16 @@ Future Transaction::onError(Error const& e) { return client_invalid_operation(); } if (e.code() == error_code_not_committed || e.code() == error_code_commit_unknown_result || - e.code() == error_code_database_locked || e.code() == error_code_proxy_memory_limit_exceeded || - e.code() == error_code_process_behind || e.code() == error_code_batch_transaction_throttled || - e.code() == error_code_tag_throttled) { + e.code() == error_code_database_locked || e.code() == error_code_commit_proxy_memory_limit_exceeded || + e.code() == error_code_grv_proxy_memory_limit_exceeded || e.code() == error_code_process_behind || + e.code() == error_code_batch_transaction_throttled || e.code() == error_code_tag_throttled || + e.code() == error_code_blob_granule_request_failed) { if (e.code() == error_code_not_committed) ++trState->cx->transactionsNotCommitted; else if (e.code() == error_code_commit_unknown_result) ++trState->cx->transactionsMaybeCommitted; - else if (e.code() == error_code_proxy_memory_limit_exceeded) + else if (e.code() == error_code_commit_proxy_memory_limit_exceeded || + e.code() == error_code_grv_proxy_memory_limit_exceeded) ++trState->cx->transactionsResourceConstrained; else if (e.code() == error_code_process_behind) ++trState->cx->transactionsProcessBehind; @@ -7607,9 +7623,7 @@ ACTOR Future>> getRangeSplitPoints(Referencecx->invalidateCache(locations[0].tenantEntry.prefix, keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } else if (e.code() == error_code_unknown_tenant) { - ASSERT(trState->tenant().present()); - trState->cx->invalidateCachedTenant(trState->tenant().get()); - 
wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); + wait(trState->handleUnknownTenant()); } else { TraceEvent(SevError, "GetRangeSplitPoints").error(e); throw; @@ -7623,22 +7637,19 @@ ACTOR Future blobGranuleGetTenantEntry(Transaction* self, Key ra Optional cachedLocationInfo = self->trState->cx->getCachedLocation(self->getTenant().get(), rangeStartKey, Reverse::False); if (!cachedLocationInfo.present()) { - KeyRangeLocationInfo l = wait(getKeyLocation_internal(self->trState->cx, - self->trState->getTenantInfo(AllowInvalidTenantID::True), - rangeStartKey, - self->trState->spanContext, - self->trState->debugID, - self->trState->useProvisionalProxies, - Reverse::False, - latestVersion)); - if (self->trState->tenantId == TenantInfo::INVALID_TENANT) { - self->trState->tenantId = l.tenantEntry.id; - } + KeyRangeLocationInfo l = wait(getKeyLocation_internal( + self->trState->cx, + self->trState->getTenantInfo(AllowInvalidTenantID::True), + rangeStartKey, + self->trState->spanContext, + self->trState->readOptions.present() ? 
self->trState->readOptions.get().debugID : Optional(), + self->trState->useProvisionalProxies, + Reverse::False, + latestVersion)); + self->trState->trySetTenantId(l.tenantEntry.id); return l.tenantEntry; } else { - if (self->trState->tenantId == TenantInfo::INVALID_TENANT) { - self->trState->tenantId = cachedLocationInfo.get().tenantEntry.id; - } + self->trState->trySetTenantId(cachedLocationInfo.get().tenantEntry.id); return cachedLocationInfo.get().tenantEntry; } } @@ -7652,7 +7663,9 @@ Future>> Transaction::getRangeSplitPoints(KeyRange // the blob granule requests are a bit funky because they piggyback off the existing transaction to read from the system // keyspace -ACTOR Future>> getBlobGranuleRangesActor(Transaction* self, KeyRange keyRange) { +ACTOR Future>> getBlobGranuleRangesActor(Transaction* self, + KeyRange keyRange, + int rangeLimit) { // FIXME: use streaming range read state KeyRange currentRange = keyRange; state Standalone> results; @@ -7675,7 +7688,7 @@ ACTOR Future>> getBlobGranuleRangesActor(Trans // basically krmGetRange, but enable it to not use tenant without RAW_ACCESS by doing manual getRange with // UseTenant::False - GetRangeLimits limits(1000); + GetRangeLimits limits(2 * rangeLimit + 2); limits.minRows = 2; RangeResult rawMapping = wait(getRange(self->trState, self->getReadVersion(), @@ -7697,6 +7710,9 @@ ACTOR Future>> getBlobGranuleRangesActor(Trans if (blobGranuleMapping[i].value.size()) { results.push_back(results.arena(), KeyRangeRef(blobGranuleMapping[i].key, blobGranuleMapping[i + 1].key)); + if (results.size() == rangeLimit) { + return results; + } } } results.arena().dependsOn(blobGranuleMapping.arena()); @@ -7708,8 +7724,8 @@ ACTOR Future>> getBlobGranuleRangesActor(Trans } } -Future>> Transaction::getBlobGranuleRanges(const KeyRange& range) { - return ::getBlobGranuleRangesActor(this, range); +Future>> Transaction::getBlobGranuleRanges(const KeyRange& range, int rangeLimit) { + return ::getBlobGranuleRangesActor(this, range, 
rangeLimit); } // hack (for now) to get blob worker interface into load balance @@ -7723,7 +7739,11 @@ ACTOR Future>> readBlobGranulesActor( KeyRange range, Version begin, Optional read, - Version* readVersionOut) { // read not present is "use transaction version" + Version* readVersionOut, + int chunkLimit, + bool summarize) { // read not present is "use transaction version" + + ASSERT(chunkLimit > 0); state RangeResult blobGranuleMapping; state Key granuleStartKey; @@ -7764,7 +7784,7 @@ ACTOR Future>> readBlobGranulesActor( // basically krmGetRange, but enable it to not use tenant without RAW_ACCESS by doing manual getRange with // UseTenant::False - GetRangeLimits limits(1000); + GetRangeLimits limits(CLIENT_KNOBS->BG_TOO_MANY_GRANULES); limits.minRows = 2; RangeResult rawMapping = wait(getRange(self->trState, self->getReadVersion(), @@ -7779,19 +7799,24 @@ ACTOR Future>> readBlobGranulesActor( blobGranuleMapping = krmDecodeRanges(prefix, range, rawMapping); } else { self->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - wait(store( - blobGranuleMapping, - krmGetRanges(self, blobGranuleMappingKeys.begin, keyRange, 1000, GetRangeLimits::BYTE_LIMIT_UNLIMITED))); + wait(store(blobGranuleMapping, + krmGetRanges(self, + blobGranuleMappingKeys.begin, + keyRange, + CLIENT_KNOBS->BG_TOO_MANY_GRANULES, + GetRangeLimits::BYTE_LIMIT_UNLIMITED))); } if (blobGranuleMapping.more) { if (BG_REQUEST_DEBUG) { fmt::print( "BG Mapping for [{0} - %{1}) too large!\n", keyRange.begin.printable(), keyRange.end.printable()); } - TraceEvent(SevWarn, "BGMappingTooLarge").detail("Range", range).detail("Max", 1000); + TraceEvent(SevWarn, "BGMappingTooLarge") + .detail("Range", range) + .detail("Max", CLIENT_KNOBS->BG_TOO_MANY_GRANULES); throw unsupported_operation(); } - ASSERT(!blobGranuleMapping.more && blobGranuleMapping.size() < CLIENT_KNOBS->TOO_MANY); + ASSERT(!blobGranuleMapping.more && blobGranuleMapping.size() <= CLIENT_KNOBS->BG_TOO_MANY_GRANULES); if 
(blobGranuleMapping.size() < 2) { throw blob_granule_transaction_too_old(); @@ -7810,7 +7835,6 @@ ACTOR Future>> readBlobGranulesActor( fmt::print("Key range [{0} - {1}) missing worker assignment!\n", granuleStartKey.printable(), granuleEndKey.printable()); - // TODO probably new exception type instead } throw blob_granule_transaction_too_old(); } @@ -7837,11 +7861,9 @@ ACTOR Future>> readBlobGranulesActor( getValue(self->trState, blobWorkerListKeyFor(workerId), self->getReadVersion(), UseTenant::False))); // from the time the mapping was read from the db, the associated blob worker // could have died and so its interface wouldn't be present as part of the blobWorkerList - // we persist in the db. So throw wrong_shard_server to get the new mapping + // we persist in the db. So throw blob_granule_request_failed to get the new mapping if (!workerInterface.present()) { - // need to re-read mapping, throw transaction_too_old so client retries. TODO better error? - // throw wrong_shard_server(); - throw transaction_too_old(); + throw blob_granule_request_failed(); } // FIXME: maybe just want to insert here if there are racing queries for the same worker or something? self->trState->cx->blobWorker_interf[workerId] = decodeBlobWorkerListValue(workerInterface.get()); @@ -7874,6 +7896,7 @@ ACTOR Future>> readBlobGranulesActor( req.readVersion = rv; req.tenantInfo = self->getTenant().present() ? 
self->trState->getTenantInfo() : TenantInfo(); req.canCollapseBegin = true; // TODO make this a parameter once we support it + req.summarize = summarize; std::vector>> v; v.push_back( @@ -7944,6 +7967,12 @@ ACTOR Future>> readBlobGranulesActor( chunkEndKey = chunkEndKey.removePrefix(tenantPrefix.get()); } keyRange = KeyRangeRef(std::min(chunkEndKey, keyRange.end), keyRange.end); + if (summarize && results.size() == chunkLimit) { + break; + } + } + if (summarize && results.size() == chunkLimit) { + break; } } // if we detect that this blob worker fails, cancel the request, as otherwise load balance will @@ -7969,10 +7998,8 @@ ACTOR Future>> readBlobGranulesActor( e.name()); } // worker is up but didn't actually have granule, or connection failed - if (e.code() == error_code_wrong_shard_server || e.code() == error_code_connection_failed || - e.code() == error_code_unknown_tenant) { - // need to re-read mapping, throw transaction_too_old so client retries. TODO better error? - throw transaction_too_old(); + if (e.code() == error_code_wrong_shard_server || e.code() == error_code_connection_failed) { + throw blob_granule_request_failed(); } throw e; } @@ -7992,7 +8019,32 @@ Future>> Transaction::readBlobGranules Version begin, Optional readVersion, Version* readVersionOut) { - return readBlobGranulesActor(this, range, begin, readVersion, readVersionOut); + return readBlobGranulesActor( + this, range, begin, readVersion, readVersionOut, std::numeric_limits::max(), false); +} + +ACTOR Future>> summarizeBlobGranulesActor(Transaction* self, + KeyRange range, + Version summaryVersion, + int rangeLimit) { + state Version readVersionOut; + Standalone> chunks = + wait(readBlobGranulesActor(self, range, 0, summaryVersion, &readVersionOut, rangeLimit, true)); + ASSERT(chunks.size() <= rangeLimit); + ASSERT(readVersionOut == summaryVersion); + Standalone> summaries; + summaries.reserve(summaries.arena(), chunks.size()); + for (auto& it : chunks) { + 
summaries.push_back(summaries.arena(), summarizeGranuleChunk(summaries.arena(), it)); + } + + return summaries; +} + +Future>> Transaction::summarizeBlobGranules(const KeyRange& range, + Version summaryVersion, + int rangeLimit) { + return summarizeBlobGranulesActor(this, range, summaryVersion, rangeLimit); } ACTOR Future setPerpetualStorageWiggle(Database cx, bool enable, LockAware lockAware) { @@ -8016,6 +8068,93 @@ ACTOR Future setPerpetualStorageWiggle(Database cx, bool enable, LockAw return version; } +ACTOR Future checkBlobSubrange(Database db, KeyRange keyRange, Optional version) { + state Transaction tr(db); + loop { + try { + state Version summaryVersion; + if (version.present()) { + summaryVersion = version.get(); + } else { + wait(store(summaryVersion, tr.getReadVersion())); + } + // same properties as a read for validating granule is readable, just much less memory and network bandwidth + // used + wait(success(tr.summarizeBlobGranules(keyRange, summaryVersion, std::numeric_limits::max()))); + return summaryVersion; + } catch (Error& e) { + wait(tr.onError(e)); + } + } +} + +ACTOR Future verifyBlobRangeActor(Reference cx, KeyRange range, Optional version) { + state Database db(cx); + state Transaction tr(db); + state Standalone> allRanges; + state KeyRange curRegion = KeyRangeRef(range.begin, range.begin); + state Version readVersionOut = invalidVersion; + state int batchSize = BUGGIFY ? deterministicRandom()->randomInt(2, 10) : CLIENT_KNOBS->BG_TOO_MANY_GRANULES / 2; + state int loadSize = (BUGGIFY ? 
deterministicRandom()->randomInt(1, 20) : 20) * batchSize; + loop { + if (curRegion.begin >= range.end) { + return readVersionOut; + } + loop { + try { + wait(store(allRanges, tr.getBlobGranuleRanges(KeyRangeRef(curRegion.begin, range.end), loadSize))); + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + + if (allRanges.empty()) { + if (curRegion.begin < range.end) { + return invalidVersion; + } + return readVersionOut; + } + + state std::vector> checkParts; + // Chunk up to smaller ranges than this limit. Must be smaller than BG_TOO_MANY_GRANULES to not hit the limit + int batchCount = 0; + for (auto& it : allRanges) { + if (it.begin != curRegion.end) { + return invalidVersion; + } + + curRegion = KeyRangeRef(curRegion.begin, it.end); + batchCount++; + + if (batchCount == batchSize) { + checkParts.push_back(checkBlobSubrange(db, curRegion, version)); + batchCount = 0; + curRegion = KeyRangeRef(curRegion.end, curRegion.end); + } + } + if (!curRegion.empty()) { + checkParts.push_back(checkBlobSubrange(db, curRegion, version)); + } + + try { + wait(waitForAll(checkParts)); + } catch (Error& e) { + if (e.code() == error_code_blob_granule_transaction_too_old) { + return invalidVersion; + } + throw e; + } + ASSERT(!checkParts.empty()); + readVersionOut = checkParts.back().get(); + curRegion = KeyRangeRef(curRegion.end, curRegion.end); + } +} + +Future DatabaseContext::verifyBlobRange(const KeyRange& range, Optional version) { + return verifyBlobRangeActor(Reference::addRef(this), range, version); +} + ACTOR Future>> readStorageWiggleValues(Database cx, bool primary, bool use_system_priority) { @@ -8645,38 +8784,28 @@ Future DatabaseContext::initSharedState() { } void DatabaseContext::setSharedState(DatabaseSharedState* p) { - ASSERT(p->protocolVersion == currentProtocolVersion); + ASSERT(p->protocolVersion == currentProtocolVersion()); sharedStatePtr = p; sharedStatePtr->refCount++; } ACTOR Future storageFeedVersionUpdater(StorageServerInterface interf, 
ChangeFeedStorageData* self) { - state Promise destroyed = self->destroyed; loop { - if (destroyed.isSet()) { - return Void(); - } if (self->version.get() < self->desired.get()) { wait(delay(CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME) || self->version.whenAtLeast(self->desired.get())); - if (destroyed.isSet()) { - return Void(); - } if (self->version.get() < self->desired.get()) { try { ChangeFeedVersionUpdateReply rep = wait(brokenPromiseToNever( interf.changeFeedVersionUpdate.getReply(ChangeFeedVersionUpdateRequest(self->desired.get())))); - if (rep.version > self->version.get()) { self->version.set(rep.version); } } catch (Error& e) { - if (e.code() == error_code_server_overloaded) { - if (FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY > CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME) { - wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY - - CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME)); - } - } else { - throw e; + if (e.code() != error_code_server_overloaded) { + throw; + } + if (FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY > CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME) { + wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY - CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME)); } } } @@ -8695,10 +8824,53 @@ Reference DatabaseContext::getStorageData(StorageServerIn newStorageUpdater->id = interf.id(); newStorageUpdater->interfToken = token; newStorageUpdater->updater = storageFeedVersionUpdater(interf, newStorageUpdater.getPtr()); - changeFeedUpdaters[token] = newStorageUpdater; + newStorageUpdater->context = this; + newStorageUpdater->created = now(); + changeFeedUpdaters[token] = newStorageUpdater.getPtr(); return newStorageUpdater; } - return it->second; + return Reference::addRef(it->second); +} + +Version DatabaseContext::getMinimumChangeFeedVersion() { + Version minVersion = std::numeric_limits::max(); + for (auto& it : changeFeedUpdaters) { + if (now() - it.second->created > CLIENT_KNOBS->CHANGE_FEED_START_INTERVAL) { + minVersion = std::min(minVersion, it.second->version.get()); + } + } + for 
(auto& it : notAtLatestChangeFeeds) { + if (now() - it.second->created > CLIENT_KNOBS->CHANGE_FEED_START_INTERVAL) { + minVersion = std::min(minVersion, it.second->getVersion()); + } + } + return minVersion; +} + +void DatabaseContext::setDesiredChangeFeedVersion(Version v) { + for (auto& it : changeFeedUpdaters) { + if (it.second->version.get() < v && it.second->desired.get() < v) { + it.second->desired.set(v); + } + } +} + +ChangeFeedStorageData::~ChangeFeedStorageData() { + if (context) { + context->changeFeedUpdaters.erase(interfToken); + } +} + +ChangeFeedData::ChangeFeedData(DatabaseContext* context) + : dbgid(deterministicRandom()->randomUniqueID()), context(context), notAtLatest(1), created(now()) { + if (context) { + context->notAtLatestChangeFeeds[dbgid] = this; + } +} +ChangeFeedData::~ChangeFeedData() { + if (context) { + context->notAtLatestChangeFeeds.erase(dbgid); + } } Version ChangeFeedData::getVersion() { @@ -8892,6 +9064,9 @@ ACTOR Future partialChangeFeedStream(StorageServerInterface interf, if (refresh.canBeSet() && !atLatestVersion && rep.atLatestVersion) { atLatestVersion = true; feedData->notAtLatest.set(feedData->notAtLatest.get() - 1); + if (feedData->notAtLatest.get() == 0 && feedData->context) { + feedData->context->notAtLatestChangeFeeds.erase(feedData->dbgid); + } } if (refresh.canBeSet() && rep.minStreamVersion > storageData->version.get()) { storageData->version.set(rep.minStreamVersion); @@ -9082,11 +9257,6 @@ ACTOR Future mergeChangeFeedStream(Reference db, results->streams.push_back(it.first.changeFeedStream.getReplyStream(req)); } - for (auto& it : results->storageData) { - if (it->debugGetReferenceCount() == 2) { - db->changeFeedUpdaters.erase(it->interfToken); - } - } results->maxSeenVersion = invalidVersion; results->storageData.clear(); Promise refresh = results->refresh; @@ -9095,6 +9265,10 @@ ACTOR Future mergeChangeFeedStream(Reference db, results->storageData.push_back(db->getStorageData(interfs[i].first)); } 
results->notAtLatest.set(interfs.size()); + if (results->context) { + results->context->notAtLatestChangeFeeds[results->dbgid] = results.getPtr(); + results->created = now(); + } refresh.send(Void()); for (int i = 0; i < interfs.size(); i++) { @@ -9137,6 +9311,8 @@ ACTOR Future getChangeFeedRange(Reference db, Databas loop { try { tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); Version readVer = wait(tr.getReadVersion()); if (readVer < begin) { wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY)); @@ -9183,10 +9359,21 @@ ACTOR Future singleChangeFeedStreamInternal(KeyRange range, // update lastReturned once the previous mutation has been consumed if (*begin - 1 > results->lastReturnedVersion.get()) { results->lastReturnedVersion.set(*begin - 1); + if (!refresh.canBeSet()) { + try { + // refresh is set if and only if this actor is cancelled + wait(Future(Void())); + // Catch any unexpected behavior if the above contract is broken + ASSERT(false); + } catch (Error& e) { + ASSERT(e.code() == error_code_actor_cancelled); + throw; + } + } } loop { - + ASSERT(refresh.canBeSet()); state ChangeFeedStreamReply feedReply = waitNext(results->streams[0].getFuture()); *begin = feedReply.mutations.back().version + 1; @@ -9236,6 +9423,9 @@ ACTOR Future singleChangeFeedStreamInternal(KeyRange range, if (!atLatest && feedReply.atLatestVersion) { atLatest = true; results->notAtLatest.set(0); + if (results->context) { + results->context->notAtLatestChangeFeeds.erase(results->dbgid); + } } if (feedReply.minStreamVersion > results->storageData[0]->version.get()) { @@ -9274,11 +9464,6 @@ ACTOR Future singleChangeFeedStream(Reference db, results->streams.clear(); - for (auto& it : results->storageData) { - if (it->debugGetReferenceCount() == 2) { - db->changeFeedUpdaters.erase(it->interfToken); - } - } 
results->streams.push_back(interf.changeFeedStream.getReplyStream(req)); results->maxSeenVersion = invalidVersion; @@ -9287,6 +9472,10 @@ ACTOR Future singleChangeFeedStream(Reference db, Promise refresh = results->refresh; results->refresh = Promise(); results->notAtLatest.set(1); + if (results->context) { + results->context->notAtLatestChangeFeeds[results->dbgid] = results.getPtr(); + results->created = now(); + } refresh.send(Void()); wait(results->streams[0].onError() || singleChangeFeedStreamInternal(range, results, rangeID, begin, end)); @@ -9395,11 +9584,6 @@ ACTOR Future getChangeFeedStreamActor(Reference db, } } catch (Error& e) { if (e.code() == error_code_actor_cancelled || e.code() == error_code_change_feed_popped) { - for (auto& it : results->storageData) { - if (it->debugGetReferenceCount() == 2) { - db->changeFeedUpdaters.erase(it->interfToken); - } - } results->streams.clear(); results->storageData.clear(); if (e.code() == error_code_change_feed_popped) { @@ -9413,11 +9597,15 @@ ACTOR Future getChangeFeedStreamActor(Reference db, } if (results->notAtLatest.get() == 0) { results->notAtLatest.set(1); + if (results->context) { + results->context->notAtLatestChangeFeeds[results->dbgid] = results.getPtr(); + results->created = now(); + } } if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || e.code() == error_code_connection_failed || e.code() == error_code_unknown_change_feed || - e.code() == error_code_broken_promise) { + e.code() == error_code_broken_promise || e.code() == error_code_future_version) { db->changeFeedCache.erase(rangeID); cx->invalidateCache(Key(), keys); if (begin == lastBeginVersion) { @@ -9431,11 +9619,6 @@ ACTOR Future getChangeFeedStreamActor(Reference db, } else { results->mutations.sendError(e); results->refresh.sendError(change_feed_cancelled()); - for (auto& it : results->storageData) { - if (it->debugGetReferenceCount() == 2) { - db->changeFeedUpdaters.erase(it->interfToken); - } - 
} results->streams.clear(); results->storageData.clear(); return Void(); @@ -9544,7 +9727,8 @@ ACTOR Future getOverlappingChangeFeedsActor(Referenc } return result; } catch (Error& e) { - if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { + if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || + e.code() == error_code_future_version) { cx->invalidateCache(Key(), range); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY)); } else { @@ -9563,6 +9747,8 @@ ACTOR static Future popChangeFeedBackup(Database cx, Key rangeID, Version loop { try { tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); state Key rangeIDKey = rangeID.withPrefix(changeFeedPrefix); Optional val = wait(tr.get(rangeIDKey)); if (val.present()) { @@ -9666,6 +9852,7 @@ Reference DatabaseContext::createTransaction() { return makeReference(Database(Reference::addRef(this))); } +// BlobGranule API. 
ACTOR Future purgeBlobGranulesActor(Reference db, KeyRange range, Version purgeVersion, @@ -9677,15 +9864,11 @@ ACTOR Future purgeBlobGranulesActor(Reference db, state KeyRange purgeRange = range; state bool loadedTenantPrefix = false; - // FIXME: implement force - if (force) { - throw unsupported_operation(); - } - loop { try { tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); if (tenant.present() && !loadedTenantPrefix) { TenantMapEntry tenantEntry = wait(blobGranuleGetTenantEntry(&tr, range.begin)); @@ -9762,6 +9945,111 @@ Future DatabaseContext::waitPurgeGranulesComplete(Key purgeKey) { return waitPurgeGranulesCompleteActor(Reference::addRef(this), purgeKey); } +ACTOR Future>> getBlobRanges(Reference tr, + KeyRange range, + int batchLimit) { + state Standalone> blobRanges; + state Key beginKey = range.begin; + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + state RangeResult results = wait( + krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2)); + + blobRanges.arena().dependsOn(results.arena()); + for (int i = 0; i < results.size() - 1; i++) { + if (results[i].value == blobRangeActive) { + blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key)); + } + if (blobRanges.size() == batchLimit) { + return blobRanges; + } + } + + if (!results.more) { + return blobRanges; + } + beginKey = results.back().key; + } catch (Error& e) { + wait(tr->onError(e)); + } + } +} + +ACTOR Future setBlobRangeActor(Reference cx, KeyRange range, bool active) { + state Database db(cx); + state Reference tr = makeReference(db); + + state Value value = active ? 
blobRangeActive : blobRangeInactive; + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + + state Standalone> startBlobRanges = wait(getBlobRanges(tr, range, 10)); + state Standalone> endBlobRanges = + wait(getBlobRanges(tr, KeyRangeRef(range.end, keyAfter(range.end)), 10)); + + if (active) { + // Idempotent request. + if (!startBlobRanges.empty() && !endBlobRanges.empty()) { + return startBlobRanges.front().begin == range.begin && endBlobRanges.front().end == range.end; + } + } else { + // An unblobbify request must be aligned to boundaries. + // It is okay to unblobbify multiple regions all at once. + if (startBlobRanges.empty() && endBlobRanges.empty()) { + return true; + } + // If there is a blob at the beginning of the range and it isn't aligned, + // or there is a blob range that begins before the end of the range, then fail. + if ((!startBlobRanges.empty() && startBlobRanges.front().begin != range.begin) || + (!endBlobRanges.empty() && endBlobRanges.front().begin < range.end)) { + return false; + } + } + + tr->set(blobRangeChangeKey, deterministicRandom()->randomUniqueID().toString()); + // This is not coalescing because we want to keep each range logically separate. 
+ wait(krmSetRange(tr, blobRangeKeys.begin, range, value)); + wait(tr->commit()); + printf("Successfully updated blob range [%s - %s) to %s\n", + range.begin.printable().c_str(), + range.end.printable().c_str(), + value.printable().c_str()); + return true; + } catch (Error& e) { + wait(tr->onError(e)); + } + } +} + +Future DatabaseContext::blobbifyRange(KeyRange range) { + return setBlobRangeActor(Reference::addRef(this), range, true); +} + +Future DatabaseContext::unblobbifyRange(KeyRange range) { + return setBlobRangeActor(Reference::addRef(this), range, false); +} + +ACTOR Future>> listBlobbifiedRangesActor(Reference cx, + KeyRange range, + int rangeLimit) { + state Database db(cx); + state Reference tr = makeReference(db); + + state Standalone> blobRanges = wait(getBlobRanges(tr, range, rangeLimit)); + + return blobRanges; +} + +Future>> DatabaseContext::listBlobbifiedRanges(KeyRange range, int rowLimit) { + return listBlobbifiedRangesActor(Reference::addRef(this), range, rowLimit); +} + int64_t getMaxKeySize(KeyRef const& key) { return getMaxWriteKeySize(key, true); } diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index 0635358402..892c9b84a0 100644 --- a/fdbclient/ReadYourWrites.actor.cpp +++ b/fdbclient/ReadYourWrites.actor.cpp @@ -681,7 +681,8 @@ public: break; if (it.is_unknown_range()) { - if (limits.hasByteLimit() && result.size() && itemsPastEnd >= 1 - end.offset) { + if (limits.hasByteLimit() && limits.hasSatisfiedMinRows() && result.size() && + itemsPastEnd >= 1 - end.offset) { result.more = true; break; } @@ -1783,7 +1784,8 @@ Future>> ReadYourWritesTransaction::getRangeSplitPo return waitOrError(tr.getRangeSplitPoints(range, chunkSize), resetPromise.getFuture()); } -Future>> ReadYourWritesTransaction::getBlobGranuleRanges(const KeyRange& range) { +Future>> ReadYourWritesTransaction::getBlobGranuleRanges(const KeyRange& range, + int rangeLimit) { if (checkUsedDuringCommit()) { return used_during_commit(); } @@ 
-1794,7 +1796,7 @@ Future>> ReadYourWritesTransaction::getBlobGra if (range.begin > maxKey || range.end > maxKey) return key_outside_legal_range(); - return waitOrError(tr.getBlobGranuleRanges(range), resetPromise.getFuture()); + return waitOrError(tr.getBlobGranuleRanges(range, rangeLimit), resetPromise.getFuture()); } Future>> ReadYourWritesTransaction::readBlobGranules( diff --git a/fdbclient/S3BlobStore.actor.cpp b/fdbclient/S3BlobStore.actor.cpp index 8054b778c8..ce99e30ac8 100644 --- a/fdbclient/S3BlobStore.actor.cpp +++ b/fdbclient/S3BlobStore.actor.cpp @@ -735,16 +735,21 @@ ACTOR Future connect_impl(Referenceknobs.secure_connection ? "https" : "http"; } bool isTLS = b->knobs.secure_connection == 1; + state Reference conn; if (b->useProxy) { - // TODO(renxuan): Support http proxy + TLS - if (isTLS || b->service == "443") { - fprintf(stderr, "ERROR: TLS is not supported yet when using HTTP proxy.\n"); - throw connection_failed(); + if (isTLS) { + Reference _conn = + wait(HTTP::proxyConnect(host, service, b->proxyHost.get(), b->proxyPort.get())); + conn = _conn; + } else { + host = b->proxyHost.get(); + service = b->proxyPort.get(); + Reference _conn = wait(INetworkConnections::net()->connect(host, service, false)); + conn = _conn; } - host = b->proxyHost.get(); - service = b->proxyPort.get(); + } else { + wait(store(conn, INetworkConnections::net()->connect(host, service, isTLS))); } - state Reference conn = wait(INetworkConnections::net()->connect(host, service, isTLS)); wait(conn->connectHandshake()); TraceEvent("S3BlobStoreEndpointNewConnection") @@ -892,7 +897,7 @@ ACTOR Future> doRequest_impl(ReferenceuseProxy) { + if (bstore->useProxy && bstore->knobs.secure_connection == 0) { // Has to be in absolute-form. 
canonicalURI = "http://" + bstore->host + ":" + bstore->service + canonicalURI; } diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 7cc2079e27..7f3a3c658b 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -427,7 +427,9 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "log_server_min_free_space", "log_server_min_free_space_ratio", "storage_server_durability_lag", - "storage_server_list_fetch_failed" + "storage_server_list_fetch_failed", + "blob_worker_lag", + "blob_worker_missing" ] }, "description":"The database is not being saturated by the workload." @@ -448,7 +450,9 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "log_server_min_free_space", "log_server_min_free_space_ratio", "storage_server_durability_lag", - "storage_server_list_fetch_failed" + "storage_server_list_fetch_failed", + "blob_worker_lag", + "blob_worker_missing" ] }, "description":"The database is not being saturated by the workload." 
diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 948b3a1a9a..e71f1bdb5f 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -50,7 +50,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi // TLogs init( TLOG_TIMEOUT, 0.4 ); //cannot buggify because of availability init( TLOG_SLOW_REJOIN_WARN_TIMEOUT_SECS, 60 ); if( randomize && BUGGIFY ) TLOG_SLOW_REJOIN_WARN_TIMEOUT_SECS = deterministicRandom()->randomInt(5,10); - init( RECOVERY_TLOG_SMART_QUORUM_DELAY, 0.25 ); if( randomize && BUGGIFY ) RECOVERY_TLOG_SMART_QUORUM_DELAY = 0.0; // smaller might be better for bug amplification init( TLOG_STORAGE_MIN_UPDATE_INTERVAL, 0.5 ); init( BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL, 30 ); init( DESIRED_TOTAL_BYTES, 150000 ); if( randomize && BUGGIFY ) DESIRED_TOTAL_BYTES = 10000; @@ -58,10 +57,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( UPDATE_DELAY, 0.001 ); init( MAXIMUM_PEEK_BYTES, 10e6 ); init( APPLY_MUTATION_BYTES, 1e6 ); - init( RECOVERY_DATA_BYTE_LIMIT, 100000 ); - init( BUGGIFY_RECOVERY_DATA_LIMIT, 1000 ); - init( LONG_TLOG_COMMIT_TIME, 0.25 ); //cannot buggify because of recovery time - init( LARGE_TLOG_COMMIT_BYTES, 4<<20 ); init( BUGGIFY_RECOVER_MEMORY_LIMIT, 1e6 ); init( BUGGIFY_WORKER_REMOVED_MAX_LAG, 30 ); init( UPDATE_STORAGE_BYTE_LIMIT, 1e6 ); @@ -94,7 +89,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( MAX_CACHE_VERSIONS, 10e6 ); init( TLOG_IGNORE_POP_AUTO_ENABLE_DELAY, 300.0 ); init( TXS_POPPED_MAX_DELAY, 1.0 ); if ( randomize && BUGGIFY ) TXS_POPPED_MAX_DELAY = deterministicRandom()->random01(); - init( TLOG_MAX_CREATE_DURATION, 10.0 ); + // In some rare simulation tests, particularly with log_spill:=1 configured, the 10 second limit is exceeded, causing SevError trace events + // and simulation test failure. 
Increasing the knob value to 15.0 in simulation is a workaround to avoid these failures. + init( TLOG_MAX_CREATE_DURATION, 10.0 ); if (isSimulated) TLOG_MAX_CREATE_DURATION = 15.0; init( PEEK_LOGGING_AMOUNT, 5 ); init( PEEK_LOGGING_DELAY, 5.0 ); init( PEEK_RESET_INTERVAL, 300.0 ); if ( randomize && BUGGIFY ) PEEK_RESET_INTERVAL = 20.0; @@ -133,16 +130,15 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( BG_REBALANCE_POLLING_INTERVAL, 10.0 ); init( BG_REBALANCE_SWITCH_CHECK_INTERVAL, 5.0 ); if (randomize && BUGGIFY) BG_REBALANCE_SWITCH_CHECK_INTERVAL = 1.0; init( DD_QUEUE_LOGGING_INTERVAL, 5.0 ); + init( DD_QUEUE_COUNTER_REFRESH_INTERVAL, 60.0 ); + // 100 / 60 < 2 trace/sec ~ 2 * 200 = 400b/sec + init( DD_QUEUE_COUNTER_MAX_LOG, 100 ); if( randomize && BUGGIFY ) DD_QUEUE_COUNTER_MAX_LOG = 1; + init( DD_QUEUE_COUNTER_SUMMARIZE, true ); init( RELOCATION_PARALLELISM_PER_SOURCE_SERVER, 2 ); if( randomize && BUGGIFY ) RELOCATION_PARALLELISM_PER_SOURCE_SERVER = 1; init( RELOCATION_PARALLELISM_PER_DEST_SERVER, 10 ); if( randomize && BUGGIFY ) RELOCATION_PARALLELISM_PER_DEST_SERVER = 1; // Note: if this is smaller than FETCH_KEYS_PARALLELISM, this will artificially reduce performance. The current default of 10 is probably too high but is set conservatively for now. 
init( DD_QUEUE_MAX_KEY_SERVERS, 100 ); if( randomize && BUGGIFY ) DD_QUEUE_MAX_KEY_SERVERS = 1; init( DD_REBALANCE_PARALLELISM, 50 ); init( DD_REBALANCE_RESET_AMOUNT, 30 ); - init( BG_DD_MAX_WAIT, 120.0 ); - init( BG_DD_MIN_WAIT, 0.1 ); - init( BG_DD_INCREASE_RATE, 1.10 ); - init( BG_DD_DECREASE_RATE, 1.02 ); - init( BG_DD_SATURATION_DELAY, 1.0 ); init( INFLIGHT_PENALTY_HEALTHY, 1.0 ); init( INFLIGHT_PENALTY_UNHEALTHY, 500.0 ); init( INFLIGHT_PENALTY_ONE_LEFT, 1000.0 ); @@ -165,9 +161,14 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( PRIORITY_TEAM_FAILED, 805 ); init( PRIORITY_TEAM_0_LEFT, 809 ); init( PRIORITY_SPLIT_SHARD, 950 ); if( randomize && BUGGIFY ) PRIORITY_SPLIT_SHARD = 350; + init( PRIORITY_ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD, 960 ); if( randomize && BUGGIFY ) PRIORITY_ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD = 360; // Set as the lowest priority // Data distribution init( SHARD_ENCODE_LOCATION_METADATA, false ); if( randomize && BUGGIFY ) SHARD_ENCODE_LOCATION_METADATA = true; + init( ENABLE_DD_PHYSICAL_SHARD, false ); // EXPERIMENTAL; If true, SHARD_ENCODE_LOCATION_METADATA must be true; When true, optimization of data move between DCs is disabled + init( MAX_PHYSICAL_SHARD_BYTES, 500000000 ); // 500 MB; for ENABLE_DD_PHYSICAL_SHARD; smaller leads to larger number of physicalShard per storage server + init( PHYSICAL_SHARD_METRICS_DELAY, 300.0 ); // 300 seconds; for ENABLE_DD_PHYSICAL_SHARD + init( ANONYMOUS_PHYSICAL_SHARD_TRANSITION_TIME, 600.0 ); if( randomize && BUGGIFY ) ANONYMOUS_PHYSICAL_SHARD_TRANSITION_TIME = 0.0; // 600 seconds; for ENABLE_DD_PHYSICAL_SHARD init( READ_REBALANCE_CPU_THRESHOLD, 15.0 ); init( READ_REBALANCE_SRC_PARALLELISM, 20 ); init( READ_REBALANCE_SHARD_TOPK, READ_REBALANCE_SRC_PARALLELISM * 2 ); @@ -250,7 +251,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( SERVER_LIST_DELAY, 1.0 ); init( RECRUITMENT_IDLE_DELAY, 1.0 ); init( 
STORAGE_RECRUITMENT_DELAY, 10.0 ); - init( BLOB_WORKER_RECRUITMENT_DELAY, 10.0 ); init( TSS_HACK_IDENTITY_MAPPING, false ); // THIS SHOULD NEVER BE SET IN PROD. Only for performance testing init( TSS_RECRUITMENT_TIMEOUT, 3*STORAGE_RECRUITMENT_DELAY ); if (randomize && BUGGIFY ) TSS_RECRUITMENT_TIMEOUT = 1.0; // Super low timeout should cause tss recruitments to fail init( TSS_DD_CHECK_INTERVAL, 60.0 ); if (randomize && BUGGIFY ) TSS_DD_CHECK_INTERVAL = 1.0; // May kill all TSS quickly @@ -276,7 +276,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DD_FAILURE_TIME, 1.0 ); if( randomize && BUGGIFY ) DD_FAILURE_TIME = 10.0; init( DD_ZERO_HEALTHY_TEAM_DELAY, 1.0 ); init( REMOTE_KV_STORE, false ); - init( REMOTE_KV_STORE_INIT_DELAY, 0.1 ); + init( REBOOT_KV_STORE_DELAY, 0.1 ); init( REMOTE_KV_STORE_MAX_INIT_DURATION, 10.0 ); init( REBALANCE_MAX_RETRIES, 100 ); init( DD_OVERLAP_PENALTY, 10000 ); @@ -292,8 +292,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY, 120 ); if( randomize && BUGGIFY ) DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY = 5; init( DD_STORAGE_WIGGLE_PAUSE_THRESHOLD, 10 ); if( randomize && BUGGIFY ) DD_STORAGE_WIGGLE_PAUSE_THRESHOLD = 1000; init( DD_STORAGE_WIGGLE_STUCK_THRESHOLD, 20 ); + init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC, isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 0: 120; init( DD_TENANT_AWARENESS_ENABLED, false ); - init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2.0 ); + init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); + // TeamRemover init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. 
disable the consistency check when it's true @@ -371,6 +373,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( REPLACE_CONTENTS_BYTES, 1e5 ); // KeyValueStoreRocksDB + init( ROCKSDB_READ_RANGE_ROW_LIMIT, 65535 ); if( randomize && BUGGIFY ) ROCKSDB_READ_RANGE_ROW_LIMIT = deterministicRandom()->randomInt(2, 10); + init( ROCKSDB_BACKGROUND_PARALLELISM, 4 ); init( ROCKSDB_READ_PARALLELISM, 4 ); // Use a smaller memtable in simulation to avoid OOMs. @@ -381,9 +385,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( ROCKSDB_PREFIX_LEN, 0 ); init( ROCKSDB_BLOCK_CACHE_SIZE, 0 ); init( ROCKSDB_METRICS_DELAY, 60.0 ); - init( ROCKSDB_READ_VALUE_TIMEOUT, 5.0 ); - init( ROCKSDB_READ_VALUE_PREFIX_TIMEOUT, 5.0 ); - init( ROCKSDB_READ_RANGE_TIMEOUT, 5.0 ); + init( ROCKSDB_READ_VALUE_TIMEOUT, isSimulated ? 5.0 : 200.0 ); + init( ROCKSDB_READ_VALUE_PREFIX_TIMEOUT, isSimulated ? 5.0 : 200.0 ); + init( ROCKSDB_READ_RANGE_TIMEOUT, isSimulated ? 5.0 : 200.0 ); init( ROCKSDB_READ_QUEUE_WAIT, 1.0 ); init( ROCKSDB_READ_QUEUE_HARD_MAX, 1000 ); init( ROCKSDB_READ_QUEUE_SOFT_MAX, 500 ); @@ -400,6 +404,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( ROCKSDB_PERFCONTEXT_ENABLE, false ); if( randomize && BUGGIFY ) ROCKSDB_PERFCONTEXT_ENABLE = deterministicRandom()->coinflip() ? false : true; init( ROCKSDB_PERFCONTEXT_SAMPLE_RATE, 0.0001 ); + init( ROCKSDB_METRICS_SAMPLE_INTERVAL, 0.0); init( ROCKSDB_MAX_SUBCOMPACTIONS, 2 ); init( ROCKSDB_SOFT_PENDING_COMPACT_BYTES_LIMIT, 64000000000 ); // 64GB, Rocksdb option, Writes will slow down. init( ROCKSDB_HARD_PENDING_COMPACT_BYTES_LIMIT, 100000000000 ); // 100GB, Rocksdb option, Writes will stall. @@ -412,6 +417,12 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( ROCKSDB_COMPACTION_READAHEAD_SIZE, 32768 ); // 32 KB, performs bigger reads when doing compaction. 
init( ROCKSDB_BLOCK_SIZE, 32768 ); // 32 KB, size of the block in rocksdb cache. init( ENABLE_SHARDED_ROCKSDB, false ); + init( ROCKSDB_WRITE_BUFFER_SIZE, 1 << 30 ); // 1G + init( ROCKSDB_CF_WRITE_BUFFER_SIZE, 64 << 20 ); // 64M, RocksDB default. + init( ROCKSDB_MAX_TOTAL_WAL_SIZE, 0 ); // RocksDB default. + init( ROCKSDB_MAX_BACKGROUND_JOBS, 2 ); // RocksDB default. + init( ROCKSDB_DELETE_OBSOLETE_FILE_PERIOD, 21600 ); // 6h, RocksDB default. + init( ROCKSDB_PHYSICAL_SHARD_CLEAN_UP_DELAY, isSimulated ? 10.0 : 300.0 ); // Delays shard clean up, must be larger than ROCKSDB_READ_VALUE_TIMEOUT to prevent reading deleted shard. // Leader election bool longLeaderElection = randomize && BUGGIFY; @@ -475,7 +486,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( REPORT_TRANSACTION_COST_ESTIMATION_DELAY, 0.1 ); init( PROXY_REJECT_BATCH_QUEUED_TOO_LONG, true ); - bool buggfyUseResolverPrivateMutations = randomize && BUGGIFY && !ENABLE_VERSION_VECTOR_TLOG_UNICAST; + bool buggfyUseResolverPrivateMutations = randomize && BUGGIFY && !ENABLE_VERSION_VECTOR_TLOG_UNICAST; init( PROXY_USE_RESOLVER_PRIVATE_MUTATIONS, false ); if( buggfyUseResolverPrivateMutations ) PROXY_USE_RESOLVER_PRIVATE_MUTATIONS = deterministicRandom()->coinflip(); init( RESET_MASTER_BATCHES, 200 ); @@ -610,9 +621,13 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( SLOW_SMOOTHING_AMOUNT, 10.0 ); if( slowRatekeeper ) SLOW_SMOOTHING_AMOUNT = 50.0; init( METRIC_UPDATE_RATE, .1 ); if( slowRatekeeper ) METRIC_UPDATE_RATE = 0.5; init( DETAILED_METRIC_UPDATE_RATE, 5.0 ); - init (RATEKEEPER_DEFAULT_LIMIT, 1e6 ); if( randomize && BUGGIFY ) RATEKEEPER_DEFAULT_LIMIT = 0; + init( RATEKEEPER_DEFAULT_LIMIT, 1e6 ); if( randomize && BUGGIFY ) RATEKEEPER_DEFAULT_LIMIT = 0; init( RATEKEEPER_LIMIT_REASON_SAMPLE_RATE, 0.1 ); init( RATEKEEPER_PRINT_LIMIT_REASON, false ); if( randomize && BUGGIFY ) RATEKEEPER_PRINT_LIMIT_REASON = true; + init( 
RATEKEEPER_MIN_RATE, 0.0 ); + init( RATEKEEPER_MAX_RATE, 1e9 ); + init( RATEKEEPER_BATCH_MIN_RATE, 0.0 ); + init( RATEKEEPER_BATCH_MAX_RATE, 1e9 ); bool smallStorageTarget = randomize && BUGGIFY; init( TARGET_BYTES_PER_STORAGE_SERVER, 1000e6 ); if( smallStorageTarget ) TARGET_BYTES_PER_STORAGE_SERVER = 3000e3; @@ -662,6 +677,19 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DURABILITY_LAG_REDUCTION_RATE, 0.9999 ); init( DURABILITY_LAG_INCREASE_RATE, 1.001 ); init( STORAGE_SERVER_LIST_FETCH_TIMEOUT, 20.0 ); + init( BW_THROTTLING_ENABLED, true ); + + bool buggifySmallBWLag = randomize && BUGGIFY; + init( TARGET_BW_LAG, 50.0 ); if(buggifySmallBWLag) TARGET_BW_LAG = 10.0; + init( TARGET_BW_LAG_BATCH, 20.0 ); if(buggifySmallBWLag) TARGET_BW_LAG_BATCH = 4.0; + init( TARGET_BW_LAG_UPDATE, 9.0 ); if(buggifySmallBWLag) TARGET_BW_LAG_UPDATE = 1.0; + init( MIN_BW_HISTORY, 10 ); + init( BW_ESTIMATION_INTERVAL, 10.0 ); if(buggifySmallBWLag) BW_ESTIMATION_INTERVAL = 2.0; + init( BW_LAG_INCREASE_AMOUNT, 1.1 ); + init( BW_LAG_DECREASE_AMOUNT, 0.9 ); + init( BW_FETCH_WORKERS_INTERVAL, 5.0 ); + init( BW_RW_LOGGING_INTERVAL, 5.0 ); + init( BW_MAX_BLOCKED_INTERVAL, 10.0 ); if(buggifySmallBWLag) BW_MAX_BLOCKED_INTERVAL = 2.0; init( MAX_AUTO_THROTTLED_TRANSACTION_TAGS, 5 ); if(randomize && BUGGIFY) MAX_AUTO_THROTTLED_TRANSACTION_TAGS = 1; init( MAX_MANUAL_THROTTLED_TRANSACTION_TAGS, 40 ); if(randomize && BUGGIFY) MAX_MANUAL_THROTTLED_TRANSACTION_TAGS = 1; @@ -676,6 +704,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( AUTO_TAG_THROTTLING_ENABLED, true ); if(randomize && BUGGIFY) AUTO_TAG_THROTTLING_ENABLED = false; init( SS_THROTTLE_TAGS_TRACKED, 1 ); if(randomize && BUGGIFY) SS_THROTTLE_TAGS_TRACKED = deterministicRandom()->randomInt(1, 10); init( GLOBAL_TAG_THROTTLING, false ); + init( ENFORCE_TAG_THROTTLING_ON_PROXIES, false ); init( GLOBAL_TAG_THROTTLING_MIN_RATE, 1.0 ); init( 
GLOBAL_TAG_THROTTLING_FOLDING_TIME, 10.0 ); @@ -698,12 +727,12 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( STORAGE_LIMIT_BYTES, 500000 ); init( BUGGIFY_LIMIT_BYTES, 1000 ); init( FETCH_USING_STREAMING, false ); if( randomize && isSimulated && BUGGIFY ) FETCH_USING_STREAMING = true; //Determines if fetch keys uses streaming reads + init( FETCH_USING_BLOB, false ); init( FETCH_BLOCK_BYTES, 2e6 ); init( FETCH_KEYS_PARALLELISM_BYTES, 4e6 ); if( randomize && BUGGIFY ) FETCH_KEYS_PARALLELISM_BYTES = 3e6; init( FETCH_KEYS_PARALLELISM, 2 ); - init( FETCH_KEYS_PARALLELISM_FULL, 10 ); + init( FETCH_KEYS_PARALLELISM_FULL, 6 ); init( FETCH_KEYS_LOWER_PRIORITY, 0 ); - init( FETCH_CHANGEFEED_PARALLELISM, 4 ); init( SERVE_FETCH_CHECKPOINT_PARALLELISM, 4 ); init( BUGGIFY_BLOCK_BYTES, 10000 ); init( STORAGE_RECOVERY_VERSION_LAG_LIMIT, 2 * MAX_READ_TRANSACTION_LIFE_VERSIONS ); @@ -712,7 +741,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( STORAGE_DURABILITY_LAG_REJECT_THRESHOLD, 0.25 ); init( STORAGE_DURABILITY_LAG_MIN_RATE, 0.1 ); init( STORAGE_COMMIT_INTERVAL, 0.5 ); if( randomize && BUGGIFY ) STORAGE_COMMIT_INTERVAL = 2.0; - init( UPDATE_SHARD_VERSION_INTERVAL, 0.25 ); if( randomize && BUGGIFY ) UPDATE_SHARD_VERSION_INTERVAL = 1.0; init( BYTE_SAMPLING_FACTOR, 250 ); //cannot buggify because of differences in restarting tests init( BYTE_SAMPLING_OVERHEAD, 100 ); init( MAX_STORAGE_SERVER_WATCH_BYTES, 100e6 ); if( randomize && BUGGIFY ) MAX_STORAGE_SERVER_WATCH_BYTES = 10e3; @@ -721,7 +749,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( BYTE_SAMPLE_LOAD_PARALLELISM, 8 ); if( randomize && BUGGIFY ) BYTE_SAMPLE_LOAD_PARALLELISM = 1; init( BYTE_SAMPLE_LOAD_DELAY, 0.0 ); if( randomize && BUGGIFY ) BYTE_SAMPLE_LOAD_DELAY = 0.1; init( BYTE_SAMPLE_START_DELAY, 1.0 ); if( randomize && BUGGIFY ) BYTE_SAMPLE_START_DELAY = 0.0; - init( 
UPDATE_STORAGE_PROCESS_STATS_INTERVAL, 5.0 ); init( BEHIND_CHECK_DELAY, 2.0 ); init( BEHIND_CHECK_COUNT, 2 ); init( BEHIND_CHECK_VERSIONS, 5 * VERSIONS_PER_SECOND ); @@ -788,7 +815,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi // Dynamic Knobs (implementation) init( COMPACTION_INTERVAL, isSimulated ? 5.0 : 300.0 ); - init( UPDATE_NODE_TIMEOUT, 3.0 ); init( GET_COMMITTED_VERSION_TIMEOUT, 3.0 ); init( GET_SNAPSHOT_AND_CHANGES_TIMEOUT, 3.0 ); init( FETCH_CHANGES_TIMEOUT, 3.0 ); @@ -804,14 +830,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DISABLE_DUPLICATE_LOG_WARNING, false ); init( HISTOGRAM_REPORT_INTERVAL, 300.0 ); - // IPager - init( PAGER_RESERVED_PAGES, 1 ); - - // IndirectShadowPager - init( FREE_PAGE_VACUUM_THRESHOLD, 1 ); - init( VACUUM_QUEUE_SIZE, 100000 ); - init( VACUUM_BYTES_PER_SECOND, 1e6 ); - // Timekeeper init( TIME_KEEPER_DELAY, 10 ); init( TIME_KEEPER_MAX_ENTRIES, 3600 * 24 * 30 * 6 ); if( randomize && BUGGIFY ) { TIME_KEEPER_MAX_ENTRIES = 2; } @@ -830,11 +848,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( FASTRESTORE_ROLE_LOGGING_DELAY, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_ROLE_LOGGING_DELAY = deterministicRandom()->random01() * 60 + 1; } init( FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL = deterministicRandom()->random01() * 60 + 1; } init( FASTRESTORE_ATOMICOP_WEIGHT, 1 ); if( randomize && BUGGIFY ) { FASTRESTORE_ATOMICOP_WEIGHT = deterministicRandom()->random01() * 200 + 1; } - init( FASTRESTORE_APPLYING_PARALLELISM, 10000 ); if( randomize && BUGGIFY ) { FASTRESTORE_APPLYING_PARALLELISM = deterministicRandom()->random01() * 10 + 1; } init( FASTRESTORE_MONITOR_LEADER_DELAY, 5 ); if( randomize && BUGGIFY ) { FASTRESTORE_MONITOR_LEADER_DELAY = deterministicRandom()->random01() * 100; } init( FASTRESTORE_STRAGGLER_THRESHOLD_SECONDS, 60 ); 
if( randomize && BUGGIFY ) { FASTRESTORE_STRAGGLER_THRESHOLD_SECONDS = deterministicRandom()->random01() * 240 + 10; } init( FASTRESTORE_TRACK_REQUEST_LATENCY, false ); if( randomize && BUGGIFY ) { FASTRESTORE_TRACK_REQUEST_LATENCY = false; } - init( FASTRESTORE_TRACK_LOADER_SEND_REQUESTS, false ); if( randomize && BUGGIFY ) { FASTRESTORE_TRACK_LOADER_SEND_REQUESTS = true; } init( FASTRESTORE_MEMORY_THRESHOLD_MB_SOFT, 6144 ); if( randomize && BUGGIFY ) { FASTRESTORE_MEMORY_THRESHOLD_MB_SOFT = 1; } init( FASTRESTORE_WAIT_FOR_MEMORY_LATENCY, 10 ); if( randomize && BUGGIFY ) { FASTRESTORE_WAIT_FOR_MEMORY_LATENCY = 60; } init( FASTRESTORE_HEARTBEAT_DELAY, 10 ); if( randomize && BUGGIFY ) { FASTRESTORE_HEARTBEAT_DELAY = deterministicRandom()->random01() * 120 + 2; } @@ -893,27 +909,25 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init ( CLUSTER_RECOVERY_EVENT_NAME_PREFIX, "Master" ); // Encryption - init( ENABLE_ENCRYPTION, false ); if ( randomize && BUGGIFY ) { ENABLE_ENCRYPTION = deterministicRandom()->coinflip(); } + init( ENABLE_ENCRYPTION, false ); if ( randomize && BUGGIFY ) ENABLE_ENCRYPTION = !ENABLE_ENCRYPTION; init( ENCRYPTION_MODE, "AES-256-CTR" ); init( SIM_KMS_MAX_KEYS, 4096 ); init( ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH, 100000 ); - init( ENABLE_TLOG_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY ) { ENABLE_TLOG_ENCRYPTION = (ENABLE_ENCRYPTION && !PROXY_USE_RESOLVER_PRIVATE_MUTATIONS && deterministicRandom()->coinflip()); } - init( ENABLE_BLOB_GRANULE_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY ) { ENABLE_BLOB_GRANULE_ENCRYPTION = (ENABLE_ENCRYPTION && deterministicRandom()->coinflip()); } + init( ENABLE_TLOG_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY && ENABLE_ENCRYPTION && !PROXY_USE_RESOLVER_PRIVATE_MUTATIONS ) ENABLE_TLOG_ENCRYPTION = true; + init( ENABLE_STORAGE_SERVER_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY) ENABLE_STORAGE_SERVER_ENCRYPTION = 
!ENABLE_STORAGE_SERVER_ENCRYPTION; + init( ENABLE_BLOB_GRANULE_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY) ENABLE_BLOB_GRANULE_ENCRYPTION = !ENABLE_BLOB_GRANULE_ENCRYPTION; // encrypt key proxy init( ENABLE_BLOB_GRANULE_COMPRESSION, false ); if ( randomize && BUGGIFY ) { ENABLE_BLOB_GRANULE_COMPRESSION = deterministicRandom()->coinflip(); } init( BLOB_GRANULE_COMPRESSION_FILTER, "GZIP" ); if ( randomize && BUGGIFY ) { BLOB_GRANULE_COMPRESSION_FILTER = "NONE"; } - - // KMS connector type + // KMS connector type init( KMS_CONNECTOR_TYPE, "RESTKmsConnector" ); // Blob granlues - init( BG_URL, isSimulated ? "file://fdbblob/" : "" ); // TODO: store in system key space or something, eventually - // BlobGranuleVerify* simulation tests use "blobRangeKeys", BlobGranuleCorrectness* use "tenant", default in real clusters is "tenant" - init( BG_RANGE_SOURCE, "tenant" ); - // BlobGranuleVerify* simulation tests use "knobs", BlobGranuleCorrectness* use "tenant", default in real clusters is "knobs" + init( BG_URL, isSimulated ? 
"file://fdbblob/" : "" ); // TODO: store in system key space or something, eventually bool buggifyMediumGranules = simulationMediumShards || (randomize && BUGGIFY); + // BlobGranuleVerify* simulation tests use "knobs", BlobGranuleCorrectness* use "tenant", default in real clusters is "knobs" init( BG_METADATA_SOURCE, "knobs" ); init( BG_SNAPSHOT_FILE_TARGET_BYTES, 10000000 ); if( buggifySmallShards ) BG_SNAPSHOT_FILE_TARGET_BYTES = 100000; else if (buggifyMediumGranules) BG_SNAPSHOT_FILE_TARGET_BYTES = 1000000; init( BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES, 64*1024 ); if ( randomize && BUGGIFY ) BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES = BG_SNAPSHOT_FILE_TARGET_BYTES / (1 << deterministicRandom()->randomInt(0, 8)); @@ -933,11 +947,14 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( BG_MERGE_CANDIDATE_DELAY_SECONDS, BG_MERGE_CANDIDATE_THRESHOLD_SECONDS / 10.0 ); init( BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM, 8 ); if( randomize && BUGGIFY ) BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM = 1; + init( BLOB_WORKER_RESNAPSHOT_PARALLELISM, 40 ); if( randomize && BUGGIFY ) BLOB_WORKER_RESNAPSHOT_PARALLELISM = deterministicRandom()->randomInt(1, 10); + init( BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM, 2000 ); if( randomize && BUGGIFY ) BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM = deterministicRandom()->randomInt(10, 100); init( BLOB_WORKER_TIMEOUT, 10.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_TIMEOUT = 1.0; init( BLOB_WORKER_REQUEST_TIMEOUT, 5.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_REQUEST_TIMEOUT = 1.0; init( BLOB_WORKERLIST_FETCH_INTERVAL, 1.0 ); init( BLOB_WORKER_BATCH_GRV_INTERVAL, 0.1 ); - + init( BLOB_WORKER_DO_REJECT_WHEN_FULL, true ); if ( randomize && BUGGIFY ) BLOB_WORKER_DO_REJECT_WHEN_FULL = false; + init( BLOB_WORKER_REJECT_WHEN_FULL_THRESHOLD, 0.9 ); init( BLOB_MANAGER_STATUS_EXP_BACKOFF_MIN, 0.1 ); init( BLOB_MANAGER_STATUS_EXP_BACKOFF_MAX, 5.0 ); diff --git a/fdbclient/SpecialKeySpace.actor.cpp 
b/fdbclient/SpecialKeySpace.actor.cpp index 7774b99ba7..719aff9fe8 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -133,7 +133,8 @@ std::unordered_map SpecialKeySpace::actorLineageApiComman std::set SpecialKeySpace::options = { "excluded/force", "failed/force", "excluded_locality/force", - "failed_locality/force" }; + "failed_locality/force", + "worker_interfaces/verify" }; std::set SpecialKeySpace::tracingOptions = { kTracingTransactionIdKey, kTracingTokenKey }; @@ -1603,7 +1604,8 @@ Future TracingOptionsImpl::getRange(ReadYourWritesTransaction* ryw, void TracingOptionsImpl::set(ReadYourWritesTransaction* ryw, const KeyRef& key, const ValueRef& value) { if (ryw->getApproximateSize() > 0) { - ryw->setSpecialKeySpaceErrorMsg("tracing options must be set first"); + ryw->setSpecialKeySpaceErrorMsg( + ManagementAPIError::toJsonString(false, "configure trace", "tracing options must be set first")); ryw->getSpecialKeySpaceWriteMap().insert(key, std::make_pair(true, Optional())); return; } @@ -1616,7 +1618,8 @@ void TracingOptionsImpl::set(ReadYourWritesTransaction* ryw, const KeyRef& key, } else if (value.toString() == "false") { ryw->setToken(0); } else { - ryw->setSpecialKeySpaceErrorMsg("token must be set to true/false"); + ryw->setSpecialKeySpaceErrorMsg( + ManagementAPIError::toJsonString(false, "configure trace token", "token must be set to true/false")); throw special_keys_api_failure(); } } @@ -1630,12 +1633,12 @@ Future> TracingOptionsImpl::commit(ReadYourWritesTransacti } void TracingOptionsImpl::clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& range) { - ryw->setSpecialKeySpaceErrorMsg("clear range disabled"); + ryw->setSpecialKeySpaceErrorMsg(ManagementAPIError::toJsonString(false, "clear trace", "clear range disabled")); throw special_keys_api_failure(); } void TracingOptionsImpl::clear(ReadYourWritesTransaction* ryw, const KeyRef& key) { - ryw->setSpecialKeySpaceErrorMsg("clear disabled"); + 
ryw->setSpecialKeySpaceErrorMsg(ManagementAPIError::toJsonString(false, "clear trace", "clear disabled")); throw special_keys_api_failure(); } @@ -2180,7 +2183,8 @@ ACTOR static Future actorLineageGetRangeActor(ReadYourWritesTransac state std::vector endValues = kr.end.removePrefix(prefix).splitAny("/"_sr); // Require index (either "state" or "time") and address:port. if (beginValues.size() < 2 || endValues.size() < 2) { - ryw->setSpecialKeySpaceErrorMsg("missing required parameters (index, host)"); + ryw->setSpecialKeySpaceErrorMsg( + ManagementAPIError::toJsonString(false, "read actor_lineage", "missing required parameters (index, host)")); throw special_keys_api_failure(); } @@ -2199,12 +2203,14 @@ ACTOR static Future actorLineageGetRangeActor(ReadYourWritesTransac parse(endValues.begin() + 1, endValues.end(), endRangeHost, timeEnd, waitStateEnd, seqEnd); } } else { - ryw->setSpecialKeySpaceErrorMsg("invalid index in actor_lineage"); + ryw->setSpecialKeySpaceErrorMsg( + ManagementAPIError::toJsonString(false, "read actor_lineage", "invalid index in actor_lineage")); throw special_keys_api_failure(); } } catch (Error& e) { if (e.code() != special_keys_api_failure().code()) { - ryw->setSpecialKeySpaceErrorMsg("failed to parse key"); + ryw->setSpecialKeySpaceErrorMsg( + ManagementAPIError::toJsonString(false, "read actor_lineage", "failed to parse key")); throw special_keys_api_failure(); } else { throw e; @@ -2214,7 +2220,8 @@ ACTOR static Future actorLineageGetRangeActor(ReadYourWritesTransac if (kr.begin != kr.end && host != endRangeHost) { // The client doesn't know about all the hosts, so a get range covering // multiple hosts has no way of knowing which IP:port combos to use. 
- ryw->setSpecialKeySpaceErrorMsg("the host must remain the same on both ends of the range"); + ryw->setSpecialKeySpaceErrorMsg(ManagementAPIError::toJsonString( + false, "read actor_lineage", "the host must remain the same on both ends of the range")); throw special_keys_api_failure(); } @@ -2748,6 +2755,64 @@ Future> FailedLocalitiesRangeImpl::commit(ReadYourWritesTr return excludeLocalityCommitActor(ryw, true); } +// Defined in ReadYourWrites.actor.cpp +ACTOR Future getWorkerInterfaces(Reference clusterRecord); +// Defined in NativeAPI.actor.cpp +ACTOR Future verifyInterfaceActor(Reference connectLock, ClientWorkerInterface workerInterf); + +ACTOR static Future workerInterfacesImplGetRangeActor(ReadYourWritesTransaction* ryw, + KeyRef prefix, + KeyRangeRef kr) { + if (!ryw->getDatabase().getPtr() || !ryw->getDatabase()->getConnectionRecord()) + return RangeResult(); + + state RangeResult interfs = wait(getWorkerInterfaces(ryw->getDatabase()->getConnectionRecord())); + // for options' special keys, the boolean flag indicates if it's a SET operation + auto [verify, _] = ryw->getSpecialKeySpaceWriteMap()[SpecialKeySpace::getManagementApiCommandOptionSpecialKey( + "worker_interfaces", "verify")]; + state RangeResult result; + if (verify) { + // if verify option is set, we try to talk to every worker and only returns those we can talk to + Reference connectLock(new FlowLock(CLIENT_KNOBS->CLI_CONNECT_PARALLELISM)); + state std::vector> verifyInterfs; + for (const auto& [k_, value] : interfs) { + auto k = k_.withPrefix(prefix); + if (kr.contains(k)) { + ClientWorkerInterface workerInterf = + BinaryReader::fromStringRef(value, IncludeVersion()); + verifyInterfs.push_back(verifyInterfaceActor(connectLock, workerInterf)); + } else { + verifyInterfs.push_back(false); + } + } + wait(waitForAll(verifyInterfs)); + // state int index; + for (int index = 0; index < interfs.size(); index++) { + if (verifyInterfs[index].get()) { + // if we can establish a connection, add the kv 
pair into the result + result.push_back_deep(result.arena(), + KeyValueRef(interfs[index].key.withPrefix(prefix), interfs[index].value)); + } + } + } else { + for (const auto& [k_, v] : interfs) { + auto k = k_.withPrefix(prefix); + if (kr.contains(k)) + result.push_back_deep(result.arena(), KeyValueRef(k, v)); + } + } + std::sort(result.begin(), result.end(), KeyValueRef::OrderByKey{}); + return result; +} + +WorkerInterfacesSpecialKeyImpl::WorkerInterfacesSpecialKeyImpl(KeyRangeRef kr) : SpecialKeyRangeReadImpl(kr) {} + +Future WorkerInterfacesSpecialKeyImpl::getRange(ReadYourWritesTransaction* ryw, + KeyRangeRef kr, + GetRangeLimits limitsHint) const { + return workerInterfacesImplGetRangeActor(ryw, getKeyRange().begin, kr); +} + ACTOR Future validateSpecialSubrangeRead(ReadYourWritesTransaction* ryw, KeySelector begin, KeySelector end, diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 3f110d0b80..d56c117a65 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -356,7 +356,7 @@ const Key storageCacheServerKey(UID id) { } const Value storageCacheServerValue(const StorageServerInterface& ssi) { - auto protocolVersion = currentProtocolVersion; + auto protocolVersion = currentProtocolVersion(); protocolVersion.addObjectSerializerFlag(); return ObjectWriter::toValue(ssi, IncludeVersion(protocolVersion)); } @@ -666,7 +666,7 @@ const KeyRangeRef tagLocalityListKeys(LiteralStringRef("\xff/tagLocalityList/"), const KeyRef tagLocalityListPrefix = tagLocalityListKeys.begin; const Key tagLocalityListKeyFor(Optional dcID) { - BinaryWriter wr(AssumeVersion(currentProtocolVersion)); + BinaryWriter wr(AssumeVersion(currentProtocolVersion())); wr.serializeBytes(tagLocalityListKeys.begin); wr << dcID; return wr.toValue(); @@ -679,7 +679,7 @@ const Value tagLocalityListValue(int8_t const& tagLocality) { } Optional decodeTagLocalityListKey(KeyRef const& key) { Optional dcID; - BinaryReader rd(key.removePrefix(tagLocalityListKeys.begin), 
AssumeVersion(currentProtocolVersion)); + BinaryReader rd(key.removePrefix(tagLocalityListKeys.begin), AssumeVersion(currentProtocolVersion())); rd >> dcID; return dcID; } @@ -695,7 +695,7 @@ const KeyRangeRef datacenterReplicasKeys(LiteralStringRef("\xff\x02/datacenterRe const KeyRef datacenterReplicasPrefix = datacenterReplicasKeys.begin; const Key datacenterReplicasKeyFor(Optional dcID) { - BinaryWriter wr(AssumeVersion(currentProtocolVersion)); + BinaryWriter wr(AssumeVersion(currentProtocolVersion())); wr.serializeBytes(datacenterReplicasKeys.begin); wr << dcID; return wr.toValue(); @@ -708,7 +708,7 @@ const Value datacenterReplicasValue(int const& replicas) { } Optional decodeDatacenterReplicasKey(KeyRef const& key) { Optional dcID; - BinaryReader rd(key.removePrefix(datacenterReplicasKeys.begin), AssumeVersion(currentProtocolVersion)); + BinaryReader rd(key.removePrefix(datacenterReplicasKeys.begin), AssumeVersion(currentProtocolVersion())); rd >> dcID; return dcID; } @@ -729,14 +729,14 @@ const KeyRangeRef tLogDatacentersKeys(LiteralStringRef("\xff\x02/tLogDatacenters const KeyRef tLogDatacentersPrefix = tLogDatacentersKeys.begin; const Key tLogDatacentersKeyFor(Optional dcID) { - BinaryWriter wr(AssumeVersion(currentProtocolVersion)); + BinaryWriter wr(AssumeVersion(currentProtocolVersion())); wr.serializeBytes(tLogDatacentersKeys.begin); wr << dcID; return wr.toValue(); } Optional decodeTLogDatacentersKey(KeyRef const& key) { Optional dcID; - BinaryReader rd(key.removePrefix(tLogDatacentersKeys.begin), AssumeVersion(currentProtocolVersion)); + BinaryReader rd(key.removePrefix(tLogDatacentersKeys.begin), AssumeVersion(currentProtocolVersion())); rd >> dcID; return dcID; } @@ -755,7 +755,7 @@ const Key serverListKeyFor(UID serverID) { } const Value serverListValue(StorageServerInterface const& server) { - auto protocolVersion = currentProtocolVersion; + auto protocolVersion = currentProtocolVersion(); protocolVersion.addObjectSerializerFlag(); return 
ObjectWriter::toValue(server, IncludeVersion(protocolVersion)); } @@ -787,7 +787,7 @@ StorageServerInterface decodeServerListValue(ValueRef const& value) { } Value swVersionValue(SWVersion const& swversion) { - auto protocolVersion = currentProtocolVersion; + auto protocolVersion = currentProtocolVersion(); protocolVersion.addObjectSerializerFlag(); return ObjectWriter::toValue(swversion, IncludeVersion(protocolVersion)); } @@ -1331,6 +1331,9 @@ int64_t decodeBlobManagerEpochValue(ValueRef const& value) { } // blob granule data +const KeyRef blobRangeActive = LiteralStringRef("1"); +const KeyRef blobRangeInactive = StringRef(); + const KeyRangeRef blobGranuleFileKeys(LiteralStringRef("\xff\x02/bgf/"), LiteralStringRef("\xff\x02/bgf0")); const KeyRangeRef blobGranuleMappingKeys(LiteralStringRef("\xff\x02/bgm/"), LiteralStringRef("\xff\x02/bgm0")); const KeyRangeRef blobGranuleLockKeys(LiteralStringRef("\xff\x02/bgl/"), LiteralStringRef("\xff\x02/bgl0")); @@ -1340,7 +1343,8 @@ const KeyRangeRef blobGranuleMergeBoundaryKeys(LiteralStringRef("\xff\x02/bgmerg LiteralStringRef("\xff\x02/bgmergebounds0")); const KeyRangeRef blobGranuleHistoryKeys(LiteralStringRef("\xff\x02/bgh/"), LiteralStringRef("\xff\x02/bgh0")); const KeyRangeRef blobGranulePurgeKeys(LiteralStringRef("\xff\x02/bgp/"), LiteralStringRef("\xff\x02/bgp0")); -const KeyRangeRef blobGranuleVersionKeys(LiteralStringRef("\xff\x02/bgv/"), LiteralStringRef("\xff\x02/bgv0")); +const KeyRangeRef blobGranuleForcePurgedKeys(LiteralStringRef("\xff\x02/bgpforce/"), + LiteralStringRef("\xff\x02/bgpforce0")); const KeyRef blobGranulePurgeChangeKey = LiteralStringRef("\xff\x02/bgpChange"); const uint8_t BG_FILE_TYPE_DELTA = 'D'; diff --git a/fdbclient/Tenant.cpp b/fdbclient/Tenant.cpp index 9c8c63f18e..b1c5d7ce53 100644 --- a/fdbclient/Tenant.cpp +++ b/fdbclient/Tenant.cpp @@ -26,11 +26,11 @@ Key TenantMapEntry::idToPrefix(int64_t id) { int64_t swapped = bigEndian64(id); - return StringRef(reinterpret_cast(&swapped), 8); 
+ return StringRef(reinterpret_cast(&swapped), TENANT_PREFIX_SIZE); } int64_t TenantMapEntry::prefixToId(KeyRef prefix) { - ASSERT(prefix.size() == 8); + ASSERT(prefix.size() == TENANT_PREFIX_SIZE); int64_t id = *reinterpret_cast(prefix.begin()); id = bigEndian64(id); ASSERT(id >= 0); @@ -47,6 +47,10 @@ std::string TenantMapEntry::tenantStateToString(TenantState tenantState) { return "removing"; case TenantState::UPDATING_CONFIGURATION: return "updating configuration"; + case TenantState::RENAMING_FROM: + return "renaming from"; + case TenantState::RENAMING_TO: + return "renaming to"; case TenantState::ERROR: return "error"; default: @@ -63,6 +67,10 @@ TenantState TenantMapEntry::stringToTenantState(std::string stateStr) { return TenantState::REMOVING; } else if (stateStr == "updating configuration") { return TenantState::UPDATING_CONFIGURATION; + } else if (stateStr == "renaming from") { + return TenantState::RENAMING_FROM; + } else if (stateStr == "renaming to") { + return TenantState::RENAMING_TO; } else if (stateStr == "error") { return TenantState::ERROR; } @@ -70,6 +78,31 @@ TenantState TenantMapEntry::stringToTenantState(std::string stateStr) { UNREACHABLE(); } +std::string TenantMapEntry::tenantLockStateToString(TenantLockState tenantState) { + switch (tenantState) { + case TenantLockState::UNLOCKED: + return "unlocked"; + case TenantLockState::READ_ONLY: + return "read only"; + case TenantLockState::LOCKED: + return "locked"; + default: + UNREACHABLE(); + } +} + +TenantLockState TenantMapEntry::stringToTenantLockState(std::string stateStr) { + if (stateStr == "unlocked") { + return TenantLockState::UNLOCKED; + } else if (stateStr == "read only") { + return TenantLockState::READ_ONLY; + } else if (stateStr == "locked") { + return TenantLockState::LOCKED; + } + + UNREACHABLE(); +} + TenantMapEntry::TenantMapEntry() {} TenantMapEntry::TenantMapEntry(int64_t id, TenantState tenantState, bool encrypted) : tenantState(tenantState), encrypted(encrypted) { @@ 
-109,7 +142,9 @@ std::string TenantMapEntry::toJson(int apiVersion) const { } tenantEntry["tenant_state"] = TenantMapEntry::tenantStateToString(tenantState); - + if (assignedCluster.present()) { + tenantEntry["assigned_cluster"] = assignedCluster.get().toString(); + } if (tenantGroup.present()) { json_spirit::mObject tenantGroupObject; std::string encodedTenantGroup = base64::encoder::from_string(tenantGroup.get().toString()); @@ -125,7 +160,7 @@ std::string TenantMapEntry::toJson(int apiVersion) const { } bool TenantMapEntry::matchesConfiguration(TenantMapEntry const& other) const { - return tenantGroup == other.tenantGroup; + return tenantGroup == other.tenantGroup && encrypted == other.encrypted; } void TenantMapEntry::configure(Standalone parameter, Optional value) { @@ -137,6 +172,16 @@ void TenantMapEntry::configure(Standalone parameter, Optional } } +TenantMetadataSpecification& TenantMetadata::instance() { + static TenantMetadataSpecification _instance = TenantMetadataSpecification("\xff/"_sr); + return _instance; +} + +Key TenantMetadata::tenantMapPrivatePrefix() { + static Key _prefix = "\xff"_sr.withSuffix(tenantMap().subspace.begin); + return _prefix; +} + TEST_CASE("/fdbclient/TenantMapEntry/Serialization") { TenantMapEntry entry1(1, TenantState::READY, false); ASSERT(entry1.prefix == "\x00\x00\x00\x00\x00\x00\x00\x01"_sr); diff --git a/fdbclient/TenantManagement.actor.cpp b/fdbclient/TenantManagement.actor.cpp new file mode 100644 index 0000000000..608da5c690 --- /dev/null +++ b/fdbclient/TenantManagement.actor.cpp @@ -0,0 +1,40 @@ +/* + * TenantManagement.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "fdbclient/SystemData.h" +#include "fdbclient/TenantManagement.actor.h" +#include "fdbclient/Tuple.h" +#include "flow/actorcompiler.h" // has to be last include + +namespace TenantAPI { + +TenantMode tenantModeForClusterType(ClusterType clusterType, TenantMode tenantMode) { + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + return TenantMode::DISABLED; + } else if (clusterType == ClusterType::METACLUSTER_DATA) { + return TenantMode::REQUIRED; + } else { + return tenantMode; + } +} + +} // namespace TenantAPI diff --git a/fdbclient/ThreadSafeTransaction.cpp b/fdbclient/ThreadSafeTransaction.cpp index 1dc5357572..0edd398c53 100644 --- a/fdbclient/ThreadSafeTransaction.cpp +++ b/fdbclient/ThreadSafeTransaction.cpp @@ -21,6 +21,7 @@ #include "fdbclient/BlobGranuleFiles.h" #include "fdbclient/ClusterConnectionFile.h" #include "fdbclient/ClusterConnectionMemoryRecord.h" +#include "fdbclient/CoordinationInterface.h" #include "fdbclient/ThreadSafeTransaction.h" #include "fdbclient/DatabaseContext.h" #include "fdbclient/versions.h" @@ -143,13 +144,47 @@ ThreadFuture ThreadSafeDatabase::waitPurgeGranulesComplete(const KeyRef& p return onMainThread([db, key]() -> Future { return db->waitPurgeGranulesComplete(key); }); } -ThreadSafeDatabase::ThreadSafeDatabase(Reference connectionRecord, int apiVersion) { +ThreadFuture ThreadSafeDatabase::blobbifyRange(const KeyRangeRef& keyRange) { + DatabaseContext* db = this->db; + KeyRange range = keyRange; + return onMainThread([=]() -> Future { return 
db->blobbifyRange(range); }); +} + +ThreadFuture ThreadSafeDatabase::unblobbifyRange(const KeyRangeRef& keyRange) { + DatabaseContext* db = this->db; + KeyRange range = keyRange; + return onMainThread([=]() -> Future { return db->unblobbifyRange(range); }); +} + +ThreadFuture>> ThreadSafeDatabase::listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) { + DatabaseContext* db = this->db; + KeyRange range = keyRange; + return onMainThread( + [=]() -> Future>> { return db->listBlobbifiedRanges(range, rangeLimit); }); +} + +ThreadFuture ThreadSafeDatabase::verifyBlobRange(const KeyRangeRef& keyRange, Optional version) { + DatabaseContext* db = this->db; + KeyRange range = keyRange; + return onMainThread([=]() -> Future { return db->verifyBlobRange(range, version); }); +} + +ThreadSafeDatabase::ThreadSafeDatabase(ConnectionRecordType connectionRecordType, + std::string connectionRecordString, + int apiVersion) { // Allocate memory for the Database from this thread (so the pointer is known for subsequent method calls) // but run its constructor on the main thread DatabaseContext* db = this->db = DatabaseContext::allocateOnForeignThread(); - onMainThreadVoid([db, connectionRecord, apiVersion]() { + onMainThreadVoid([db, connectionRecordType, connectionRecordString, apiVersion]() { try { + Reference connectionRecord = + connectionRecordType == ConnectionRecordType::FILE + ?
Reference(ClusterConnectionFile::openOrDefault(connectionRecordString)) + : Reference( + new ClusterConnectionMemoryRecord(ClusterConnectionString(connectionRecordString))); + Database::createDatabase(connectionRecord, apiVersion, IsInternal::False, LocalityData(), db).extractPtr(); } catch (Error& e) { new (db) DatabaseContext(e); @@ -350,13 +385,14 @@ ThreadFuture>> ThreadSafeTransaction::getAddre } ThreadFuture>> ThreadSafeTransaction::getBlobGranuleRanges( - const KeyRangeRef& keyRange) { + const KeyRangeRef& keyRange, + int rangeLimit) { ISingleThreadTransaction* tr = this->tr; KeyRange r = keyRange; - return onMainThread([tr, r]() -> Future>> { + return onMainThread([=]() -> Future>> { tr->checkDeferredError(); - return tr->getBlobGranuleRanges(r); + return tr->getBlobGranuleRanges(r, rangeLimit); }); } @@ -364,34 +400,33 @@ ThreadResult ThreadSafeTransaction::readBlobGranules(const KeyRange Version beginVersion, Optional readVersion, ReadBlobGranuleContext granule_context) { - // FIXME: prevent from calling this from another main thread! + // This should not be called directly, bypassMultiversionApi should not be set + return ThreadResult(unsupported_operation()); +} +ThreadFuture>> ThreadSafeTransaction::readBlobGranulesStart( + const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) { ISingleThreadTransaction* tr = this->tr; KeyRange r = keyRange; - int64_t readVersionOut; - ThreadFuture>> getFilesFuture = onMainThread( - [tr, r, beginVersion, readVersion, &readVersionOut]() -> Future>> { + return onMainThread( + [tr, r, beginVersion, readVersion, readVersionOut]() -> Future>> { tr->checkDeferredError(); - return tr->readBlobGranules(r, beginVersion, readVersion, &readVersionOut); + return tr->readBlobGranules(r, beginVersion, readVersion, readVersionOut); }); - - // FIXME: can this safely avoid another main thread jump? 
- getFilesFuture.blockUntilReadyCheckOnMainThread(); - - // propagate error to client - if (getFilesFuture.isError()) { - return ThreadResult(getFilesFuture.getError()); - } - - Standalone> files = getFilesFuture.get(); - +} +ThreadResult ThreadSafeTransaction::readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) { // do this work off of fdb network threads for performance! - if (granule_context.debugNoMaterialize) { - return ThreadResult(blob_granule_not_materialized()); - } else { - return loadAndMaterializeBlobGranules(files, keyRange, beginVersion, readVersionOut, granule_context); - } + Standalone> files = startFuture.get(); + return loadAndMaterializeBlobGranules(files, keyRange, beginVersion, readVersion, granuleContext); } void ThreadSafeTransaction::addReadConflictRange(const KeyRangeRef& keys) { @@ -563,19 +598,25 @@ void ThreadSafeTransaction::reset() { extern const char* getSourceVersion(); -ThreadSafeApi::ThreadSafeApi() - : apiVersion(-1), clientVersion(format("%s,%s,%llx", FDB_VT_VERSION, getSourceVersion(), currentProtocolVersion)), - transportId(0) {} +ThreadSafeApi::ThreadSafeApi() : apiVersion(-1), transportId(0) {} void ThreadSafeApi::selectApiVersion(int apiVersion) { this->apiVersion = apiVersion; } const char* ThreadSafeApi::getClientVersion() { - // There is only one copy of the ThreadSafeAPI, and it never gets deleted. Also, clientVersion is never modified. + // There is only one copy of the ThreadSafeAPI, and it never gets deleted. + // Also, clientVersion is initialized on demand and never modified afterwards. 
+ if (clientVersion.empty()) { + clientVersion = format("%s,%s,%llx", FDB_VT_VERSION, getSourceVersion(), currentProtocolVersion()); + } return clientVersion.c_str(); } +void ThreadSafeApi::useFutureProtocolVersion() { + ::useFutureProtocolVersion(); +} + void ThreadSafeApi::setNetworkOption(FDBNetworkOptions::Option option, Optional value) { if (option == FDBNetworkOptions::EXTERNAL_CLIENT_TRANSPORT_ID) { if (value.present()) { @@ -632,12 +673,12 @@ void ThreadSafeApi::stopNetwork() { Reference ThreadSafeApi::createDatabase(const char* clusterFilePath) { return Reference( - new ThreadSafeDatabase(ClusterConnectionFile::openOrDefault(clusterFilePath), apiVersion)); + new ThreadSafeDatabase(ThreadSafeDatabase::ConnectionRecordType::FILE, clusterFilePath, apiVersion)); } Reference ThreadSafeApi::createDatabaseFromConnectionString(const char* connectionString) { return Reference(new ThreadSafeDatabase( - makeReference(ClusterConnectionString(connectionString)), apiVersion)); + ThreadSafeDatabase::ConnectionRecordType::CONNECTION_STRING, connectionString, apiVersion)); } void ThreadSafeApi::addNetworkThreadCompletionHook(void (*hook)(void*), void* hookParameter) { diff --git a/fdbclient/Tuple.cpp b/fdbclient/Tuple.cpp index d3c3416b88..1575a565ab 100644 --- a/fdbclient/Tuple.cpp +++ b/fdbclient/Tuple.cpp @@ -208,7 +208,7 @@ Tuple& Tuple::append(double value) { return *this; } -Tuple& Tuple::append(nullptr_t) { +Tuple& Tuple::append(std::nullptr_t) { offsets.push_back(data.size()); data.push_back(data.arena(), (uint8_t)'\x00'); return *this; diff --git a/fdbclient/azure_backup/BackupContainerAzureBlobStore.actor.cpp b/fdbclient/azure_backup/BackupContainerAzureBlobStore.actor.cpp index 59d31fc8f3..8bd8f94872 100644 --- a/fdbclient/azure_backup/BackupContainerAzureBlobStore.actor.cpp +++ b/fdbclient/azure_backup/BackupContainerAzureBlobStore.actor.cpp @@ -29,7 +29,7 @@ namespace { std::string const notFoundErrorCode = "404"; void printAzureError(std::string const& 
operationName, azure::storage_lite::storage_error const& err) { - printf("(%s) : Error from Azure SDK : %s (%s) : %s", + printf("(%s) : Error from Azure SDK : %s (%s) : %s\n", operationName.c_str(), err.code_name.c_str(), err.code.c_str(), @@ -109,9 +109,9 @@ public: class WriteFile final : public IAsyncFile, ReferenceCounted { AsyncTaskThread* asyncTaskThread; - std::shared_ptr client; std::string containerName; std::string blobName; + std::shared_ptr client; int64_t m_cursor{ 0 }; // Ideally this buffer should not be a string, but // the Azure SDK only supports/tests uploading to append @@ -318,7 +318,7 @@ BackupContainerAzureBlobStore::BackupContainerAzureBlobStore(const std::string& std::string accountKey = _accountKey; auto credential = std::make_shared(accountName, accountKey); auto storageAccount = std::make_shared( - accountName, credential, true, format("https://%s", endpoint.c_str())); + accountName, credential, true, fmt::format("https://{}", endpoint)); client = std::make_unique(storageAccount, 1); } @@ -342,6 +342,7 @@ Future BackupContainerAzureBlobStore::create() { Future encryptionSetupFuture = usesEncryption() ? encryptionSetupComplete() : Void(); return createContainerFuture && encryptionSetupFuture; } + Future BackupContainerAzureBlobStore::exists() { TraceEvent(SevDebug, "BCAzureBlobStoreCheckContainerExists").detail("ContainerName", containerName); return asyncTaskThread.execAsync([containerName = this->containerName, client = this->client] { diff --git a/fdbclient/azure_backup/README.md b/fdbclient/azure_backup/README.md new file mode 100644 index 0000000000..4a34683674 --- /dev/null +++ b/fdbclient/azure_backup/README.md @@ -0,0 +1,33 @@ +# Set up the Azure Backup Testing Environment + +Make sure we built FDB with `-DBUILD_AZURE_BACKUP=ON` + +# Test + +If you run _BackupToBlob_ and _RestoreFromBlob_ workloads with the paramter _backupURL_ starts with `azure://`, +the workload will backup to and restore from the azure blob storage. 
+For example, _BackupAzureBlobCorrectness.toml_ + +## Url format + +The code now supports the following style urls: + +- `azure://.blob.core.windows.net/` (The formal url format for the blob service provided by the azure storage account) +- `azure://://` (Directly providing the endpoint address for the blob service, usually for local testing) + +## Local test environment + +We need to use _Azurite_ to simulate an Azure blob service locally. +Please follow the [tutorial](https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azurite?tabs=docker-hub) to start your service locally. + +For example, +``` +docker run -p 10000:10000 -v `pwd`: -w mcr.microsoft.com/azure-storage/azurite azurite-blob --blobHost 0.0.0.0 --blobPort 10000 --oauth basic --cert ./<...>.pem --key ./<...>.key.pem --debug ./ +``` + +### Notice + +- To use _https_, we need to provide the certificates via `--cert` and `--key` + See the detailed [tutorial](https://github.com/Azure/Azurite/blob/main/README.md#https-setup) to set up HTTPS.
(We tested with the `mkcert` method) +- To use Azure SDKs, we need to pass `--oauth basic` option +- Please take a look at the [difference](https://github.com/Azure/Azurite/blob/main/README.md#differences-between-azurite-and-azure-storage) between Azurite and Azure Storage diff --git a/fdbclient/azurestorage.cmake b/fdbclient/azurestorage.cmake index 36f8e24f6e..b967824948 100644 --- a/fdbclient/azurestorage.cmake +++ b/fdbclient/azurestorage.cmake @@ -1,3 +1,5 @@ +cmake_minimum_required(VERSION 3.13) + project(azurestorage-download) include(ExternalProject) diff --git a/fdbclient/include/fdbclient/BackupContainerAzureBlobStore.h b/fdbclient/include/fdbclient/BackupContainerAzureBlobStore.h index 77285ced16..ed79a56078 100644 --- a/fdbclient/include/fdbclient/BackupContainerAzureBlobStore.h +++ b/fdbclient/include/fdbclient/BackupContainerAzureBlobStore.h @@ -25,8 +25,6 @@ #include "fdbclient/AsyncTaskThread.h" #include "fdbclient/BackupContainerFileSystem.h" -#include "storage_credential.h" -#include "storage_account.h" #include "blob/blob_client.h" class BackupContainerAzureBlobStore final : public BackupContainerFileSystem, diff --git a/fdbclient/include/fdbclient/BlobGranuleCommon.h b/fdbclient/include/fdbclient/BlobGranuleCommon.h index 5120a7d021..e0589877c1 100644 --- a/fdbclient/include/fdbclient/BlobGranuleCommon.h +++ b/fdbclient/include/fdbclient/BlobGranuleCommon.h @@ -35,7 +35,6 @@ #define BG_ENCRYPT_COMPRESS_DEBUG false // file format of actual blob files -// FIXME: use VecSerStrategy::String serialization for this struct GranuleSnapshot : VectorRef { constexpr static FileIdentifier file_identifier = 1300395; @@ -234,6 +233,22 @@ struct BlobGranuleChunkRef { } }; +struct BlobGranuleSummaryRef { + constexpr static FileIdentifier file_identifier = 9774587; + KeyRangeRef keyRange; + Version snapshotVersion; + int64_t snapshotSize; + Version deltaVersion; + int64_t deltaSize; + + template + void serialize(Ar& ar) { + serializer(ar, keyRange, 
snapshotVersion, snapshotSize, deltaVersion, deltaSize); + } +}; + +BlobGranuleSummaryRef summarizeGranuleChunk(Arena& ar, const BlobGranuleChunkRef& chunk); + enum BlobGranuleSplitState { Unknown = 0, Initialized = 1, Assigned = 2, Done = 3 }; // Boundary metadata for each range indexed by the beginning of the range. @@ -252,7 +267,6 @@ struct BlobGranuleMergeBoundary { struct BlobGranuleHistoryValue { constexpr static FileIdentifier file_identifier = 991434; UID granuleID; - // VectorRef> parentGranules; VectorRef parentBoundaries; VectorRef parentVersions; diff --git a/fdbclient/include/fdbclient/BlobGranuleReader.actor.h b/fdbclient/include/fdbclient/BlobGranuleReader.actor.h index a9008b03eb..395b76b26c 100644 --- a/fdbclient/include/fdbclient/BlobGranuleReader.actor.h +++ b/fdbclient/include/fdbclient/BlobGranuleReader.actor.h @@ -51,5 +51,7 @@ ACTOR Future readBlobGranules(BlobGranuleFileRequest request, Reference bstore, PromiseStream results); +bool isRangeFullyCovered(KeyRange range, Standalone> blobChunks); + #include "flow/unactorcompiler.h" #endif diff --git a/fdbclient/include/fdbclient/BlobWorkerCommon.h b/fdbclient/include/fdbclient/BlobWorkerCommon.h index 0535301427..45ee961320 100644 --- a/fdbclient/include/fdbclient/BlobWorkerCommon.h +++ b/fdbclient/include/fdbclient/BlobWorkerCommon.h @@ -30,7 +30,7 @@ struct BlobWorkerStats { Counter deltaBytesWritten, snapshotBytesWritten; Counter bytesReadFromFDBForInitialSnapshot; Counter bytesReadFromS3ForCompaction; - Counter rangeAssignmentRequests, readRequests; + Counter rangeAssignmentRequests, readRequests, summaryReads; Counter wrongShardServer; Counter changeFeedInputBytes; Counter readReqTotalFilesReturned; @@ -41,16 +41,32 @@ struct BlobWorkerStats { Counter readRequestsWithBegin; Counter readRequestsCollapsed; Counter flushGranuleReqs; + Counter compressionBytesRaw; + Counter compressionBytesFinal; + Counter fullRejections; int numRangesAssigned; int mutationBytesBuffered; int 
activeReadRequests; int granulesPendingSplitCheck; + Version minimumCFVersion; + Version cfVersionLag; + int notAtLatestChangeFeeds; + int64_t lastResidentMemory; + int64_t estimatedMaxResidentMemory; + + Reference initialSnapshotLock; + Reference resnapshotLock; + Reference deltaWritesLock; Future logger; // Current stats maintained for a given blob worker process - explicit BlobWorkerStats(UID id, double interval) + explicit BlobWorkerStats(UID id, + double interval, + Reference initialSnapshotLock, + Reference resnapshotLock, + Reference deltaWritesLock) : cc("BlobWorkerStats", id.toString()), s3PutReqs("S3PutReqs", cc), s3GetReqs("S3GetReqs", cc), s3DeleteReqs("S3DeleteReqs", cc), @@ -59,17 +75,31 @@ struct BlobWorkerStats { bytesReadFromFDBForInitialSnapshot("BytesReadFromFDBForInitialSnapshot", cc), bytesReadFromS3ForCompaction("BytesReadFromS3ForCompaction", cc), rangeAssignmentRequests("RangeAssignmentRequests", cc), readRequests("ReadRequests", cc), - wrongShardServer("WrongShardServer", cc), changeFeedInputBytes("ChangeFeedInputBytes", cc), - readReqTotalFilesReturned("ReadReqTotalFilesReturned", cc), + summaryReads("SummaryReads", cc), wrongShardServer("WrongShardServer", cc), + changeFeedInputBytes("ChangeFeedInputBytes", cc), readReqTotalFilesReturned("ReadReqTotalFilesReturned", cc), readReqDeltaBytesReturned("ReadReqDeltaBytesReturned", cc), commitVersionChecks("CommitVersionChecks", cc), granuleUpdateErrors("GranuleUpdateErrors", cc), granuleRequestTimeouts("GranuleRequestTimeouts", cc), readRequestsWithBegin("ReadRequestsWithBegin", cc), readRequestsCollapsed("ReadRequestsCollapsed", cc), - flushGranuleReqs("FlushGranuleReqs", cc), numRangesAssigned(0), mutationBytesBuffered(0), activeReadRequests(0), - granulesPendingSplitCheck(0) { + flushGranuleReqs("FlushGranuleReqs", cc), compressionBytesRaw("CompressionBytesRaw", cc), + compressionBytesFinal("CompressionBytesFinal", cc), fullRejections("FullRejections", cc), numRangesAssigned(0), + 
mutationBytesBuffered(0), activeReadRequests(0), granulesPendingSplitCheck(0), minimumCFVersion(0), + cfVersionLag(0), notAtLatestChangeFeeds(0), lastResidentMemory(0), estimatedMaxResidentMemory(0), + initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock), deltaWritesLock(deltaWritesLock) { specialCounter(cc, "NumRangesAssigned", [this]() { return this->numRangesAssigned; }); specialCounter(cc, "MutationBytesBuffered", [this]() { return this->mutationBytesBuffered; }); specialCounter(cc, "ActiveReadRequests", [this]() { return this->activeReadRequests; }); specialCounter(cc, "GranulesPendingSplitCheck", [this]() { return this->granulesPendingSplitCheck; }); + specialCounter(cc, "MinimumChangeFeedVersion", [this]() { return this->minimumCFVersion; }); + specialCounter(cc, "CFVersionLag", [this]() { return this->cfVersionLag; }); + specialCounter(cc, "NotAtLatestChangeFeeds", [this]() { return this->notAtLatestChangeFeeds; }); + specialCounter(cc, "LastResidentMemory", [this]() { return this->lastResidentMemory; }); + specialCounter(cc, "EstimatedMaxResidentMemory", [this]() { return this->estimatedMaxResidentMemory; }); + specialCounter(cc, "InitialSnapshotsActive", [this]() { return this->initialSnapshotLock->activePermits(); }); + specialCounter(cc, "InitialSnapshotsWaiting", [this]() { return this->initialSnapshotLock->waiters(); }); + specialCounter(cc, "ReSnapshotsActive", [this]() { return this->resnapshotLock->activePermits(); }); + specialCounter(cc, "ReSnapshotsWaiting", [this]() { return this->resnapshotLock->waiters(); }); + specialCounter(cc, "DeltaFileWritesActive", [this]() { return this->deltaWritesLock->activePermits(); }); + specialCounter(cc, "DeltaFileWritesWaiting", [this]() { return this->deltaWritesLock->waiters(); }); logger = traceCounters("BlobWorkerMetrics", id, interval, &cc, "BlobWorkerMetrics"); } diff --git a/fdbclient/include/fdbclient/BlobWorkerInterface.h b/fdbclient/include/fdbclient/BlobWorkerInterface.h index 
de370d248f..69d938300e 100644 --- a/fdbclient/include/fdbclient/BlobWorkerInterface.h +++ b/fdbclient/include/fdbclient/BlobWorkerInterface.h @@ -30,15 +30,15 @@ struct BlobWorkerInterface { constexpr static FileIdentifier file_identifier = 8358753; - // TODO: mimic what StorageServerInterface does with sequential endpoint IDs RequestStream> waitFailure; - RequestStream blobGranuleFileRequest; + PublicRequestStream blobGranuleFileRequest; RequestStream assignBlobRangeRequest; RequestStream revokeBlobRangeRequest; RequestStream granuleAssignmentsRequest; RequestStream granuleStatusStreamRequest; RequestStream haltBlobWorker; RequestStream flushGranuleRequest; + RequestStream minBlobVersionRequest; struct LocalityData locality; UID myId; @@ -57,6 +57,7 @@ struct BlobWorkerInterface { streams.push_back(granuleStatusStreamRequest.getReceiver()); streams.push_back(haltBlobWorker.getReceiver()); streams.push_back(flushGranuleRequest.getReceiver()); + streams.push_back(minBlobVersionRequest.getReceiver()); FlowTransport::transport().addEndpoints(streams); } UID id() const { return myId; } @@ -72,7 +73,7 @@ struct BlobWorkerInterface { serializer(ar, myId, locality, waitFailure); if (Archive::isDeserializing) { blobGranuleFileRequest = - RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(1)); + PublicRequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(1)); assignBlobRangeRequest = RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(2)); revokeBlobRangeRequest = @@ -85,6 +86,8 @@ struct BlobWorkerInterface { RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(6)); flushGranuleRequest = RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(7)); + minBlobVersionRequest = + RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(8)); } } }; @@ -110,13 +113,16 @@ struct BlobGranuleFileRequest { Version readVersion; bool canCollapseBegin = true; TenantInfo tenantInfo; + bool summarize = false; ReplyPromise reply; 
BlobGranuleFileRequest() {} + bool verify() const { return tenantInfo.isAuthorized(); } + template void serialize(Ar& ar) { - serializer(ar, keyRange, beginVersion, readVersion, canCollapseBegin, tenantInfo, reply, arena); + serializer(ar, keyRange, beginVersion, readVersion, canCollapseBegin, tenantInfo, summarize, reply, arena); } }; @@ -137,6 +143,28 @@ struct RevokeBlobRangeRequest { } }; +struct MinBlobVersionReply { + constexpr static FileIdentifier file_identifier = 6857512; + Version version; + + template + void serialize(Ar& ar) { + serializer(ar, version); + } +}; + +struct MinBlobVersionRequest { + constexpr static FileIdentifier file_identifier = 4833278; + Version grv; + ReplyPromise reply; + + MinBlobVersionRequest() {} + + template + void serialize(Ar& ar) { + serializer(ar, grv, reply); + } +}; /* * Continue: Blob worker should continue handling a granule that was evaluated for a split * Normal: Blob worker should open the granule and start processing it @@ -172,6 +200,7 @@ struct GranuleStatusReply : public ReplyPromiseStreamReply { KeyRange granuleRange; bool doSplit; bool writeHotSplit; + bool initialSplitTooBig; int64_t continueEpoch; int64_t continueSeqno; UID granuleID; @@ -180,11 +209,13 @@ struct GranuleStatusReply : public ReplyPromiseStreamReply { bool mergeCandidate; int64_t originalEpoch; int64_t originalSeqno; + Optional proposedSplitKey; GranuleStatusReply() {} explicit GranuleStatusReply(KeyRange range, bool doSplit, bool writeHotSplit, + bool initialSplitTooBig, int64_t continueEpoch, int64_t continueSeqno, UID granuleID, @@ -193,11 +224,15 @@ struct GranuleStatusReply : public ReplyPromiseStreamReply { bool mergeCandidate, int64_t originalEpoch, int64_t originalSeqno) - : granuleRange(range), doSplit(doSplit), writeHotSplit(writeHotSplit), continueEpoch(continueEpoch), - continueSeqno(continueSeqno), granuleID(granuleID), startVersion(startVersion), blockedVersion(blockedVersion), - mergeCandidate(mergeCandidate), 
originalEpoch(originalEpoch), originalSeqno(originalSeqno) {} + : granuleRange(range), doSplit(doSplit), writeHotSplit(writeHotSplit), initialSplitTooBig(initialSplitTooBig), + continueEpoch(continueEpoch), continueSeqno(continueSeqno), granuleID(granuleID), startVersion(startVersion), + blockedVersion(blockedVersion), mergeCandidate(mergeCandidate), originalEpoch(originalEpoch), + originalSeqno(originalSeqno) {} - int expectedSize() const { return sizeof(GranuleStatusReply) + granuleRange.expectedSize(); } + int expectedSize() const { + return sizeof(GranuleStatusReply) + granuleRange.expectedSize() + + (proposedSplitKey.present() ? proposedSplitKey.get().expectedSize() : 0); + } template void serialize(Ar& ar) { @@ -207,6 +242,7 @@ struct GranuleStatusReply : public ReplyPromiseStreamReply { granuleRange, doSplit, writeHotSplit, + initialSplitTooBig, continueEpoch, continueSeqno, granuleID, @@ -214,7 +250,8 @@ struct GranuleStatusReply : public ReplyPromiseStreamReply { blockedVersion, mergeCandidate, originalEpoch, - originalSeqno); + originalSeqno, + proposedSplitKey); } }; diff --git a/fdbclient/include/fdbclient/ClientKnobs.h b/fdbclient/include/fdbclient/ClientKnobs.h index 4978bd84e9..4bcec9fd44 100644 --- a/fdbclient/include/fdbclient/ClientKnobs.h +++ b/fdbclient/include/fdbclient/ClientKnobs.h @@ -39,10 +39,6 @@ public: double FAILURE_MAX_DELAY; double FAILURE_MIN_DELAY; - double FAILURE_TIMEOUT_DELAY; - double CLIENT_FAILURE_TIMEOUT_DELAY; - double FAILURE_EMERGENCY_DELAY; - double FAILURE_MAX_GENERATIONS; double RECOVERY_DELAY_START_GENERATION; double RECOVERY_DELAY_SECONDS_PER_GENERATION; double MAX_GENERATIONS; @@ -61,6 +57,7 @@ public: double WRONG_SHARD_SERVER_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is // mostly wrong (e.g. 
dumping the database after a test) double FUTURE_VERSION_RETRY_DELAY; + double GRV_ERROR_RETRY_DELAY; double UNKNOWN_TENANT_RETRY_DELAY; int REPLY_BYTE_LIMIT; double DEFAULT_BACKOFF; @@ -81,6 +78,7 @@ public: int64_t CHANGE_FEED_CACHE_SIZE; double CHANGE_FEED_POP_TIMEOUT; int64_t CHANGE_FEED_STREAM_MIN_BYTES; + double CHANGE_FEED_START_INTERVAL; int MAX_BATCH_SIZE; double GRV_BATCH_TIMEOUT; @@ -161,10 +159,8 @@ public: double BACKUP_AGGREGATE_POLL_RATE; double BACKUP_AGGREGATE_POLL_RATE_UPDATE_INTERVAL; int BACKUP_LOG_WRITE_BATCH_MAX_SIZE; - int BACKUP_LOG_ATOMIC_OPS_SIZE; int BACKUP_MAX_LOG_RANGES; int BACKUP_SIM_COPY_LOG_RANGES; - int BACKUP_OPERATION_COST_OVERHEAD; int BACKUP_VERSION_DELAY; int BACKUP_MAP_KEY_LOWER_LIMIT; int BACKUP_MAP_KEY_UPPER_LIMIT; @@ -269,12 +265,9 @@ public: double BUSYNESS_SPIKE_START_THRESHOLD; double BUSYNESS_SPIKE_SATURATED_THRESHOLD; - // multi-version client control - int MVC_CLIENTLIB_CHUNK_SIZE; - int MVC_CLIENTLIB_CHUNKS_PER_TRANSACTION; - // Blob Granules int BG_MAX_GRANULE_PARALLELISM; + int BG_TOO_MANY_GRANULES; // The coordinator key/value in storage server might be inconsistent to the value stored in the cluster file. // This might happen when a recovery is happening together with a cluster controller coordinator key change. 
@@ -285,6 +278,12 @@ public: // Tenants and Metacluster int MAX_TENANTS_PER_CLUSTER; + int TENANT_TOMBSTONE_CLEANUP_INTERVAL; + int MAX_DATA_CLUSTERS; + int REMOVE_CLUSTER_TENANT_BATCH_SIZE; + int METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK; + double METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY; + double METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT; ClientKnobs(Randomize randomize); void initialize(Randomize randomize); diff --git a/fdbclient/include/fdbclient/ClusterInterface.h b/fdbclient/include/fdbclient/ClusterInterface.h index 14935f1700..a4e3da44f3 100644 --- a/fdbclient/include/fdbclient/ClusterInterface.h +++ b/fdbclient/include/fdbclient/ClusterInterface.h @@ -98,32 +98,44 @@ struct ClusterControllerClientInterface { } }; -template -struct ItemWithExamples { - T item; - int count; - std::vector> examples; - - ItemWithExamples() : item{}, count(0) {} - ItemWithExamples(T const& item, int count, std::vector> const& examples) - : item(item), count(count), examples(examples) {} - - template - void serialize(Ar& ar) { - serializer(ar, item, count, examples); - } -}; - struct OpenDatabaseRequest { constexpr static FileIdentifier file_identifier = 2799502; // Sent by the native API to the cluster controller to open a database and track client // info changes. 
Returns immediately if the current client info id is different from // knownClientInfoID; otherwise returns when it next changes (or perhaps after a long interval) - int clientCount; - std::vector> issues; - std::vector>> supportedVersions; - std::vector> maxProtocolSupported; + struct Samples { + int count; + + // network address / trace log group + std::set> samples; + + Samples() : count(0), samples{} {} + + template + void serialize(Ar& ar) { + serializer(ar, count, samples); + } + + // Merges a set of Samples into *this + Samples& operator+=(const Samples& other) { + count += other.count; + samples.insert(std::begin(other.samples), std::end(other.samples)); + + return *this; + } + }; + + int clientCount = 0; + + // Maps issue to Samples + std::map issues; + + // Maps ClientVersionRef to Samples + std::map, Samples> supportedVersions; + + // Maps max protocol to Samples + std::map maxProtocolSupported; UID knownClientInfoID; ReplyPromise reply; diff --git a/fdbclient/include/fdbclient/CommitProxyInterface.h b/fdbclient/include/fdbclient/CommitProxyInterface.h index 253e0e1f36..1a6a0410ae 100644 --- a/fdbclient/include/fdbclient/CommitProxyInterface.h +++ b/fdbclient/include/fdbclient/CommitProxyInterface.h @@ -25,6 +25,7 @@ #include #include +#include "fdbclient/EncryptKeyProxyInterface.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/StorageServerInterface.h" #include "fdbclient/CommitTransaction.h" @@ -118,8 +119,11 @@ struct ClientDBInfo { std::vector history; UID clusterId; bool isEncryptionEnabled = false; + Optional encryptKeyProxy; TenantMode tenantMode; + ClusterType clusterType = ClusterType::STANDALONE; + Optional metaclusterName; ClientDBInfo() {} @@ -131,7 +135,18 @@ struct ClientDBInfo { if constexpr (!is_fb_function) { ASSERT(ar.protocolVersion().isValid()); } - serializer(ar, grvProxies, commitProxies, id, forward, history, tenantMode, clusterId, isEncryptionEnabled); + serializer(ar, + grvProxies, + commitProxies, + id, + forward, + 
history, + tenantMode, + isEncryptionEnabled, + encryptKeyProxy, + clusterId, + clusterType, + metaclusterName); } }; diff --git a/fdbclient/include/fdbclient/CommitTransaction.h b/fdbclient/include/fdbclient/CommitTransaction.h index f6757ac17e..dc26df4fa4 100644 --- a/fdbclient/include/fdbclient/CommitTransaction.h +++ b/fdbclient/include/fdbclient/CommitTransaction.h @@ -25,6 +25,7 @@ #include "fdbclient/FDBTypes.h" #include "fdbclient/Knobs.h" #include "fdbclient/Tracing.h" +#include "flow/BlobCipher.h" // The versioned message has wire format : -1, version, messages static const int32_t VERSION_HEADER = -1; @@ -79,7 +80,7 @@ struct MutationRef { CompareAndClear, Reserved_For_SpanContextMessage /* See fdbserver/SpanContextMessage.h */, Reserved_For_OTELSpanContextMessage, - Reserved_For_EncryptedMutationMessage /* See fdbserver/EncryptedMutationMessage.actor.h */, + Encrypted, /* Represents an encrypted mutation and cannot be used directly before decrypting */ MAX_ATOMIC_OP }; // This is stored this way for serialization purposes. @@ -128,6 +129,64 @@ struct MutationRef { } } + // An encrypted mutation has type Encrypted, encryption header (which contains encryption metadata) as param1, + // and the payload as param2. It can be serialize/deserialize as normal mutation, but can only be used after + // decryption via decrypt(). 
+ bool isEncrypted() const { return type == Encrypted; } + + const BlobCipherEncryptHeader* encryptionHeader() const { + ASSERT(isEncrypted()); + return reinterpret_cast(param1.begin()); + } + + MutationRef encrypt(const std::unordered_map>& cipherKeys, + const EncryptCipherDomainId& domainId, + Arena& arena) const { + ASSERT_NE(domainId, ENCRYPT_INVALID_DOMAIN_ID); + auto textCipherItr = cipherKeys.find(domainId); + auto headerCipherItr = cipherKeys.find(ENCRYPT_HEADER_DOMAIN_ID); + ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid()); + ASSERT(headerCipherItr != cipherKeys.end() && headerCipherItr->second.isValid()); + uint8_t iv[AES_256_IV_LENGTH] = { 0 }; + deterministicRandom()->randomBytes(iv, AES_256_IV_LENGTH); + BinaryWriter bw(AssumeVersion(ProtocolVersion::withEncryptionAtRest())); + bw << *this; + EncryptBlobCipherAes265Ctr cipher(textCipherItr->second, + headerCipherItr->second, + iv, + AES_256_IV_LENGTH, + ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); + BlobCipherEncryptHeader* header = new (arena) BlobCipherEncryptHeader; + StringRef headerRef(reinterpret_cast(header), sizeof(BlobCipherEncryptHeader)); + StringRef payload = + cipher.encrypt(static_cast(bw.getData()), bw.getLength(), header, arena)->toStringRef(); + return MutationRef(Encrypted, headerRef, payload); + } + + MutationRef encryptMetadata(const std::unordered_map>& cipherKeys, + Arena& arena) const { + return encrypt(cipherKeys, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, arena); + } + + MutationRef decrypt(const std::unordered_map>& cipherKeys, + Arena& arena, + StringRef* buf = nullptr) const { + const BlobCipherEncryptHeader* header = encryptionHeader(); + auto textCipherItr = cipherKeys.find(header->cipherTextDetails); + auto headerCipherItr = cipherKeys.find(header->cipherHeaderDetails); + ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid()); + ASSERT(headerCipherItr != cipherKeys.end() && headerCipherItr->second.isValid()); + 
DecryptBlobCipherAes256Ctr cipher(textCipherItr->second, headerCipherItr->second, header->iv); + StringRef plaintext = cipher.decrypt(param2.begin(), param2.size(), *header, arena)->toStringRef(); + if (buf != nullptr) { + *buf = plaintext; + } + ArenaReader reader(arena, plaintext, AssumeVersion(ProtocolVersion::withEncryptionAtRest())); + MutationRef mutation; + reader >> mutation; + return mutation; + } + // These masks define which mutation types have particular properties (they are used to implement // isSingleKeyMutation() etc) enum { diff --git a/fdbclient/include/fdbclient/DatabaseContext.h b/fdbclient/include/fdbclient/DatabaseContext.h index 092a290a4c..1a4fee7126 100644 --- a/fdbclient/include/fdbclient/DatabaseContext.h +++ b/fdbclient/include/fdbclient/DatabaseContext.h @@ -25,6 +25,7 @@ #include "flow/FastRef.h" #include "fdbclient/GlobalConfig.actor.h" #include "fdbclient/StorageServerInterface.h" +#include "flow/IRandom.h" #include "flow/genericactors.actor.h" #include #include @@ -167,10 +168,11 @@ struct ChangeFeedStorageData : ReferenceCounted { Future updater; NotifiedVersion version; NotifiedVersion desired; - Promise destroyed; UID interfToken; + DatabaseContext* context; + double created; - ~ChangeFeedStorageData() { destroyed.send(Void()); } + ~ChangeFeedStorageData(); }; struct ChangeFeedData : ReferenceCounted { @@ -180,6 +182,8 @@ struct ChangeFeedData : ReferenceCounted { Version getVersion(); Future whenAtLeast(Version version); + UID dbgid; + DatabaseContext* context; NotifiedVersion lastReturnedVersion; std::vector> storageData; AsyncVar notAtLatest; @@ -188,8 +192,10 @@ struct ChangeFeedData : ReferenceCounted { Version endVersion = invalidVersion; Version popVersion = invalidVersion; // like TLog pop version, set by SS and client can check it to see if they missed data + double created = 0; - ChangeFeedData() : notAtLatest(1) {} + explicit ChangeFeedData(DatabaseContext* context = nullptr); + ~ChangeFeedData(); }; struct 
EndpointFailureInfo { @@ -374,12 +380,18 @@ public: Future getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion); Future popChangeFeedMutations(Key rangeID, Version version); + // BlobGranule API. Future purgeBlobGranules(KeyRange keyRange, Version purgeVersion, Optional tenant, bool force = false); Future waitPurgeGranulesComplete(Key purgeKey); + Future blobbifyRange(KeyRange range); + Future unblobbifyRange(KeyRange range); + Future>> listBlobbifiedRanges(KeyRange range, int rangeLimit); + Future verifyBlobRange(const KeyRange& range, Optional version); + // private: explicit DatabaseContext(Reference>> connectionRecord, Reference> clientDBInfo, @@ -467,9 +479,12 @@ public: std::unordered_map> tssMetrics; // map from changeFeedId -> changeFeedRange std::unordered_map changeFeedCache; - std::unordered_map> changeFeedUpdaters; + std::unordered_map changeFeedUpdaters; + std::map notAtLatestChangeFeeds; Reference getStorageData(StorageServerInterface interf); + Version getMinimumChangeFeedVersion(); + void setDesiredChangeFeedVersion(Version v); // map from ssid -> ss tag // @note this map allows the client to identify the latest commit versions diff --git a/fdbserver/include/fdbserver/EncryptKeyProxyInterface.h b/fdbclient/include/fdbclient/EncryptKeyProxyInterface.h similarity index 100% rename from fdbserver/include/fdbserver/EncryptKeyProxyInterface.h rename to fdbclient/include/fdbclient/EncryptKeyProxyInterface.h diff --git a/fdbclient/include/fdbclient/FDBTypes.h b/fdbclient/include/fdbclient/FDBTypes.h index 4f4d48cb75..f770c92592 100644 --- a/fdbclient/include/fdbclient/FDBTypes.h +++ b/fdbclient/include/fdbclient/FDBTypes.h @@ -331,6 +331,22 @@ struct KeyRangeRef { bool empty() const { return begin == end; } bool singleKeyRange() const { return equalsKeyAfter(begin, end); } + // Return true if it's fully covered by given range list. 
Note that ranges should be sorted + bool isCovered(std::vector& ranges) { + ASSERT(std::is_sorted(ranges.begin(), ranges.end(), KeyRangeRef::ArbitraryOrder())); + KeyRangeRef clone(begin, end); + for (auto r : ranges) { + if (begin < r.begin) + return false; // uncovered gap between clone.begin and r.begin + if (end <= r.end) + return true; // range is fully covered + if (end > r.begin) + // {clone.begin, r.end} is covered. need to check coverage for {r.end, clone.end} + clone = KeyRangeRef(r.end, clone.end); + } + return false; + } + Standalone withPrefix(const StringRef& prefix) const { return KeyRangeRef(begin.withPrefix(prefix), end.withPrefix(prefix)); } @@ -1283,8 +1299,6 @@ struct WorkerBackupStatus { enum class TransactionPriority : uint8_t { BATCH, DEFAULT, IMMEDIATE, MIN = BATCH, MAX = IMMEDIATE }; -enum class ReadType { EAGER = 0, FETCH = 1, LOW = 2, NORMAL = 3, HIGH = 4, MIN = EAGER, MAX = HIGH }; - const std::array allTransactionPriorities = { TransactionPriority::BATCH, TransactionPriority::DEFAULT, @@ -1394,6 +1408,11 @@ struct TenantMode { uint32_t mode; }; +typedef StringRef ClusterNameRef; +typedef Standalone ClusterName; + +enum class ClusterType { STANDALONE, METACLUSTER_MANAGEMENT, METACLUSTER_DATA }; + struct GRVCacheSpace { Version cachedReadVersion; double lastGrvTime; @@ -1415,7 +1434,7 @@ struct DatabaseSharedState { std::atomic refCount; DatabaseSharedState() - : protocolVersion(currentProtocolVersion), mutexLock(Mutex()), grvCacheSpace(GRVCacheSpace()), refCount(0) {} + : protocolVersion(currentProtocolVersion()), mutexLock(Mutex()), grvCacheSpace(GRVCacheSpace()), refCount(0) {} }; inline bool isValidPerpetualStorageWiggleLocality(std::string locality) { @@ -1462,7 +1481,7 @@ struct StorageMetadataType { bool wrongConfigured = false; StorageMetadataType() : createdTime(0) {} - StorageMetadataType(uint64_t t, KeyValueStoreType storeType = KeyValueStoreType::END, bool wrongConfigured = false) + StorageMetadataType(double t, 
KeyValueStoreType storeType = KeyValueStoreType::END, bool wrongConfigured = false) : createdTime(t), storeType(storeType), wrongConfigured(wrongConfigured) {} static double currentTime() { return g_network->timer(); } @@ -1512,6 +1531,44 @@ struct StorageWiggleValue { } }; +enum class ReadType { + EAGER = 0, + FETCH = 1, + LOW = 2, + NORMAL = 3, + HIGH = 4, + MIN = EAGER, + MAX = HIGH +}; + +FDB_DECLARE_BOOLEAN_PARAM(CacheResult); + +// store options for storage engine read +// ReadType describes the usage and priority of the read +// cacheResult determines whether the storage engine cache for this read +// consistencyCheckStartVersion indicates the consistency check which began at this version +// debugID helps to trace the path of the read +struct ReadOptions { + ReadType type; + // Once CacheResult is serializable, change type from bool to CacheResult + bool cacheResult; + Optional debugID; + Optional consistencyCheckStartVersion; + + ReadOptions() : type(ReadType::NORMAL), cacheResult(CacheResult::True){}; + + ReadOptions(Optional debugID, + ReadType type = ReadType::NORMAL, + CacheResult cache = CacheResult::False, + Optional version = Optional()) + : type(type), cacheResult(cache), debugID(debugID), consistencyCheckStartVersion(version){}; + + template + void serialize(Ar& ar) { + serializer(ar, type, cacheResult, debugID, consistencyCheckStartVersion); + } +}; + // Can be used to identify types (e.g. IDatabase) that can be used to create transactions with a `createTransaction` // function template diff --git a/fdbclient/include/fdbclient/GenericManagementAPI.actor.h b/fdbclient/include/fdbclient/GenericManagementAPI.actor.h index 8fe0d08fd2..4c920f5da6 100644 --- a/fdbclient/include/fdbclient/GenericManagementAPI.actor.h +++ b/fdbclient/include/fdbclient/GenericManagementAPI.actor.h @@ -39,6 +39,7 @@ the contents of the system key space. 
#include "fdbclient/Status.h" #include "fdbclient/Subspace.h" #include "fdbclient/DatabaseConfiguration.h" +#include "fdbclient/Metacluster.h" #include "fdbclient/Status.h" #include "fdbclient/SystemData.h" #include "flow/actorcompiler.h" // has to be last include @@ -69,6 +70,7 @@ enum class ConfigurationResult { SUCCESS_WARN_SHARDED_ROCKSDB_EXPERIMENTAL, DATABASE_CREATED_WARN_ROCKSDB_EXPERIMENTAL, DATABASE_CREATED_WARN_SHARDED_ROCKSDB_EXPERIMENTAL, + DATABASE_IS_REGISTERED }; enum class CoordinatorsResult { @@ -475,6 +477,14 @@ Future changeConfig(Reference db, std::map metaclusterRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + if (metaclusterRegistration.present()) { + return ConfigurationResult::DATABASE_IS_REGISTERED; + } + } } } if (creating) { diff --git a/fdbserver/GetEncryptCipherKeys.actor.cpp b/fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h similarity index 66% rename from fdbserver/GetEncryptCipherKeys.actor.cpp rename to fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h index 328ff21587..42537bfacb 100644 --- a/fdbserver/GetEncryptCipherKeys.actor.cpp +++ b/fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h @@ -1,5 +1,5 @@ /* - * GetEncryptCipherKeys.actor.cpp + * GetEncryptCipherKeys.actor.h * * This source file is part of the FoundationDB open source project * @@ -17,18 +17,29 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#pragma once +#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_GETCIPHERKEYS_ACTOR_G_H) +#define FDBCLIENT_GETCIPHERKEYS_ACTOR_G_H +#include "fdbclient/GetEncryptCipherKeys.actor.g.h" +#elif !defined(FDBCLIENT_GETCIPHERKEYS_ACTOR_H) +#define FDBCLIENT_GETCIPHERKEYS_ACTOR_H -#include "fdbserver/GetEncryptCipherKeys.h" +#include "fdbclient/EncryptKeyProxyInterface.h" +#include "flow/BlobCipher.h" +#include "flow/IRandom.h" -#include +#include +#include -namespace { +#include "flow/actorcompiler.h" // This must be the last #include. -Optional getEncryptKeyProxyId(const Reference const>& db) { - return db->get().encryptKeyProxy.map([](EncryptKeyProxyInterface proxy) { return proxy.id(); }); +template +Optional getEncryptKeyProxyId(const Reference const>& db) { + return db->get().encryptKeyProxy.template map([](EncryptKeyProxyInterface proxy) { return proxy.id(); }); } -ACTOR Future onEncryptKeyProxyChange(Reference const> db) { +ACTOR template +Future onEncryptKeyProxyChange(Reference const> db) { state Optional previousProxyId = getEncryptKeyProxyId(db); state Optional currentProxyId; loop { @@ -44,9 +55,9 @@ ACTOR Future onEncryptKeyProxyChange(Reference cons return Void(); } -ACTOR Future getUncachedLatestEncryptCipherKeys( - Reference const> db, - EKPGetLatestBaseCipherKeysRequest request) { +ACTOR template +Future getUncachedLatestEncryptCipherKeys(Reference const> db, + EKPGetLatestBaseCipherKeysRequest request) { Optional proxy = db->get().encryptKeyProxy; if (!proxy.present()) { // Wait for onEncryptKeyProxyChange. @@ -71,10 +82,12 @@ ACTOR Future getUncachedLatestEncryptCipherKeys } } -} // anonymous namespace - -ACTOR Future>> getLatestEncryptCipherKeys( - Reference const> db, +// Get latest cipher keys for given encryption domains. It tries to get the cipher keys from local cache. +// In case of cache miss, it fetches the cipher keys from EncryptKeyProxy and put the result in the local cache +// before return. 
+ACTOR template +Future>> getLatestEncryptCipherKeys( + Reference const> db, std::unordered_map domains) { state Reference cipherKeyCache = BlobCipherKeyCache::getInstance(); state std::unordered_map> cipherKeys; @@ -105,8 +118,12 @@ ACTOR Future> for (const EKPBaseCipherDetails& details : reply.baseCipherDetails) { EncryptCipherDomainId domainId = details.encryptDomainId; if (domains.count(domainId) > 0 && cipherKeys.count(domainId) == 0) { - Reference cipherKey = cipherKeyCache->insertCipherKey( - domainId, details.baseCipherId, details.baseCipherKey.begin(), details.baseCipherKey.size()); + Reference cipherKey = cipherKeyCache->insertCipherKey(domainId, + details.baseCipherId, + details.baseCipherKey.begin(), + details.baseCipherKey.size(), + details.refreshAt, + details.expireAt); ASSERT(cipherKey.isValid()); cipherKeys[domainId] = cipherKey; } @@ -126,10 +143,9 @@ ACTOR Future> return cipherKeys; } -namespace { - -ACTOR Future getUncachedEncryptCipherKeys(Reference const> db, - EKPGetBaseCipherKeysByIdsRequest request) { +ACTOR template +Future getUncachedEncryptCipherKeys(Reference const> db, + EKPGetBaseCipherKeysByIdsRequest request) { Optional proxy = db->get().encryptKeyProxy; if (!proxy.present()) { // Wait for onEncryptKeyProxyChange. @@ -156,10 +172,12 @@ ACTOR Future getUncachedEncryptCipherKeys(Refere using BaseCipherIndex = std::pair; -} // anonymous namespace - -ACTOR Future>> getEncryptCipherKeys( - Reference const> db, +// Get cipher keys specified by the list of cipher details. It tries to get the cipher keys from local cache. +// In case of cache miss, it fetches the cipher keys from EncryptKeyProxy and put the result in the local cache +// before return. +ACTOR template +Future>> getEncryptCipherKeys( + Reference const> db, std::unordered_set cipherDetails) { state Reference cipherKeyCache = BlobCipherKeyCache::getInstance(); state std::unordered_map> cipherKeys; @@ -191,10 +209,10 @@ ACTOR Future>> ge // Fetch any uncached cipher keys. 
loop choose { when(EKPGetBaseCipherKeysByIdsReply reply = wait(getUncachedEncryptCipherKeys(db, request))) { - std::unordered_map> baseCipherKeys; + std::unordered_map> baseCipherKeys; for (const EKPBaseCipherDetails& baseDetails : reply.baseCipherDetails) { BaseCipherIndex baseIdx = std::make_pair(baseDetails.encryptDomainId, baseDetails.baseCipherId); - baseCipherKeys[baseIdx] = baseDetails.baseCipherKey; + baseCipherKeys[baseIdx] = baseDetails; } // Insert base cipher keys into cache and construct result. for (const BlobCipherDetails& details : cipherDetails) { @@ -211,9 +229,11 @@ ACTOR Future>> ge } Reference cipherKey = cipherKeyCache->insertCipherKey(details.encryptDomainId, details.baseCipherId, - itr->second.begin(), - itr->second.size(), - details.salt); + itr->second.baseCipherKey.begin(), + itr->second.baseCipherKey.size(), + details.salt, + itr->second.refreshAt, + itr->second.expireAt); ASSERT(cipherKey.isValid()); cipherKeys[details] = cipherKey; } @@ -225,24 +245,35 @@ ACTOR Future>> ge return cipherKeys; } -ACTOR Future getLatestSystemEncryptCipherKeys(Reference const> db) { - static std::unordered_map domains = { - { SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME }, - { ENCRYPT_HEADER_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME } - }; +struct TextAndHeaderCipherKeys { + Reference cipherTextKey; + Reference cipherHeaderKey; +}; + +ACTOR template +Future getLatestEncryptCipherKeysForDomain(Reference const> db, + EncryptCipherDomainId domainId, + EncryptCipherDomainName domainName) { + std::unordered_map domains; + domains[domainId] = domainName; + domains[ENCRYPT_HEADER_DOMAIN_ID] = FDB_DEFAULT_ENCRYPT_DOMAIN_NAME; std::unordered_map> cipherKeys = wait(getLatestEncryptCipherKeys(db, domains)); - ASSERT(cipherKeys.count(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) > 0); + ASSERT(cipherKeys.count(domainId) > 0); ASSERT(cipherKeys.count(ENCRYPT_HEADER_DOMAIN_ID) > 0); - TextAndHeaderCipherKeys result{ 
cipherKeys.at(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID), - cipherKeys.at(ENCRYPT_HEADER_DOMAIN_ID) }; + TextAndHeaderCipherKeys result{ cipherKeys.at(domainId), cipherKeys.at(ENCRYPT_HEADER_DOMAIN_ID) }; ASSERT(result.cipherTextKey.isValid()); ASSERT(result.cipherHeaderKey.isValid()); return result; } -ACTOR Future getEncryptCipherKeys(Reference const> db, - BlobCipherEncryptHeader header) { +template +Future getLatestSystemEncryptCipherKeys(const Reference const>& db) { + return getLatestEncryptCipherKeysForDomain(db, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME); +} + +ACTOR template +Future getEncryptCipherKeys(Reference const> db, BlobCipherEncryptHeader header) { std::unordered_set cipherDetails{ header.cipherTextDetails, header.cipherHeaderDetails }; std::unordered_map> cipherKeys = wait(getEncryptCipherKeys(db, cipherDetails)); @@ -254,3 +285,6 @@ ACTOR Future getEncryptCipherKeys(Reference>> getRangeSplitPoints(const KeyRangeRef& range, int64_t chunkSize) = 0; - virtual ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange) = 0; + virtual ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange, + int rowLimit) = 0; virtual ThreadResult readBlobGranules(const KeyRangeRef& keyRange, Version beginVersion, Optional readVersion, ReadBlobGranuleContext granuleContext) = 0; + virtual ThreadFuture>> readBlobGranulesStart( + const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) = 0; + + virtual ThreadResult readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) = 0; + virtual void atomicOp(const KeyRef& key, const ValueRef& value, uint32_t operationType) = 0; virtual void set(const KeyRef& key, const ValueRef& value) = 0; virtual void clear(const KeyRef& begin, const KeyRef& end) = 0; @@ -172,6 +187,13 @@ public: virtual ThreadFuture purgeBlobGranules(const 
KeyRangeRef& keyRange, Version purgeVersion, bool force) = 0; virtual ThreadFuture waitPurgeGranulesComplete(const KeyRef& purgeKey) = 0; + virtual ThreadFuture blobbifyRange(const KeyRangeRef& keyRange) = 0; + virtual ThreadFuture unblobbifyRange(const KeyRangeRef& keyRange) = 0; + virtual ThreadFuture>> listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) = 0; + + virtual ThreadFuture verifyBlobRange(const KeyRangeRef& keyRange, Optional version) = 0; + // Interface to manage shared state across multiple connections to the same Database virtual ThreadFuture createSharedState() = 0; virtual void setSharedState(DatabaseSharedState* p) = 0; @@ -190,6 +212,7 @@ public: virtual void selectApiVersion(int apiVersion) = 0; virtual const char* getClientVersion() = 0; + virtual void useFutureProtocolVersion() = 0; virtual void setNetworkOption(FDBNetworkOptions::Option option, Optional value = Optional()) = 0; diff --git a/fdbclient/include/fdbclient/IConfigTransaction.h b/fdbclient/include/fdbclient/IConfigTransaction.h index 8f21679e27..9246e4016e 100644 --- a/fdbclient/include/fdbclient/IConfigTransaction.h +++ b/fdbclient/include/fdbclient/IConfigTransaction.h @@ -55,7 +55,7 @@ public: Future>> getRangeSplitPoints(KeyRange const& range, int64_t chunkSize) override { throw client_invalid_operation(); } - Future>> getBlobGranuleRanges(KeyRange const& range) override { + Future>> getBlobGranuleRanges(KeyRange const& range, int rowLimit) override { throw client_invalid_operation(); } Future>> readBlobGranules(KeyRange const& range, diff --git a/fdbclient/include/fdbclient/ISingleThreadTransaction.h b/fdbclient/include/fdbclient/ISingleThreadTransaction.h index b44f58b464..6143ec8605 100644 --- a/fdbclient/include/fdbclient/ISingleThreadTransaction.h +++ b/fdbclient/include/fdbclient/ISingleThreadTransaction.h @@ -80,7 +80,7 @@ public: virtual Future>> getAddressesForKey(Key const& key) = 0; virtual Future>> getRangeSplitPoints(KeyRange const& range, int64_t 
chunkSize) = 0; virtual Future getEstimatedRangeSizeBytes(KeyRange const& keys) = 0; - virtual Future>> getBlobGranuleRanges(KeyRange const& range) = 0; + virtual Future>> getBlobGranuleRanges(KeyRange const& range, int rangeLimit) = 0; virtual Future>> readBlobGranules(KeyRange const& range, Version begin, Optional readVersion, diff --git a/fdbclient/include/fdbclient/KeyBackedTypes.h b/fdbclient/include/fdbclient/KeyBackedTypes.h index 2977262b6c..a3fee57644 100644 --- a/fdbclient/include/fdbclient/KeyBackedTypes.h +++ b/fdbclient/include/fdbclient/KeyBackedTypes.h @@ -25,6 +25,7 @@ #include "fdbclient/ClientBooleanParams.h" #include "fdbclient/CommitTransaction.h" +#include "fdbclient/FDBOptions.g.h" #include "fdbclient/GenericTransactionHelper.h" #include "fdbclient/Subspace.h" #include "flow/ObjectSerializer.h" @@ -156,6 +157,12 @@ struct NullCodec { static Standalone unpack(Standalone val) { return val; } }; +template +struct BinaryCodec { + static Standalone pack(T val) { return BinaryWriter::toValue(val, Unversioned()); } + static T unpack(Standalone val) { return BinaryReader::fromStringRef(val, Unversioned()); } +}; + template struct KeyBackedRangeResult { std::vector results; @@ -364,6 +371,16 @@ public: })); } + // Get key's value or defaultValue if it doesn't exist + template + Future getD(Transaction tr, + KeyType const& key, + Snapshot snapshot = Snapshot::False, + ValueType defaultValue = ValueType()) const { + return map(get(tr, key, snapshot), + [=](Optional val) -> ValueType { return val.orDefault(defaultValue); }); + } + // Returns a Property that can be get/set that represents key's entry in this this. 
KeyBackedProperty getProperty(KeyType const& key) const { return subspace.begin.withSuffix(KeyCodec::pack(key)); @@ -378,6 +395,13 @@ public: return k.expectedSize() + v.expectedSize(); } + template + void atomicOp(Transaction tr, KeyType const& key, ValueType const& val, MutationRef::Type type) { + Key k = subspace.begin.withSuffix(KeyCodec::pack(key)); + Value v = ValueCodec::pack(val); + tr->atomicOp(k, v, type); + } + template void erase(Transaction tr, KeyType const& key) { tr->clear(subspace.begin.withSuffix(KeyCodec::pack(key))); diff --git a/fdbclient/include/fdbclient/KeyRangeMap.h b/fdbclient/include/fdbclient/KeyRangeMap.h index 88cce027a8..f88dc72dda 100644 --- a/fdbclient/include/fdbclient/KeyRangeMap.h +++ b/fdbclient/include/fdbclient/KeyRangeMap.h @@ -136,6 +136,16 @@ Future krmGetRanges(Reference const& tr, KeyRange const& keys, int const& limit = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT, int const& limitBytes = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT_BYTES); +Future krmGetRangesUnaligned(Transaction* const& tr, + Key const& mapPrefix, + KeyRange const& keys, + int const& limit = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT, + int const& limitBytes = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT_BYTES); +Future krmGetRangesUnaligned(Reference const& tr, + Key const& mapPrefix, + KeyRange const& keys, + int const& limit = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT, + int const& limitBytes = CLIENT_KNOBS->KRM_GET_RANGE_LIMIT_BYTES); void krmSetPreviouslyEmptyRange(Transaction* tr, const KeyRef& mapPrefix, const KeyRangeRef& keys, @@ -162,7 +172,7 @@ Future krmSetRangeCoalescing(Reference const& t KeyRange const& range, KeyRange const& maxRange, Value const& value); -RangeResult krmDecodeRanges(KeyRef mapPrefix, KeyRange keys, RangeResult kv); +RangeResult krmDecodeRanges(KeyRef mapPrefix, KeyRange keys, RangeResult kv, bool align = true); template std::vector> KeyRangeMap::getAffectedRangesAfterInsertion( diff --git a/fdbclient/include/fdbclient/Metacluster.h 
b/fdbclient/include/fdbclient/Metacluster.h new file mode 100644 index 0000000000..99abed564b --- /dev/null +++ b/fdbclient/include/fdbclient/Metacluster.h @@ -0,0 +1,183 @@ +/* + * Metacluster.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FDBCLIENT_METACLUSTER_H +#define FDBCLIENT_METACLUSTER_H +#include "CoordinationInterface.h" +#include "json_spirit/json_spirit_value.h" +#pragma once + +#include "fdbclient/FDBTypes.h" +#include "fdbclient/KeyBackedTypes.h" +#include "flow/flat_buffers.h" + +struct ClusterUsage { + int numTenantGroups = 0; + + ClusterUsage() = default; + ClusterUsage(int numTenantGroups) : numTenantGroups(numTenantGroups) {} + + json_spirit::mObject toJson() const; + + bool operator==(const ClusterUsage& other) const noexcept { return numTenantGroups == other.numTenantGroups; } + bool operator!=(const ClusterUsage& other) const noexcept { return !(*this == other); } + bool operator<(const ClusterUsage& other) const noexcept { return numTenantGroups < other.numTenantGroups; } + + template + void serialize(Ar& ar) { + serializer(ar, numTenantGroups); + } +}; + +template <> +struct Traceable : std::true_type { + static std::string toString(const ClusterUsage& value) { + return format("NumTenantGroups: %d", value.numTenantGroups); + } +}; + +// Represents the various states 
that a data cluster could be in. +// +// READY - the data cluster is active +// REMOVING - the data cluster is being removed and cannot have its configuration changed or any tenants created +// RESTORING - the data cluster is being restored and cannot have its configuration changed or any tenants +// created/updated/deleted. +enum class DataClusterState { READY, REMOVING, RESTORING }; + +struct DataClusterEntry { + constexpr static FileIdentifier file_identifier = 929511; + + static std::string clusterStateToString(DataClusterState clusterState); + static DataClusterState stringToClusterState(std::string stateStr); + + UID id; + ClusterUsage capacity; + ClusterUsage allocated; + + DataClusterState clusterState = DataClusterState::READY; + + DataClusterEntry() = default; + DataClusterEntry(ClusterUsage capacity) : capacity(capacity) {} + DataClusterEntry(UID id, ClusterUsage capacity, ClusterUsage allocated) + : id(id), capacity(capacity), allocated(allocated) {} + + // Returns true if all configurable properties match + bool matchesConfiguration(DataClusterEntry const& other) const { + return id == other.id && capacity == other.capacity; + } + + bool hasCapacity() const { return allocated < capacity; } + + Value encode() const { return ObjectWriter::toValue(*this, IncludeVersion()); } + static DataClusterEntry decode(ValueRef const& value) { + return ObjectReader::fromStringRef(value, IncludeVersion()); + } + + json_spirit::mObject toJson() const; + + template + void serialize(Ar& ar) { + serializer(ar, id, capacity, allocated, clusterState); + } +}; + +struct MetaclusterRegistrationEntry { + constexpr static FileIdentifier file_identifier = 13448589; + + ClusterType clusterType; + + ClusterName metaclusterName; + ClusterName name; + UID metaclusterId; + UID id; + + MetaclusterRegistrationEntry() = default; + MetaclusterRegistrationEntry(ClusterName metaclusterName, UID metaclusterId) + : clusterType(ClusterType::METACLUSTER_MANAGEMENT), 
metaclusterName(metaclusterName), name(metaclusterName), + metaclusterId(metaclusterId), id(metaclusterId) {} + MetaclusterRegistrationEntry(ClusterName metaclusterName, ClusterName name, UID metaclusterId, UID id) + : clusterType(ClusterType::METACLUSTER_DATA), metaclusterName(metaclusterName), name(name), + metaclusterId(metaclusterId), id(id) { + ASSERT(metaclusterName != name && metaclusterId != id); + } + + // Returns true if this entry is associated with the same cluster as the passed in entry. If one entry is from the + // management cluster and the other is from a data cluster, this checks whether they are part of the same + // metacluster. + bool matches(MetaclusterRegistrationEntry const& other) const { + if (metaclusterName != other.metaclusterName || metaclusterId != other.metaclusterId) { + return false; + } else if (clusterType == ClusterType::METACLUSTER_DATA && other.clusterType == ClusterType::METACLUSTER_DATA && + (name != other.name || id != other.id)) { + return false; + } + + return true; + } + + MetaclusterRegistrationEntry toManagementClusterRegistration() const { + ASSERT(clusterType == ClusterType::METACLUSTER_DATA); + return MetaclusterRegistrationEntry(metaclusterName, metaclusterId); + } + + MetaclusterRegistrationEntry toDataClusterRegistration(ClusterName name, UID id) const { + ASSERT(clusterType == ClusterType::METACLUSTER_MANAGEMENT); + return MetaclusterRegistrationEntry(metaclusterName, name, metaclusterId, id); + } + + Value encode() const { return ObjectWriter::toValue(*this, IncludeVersion()); } + static MetaclusterRegistrationEntry decode(ValueRef const& value) { + return ObjectReader::fromStringRef(value, IncludeVersion()); + } + static Optional decode(Optional value) { + return value.map( + [](ValueRef const& v) { return MetaclusterRegistrationEntry::decode(v); }); + } + + std::string toString() const { + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + return fmt::format( + "metacluster name: {}, metacluster id: 
{}", printable(metaclusterName), metaclusterId.shortString()); + } else { + return fmt::format("metacluster name: {}, metacluster id: {}, data cluster name: {}, data cluster id: {}", + printable(metaclusterName), + metaclusterId.shortString(), + printable(name), + id.shortString()); + } + } + + template + void serialize(Ar& ar) { + serializer(ar, clusterType, metaclusterName, name, metaclusterId, id); + } +}; + +template <> +struct Traceable : std::true_type { + static std::string toString(MetaclusterRegistrationEntry const& entry) { return entry.toString(); } +}; + +struct MetaclusterMetadata { + // Registration information for a metacluster, stored on both management and data clusters + static KeyBackedObjectProperty& metaclusterRegistration(); +}; + +#endif \ No newline at end of file diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h new file mode 100644 index 0000000000..e0b9c33629 --- /dev/null +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -0,0 +1,1926 @@ +/* + * MetaclusterManagement.actor.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include "fdbclient/FDBOptions.g.h" +#include "flow/IRandom.h" +#include "flow/ThreadHelper.actor.h" +#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_G_H) +#define FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_G_H +#include "fdbclient/MetaclusterManagement.actor.g.h" +#elif !defined(FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_H) +#define FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_H + +#include "fdbclient/FDBTypes.h" +#include "fdbclient/GenericTransactionHelper.h" +#include "fdbclient/GenericManagementAPI.actor.h" +#include "fdbclient/KeyBackedTypes.h" +#include "fdbclient/Metacluster.h" +#include "fdbclient/MultiVersionTransaction.h" +#include "fdbclient/SystemData.h" +#include "fdbclient/TenantManagement.actor.h" +#include "fdbclient/VersionedMap.h" +#include "flow/flat_buffers.h" +#include "flow/actorcompiler.h" // has to be last include + +// This file provides the interfaces to manage metacluster metadata. +// +// These transactions can operate on clusters at different versions, so care needs to be taken to update the metadata +// according to the cluster version. +// +// Support is maintained in this file for the current and the previous protocol versions. 
+ +struct DataClusterMetadata { + constexpr static FileIdentifier file_identifier = 5573993; + + DataClusterEntry entry; + ClusterConnectionString connectionString; + + DataClusterMetadata() = default; + DataClusterMetadata(DataClusterEntry const& entry, ClusterConnectionString const& connectionString) + : entry(entry), connectionString(connectionString) {} + + bool matchesConfiguration(DataClusterMetadata const& other) const { + return entry.matchesConfiguration(other.entry) && connectionString == other.connectionString; + } + + Value encode() const { return ObjectWriter::toValue(*this, IncludeVersion()); } + static DataClusterMetadata decode(ValueRef const& value) { + return ObjectReader::fromStringRef(value, IncludeVersion()); + } + + json_spirit::mValue toJson() const { + json_spirit::mObject obj = entry.toJson(); + obj["connection_string"] = connectionString.toString(); + return obj; + } + + template + void serialize(Ar& ar) { + serializer(ar, connectionString, entry); + } +}; + +FDB_DECLARE_BOOLEAN_PARAM(AddNewTenants); +FDB_DECLARE_BOOLEAN_PARAM(RemoveMissingTenants); + +namespace MetaclusterAPI { + +struct ManagementClusterMetadata { + struct ConnectionStringCodec { + static inline Standalone pack(ClusterConnectionString const& val) { + return StringRef(val.toString()); + } + static inline ClusterConnectionString unpack(Standalone const& val) { + return ClusterConnectionString(val.toString()); + } + }; + + static TenantMetadataSpecification& tenantMetadata(); + + // A map from cluster name to the metadata associated with a cluster + static KeyBackedObjectMap& dataClusters(); + + // A map from cluster name to the connection string for the cluster + static KeyBackedMap, ConnectionStringCodec> + dataClusterConnectionRecords; + + // A set of non-full clusters where the key is the tuple (num tenant groups allocated, cluster name). 
+ static KeyBackedSet clusterCapacityIndex; + + // A map from cluster name to a count of tenants + static KeyBackedMap, BinaryCodec> clusterTenantCount; + + // A set of (cluster name, tenant name, tenant ID) tuples ordered by cluster + static KeyBackedSet clusterTenantIndex; + + // A set of (cluster, tenant group name) tuples ordered by cluster + static KeyBackedSet clusterTenantGroupIndex; +}; + +ACTOR Future> openDatabase(ClusterConnectionString connectionString); + +ACTOR template +Future> tryGetClusterTransaction(Transaction tr, ClusterName name) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + + state Future metaclusterRegistrationCheck = + TenantAPI::checkTenantMode(tr, ClusterType::METACLUSTER_MANAGEMENT); + + state Future> clusterEntryFuture = + ManagementClusterMetadata::dataClusters().get(tr, name); + state Future> connectionRecordFuture = + ManagementClusterMetadata::dataClusterConnectionRecords.get(tr, name); + + wait(metaclusterRegistrationCheck); + + state Optional clusterEntry = wait(clusterEntryFuture); + Optional connectionString = wait(connectionRecordFuture); + + if (clusterEntry.present()) { + ASSERT(connectionString.present()); + return Optional(DataClusterMetadata(clusterEntry.get(), connectionString.get())); + } else { + return Optional(); + } +} + +ACTOR template +Future> tryGetCluster(Reference db, ClusterName name) { + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + Optional metadata = wait(tryGetClusterTransaction(tr, name)); + return metadata; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +ACTOR template +Future getClusterTransaction(Transaction tr, ClusterNameRef name) { + Optional metadata = wait(tryGetClusterTransaction(tr, name)); + if (!metadata.present()) { + throw cluster_not_found(); + } + + return metadata.get(); +} + +ACTOR template +Future getCluster(Reference db, ClusterName name) { + Optional metadata = 
wait(tryGetCluster(db, name)); + if (!metadata.present()) { + throw cluster_not_found(); + } + + return metadata.get(); +} + +ACTOR template +Future> getAndOpenDatabase(Transaction managementTr, ClusterName clusterName) { + DataClusterMetadata clusterMetadata = wait(getClusterTransaction(managementTr, clusterName)); + Reference db = wait(openDatabase(clusterMetadata.connectionString)); + return db; +} + +template +struct MetaclusterOperationContext { + Reference managementDb; + Reference dataClusterDb; + + Optional clusterName; + + Optional metaclusterRegistration; + Optional dataClusterMetadata; + + MetaclusterOperationContext(Reference managementDb, Optional clusterName = {}) + : managementDb(managementDb), clusterName(clusterName) {} + + // Run a transaction on the management cluster. This verifies that the cluster is a management cluster and matches + // the same metacluster that we've run any previous transactions on. If a clusterName is set, it also verifies that + // the specified cluster is present. Stores the metaclusterRegistration entry and, if a clusterName is set, the + // dataClusterMetadata and dataClusterDb in the context. + ACTOR template + static Future()(Reference()).getValue())> + runManagementTransaction(MetaclusterOperationContext* self, Function func) { + state Reference tr = self->managementDb->createTransaction(); + state bool clusterPresentAtStart = self->clusterName.present(); + loop { + try { + // If this transaction is retrying and didn't have the cluster name set at the beginning, clear it out + // to be set again in the next iteration. 
+ if (!clusterPresentAtStart) { + self->clearCluster(); + } + + // Get the data cluster metadata for the specified cluster, if present + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + state Future> dataClusterMetadataFuture; + if (self->clusterName.present()) { + dataClusterMetadataFuture = tryGetClusterTransaction(tr, self->clusterName.get()); + } + + // Get the metacluster registration information + state Optional currentMetaclusterRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + + state Optional currentDataClusterMetadata; + if (self->clusterName.present()) { + wait(store(currentDataClusterMetadata, dataClusterMetadataFuture)); + } + + // Check that this is a management cluster and is the same metacluster that any previous transactions + // have run on. + if (!currentMetaclusterRegistration.present() || + currentMetaclusterRegistration.get().clusterType != ClusterType::METACLUSTER_MANAGEMENT) { + throw invalid_metacluster_operation(); + } else if (self->metaclusterRegistration.present() && + !self->metaclusterRegistration.get().matches(currentMetaclusterRegistration.get())) { + throw invalid_metacluster_operation(); + } + + // If a cluster was specified, check that the cluster metadata is present. If so, load it and store it + // in the context. Additionally, store the data cluster details in the local metacluster registration + // entry. + if (self->clusterName.present()) { + if (!currentDataClusterMetadata.present()) { + throw cluster_not_found(); + } else { + currentMetaclusterRegistration = currentMetaclusterRegistration.get().toDataClusterRegistration( + self->clusterName.get(), currentDataClusterMetadata.get().entry.id); + } + } + + // Store the metacluster registration entry + if (!self->metaclusterRegistration.present()) { + self->metaclusterRegistration = currentMetaclusterRegistration; + } + + // Check that our data cluster has the same ID as previous transactions. 
If so, then store the updated + // cluster metadata in the context and open a connection to the data DB. + if (self->dataClusterMetadata.present() && + self->dataClusterMetadata.get().entry.id != currentDataClusterMetadata.get().entry.id) { + throw cluster_not_found(); + } else if (self->clusterName.present()) { + self->dataClusterMetadata = currentDataClusterMetadata; + if (!self->dataClusterDb) { + wait( + store(self->dataClusterDb, openDatabase(self->dataClusterMetadata.get().connectionString))); + } + } + + state decltype(std::declval()(Reference()).getValue()) result = + wait(func(tr)); + + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + return result; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + } + + template + Future()(Reference()).getValue())> + runManagementTransaction(Function func) { + return runManagementTransaction(this, func); + } + + // Runs a transaction on the data cluster. This requires that a cluster name be set and that a transaction has + // already been run on the management cluster to populate the needed metadata. This verifies that the data cluster + // has the expected ID and is part of the metacluster that previous transactions have run on. 
+ ACTOR template + static Future()(Reference()).getValue())> + runDataClusterTransaction(MetaclusterOperationContext* self, Function func) { + ASSERT(self->dataClusterDb); + ASSERT(self->dataClusterMetadata.present()); + ASSERT(self->metaclusterRegistration.present() && + self->metaclusterRegistration.get().clusterType == ClusterType::METACLUSTER_DATA); + + state Reference tr = self->dataClusterDb->createTransaction(); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + state Optional currentMetaclusterRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + + // Check that this is the expected data cluster and is part of the right metacluster + if (!currentMetaclusterRegistration.present() || + currentMetaclusterRegistration.get().clusterType != ClusterType::METACLUSTER_DATA) { + throw invalid_metacluster_operation(); + } else if (!self->metaclusterRegistration.get().matches(currentMetaclusterRegistration.get())) { + throw invalid_metacluster_operation(); + } + + state decltype(std::declval()(Reference()).getValue()) result = + wait(func(tr)); + + wait(safeThreadFutureToFuture(tr->commit())); + return result; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + } + + template + Future()(Reference()).getValue())> + runDataClusterTransaction(Function func) { + return runDataClusterTransaction(this, func); + } + + ACTOR static Future updateClusterName(MetaclusterOperationContext* self, + Reference tr) { + state DataClusterMetadata currentDataClusterMetadata = wait(getClusterTransaction(tr, self->clusterName.get())); + + self->metaclusterRegistration = self->metaclusterRegistration.get().toDataClusterRegistration( + self->clusterName.get(), currentDataClusterMetadata.entry.id); + + self->dataClusterMetadata = currentDataClusterMetadata; + if (!self->dataClusterDb) { + wait(store(self->dataClusterDb, openDatabase(self->dataClusterMetadata.get().connectionString))); + } + + return Void(); 
+ } + + // Sets the cluster used in this context. This must be called from a management cluster transaction, and it + // will load the cluster metadata and connect to the cluster. + Future setCluster(Reference tr, ClusterName clusterName) { + ASSERT(!this->clusterName.present()); + ASSERT(!dataClusterMetadata.present()); + ASSERT(metaclusterRegistration.get().clusterType == ClusterType::METACLUSTER_MANAGEMENT); + this->clusterName = clusterName; + return updateClusterName(this, tr); + } + + // Clears the chosen cluster for this context. This is useful if we are retrying a transaction that expects an + // uninitialized cluster. + void clearCluster() { + clusterName = {}; + dataClusterMetadata = {}; + dataClusterDb = {}; + if (metaclusterRegistration.present() && + metaclusterRegistration.get().clusterType == ClusterType::METACLUSTER_DATA) { + metaclusterRegistration = metaclusterRegistration.get().toManagementClusterRegistration(); + } + } +}; + +template +Future> tryGetTenantTransaction(Transaction tr, TenantName name) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + return ManagementClusterMetadata::tenantMetadata().tenantMap.get(tr, name); +} + +ACTOR template +Future> tryGetTenant(Reference db, TenantName name) { + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + Optional entry = wait(tryGetTenantTransaction(tr, name)); + return entry; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +ACTOR template +Future getTenantTransaction(Transaction tr, TenantName name) { + Optional entry = wait(tryGetTenantTransaction(tr, name)); + if (!entry.present()) { + throw tenant_not_found(); + } + + return entry.get(); +} + +ACTOR template +Future getTenant(Reference db, TenantName name) { + Optional entry = wait(tryGetTenant(db, name)); + if (!entry.present()) { + throw tenant_not_found(); + } + + return 
entry.get(); +} + +ACTOR template +Future managementClusterCheckEmpty(Transaction tr) { + state Future>> tenantsFuture = + TenantMetadata::tenantMap().getRange(tr, {}, {}, 1); + state typename transaction_future_type::type dbContentsFuture = + tr->getRange(normalKeys, 1); + + KeyBackedRangeResult> tenants = wait(tenantsFuture); + if (!tenants.results.empty()) { + throw cluster_not_empty(); + } + + RangeResult dbContents = wait(safeThreadFutureToFuture(dbContentsFuture)); + if (!dbContents.empty()) { + throw cluster_not_empty(); + } + + return Void(); +} + +ACTOR template +Future> createMetacluster(Reference db, ClusterName name) { + state Reference tr = db->createTransaction(); + state Optional metaclusterUid; + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + state Future> metaclusterRegistrationFuture = + MetaclusterMetadata::metaclusterRegistration().get(tr); + + wait(managementClusterCheckEmpty(tr)); + + Optional existingRegistration = wait(metaclusterRegistrationFuture); + if (existingRegistration.present()) { + if (metaclusterUid.present() && metaclusterUid.get() == existingRegistration.get().metaclusterId) { + return Optional(); + } else { + return format("cluster is already registered as a %s named `%s'", + existingRegistration.get().clusterType == ClusterType::METACLUSTER_DATA + ? 
"data cluster" + : "metacluster", + printable(existingRegistration.get().name).c_str()); + } + } + + if (!metaclusterUid.present()) { + metaclusterUid = deterministicRandom()->randomUniqueID(); + } + + MetaclusterMetadata::metaclusterRegistration().set( + tr, MetaclusterRegistrationEntry(name, metaclusterUid.get())); + + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + break; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + + return Optional(); +} + +ACTOR template +Future decommissionMetacluster(Reference db) { + state Reference tr = db->createTransaction(); + state bool firstTry = true; + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + ClusterType clusterType = wait(TenantAPI::getClusterType(tr)); + if (clusterType != ClusterType::METACLUSTER_MANAGEMENT) { + if (firstTry) { + throw invalid_metacluster_operation(); + } else { + return Void(); + } + } + + // Erase all metadata not associated with specific tenants prior to checking + // cluster emptiness + ManagementClusterMetadata::tenantMetadata().tenantCount.clear(tr); + ManagementClusterMetadata::tenantMetadata().lastTenantId.clear(tr); + ManagementClusterMetadata::tenantMetadata().tenantTombstones.clear(tr); + ManagementClusterMetadata::tenantMetadata().tombstoneCleanupData.clear(tr); + + wait(managementClusterCheckEmpty(tr)); + MetaclusterMetadata::metaclusterRegistration().clear(tr); + + firstTry = false; + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + break; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + + return Void(); +} + +template +void updateClusterCapacityIndex(Transaction tr, + ClusterName name, + DataClusterEntry const& previousEntry, + DataClusterEntry const& updatedEntry) { + // Entries are put in the cluster capacity index ordered by how many items are already allocated to them + if (previousEntry.hasCapacity()) { + ManagementClusterMetadata::clusterCapacityIndex.erase( + tr, 
Tuple::makeTuple(previousEntry.allocated.numTenantGroups, name)); + } + if (updatedEntry.hasCapacity()) { + ManagementClusterMetadata::clusterCapacityIndex.insert( + tr, Tuple::makeTuple(updatedEntry.allocated.numTenantGroups, name)); + } +} + +// This should only be called from a transaction that has already confirmed that the cluster entry +// is present. The updatedEntry should use the existing entry and modify only those fields that need +// to be changed. +template +void updateClusterMetadata(Transaction tr, + ClusterNameRef name, + DataClusterMetadata const& previousMetadata, + Optional const& updatedConnectionString, + Optional const& updatedEntry) { + + if (updatedEntry.present()) { + if (previousMetadata.entry.clusterState == DataClusterState::REMOVING) { + throw cluster_removed(); + } + ManagementClusterMetadata::dataClusters().set(tr, name, updatedEntry.get()); + updateClusterCapacityIndex(tr, name, previousMetadata.entry, updatedEntry.get()); + } + if (updatedConnectionString.present()) { + ManagementClusterMetadata::dataClusterConnectionRecords.set(tr, name, updatedConnectionString.get()); + } +} + +template +struct RegisterClusterImpl { + MetaclusterOperationContext ctx; + + // Initialization parameters + ClusterName clusterName; + ClusterConnectionString connectionString; + DataClusterEntry clusterEntry; + + RegisterClusterImpl(Reference managementDb, + ClusterName clusterName, + ClusterConnectionString connectionString, + DataClusterEntry clusterEntry) + : ctx(managementDb), clusterName(clusterName), connectionString(connectionString), clusterEntry(clusterEntry) {} + + // Check that cluster name is available + ACTOR static Future registrationPrecheck(RegisterClusterImpl* self, Reference tr) { + state Optional dataClusterMetadata = wait(tryGetClusterTransaction(tr, self->clusterName)); + if (dataClusterMetadata.present()) { + throw cluster_already_exists(); + } + + return Void(); + } + + ACTOR static Future configureDataCluster(RegisterClusterImpl* 
self) { + state Reference dataClusterDb = wait(openDatabase(self->connectionString)); + state Reference tr = dataClusterDb->createTransaction(); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + state Future>> existingTenantsFuture = + TenantAPI::listTenantsTransaction(tr, ""_sr, "\xff\xff"_sr, 1); + state ThreadFuture existingDataFuture = tr->getRange(normalKeys, 1); + + // Check whether this cluster has already been registered + state Optional existingRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + if (existingRegistration.present()) { + if (existingRegistration.get().clusterType != ClusterType::METACLUSTER_DATA || + existingRegistration.get().name != self->clusterName || + !existingRegistration.get().matches(self->ctx.metaclusterRegistration.get())) { + throw cluster_already_registered(); + } else { + // We already successfully registered the cluster with these details, so there's nothing to do + self->clusterEntry.id = existingRegistration.get().id; + return Void(); + } + } + + // Check for any existing data + std::vector> existingTenants = + wait(safeThreadFutureToFuture(existingTenantsFuture)); + if (!existingTenants.empty()) { + TraceEvent(SevWarn, "CannotRegisterClusterWithTenants").detail("ClusterName", self->clusterName); + throw cluster_not_empty(); + } + + RangeResult existingData = wait(safeThreadFutureToFuture(existingDataFuture)); + if (!existingData.empty()) { + TraceEvent(SevWarn, "CannotRegisterClusterWithData").detail("ClusterName", self->clusterName); + throw cluster_not_empty(); + } + + self->clusterEntry.id = deterministicRandom()->randomUniqueID(); + MetaclusterMetadata::metaclusterRegistration().set( + tr, + self->ctx.metaclusterRegistration.get().toDataClusterRegistration(self->clusterName, + self->clusterEntry.id)); + + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + + TraceEvent("ConfiguredDataCluster") + .detail("ClusterName", self->clusterName) + .detail("ClusterID", 
self->clusterEntry.id) + .detail("Capacity", self->clusterEntry.capacity) + .detail("Version", tr->getCommittedVersion()) + .detail("ConnectionString", self->connectionString.toString()); + + return Void(); + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + } + + // Store the cluster entry for the new cluster + ACTOR static Future registerInManagementCluster(RegisterClusterImpl* self, + Reference tr) { + state Optional dataClusterMetadata = wait(tryGetClusterTransaction(tr, self->clusterName)); + if (dataClusterMetadata.present() && !dataClusterMetadata.get().matchesConfiguration( + DataClusterMetadata(self->clusterEntry, self->connectionString))) { + throw cluster_already_exists(); + } else if (!dataClusterMetadata.present()) { + self->clusterEntry.allocated = ClusterUsage(); + + if (self->clusterEntry.hasCapacity()) { + ManagementClusterMetadata::clusterCapacityIndex.insert( + tr, Tuple::makeTuple(self->clusterEntry.allocated.numTenantGroups, self->clusterName)); + } + ManagementClusterMetadata::dataClusters().set(tr, self->clusterName, self->clusterEntry); + ManagementClusterMetadata::dataClusterConnectionRecords.set(tr, self->clusterName, self->connectionString); + } + + TraceEvent("RegisteredDataCluster") + .detail("ClusterName", self->clusterName) + .detail("ClusterID", self->clusterEntry.id) + .detail("Capacity", self->clusterEntry.capacity) + .detail("Version", tr->getCommittedVersion()) + .detail("ConnectionString", self->connectionString.toString()); + + return Void(); + } + + ACTOR static Future run(RegisterClusterImpl* self) { + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return registrationPrecheck(self, tr); })); + // Don't use ctx to run this transaction because we have not set up the data cluster metadata on it and we don't + // have a metacluster registration on the data cluster + wait(configureDataCluster(self)); + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { 
return registerInManagementCluster(self, tr); })); + return Void(); + } + Future run() { return run(this); } +}; + +ACTOR template +Future registerCluster(Reference db, + ClusterName name, + ClusterConnectionString connectionString, + DataClusterEntry entry) { + state RegisterClusterImpl impl(db, name, connectionString, entry); + wait(impl.run()); + return Void(); +} + +ACTOR template +Future restoreCluster(Reference db, + ClusterName name, + std::string connectionString, + DataClusterEntry entry, + AddNewTenants addNewTenants, + RemoveMissingTenants removeMissingTenants) { + // TODO: add implementation + wait(delay(0.0)); + return Void(); +} + +template +struct RemoveClusterImpl { + MetaclusterOperationContext ctx; + + // Initialization parameters + bool forceRemove; + + // Parameters set in markClusterRemoving + Optional lastTenantId; + + RemoveClusterImpl(Reference managementDb, ClusterName clusterName, bool forceRemove) + : ctx(managementDb, clusterName), forceRemove(forceRemove) {} + + // Returns false if the cluster is no longer present, or true if it is present and the removal should proceed. + ACTOR static Future markClusterRemoving(RemoveClusterImpl* self, Reference tr) { + if (!self->forceRemove && self->ctx.dataClusterMetadata.get().entry.allocated.numTenantGroups > 0) { + throw cluster_not_empty(); + } else if (self->ctx.dataClusterMetadata.get().entry.clusterState != DataClusterState::REMOVING) { + // Mark the cluster in a removing state while we finish the remaining removal steps. This prevents new + // tenants from being assigned to it. 
+ DataClusterEntry updatedEntry = self->ctx.dataClusterMetadata.get().entry; + updatedEntry.clusterState = DataClusterState::REMOVING; + updatedEntry.capacity.numTenantGroups = 0; + + updateClusterMetadata(tr, + self->ctx.clusterName.get(), + self->ctx.dataClusterMetadata.get(), + Optional(), + updatedEntry); + } + + ManagementClusterMetadata::clusterCapacityIndex.erase( + tr, + Tuple::makeTuple(self->ctx.dataClusterMetadata.get().entry.allocated.numTenantGroups, + self->ctx.clusterName.get())); + + // Get the last allocated tenant ID to be used on the detached data cluster + if (self->forceRemove) { + Optional lastId = wait(ManagementClusterMetadata::tenantMetadata().lastTenantId.get(tr)); + self->lastTenantId = lastId; + } + + TraceEvent("MarkedDataClusterRemoving") + .detail("Name", self->ctx.clusterName.get()) + .detail("Version", tr->getCommittedVersion()); + + return true; + } + + // Delete metacluster metadata from the data cluster + ACTOR static Future updateDataCluster(RemoveClusterImpl* self, Reference tr) { + // Delete metacluster related metadata + MetaclusterMetadata::metaclusterRegistration().clear(tr); + TenantMetadata::tenantTombstones().clear(tr); + TenantMetadata::tombstoneCleanupData().clear(tr); + + // If we are force removing a cluster, then it will potentially contain tenants that have IDs + // larger than the next tenant ID to be allocated on the cluster. To avoid collisions, we advance + // the ID so that it will be the larger of the current one on the data cluster and the management + // cluster. 
+ if (self->lastTenantId.present()) { + Optional lastId = wait(TenantMetadata::lastTenantId().get(tr)); + if (!lastId.present() || lastId.get() < self->lastTenantId.get()) { + TenantMetadata::lastTenantId().set(tr, self->lastTenantId.get()); + } + } + + TraceEvent("ReconfiguredDataCluster") + .detail("Name", self->ctx.clusterName.get()) + .detail("Version", tr->getCommittedVersion()); + + return Void(); + } + + // Returns true if all tenants have been purged + ACTOR static Future purgeTenants(RemoveClusterImpl* self, + Reference tr, + std::pair clusterTupleRange) { + ASSERT(self->ctx.dataClusterMetadata.get().entry.clusterState == DataClusterState::REMOVING); + + // Get the list of tenants + state Future> tenantEntriesFuture = + ManagementClusterMetadata::clusterTenantIndex.getRange( + tr, clusterTupleRange.first, clusterTupleRange.second, CLIENT_KNOBS->REMOVE_CLUSTER_TENANT_BATCH_SIZE); + + state KeyBackedRangeResult tenantEntries = wait(tenantEntriesFuture); + + // Erase each tenant from the tenant map on the management cluster + for (Tuple entry : tenantEntries.results) { + ASSERT(entry.getString(0) == self->ctx.clusterName.get()); + ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, entry.getString(1)); + ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, entry.getInt(2)); + } + + // Erase all of the tenants processed in this transaction from the cluster tenant index + if (!tenantEntries.results.empty()) { + ManagementClusterMetadata::clusterTenantIndex.erase( + tr, + clusterTupleRange.first, + Tuple::makeTuple(self->ctx.clusterName.get(), keyAfter(tenantEntries.results.rbegin()->getString(1)))); + } + + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp( + tr, -tenantEntries.results.size(), MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, self->ctx.clusterName.get(), -tenantEntries.results.size(), MutationRef::AddValue); + + return !tenantEntries.more; + } + + // Returns true if 
all tenant groups and the data cluster have been purged + ACTOR static Future purgeTenantGroupsAndDataCluster(RemoveClusterImpl* self, + Reference tr, + std::pair clusterTupleRange) { + ASSERT(self->ctx.dataClusterMetadata.get().entry.clusterState == DataClusterState::REMOVING); + + // Get the list of tenant groups + state Future> tenantGroupEntriesFuture = + ManagementClusterMetadata::clusterTenantGroupIndex.getRange( + tr, clusterTupleRange.first, clusterTupleRange.second, CLIENT_KNOBS->REMOVE_CLUSTER_TENANT_BATCH_SIZE); + + // Erase each tenant group from the tenant group map and the tenant group tenant index + state KeyBackedRangeResult tenantGroupEntries = wait(tenantGroupEntriesFuture); + for (Tuple entry : tenantGroupEntries.results) { + ASSERT(entry.getString(0) == self->ctx.clusterName.get()); + TenantGroupName tenantGroup = entry.getString(1); + ManagementClusterMetadata::tenantMetadata().tenantGroupTenantIndex.erase( + tr, Tuple::makeTuple(tenantGroup), Tuple::makeTuple(keyAfter(tenantGroup))); + ManagementClusterMetadata::tenantMetadata().tenantGroupMap.erase(tr, tenantGroup); + } + + if (!tenantGroupEntries.results.empty()) { + // Erase all of the tenant groups processed in this transaction from the cluster tenant group index + ManagementClusterMetadata::clusterTenantGroupIndex.erase( + tr, + clusterTupleRange.first, + Tuple::makeTuple(self->ctx.clusterName.get(), + keyAfter(tenantGroupEntries.results.rbegin()->getString(1)))); + } + + // Erase the data cluster record from the management cluster if processing our last batch + if (!tenantGroupEntries.more) { + ManagementClusterMetadata::dataClusters().erase(tr, self->ctx.clusterName.get()); + ManagementClusterMetadata::dataClusterConnectionRecords.erase(tr, self->ctx.clusterName.get()); + ManagementClusterMetadata::clusterTenantCount.erase(tr, self->ctx.clusterName.get()); + } + + return !tenantGroupEntries.more; + } + + // Remove all metadata associated with the data cluster from the management cluster 
+ ACTOR static Future managementClusterPurgeDataCluster(RemoveClusterImpl* self) { + state std::pair clusterTupleRange = std::make_pair( + Tuple::makeTuple(self->ctx.clusterName.get()), Tuple::makeTuple(keyAfter(self->ctx.clusterName.get()))); + + // First remove all tenants associated with the data cluster from the management cluster + loop { + bool clearedAll = wait(self->ctx.runManagementTransaction( + [self = self, clusterTupleRange = clusterTupleRange](Reference tr) { + return purgeTenants(self, tr, clusterTupleRange); + })); + + if (clearedAll) { + break; + } + } + + // Next remove all tenant groups associated with the data cluster from the management cluster + loop { + bool clearedAll = wait(self->ctx.runManagementTransaction( + [self = self, clusterTupleRange = clusterTupleRange](Reference tr) { + return purgeTenantGroupsAndDataCluster(self, tr, clusterTupleRange); + })); + if (clearedAll) { + break; + } + } + + TraceEvent("RemovedDataCluster").detail("Name", self->ctx.clusterName.get()); + return Void(); + } + + ACTOR static Future run(RemoveClusterImpl* self) { + state bool clusterIsPresent; + try { + wait(store(clusterIsPresent, + self->ctx.runManagementTransaction([self = self](Reference tr) { + return markClusterRemoving(self, tr); + }))); + } catch (Error& e) { + // If the transaction retries after success or if we are trying a second time to remove the cluster, it will + // throw an error indicating that the removal has already started + if (e.code() == error_code_cluster_removed) { + clusterIsPresent = true; + } else { + throw; + } + } + + if (clusterIsPresent) { + try { + wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return updateDataCluster(self, tr); })); + } catch (Error& e) { + // If this transaction gets retried, the metacluster information may have already been erased. 
+ if (e.code() != error_code_invalid_metacluster_operation) { + throw; + } + } + + // This runs multiple transactions, so the run transaction calls are inside the function + try { + wait(managementClusterPurgeDataCluster(self)); + } catch (Error& e) { + // If this transaction gets retried, the cluster may have already been deleted. + if (e.code() != error_code_cluster_not_found) { + throw; + } + } + } + + return Void(); + } + Future run() { return run(this); } +}; + +ACTOR template +Future removeCluster(Reference db, ClusterName name, bool forceRemove) { + state RemoveClusterImpl impl(db, name, forceRemove); + wait(impl.run()); + return Void(); +} + +ACTOR template +Future> listClustersTransaction(Transaction tr, + ClusterNameRef begin, + ClusterNameRef end, + int limit) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + + state Future tenantModeCheck = TenantAPI::checkTenantMode(tr, ClusterType::METACLUSTER_MANAGEMENT); + + state Future>> clusterEntriesFuture = + ManagementClusterMetadata::dataClusters().getRange(tr, begin, end, limit); + state Future>> connectionStringFuture = + ManagementClusterMetadata::dataClusterConnectionRecords.getRange(tr, begin, end, limit); + + wait(tenantModeCheck); + + state KeyBackedRangeResult> clusterEntries = + wait(safeThreadFutureToFuture(clusterEntriesFuture)); + KeyBackedRangeResult> connectionStrings = + wait(safeThreadFutureToFuture(connectionStringFuture)); + + ASSERT(clusterEntries.results.size() == connectionStrings.results.size()); + + std::map clusters; + for (int i = 0; i < clusterEntries.results.size(); ++i) { + ASSERT(clusterEntries.results[i].first == connectionStrings.results[i].first); + clusters[clusterEntries.results[i].first] = + DataClusterMetadata(clusterEntries.results[i].second, connectionStrings.results[i].second); + } + + return clusters; +} + +ACTOR template +Future> listClusters(Reference db, + ClusterName begin, + ClusterName end, + int limit) { + state Reference tr = db->createTransaction(); + + 
loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + std::map clusters = wait(listClustersTransaction(tr, begin, end, limit)); + + return clusters; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +template +void managementClusterAddTenantToGroup(Transaction tr, + TenantName tenantName, + TenantMapEntry tenantEntry, + DataClusterMetadata* clusterMetadata, + bool groupAlreadyExists) { + if (tenantEntry.tenantGroup.present()) { + if (tenantEntry.tenantGroup.get().startsWith("\xff"_sr)) { + throw invalid_tenant_group_name(); + } + + if (!groupAlreadyExists) { + ManagementClusterMetadata::tenantMetadata().tenantGroupMap.set( + tr, tenantEntry.tenantGroup.get(), TenantGroupEntry(tenantEntry.assignedCluster)); + ManagementClusterMetadata::clusterTenantGroupIndex.insert( + tr, Tuple::makeTuple(tenantEntry.assignedCluster.get(), tenantEntry.tenantGroup.get())); + } + ManagementClusterMetadata::tenantMetadata().tenantGroupTenantIndex.insert( + tr, Tuple::makeTuple(tenantEntry.tenantGroup.get(), tenantName)); + } + + if (!groupAlreadyExists) { + ASSERT(clusterMetadata->entry.hasCapacity()); + + DataClusterEntry updatedEntry = clusterMetadata->entry; + ++updatedEntry.allocated.numTenantGroups; + + updateClusterMetadata( + tr, tenantEntry.assignedCluster.get(), *clusterMetadata, Optional(), updatedEntry); + + clusterMetadata->entry = updatedEntry; + } +} + +ACTOR template +Future managementClusterRemoveTenantFromGroup(Transaction tr, + TenantName tenantName, + TenantMapEntry tenantEntry, + DataClusterMetadata* clusterMetadata, + bool isRenamePair = false) { + state bool updateClusterCapacity = !tenantEntry.tenantGroup.present() && !isRenamePair; + if (tenantEntry.tenantGroup.present()) { + ManagementClusterMetadata::tenantMetadata().tenantGroupTenantIndex.erase( + tr, Tuple::makeTuple(tenantEntry.tenantGroup.get(), tenantName)); + + state KeyBackedSet::RangeResultType result = + 
wait(ManagementClusterMetadata::tenantMetadata().tenantGroupTenantIndex.getRange( + tr, + Tuple::makeTuple(tenantEntry.tenantGroup.get()), + Tuple::makeTuple(keyAfter(tenantEntry.tenantGroup.get())), + 1)); + + if (result.results.size() == 0) { + ManagementClusterMetadata::clusterTenantGroupIndex.erase( + tr, Tuple::makeTuple(tenantEntry.assignedCluster.get(), tenantEntry.tenantGroup.get())); + + ManagementClusterMetadata::tenantMetadata().tenantGroupMap.erase(tr, tenantEntry.tenantGroup.get()); + updateClusterCapacity = true; + } + } + + // Update the tenant group count information for the assigned cluster if this tenant group was erased so we + // can use the freed capacity. + if (updateClusterCapacity) { + DataClusterEntry updatedEntry = clusterMetadata->entry; + --updatedEntry.allocated.numTenantGroups; + updateClusterMetadata( + tr, tenantEntry.assignedCluster.get(), *clusterMetadata, Optional(), updatedEntry); + + clusterMetadata->entry = updatedEntry; + } + + return Void(); +} + +template +struct CreateTenantImpl { + MetaclusterOperationContext ctx; + + // Initialization parameters + TenantName tenantName; + TenantMapEntry tenantEntry; + + // Parameter set if tenant creation permanently fails on the data cluster + Optional replaceExistingTenantId; + + CreateTenantImpl(Reference managementDb, TenantName tenantName, TenantMapEntry tenantEntry) + : ctx(managementDb), tenantName(tenantName), tenantEntry(tenantEntry) {} + + ACTOR static Future checkClusterAvailability(Reference dataClusterDb, + ClusterName clusterName) { + state Reference tr = dataClusterDb->createTransaction(); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->addWriteConflictRange(KeyRangeRef("\xff/metacluster/availability_check"_sr, + "\xff/metacluster/availability_check\x00"_sr)); + wait(safeThreadFutureToFuture(tr->commit())); + return clusterName; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + } + + // Returns true if the 
tenant is already assigned and can proceed to the next step and false if it needs + // to be created. Throws an error if the tenant already exists and cannot be created. + ACTOR static Future checkForExistingTenant(CreateTenantImpl* self, Reference tr) { + // Check if the tenant already exists. If it's partially created and matches the parameters we + // specified, continue creating it. Otherwise, fail with an error. + state Optional existingEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + if (existingEntry.present()) { + if (!existingEntry.get().matchesConfiguration(self->tenantEntry) || + existingEntry.get().tenantState != TenantState::REGISTERING) { + // The tenant already exists and is either completely created or has a different + // configuration + throw tenant_already_exists(); + } else if (!self->replaceExistingTenantId.present() || + self->replaceExistingTenantId.get() != existingEntry.get().id) { + // The tenant creation has already started, so resume where we left off + self->tenantEntry = existingEntry.get(); + ASSERT(existingEntry.get().assignedCluster.present()); + + wait(self->ctx.setCluster(tr, existingEntry.get().assignedCluster.get())); + return true; + } else { + // The previous creation is permanently failed, so cleanup the tenant and create it again from scratch + // We don't need to remove it from the tenant map because we will overwrite the existing entry later in + // this transaction. 
+ ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, existingEntry.get().id); + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, existingEntry.get().assignedCluster.get(), -1, MutationRef::AddValue); + + ManagementClusterMetadata::clusterTenantIndex.erase( + tr, + Tuple::makeTuple( + existingEntry.get().assignedCluster.get(), self->tenantName, existingEntry.get().id)); + + state DataClusterMetadata previousAssignedClusterMetadata = + wait(getClusterTransaction(tr, existingEntry.get().assignedCluster.get())); + + wait(managementClusterRemoveTenantFromGroup( + tr, self->tenantName, existingEntry.get(), &previousAssignedClusterMetadata)); + } + } else if (self->replaceExistingTenantId.present()) { + throw tenant_removed(); + } + + return false; + } + + // Returns a pair with the name of the assigned cluster and whether the group was already assigned + ACTOR static Future> assignTenant(CreateTenantImpl* self, + Reference tr) { + // If our tenant group is already assigned, then we just use that assignment + state Optional groupEntry; + if (self->tenantEntry.tenantGroup.present()) { + Optional _groupEntry = + wait(ManagementClusterMetadata::tenantMetadata().tenantGroupMap.get( + tr, self->tenantEntry.tenantGroup.get())); + groupEntry = _groupEntry; + + if (groupEntry.present()) { + ASSERT(groupEntry.get().assignedCluster.present()); + return std::make_pair(groupEntry.get().assignedCluster.get(), true); + } + } + + // Get a set of the most full clusters that still have capacity + state KeyBackedSet::RangeResultType availableClusters = + wait(ManagementClusterMetadata::clusterCapacityIndex.getRange( + tr, {}, {}, CLIENT_KNOBS->METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK, Snapshot::False, Reverse::True)); + + if (availableClusters.results.empty()) { + throw metacluster_no_capacity(); + } + + state std::vector>> dataClusterDbs; + for (auto 
clusterTuple : availableClusters.results) { + dataClusterDbs.push_back(getAndOpenDatabase(tr, clusterTuple.getString(1))); + } + + wait(waitForAll(dataClusterDbs)); + + // Check the availability of our set of clusters + state std::vector> clusterAvailabilityChecks; + for (int i = 0; i < availableClusters.results.size(); ++i) { + clusterAvailabilityChecks.push_back( + checkClusterAvailability(dataClusterDbs[i].get(), availableClusters.results[i].getString(1))); + } + + // Wait for a successful availability check from some cluster. We prefer the most full cluster, but if it + // doesn't return quickly we may choose another. + Optional clusterAvailabilityCheck = wait(timeout( + success(clusterAvailabilityChecks[0]) || (delay(CLIENT_KNOBS->METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY) && + waitForAny(clusterAvailabilityChecks)), + CLIENT_KNOBS->METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT)); + + if (!clusterAvailabilityCheck.present()) { + // If no clusters were available for long enough, then we throw an error and try again + throw transaction_too_old(); + } + + // Get the first cluster that was available + state Optional chosenCluster; + for (auto f : clusterAvailabilityChecks) { + if (f.isReady()) { + chosenCluster = f.get(); + break; + } + } + + ASSERT(chosenCluster.present()); + return std::make_pair(chosenCluster.get(), false); + } + + ACTOR static Future assignTenantAndStoreInManagementCluster(CreateTenantImpl* self, + Reference tr) { + // If the tenant already exists, we either throw an error from this function or move on to the next phase + bool tenantExists = wait(checkForExistingTenant(self, tr)); + if (tenantExists) { + return Void(); + } + + // Choose a cluster for the tenant + state std::pair assignment = wait(assignTenant(self, tr)); + self->tenantEntry.assignedCluster = assignment.first; + + // Update the context with the chosen cluster + state Future setClusterFuture = self->ctx.setCluster(tr, assignment.first); + + // Create a tenant entry in the 
management cluster + Optional lastId = wait(ManagementClusterMetadata::tenantMetadata().lastTenantId.get(tr)); + self->tenantEntry.setId(lastId.orDefault(-1) + 1); + ManagementClusterMetadata::tenantMetadata().lastTenantId.set(tr, self->tenantEntry.id); + + self->tenantEntry.tenantState = TenantState::REGISTERING; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, self->tenantEntry); + ManagementClusterMetadata::tenantMetadata().tenantIdIndex.set(tr, self->tenantEntry.id, self->tenantName); + + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, 1, MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, self->tenantEntry.assignedCluster.get(), 1, MutationRef::AddValue); + + int64_t clusterTenantCount = wait(ManagementClusterMetadata::clusterTenantCount.getD( + tr, self->tenantEntry.assignedCluster.get(), Snapshot::False, 0)); + + if (clusterTenantCount > CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER) { + throw cluster_no_capacity(); + } + + // Updated indexes to include the new tenant + ManagementClusterMetadata::clusterTenantIndex.insert( + tr, Tuple::makeTuple(self->tenantEntry.assignedCluster.get(), self->tenantName, self->tenantEntry.id)); + + wait(setClusterFuture); + + // If we are part of a tenant group that is assigned to a cluster being removed from the metacluster, + // then we fail with an error. 
+ if (self->ctx.dataClusterMetadata.get().entry.clusterState == DataClusterState::REMOVING) { + throw cluster_removed(); + } + + managementClusterAddTenantToGroup( + tr, self->tenantName, self->tenantEntry, &self->ctx.dataClusterMetadata.get(), assignment.second); + + return Void(); + } + + ACTOR static Future storeTenantInDataCluster(CreateTenantImpl* self, Reference tr) { + std::pair, bool> dataClusterTenant = wait( + TenantAPI::createTenantTransaction(tr, self->tenantName, self->tenantEntry, ClusterType::METACLUSTER_DATA)); + + // If the tenant map entry is empty, then we encountered a tombstone indicating that the tenant was + // simultaneously removed. + if (!dataClusterTenant.first.present()) { + throw tenant_removed(); + } + + return Void(); + } + + ACTOR static Future markTenantReady(CreateTenantImpl* self, Reference tr) { + state Optional managementEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + if (!managementEntry.present()) { + throw tenant_removed(); + } else if (managementEntry.get().id != self->tenantEntry.id) { + throw tenant_already_exists(); + } + + if (managementEntry.get().tenantState == TenantState::REGISTERING) { + TenantMapEntry updatedEntry = managementEntry.get(); + updatedEntry.tenantState = TenantState::READY; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, updatedEntry); + } + + return Void(); + } + + ACTOR static Future run(CreateTenantImpl* self) { + if (self->tenantName.startsWith("\xff"_sr)) { + throw invalid_tenant_name(); + } + + loop { + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return assignTenantAndStoreInManagementCluster(self, tr); + })); + + self->replaceExistingTenantId = {}; + try { + wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return storeTenantInDataCluster(self, tr); })); + + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return markTenantReady(self, tr); })); + + return Void(); + } catch 
(Error& e) { + if (e.code() == error_code_tenant_creation_permanently_failed) { + // If the data cluster has permanently failed to create the tenant, then we can reassign it in + // the management cluster and start over + self->replaceExistingTenantId = self->tenantEntry.id; + self->ctx.clearCluster(); + } else { + throw; + } + } + } + } + Future run() { return run(this); } +}; + +ACTOR template +Future createTenant(Reference db, TenantName name, TenantMapEntry tenantEntry) { + state CreateTenantImpl impl(db, name, tenantEntry); + wait(impl.run()); + return Void(); +} + +template +struct DeleteTenantImpl { + MetaclusterOperationContext ctx; + + // Initialization parameters + TenantName tenantName; + + // Parameters set in getAssignedLocation + int64_t tenantId; + + // Parameters set in markTenantInRemovingState + Optional pairName; + + DeleteTenantImpl(Reference managementDb, TenantName tenantName) : ctx(managementDb), tenantName(tenantName) {} + + // Loads the cluster details for the cluster where the tenant is assigned. + // Returns true if the deletion is already in progress + ACTOR static Future getAssignedLocation(DeleteTenantImpl* self, Reference tr) { + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + + if (!tenantEntry.present()) { + throw tenant_not_found(); + } + + // Disallow removing the "new" name of a renamed tenant before it completes + if (tenantEntry.get().tenantState == TenantState::RENAMING_TO) { + throw tenant_not_found(); + } + + if (tenantEntry.get().tenantState == TenantState::REMOVING) { + if (tenantEntry.get().renamePair.present()) { + self->pairName = tenantEntry.get().renamePair.get(); + } + } + + self->tenantId = tenantEntry.get().id; + wait(self->ctx.setCluster(tr, tenantEntry.get().assignedCluster.get())); + return tenantEntry.get().tenantState == TenantState::REMOVING; + } + + // Does an initial check if the tenant is empty. 
This is an optimization to prevent us marking a tenant + // in the deleted state while it has data, but it is still possible that data gets added to it after this + // point. + // + // SOMEDAY: should this also lock the tenant when locking is supported? + ACTOR static Future checkTenantEmpty(DeleteTenantImpl* self, Reference tr) { + state Optional tenantEntry = wait(TenantAPI::tryGetTenantTransaction(tr, self->tenantName)); + if (!tenantEntry.present() || tenantEntry.get().id != self->tenantId) { + // The tenant must have been removed simultaneously + return Void(); + } + + ThreadFuture rangeFuture = tr->getRange(prefixRange(tenantEntry.get().prefix), 1); + RangeResult result = wait(safeThreadFutureToFuture(rangeFuture)); + if (!result.empty()) { + throw tenant_not_empty(); + } + + return Void(); + } + + // Mark the tenant as being in a removing state on the management cluster + ACTOR static Future markTenantInRemovingState(DeleteTenantImpl* self, + Reference tr) { + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + + if (!tenantEntry.present() || tenantEntry.get().id != self->tenantId) { + throw tenant_not_found(); + } + + if (tenantEntry.get().tenantState != TenantState::REMOVING) { + // Disallow removing the "new" name of a renamed tenant before it completes + if (tenantEntry.get().tenantState == TenantState::RENAMING_TO) { + throw tenant_not_found(); + } + state TenantMapEntry updatedEntry = tenantEntry.get(); + // Check if we are deleting a tenant in the middle of a rename + if (updatedEntry.renamePair.present()) { + ASSERT(updatedEntry.tenantState == TenantState::RENAMING_FROM); + self->pairName = updatedEntry.renamePair.get(); + } + updatedEntry.tenantState = TenantState::REMOVING; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, updatedEntry); + // If this has a rename pair, also mark the other entry for deletion + if (self->pairName.present()) { + state Optional pairEntry = 
wait(tryGetTenantTransaction(tr, self->pairName.get())); + TenantMapEntry updatedPairEntry = pairEntry.get(); + // Sanity check that our pair has us named as their partner + ASSERT(updatedPairEntry.renamePair.present()); + ASSERT(updatedPairEntry.renamePair.get() == self->tenantName); + ASSERT(updatedPairEntry.id == self->tenantId); + CODE_PROBE(true, "marking pair tenant in removing state"); + updatedPairEntry.tenantState = TenantState::REMOVING; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->pairName.get(), updatedPairEntry); + } + } + + return Void(); + } + + // Delete the tenant and related metadata on the management cluster + ACTOR static Future deleteTenantFromManagementCluster(DeleteTenantImpl* self, + Reference tr, + bool pairDelete = false) { + // If pair is present, and this is not already a pair delete, call this function recursively + state Future pairFuture = Void(); + if (!pairDelete && self->pairName.present()) { + CODE_PROBE(true, "deleting pair tenant from management cluster"); + pairFuture = deleteTenantFromManagementCluster(self, tr, true); + } + state TenantName tenantName = pairDelete ? 
self->pairName.get() : self->tenantName; + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, tenantName)); + + if (!tenantEntry.present() || tenantEntry.get().id != self->tenantId) { + return Void(); + } + + ASSERT(tenantEntry.get().tenantState == TenantState::REMOVING); + + // Erase the tenant entry itself + ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, tenantName); + ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, tenantEntry.get().id); + + // This is idempotent because this function is only called if the tenant is in the map + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, tenantEntry.get().assignedCluster.get(), -1, MutationRef::AddValue); + + // Remove the tenant from the cluster -> tenant index + ManagementClusterMetadata::clusterTenantIndex.erase( + tr, Tuple::makeTuple(tenantEntry.get().assignedCluster.get(), tenantName, self->tenantId)); + + // Remove the tenant from its tenant group + wait(managementClusterRemoveTenantFromGroup( + tr, tenantName, tenantEntry.get(), &self->ctx.dataClusterMetadata.get(), pairDelete)); + + wait(pairFuture); + return Void(); + } + + ACTOR static Future run(DeleteTenantImpl* self) { + // Get information about the tenant and where it is assigned + bool deletionInProgress = wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return getAssignedLocation(self, tr); })); + + if (!deletionInProgress) { + wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return checkTenantEmpty(self, tr); })); + + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return markTenantInRemovingState(self, tr); + })); + } + + // Delete tenant on the data cluster + wait(self->ctx.runDataClusterTransaction([self = self](Reference tr) { + // If the removed tenant is being renamed, attempt to delete both the old and new names. 
+ // At most one should be present with the given ID, and the other will be a no-op. + Future pairDelete = Void(); + if (self->pairName.present()) { + CODE_PROBE(true, "deleting pair tenant from data cluster"); + pairDelete = TenantAPI::deleteTenantTransaction( + tr, self->pairName.get(), self->tenantId, ClusterType::METACLUSTER_DATA); + } + return pairDelete && TenantAPI::deleteTenantTransaction( + tr, self->tenantName, self->tenantId, ClusterType::METACLUSTER_DATA); + })); + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return deleteTenantFromManagementCluster(self, tr); + })); + + return Void(); + } + Future run() { return run(this); } +}; + +ACTOR template +Future deleteTenant(Reference db, TenantName name) { + state DeleteTenantImpl impl(db, name); + wait(impl.run()); + return Void(); +} + +ACTOR template +Future>> listTenantsTransaction(Transaction tr, + TenantNameRef begin, + TenantNameRef end, + int limit) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + + KeyBackedRangeResult> results = + wait(ManagementClusterMetadata::tenantMetadata().tenantMap.getRange(tr, begin, end, limit)); + + return results.results; +} + +ACTOR template +Future>> listTenants(Reference db, + TenantName begin, + TenantName end, + int limit) { + state Reference tr = db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + std::vector> tenants = + wait(listTenantsTransaction(tr, begin, end, limit)); + return tenants; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +template +struct ConfigureTenantImpl { + MetaclusterOperationContext ctx; + + // Initialization parameters + TenantName tenantName; + std::map, Optional> configurationParameters; + + // Parameters set in updateManagementCluster + TenantMapEntry updatedEntry; + + ConfigureTenantImpl(Reference managementDb, + TenantName tenantName, + std::map, Optional> 
configurationParameters) + : ctx(managementDb), tenantName(tenantName), configurationParameters(configurationParameters) {} + + // This verifies that the tenant group can be changed, and if so it updates all of the tenant group data + // structures. It does not update the TenantMapEntry stored in the tenant map. + ACTOR static Future updateTenantGroup(ConfigureTenantImpl* self, + Reference tr, + TenantMapEntry tenantEntry, + Optional desiredGroup) { + + state TenantMapEntry entryWithUpdatedGroup = tenantEntry; + entryWithUpdatedGroup.tenantGroup = desiredGroup; + + if (tenantEntry.tenantGroup == desiredGroup) { + return Void(); + } + + // Removing a tenant group is only possible if we have capacity for more groups on the current cluster + else if (!desiredGroup.present()) { + if (!self->ctx.dataClusterMetadata.get().entry.hasCapacity()) { + throw metacluster_no_capacity(); + } + + wait(managementClusterRemoveTenantFromGroup( + tr, self->tenantName, tenantEntry, &self->ctx.dataClusterMetadata.get())); + managementClusterAddTenantToGroup( + tr, self->tenantName, entryWithUpdatedGroup, &self->ctx.dataClusterMetadata.get(), false); + return Void(); + } + + state Optional tenantGroupEntry = + wait(ManagementClusterMetadata::tenantMetadata().tenantGroupMap.get(tr, desiredGroup.get())); + + // If we are creating a new tenant group, we need to have capacity on the current cluster + if (!tenantGroupEntry.present()) { + if (!self->ctx.dataClusterMetadata.get().entry.hasCapacity()) { + throw metacluster_no_capacity(); + } + wait(managementClusterRemoveTenantFromGroup( + tr, self->tenantName, tenantEntry, &self->ctx.dataClusterMetadata.get())); + managementClusterAddTenantToGroup( + tr, self->tenantName, entryWithUpdatedGroup, &self->ctx.dataClusterMetadata.get(), false); + return Void(); + } + + // Moves between groups in the same cluster are freely allowed + else if (tenantGroupEntry.get().assignedCluster == tenantEntry.assignedCluster) { + 
wait(managementClusterRemoveTenantFromGroup( + tr, self->tenantName, tenantEntry, &self->ctx.dataClusterMetadata.get())); + managementClusterAddTenantToGroup( + tr, self->tenantName, entryWithUpdatedGroup, &self->ctx.dataClusterMetadata.get(), true); + return Void(); + } + + // We don't currently support movement between groups on different clusters + else { + throw cluster_no_capacity(); + } + } + + // Updates the configuration in the management cluster and marks it as being in the UPDATING_CONFIGURATION state + ACTOR static Future updateManagementCluster(ConfigureTenantImpl* self, + Reference tr) { + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + + if (!tenantEntry.present()) { + throw tenant_not_found(); + } + + if (tenantEntry.get().tenantState != TenantState::READY && + tenantEntry.get().tenantState != TenantState::UPDATING_CONFIGURATION) { + throw invalid_tenant_state(); + } + + wait(self->ctx.setCluster(tr, tenantEntry.get().assignedCluster.get())); + + self->updatedEntry = tenantEntry.get(); + self->updatedEntry.tenantState = TenantState::UPDATING_CONFIGURATION; + + state std::map, Optional>::iterator configItr; + for (configItr = self->configurationParameters.begin(); configItr != self->configurationParameters.end(); + ++configItr) { + if (configItr->first == "tenant_group"_sr) { + wait(updateTenantGroup(self, tr, self->updatedEntry, configItr->second)); + } + self->updatedEntry.configure(configItr->first, configItr->second); + } + + ++self->updatedEntry.configurationSequenceNum; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, self->updatedEntry); + + return Void(); + } + + // Updates the configuration in the data cluster + ACTOR static Future updateDataCluster(ConfigureTenantImpl* self, Reference tr) { + state Optional tenantEntry = wait(TenantAPI::tryGetTenantTransaction(tr, self->tenantName)); + + if (!tenantEntry.present() || tenantEntry.get().id != self->updatedEntry.id || + 
tenantEntry.get().configurationSequenceNum >= self->updatedEntry.configurationSequenceNum) { + // If the tenant isn't in the metacluster, it must have been concurrently removed + return Void(); + } + + TenantMapEntry dataClusterEntry = self->updatedEntry; + dataClusterEntry.tenantState = TenantState::READY; + dataClusterEntry.assignedCluster = {}; + + wait(TenantAPI::configureTenantTransaction(tr, self->tenantName, tenantEntry.get(), dataClusterEntry)); + return Void(); + } + + // Updates the tenant state in the management cluster to READY + ACTOR static Future markManagementTenantAsReady(ConfigureTenantImpl* self, + Reference tr) { + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + + if (!tenantEntry.present() || tenantEntry.get().id != self->updatedEntry.id || + tenantEntry.get().tenantState != TenantState::UPDATING_CONFIGURATION || + tenantEntry.get().configurationSequenceNum > self->updatedEntry.configurationSequenceNum) { + return Void(); + } + + tenantEntry.get().tenantState = TenantState::READY; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, tenantEntry.get()); + return Void(); + } + + ACTOR static Future run(ConfigureTenantImpl* self) { + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return updateManagementCluster(self, tr); })); + wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return updateDataCluster(self, tr); })); + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return markManagementTenantAsReady(self, tr); })); + + return Void(); + } + Future run() { return run(this); } +}; + +ACTOR template +Future configureTenant(Reference db, + TenantName name, + std::map, Optional> configurationParameters) { + state ConfigureTenantImpl impl(db, name, configurationParameters); + wait(impl.run()); + return Void(); +} + +template +struct RenameTenantImpl { + MetaclusterOperationContext ctx; + + // Initialization 
parameters + TenantName oldName; + TenantName newName; + + // Parameters set in markTenantsInRenamingState + int64_t tenantId = -1; + int64_t configurationSequenceNum = -1; + + RenameTenantImpl(Reference managementDb, TenantName oldName, TenantName newName) + : ctx(managementDb), oldName(oldName), newName(newName) {} + + // Delete the tenant and related metadata on the management cluster + ACTOR static Future deleteTenantFromManagementCluster(RenameTenantImpl* self, + Reference tr, + TenantMapEntry tenantEntry) { + // Erase the tenant entry itself + ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, self->oldName); + + // Remove old tenant from tenant count + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, tenantEntry.assignedCluster.get(), -1, MutationRef::AddValue); + + // Clean up cluster based tenant indices and remove the old entry from its tenant group + // Remove the tenant from the cluster -> tenant index + ManagementClusterMetadata::clusterTenantIndex.erase( + tr, Tuple::makeTuple(tenantEntry.assignedCluster.get(), self->oldName, self->tenantId)); + + // Remove the tenant from its tenant group + wait(managementClusterRemoveTenantFromGroup( + tr, self->oldName, tenantEntry, &self->ctx.dataClusterMetadata.get(), true)); + + return Void(); + } + + ACTOR static Future markTenantsInRenamingState(RenameTenantImpl* self, + Reference tr) { + state TenantMapEntry oldTenantEntry; + state Optional newTenantEntry; + wait(store(oldTenantEntry, getTenantTransaction(tr, self->oldName)) && + store(newTenantEntry, tryGetTenantTransaction(tr, self->newName))); + + if (self->tenantId != -1 && oldTenantEntry.id != self->tenantId) { + // The tenant must have been removed simultaneously + CODE_PROBE(true, "Metacluster rename old tenant ID mismatch"); + throw tenant_removed(); + } + + // If marked for deletion, abort the rename + if 
(oldTenantEntry.tenantState == TenantState::REMOVING) { + CODE_PROBE(true, "Metacluster rename candidates marked for deletion"); + throw tenant_removed(); + } + + // If the new entry is present, we can only continue if this is a retry of the same rename + // To check this, verify both entries are in the correct state + // and have each other as pairs + if (newTenantEntry.present()) { + if (newTenantEntry.get().tenantState == TenantState::RENAMING_TO && + oldTenantEntry.tenantState == TenantState::RENAMING_FROM && newTenantEntry.get().renamePair.present() && + newTenantEntry.get().renamePair.get() == self->oldName && oldTenantEntry.renamePair.present() && + oldTenantEntry.renamePair.get() == self->newName) { + wait(self->ctx.setCluster(tr, oldTenantEntry.assignedCluster.get())); + self->tenantId = newTenantEntry.get().id; + self->configurationSequenceNum = newTenantEntry.get().configurationSequenceNum; + CODE_PROBE(true, "Metacluster rename retry in progress"); + return Void(); + } else { + CODE_PROBE(true, "Metacluster rename new name already exists"); + throw tenant_already_exists(); + }; + } else { + if (self->tenantId == -1) { + self->tenantId = oldTenantEntry.id; + } + ++oldTenantEntry.configurationSequenceNum; + self->configurationSequenceNum = oldTenantEntry.configurationSequenceNum; + wait(self->ctx.setCluster(tr, oldTenantEntry.assignedCluster.get())); + if (oldTenantEntry.tenantState != TenantState::READY) { + CODE_PROBE(true, "Metacluster unable to proceed with rename operation"); + throw invalid_tenant_state(); + } + } + + // Check cluster capacity. If we would exceed the amount due to temporary extra tenants + // then we deny the rename request altogether. 
+ int64_t clusterTenantCount = wait(ManagementClusterMetadata::clusterTenantCount.getD( + tr, oldTenantEntry.assignedCluster.get(), Snapshot::False, 0)); + + if (clusterTenantCount + 1 > CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER) { + throw cluster_no_capacity(); + } + + TenantMapEntry updatedOldEntry = oldTenantEntry; + TenantMapEntry updatedNewEntry(updatedOldEntry); + ASSERT(updatedOldEntry.configurationSequenceNum == self->configurationSequenceNum); + ASSERT(updatedNewEntry.configurationSequenceNum == self->configurationSequenceNum); + updatedOldEntry.tenantState = TenantState::RENAMING_FROM; + updatedNewEntry.tenantState = TenantState::RENAMING_TO; + updatedOldEntry.renamePair = self->newName; + updatedNewEntry.renamePair = self->oldName; + + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->oldName, updatedOldEntry); + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->newName, updatedNewEntry); + + // Add temporary tenant to tenantCount to prevent exceeding capacity during a rename + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, 1, MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, updatedNewEntry.assignedCluster.get(), 1, MutationRef::AddValue); + + // Updated indexes to include the new tenant + ManagementClusterMetadata::clusterTenantIndex.insert( + tr, Tuple::makeTuple(updatedNewEntry.assignedCluster.get(), self->newName, self->tenantId)); + + // Add new name to tenant group. It should already exist since the old name was part of it. 
+ managementClusterAddTenantToGroup( + tr, self->newName, updatedNewEntry, &self->ctx.dataClusterMetadata.get(), true); + return Void(); + } + + ACTOR static Future updateDataCluster(RenameTenantImpl* self, Reference tr) { + ASSERT(self->tenantId != -1); + ASSERT(self->configurationSequenceNum != -1); + wait(TenantAPI::renameTenantTransaction(tr, + self->oldName, + self->newName, + self->tenantId, + ClusterType::METACLUSTER_DATA, + self->configurationSequenceNum)); + return Void(); + } + + ACTOR static Future finishRenameFromManagementCluster(RenameTenantImpl* self, + Reference tr) { + state Optional oldTenantEntry; + state Optional newTenantEntry; + wait(store(oldTenantEntry, tryGetTenantTransaction(tr, self->oldName)) && + store(newTenantEntry, tryGetTenantTransaction(tr, self->newName))); + + // Another (or several other) operations have already removed/changed the old entry + // Possible for the new entry to also have been tampered with, + // so it may or may not be present with or without the same id, which are all + // legal states. 
Assume the rename completed properly in this case + if (!oldTenantEntry.present() || oldTenantEntry.get().id != self->tenantId || + oldTenantEntry.get().configurationSequenceNum > self->configurationSequenceNum) { + CODE_PROBE(true, + "Metacluster finished rename with missing entries, mismatched id, and/or mismatched " + "configuration sequence."); + return Void(); + } + if (oldTenantEntry.get().tenantState == TenantState::REMOVING) { + ASSERT(newTenantEntry.get().tenantState == TenantState::REMOVING); + throw tenant_removed(); + } + ASSERT(newTenantEntry.present()); + ASSERT(newTenantEntry.get().id == self->tenantId); + + TenantMapEntry updatedOldEntry = oldTenantEntry.get(); + TenantMapEntry updatedNewEntry = newTenantEntry.get(); + + // Only update if in the expected state + if (updatedNewEntry.tenantState == TenantState::RENAMING_TO) { + updatedNewEntry.tenantState = TenantState::READY; + updatedNewEntry.renamePair.reset(); + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->newName, updatedNewEntry); + ManagementClusterMetadata::tenantMetadata().tenantIdIndex.set(tr, self->tenantId, self->newName); + } + + // We will remove the old entry from the management cluster + // This should still be the same old entry since the tenantId matches from the check above. + wait(deleteTenantFromManagementCluster(self, tr, updatedOldEntry)); + return Void(); + } + + ACTOR static Future run(RenameTenantImpl* self) { + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return markTenantsInRenamingState(self, tr); })); + + // Rename tenant on the data cluster + try { + wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return updateDataCluster(self, tr); })); + } catch (Error& e) { + // Since we track the tenant entries on the management cluster, these error codes should only appear + // on a retry of the transaction, typically caused by commit_unknown_result. 
+ // Operating on the assumption that the first transaction completed successfully, we keep going + // so we can finish the rename on the management cluster. + if (e.code() == error_code_tenant_not_found || e.code() == error_code_tenant_already_exists) { + CODE_PROBE(true, "Metacluster rename ran into commit_unknown_result"); + } else { + throw e; + } + } + + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return finishRenameFromManagementCluster(self, tr); + })); + return Void(); + } + Future run() { return run(this); } +}; + +ACTOR template +Future renameTenant(Reference db, TenantName oldName, TenantName newName) { + state RenameTenantImpl impl(db, oldName, newName); + wait(impl.run()); + return Void(); +} + +} // namespace MetaclusterAPI + +#include "flow/unactorcompiler.h" +#endif \ No newline at end of file diff --git a/fdbclient/include/fdbclient/MultiVersionTransaction.h b/fdbclient/include/fdbclient/MultiVersionTransaction.h index 9593a3bc67..b17601cb19 100644 --- a/fdbclient/include/fdbclient/MultiVersionTransaction.h +++ b/fdbclient/include/fdbclient/MultiVersionTransaction.h @@ -122,6 +122,8 @@ struct FdbCApi : public ThreadSafeReferenceCounted { // Network fdb_error_t (*selectApiVersion)(int runtimeVersion, int headerVersion); const char* (*getClientVersion)(); + void (*useFutureProtocolVersion)(); + fdb_error_t (*setNetworkOption)(FDBNetworkOption option, uint8_t const* value, int valueLength); fdb_error_t (*setupNetwork)(); fdb_error_t (*runNetwork)(); @@ -169,6 +171,32 @@ struct FdbCApi : public ThreadSafeReferenceCounted { uint8_t const* purge_key_name, int purge_key_name_length); + FDBFuture* (*databaseBlobbifyRange)(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length); + + FDBFuture* (*databaseUnblobbifyRange)(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int 
end_key_name_length); + + FDBFuture* (*databaseListBlobbifiedRanges)(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int rangeLimit); + + FDBFuture* (*databaseVerifyBlobRange)(FDBDatabase* db, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + Optional version); + // Tenant fdb_error_t (*tenantCreateTransaction)(FDBTenant* tenant, FDBTransaction** outTransaction); @@ -270,20 +298,39 @@ struct FdbCApi : public ThreadSafeReferenceCounted { int end_key_name_length, int64_t chunkSize); - FDBFuture* (*transactionGetBlobGranuleRanges)(FDBTransaction* db, + FDBFuture* (*transactionGetBlobGranuleRanges)(FDBTransaction* tr, uint8_t const* begin_key_name, int begin_key_name_length, uint8_t const* end_key_name, - int end_key_name_length); + int end_key_name_length, + int rangeLimit); - FDBResult* (*transactionReadBlobGranules)(FDBTransaction* db, + FDBResult* (*transactionReadBlobGranules)(FDBTransaction* tr, uint8_t const* begin_key_name, int begin_key_name_length, uint8_t const* end_key_name, int end_key_name_length, int64_t beginVersion, - int64_t readVersion, - FDBReadBlobGranuleContext granule_context); + int64_t readVersion); + + FDBFuture* (*transactionReadBlobGranulesStart)(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + int64_t* readVersionOut); + + FDBResult* (*transactionReadBlobGranulesFinish)(FDBTransaction* tr, + FDBFuture* startFuture, + uint8_t const* begin_key_name, + int begin_key_name_length, + uint8_t const* end_key_name, + int end_key_name_length, + int64_t beginVersion, + int64_t readVersion, + FDBReadBlobGranuleContext* granule_context); FDBFuture* (*transactionCommit)(FDBTransaction* tr); fdb_error_t (*transactionGetCommittedVersion)(FDBTransaction* tr, 
int64_t* outVersion); @@ -374,13 +421,26 @@ public: ThreadFuture getEstimatedRangeSizeBytes(const KeyRangeRef& keys) override; ThreadFuture>> getRangeSplitPoints(const KeyRangeRef& range, int64_t chunkSize) override; - ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange) override; + ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange, + int rangeLimit) override; ThreadResult readBlobGranules(const KeyRangeRef& keyRange, Version beginVersion, Optional readVersion, ReadBlobGranuleContext granule_context) override; + ThreadFuture>> readBlobGranulesStart(const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) override; + + ThreadResult readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) override; + void addReadConflictRange(const KeyRangeRef& keys) override; void atomicOp(const KeyRef& key, const ValueRef& value, uint32_t operationType) override; @@ -474,6 +534,12 @@ public: ThreadFuture purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override; ThreadFuture waitPurgeGranulesComplete(const KeyRef& purgeKey) override; + ThreadFuture blobbifyRange(const KeyRangeRef& keyRange) override; + ThreadFuture unblobbifyRange(const KeyRangeRef& keyRange) override; + ThreadFuture>> listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) override; + ThreadFuture verifyBlobRange(const KeyRangeRef& keyRange, Optional version) override; + ThreadFuture createSharedState() override; void setSharedState(DatabaseSharedState* p) override; @@ -492,6 +558,7 @@ public: void selectApiVersion(int apiVersion) override; const char* getClientVersion() override; + void useFutureProtocolVersion() override; void setNetworkOption(FDBNetworkOptions::Option option, Optional value = Optional()) override; void setupNetwork() override; @@ -571,13 +638,26 @@ public: 
ThreadFuture>> getRangeSplitPoints(const KeyRangeRef& range, int64_t chunkSize) override; - ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange) override; + ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange, + int rangeLimit) override; ThreadResult readBlobGranules(const KeyRangeRef& keyRange, Version beginVersion, Optional readVersion, ReadBlobGranuleContext granule_context) override; + ThreadFuture>> readBlobGranulesStart(const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) override; + + ThreadResult readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) override; + void atomicOp(const KeyRef& key, const ValueRef& value, uint32_t operationType) override; void set(const KeyRef& key, const ValueRef& value) override; void clear(const KeyRef& begin, const KeyRef& end) override; @@ -643,6 +723,9 @@ private: template ThreadResult abortableTimeoutResult(ThreadFuture abortSignal); + template + ThreadResult abortableResult(ThreadResult result, ThreadFuture abortSignal); + TransactionInfo transaction; TransactionInfo getTransaction(); @@ -655,8 +738,10 @@ private: struct ClientDesc { std::string const libPath; bool const external; + bool const useFutureVersion; - ClientDesc(std::string libPath, bool external) : libPath(libPath), external(external) {} + ClientDesc(std::string libPath, bool external, bool useFutureVersion) + : libPath(libPath), external(external), useFutureVersion(useFutureVersion) {} }; struct ClientInfo : ClientDesc, ThreadSafeReferenceCounted { @@ -668,11 +753,11 @@ struct ClientInfo : ClientDesc, ThreadSafeReferenceCounted { std::vector> threadCompletionHooks; ClientInfo() - : ClientDesc(std::string(), false), protocolVersion(0), api(nullptr), failed(true), initialized(false) {} + : ClientDesc(std::string(), false, false), protocolVersion(0), api(nullptr), 
failed(true), initialized(false) {} ClientInfo(IClientApi* api) - : ClientDesc("internal", false), protocolVersion(0), api(api), failed(false), initialized(false) {} - ClientInfo(IClientApi* api, std::string libPath) - : ClientDesc(libPath, true), protocolVersion(0), api(api), failed(false), initialized(false) {} + : ClientDesc("internal", false, false), protocolVersion(0), api(api), failed(false), initialized(false) {} + ClientInfo(IClientApi* api, std::string libPath, bool useFutureVersion) + : ClientDesc(libPath, true, useFutureVersion), protocolVersion(0), api(api), failed(false), initialized(false) {} void loadVersion(); bool canReplace(Reference other) const; @@ -812,6 +897,12 @@ public: ThreadFuture purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override; ThreadFuture waitPurgeGranulesComplete(const KeyRef& purgeKey) override; + ThreadFuture blobbifyRange(const KeyRangeRef& keyRange) override; + ThreadFuture unblobbifyRange(const KeyRangeRef& keyRange) override; + ThreadFuture>> listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) override; + ThreadFuture verifyBlobRange(const KeyRangeRef& keyRange, Optional version) override; + ThreadFuture createSharedState() override; void setSharedState(DatabaseSharedState* p) override; @@ -919,6 +1010,7 @@ class MultiVersionApi : public IClientApi { public: void selectApiVersion(int apiVersion) override; const char* getClientVersion() override; + void useFutureProtocolVersion() override; void setNetworkOption(FDBNetworkOptions::Option option, Optional value = Optional()) override; void setupNetwork() override; @@ -965,7 +1057,7 @@ private: void disableMultiVersionClientApi(); void setCallbacksOnExternalThreads(); - void addExternalLibrary(std::string path); + void addExternalLibrary(std::string path, bool useFutureVersion); void addExternalLibraryDirectory(std::string path); // Return a vector of (pathname, unlink_on_close) pairs. 
Makes threadCount - 1 copies of the library stored in // path, and returns a vector of length threadCount. diff --git a/fdbclient/include/fdbclient/NativeAPI.actor.h b/fdbclient/include/fdbclient/NativeAPI.actor.h index 9411db3eee..02ccf6d500 100644 --- a/fdbclient/include/fdbclient/NativeAPI.actor.h +++ b/fdbclient/include/fdbclient/NativeAPI.actor.h @@ -239,14 +239,12 @@ FDB_DECLARE_BOOLEAN_PARAM(AllowInvalidTenantID); struct TransactionState : ReferenceCounted { Database cx; - int64_t tenantId = TenantInfo::INVALID_TENANT; Optional> authToken; Reference trLogInfo; TransactionOptions options; + Optional readOptions; - Optional debugID; TaskPriority taskID; - ReadType readType = ReadType::NORMAL; SpanContext spanContext; UseProvisionalProxies useProvisionalProxies = UseProvisionalProxies::False; bool readVersionObtainedFromGrvProxy; @@ -286,8 +284,18 @@ struct TransactionState : ReferenceCounted { Optional const& tenant(); bool hasTenant() const; + int64_t tenantId() const { return tenantId_; } + void trySetTenantId(int64_t tenantId) { + if (tenantId_ == TenantInfo::INVALID_TENANT) { + tenantId_ = tenantId; + } + } + + Future handleUnknownTenant(); + private: Optional tenant_; + int64_t tenantId_ = TenantInfo::INVALID_TENANT; bool tenantSet; }; @@ -407,12 +415,16 @@ public: // The returned list would still be in form of [keys.begin, splitPoint1, splitPoint2, ... 
, keys.end] Future>> getRangeSplitPoints(KeyRange const& keys, int64_t chunkSize); - Future>> getBlobGranuleRanges(const KeyRange& range); + Future>> getBlobGranuleRanges(const KeyRange& range, int rangeLimit); Future>> readBlobGranules(const KeyRange& range, Version begin, Optional readVersion, Version* readVersionOut = nullptr); + Future>> summarizeBlobGranules(const KeyRange& range, + Version summaryVersion, + int rangeLimit); + // If checkWriteConflictRanges is true, existing write conflict ranges will be searched for this key void set(const KeyRef& key, const ValueRef& value, AddConflictRange = AddConflictRange::True); void atomicOp(const KeyRef& key, @@ -447,7 +459,13 @@ public: void fullReset(); double getBackoff(int errCode); - void debugTransaction(UID dID) { trState->debugID = dID; } + void debugTransaction(UID dID) { + if (trState->readOptions.present()) { + trState->readOptions.get().debugID = dID; + } else { + trState->readOptions = ReadOptions(dID); + } + } VersionVector getVersionVector() const; SpanContext getSpanContext() const { return trState->spanContext; } diff --git a/fdbclient/include/fdbclient/ReadYourWrites.h b/fdbclient/include/fdbclient/ReadYourWrites.h index 89de979bc1..46650be3d3 100644 --- a/fdbclient/include/fdbclient/ReadYourWrites.h +++ b/fdbclient/include/fdbclient/ReadYourWrites.h @@ -20,6 +20,7 @@ #ifndef FDBCLIENT_READYOURWRITES_H #define FDBCLIENT_READYOURWRITES_H +#include "Status.h" #pragma once #include "fdbclient/NativeAPI.actor.h" @@ -120,7 +121,7 @@ public: Future>> getRangeSplitPoints(const KeyRange& range, int64_t chunkSize) override; Future getEstimatedRangeSizeBytes(const KeyRange& keys) override; - Future>> getBlobGranuleRanges(const KeyRange& range) override; + Future>> getBlobGranuleRanges(const KeyRange& range, int rangeLimit) override; Future>> readBlobGranules(const KeyRange& range, Version begin, Optional readVersion, @@ -192,7 +193,17 @@ public: KeyRangeMap>>& getSpecialKeySpaceWriteMap() { return 
specialKeySpaceWriteMap; } bool readYourWritesDisabled() const { return options.readYourWritesDisabled; } const Optional& getSpecialKeySpaceErrorMsg() { return specialKeySpaceErrorMsg; } - void setSpecialKeySpaceErrorMsg(const std::string& msg) { specialKeySpaceErrorMsg = msg; } + void setSpecialKeySpaceErrorMsg(const std::string& msg) { + if (g_network && g_network->isSimulated()) { + try { + readJSONStrictly(msg); + } catch (Error& e) { + TraceEvent(SevError, "InvalidSpecialKeySpaceErrorMessage").error(e).detail("Message", msg); + ASSERT(false); + } + } + specialKeySpaceErrorMsg = msg; + } Transaction& getTransaction() { return tr; } Optional getTenant() { return tr.getTenant(); } diff --git a/fdbclient/include/fdbclient/ServerKnobs.h b/fdbclient/include/fdbclient/ServerKnobs.h index 48702be13c..ab55f3dc43 100644 --- a/fdbclient/include/fdbclient/ServerKnobs.h +++ b/fdbclient/include/fdbclient/ServerKnobs.h @@ -50,7 +50,6 @@ public: bool PEEK_USING_STREAMING; double TLOG_TIMEOUT; // tlog OR commit proxy failure - master's reaction time double TLOG_SLOW_REJOIN_WARN_TIMEOUT_SECS; // Warns if a tlog takes too long to rejoin - double RECOVERY_TLOG_SMART_QUORUM_DELAY; // smaller might be better for bug amplification double TLOG_STORAGE_MIN_UPDATE_INTERVAL; double BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL; int DESIRED_TOTAL_BYTES; @@ -58,10 +57,6 @@ public: double UPDATE_DELAY; int MAXIMUM_PEEK_BYTES; int APPLY_MUTATION_BYTES; - int RECOVERY_DATA_BYTE_LIMIT; - int BUGGIFY_RECOVERY_DATA_LIMIT; - double LONG_TLOG_COMMIT_TIME; - int64_t LARGE_TLOG_COMMIT_BYTES; double BUGGIFY_RECOVER_MEMORY_LIMIT; double BUGGIFY_WORKER_REMOVED_MAX_LAG; int64_t UPDATE_STORAGE_BYTE_LIMIT; @@ -123,16 +118,16 @@ public: double BG_REBALANCE_POLLING_INTERVAL; double BG_REBALANCE_SWITCH_CHECK_INTERVAL; double DD_QUEUE_LOGGING_INTERVAL; + double DD_QUEUE_COUNTER_REFRESH_INTERVAL; + double DD_QUEUE_COUNTER_MAX_LOG; // max number of servers for which trace events will be generated in each round of + 
// DD_QUEUE_COUNTER_REFRESH_INTERVAL duration + bool DD_QUEUE_COUNTER_SUMMARIZE; // Enable summary of remaining servers when the number of servers with ongoing + // relocations in the last minute exceeds DD_QUEUE_COUNTER_MAX_LOG double RELOCATION_PARALLELISM_PER_SOURCE_SERVER; double RELOCATION_PARALLELISM_PER_DEST_SERVER; int DD_QUEUE_MAX_KEY_SERVERS; int DD_REBALANCE_PARALLELISM; int DD_REBALANCE_RESET_AMOUNT; - double BG_DD_MAX_WAIT; - double BG_DD_MIN_WAIT; - double BG_DD_INCREASE_RATE; - double BG_DD_DECREASE_RATE; - double BG_DD_SATURATION_DELAY; double INFLIGHT_PENALTY_HEALTHY; double INFLIGHT_PENALTY_REDUNDANT; double INFLIGHT_PENALTY_UNHEALTHY; @@ -161,9 +156,14 @@ public: int PRIORITY_TEAM_FAILED; // Priority when a server in the team is excluded as failed int PRIORITY_TEAM_0_LEFT; int PRIORITY_SPLIT_SHARD; + int PRIORITY_ENFORCE_MOVE_OUT_OF_PHYSICAL_SHARD; // Priority when a physical shard is oversize or anonymous // Data distribution bool SHARD_ENCODE_LOCATION_METADATA; // If true, location metadata will contain shard ID. + bool ENABLE_DD_PHYSICAL_SHARD; // EXPERIMENTAL; If true, SHARD_ENCODE_LOCATION_METADATA must be true. 
+ int64_t MAX_PHYSICAL_SHARD_BYTES; + double PHYSICAL_SHARD_METRICS_DELAY; + double ANONYMOUS_PHYSICAL_SHARD_TRANSITION_TIME; double READ_REBALANCE_CPU_THRESHOLD; // read rebalance only happens if the source servers' CPU > threshold int READ_REBALANCE_SRC_PARALLELISM; // the max count a server become a source server within a certain interval @@ -195,7 +195,6 @@ public: double SERVER_LIST_DELAY; double RECRUITMENT_IDLE_DELAY; double STORAGE_RECRUITMENT_DELAY; - double BLOB_WORKER_RECRUITMENT_DELAY; bool TSS_HACK_IDENTITY_MAPPING; double TSS_RECRUITMENT_TIMEOUT; double TSS_DD_CHECK_INTERVAL; @@ -234,6 +233,8 @@ public: int DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY; int DD_STORAGE_WIGGLE_PAUSE_THRESHOLD; // How many unhealthy relocations are ongoing will pause storage wiggle int DD_STORAGE_WIGGLE_STUCK_THRESHOLD; // How many times bestTeamStuck accumulate will pause storage wiggle + int64_t + DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC; // Minimal age of a correct-configured server before it's chosen to be wiggled bool DD_TENANT_AWARENESS_ENABLED; int TENANT_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantCache is refreshed @@ -255,9 +256,8 @@ public: // Run storage enginee on a child process on the same machine with storage process bool REMOTE_KV_STORE; - // A delay to avoid race on file resources if the new kv store process started immediately after the previous kv - // store process died - double REMOTE_KV_STORE_INIT_DELAY; + // A delay to avoid race on file resources after seeing lock_file_failure + double REBOOT_KV_STORE_DELAY; // max waiting time for the remote kv store to initialize double REMOTE_KV_STORE_MAX_INIT_DURATION; @@ -302,6 +302,7 @@ public: int64_t REPLACE_CONTENTS_BYTES; // KeyValueStoreRocksDB + int ROCKSDB_READ_RANGE_ROW_LIMIT; int ROCKSDB_BACKGROUND_PARALLELISM; int ROCKSDB_READ_PARALLELISM; int64_t ROCKSDB_MEMTABLE_BYTES; @@ -328,6 +329,7 @@ public: std::string DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY; bool ROCKSDB_PERFCONTEXT_ENABLE; // Enable rocks perf context 
metrics. May cause performance overhead double ROCKSDB_PERFCONTEXT_SAMPLE_RATE; + double ROCKSDB_METRICS_SAMPLE_INTERVAL; int ROCKSDB_MAX_SUBCOMPACTIONS; int64_t ROCKSDB_SOFT_PENDING_COMPACT_BYTES_LIMIT; int64_t ROCKSDB_HARD_PENDING_COMPACT_BYTES_LIMIT; @@ -337,6 +339,12 @@ public: int64_t ROCKSDB_COMPACTION_READAHEAD_SIZE; int64_t ROCKSDB_BLOCK_SIZE; bool ENABLE_SHARDED_ROCKSDB; + int64_t ROCKSDB_WRITE_BUFFER_SIZE; + int64_t ROCKSDB_CF_WRITE_BUFFER_SIZE; + int64_t ROCKSDB_MAX_TOTAL_WAL_SIZE; + int64_t ROCKSDB_MAX_BACKGROUND_JOBS; + int64_t ROCKSDB_DELETE_OBSOLETE_FILE_PERIOD; + double ROCKSDB_PHYSICAL_SHARD_CLEAN_UP_DELAY; // Leader election int MAX_NOTIFICATIONS; @@ -548,6 +556,10 @@ public: double RATEKEEPER_DEFAULT_LIMIT; double RATEKEEPER_LIMIT_REASON_SAMPLE_RATE; bool RATEKEEPER_PRINT_LIMIT_REASON; + double RATEKEEPER_MIN_RATE; + double RATEKEEPER_MAX_RATE; + double RATEKEEPER_BATCH_MIN_RATE; + double RATEKEEPER_BATCH_MAX_RATE; int64_t TARGET_BYTES_PER_STORAGE_SERVER; int64_t SPRING_BYTES_STORAGE_SERVER; @@ -591,6 +603,8 @@ public: // Use global tag throttling strategy. i.e. throttle based on the cluster-wide // throughput for tags and their associated quotas. 
bool GLOBAL_TAG_THROTTLING; + // Enforce tag throttling on proxies rather than on clients + bool ENFORCE_TAG_THROTTLING_ON_PROXIES; // Minimum number of transactions per second that the global tag throttler must allow for each tag double GLOBAL_TAG_THROTTLING_MIN_RATE; // Used by global tag throttling counters @@ -618,8 +632,18 @@ public: double INITIAL_DURABILITY_LAG_MULTIPLIER; double DURABILITY_LAG_REDUCTION_RATE; double DURABILITY_LAG_INCREASE_RATE; - double STORAGE_SERVER_LIST_FETCH_TIMEOUT; + bool BW_THROTTLING_ENABLED; + double TARGET_BW_LAG; + double TARGET_BW_LAG_BATCH; + double TARGET_BW_LAG_UPDATE; + int MIN_BW_HISTORY; + double BW_ESTIMATION_INTERVAL; + double BW_LAG_INCREASE_AMOUNT; + double BW_LAG_DECREASE_AMOUNT; + double BW_FETCH_WORKERS_INTERVAL; + double BW_RW_LOGGING_INTERVAL; + double BW_MAX_BLOCKED_INTERVAL; // disk snapshot int64_t MAX_FORKED_PROCESS_OUTPUT; @@ -658,12 +682,12 @@ public: int STORAGE_LIMIT_BYTES; int BUGGIFY_LIMIT_BYTES; bool FETCH_USING_STREAMING; + bool FETCH_USING_BLOB; int FETCH_BLOCK_BYTES; int FETCH_KEYS_PARALLELISM_BYTES; int FETCH_KEYS_PARALLELISM; int FETCH_KEYS_PARALLELISM_FULL; int FETCH_KEYS_LOWER_PRIORITY; - int FETCH_CHANGEFEED_PARALLELISM; int SERVE_FETCH_CHECKPOINT_PARALLELISM; int BUGGIFY_BLOCK_BYTES; int64_t STORAGE_RECOVERY_VERSION_LAG_LIMIT; @@ -672,7 +696,6 @@ public: int STORAGE_COMMIT_BYTES; int STORAGE_FETCH_BYTES; double STORAGE_COMMIT_INTERVAL; - double UPDATE_SHARD_VERSION_INTERVAL; int BYTE_SAMPLING_FACTOR; int BYTE_SAMPLING_OVERHEAD; int MAX_STORAGE_SERVER_WATCH_BYTES; @@ -681,7 +704,6 @@ public: int BYTE_SAMPLE_LOAD_PARALLELISM; double BYTE_SAMPLE_LOAD_DELAY; double BYTE_SAMPLE_START_DELAY; - double UPDATE_STORAGE_PROCESS_STATS_INTERVAL; double BEHIND_CHECK_DELAY; int BEHIND_CHECK_COUNT; int64_t BEHIND_CHECK_VERSIONS; @@ -755,7 +777,6 @@ public: // Dynamic Knobs (implementation) double COMPACTION_INTERVAL; - double UPDATE_NODE_TIMEOUT; double GET_COMMITTED_VERSION_TIMEOUT; double 
GET_SNAPSHOT_AND_CHANGES_TIMEOUT; double FETCH_CHANGES_TIMEOUT; @@ -771,14 +792,6 @@ public: bool DISABLE_DUPLICATE_LOG_WARNING; double HISTOGRAM_REPORT_INTERVAL; - // IPager - int PAGER_RESERVED_PAGES; - - // IndirectShadowPager - int FREE_PAGE_VACUUM_THRESHOLD; - int VACUUM_QUEUE_SIZE; - int VACUUM_BYTES_PER_SECOND; - // Timekeeper int64_t TIME_KEEPER_DELAY; int64_t TIME_KEEPER_MAX_ENTRIES; @@ -801,11 +814,9 @@ public: int64_t FASTRESTORE_ROLE_LOGGING_DELAY; int64_t FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL; // How quickly to update process metrics for restore int64_t FASTRESTORE_ATOMICOP_WEIGHT; // workload amplication factor for atomic op - int64_t FASTRESTORE_APPLYING_PARALLELISM; // number of outstanding txns writing to dest. DB int64_t FASTRESTORE_MONITOR_LEADER_DELAY; int64_t FASTRESTORE_STRAGGLER_THRESHOLD_SECONDS; bool FASTRESTORE_TRACK_REQUEST_LATENCY; // true to track reply latency of each request in a request batch - bool FASTRESTORE_TRACK_LOADER_SEND_REQUESTS; // track requests of load send mutations to appliers? 
int64_t FASTRESTORE_MEMORY_THRESHOLD_MB_SOFT; // threshold when pipelined actors should be delayed int64_t FASTRESTORE_WAIT_FOR_MEMORY_LATENCY; int64_t FASTRESTORE_HEARTBEAT_DELAY; // interval for master to ping loaders and appliers @@ -877,6 +888,7 @@ public: int SIM_KMS_MAX_KEYS; int ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH; bool ENABLE_TLOG_ENCRYPTION; + bool ENABLE_STORAGE_SERVER_ENCRYPTION; // Currently only Redwood engine supports encryption bool ENABLE_BLOB_GRANULE_ENCRYPTION; // Compression @@ -890,8 +902,6 @@ public: // FIXME: configure url with database configuration instead of knob eventually std::string BG_URL; - // whether to use blobRangeKeys or tenants for blob granule range sources - std::string BG_RANGE_SOURCE; // Whether to use knobs or EKP for blob metadata and credentials std::string BG_METADATA_SOURCE; @@ -911,10 +921,15 @@ public: int BG_KEY_TUPLE_TRUNCATE_OFFSET; int BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM; + int BLOB_WORKER_RESNAPSHOT_PARALLELISM; + int BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM; + double BLOB_WORKER_TIMEOUT; // Blob Manager's reaction time to a blob worker failure double BLOB_WORKER_REQUEST_TIMEOUT; // Blob Worker's server-side request timeout double BLOB_WORKERLIST_FETCH_INTERVAL; double BLOB_WORKER_BATCH_GRV_INTERVAL; + bool BLOB_WORKER_DO_REJECT_WHEN_FULL; + double BLOB_WORKER_REJECT_WHEN_FULL_THRESHOLD; double BLOB_MANAGER_STATUS_EXP_BACKOFF_MIN; double BLOB_MANAGER_STATUS_EXP_BACKOFF_MAX; diff --git a/fdbclient/include/fdbclient/SpecialKeySpace.actor.h b/fdbclient/include/fdbclient/SpecialKeySpace.actor.h index e665b83124..75cae1fc47 100644 --- a/fdbclient/include/fdbclient/SpecialKeySpace.actor.h +++ b/fdbclient/include/fdbclient/SpecialKeySpace.actor.h @@ -548,6 +548,15 @@ public: Future> commit(ReadYourWritesTransaction* ryw) override; }; +class WorkerInterfacesSpecialKeyImpl : public SpecialKeyRangeReadImpl { +public: + explicit WorkerInterfacesSpecialKeyImpl(KeyRangeRef kr); + + Future getRange(ReadYourWritesTransaction* 
ryw, + KeyRangeRef kr, + GetRangeLimits limitsHint) const override; +}; + // If the underlying set of key-value pairs of a key space is not changing, then we expect repeating a read to give the // same result. Additionally, we can generate the expected result of any read if that read is reading a subrange. This // actor performs a read of an arbitrary subrange of [begin, end) and validates the results. diff --git a/fdbclient/include/fdbclient/StorageServerInterface.h b/fdbclient/include/fdbclient/StorageServerInterface.h index 76c51aaea1..00e9ff2aef 100644 --- a/fdbclient/include/fdbclient/StorageServerInterface.h +++ b/fdbclient/include/fdbclient/StorageServerInterface.h @@ -294,15 +294,13 @@ struct GetValueRequest : TimedRequest { TenantInfo tenantInfo; Key key; Version version; - ReadType readType; Optional tags; - Optional debugID; ReplyPromise reply; + Optional options; VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known // to this client, of all storage replicas that // serve the given key - - GetValueRequest() : readType(ReadType::NORMAL) {} + GetValueRequest() {} bool verify() const { return tenantInfo.isAuthorized(); } @@ -310,16 +308,15 @@ struct GetValueRequest : TimedRequest { const TenantInfo& tenantInfo, const Key& key, Version ver, - ReadType type, Optional tags, - Optional debugID, + Optional options, VersionVector latestCommitVersions) - : spanContext(spanContext), tenantInfo(tenantInfo), key(key), version(ver), readType(type), tags(tags), - debugID(debugID), ssLatestCommitVersions(latestCommitVersions) {} + : spanContext(spanContext), tenantInfo(tenantInfo), key(key), version(ver), tags(tags), options(options), + ssLatestCommitVersions(latestCommitVersions) {} template void serialize(Ar& ar) { - serializer(ar, key, version, readType, tags, debugID, reply, spanContext, tenantInfo, ssLatestCommitVersions); + serializer(ar, key, version, tags, reply, spanContext, tenantInfo, options, ssLatestCommitVersions); } }; @@ 
-395,15 +392,14 @@ struct GetKeyValuesRequest : TimedRequest { KeyRef mapper = KeyRef(); Version version; // or latestVersion int limit, limitBytes; - ReadType readType; Optional tags; - Optional debugID; + Optional options; ReplyPromise reply; VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known // to this client, of all storage replicas that // serve the given key - GetKeyValuesRequest() : readType(ReadType::NORMAL) {} + GetKeyValuesRequest() {} bool verify() const { return tenantInfo.isAuthorized(); } @@ -415,12 +411,11 @@ struct GetKeyValuesRequest : TimedRequest { version, limit, limitBytes, - readType, tags, - debugID, reply, spanContext, tenantInfo, + options, arena, ssLatestCommitVersions); } @@ -454,15 +449,14 @@ struct GetMappedKeyValuesRequest : TimedRequest { Version version; // or latestVersion int limit, limitBytes; int matchIndex; - ReadType readType; Optional tags; - Optional debugID; + Optional options; ReplyPromise reply; VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known // to this client, of all storage replicas that // serve the given key range - GetMappedKeyValuesRequest() : readType(ReadType::NORMAL) {} + GetMappedKeyValuesRequest() {} bool verify() const { return tenantInfo.isAuthorized(); } @@ -475,12 +469,11 @@ struct GetMappedKeyValuesRequest : TimedRequest { version, limit, limitBytes, - readType, tags, - debugID, reply, spanContext, tenantInfo, + options, arena, ssLatestCommitVersions, matchIndex); @@ -522,15 +515,14 @@ struct GetKeyValuesStreamRequest { KeySelectorRef begin, end; Version version; // or latestVersion int limit, limitBytes; - ReadType readType; Optional tags; - Optional debugID; + Optional options; ReplyPromiseStream reply; VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known // to this client, of all storage replicas that // serve the given key range - GetKeyValuesStreamRequest() : readType(ReadType::NORMAL) {} + 
GetKeyValuesStreamRequest() {} bool verify() const { return tenantInfo.isAuthorized(); } @@ -542,12 +534,11 @@ struct GetKeyValuesStreamRequest { version, limit, limitBytes, - readType, tags, - debugID, reply, spanContext, tenantInfo, + options, arena, ssLatestCommitVersions); } @@ -574,15 +565,14 @@ struct GetKeyRequest : TimedRequest { TenantInfo tenantInfo; KeySelectorRef sel; Version version; // or latestVersion - ReadType readType; Optional tags; - Optional debugID; ReplyPromise reply; + Optional options; VersionVector ssLatestCommitVersions; // includes the latest commit versions, as known // to this client, of all storage replicas that // serve the given key - GetKeyRequest() : readType(ReadType::NORMAL) {} + GetKeyRequest() {} bool verify() const { return tenantInfo.isAuthorized(); } @@ -590,17 +580,15 @@ struct GetKeyRequest : TimedRequest { TenantInfo tenantInfo, KeySelectorRef const& sel, Version version, - ReadType type, Optional tags, - Optional debugID, + Optional options, VersionVector latestCommitVersions) - : spanContext(spanContext), tenantInfo(tenantInfo), sel(sel), version(version), readType(type), debugID(debugID), + : spanContext(spanContext), tenantInfo(tenantInfo), sel(sel), version(version), tags(tags), options(options), ssLatestCommitVersions(latestCommitVersions) {} template void serialize(Ar& ar) { - serializer( - ar, sel, version, readType, tags, debugID, reply, spanContext, tenantInfo, arena, ssLatestCommitVersions); + serializer(ar, sel, version, tags, reply, spanContext, tenantInfo, options, arena, ssLatestCommitVersions); } }; diff --git a/fdbclient/include/fdbclient/SystemData.h b/fdbclient/include/fdbclient/SystemData.h index 2571b5aa53..b41809691e 100644 --- a/fdbclient/include/fdbclient/SystemData.h +++ b/fdbclient/include/fdbclient/SystemData.h @@ -594,6 +594,8 @@ const Value blobManagerEpochValueFor(int64_t epoch); int64_t decodeBlobManagerEpochValue(ValueRef const& value); // blob granule keys +extern const StringRef 
blobRangeActive; +extern const StringRef blobRangeInactive; extern const uint8_t BG_FILE_TYPE_DELTA; extern const uint8_t BG_FILE_TYPE_SNAPSHOT; @@ -621,7 +623,8 @@ extern const KeyRangeRef blobGranuleHistoryKeys; // \xff\x02/bgp/(start,end) = (version, force) extern const KeyRangeRef blobGranulePurgeKeys; -extern const KeyRangeRef blobGranuleVersionKeys; +// \xff\x02/bgpforce/(start) = {1|0} (key range map) +extern const KeyRangeRef blobGranuleForcePurgedKeys; extern const KeyRef blobGranulePurgeChangeKey; const Key blobGranuleFileKeyFor(UID granuleID, Version fileVersion, uint8_t fileType); diff --git a/fdbclient/include/fdbclient/Tenant.h b/fdbclient/include/fdbclient/Tenant.h index 7cce7dcb05..0af19b85f1 100644 --- a/fdbclient/include/fdbclient/Tenant.h +++ b/fdbclient/include/fdbclient/Tenant.h @@ -25,6 +25,7 @@ #include "fdbclient/FDBTypes.h" #include "fdbclient/KeyBackedTypes.h" #include "fdbclient/VersionedMap.h" +#include "fdbclient/KeyBackedTypes.h" #include "fdbrpc/TenantInfo.h" #include "flow/flat_buffers.h" @@ -33,7 +34,35 @@ typedef Standalone TenantName; typedef StringRef TenantGroupNameRef; typedef Standalone TenantGroupName; -enum class TenantState { REGISTERING, READY, REMOVING, UPDATING_CONFIGURATION, ERROR }; +// Represents the various states that a tenant could be in. +// In a standalone cluster, a tenant should only ever be in the READY state. +// In a metacluster, a tenant on the management cluster could be in the other states while changes are applied to the +// data cluster. 
+// +// REGISTERING - the tenant has been created on the management cluster and is being created on the data cluster +// READY - the tenant has been created on both clusters, is active, and is consistent between the two clusters +// REMOVING - the tenant has been marked for removal and is being removed on the data cluster +// UPDATING_CONFIGURATION - the tenant configuration has changed on the management cluster and is being applied to the +// data cluster +// RENAMING_FROM - the tenant is being renamed to a new name and is awaiting the rename to complete on the data cluster +// RENAMING_TO - the tenant is being created as a rename from an existing tenant and is awaiting the rename to complete +// on the data cluster +// ERROR - the tenant is in an error state +// +// A tenant in any configuration is allowed to be removed. Only tenants in the READY or UPDATING_CONFIGURATION phases +// can have their configuration updated. A tenant must not exist or be in the REGISTERING phase to be created. To be +// renamed, a tenant must be in the READY or RENAMING_FROM state. In the latter case, the rename destination must match +// the original rename attempt. +// +// If an operation fails and the tenant is left in a non-ready state, re-running the same operation is legal. If +// successful, the tenant will return to the READY state. +enum class TenantState { REGISTERING, READY, REMOVING, UPDATING_CONFIGURATION, RENAMING_FROM, RENAMING_TO, ERROR }; + +// Represents the lock state the tenant could be in. +// Can be used in conjunction with the other tenant states above. 
+enum class TenantLockState { UNLOCKED, READ_ONLY, LOCKED }; + +constexpr int TENANT_PREFIX_SIZE = sizeof(int64_t); struct TenantMapEntry { constexpr static FileIdentifier file_identifier = 12247338; @@ -44,15 +73,24 @@ struct TenantMapEntry { static std::string tenantStateToString(TenantState tenantState); static TenantState stringToTenantState(std::string stateStr); + static std::string tenantLockStateToString(TenantLockState tenantState); + static TenantLockState stringToTenantLockState(std::string stateStr); + int64_t id = -1; Key prefix; TenantState tenantState = TenantState::READY; + TenantLockState tenantLockState = TenantLockState::UNLOCKED; Optional tenantGroup; bool encrypted = false; + Optional assignedCluster; + int64_t configurationSequenceNum = 0; + Optional renamePair; + + // Can be set to an error string if the tenant is in the ERROR state + std::string error; constexpr static int PREFIX_SIZE = sizeof(id); -public: TenantMapEntry(); TenantMapEntry(int64_t id, TenantState tenantState, bool encrypted); TenantMapEntry(int64_t id, TenantState tenantState, Optional tenantGroup, bool encrypted); @@ -70,7 +108,16 @@ public: template void serialize(Ar& ar) { - serializer(ar, id, tenantState, tenantGroup, encrypted); + serializer(ar, + id, + tenantState, + tenantLockState, + tenantGroup, + encrypted, + assignedCluster, + configurationSequenceNum, + renamePair, + error); if constexpr (Ar::isDeserializing) { if (id >= 0) { prefix = idToPrefix(id); @@ -83,7 +130,10 @@ public: struct TenantGroupEntry { constexpr static FileIdentifier file_identifier = 10764222; + Optional assignedCluster; + TenantGroupEntry() = default; + TenantGroupEntry(Optional assignedCluster) : assignedCluster(assignedCluster) {} Value encode() { return ObjectWriter::toValue(*this, IncludeVersion()); } static TenantGroupEntry decode(ValueRef const& value) { @@ -92,41 +142,67 @@ struct TenantGroupEntry { template void serialize(Ar& ar) { - serializer(ar); + serializer(ar, assignedCluster); + 
} +}; + +struct TenantTombstoneCleanupData { + constexpr static FileIdentifier file_identifier = 3291339; + + // All tombstones have been erased up to and including this id. + // We should not generate new tombstones at IDs equal to or older than this. + int64_t tombstonesErasedThrough = -1; + + // The version at which we will next erase tombstones. + Version nextTombstoneEraseVersion = invalidVersion; + + // When we reach the nextTombstoneEraseVersion, we will erase tombstones up through this ID. + int64_t nextTombstoneEraseId = -1; + + template + void serialize(Ar& ar) { + serializer(ar, tombstonesErasedThrough, nextTombstoneEraseVersion, nextTombstoneEraseId); } }; struct TenantMetadataSpecification { - static KeyRef subspace; + Key subspace; KeyBackedObjectMap tenantMap; + KeyBackedMap tenantIdIndex; KeyBackedProperty lastTenantId; KeyBackedBinaryValue tenantCount; + KeyBackedSet tenantTombstones; + KeyBackedObjectProperty tombstoneCleanupData; KeyBackedSet tenantGroupTenantIndex; KeyBackedObjectMap tenantGroupMap; - TenantMetadataSpecification(KeyRef subspace) - : tenantMap(subspace.withSuffix("tenant/map/"_sr), IncludeVersion()), - lastTenantId(subspace.withSuffix("tenant/lastId"_sr)), tenantCount(subspace.withSuffix("tenant/count"_sr)), - tenantGroupTenantIndex(subspace.withSuffix("tenant/tenantGroup/tenantIndex/"_sr)), - tenantGroupMap(subspace.withSuffix("tenant/tenantGroup/map/"_sr), IncludeVersion()) {} + TenantMetadataSpecification(KeyRef prefix) + : subspace(prefix.withSuffix("tenant/"_sr)), tenantMap(subspace.withSuffix("map/"_sr), IncludeVersion()), + tenantIdIndex(subspace.withSuffix("idIndex/"_sr)), lastTenantId(subspace.withSuffix("lastId"_sr)), + tenantCount(subspace.withSuffix("count"_sr)), tenantTombstones(subspace.withSuffix("tombstones/"_sr)), + tombstoneCleanupData(subspace.withSuffix("tombstoneCleanup"_sr), IncludeVersion()), + tenantGroupTenantIndex(subspace.withSuffix("tenantGroup/tenantIndex/"_sr)), + 
tenantGroupMap(subspace.withSuffix("tenantGroup/map/"_sr), IncludeVersion()) {} }; struct TenantMetadata { -private: - static inline TenantMetadataSpecification instance = TenantMetadataSpecification("\xff/"_sr); + static TenantMetadataSpecification& instance(); -public: - static inline auto& tenantMap = instance.tenantMap; - static inline auto& lastTenantId = instance.lastTenantId; - static inline auto& tenantCount = instance.tenantCount; - static inline auto& tenantGroupTenantIndex = instance.tenantGroupTenantIndex; - static inline auto& tenantGroupMap = instance.tenantGroupMap; + static inline auto& subspace() { return instance().subspace; } + static inline auto& tenantMap() { return instance().tenantMap; } + static inline auto& tenantIdIndex() { return instance().tenantIdIndex; } + static inline auto& lastTenantId() { return instance().lastTenantId; } + static inline auto& tenantCount() { return instance().tenantCount; } + static inline auto& tenantTombstones() { return instance().tenantTombstones; } + static inline auto& tombstoneCleanupData() { return instance().tombstoneCleanupData; } + static inline auto& tenantGroupTenantIndex() { return instance().tenantGroupTenantIndex; } + static inline auto& tenantGroupMap() { return instance().tenantGroupMap; } - static inline Key tenantMapPrivatePrefix = "\xff"_sr.withSuffix(tenantMap.subspace.begin); + static Key tenantMapPrivatePrefix(); }; typedef VersionedMap TenantMap; -typedef VersionedMap TenantPrefixIndex; +class TenantPrefixIndex : public VersionedMap, public ReferenceCounted {}; #endif diff --git a/fdbclient/include/fdbclient/TenantManagement.actor.h b/fdbclient/include/fdbclient/TenantManagement.actor.h index b9e26d0df7..7499c8ddb7 100644 --- a/fdbclient/include/fdbclient/TenantManagement.actor.h +++ b/fdbclient/include/fdbclient/TenantManagement.actor.h @@ -21,6 +21,7 @@ #pragma once #include "fdbclient/ClientBooleanParams.h" #include "flow/IRandom.h" +#include "flow/ThreadHelper.actor.h" #if 
defined(NO_INTELLISENSE) && !defined(FDBCLIENT_TENANT_MANAGEMENT_ACTOR_G_H) #define FDBCLIENT_TENANT_MANAGEMENT_ACTOR_G_H #include "fdbclient/TenantManagement.actor.g.h" @@ -30,6 +31,7 @@ #include #include #include "fdbclient/GenericTransactionHelper.h" +#include "fdbclient/Metacluster.h" #include "fdbclient/SystemData.h" #include "flow/actorcompiler.h" // has to be last include @@ -38,7 +40,7 @@ namespace TenantAPI { template Future> tryGetTenantTransaction(Transaction tr, TenantName name) { tr->setOption(FDBTransactionOptions::RAW_ACCESS); - return TenantMetadata::tenantMap.get(tr, name); + return TenantMetadata::tenantMap().get(tr, name); } ACTOR template @@ -78,26 +80,60 @@ Future getTenant(Reference db, TenantName name) { } ACTOR template -Future checkTenantMode(Transaction tr) { +Future getClusterType(Transaction tr) { + Optional metaclusterRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + + return metaclusterRegistration.present() ? metaclusterRegistration.get().clusterType : ClusterType::STANDALONE; +} + +ACTOR template +Future checkTenantMode(Transaction tr, ClusterType expectedClusterType) { state typename transaction_future_type>::type tenantModeFuture = tr->get(configKeysPrefix.withSuffix("tenant_mode"_sr)); + state ClusterType actualClusterType = wait(getClusterType(tr)); Optional tenantModeValue = wait(safeThreadFutureToFuture(tenantModeFuture)); TenantMode tenantMode = TenantMode::fromValue(tenantModeValue.castTo()); - if (tenantMode == TenantMode::DISABLED) { + if (actualClusterType != expectedClusterType) { + throw invalid_metacluster_operation(); + } else if (actualClusterType == ClusterType::STANDALONE && tenantMode == TenantMode::DISABLED) { throw tenants_disabled(); } return Void(); } +TenantMode tenantModeForClusterType(ClusterType clusterType, TenantMode tenantMode); + +// Returns true if the specified ID has already been deleted and false if not. 
If the ID is old enough +// that we no longer keep tombstones for it, an error is thrown. +ACTOR template +Future checkTombstone(Transaction tr, int64_t id) { + state Future tombstoneFuture = TenantMetadata::tenantTombstones().exists(tr, id); + + // If we are trying to create a tenant older than the oldest tombstones we still maintain, then we fail it + // with an error. + Optional tombstoneCleanupData = wait(TenantMetadata::tombstoneCleanupData().get(tr)); + if (tombstoneCleanupData.present() && tombstoneCleanupData.get().tombstonesErasedThrough >= id) { + throw tenant_creation_permanently_failed(); + } + + state bool hasTombstone = wait(tombstoneFuture); + return hasTombstone; +} + // Creates a tenant with the given name. If the tenant already exists, the boolean return parameter will be false // and the existing entry will be returned. If the tenant cannot be created, then the optional will be empty. ACTOR template -Future, bool>> createTenantTransaction(Transaction tr, - TenantNameRef name, - TenantMapEntry tenantEntry) { +Future, bool>> createTenantTransaction( + Transaction tr, + TenantNameRef name, + TenantMapEntry tenantEntry, + ClusterType clusterType = ClusterType::STANDALONE) { + + ASSERT(clusterType != ClusterType::METACLUSTER_MANAGEMENT); ASSERT(tenantEntry.id >= 0); if (name.startsWith("\xff"_sr)) { @@ -110,17 +146,25 @@ Future, bool>> createTenantTransaction(Transa tr->setOption(FDBTransactionOptions::RAW_ACCESS); state Future> existingEntryFuture = tryGetTenantTransaction(tr, name); - wait(checkTenantMode(tr)); + state Future tenantModeCheck = checkTenantMode(tr, clusterType); + state Future tombstoneFuture = + (clusterType == ClusterType::STANDALONE) ? 
false : checkTombstone(tr, tenantEntry.id); state Future> existingTenantGroupEntryFuture; if (tenantEntry.tenantGroup.present()) { - existingTenantGroupEntryFuture = TenantMetadata::tenantGroupMap.get(tr, tenantEntry.tenantGroup.get()); + existingTenantGroupEntryFuture = TenantMetadata::tenantGroupMap().get(tr, tenantEntry.tenantGroup.get()); } + wait(tenantModeCheck); Optional existingEntry = wait(existingEntryFuture); if (existingEntry.present()) { return std::make_pair(existingEntry.get(), false); } + state bool hasTombstone = wait(tombstoneFuture); + if (hasTombstone) { + return std::make_pair(Optional(), false); + } + state typename transaction_future_type::type prefixRangeFuture = tr->getRange(prefixRange(tenantEntry.prefix), 1); @@ -130,23 +174,27 @@ Future, bool>> createTenantTransaction(Transa } tenantEntry.tenantState = TenantState::READY; - TenantMetadata::tenantMap.set(tr, name, tenantEntry); + tenantEntry.assignedCluster = Optional(); + + TenantMetadata::tenantMap().set(tr, name, tenantEntry); + TenantMetadata::tenantIdIndex().set(tr, tenantEntry.id, name); + if (tenantEntry.tenantGroup.present()) { - TenantMetadata::tenantGroupTenantIndex.insert(tr, Tuple::makeTuple(tenantEntry.tenantGroup.get(), name)); + TenantMetadata::tenantGroupTenantIndex().insert(tr, Tuple::makeTuple(tenantEntry.tenantGroup.get(), name)); // Create the tenant group associated with this tenant if it doesn't already exist Optional existingTenantGroup = wait(existingTenantGroupEntryFuture); if (!existingTenantGroup.present()) { - TenantMetadata::tenantGroupMap.set(tr, tenantEntry.tenantGroup.get(), TenantGroupEntry()); + TenantMetadata::tenantGroupMap().set(tr, tenantEntry.tenantGroup.get(), TenantGroupEntry()); } } // This is idempotent because we only add an entry to the tenant map if it isn't already there - TenantMetadata::tenantCount.atomicOp(tr, 1, MutationRef::AddValue); + TenantMetadata::tenantCount().atomicOp(tr, 1, MutationRef::AddValue); // Read the tenant count after 
incrementing the counter so that simultaneous attempts to create // tenants in the same transaction are properly reflected. - int64_t tenantCount = wait(TenantMetadata::tenantCount.getD(tr, Snapshot::False, 0)); + int64_t tenantCount = wait(TenantMetadata::tenantCount().getD(tr, Snapshot::False, 0)); if (tenantCount > CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER) { throw cluster_no_capacity(); } @@ -156,7 +204,7 @@ Future, bool>> createTenantTransaction(Transa ACTOR template Future getNextTenantId(Transaction tr) { - Optional lastId = wait(TenantMetadata::lastTenantId.get(tr)); + Optional lastId = wait(TenantMetadata::lastTenantId().get(tr)); int64_t tenantId = lastId.orDefault(-1) + 1; if (BUGGIFY) { tenantId += deterministicRandom()->randomSkewedUInt32(1, 1e9); @@ -167,12 +215,15 @@ Future getNextTenantId(Transaction tr) { ACTOR template Future> createTenant(Reference db, TenantName name, - TenantMapEntry tenantEntry = TenantMapEntry()) { + TenantMapEntry tenantEntry = TenantMapEntry(), + ClusterType clusterType = ClusterType::STANDALONE) { state Reference tr = db->createTransaction(); - state bool checkExistence = true; + state bool checkExistence = clusterType != ClusterType::METACLUSTER_DATA; state bool generateTenantId = tenantEntry.id < 0; + ASSERT(clusterType == ClusterType::STANDALONE || !generateTenantId); + loop { try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); @@ -195,11 +246,11 @@ Future> createTenant(Reference db, if (generateTenantId) { int64_t tenantId = wait(tenantIdFuture); tenantEntry.setId(tenantId); - TenantMetadata::lastTenantId.set(tr, tenantId); + TenantMetadata::lastTenantId().set(tr, tenantId); } state std::pair, bool> newTenant = - wait(createTenantTransaction(tr, name, tenantEntry)); + wait(createTenantTransaction(tr, name, tenantEntry, clusterType)); if (newTenant.second) { ASSERT(newTenant.first.present()); @@ -220,6 +271,50 @@ Future> createTenant(Reference db, } } +ACTOR template +Future markTenantTombstones(Transaction tr, 
int64_t tenantId) { + // In data clusters, we store a tombstone + state Future> latestTombstoneFuture = + TenantMetadata::tenantTombstones().getRange(tr, {}, {}, 1, Snapshot::False, Reverse::True); + state Optional cleanupData = wait(TenantMetadata::tombstoneCleanupData().get(tr)); + state Version transactionReadVersion = wait(safeThreadFutureToFuture(tr->getReadVersion())); + + // If it has been long enough since we last cleaned up the tenant tombstones, we do that first + if (!cleanupData.present() || cleanupData.get().nextTombstoneEraseVersion <= transactionReadVersion) { + state int64_t deleteThroughId = cleanupData.present() ? cleanupData.get().nextTombstoneEraseId : -1; + // Delete all tombstones up through the one currently marked in the cleanup data + if (deleteThroughId >= 0) { + TenantMetadata::tenantTombstones().erase(tr, 0, deleteThroughId + 1); + } + + KeyBackedRangeResult latestTombstone = wait(latestTombstoneFuture); + int64_t nextDeleteThroughId = std::max(deleteThroughId, tenantId); + if (!latestTombstone.results.empty()) { + nextDeleteThroughId = std::max(nextDeleteThroughId, latestTombstone.results[0]); + } + + // The next cleanup will happen at or after TENANT_TOMBSTONE_CLEANUP_INTERVAL seconds have elapsed and + // will clean up tombstones through the most recently allocated ID. 
+ TenantTombstoneCleanupData updatedCleanupData; + updatedCleanupData.tombstonesErasedThrough = deleteThroughId; + updatedCleanupData.nextTombstoneEraseId = nextDeleteThroughId; + updatedCleanupData.nextTombstoneEraseVersion = + transactionReadVersion + + CLIENT_KNOBS->TENANT_TOMBSTONE_CLEANUP_INTERVAL * CLIENT_KNOBS->VERSIONS_PER_SECOND; + + TenantMetadata::tombstoneCleanupData().set(tr, updatedCleanupData); + + // If the tenant being deleted is within the tombstone window, record the tombstone + if (tenantId > updatedCleanupData.tombstonesErasedThrough) { + TenantMetadata::tenantTombstones().insert(tr, tenantId); + } + } else if (tenantId > cleanupData.get().tombstonesErasedThrough) { + // If the tenant being deleted is within the tombstone window, record the tombstone + TenantMetadata::tenantTombstones().insert(tr, tenantId); + } + return Void(); +} + // Deletes the tenant with the given name. If tenantId is specified, the tenant being deleted must also have the same // ID. If no matching tenant is found, this function returns without deleting anything. 
This behavior allows the // function to be used idempotently: if the transaction is retried after having succeeded, it will see that the tenant @@ -227,11 +322,15 @@ Future> createTenant(Reference db, ACTOR template Future deleteTenantTransaction(Transaction tr, TenantNameRef name, - Optional tenantId = Optional()) { + Optional tenantId = Optional(), + ClusterType clusterType = ClusterType::STANDALONE) { + ASSERT(clusterType == ClusterType::STANDALONE || tenantId.present()); + ASSERT(clusterType != ClusterType::METACLUSTER_MANAGEMENT); + tr->setOption(FDBTransactionOptions::RAW_ACCESS); state Future> tenantEntryFuture = tryGetTenantTransaction(tr, name); - wait(checkTenantMode(tr)); + wait(checkTenantMode(tr, clusterType)); state Optional tenantEntry = wait(tenantEntryFuture); if (tenantEntry.present() && (!tenantId.present() || tenantEntry.get().id == tenantId.get())) { @@ -244,34 +343,43 @@ Future deleteTenantTransaction(Transaction tr, } // This is idempotent because we only erase an entry from the tenant map if it is present - TenantMetadata::tenantMap.erase(tr, name); - TenantMetadata::tenantCount.atomicOp(tr, -1, MutationRef::AddValue); + TenantMetadata::tenantMap().erase(tr, name); + TenantMetadata::tenantIdIndex().erase(tr, tenantEntry.get().id); + TenantMetadata::tenantCount().atomicOp(tr, -1, MutationRef::AddValue); if (tenantEntry.get().tenantGroup.present()) { - TenantMetadata::tenantGroupTenantIndex.erase(tr, - Tuple::makeTuple(tenantEntry.get().tenantGroup.get(), name)); - KeyBackedSet::RangeResultType tenantsInGroup = wait(TenantMetadata::tenantGroupTenantIndex.getRange( - tr, - Tuple::makeTuple(tenantEntry.get().tenantGroup.get()), - Tuple::makeTuple(keyAfter(tenantEntry.get().tenantGroup.get())), - 2)); + TenantMetadata::tenantGroupTenantIndex().erase(tr, + Tuple::makeTuple(tenantEntry.get().tenantGroup.get(), name)); + KeyBackedSet::RangeResultType tenantsInGroup = + wait(TenantMetadata::tenantGroupTenantIndex().getRange( + tr, + 
Tuple::makeTuple(tenantEntry.get().tenantGroup.get()), + Tuple::makeTuple(keyAfter(tenantEntry.get().tenantGroup.get())), + 2)); if (tenantsInGroup.results.empty() || (tenantsInGroup.results.size() == 1 && tenantsInGroup.results[0].getString(1) == name)) { - TenantMetadata::tenantGroupMap.erase(tr, tenantEntry.get().tenantGroup.get()); + TenantMetadata::tenantGroupMap().erase(tr, tenantEntry.get().tenantGroup.get()); } } } + if (clusterType == ClusterType::METACLUSTER_DATA) { + wait(markTenantTombstones(tr, tenantId.get())); + } + return Void(); } // Deletes the tenant with the given name. If tenantId is specified, the tenant being deleted must also have the same // ID. ACTOR template -Future deleteTenant(Reference db, TenantName name, Optional tenantId = Optional()) { +Future deleteTenant(Reference db, + TenantName name, + Optional tenantId = Optional(), + ClusterType clusterType = ClusterType::STANDALONE) { state Reference tr = db->createTransaction(); - state bool checkExistence = true; + state bool checkExistence = clusterType == ClusterType::STANDALONE; loop { try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); @@ -289,7 +397,7 @@ Future deleteTenant(Reference db, TenantName name, Optional t checkExistence = false; } - wait(deleteTenantTransaction(tr, name, tenantId)); + wait(deleteTenantTransaction(tr, name, tenantId, clusterType)); wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); TraceEvent("DeletedTenant").detail("Tenant", name).detail("Version", tr->getCommittedVersion()); @@ -308,8 +416,10 @@ Future configureTenantTransaction(Transaction tr, TenantNameRef tenantName, TenantMapEntry originalEntry, TenantMapEntry updatedTenantEntry) { + ASSERT(updatedTenantEntry.id == originalEntry.id); + tr->setOption(FDBTransactionOptions::RAW_ACCESS); - TenantMetadata::tenantMap.set(tr, tenantName, updatedTenantEntry); + TenantMetadata::tenantMap().set(tr, tenantName, updatedTenantEntry); // If the tenant group was changed, we need to update the tenant 
group metadata structures if (originalEntry.tenantGroup != updatedTenantEntry.tenantGroup) { @@ -318,11 +428,11 @@ Future configureTenantTransaction(Transaction tr, } if (originalEntry.tenantGroup.present()) { // Remove this tenant from the original tenant group index - TenantMetadata::tenantGroupTenantIndex.erase(tr, - Tuple::makeTuple(originalEntry.tenantGroup.get(), tenantName)); + TenantMetadata::tenantGroupTenantIndex().erase( + tr, Tuple::makeTuple(originalEntry.tenantGroup.get(), tenantName)); // Check if the original tenant group is now empty. If so, remove the tenant group. - KeyBackedSet::RangeResultType tenants = wait(TenantMetadata::tenantGroupTenantIndex.getRange( + KeyBackedSet::RangeResultType tenants = wait(TenantMetadata::tenantGroupTenantIndex().getRange( tr, Tuple::makeTuple(originalEntry.tenantGroup.get()), Tuple::makeTuple(keyAfter(originalEntry.tenantGroup.get())), @@ -330,19 +440,19 @@ Future configureTenantTransaction(Transaction tr, if (tenants.results.empty() || (tenants.results.size() == 1 && tenants.results[0].getString(1) == tenantName)) { - TenantMetadata::tenantGroupMap.erase(tr, originalEntry.tenantGroup.get()); + TenantMetadata::tenantGroupMap().erase(tr, originalEntry.tenantGroup.get()); } } if (updatedTenantEntry.tenantGroup.present()) { // If this is creating a new tenant group, add it to the tenant group map Optional entry = - wait(TenantMetadata::tenantGroupMap.get(tr, updatedTenantEntry.tenantGroup.get())); + wait(TenantMetadata::tenantGroupMap().get(tr, updatedTenantEntry.tenantGroup.get())); if (!entry.present()) { - TenantMetadata::tenantGroupMap.set(tr, updatedTenantEntry.tenantGroup.get(), TenantGroupEntry()); + TenantMetadata::tenantGroupMap().set(tr, updatedTenantEntry.tenantGroup.get(), TenantGroupEntry()); } // Insert this tenant in the tenant group index - TenantMetadata::tenantGroupTenantIndex.insert( + TenantMetadata::tenantGroupTenantIndex().insert( tr, Tuple::makeTuple(updatedTenantEntry.tenantGroup.get(), 
tenantName)); } } @@ -358,7 +468,7 @@ Future>> listTenantsTransactio tr->setOption(FDBTransactionOptions::RAW_ACCESS); KeyBackedRangeResult> results = - wait(TenantMetadata::tenantMap.getRange(tr, begin, end, limit)); + wait(TenantMetadata::tenantMap().getRange(tr, begin, end, limit)); return results.results; } @@ -384,33 +494,58 @@ Future>> listTenants(Reference } ACTOR template -Future renameTenantTransaction(Transaction tr, TenantNameRef oldName, TenantNameRef newName) { +Future renameTenantTransaction(Transaction tr, + TenantName oldName, + TenantName newName, + Optional tenantId = Optional(), + ClusterType clusterType = ClusterType::STANDALONE, + Optional configureSequenceNum = Optional()) { + ASSERT(clusterType == ClusterType::STANDALONE || (tenantId.present() && configureSequenceNum.present())); + ASSERT(clusterType != ClusterType::METACLUSTER_MANAGEMENT); + wait(checkTenantMode(tr, clusterType)); tr->setOption(FDBTransactionOptions::RAW_ACCESS); state Optional oldEntry; state Optional newEntry; wait(store(oldEntry, tryGetTenantTransaction(tr, oldName)) && store(newEntry, tryGetTenantTransaction(tr, newName))); - if (!oldEntry.present()) { + if (!oldEntry.present() || (tenantId.present() && tenantId.get() != oldEntry.get().id)) { throw tenant_not_found(); } if (newEntry.present()) { throw tenant_already_exists(); } - TenantMetadata::tenantMap.erase(tr, oldName); - TenantMetadata::tenantMap.set(tr, newName, oldEntry.get()); + if (configureSequenceNum.present()) { + if (oldEntry.get().configurationSequenceNum >= configureSequenceNum.get()) { + return Void(); + } + oldEntry.get().configurationSequenceNum = configureSequenceNum.get(); + } + TenantMetadata::tenantMap().erase(tr, oldName); + TenantMetadata::tenantMap().set(tr, newName, oldEntry.get()); + TenantMetadata::tenantIdIndex().set(tr, oldEntry.get().id, newName); // Update the tenant group index to reflect the new tenant name if (oldEntry.get().tenantGroup.present()) { - 
TenantMetadata::tenantGroupTenantIndex.erase(tr, Tuple::makeTuple(oldEntry.get().tenantGroup.get(), oldName)); - TenantMetadata::tenantGroupTenantIndex.insert(tr, Tuple::makeTuple(oldEntry.get().tenantGroup.get(), newName)); + TenantMetadata::tenantGroupTenantIndex().erase(tr, Tuple::makeTuple(oldEntry.get().tenantGroup.get(), oldName)); + TenantMetadata::tenantGroupTenantIndex().insert(tr, + Tuple::makeTuple(oldEntry.get().tenantGroup.get(), newName)); + } + + if (clusterType == ClusterType::METACLUSTER_DATA) { + wait(markTenantTombstones(tr, tenantId.get())); } return Void(); } ACTOR template -Future renameTenant(Reference db, TenantName oldName, TenantName newName) { +Future renameTenant(Reference db, + TenantName oldName, + TenantName newName, + Optional tenantId = Optional(), + ClusterType clusterType = ClusterType::STANDALONE) { state Reference tr = db->createTransaction(); + ASSERT(clusterType == ClusterType::STANDALONE || tenantId.present()); state bool firstTry = true; state int64_t id; @@ -454,7 +589,7 @@ Future renameTenant(Reference db, TenantName oldName, TenantName newNa throw tenant_not_found(); } } - wait(renameTenantTransaction(tr, oldName, newName)); + wait(renameTenantTransaction(tr, oldName, newName, tenantId, clusterType)); wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); TraceEvent("RenameTenantSuccess").detail("OldName", oldName).detail("NewName", newName); return Void(); diff --git a/fdbclient/include/fdbclient/TenantSpecialKeys.actor.h b/fdbclient/include/fdbclient/TenantSpecialKeys.actor.h index 7c9ccc7cc2..af9d02c371 100644 --- a/fdbclient/include/fdbclient/TenantSpecialKeys.actor.h +++ b/fdbclient/include/fdbclient/TenantSpecialKeys.actor.h @@ -137,7 +137,7 @@ private: std::map, Optional>>> tenants, std::map* tenantGroupNetTenantDelta) { state Future tenantCountFuture = - TenantMetadata::tenantCount.getD(&ryw->getTransaction(), Snapshot::False, 0); + TenantMetadata::tenantCount().getD(&ryw->getTransaction(), Snapshot::False, 0); 
int64_t _nextId = wait(TenantAPI::getNextTenantId(&ryw->getTransaction())); state int64_t nextId = _nextId; @@ -146,7 +146,7 @@ private: createFutures.push_back(createTenant(ryw, tenant, config, nextId++, tenantGroupNetTenantDelta)); } - TenantMetadata::lastTenantId.set(&ryw->getTransaction(), nextId - 1); + TenantMetadata::lastTenantId().set(&ryw->getTransaction(), nextId - 1); wait(waitForAll(createFutures)); state int numCreatedTenants = 0; @@ -240,14 +240,14 @@ private: ASSERT(tenantDelta < 0); state int removedTenants = -tenantDelta; KeyBackedSet::RangeResultType tenantsInGroup = - wait(TenantMetadata::tenantGroupTenantIndex.getRange(&ryw->getTransaction(), - Tuple::makeTuple(tenantGroup), - Tuple::makeTuple(keyAfter(tenantGroup)), - removedTenants + 1)); + wait(TenantMetadata::tenantGroupTenantIndex().getRange(&ryw->getTransaction(), + Tuple::makeTuple(tenantGroup), + Tuple::makeTuple(keyAfter(tenantGroup)), + removedTenants + 1)); ASSERT(tenantsInGroup.results.size() >= removedTenants); if (tenantsInGroup.results.size() == removedTenants) { - TenantMetadata::tenantGroupMap.erase(&ryw->getTransaction(), tenantGroup); + TenantMetadata::tenantGroupMap().erase(&ryw->getTransaction(), tenantGroup); } return Void(); @@ -289,7 +289,7 @@ public: state std::set renameSet; state std::vector> renameMutations; - tenantManagementFutures.push_back(TenantAPI::checkTenantMode(&ryw->getTransaction())); + tenantManagementFutures.push_back(TenantAPI::checkTenantMode(&ryw->getTransaction(), ClusterType::STANDALONE)); for (auto range : ranges) { if (!range.value().first) { diff --git a/fdbclient/include/fdbclient/ThreadSafeTransaction.h b/fdbclient/include/fdbclient/ThreadSafeTransaction.h index 875664ea76..d72c4c8fc5 100644 --- a/fdbclient/include/fdbclient/ThreadSafeTransaction.h +++ b/fdbclient/include/fdbclient/ThreadSafeTransaction.h @@ -62,6 +62,13 @@ public: ThreadFuture purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override; 
ThreadFuture waitPurgeGranulesComplete(const KeyRef& purgeKey) override; + ThreadFuture blobbifyRange(const KeyRangeRef& keyRange) override; + ThreadFuture unblobbifyRange(const KeyRangeRef& keyRange) override; + ThreadFuture>> listBlobbifiedRanges(const KeyRangeRef& keyRange, + int rangeLimit) override; + + ThreadFuture verifyBlobRange(const KeyRangeRef& keyRange, Optional version) override; + ThreadFuture createSharedState() override; void setSharedState(DatabaseSharedState* p) override; @@ -72,7 +79,8 @@ private: DatabaseContext* db; public: // Internal use only - ThreadSafeDatabase(Reference connectionRecord, int apiVersion); + enum class ConnectionRecordType { FILE, CONNECTION_STRING }; + ThreadSafeDatabase(ConnectionRecordType connectionRecordType, std::string connectionRecord, int apiVersion); ThreadSafeDatabase(DatabaseContext* db) : db(db) {} DatabaseContext* unsafeGetPtr() const { return db; } }; @@ -148,13 +156,26 @@ public: ThreadFuture>> getRangeSplitPoints(const KeyRangeRef& range, int64_t chunkSize) override; - ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange) override; + ThreadFuture>> getBlobGranuleRanges(const KeyRangeRef& keyRange, + int rangeLimit) override; ThreadResult readBlobGranules(const KeyRangeRef& keyRange, Version beginVersion, Optional readVersion, ReadBlobGranuleContext granuleContext) override; + ThreadFuture>> readBlobGranulesStart(const KeyRangeRef& keyRange, + Version beginVersion, + Optional readVersion, + Version* readVersionOut) override; + + ThreadResult readBlobGranulesFinish( + ThreadFuture>> startFuture, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + ReadBlobGranuleContext granuleContext) override; + void addReadConflictRange(const KeyRangeRef& keys) override; void makeSelfConflicting(); @@ -205,6 +226,7 @@ class ThreadSafeApi : public IClientApi, ThreadSafeReferenceCounted value = Optional()) override; void setupNetwork() override; @@ -221,7 +243,7 @@ private: 
ThreadSafeApi(); int apiVersion; - const std::string clientVersion; + std::string clientVersion; uint64_t transportId; Mutex lock; diff --git a/fdbclient/vexillographer/fdb.options b/fdbclient/vexillographer/fdb.options index 242675e0b1..da092f5463 100644 --- a/fdbclient/vexillographer/fdb.options +++ b/fdbclient/vexillographer/fdb.options @@ -115,6 +115,9 @@ description is not currently required but encouraged.