From dfc11001afa9f2cad01bd4c07e60de818082eb1d Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Fri, 14 Oct 2022 12:33:33 -0700 Subject: [PATCH 01/52] Reject transactions that have been tag throttled for too long --- fdbclient/ClientKnobs.cpp | 1 + fdbclient/include/fdbclient/ClientKnobs.h | 2 ++ .../GrvProxyTransactionTagThrottler.actor.cpp | 21 +++++++++++++++++++ .../GrvProxyTransactionTagThrottler.h | 3 +++ flow/include/flow/error_definitions.h | 1 + 5 files changed, 28 insertions(+) diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index 41d218f8fe..ead11794fd 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -272,6 +272,7 @@ void ClientKnobs::initialize(Randomize randomize) { init( TAG_THROTTLE_EXPIRATION_INTERVAL, 60.0 ); if( randomize && BUGGIFY ) TAG_THROTTLE_EXPIRATION_INTERVAL = 1.0; init( WRITE_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) WRITE_COST_BYTE_FACTOR = 4096; init( READ_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) READ_COST_BYTE_FACTOR = 4096; + init( PROXY_MAX_TAG_THROTTLE, 5.0 ); if( randomize && BUGGIFY ) PROXY_MAX_TAG_THROTTLE = 0.5; // busyness reporting init( BUSYNESS_SPIKE_START_THRESHOLD, 0.100 ); diff --git a/fdbclient/include/fdbclient/ClientKnobs.h b/fdbclient/include/fdbclient/ClientKnobs.h index 6f89cdb45c..5e34eab002 100644 --- a/fdbclient/include/fdbclient/ClientKnobs.h +++ b/fdbclient/include/fdbclient/ClientKnobs.h @@ -262,6 +262,8 @@ public: double TAG_THROTTLE_EXPIRATION_INTERVAL; int64_t WRITE_COST_BYTE_FACTOR; // Used to round up the cost of write operations int64_t READ_COST_BYTE_FACTOR; // Used to round up the cost of read operations + double PROXY_MAX_TAG_THROTTLE; // Maximum duration that a transaction can be tag throttled by proxy before being + // rejected // busyness reporting double BUSYNESS_SPIKE_START_THRESHOLD; diff --git a/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp b/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp index 0ab7953344..4a83162496 100644 --- a/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp +++ b/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp @@ -19,6 +19,7 @@ */ #include "fdbserver/GrvProxyTransactionTagThrottler.h" +#include "fdbclient/Knobs.h" #include "flow/UnitTest.h" #include "flow/actorcompiler.h" // must be last include @@ -28,6 +29,10 @@ void GrvProxyTransactionTagThrottler::DelayedRequest::updateProxyTagThrottledDur req.proxyTagThrottledDuration = now() - startTime; } +bool GrvProxyTransactionTagThrottler::DelayedRequest::isTooOld() const { + return now() - startTime > CLIENT_KNOBS->PROXY_MAX_TAG_THROTTLE; +} + void GrvProxyTransactionTagThrottler::TagQueue::setRate(double rate) { if (rateInfo.present()) { rateInfo.get().setRate(rate); @@ -36,6 +41,19 @@ void GrvProxyTransactionTagThrottler::TagQueue::setRate(double rate) { } } +bool GrvProxyTransactionTagThrottler::TagQueue::isTooOld() const { + return requests.empty() || requests.front().isTooOld(); +} + +void GrvProxyTransactionTagThrottler::TagQueue::rejectRequests() { + while (!requests.empty()) { + auto& delayedReq = requests.front(); + delayedReq.updateProxyTagThrottledDuration(); + delayedReq.req.reply.sendError(proxy_tag_throttled()); + requests.pop_front(); + } +} + void GrvProxyTransactionTagThrottler::updateRates(TransactionTagMap const& newRates) { for (const auto& [tag, rate] : newRates) { auto it = queues.find(tag); @@ -140,6 +158,9 @@ void GrvProxyTransactionTagThrottler::releaseTransactions(double elapsed, // Cannot release any more transaction from this tag 
(don't push the tag queue handle back into // pqOfQueues) CODE_PROBE(true, "GrvProxyTransactionTagThrottler throttling transaction"); + if (tagQueueHandle.queue->isTooOld()) { + tagQueueHandle.queue->rejectRequests(); + } break; } else { if (tagQueueHandle.nextSeqNo < nextQueueSeqNo) { diff --git a/fdbserver/include/fdbserver/GrvProxyTransactionTagThrottler.h b/fdbserver/include/fdbserver/GrvProxyTransactionTagThrottler.h index 77a87a97b2..259bb6adc2 100644 --- a/fdbserver/include/fdbserver/GrvProxyTransactionTagThrottler.h +++ b/fdbserver/include/fdbserver/GrvProxyTransactionTagThrottler.h @@ -46,6 +46,7 @@ class GrvProxyTransactionTagThrottler { : req(req), startTime(now()), sequenceNumber(++lastSequenceNumber) {} void updateProxyTagThrottledDuration(); + bool isTooOld() const; }; struct TagQueue { @@ -56,6 +57,8 @@ class GrvProxyTransactionTagThrottler { explicit TagQueue(double rate) : rateInfo(rate) {} void setRate(double rate); + bool isTooOld() const; + void rejectRequests(); }; // Track the budgets for each tag diff --git a/flow/include/flow/error_definitions.h b/flow/include/flow/error_definitions.h index d493f3c897..396723c60c 100755 --- a/flow/include/flow/error_definitions.h +++ b/flow/include/flow/error_definitions.h @@ -131,6 +131,7 @@ ERROR( please_reboot_kv_store, 1219, "Need to reboot the storage engine") ERROR( incompatible_software_version, 1220, "Current software does not support database format" ) ERROR( audit_storage_failed, 1221, "Validate storage consistency operation failed" ) ERROR( audit_storage_exceeded_request_limit, 1222, "Exceeded the max number of allowed concurrent audit storage requests" ) +ERROR( proxy_tag_throttled, 1223, "Exceeded maximum proxy tag throttling duration" ) // 15xx Platform errors ERROR( platform_error, 1500, "Platform error" ) From 300840ea2e48756344a8b149a8478ccfeb46dde0 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Fri, 14 Oct 2022 17:08:49 -0700 Subject: [PATCH 02/52] Enable GLOBAL_TAG_THROTTLING by default --- fdbclient/ServerKnobs.cpp | 4 +- fdbserver/workloads/TagThrottleApi.actor.cpp | 2 +- ...ng.actor.cpp => ThroughputQuota.actor.cpp} | 24 ++++++----- tests/CMakeLists.txt | 2 +- tests/rare/GlobalTagThrottling.toml | 41 ------------------- tests/rare/ThroughputQuota.toml | 13 ++++++ 6 files changed, 31 insertions(+), 55 deletions(-) rename fdbserver/workloads/{GlobalTagThrottling.actor.cpp => ThroughputQuota.actor.cpp} (70%) delete mode 100644 tests/rare/GlobalTagThrottling.toml create mode 100644 tests/rare/ThroughputQuota.toml diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 1ecda0dae1..3d1d93bcb5 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -721,8 +721,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( TAG_THROTTLE_EXPIRED_CLEANUP_INTERVAL, 30.0 ); if(randomize && BUGGIFY) TAG_THROTTLE_EXPIRED_CLEANUP_INTERVAL = 1.0; init( AUTO_TAG_THROTTLING_ENABLED, true ); if(randomize && BUGGIFY) AUTO_TAG_THROTTLING_ENABLED = false; init( SS_THROTTLE_TAGS_TRACKED, 1 ); if(randomize && BUGGIFY) SS_THROTTLE_TAGS_TRACKED = deterministicRandom()->randomInt(1, 10); - init( GLOBAL_TAG_THROTTLING, false ); - init( ENFORCE_TAG_THROTTLING_ON_PROXIES, false ); + init( GLOBAL_TAG_THROTTLING, true ); + init( ENFORCE_TAG_THROTTLING_ON_PROXIES, GLOBAL_TAG_THROTTLING ); init( GLOBAL_TAG_THROTTLING_MIN_RATE, 1.0 ); init( GLOBAL_TAG_THROTTLING_FOLDING_TIME, 10.0 ); init( GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO, 5.0 ); diff --git 
a/fdbserver/workloads/TagThrottleApi.actor.cpp b/fdbserver/workloads/TagThrottleApi.actor.cpp index ac47417959..f6b8061de5 100644 --- a/fdbserver/workloads/TagThrottleApi.actor.cpp +++ b/fdbserver/workloads/TagThrottleApi.actor.cpp @@ -43,7 +43,7 @@ struct TagThrottleApiWorkload : TestWorkload { } Future start(Database const& cx) override { - if (this->clientId != 0) + if (SERVER_KNOBS->GLOBAL_TAG_THROTTLING || this->clientId != 0) return Void(); return timeout(runThrottleApi(this, cx), testDuration, Void()); } diff --git a/fdbserver/workloads/GlobalTagThrottling.actor.cpp b/fdbserver/workloads/ThroughputQuota.actor.cpp similarity index 70% rename from fdbserver/workloads/GlobalTagThrottling.actor.cpp rename to fdbserver/workloads/ThroughputQuota.actor.cpp index 2e2ec07a04..7a7e2c7d2b 100644 --- a/fdbserver/workloads/GlobalTagThrottling.actor.cpp +++ b/fdbserver/workloads/ThroughputQuota.actor.cpp @@ -1,5 +1,5 @@ /* - * GlobalTagThrottling.actor.cpp + * ThroughputQuota.actor.cpp * * This source file is part of the FoundationDB open source project * @@ -23,42 +23,46 @@ #include "flow/actorcompiler.h" // This must be the last #include. -class GlobalTagThrottlingWorkload : public TestWorkload { +// This workload sets the throughput quota of a tag during the setup phase +class ThroughputQuotaWorkload : public TestWorkload { TransactionTag transactionTag; double reservedQuota{ 0.0 }; double totalQuota{ 0.0 }; - ACTOR static Future setup(GlobalTagThrottlingWorkload* self, Database cx) { + ACTOR static Future setup(ThroughputQuotaWorkload* self, Database cx) { state Reference tr = makeReference(cx); loop { try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - TraceEvent("GlobalTagThrottlingWorkload_SettingTagQuota") - .detail("Tag", self->transactionTag) + TraceEvent("ThroughputQuotaWorkload_SettingTagQuota") + .detail("Tag", printable(self->transactionTag)) .detail("ReservedQuota", self->reservedQuota) .detail("TotalQuota", self->totalQuota); ThrottleApi::setTagQuota(tr, self->transactionTag, self->reservedQuota, self->totalQuota); wait(tr->commit()); return Void(); } catch (Error& e) { - TraceEvent("GlobalTagThrottlingWorkload_SetupError").error(e); + TraceEvent("ThroughputQuotaWorkload_SetupError").error(e); wait(tr->onError(e)); } }; } public: - static constexpr auto NAME = "GlobalTagThrottling"; - explicit GlobalTagThrottlingWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { + static constexpr auto NAME = "ThroughputQuota"; + explicit ThroughputQuotaWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { transactionTag = getOption(options, "transactionTag"_sr, "sampleTag"_sr); reservedQuota = getOption(options, "reservedQuota"_sr, 0.0); totalQuota = getOption(options, "totalQuota"_sr, 0.0); } - Future setup(Database const& cx) override { return clientId ? Void() : setup(this, cx); } + Future setup(Database const& cx) override { + DatabaseContext::debugUseTags = true; + return clientId ? 
Void() : setup(this, cx); + } Future start(Database const& cx) override { return Void(); } Future check(Database const& cx) override { return true; } void getMetrics(std::vector& m) override {} }; -WorkloadFactory GlobalTagThrottlingWorkloadFactory; +WorkloadFactory ThroughputQuotaWorkloadFactory; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e6d0730255..cba8b4169c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -226,7 +226,6 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES rare/CycleWithDeadHall.toml) add_fdb_test(TEST_FILES rare/DataDistributionMetrics.toml) add_fdb_test(TEST_FILES rare/FuzzTest.toml) - add_fdb_test(TEST_FILES rare/GlobalTagThrottling.toml IGNORE) add_fdb_test(TEST_FILES rare/HighContentionPrefixAllocator.toml) add_fdb_test(TEST_FILES rare/InventoryTestHeavyWrites.toml) add_fdb_test(TEST_FILES rare/LargeApiCorrectness.toml) @@ -240,6 +239,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES rare/RedwoodCorrectnessBTree.toml) add_fdb_test(TEST_FILES rare/RedwoodDeltaTree.toml) add_fdb_test(TEST_FILES rare/Throttling.toml) + add_fdb_test(TEST_FILES rare/ThroughputQuota.toml) add_fdb_test(TEST_FILES rare/TransactionTagApiCorrectness.toml) add_fdb_test(TEST_FILES rare/TransactionTagSwizzledApiCorrectness.toml) add_fdb_test(TEST_FILES rare/WriteTagThrottling.toml) diff --git a/tests/rare/GlobalTagThrottling.toml b/tests/rare/GlobalTagThrottling.toml deleted file mode 100644 index 499f2990f2..0000000000 --- a/tests/rare/GlobalTagThrottling.toml +++ /dev/null @@ -1,41 +0,0 @@ -[[test]] -testTitle='GlobalTagThrottling' - - [[test.knobs]] - min_tag_read_pages_rate=1.0 - global_tag_throttling=true - - [[test.workload]] - testName='GlobalTagThrottling' - transactionTag='sampleTag1' - totalQuota=1.0 - - [[test.workload]] - testName='ReadWrite' - testDuration=600.0 - transactionsPerSecond=100 - writesPerTransactionA=0 - readsPerTransactionA=10 - writesPerTransactionB=0 - readsPerTransactionB=0 - alpha=0.0 - nodeCount=10000 - valueBytes=1000 - minValueBytes=1000 - warmingDelay=60.0 - transactionTag='sampleTag1' - - [[test.workload]] - testName='ReadWrite' - testDuration=600.0 - transactionsPerSecond=100 - writesPerTransactionA=0 - readsPerTransactionA=10 - writesPerTransactionB=0 - readsPerTransactionB=0 - alpha=0.0 - nodeCount=10000 - valueBytes=1000 - minValueBytes=1000 - warmingDelay=60.0 - transactionTag='sampleTag2' diff --git a/tests/rare/ThroughputQuota.toml b/tests/rare/ThroughputQuota.toml new file mode 100644 index 0000000000..1c7d191306 --- /dev/null +++ b/tests/rare/ThroughputQuota.toml @@ -0,0 +1,13 @@ +[[test]] +testTitle='ThroughputQuotaTest' + + [[test.workload]] + testName='ThroughputQuota' + transactionTag='sampleTag1' + totalQuota=1.0 + + [[test.workload]] + testName = 'Cycle' + transactionsPerSecond = 2500.0 + testDuration = 10.0 + expectedRate = 0 From 003986fdb0a60a81f28e130c77b0e4824aefa694 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Fri, 14 Oct 2022 18:48:12 -0700 Subject: [PATCH 03/52] Randomize GLOBAL_TAG_THROTTLING knob --- fdbclient/ServerKnobs.cpp | 2 +- fdbserver/BlobWorker.actor.cpp | 2 +- fdbserver/GlobalTagThrottler.actor.cpp | 6 +++++- fdbserver/GrvProxyTransactionTagThrottler.actor.cpp | 1 + 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 3d1d93bcb5..bb860a1781 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -721,7 +721,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( 
TAG_THROTTLE_EXPIRED_CLEANUP_INTERVAL, 30.0 ); if(randomize && BUGGIFY) TAG_THROTTLE_EXPIRED_CLEANUP_INTERVAL = 1.0; init( AUTO_TAG_THROTTLING_ENABLED, true ); if(randomize && BUGGIFY) AUTO_TAG_THROTTLING_ENABLED = false; init( SS_THROTTLE_TAGS_TRACKED, 1 ); if(randomize && BUGGIFY) SS_THROTTLE_TAGS_TRACKED = deterministicRandom()->randomInt(1, 10); - init( GLOBAL_TAG_THROTTLING, true ); + init( GLOBAL_TAG_THROTTLING, true ); if(isSimulated) GLOBAL_TAG_THROTTLING = deterministicRandom()->coinflip(); init( ENFORCE_TAG_THROTTLING_ON_PROXIES, GLOBAL_TAG_THROTTLING ); init( GLOBAL_TAG_THROTTLING_MIN_RATE, 1.0 ); init( GLOBAL_TAG_THROTTLING_FOLDING_TIME, 10.0 ); diff --git a/fdbserver/BlobWorker.actor.cpp b/fdbserver/BlobWorker.actor.cpp index a1c76463ff..5bd42cea81 100644 --- a/fdbserver/BlobWorker.actor.cpp +++ b/fdbserver/BlobWorker.actor.cpp @@ -1107,7 +1107,7 @@ ACTOR Future dumpInitialSnapshotFromFDB(Reference } retries++; CODE_PROBE(true, "Granule initial snapshot failed"); - // FIXME: why can't we supress error event? + // FIXME: why can't we suppress error event? TraceEvent(retries < 10 ? SevDebug : SevWarn, "BlobGranuleInitialSnapshotRetry", bwData->id) .error(err) .detail("Granule", metadata->keyRange) diff --git a/fdbserver/GlobalTagThrottler.actor.cpp b/fdbserver/GlobalTagThrottler.actor.cpp index b468677ae4..0dcb35a603 100644 --- a/fdbserver/GlobalTagThrottler.actor.cpp +++ b/fdbserver/GlobalTagThrottler.actor.cpp @@ -202,7 +202,8 @@ class GlobalTagThrottlerImpl { for (const auto& [id, _] : throughput) { result += getCurrentCost(id, tag).orDefault(0); } - TraceEvent("GlobalTagThrottler_GetCurrentCost").detail("Tag", printable(tag)).detail("Cost", result); + // FIXME: Disabled due to noisy trace events. Fix the noise and re-enable + //TraceEvent("GlobalTagThrottler_GetCurrentCost").detail("Tag", printable(tag)).detail("Cost", result); return result; } @@ -235,10 +236,13 @@ class GlobalTagThrottlerImpl { return 1.0; } auto const transactionRate = stats.get().getTransactionRate(); + // FIXME: Disabled due to noisy trace events. Fix the noise and re-enable + /* TraceEvent("GlobalTagThrottler_GetAverageTransactionCost") .detail("Tag", tag) .detail("TransactionRate", transactionRate) .detail("Cost", cost); + */ if (transactionRate == 0.0) { return 1.0; } else { diff --git a/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp b/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp index 4a83162496..4487b28266 100644 --- a/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp +++ b/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp @@ -91,6 +91,7 @@ void GrvProxyTransactionTagThrottler::addRequest(GetReadVersionRequest const& re // SERVER_KNOBS->ENFORCE_TAG_THROTTLING_ON_PROXIES is enabled, there may be // unexpected behaviour, because only one tag is used for throttling.
TraceEvent(SevWarnAlways, "GrvProxyTransactionTagThrottler_MultipleTags") + .suppressFor(1.0) .detail("NumTags", req.tags.size()) .detail("UsingTag", printable(tag)); } From a70a007dcfe498ef672a23a476183d3ca52d181d Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Tue, 18 Oct 2022 16:27:55 -0700 Subject: [PATCH 04/52] Remove outdated comment --- fdbserver/BlobWorker.actor.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/fdbserver/BlobWorker.actor.cpp b/fdbserver/BlobWorker.actor.cpp index 5bd42cea81..ed2dbcf5da 100644 --- a/fdbserver/BlobWorker.actor.cpp +++ b/fdbserver/BlobWorker.actor.cpp @@ -1107,7 +1107,6 @@ ACTOR Future dumpInitialSnapshotFromFDB(Reference } retries++; CODE_PROBE(true, "Granule initial snapshot failed"); - // FIXME: why can't we suppress error event? TraceEvent(retries < 10 ? SevDebug : SevWarn, "BlobGranuleInitialSnapshotRetry", bwData->id) .error(err) .detail("Granule", metadata->keyRange) From a8b1154e457f7c88b2df2abcb113dc12273209a6 Mon Sep 17 00:00:00 2001 From: Ankita Kejriwal Date: Fri, 21 Oct 2022 14:53:19 -0700 Subject: [PATCH 05/52] Move the storage quota monitor from DataDistributor to TenantCache --- fdbserver/DataDistribution.actor.cpp | 32 +++---------------- fdbserver/TenantCache.actor.cpp | 25 +++++++++++++++ .../fdbserver/DataDistribution.actor.h | 4 --- fdbserver/include/fdbserver/TenantCache.h | 7 ++++ 4 files changed, 36 insertions(+), 32 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 9f01a1b2be..0edde3d55d 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -286,8 +286,6 @@ public: PromiseStream relocationProducer, relocationConsumer; Reference physicalShardCollection; - StorageQuotaInfo storageQuotaInfo; - Promise initialized; std::unordered_map>> audits; @@ -542,27 +540,6 @@ public: } }; -ACTOR Future storageQuotaTracker(Database cx, StorageQuotaInfo* storageQuotaInfo) { - loop { - state Transaction tr(cx); - loop { - try { - state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY)); - TraceEvent("StorageQuota_ReadCurrentQuotas").detail("Size", currentQuotas.size()); - for (auto const kv : currentQuotas) { - Key const key = kv.key.removePrefix(storageQuotaPrefix); - uint64_t const quota = BinaryReader::fromStringRef(kv.value, Unversioned()); - storageQuotaInfo->quotaMap[key] = quota; - } - wait(delay(5.0)); - break; - } catch (Error& e) { - wait(tr.onError(e)); - } - } - } -} - // Periodically check and log the physicalShard status; clean up empty physicalShard; ACTOR Future monitorPhysicalShardStatus(Reference self) { ASSERT(SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA); @@ -683,16 +660,15 @@ ACTOR Future dataDistribution(Reference self, self->ddId, &normalDDQueueErrors())); - actors.push_back(reportErrorsExcept(storageQuotaTracker(cx, &self->storageQuotaInfo), - "StorageQuotaTracker", - self->ddId, - &normalDDQueueErrors())); - if (ddIsTenantAware) { actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorTenantMap(), "DDTenantCacheMonitor", self->ddId, &normalDDQueueErrors())); + actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorstorageQuota(), + "StorageQuotaTracker", + self->ddId, + &normalDDQueueErrors())); actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorStorageUsage(), "StorageUsageTracker", self->ddId, diff --git a/fdbserver/TenantCache.actor.cpp b/fdbserver/TenantCache.actor.cpp index 5ff3cdb30d..47a005e27b 100644 --- 
a/fdbserver/TenantCache.actor.cpp +++ b/fdbserver/TenantCache.actor.cpp @@ -149,6 +149,27 @@ public: wait(delay(refreshInterval)); } } + + ACTOR static Future monitorstorageQuota(TenantCache* tenantCache) { + loop { + state Transaction tr(tenantCache->dbcx()); + loop { + try { + state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY)); + TraceEvent("StorageQuota_ReadCurrentQuotas").detail("Size", currentQuotas.size()); + for (auto const kv : currentQuotas) { + Key const key = kv.key.removePrefix(storageQuotaPrefix); + uint64_t const quota = BinaryReader::fromStringRef(kv.value, Unversioned()); + tenantCache->storageQuotaInfo.quotaMap[key] = quota; + } + wait(delay(5.0)); + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + } + } }; void TenantCache::insert(TenantName& tenantName, TenantMapEntry& tenant) { @@ -272,6 +293,10 @@ Future TenantCache::monitorStorageUsage() { return TenantCacheImpl::monitorStorageUsage(this); } +Future TenantCache::monitorstorageQuota() { + return TenantCacheImpl::monitorstorageQuota(this); +} + class TenantCacheUnitTest { public: ACTOR static Future InsertAndTestPresence() { diff --git a/fdbserver/include/fdbserver/DataDistribution.actor.h b/fdbserver/include/fdbserver/DataDistribution.actor.h index 2389dc0ab6..c5b39ee3eb 100644 --- a/fdbserver/include/fdbserver/DataDistribution.actor.h +++ b/fdbserver/include/fdbserver/DataDistribution.actor.h @@ -484,10 +484,6 @@ ShardSizeBounds getShardSizeBounds(KeyRangeRef shard, int64_t maxShardSize); // Determines the maximum shard size based on the size of the database int64_t getMaxShardSize(double dbSizeEstimate); -struct StorageQuotaInfo { - std::map quotaMap; -}; - #ifndef __INTEL_COMPILER #pragma endregion #endif diff --git a/fdbserver/include/fdbserver/TenantCache.h b/fdbserver/include/fdbserver/TenantCache.h index ff7e941aab..5355ebc921 100644 --- a/fdbserver/include/fdbserver/TenantCache.h +++ b/fdbserver/include/fdbserver/TenantCache.h @@ -50,6 +50,11 @@ private: uint64_t generation; TenantMapByPrefix tenantCache; + // Map from tenant names to storage quota + struct StorageQuotaInfo { + std::map quotaMap; + } storageQuotaInfo; + // mark the start of a new sweep of the tenant cache void startRefresh(); @@ -85,6 +90,8 @@ public: Future monitorStorageUsage(); + Future monitorstorageQuota(); + std::string desc() const; bool isTenantKey(KeyRef key) const; From 7647cea4e556f730e6f1b707a8870bf4c88087f6 Mon Sep 17 00:00:00 2001 From: Ankita Kejriwal Date: Fri, 21 Oct 2022 15:44:13 -0700 Subject: [PATCH 06/52] Improve the storage quota monitor code + add a knob for refresh interval --- fdbclient/ServerKnobs.cpp | 3 ++- fdbclient/include/fdbclient/ServerKnobs.h | 6 ++++-- fdbserver/DataDistribution.actor.cpp | 2 +- fdbserver/TenantCache.actor.cpp | 18 ++++++++++-------- fdbserver/include/fdbserver/TenantCache.h | 8 ++++---- 5 files changed, 21 insertions(+), 16 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index dca0e22058..15c1718f38 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -297,7 +297,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC, isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 
0: 120; init( DD_TENANT_AWARENESS_ENABLED, false ); init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); - init( TENANT_CACHE_STORAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); + init( TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); + init( TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL, 10 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); // TeamRemover init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true diff --git a/fdbclient/include/fdbclient/ServerKnobs.h b/fdbclient/include/fdbclient/ServerKnobs.h index 8b172bd438..b77d1cbad0 100644 --- a/fdbclient/include/fdbclient/ServerKnobs.h +++ b/fdbclient/include/fdbclient/ServerKnobs.h @@ -237,8 +237,10 @@ public: DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC; // Minimal age of a correct-configured server before it's chosen to be wiggled bool DD_TENANT_AWARENESS_ENABLED; int TENANT_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantCache is refreshed - int TENANT_CACHE_STORAGE_REFRESH_INTERVAL; // How often the storage bytes used by each tenant in the TenantCache is - // refreshed + int TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL; // How often the storage bytes used by each tenant is refreshed + // in the TenantCache + int TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL; // How often the storage quota allocated to each tenant is + // refreshed in the TenantCache // TeamRemover to remove redundant teams bool TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER; // disable the machineTeamRemover actor diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 0edde3d55d..2caaf15832 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -665,7 +665,7 @@ ACTOR Future dataDistribution(Reference self, "DDTenantCacheMonitor", self->ddId, &normalDDQueueErrors())); - actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorstorageQuota(), + actors.push_back(reportErrorsExcept(ddTenantCache.get()->monitorStorageQuota(), "StorageQuotaTracker", self->ddId, &normalDDQueueErrors())); diff --git a/fdbserver/TenantCache.actor.cpp b/fdbserver/TenantCache.actor.cpp index 47a005e27b..8631b866d6 100644 --- a/fdbserver/TenantCache.actor.cpp +++ b/fdbserver/TenantCache.actor.cpp @@ -122,7 +122,7 @@ public: ACTOR static Future monitorStorageUsage(TenantCache* tenantCache) { TraceEvent(SevInfo, "StartingTenantCacheStorageUsageMonitor", tenantCache->id()).log(); - state int refreshInterval = SERVER_KNOBS->TENANT_CACHE_STORAGE_REFRESH_INTERVAL; + state int refreshInterval = SERVER_KNOBS->TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL; state double lastTenantListFetchTime = now(); loop { @@ -150,21 +150,23 @@ public: } } - ACTOR static Future monitorstorageQuota(TenantCache* tenantCache) { + ACTOR static Future monitorStorageQuota(TenantCache* tenantCache) { + TraceEvent(SevInfo, "StartingTenantCacheStorageQuotaMonitor", tenantCache->id()).log(); + loop { state Transaction tr(tenantCache->dbcx()); loop { try { state RangeResult currentQuotas = 
wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY)); - TraceEvent("StorageQuota_ReadCurrentQuotas").detail("Size", currentQuotas.size()); for (auto const kv : currentQuotas) { - Key const key = kv.key.removePrefix(storageQuotaPrefix); + TenantName const tenant = kv.key.removePrefix(storageQuotaPrefix); uint64_t const quota = BinaryReader::fromStringRef(kv.value, Unversioned()); - tenantCache->storageQuotaInfo.quotaMap[key] = quota; + tenantCache->tenantStorageMap[tenant] = quota; } - wait(delay(5.0)); + wait(delay(SERVER_KNOBS->TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL)); break; } catch (Error& e) { + TraceEvent("TenantCacheGetStorageQuotaError", tenantCache->id()).error(e); wait(tr.onError(e)); } } @@ -293,8 +295,8 @@ Future TenantCache::monitorStorageUsage() { return TenantCacheImpl::monitorStorageUsage(this); } -Future TenantCache::monitorstorageQuota() { - return TenantCacheImpl::monitorstorageQuota(this); +Future TenantCache::monitorStorageQuota() { + return TenantCacheImpl::monitorStorageQuota(this); } class TenantCacheUnitTest { diff --git a/fdbserver/include/fdbserver/TenantCache.h b/fdbserver/include/fdbserver/TenantCache.h index 5355ebc921..95c9402921 100644 --- a/fdbserver/include/fdbserver/TenantCache.h +++ b/fdbserver/include/fdbserver/TenantCache.h @@ -32,6 +32,8 @@ typedef Map> TenantMapByPrefix; +typedef std::unordered_map TenantStorageMap; + struct TenantCacheTenantCreated { KeyRange keys; Promise reply; @@ -51,9 +53,7 @@ private: TenantMapByPrefix tenantCache; // Map from tenant names to storage quota - struct StorageQuotaInfo { - std::map quotaMap; - } storageQuotaInfo; + TenantStorageMap tenantStorageMap; // mark the start of a new sweep of the tenant cache void startRefresh(); @@ -90,7 +90,7 @@ public: Future monitorStorageUsage(); - Future monitorstorageQuota(); + Future monitorStorageQuota(); std::string desc() const; From 03fb3d2cfe321bf648f46bec950822c23f48d5a7 Mon Sep 17 00:00:00 2001 From: Ankita Kejriwal Date: Fri, 21 Oct 2022 16:11:13 -0700 Subject: [PATCH 07/52] Simplify how tenants' storage usage is stored in TenantCache --- fdbrpc/include/fdbrpc/TenantInfo.h | 2 -- fdbserver/TenantCache.actor.cpp | 23 ++++++++--------------- fdbserver/include/fdbserver/TCInfo.h | 1 - fdbserver/include/fdbserver/TenantCache.h | 15 ++++++++------- 4 files changed, 16 insertions(+), 25 deletions(-) diff --git a/fdbrpc/include/fdbrpc/TenantInfo.h b/fdbrpc/include/fdbrpc/TenantInfo.h index 0a5822fb77..f05412d7d4 100644 --- a/fdbrpc/include/fdbrpc/TenantInfo.h +++ b/fdbrpc/include/fdbrpc/TenantInfo.h @@ -42,8 +42,6 @@ struct TenantInfo { // Is set during deserialization. It will be set to true if the tenant // name is set and the client is authorized to use this tenant. bool tenantAuthorized = false; - // Number of storage bytes currently used by this tenant. - int64_t storageUsage = 0; // Helper function for most endpoints that read/write data. 
This returns true iff // the client is either a) a trusted peer or b) is accessing keyspace belonging to a tenant, diff --git a/fdbserver/TenantCache.actor.cpp b/fdbserver/TenantCache.actor.cpp index 8631b866d6..aba790f152 100644 --- a/fdbserver/TenantCache.actor.cpp +++ b/fdbserver/TenantCache.actor.cpp @@ -127,14 +127,14 @@ public: loop { state double fetchStartTime = now(); - state std::vector> tenantList = tenantCache->getTenantList(); + state std::vector tenants = tenantCache->getTenantList(); state int i; - for (i = 0; i < tenantList.size(); i++) { - state ReadYourWritesTransaction tr(tenantCache->dbcx(), tenantList[i].second); + for (i = 0; i < tenants.size(); i++) { + state ReadYourWritesTransaction tr(tenantCache->dbcx(), tenants[i]); loop { try { state int64_t size = wait(tr.getEstimatedRangeSizeBytes(normalKeys)); - tenantCache->updateStorageUsage(tenantList[i].first, size); + tenantCache->tenantStorageMap[tenants[i]].usage = size; } catch (Error& e) { TraceEvent("TenantCacheGetStorageUsageError", tenantCache->id()).error(e); wait(tr.onError(e)); @@ -161,7 +161,7 @@ public: for (auto const kv : currentQuotas) { TenantName const tenant = kv.key.removePrefix(storageQuotaPrefix); uint64_t const quota = BinaryReader::fromStringRef(kv.value, Unversioned()); - tenantCache->tenantStorageMap[tenant] = quota; + tenantCache->tenantStorageMap[tenant].quota = quota; } wait(delay(SERVER_KNOBS->TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL)); break; @@ -226,21 +226,14 @@ int TenantCache::cleanup() { return tenantsRemoved; } -std::vector> TenantCache::getTenantList() const { - std::vector> tenants; +std::vector TenantCache::getTenantList() const { + std::vector tenants; for (const auto& [prefix, entry] : tenantCache) { - tenants.push_back({ prefix, entry->name() }); + tenants.push_back(entry->name()); } return tenants; } -void TenantCache::updateStorageUsage(KeyRef prefix, int64_t size) { - auto it = tenantCache.find(prefix); - if (it != tenantCache.end()) { - it->value->updateStorageUsage(size); - } -} - std::string TenantCache::desc() const { std::string s("@Generation: "); s += std::to_string(generation) + " "; diff --git a/fdbserver/include/fdbserver/TCInfo.h b/fdbserver/include/fdbserver/TCInfo.h index 9b6edc118f..4911658bb2 100644 --- a/fdbserver/include/fdbserver/TCInfo.h +++ b/fdbserver/include/fdbserver/TCInfo.h @@ -268,5 +268,4 @@ public: void removeTeam(TCTeamInfo team); void updateCacheGeneration(int64_t generation) { m_cacheGeneration = generation; } int64_t cacheGeneration() const { return m_cacheGeneration; } - void updateStorageUsage(int64_t size) { m_tenantInfo.storageUsage = size; } }; diff --git a/fdbserver/include/fdbserver/TenantCache.h b/fdbserver/include/fdbserver/TenantCache.h index 95c9402921..d93d473360 100644 --- a/fdbserver/include/fdbserver/TenantCache.h +++ b/fdbserver/include/fdbserver/TenantCache.h @@ -32,7 +32,11 @@ typedef Map> TenantMapByPrefix; -typedef std::unordered_map TenantStorageMap; +struct Storage { + uint64_t quota; + int64_t usage; +}; +typedef std::unordered_map TenantStorageMap; struct TenantCacheTenantCreated { KeyRange keys; @@ -52,7 +56,7 @@ private: uint64_t generation; TenantMapByPrefix tenantCache; - // Map from tenant names to storage quota + // Map from tenant names to storage quota and usage TenantStorageMap tenantStorageMap; // mark the start of a new sweep of the tenant cache @@ -67,11 +71,8 @@ private: // return count of tenants that were found to be stale and removed from the cache int cleanup(); - // return the mapping from prefix -> 
tenant name for all tenants stored in the cache - std::vector<std::pair<KeyRef, TenantName>> getTenantList() const; - - // update the size for a tenant; do nothing if the tenant doesn't exist in the map - void updateStorageUsage(KeyRef prefix, int64_t size); + // return all the TenantName for all tenants stored in the cache + std::vector<TenantName> getTenantList() const; UID id() const { return distributorID; } From c34a23152cdeb180b066f61c4f666c07edaacfd1 Mon Sep 17 00:00:00 2001 From: Ankita Kejriwal Date: Fri, 21 Oct 2022 16:18:52 -0700 Subject: [PATCH 08/52] Change the storage quota type from uint64_t to int64_t With this change, the storage quota will be of the same type as the storage bytes used returned by `getEstimatedRangeSizeBytes`. --- fdbclient/ManagementAPI.actor.cpp | 10 +++++----- fdbclient/include/fdbclient/ManagementAPI.actor.h | 4 ++-- fdbserver/TenantCache.actor.cpp | 2 +- fdbserver/include/fdbserver/TenantCache.h | 2 +- fdbserver/workloads/StorageQuota.actor.cpp | 12 ++++++------ 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 665fbd9274..d4382ee086 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -2559,19 +2559,19 @@ bool schemaMatch(json_spirit::mValue const& schemaValue, } } -void setStorageQuota(Transaction& tr, StringRef tenantName, uint64_t quota) { +void setStorageQuota(Transaction& tr, StringRef tenantName, int64_t quota) { tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); auto key = storageQuotaKey(tenantName); - tr.set(key, BinaryWriter::toValue<uint64_t>(quota, Unversioned())); + tr.set(key, BinaryWriter::toValue<int64_t>(quota, Unversioned())); } -ACTOR Future<Optional<uint64_t>> getStorageQuota(Transaction* tr, StringRef tenantName) { +ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantName) { tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); state Optional<Value> v = wait(tr->get(storageQuotaKey(tenantName))); if (!v.present()) { - return Optional<uint64_t>(); + return Optional<int64_t>(); } - return BinaryReader::fromStringRef<uint64_t>(v.get(), Unversioned()); + return BinaryReader::fromStringRef<int64_t>(v.get(), Unversioned()); } std::string ManagementAPI::generateErrorMessage(const CoordinatorsResult& res) { diff --git a/fdbclient/include/fdbclient/ManagementAPI.actor.h b/fdbclient/include/fdbclient/ManagementAPI.actor.h index bd19da06f6..e220f0b156 100644 --- a/fdbclient/include/fdbclient/ManagementAPI.actor.h +++ b/fdbclient/include/fdbclient/ManagementAPI.actor.h @@ -164,8 +164,8 @@ bool schemaMatch(json_spirit::mValue const& schema, ACTOR Future<Void> mgmtSnapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID); // Set and get the storage quota per tenant -void setStorageQuota(Transaction& tr, StringRef tenantName, uint64_t quota); -ACTOR Future<Optional<uint64_t>> getStorageQuota(Transaction* tr, StringRef tenantName); +void setStorageQuota(Transaction& tr, StringRef tenantName, int64_t quota); +ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantName); #include "flow/unactorcompiler.h" #endif diff --git a/fdbserver/TenantCache.actor.cpp b/fdbserver/TenantCache.actor.cpp index aba790f152..7425880e1e 100644 --- a/fdbserver/TenantCache.actor.cpp +++ b/fdbserver/TenantCache.actor.cpp @@ -160,7 +160,7 @@ public: state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY)); for (auto const kv : currentQuotas) { TenantName const tenant = kv.key.removePrefix(storageQuotaPrefix); - uint64_t const quota = BinaryReader::fromStringRef(kv.value, Unversioned()); + int64_t const quota =
BinaryReader::fromStringRef(kv.value, Unversioned()); tenantCache->tenantStorageMap[tenant].quota = quota; } wait(delay(SERVER_KNOBS->TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL)); diff --git a/fdbserver/include/fdbserver/TenantCache.h b/fdbserver/include/fdbserver/TenantCache.h index d93d473360..5a6278203d 100644 --- a/fdbserver/include/fdbserver/TenantCache.h +++ b/fdbserver/include/fdbserver/TenantCache.h @@ -33,7 +33,7 @@ typedef Map> TenantMapByPrefix; struct Storage { - uint64_t quota; + int64_t quota; int64_t usage; }; typedef std::unordered_map TenantStorageMap; diff --git a/fdbserver/workloads/StorageQuota.actor.cpp b/fdbserver/workloads/StorageQuota.actor.cpp index 1623083412..469832550e 100644 --- a/fdbserver/workloads/StorageQuota.actor.cpp +++ b/fdbserver/workloads/StorageQuota.actor.cpp @@ -38,17 +38,17 @@ struct StorageQuotaWorkload : TestWorkload { wait(setStorageQuotaHelper(cx, "name2"_sr, 200)); wait(setStorageQuotaHelper(cx, "name1"_sr, 300)); - state Optional quota1 = wait(getStorageQuotaHelper(cx, "name1"_sr)); + state Optional quota1 = wait(getStorageQuotaHelper(cx, "name1"_sr)); ASSERT(quota1.present() && quota1.get() == 300); - state Optional quota2 = wait(getStorageQuotaHelper(cx, "name2"_sr)); + state Optional quota2 = wait(getStorageQuotaHelper(cx, "name2"_sr)); ASSERT(quota2.present() && quota2.get() == 200); - state Optional quota3 = wait(getStorageQuotaHelper(cx, "name3"_sr)); + state Optional quota3 = wait(getStorageQuotaHelper(cx, "name3"_sr)); ASSERT(!quota3.present()); return Void(); } - ACTOR static Future setStorageQuotaHelper(Database cx, StringRef tenantName, uint64_t quota) { + ACTOR static Future setStorageQuotaHelper(Database cx, StringRef tenantName, int64_t quota) { state Transaction tr(cx); loop { try { @@ -61,11 +61,11 @@ struct StorageQuotaWorkload : TestWorkload { } } - ACTOR static Future> getStorageQuotaHelper(Database cx, StringRef tenantName) { + ACTOR static Future> getStorageQuotaHelper(Database cx, StringRef tenantName) { state Transaction tr(cx); loop { try { - state Optional quota = wait(getStorageQuota(&tr, tenantName)); + state Optional quota = wait(getStorageQuota(&tr, tenantName)); wait(tr.commit()); return quota; } catch (Error& e) { From b98366351b27e1d04dc60d199191a7039975eadf Mon Sep 17 00:00:00 2001 From: Ankita Kejriwal Date: Fri, 21 Oct 2022 16:27:43 -0700 Subject: [PATCH 09/52] Add a function in TenantCache to get list of tenants over storage quota --- fdbserver/TenantCache.actor.cpp | 10 ++++++++++ fdbserver/include/fdbserver/TenantCache.h | 7 +++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/fdbserver/TenantCache.actor.cpp b/fdbserver/TenantCache.actor.cpp index 7425880e1e..bba438e3a3 100644 --- a/fdbserver/TenantCache.actor.cpp +++ b/fdbserver/TenantCache.actor.cpp @@ -280,6 +280,16 @@ Optional> TenantCache::tenantOwning(KeyRef key) const { return it->value; } +std::vector TenantCache::getTenantsOverQuota() const { + std::vector tenants; + for (const auto& [tenant, storage] : tenantStorageMap) { + if (storage.usage > storage.quota) { + tenants.push_back(tenant); + } + } + return tenants; +} + Future TenantCache::monitorTenantMap() { return TenantCacheImpl::monitorTenantMap(this); } diff --git a/fdbserver/include/fdbserver/TenantCache.h b/fdbserver/include/fdbserver/TenantCache.h index 5a6278203d..32c9a0eeb6 100644 --- a/fdbserver/include/fdbserver/TenantCache.h +++ b/fdbserver/include/fdbserver/TenantCache.h @@ -33,8 +33,8 @@ typedef Map> TenantMapByPrefix; struct Storage { - int64_t quota; - 
int64_t usage; + int64_t quota = std::numeric_limits::max(); + int64_t usage = 0; }; typedef std::unordered_map TenantStorageMap; @@ -98,4 +98,7 @@ public: bool isTenantKey(KeyRef key) const; Optional> tenantOwning(KeyRef key) const; + + // Get the list of tenants where the storage bytes currently used is greater than the quota allocated + std::vector getTenantsOverQuota() const; }; From aa99b89d53103d1b3a9a6684cf801ad0daf622fd Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Fri, 21 Oct 2022 17:02:02 -0700 Subject: [PATCH 10/52] Don't fail ConsistencyCheck on first mismatch ConsistencyCheck fails when it sees the first corrupted shard. We may want to keep it running so that we can see all the corrupted data in logs. --- fdbserver/ConsistencyScan.actor.cpp | 5 +++-- fdbserver/workloads/ConsistencyCheck.actor.cpp | 7 ++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/fdbserver/ConsistencyScan.actor.cpp b/fdbserver/ConsistencyScan.actor.cpp index f961cb20a0..34a36b4629 100644 --- a/fdbserver/ConsistencyScan.actor.cpp +++ b/fdbserver/ConsistencyScan.actor.cpp @@ -393,6 +393,7 @@ ACTOR Future checkDataConsistency(Database cx, state double rateLimiterStartTime = now(); state int64_t bytesReadInthisRound = 0; state bool resume = !(restart || shuffleShards); + state bool testResult = true; state double dbSize = 100e12; if (g_network->isSimulated()) { @@ -710,7 +711,7 @@ ACTOR Future checkDataConsistency(Database cx, (!storageServerInterfaces[j].isTss() && !storageServerInterfaces[firstValidServer].isTss())) { testFailure("Data inconsistent", performQuiescentChecks, true); - return false; + testResult = false; } } } @@ -949,7 +950,7 @@ ACTOR Future checkDataConsistency(Database cx, } *bytesReadInPrevRound = bytesReadInthisRound; - return true; + return testResult; } ACTOR Future runDataValidationCheck(ConsistencyScanData* self) { diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 6abeb66e5f..7c53744b13 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -394,6 +394,7 @@ struct ConsistencyCheckWorkload : TestWorkload { state Standalone> serverList; // "\xff/serverList/[[serverID]]" := "[[StorageServerInterface]]" state Standalone> serverTag; // "\xff/serverTag/[[serverID]]" = "[[Tag]]" + state bool testResult = true; std::vector> cacheResultsPromise; cacheResultsPromise.push_back(self->fetchKeyValuesFromSS(cx, self, storageCacheKeys, cacheKeyPromise, true)); @@ -581,7 +582,7 @@ struct ConsistencyCheckWorkload : TestWorkload { for (j = 0; j < keyValueFutures.size(); j++) { ErrorOr rangeResult = keyValueFutures[j].get(); // if (rangeResult.isError()) { - // throw rangeResult.getError(); + // throw rangeResult.getError(); // } // Compare the results with other storage servers @@ -709,7 +710,7 @@ struct ConsistencyCheckWorkload : TestWorkload { .detail("MatchingKVPairs", matchingKVPairs); self->testFailure("Data inconsistent", true); - return false; + testResult = false; } } } @@ -755,7 +756,7 @@ struct ConsistencyCheckWorkload : TestWorkload { .detail("BytesRead", bytesReadInRange); } } - return true; + return testResult; } // Directly fetch key/values from storage servers through GetKeyValuesRequest From 56cdb687bed0509ed38f95eeec76974c454528f1 Mon Sep 17 00:00:00 2001 From: Ankita Kejriwal Date: Fri, 21 Oct 2022 18:12:22 -0700 Subject: [PATCH 11/52] Code fixes in TenantCache monitors --- fdbserver/TenantCache.actor.cpp | 7 +++++-- 1 file changed, 5 
insertions(+), 2 deletions(-) diff --git a/fdbserver/TenantCache.actor.cpp b/fdbserver/TenantCache.actor.cpp index bba438e3a3..690a0dc871 100644 --- a/fdbserver/TenantCache.actor.cpp +++ b/fdbserver/TenantCache.actor.cpp @@ -135,6 +135,7 @@ public: try { state int64_t size = wait(tr.getEstimatedRangeSizeBytes(normalKeys)); tenantCache->tenantStorageMap[tenants[i]].usage = size; + break; } catch (Error& e) { TraceEvent("TenantCacheGetStorageUsageError", tenantCache->id()).error(e); wait(tr.onError(e)); @@ -153,8 +154,9 @@ public: ACTOR static Future monitorStorageQuota(TenantCache* tenantCache) { TraceEvent(SevInfo, "StartingTenantCacheStorageQuotaMonitor", tenantCache->id()).log(); + state Transaction tr(tenantCache->dbcx()); + loop { - state Transaction tr(tenantCache->dbcx()); loop { try { state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY)); @@ -163,13 +165,14 @@ public: int64_t const quota = BinaryReader::fromStringRef(kv.value, Unversioned()); tenantCache->tenantStorageMap[tenant].quota = quota; } - wait(delay(SERVER_KNOBS->TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL)); + tr.reset(); break; } catch (Error& e) { TraceEvent("TenantCacheGetStorageQuotaError", tenantCache->id()).error(e); wait(tr.onError(e)); } } + wait(delay(SERVER_KNOBS->TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL)); } } }; From 04ae47b9b9a3ba2c9efc4aadace122d32067a608 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Fri, 21 Oct 2022 22:17:33 -0700 Subject: [PATCH 12/52] Addressed review comments --- fdbclient/ClientKnobs.cpp | 2 +- fdbclient/include/fdbclient/ClientKnobs.h | 4 ++-- .../GrvProxyTransactionTagThrottler.actor.cpp | 14 ++++++++------ .../fdbserver/GrvProxyTransactionTagThrottler.h | 4 ++-- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index ead11794fd..36fdaea5b0 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -272,7 +272,7 @@ void ClientKnobs::initialize(Randomize randomize) { init( TAG_THROTTLE_EXPIRATION_INTERVAL, 60.0 ); if( randomize && BUGGIFY ) TAG_THROTTLE_EXPIRATION_INTERVAL = 1.0; init( WRITE_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) WRITE_COST_BYTE_FACTOR = 4096; init( READ_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) READ_COST_BYTE_FACTOR = 4096; - init( PROXY_MAX_TAG_THROTTLE, 5.0 ); if( randomize && BUGGIFY ) PROXY_MAX_TAG_THROTTLE = 0.5; + init( PROXY_MAX_TAG_THROTTLE_DURATION, 5.0 ); if( randomize && BUGGIFY ) PROXY_MAX_TAG_THROTTLE_DURATION = 0.5; // busyness reporting init( BUSYNESS_SPIKE_START_THRESHOLD, 0.100 ); diff --git a/fdbclient/include/fdbclient/ClientKnobs.h b/fdbclient/include/fdbclient/ClientKnobs.h index 5e34eab002..b4961ba3f7 100644 --- a/fdbclient/include/fdbclient/ClientKnobs.h +++ b/fdbclient/include/fdbclient/ClientKnobs.h @@ -262,8 +262,8 @@ public: double TAG_THROTTLE_EXPIRATION_INTERVAL; int64_t WRITE_COST_BYTE_FACTOR; // Used to round up the cost of write operations int64_t READ_COST_BYTE_FACTOR; // Used to round up the cost of read operations - double PROXY_MAX_TAG_THROTTLE; // Maximum duration that a transaction can be tag throttled by proxy before being - // rejected + double PROXY_MAX_TAG_THROTTLE_DURATION; // Maximum duration that a transaction can be tag throttled by proxy before + // being rejected // busyness reporting double BUSYNESS_SPIKE_START_THRESHOLD; diff --git a/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp b/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp index 
4487b28266..ae71d575a8 100644 --- a/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp +++ b/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp @@ -18,8 +18,8 @@ * limitations under the License. */ -#include "fdbserver/GrvProxyTransactionTagThrottler.h" #include "fdbclient/Knobs.h" +#include "fdbserver/GrvProxyTransactionTagThrottler.h" #include "flow/UnitTest.h" #include "flow/actorcompiler.h" // must be last include @@ -29,8 +29,8 @@ void GrvProxyTransactionTagThrottler::DelayedRequest::updateProxyTagThrottledDur req.proxyTagThrottledDuration = now() - startTime; } -bool GrvProxyTransactionTagThrottler::DelayedRequest::isTooOld() const { - return now() - startTime > CLIENT_KNOBS->PROXY_MAX_TAG_THROTTLE; +bool GrvProxyTransactionTagThrottler::DelayedRequest::isMaxThrottled() const { + return now() - startTime > CLIENT_KNOBS->PROXY_MAX_TAG_THROTTLE_DURATION; } void GrvProxyTransactionTagThrottler::TagQueue::setRate(double rate) { @@ -41,8 +41,8 @@ void GrvProxyTransactionTagThrottler::TagQueue::setRate(double rate) { } } -bool GrvProxyTransactionTagThrottler::TagQueue::isTooOld() const { - return requests.empty() || requests.front().isTooOld(); +bool GrvProxyTransactionTagThrottler::TagQueue::isMaxThrottled() const { + return !requests.empty() && requests.front().isMaxThrottled(); } void GrvProxyTransactionTagThrottler::TagQueue::rejectRequests() { @@ -159,7 +159,9 @@ void GrvProxyTransactionTagThrottler::releaseTransactions(double elapsed, // Cannot release any more transaction from this tag (don't push the tag queue handle back into // pqOfQueues) CODE_PROBE(true, "GrvProxyTransactionTagThrottler throttling transaction"); - if (tagQueueHandle.queue->isTooOld()) { + if (tagQueueHandle.queue->isMaxThrottled()) { + // Requests in this queue have been throttled too long and errors + // should be sent to clients. 
tagQueueHandle.queue->rejectRequests(); } break; diff --git a/fdbserver/include/fdbserver/GrvProxyTransactionTagThrottler.h b/fdbserver/include/fdbserver/GrvProxyTransactionTagThrottler.h index 259bb6adc2..887ff9ffb0 100644 --- a/fdbserver/include/fdbserver/GrvProxyTransactionTagThrottler.h +++ b/fdbserver/include/fdbserver/GrvProxyTransactionTagThrottler.h @@ -46,7 +46,7 @@ class GrvProxyTransactionTagThrottler { : req(req), startTime(now()), sequenceNumber(++lastSequenceNumber) {} void updateProxyTagThrottledDuration(); - bool isTooOld() const; + bool isMaxThrottled() const; }; struct TagQueue { @@ -57,7 +57,7 @@ class GrvProxyTransactionTagThrottler { explicit TagQueue(double rate) : rateInfo(rate) {} void setRate(double rate); - bool isTooOld() const; + bool isMaxThrottled() const; void rejectRequests(); }; From 1ae98808f997dd3091a141de2ff6fef55cadf015 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Sat, 22 Oct 2022 07:12:03 -0700 Subject: [PATCH 13/52] Retry on proxy_tag_throttled errors --- fdbclient/NativeAPI.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index bf5483f82c..7372e66f6d 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -6558,7 +6558,7 @@ ACTOR static Future tryCommit(Reference trState, e.code() != error_code_grv_proxy_memory_limit_exceeded && e.code() != error_code_batch_transaction_throttled && e.code() != error_code_tag_throttled && e.code() != error_code_process_behind && e.code() != error_code_future_version && - e.code() != error_code_tenant_not_found) { + e.code() != error_code_tenant_not_found && e.code() != error_code_proxy_tag_throttled) { TraceEvent(SevError, "TryCommitError").error(e); } if (trState->trLogInfo) @@ -7484,7 +7484,7 @@ Future Transaction::onError(Error const& e) { e.code() == error_code_database_locked || e.code() == error_code_commit_proxy_memory_limit_exceeded || e.code() == error_code_grv_proxy_memory_limit_exceeded || e.code() == error_code_process_behind || e.code() == error_code_batch_transaction_throttled || e.code() == error_code_tag_throttled || - e.code() == error_code_blob_granule_request_failed) { + e.code() == error_code_blob_granule_request_failed || e.code() == error_code_proxy_tag_throttled) { if (e.code() == error_code_not_committed) ++trState->cx->transactionsNotCommitted; else if (e.code() == error_code_commit_unknown_result) From da4ceaf73586599792ea2b2decab0b344e740b79 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Sun, 23 Oct 2022 10:39:59 -0700 Subject: [PATCH 14/52] Improve code coverage for proxy-level transaction tag throttling --- fdbclient/NativeAPI.actor.cpp | 4 +++- fdbserver/GrvProxyTransactionTagThrottler.actor.cpp | 1 + tests/rare/ThroughputQuota.toml | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 7372e66f6d..2bf00bc4ba 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -6999,6 +6999,8 @@ ACTOR Future getConsistentReadVersion(SpanContext parentSpa &GrvProxyInterface::getConsistentReadVersion, req, cx->taskID))) { + CODE_PROBE(v.proxyTagThrottledDuration > 0.0, + "getConsistentReadVersion received GetReadVersionReply delayed by proxy tag throttling"); if (tags.size() != 0) { auto& priorityThrottledTags = cx->throttledTags[priority]; for (auto& tag : tags) { @@ -7033,7 +7035,7 @@ ACTOR Future getConsistentReadVersion(SpanContext parentSpa } } catch 
(Error& e) { if (e.code() != error_code_broken_promise && e.code() != error_code_batch_transaction_throttled && - e.code() != error_code_grv_proxy_memory_limit_exceeded) + e.code() != error_code_grv_proxy_memory_limit_exceeded && e.code() != error_code_proxy_tag_throttled) TraceEvent(SevError, "GetConsistentReadVersionError").error(e); if (e.code() == error_code_batch_transaction_throttled && !cx->apiVersionAtLeast(630)) { wait(delayJittered(5.0)); diff --git a/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp b/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp index ae71d575a8..d6cf76bc63 100644 --- a/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp +++ b/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp @@ -46,6 +46,7 @@ bool GrvProxyTransactionTagThrottler::TagQueue::isMaxThrottled() const { } void GrvProxyTransactionTagThrottler::TagQueue::rejectRequests() { + CODE_PROBE(true, "GrvProxyTransactionTagThrottler rejecting requests"); while (!requests.empty()) { auto& delayedReq = requests.front(); delayedReq.updateProxyTagThrottledDuration(); diff --git a/tests/rare/ThroughputQuota.toml b/tests/rare/ThroughputQuota.toml index 1c7d191306..9ab3c0aa4b 100644 --- a/tests/rare/ThroughputQuota.toml +++ b/tests/rare/ThroughputQuota.toml @@ -3,7 +3,7 @@ testTitle='ThroughputQuotaTest' [[test.workload]] testName='ThroughputQuota' - transactionTag='sampleTag1' + transactionTag='a' totalQuota=1.0 [[test.workload]] From 70eb9aef28b140f4c85d1c57813b3bc948ddfe8d Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 18 Oct 2022 13:24:24 -0700 Subject: [PATCH 15/52] simple MGS IKeyLocationService implementation --- fdbclient/FDBTypes.cpp | 10 ++++ fdbclient/NativeAPI.actor.cpp | 21 +++----- fdbclient/include/fdbclient/FDBTypes.h | 2 + .../include/fdbclient/KeyLocationService.h | 48 +++++++++++++++++++ fdbserver/DDTxnProcessor.actor.cpp | 3 +- fdbserver/MockGlobalState.cpp | 46 ++++++++++++++++++ fdbserver/include/fdbserver/DDTxnProcessor.h | 1 + fdbserver/include/fdbserver/MockGlobalState.h | 27 ++++++++++- 8 files changed, 140 insertions(+), 18 deletions(-) create mode 100644 fdbclient/include/fdbclient/KeyLocationService.h diff --git a/fdbclient/FDBTypes.cpp b/fdbclient/FDBTypes.cpp index ff268075bd..edd016d391 100644 --- a/fdbclient/FDBTypes.cpp +++ b/fdbclient/FDBTypes.cpp @@ -22,6 +22,16 @@ #include "fdbclient/Knobs.h" #include "fdbclient/NativeAPI.actor.h" +KeyRangeRef toPrefixRelativeRange(KeyRangeRef range, KeyRef prefix) { + if (prefix.empty()) { + return range; + } else { + KeyRef begin = range.begin.startsWith(prefix) ? range.begin.removePrefix(prefix) : allKeys.begin; + KeyRef end = range.end.startsWith(prefix) ? range.end.removePrefix(prefix) : allKeys.end; + return KeyRangeRef(begin, end); + } +} + KeyRef keyBetween(const KeyRangeRef& keys) { int pos = 0; // will be the position of the first difference between keys.begin and keys.end int minSize = std::min(keys.begin.size(), keys.end.size()); diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index bf5483f82c..4c1b75d905 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1479,16 +1479,6 @@ Future HealthMetricsRangeImpl::getRange(ReadYourWritesTransaction* return healthMetricsGetRangeActor(ryw, kr); } -KeyRangeRef toRelativeRange(KeyRangeRef range, KeyRef prefix) { - if (prefix.empty()) { - return range; - } else { - KeyRef begin = range.begin.startsWith(prefix) ? range.begin.removePrefix(prefix) : allKeys.begin; - KeyRef end = range.end.startsWith(prefix) ? 
range.end.removePrefix(prefix) : allKeys.end; - return KeyRangeRef(begin, end); - } -} - ACTOR Future getClusterId(Database db) { while (!db->clientInfo->get().clusterId.isValid()) { wait(db->clientInfo->onChange()); @@ -1925,7 +1915,7 @@ Optional DatabaseContext::getCachedLocation(const Optional auto range = isBackward ? locationCache.rangeContainingKeyBefore(resolvedKey) : locationCache.rangeContaining(resolvedKey); if (range->value()) { - return KeyRangeLocationInfo(tenantEntry, toRelativeRange(range->range(), tenantEntry.prefix), range->value()); + return KeyRangeLocationInfo(tenantEntry, toPrefixRelativeRange(range->range(), tenantEntry.prefix), range->value()); } return Optional(); @@ -1962,7 +1952,7 @@ bool DatabaseContext::getCachedLocations(const Optional& tenantNa result.clear(); return false; } - result.emplace_back(tenantEntry, toRelativeRange(r->range() & resolvedRange, tenantEntry.prefix), r->value()); + result.emplace_back(tenantEntry, toPrefixRelativeRange(r->range() & resolvedRange, tenantEntry.prefix), r->value()); if (result.size() == limit || begin == end) { break; } @@ -2978,7 +2968,7 @@ ACTOR Future getKeyLocation_internal(Database cx, return KeyRangeLocationInfo( rep.tenantEntry, - KeyRange(toRelativeRange(rep.results[0].first, rep.tenantEntry.prefix), rep.arena), + KeyRange(toPrefixRelativeRange(rep.results[0].first, rep.tenantEntry.prefix), rep.arena), locationInfo); } } @@ -3123,7 +3113,7 @@ ACTOR Future> getKeyRangeLocations_internal( // efficient to save the map pairs and insert them all at once. results.emplace_back( rep.tenantEntry, - (toRelativeRange(rep.results[shard].first, rep.tenantEntry.prefix) & keys), + (toPrefixRelativeRange(rep.results[shard].first, rep.tenantEntry.prefix) & keys), cx->setCachedLocation( tenant.name, rep.tenantEntry, rep.results[shard].first, rep.results[shard].second)); wait(yield()); @@ -7779,7 +7769,8 @@ ACTOR Future, int>> waitStorageMetrics( } else { TraceEvent(SevWarn, "WaitStorageMetricsPenalty") .detail("Keys", keys) - .detail("Limit", CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) + .detail("Limit", shardLimit) + .detail("LocationSize", locations.size()) .detail("JitteredSecondsOfPenitence", CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY); wait(delayJittered(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution)); // make sure that the next getKeyRangeLocations() call will actually re-fetch the range diff --git a/fdbclient/include/fdbclient/FDBTypes.h b/fdbclient/include/fdbclient/FDBTypes.h index b25c45ee8b..a532d9d59e 100644 --- a/fdbclient/include/fdbclient/FDBTypes.h +++ b/fdbclient/include/fdbclient/FDBTypes.h @@ -589,6 +589,8 @@ inline KeyRange prefixRange(KeyRef prefix) { // The returned reference is valid as long as keys is valid. KeyRef keyBetween(const KeyRangeRef& keys); +KeyRangeRef toPrefixRelativeRange(KeyRangeRef range, KeyRef prefix); + struct KeySelectorRef { private: KeyRef key; // Find the last item less than key diff --git a/fdbclient/include/fdbclient/KeyLocationService.h b/fdbclient/include/fdbclient/KeyLocationService.h new file mode 100644 index 0000000000..50e8e8820e --- /dev/null +++ b/fdbclient/include/fdbclient/KeyLocationService.h @@ -0,0 +1,48 @@ +/* + * KeyLocationService.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef FOUNDATIONDB_KEYLOCATIONSERVICE_H +#define FOUNDATIONDB_KEYLOCATIONSERVICE_H + +#include "fdbclient/NativeAPI.actor.h" +#include "fdbclient/DatabaseContext.h" + +class IKeyLocationService { + + // If isBackward == true, returns the shard containing the key before 'key' (an infinitely long, inexpressible key). + // Otherwise returns the shard containing key. It's possible the returned location is a failed interface. + virtual Future getKeyLocation(TenantInfo tenant, + Key key, + SpanContext spanContext, + Optional debugID, + UseProvisionalProxies useProvisionalProxies, + Reverse isBackward, + Version version) = 0; + + virtual Future> getKeyRangeLocations(TenantInfo tenant, + KeyRange keys, + int limit, + Reverse reverse, + SpanContext spanContext, + Optional debugID, + UseProvisionalProxies useProvisionalProxies, + Version version) = 0; +}; + +#endif // FOUNDATIONDB_KEYLOCATIONSERVICE_H diff --git a/fdbserver/DDTxnProcessor.actor.cpp b/fdbserver/DDTxnProcessor.actor.cpp index e379b8e33f..bcd8acf0d0 100644 --- a/fdbserver/DDTxnProcessor.actor.cpp +++ b/fdbserver/DDTxnProcessor.actor.cpp @@ -871,7 +871,6 @@ Future>> DDMockTxnProcessor::splitStorageMetrics( return Future>>(); } -// FIXME: finish implementation Future, int>> DDMockTxnProcessor::waitStorageMetrics( const KeyRange& keys, const StorageMetrics& min, @@ -879,7 +878,7 @@ Future, int>> DDMockTxnProcessor::waitStorage const StorageMetrics& permittedError, int shardLimit, int expectedShardCount) const { - return Future, int>>(); + return mgs->waitStorageMetrics(keys, min, max, permittedError, shardLimit, expectedShardCount); } // FIXME: finish implementation diff --git a/fdbserver/MockGlobalState.cpp b/fdbserver/MockGlobalState.cpp index 310cd9c854..31162494fe 100644 --- a/fdbserver/MockGlobalState.cpp +++ b/fdbserver/MockGlobalState.cpp @@ -190,6 +190,52 @@ bool MockGlobalState::allShardRemovedFromServer(const UID& serverId) { return allServers.count(serverId) && shardMapping->getNumberOfShards(serverId) == 0; } +Future, int>> MockGlobalState::waitStorageMetrics( + const KeyRange& keys, + const StorageMetrics& min, + const StorageMetrics& max, + const StorageMetrics& permittedError, + int shardLimit, + int expectedShardCount) { + return Future, int>>(); +} + +Future MockGlobalState::getKeyLocation(TenantInfo tenant, + Key key, + SpanContext spanContext, + Optional debugID, + UseProvisionalProxies useProvisionalProxies, + Reverse isBackward, + Version version) { + GetKeyServerLocationsReply rep; + + // construct the location info with the servers + std::vector>> serverRefs; + auto& servers = rep.results[0].second; + serverRefs.reserve(servers.size()); + for (const auto& interf : servers) { + serverRefs.push_back(makeReference>(interf)); + } + + auto locationInfo = makeReference(serverRefs); + + return KeyRangeLocationInfo( + rep.tenantEntry, + KeyRange(toPrefixRelativeRange(rep.results[0].first, rep.tenantEntry.prefix), rep.arena), + locationInfo); +} +Future> MockGlobalState::getKeyRangeLocations( + TenantInfo tenant, + KeyRange keys, + int limit, + Reverse reverse, + SpanContext spanContext, + Optional 
debugID, + UseProvisionalProxies useProvisionalProxies, + Version version) { + return Future>(); +} + TEST_CASE("/MockGlobalState/initializeAsEmptyDatabaseMGS/SimpleThree") { BasicTestConfig testConfig; testConfig.simpleConfig = true; diff --git a/fdbserver/include/fdbserver/DDTxnProcessor.h b/fdbserver/include/fdbserver/DDTxnProcessor.h index a3dc98e04a..17373439ae 100644 --- a/fdbserver/include/fdbserver/DDTxnProcessor.h +++ b/fdbserver/include/fdbserver/DDTxnProcessor.h @@ -117,6 +117,7 @@ public: virtual Future moveKeys(const MoveKeysParams& params) = 0; + // metrics.second is the number of key-ranges (i.e., shards) in the 'keys' key-range virtual Future, int>> waitStorageMetrics(KeyRange const& keys, StorageMetrics const& min, StorageMetrics const& max, diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index d814984270..ce5c34d32e 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -25,6 +25,7 @@ #include "fdbclient/KeyRangeMap.h" #include "fdbclient/StorageServerInterface.h" #include "fdbclient/DatabaseConfiguration.h" +#include "fdbclient/KeyLocationService.h" #include "SimulatedCluster.h" #include "ShardsAffectedByTeamFailure.h" @@ -112,7 +113,7 @@ protected: void twoWayShardSplitting(KeyRangeRef range, KeyRef splitPoint, uint64_t rangeSize, bool restrictSize); }; -class MockGlobalState { +class MockGlobalState : public IKeyLocationService { friend struct MockGlobalStateTester; public: @@ -163,6 +164,30 @@ public: * * mgs.allServer[X] is existed */ bool allShardRemovedFromServer(const UID& serverId); + + Future, int>> waitStorageMetrics(KeyRange const& keys, + StorageMetrics const& min, + StorageMetrics const& max, + StorageMetrics const& permittedError, + int shardLimit, + int expectedShardCount); + + Future getKeyLocation(TenantInfo tenant, + Key key, + SpanContext spanContext, + Optional debugID, + UseProvisionalProxies useProvisionalProxies, + Reverse isBackward, + Version version) override; + + Future> getKeyRangeLocations(TenantInfo tenant, + KeyRange keys, + int limit, + Reverse reverse, + SpanContext spanContext, + Optional debugID, + UseProvisionalProxies useProvisionalProxies, + Version version) override; }; #endif // FOUNDATIONDB_MOCKGLOBALSTATE_H From 5d90703dc8b02af81f8a1a0f9e383008983e6f63 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 19 Oct 2022 11:43:29 -0700 Subject: [PATCH 16/52] finish getKeysLocations etc, and unit test pass. 
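
Editor's note: the location queries this patch finishes all reduce to "find the shard whose begin key is the greatest key <= the query key" against the shard-to-team mapping. A minimal, self-contained sketch of that lookup, with std::map standing in for FDB's KeyRangeMap (ShardMap and teamFor are illustrative names, not part of this patch):

    #include <cassert>
    #include <map>
    #include <string>
    #include <vector>

    // Stand-ins for FDB's Key and team types (illustrative only).
    using Key = std::string;
    using Team = std::vector<int>; // member server IDs

    // A shard map keyed by each shard's begin key, in the spirit of shard_teams.
    struct ShardMap {
        std::map<Key, Team> beginToTeam;

        // Team for the shard containing `key`: the entry with the greatest
        // begin key that is <= key.
        const Team& teamFor(const Key& key) const {
            auto it = beginToTeam.upper_bound(key);
            assert(it != beginToTeam.begin()); // the map must cover the keyspace
            return std::prev(it)->second;
        }
    };

    int main() {
        ShardMap m;
        m.beginToTeam[""] = { 1, 2, 3 };  // shard ["", "b")
        m.beginToTeam["b"] = { 2, 3, 4 }; // shard ["b", "d")
        m.beginToTeam["d"] = { 1, 2, 3 }; // shard ["d", end)
        assert(m.teamFor("a") == (Team{ 1, 2, 3 }));
        assert(m.teamFor("b") == (Team{ 2, 3, 4 }));
        assert(m.teamFor("z") == (Team{ 1, 2, 3 }));
        return 0;
    }

Under this reading, getTeamsForFirstShard(keys) is just the point lookup at keys.begin, which is what the rename in this patch makes explicit.
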
--- fdbserver/DDTeamCollection.actor.cpp | 2 +- fdbserver/DDTxnProcessor.actor.cpp | 4 +- fdbserver/MockGlobalState.cpp | 166 ++++++++++++++++-- fdbserver/ShardsAffectedByTeamFailure.cpp | 12 +- fdbserver/include/fdbserver/MockGlobalState.h | 2 + .../fdbserver/ShardsAffectedByTeamFailure.h | 9 +- 6 files changed, 176 insertions(+), 19 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 253e7ee558..f0db0d48fa 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -895,7 +895,7 @@ public: if (maxPriority < SERVER_KNOBS->PRIORITY_TEAM_FAILED) { std::pair, std::vector> - teams = self->shardsAffectedByTeamFailure->getTeamsFor(shards[i]); + teams = self->shardsAffectedByTeamFailure->getTeamsForFirstShard(shards[i]); for (int j = 0; j < teams.first.size() + teams.second.size(); j++) { // t is the team in primary DC or the remote DC auto& t = diff --git a/fdbserver/DDTxnProcessor.actor.cpp b/fdbserver/DDTxnProcessor.actor.cpp index bcd8acf0d0..5a5b1de276 100644 --- a/fdbserver/DDTxnProcessor.actor.cpp +++ b/fdbserver/DDTxnProcessor.actor.cpp @@ -763,7 +763,7 @@ std::vector DDMockTxnProcessor::getDDShardInfos() const { KeyRangeRef curRange = it->range(); DDShardInfo info(curRange.begin); - auto teams = mgs->shardMapping->getTeamsFor(curRange); + auto teams = mgs->shardMapping->getTeamsForFirstShard(curRange); if (!teams.first.empty() && !teams.second.empty()) { CODE_PROBE(true, "Mock InitialDataDistribution In-Flight shard"); info.hasDest = true; @@ -909,7 +909,7 @@ void DDMockTxnProcessor::rawFinishMovement(MoveKeysParams& params, ASSERT(params.finishMoveKeysParallelismLock->take().isReady()); // get source and dest teams - auto [destTeams, srcTeams] = mgs->shardMapping->getTeamsFor(params.keys); + auto [destTeams, srcTeams] = mgs->shardMapping->getTeamsForFirstShard(params.keys); ASSERT_EQ(destTeams.size(), 0); if (destTeams.front() != ShardsAffectedByTeamFailure::Team{ params.destinationTeam, true }) { diff --git a/fdbserver/MockGlobalState.cpp b/fdbserver/MockGlobalState.cpp index 31162494fe..7596923fc8 100644 --- a/fdbserver/MockGlobalState.cpp +++ b/fdbserver/MockGlobalState.cpp @@ -19,6 +19,7 @@ */ #include "fdbserver/MockGlobalState.h" +#include "fdbserver/workloads/workloads.actor.h" bool MockStorageServer::allShardStatusEqual(KeyRangeRef range, MockShardStatus status) { auto ranges = serverKeys.intersectingRanges(range); @@ -159,7 +160,7 @@ bool MockGlobalState::serverIsSourceForShard(const UID& serverId, KeyRangeRef sh } // check keyServers - auto teams = shardMapping->getTeamsFor(shard); + auto teams = shardMapping->getTeamsForFirstShard(shard); if (inFlightShard) { return std::any_of(teams.second.begin(), teams.second.end(), [&serverId](const Team& team) { return team.hasServer(serverId); @@ -180,7 +181,7 @@ bool MockGlobalState::serverIsDestForShard(const UID& serverId, KeyRangeRef shar } // check keyServers - auto teams = shardMapping->getTeamsFor(shard); + auto teams = shardMapping->getTeamsForFirstShard(shard); return !teams.second.empty() && std::any_of(teams.first.begin(), teams.first.end(), [&serverId](const Team& team) { return team.hasServer(serverId); }); @@ -200,6 +201,17 @@ Future, int>> MockGlobalState::waitStorageMet return Future, int>>(); } +Reference buildLocationInfo(const std::vector& interfaces) { + // construct the location info with the servers + std::vector>> serverRefs; + serverRefs.reserve(interfaces.size()); + for (const auto& interf : interfaces) { + 
serverRefs.push_back(makeReference>(interf)); + } + + return makeReference(serverRefs); +} + Future MockGlobalState::getKeyLocation(TenantInfo tenant, Key key, SpanContext spanContext, @@ -207,23 +219,25 @@ Future MockGlobalState::getKeyLocation(TenantInfo tenant, UseProvisionalProxies useProvisionalProxies, Reverse isBackward, Version version) { - GetKeyServerLocationsReply rep; - - // construct the location info with the servers - std::vector>> serverRefs; - auto& servers = rep.results[0].second; - serverRefs.reserve(servers.size()); - for (const auto& interf : servers) { - serverRefs.push_back(makeReference>(interf)); + if (isBackward) { + // DD never ask for backward range. + UNREACHABLE(); } + ASSERT(key < allKeys.end); - auto locationInfo = makeReference(serverRefs); + GetKeyServerLocationsReply rep; + KeyRange single = singleKeyRange(key); + auto teamPair = shardMapping->getTeamsForFirstShard(single); + auto& srcTeam = teamPair.second.empty() ? teamPair.first : teamPair.second; + ASSERT_EQ(srcTeam.size(), 1); + rep.results.emplace_back(single, extractStorageServerInterfaces(srcTeam.front().servers)); return KeyRangeLocationInfo( rep.tenantEntry, KeyRange(toPrefixRelativeRange(rep.results[0].first, rep.tenantEntry.prefix), rep.arena), - locationInfo); + buildLocationInfo(rep.results[0].second)); } + Future> MockGlobalState::getKeyRangeLocations( TenantInfo tenant, KeyRange keys, @@ -233,7 +247,39 @@ Future> MockGlobalState::getKeyRangeLocations( Optional debugID, UseProvisionalProxies useProvisionalProxies, Version version) { - return Future>(); + + if (reverse) { + // DD never ask for backward range. + UNREACHABLE(); + } + ASSERT(keys.begin < keys.end); + + GetKeyServerLocationsReply rep; + auto ranges = shardMapping->intersectingRanges(keys); + auto it = ranges.begin(); + for (int count = 0; it != ranges.end() && count < limit; ++it, ++count) { + auto teamPair = shardMapping->getTeamsFor(it->begin()); + auto& srcTeam = teamPair.second.empty() ? 
teamPair.first : teamPair.second; + ASSERT_EQ(srcTeam.size(), 1); + rep.results.emplace_back(it->range(), extractStorageServerInterfaces(srcTeam.front().servers)); + } + CODE_PROBE(it != ranges.end(), "getKeyRangeLocations is limited", probe::decoration::rare); + + std::vector results; + for (int shard = 0; shard < rep.results.size(); shard++) { + results.emplace_back(rep.tenantEntry, + (toPrefixRelativeRange(rep.results[shard].first, rep.tenantEntry.prefix) & keys), + buildLocationInfo(rep.results[shard].second)); + } + return results; +} + +std::vector MockGlobalState::extractStorageServerInterfaces(const std::vector& ids) const { + std::vector interfaces; + for (auto& id : ids) { + interfaces.emplace_back(allServers.at(id).ssi); + } + return interfaces; } TEST_CASE("/MockGlobalState/initializeAsEmptyDatabaseMGS/SimpleThree") { @@ -302,6 +348,28 @@ struct MockGlobalStateTester { ranges.pop_front(); ASSERT(ranges.empty()); } + + KeyRangeLocationInfo getKeyLocationInfo(KeyRef key, std::shared_ptr mgs) { + return mgs + ->getKeyLocation( + TenantInfo(), key, SpanContext(), Optional(), UseProvisionalProxies::False, Reverse::False, 0) + .get(); + } + + std::vector getKeyRangeLocations(KeyRangeRef keys, + int limit, + std::shared_ptr mgs) { + return mgs + ->getKeyRangeLocations(TenantInfo(), + keys, + limit, + Reverse::False, + SpanContext(), + Optional(), + UseProvisionalProxies::False, + 0) + .get(); + } }; TEST_CASE("/MockGlobalState/MockStorageServer/SplittingFunctions") { @@ -325,3 +393,75 @@ TEST_CASE("/MockGlobalState/MockStorageServer/SplittingFunctions") { return Void(); } + +namespace { +inline bool locationInfoEqualsToTeam(Reference loc, const std::vector& ids) { + return loc->locations()->size() == ids.size() && + std::all_of(ids.begin(), ids.end(), [loc](const UID& id) { return loc->locations()->hasInterface(id); }); +} +}; // namespace +TEST_CASE("/MockGlobalState/MockStorageServer/GetKeyLocations") { + BasicTestConfig testConfig; + testConfig.simpleConfig = true; + testConfig.minimumReplication = 1; + testConfig.logAntiQuorum = 0; + DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig); + TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString()); + + auto mgs = std::make_shared(); + mgs->initializeAsEmptyDatabaseMGS(dbConfig); + // add one empty server + mgs->addStorageServer(StorageServerInterface(mgs->indexToUID(mgs->allServers.size() + 1))); + + // define 3 ranges: + // team 1 (UID 1,2,...,n-1):[begin, 1.0), [2.0, end) + // team 2 (UID 2,3,...n-1, n): [1.0, 2.0) + ShardsAffectedByTeamFailure::Team team1, team2; + for (int i = 0; i < mgs->allServers.size() - 1; ++i) { + UID id = mgs->indexToUID(i + 1); + team1.servers.emplace_back(id); + id = mgs->indexToUID(i + 2); + team2.servers.emplace_back(id); + } + Key one = doubleToTestKey(1.0), two = doubleToTestKey(2.0); + std::vector ranges{ KeyRangeRef(allKeys.begin, one), + KeyRangeRef(one, two), + KeyRangeRef(two, allKeys.end) }; + mgs->shardMapping->assignRangeToTeams(ranges[0], { team1 }); + mgs->shardMapping->assignRangeToTeams(ranges[1], { team2 }); + mgs->shardMapping->assignRangeToTeams(ranges[2], { team1 }); + + // query key location + MockGlobalStateTester tester; + // -- team 1 + Key testKey = doubleToTestKey(0.5); + auto locInfo = tester.getKeyLocationInfo(testKey, mgs); + ASSERT(locationInfoEqualsToTeam(locInfo.locations, team1.servers)); + + // -- team 2 + testKey = doubleToTestKey(1.3); + locInfo = tester.getKeyLocationInfo(testKey, mgs); + 
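// editor's note, not in the patch: doubleToTestKey(1.3) falls in the shard
// [1.0, 2.0) assigned to team2 above, so the assertion below expects team2's servers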
ASSERT(locationInfoEqualsToTeam(locInfo.locations, team2.servers)); + + // query range location + testKey = doubleToTestKey(3.0); + // team 1,2,1 + auto locInfos = tester.getKeyRangeLocations(KeyRangeRef(allKeys.begin, testKey), 100, mgs); + ASSERT(locInfos.size() == 3); + ASSERT(locInfos[0].range == ranges[0]); + ASSERT(locationInfoEqualsToTeam(locInfos[0].locations, team1.servers)); + ASSERT(locInfos[1].range == ranges[1]); + ASSERT(locationInfoEqualsToTeam(locInfos[1].locations, team2.servers)); + ASSERT(locInfos[2].range == KeyRangeRef(ranges[2].begin, testKey)); + ASSERT(locationInfoEqualsToTeam(locInfos[2].locations, team1.servers)); + + // team 1,2 + locInfos = tester.getKeyRangeLocations(KeyRangeRef(allKeys.begin, testKey), 2, mgs); + ASSERT(locInfos.size() == 2); + ASSERT(locInfos[0].range == ranges[0]); + ASSERT(locationInfoEqualsToTeam(locInfos[0].locations, team1.servers)); + ASSERT(locInfos[1].range == ranges[1]); + ASSERT(locationInfoEqualsToTeam(locInfos[1].locations, team2.servers)); + + return Void(); +} diff --git a/fdbserver/ShardsAffectedByTeamFailure.cpp b/fdbserver/ShardsAffectedByTeamFailure.cpp index d834febaf7..cc634689ec 100644 --- a/fdbserver/ShardsAffectedByTeamFailure.cpp +++ b/fdbserver/ShardsAffectedByTeamFailure.cpp @@ -40,10 +40,16 @@ int ShardsAffectedByTeamFailure::getNumberOfShards(UID ssID) const { } std::pair, std::vector> -ShardsAffectedByTeamFailure::getTeamsFor(KeyRangeRef keys) { +ShardsAffectedByTeamFailure::getTeamsForFirstShard(KeyRangeRef keys) { return shard_teams[keys.begin]; } +std::pair, std::vector> + +ShardsAffectedByTeamFailure::getTeamsFor(KeyRef key) { + return shard_teams[key]; +} + void ShardsAffectedByTeamFailure::erase(Team team, KeyRange const& range) { DisabledTraceEvent(SevDebug, "ShardsAffectedByTeamFailureErase") .detail("Range", range) @@ -236,3 +242,7 @@ void ShardsAffectedByTeamFailure::removeFailedServerForRange(KeyRangeRef keys, c } check(); } + +auto ShardsAffectedByTeamFailure::intersectingRanges(KeyRangeRef keyRange) const -> decltype(shard_teams)::ConstRanges { + return shard_teams.intersectingRanges(keyRange); +} diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index ce5c34d32e..b091f263ff 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -116,6 +116,8 @@ protected: class MockGlobalState : public IKeyLocationService { friend struct MockGlobalStateTester; + std::vector extractStorageServerInterfaces(const std::vector& ids) const; + public: typedef ShardsAffectedByTeamFailure::Team Team; // In-memory counterpart of the `keyServers` in system keyspace diff --git a/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h b/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h index 3f85fefb7a..9055098bc7 100644 --- a/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h +++ b/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h @@ -80,8 +80,12 @@ public: bool hasShards(Team team) const; // The first element of the pair is either the source for non-moving shards or the destination team for in-flight - // shards The second element of the pair is all previous sources for in-flight shards - std::pair, std::vector> getTeamsFor(KeyRangeRef keys); + // shards The second element of the pair is all previous sources for in-flight shards. 
This function only return the + // teams for the first shard in [keys.begin, keys.end) + std::pair, std::vector> getTeamsForFirstShard(KeyRangeRef keys); + + std::pair, std::vector> getTeamsFor(KeyRef key); + // Shard boundaries are modified in defineShard and the content of what servers correspond to each shard is a copy // or union of the shards already there void defineShard(KeyRangeRef keys); @@ -124,6 +128,7 @@ private: public: // return the iterator that traversing all ranges auto getAllRanges() const -> decltype(shard_teams)::ConstRanges; + auto intersectingRanges(KeyRangeRef keyRange) const -> decltype(shard_teams)::ConstRanges; // get total shards count size_t getNumberOfShards() const; void removeFailedServerForRange(KeyRangeRef keys, const UID& serverID); From 1603926595b0c9972b780b56de6d0974c500b066 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 19 Oct 2022 23:20:31 -0700 Subject: [PATCH 17/52] refactoring old waitStorageMetrics and finish MGS::waitStorageMetrics (no unit test yet) --- fdbclient/NativeAPI.actor.cpp | 69 ++++++++++++------- fdbclient/include/fdbclient/NativeAPI.actor.h | 10 +++ ...balState.cpp => MockGlobalState.actor.cpp} | 41 ++++++++++- fdbserver/include/fdbserver/MockGlobalState.h | 26 ++++--- 4 files changed, 107 insertions(+), 39 deletions(-) rename fdbserver/{MockGlobalState.cpp => MockGlobalState.actor.cpp} (90%) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 4c1b75d905..41fd90cd52 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1915,7 +1915,8 @@ Optional DatabaseContext::getCachedLocation(const Optional auto range = isBackward ? locationCache.rangeContainingKeyBefore(resolvedKey) : locationCache.rangeContaining(resolvedKey); if (range->value()) { - return KeyRangeLocationInfo(tenantEntry, toPrefixRelativeRange(range->range(), tenantEntry.prefix), range->value()); + return KeyRangeLocationInfo( + tenantEntry, toPrefixRelativeRange(range->range(), tenantEntry.prefix), range->value()); } return Optional(); @@ -1952,7 +1953,8 @@ bool DatabaseContext::getCachedLocations(const Optional& tenantNa result.clear(); return false; } - result.emplace_back(tenantEntry, toPrefixRelativeRange(r->range() & resolvedRange, tenantEntry.prefix), r->value()); + result.emplace_back( + tenantEntry, toPrefixRelativeRange(r->range() & resolvedRange, tenantEntry.prefix), r->value()); if (result.size() == limit || begin == end) { break; } @@ -7714,6 +7716,34 @@ ACTOR Future>> getReadHotRanges(Da } } +ACTOR Future> waitStorageMetricsWithLocation(TenantInfo tenantInfo, + KeyRange keys, + std::vector locations, + StorageMetrics min, + StorageMetrics max, + StorageMetrics permittedError) { + try { + Future fx; + if (locations.size() > 1) { + fx = waitStorageMetricsMultipleLocations(tenantInfo, locations, min, max, permittedError); + } else { + WaitMetricsRequest req(tenantInfo, keys, min, max); + fx = loadBalance(locations[0].locations->locations(), + &StorageServerInterface::waitMetrics, + req, + TaskPriority::DataDistribution); + } + StorageMetrics x = wait(fx); + return x; + } catch (Error& e) { + if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) { + TraceEvent(SevError, "WaitStorageMetricsError").error(e); + throw; + } + } + return Optional(); +} + ACTOR Future, int>> waitStorageMetrics( Database cx, KeyRange keys, @@ -7743,30 +7773,8 @@ ACTOR Future, int>> waitStorageMetrics( } // SOMEDAY: Right now, if there are too many shards we delay and check again 
later. There may be a better - // solution to this. - if (locations.size() < shardLimit) { - try { - Future fx; - if (locations.size() > 1) { - fx = waitStorageMetricsMultipleLocations(tenantInfo, locations, min, max, permittedError); - } else { - WaitMetricsRequest req(tenantInfo, keys, min, max); - fx = loadBalance(locations[0].locations->locations(), - &StorageServerInterface::waitMetrics, - req, - TaskPriority::DataDistribution); - } - StorageMetrics x = wait(fx); - return std::make_pair(x, -1); - } catch (Error& e) { - if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) { - TraceEvent(SevError, "WaitStorageMetricsError").error(e); - throw; - } - cx->invalidateCache(locations[0].tenantEntry.prefix, keys); - wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); - } - } else { + // solution to this. How could this happen? + if (locations.size() >= shardLimit) { TraceEvent(SevWarn, "WaitStorageMetricsPenalty") .detail("Keys", keys) .detail("Limit", shardLimit) @@ -7775,7 +7783,16 @@ ACTOR Future, int>> waitStorageMetrics( wait(delayJittered(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution)); // make sure that the next getKeyRangeLocations() call will actually re-fetch the range cx->invalidateCache(locations[0].tenantEntry.prefix, keys); + continue; } + + Optional res = + wait(waitStorageMetricsWithLocation(tenantInfo, keys, locations, min, max, permittedError)); + if (res.present()) { + return std::make_pair(res, -1); + } + cx->invalidateCache(locations[0].tenantEntry.prefix, keys); + wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } } diff --git a/fdbclient/include/fdbclient/NativeAPI.actor.h b/fdbclient/include/fdbclient/NativeAPI.actor.h index 3931182ab0..1d94fc14f4 100644 --- a/fdbclient/include/fdbclient/NativeAPI.actor.h +++ b/fdbclient/include/fdbclient/NativeAPI.actor.h @@ -591,6 +591,16 @@ int64_t getMaxWriteKeySize(KeyRef const& key, bool hasRawAccess); // Returns the maximum legal size of a key that can be cleared. Keys larger than this will be assumed not to exist. int64_t getMaxClearKeySize(KeyRef const& key); +struct KeyRangeLocationInfo; +// Return the aggregated StorageMetrics of range keys to the caller. The locations tell which interface should +// serve the request. The final result is within (min-permittedError/2, max + permittedError/2) if valid. 
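
Editor's note: a hypothetical illustration of that acceptance window, with plain doubles standing in for StorageMetrics fields (the numbers are invented for the example):

    #include <cassert>

    // A reply x is treated as valid when
    // min - permittedError/2 < x < max + permittedError/2.
    bool withinWindow(double x, double min, double max, double permittedError) {
        return x > min - permittedError / 2 && x < max + permittedError / 2;
    }

    int main() {
        // min = 100, max = 200, permittedError = 20: the window is (90, 210).
        assert(withinWindow(95.0, 100.0, 200.0, 20.0));  // slightly below min: still valid
        assert(!withinWindow(85.0, 100.0, 200.0, 20.0)); // beyond half the permitted error
        return 0;
    }
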
+ACTOR Future> waitStorageMetricsWithLocation(TenantInfo tenantInfo, + KeyRange keys, + std::vector locations, + StorageMetrics min, + StorageMetrics max, + StorageMetrics permittedError); + namespace NativeAPI { ACTOR Future>> getServerListAndProcessClasses( Transaction* tr); diff --git a/fdbserver/MockGlobalState.cpp b/fdbserver/MockGlobalState.actor.cpp similarity index 90% rename from fdbserver/MockGlobalState.cpp rename to fdbserver/MockGlobalState.actor.cpp index 7596923fc8..e5e6263c71 100644 --- a/fdbserver/MockGlobalState.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -1,5 +1,5 @@ /* - * MockGlobalState.cpp + * MockGlobalState.actor.cpp * * This source file is part of the FoundationDB open source project * @@ -20,6 +20,42 @@ #include "fdbserver/MockGlobalState.h" #include "fdbserver/workloads/workloads.actor.h" +#include "flow/actorcompiler.h" + +class MockGlobalStateImpl { +public: + ACTOR static Future, int>> waitStorageMetrics(MockGlobalState* mgs, + KeyRange keys, + StorageMetrics min, + StorageMetrics max, + StorageMetrics permittedError, + int shardLimit, + int expectedShardCount) { + state TenantInfo tenantInfo; + loop { + auto locations = mgs->getKeyRangeLocations(tenantInfo, + keys, + shardLimit, + Reverse::False, + SpanContext(), + Optional(), + UseProvisionalProxies::False, + 0) + .get(); + // NOTE(xwang): in native API, there's code handling the non-equal situation, but I think in mock world + // there shouldn't have any delay to update the locations. + ASSERT_EQ(expectedShardCount, locations.size()); + + Optional res = + wait(::waitStorageMetricsWithLocation(tenantInfo, keys, locations, min, max, permittedError)); + + if (res.present()) { + return std::make_pair(res, -1); + } + wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); + } + } +}; bool MockStorageServer::allShardStatusEqual(KeyRangeRef range, MockShardStatus status) { auto ranges = serverKeys.intersectingRanges(range); @@ -198,7 +234,8 @@ Future, int>> MockGlobalState::waitStorageMet const StorageMetrics& permittedError, int shardLimit, int expectedShardCount) { - return Future, int>>(); + return MockGlobalStateImpl::waitStorageMetrics( + this, keys, min, max, permittedError, shardLimit, expectedShardCount); } Reference buildLocationInfo(const std::vector& interfaces) { diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index b091f263ff..70cb93d8c5 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -113,8 +113,11 @@ protected: void twoWayShardSplitting(KeyRangeRef range, KeyRef splitPoint, uint64_t rangeSize, bool restrictSize); }; +class MockGlobalStateImpl; + class MockGlobalState : public IKeyLocationService { friend struct MockGlobalStateTester; + friend class MockGlobalStateImpl; std::vector extractStorageServerInterfaces(const std::vector& ids) const; @@ -167,6 +170,7 @@ public: */ bool allShardRemovedFromServer(const UID& serverId); + // SOMEDAY: NativeAPI::waitStorageMetrics should share the code in the future, this is a simpler version of it Future, int>> waitStorageMetrics(KeyRange const& keys, StorageMetrics const& min, StorageMetrics const& max, @@ -175,21 +179,21 @@ public: int expectedShardCount); Future getKeyLocation(TenantInfo tenant, - Key key, + Key key, + SpanContext spanContext, + Optional debugID, + UseProvisionalProxies useProvisionalProxies, + Reverse isBackward, + Version version) override; + + Future> getKeyRangeLocations(TenantInfo 
tenant, + KeyRange keys, + int limit, + Reverse reverse, SpanContext spanContext, Optional debugID, UseProvisionalProxies useProvisionalProxies, - Reverse isBackward, Version version) override; - - Future> getKeyRangeLocations(TenantInfo tenant, - KeyRange keys, - int limit, - Reverse reverse, - SpanContext spanContext, - Optional debugID, - UseProvisionalProxies useProvisionalProxies, - Version version) override; }; #endif // FOUNDATIONDB_MOCKGLOBALSTATE_H From e07a50573a394491573ec4978d9605b8593df0fd Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 20 Oct 2022 13:10:03 -0700 Subject: [PATCH 18/52] splitStorageMetrics finish implementation (no unit test yet but 100k test pass) --- fdbclient/NativeAPI.actor.cpp | 109 ++++++++++-------- fdbclient/include/fdbclient/NativeAPI.actor.h | 9 ++ fdbserver/DDTxnProcessor.actor.cpp | 3 +- fdbserver/MockGlobalState.actor.cpp | 42 +++++++ fdbserver/include/fdbserver/MockGlobalState.h | 5 + 5 files changed, 118 insertions(+), 50 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 41fd90cd52..4aece2f05f 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -8653,6 +8653,56 @@ Future DatabaseContext::splitStorageMetricsStream(const PromiseStream resultStream, Database(Reference::addRef(this)), keys, limit, estimated, minSplitBytes); } +ACTOR Future>>> splitStorageMetricsWithLocations( + std::vector locations, + KeyRange keys, + StorageMetrics limit, + StorageMetrics estimated, + Optional minSplitBytes) { + state StorageMetrics used; + state Standalone> results; + results.push_back_deep(results.arena(), keys.begin); + //TraceEvent("SplitStorageMetrics").detail("Locations", locations.size()); + try { + state int i = 0; + for (; i < locations.size(); i++) { + SplitMetricsRequest req( + locations[i].range, limit, used, estimated, i == locations.size() - 1, minSplitBytes); + SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(), + &StorageServerInterface::splitMetrics, + req, + TaskPriority::DataDistribution)); + if (res.splits.size() && res.splits[0] <= results.back()) { // split points are out of order, possibly + // because of moving data, throw error to retry + ASSERT_WE_THINK(false); // FIXME: This seems impossible and doesn't seem to be covered by testing + throw all_alternatives_failed(); + } + if (res.splits.size()) { + results.append(results.arena(), res.splits.begin(), res.splits.size()); + results.arena().dependsOn(res.splits.arena()); + } + used = res.used; + + //TraceEvent("SplitStorageMetricsResult").detail("Used", used.bytes).detail("Location", i).detail("Size", res.splits.size()); + } + + if (used.allLessOrEqual(limit * CLIENT_KNOBS->STORAGE_METRICS_UNFAIR_SPLIT_LIMIT) && results.size() > 1) { + results.resize(results.arena(), results.size() - 1); + } + + if (keys.end <= locations.back().range.end) { + results.push_back_deep(results.arena(), keys.end); + } + return results; + } catch (Error& e) { + if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) { + TraceEvent(SevError, "SplitStorageMetricsError").error(e); + throw; + } + } + return Optional>>(); +} + ACTOR Future>> splitStorageMetrics(Database cx, KeyRange keys, StorageMetrics limit, @@ -8671,61 +8721,24 @@ ACTOR Future>> splitStorageMetrics(Database cx, Optional(), UseProvisionalProxies::False, latestVersion)); - state StorageMetrics used; - state Standalone> results; // SOMEDAY: Right now, if there are too many shards we delay and check again 
later. There may be a better // solution to this. if (locations.size() == CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) { wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution)); cx->invalidateCache(Key(), keys); - } else { - results.push_back_deep(results.arena(), keys.begin); - try { - //TraceEvent("SplitStorageMetrics").detail("Locations", locations.size()); - - state int i = 0; - for (; i < locations.size(); i++) { - SplitMetricsRequest req( - locations[i].range, limit, used, estimated, i == locations.size() - 1, minSplitBytes); - SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(), - &StorageServerInterface::splitMetrics, - req, - TaskPriority::DataDistribution)); - if (res.splits.size() && - res.splits[0] <= results.back()) { // split points are out of order, possibly because of - // moving data, throw error to retry - ASSERT_WE_THINK( - false); // FIXME: This seems impossible and doesn't seem to be covered by testing - throw all_alternatives_failed(); - } - if (res.splits.size()) { - results.append(results.arena(), res.splits.begin(), res.splits.size()); - results.arena().dependsOn(res.splits.arena()); - } - used = res.used; - - //TraceEvent("SplitStorageMetricsResult").detail("Used", used.bytes).detail("Location", i).detail("Size", res.splits.size()); - } - - if (used.allLessOrEqual(limit * CLIENT_KNOBS->STORAGE_METRICS_UNFAIR_SPLIT_LIMIT) && - results.size() > 1) { - results.resize(results.arena(), results.size() - 1); - } - - if (keys.end <= locations.back().range.end) { - results.push_back_deep(results.arena(), keys.end); - } - return results; - } catch (Error& e) { - if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) { - TraceEvent(SevError, "SplitStorageMetricsError").error(e); - throw; - } - cx->invalidateCache(Key(), keys); - wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); - } + continue; } + + Optional>> results = + wait(splitStorageMetricsWithLocations(locations, keys, limit, estimated, minSplitBytes)); + + if (results.present()) { + return results.get(); + } + + cx->invalidateCache(Key(), keys); + wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } } diff --git a/fdbclient/include/fdbclient/NativeAPI.actor.h b/fdbclient/include/fdbclient/NativeAPI.actor.h index 1d94fc14f4..d1f4860f23 100644 --- a/fdbclient/include/fdbclient/NativeAPI.actor.h +++ b/fdbclient/include/fdbclient/NativeAPI.actor.h @@ -601,6 +601,15 @@ ACTOR Future> waitStorageMetricsWithLocation(TenantInfo StorageMetrics max, StorageMetrics permittedError); +// Return the suggested split points from storage server.The locations tell which interface should +// serve the request. 
The +ACTOR Future>>> splitStorageMetricsWithLocations( + std::vector locations, + KeyRange keys, + StorageMetrics limit, + StorageMetrics estimated, + Optional minSplitBytes); + namespace NativeAPI { ACTOR Future>> getServerListAndProcessClasses( Transaction* tr); diff --git a/fdbserver/DDTxnProcessor.actor.cpp b/fdbserver/DDTxnProcessor.actor.cpp index 5a5b1de276..2c4c695c23 100644 --- a/fdbserver/DDTxnProcessor.actor.cpp +++ b/fdbserver/DDTxnProcessor.actor.cpp @@ -862,13 +862,12 @@ Future DDMockTxnProcessor::getHealthMetrics(bool detailed) const return Future(); } -// FIXME: finish implementation Future>> DDMockTxnProcessor::splitStorageMetrics( const KeyRange& keys, const StorageMetrics& limit, const StorageMetrics& estimated, const Optional& minSplitBytes) const { - return Future>>(); + return mgs->splitStorageMetrics(keys, limit, estimated, minSplitBytes); } Future, int>> DDMockTxnProcessor::waitStorageMetrics( diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index e5e6263c71..bdeed264fd 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -55,6 +55,41 @@ public: wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } } + + // SOMEDAY: reuse the NativeAPI implementation + ACTOR static Future>> splitStorageMetrics(MockGlobalState* mgs, + KeyRange keys, + StorageMetrics limit, + StorageMetrics estimated, + Optional minSplitBytes) { + state TenantInfo tenantInfo; + loop { + state std::vector locations = + mgs->getKeyRangeLocations(tenantInfo, + keys, + CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT, + Reverse::False, + SpanContext(), + Optional(), + UseProvisionalProxies::False, + 0) + .get(); + + // Same solution to NativeAPI::splitStorageMetrics, wait some merge finished + if (locations.size() == CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) { + wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution)); + } + + Optional>> results = + wait(splitStorageMetricsWithLocations(locations, keys, limit, estimated, minSplitBytes)); + + if (results.present()) { + return results.get(); + } + + wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); + } + } }; bool MockStorageServer::allShardStatusEqual(KeyRangeRef range, MockShardStatus status) { @@ -319,6 +354,13 @@ std::vector MockGlobalState::extractStorageServerInterfa return interfaces; } +Future>> MockGlobalState::splitStorageMetrics(const KeyRange& keys, + const StorageMetrics& limit, + const StorageMetrics& estimated, + const Optional& minSplitBytes) { + return MockGlobalStateImpl::splitStorageMetrics(this, keys, limit, estimated, minSplitBytes); +} + TEST_CASE("/MockGlobalState/initializeAsEmptyDatabaseMGS/SimpleThree") { BasicTestConfig testConfig; testConfig.simpleConfig = true; diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index 70cb93d8c5..a8433d7ce2 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -178,6 +178,11 @@ public: int shardLimit, int expectedShardCount); + Future>> splitStorageMetrics(const KeyRange& keys, + const StorageMetrics& limit, + const StorageMetrics& estimated, + const Optional& minSplitBytes); + Future getKeyLocation(TenantInfo tenant, Key key, SpanContext spanContext, From c14ee5395f82839ef9dfce07a45a9fc8c5d8e1d3 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 20 Oct 2022 23:29:13 -0700 Subject: [PATCH 19/52] define 
IStorageMetricsService --- fdbserver/MockGlobalState.actor.cpp | 4 +++ fdbserver/include/fdbserver/StorageMetrics.h | 22 ++++++++++++++ fdbserver/storageserver.actor.cpp | 30 +++++++++++++------- 3 files changed, 46 insertions(+), 10 deletions(-) diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index bdeed264fd..d40d280e78 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -203,6 +203,10 @@ uint64_t MockStorageServer::sumRangeSize(KeyRangeRef range) const { return totalSize; } +Future MockStorageServer::serverInterface() const { + return Future(); +} + void MockGlobalState::initializeAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace) { ASSERT(conf.storageTeamSize > 0); configuration = conf; diff --git a/fdbserver/include/fdbserver/StorageMetrics.h b/fdbserver/include/fdbserver/StorageMetrics.h index f0f7008cc4..98d562d665 100644 --- a/fdbserver/include/fdbserver/StorageMetrics.h +++ b/fdbserver/include/fdbserver/StorageMetrics.h @@ -152,3 +152,25 @@ struct ByteSampleInfo { // Determines whether a key-value pair should be included in a byte sample // Also returns size information about the sample ByteSampleInfo isKeyValueInSample(KeyValueRef keyValue); + +class IStorageMetricsService { +public: + StorageServerMetrics metrics; + + // penalty used by loadBalance() to balance requests among service instances + virtual double getPenalty() { return 1; } + + virtual bool isReadable(KeyRangeRef const& keys) { return true; } + + virtual void addActor(Future future) = 0; + + virtual void getSplitPoints(SplitRangeRequest const& req) = 0; + + virtual Future waitMetricsTenantAware(const WaitMetricsRequest& req) = 0; + + virtual void getStorageMetrics(const GetStorageMetricsRequest& req) = 0; + + // NOTE: also need to have this function but template can't be a virtual so... + // template + // void sendErrorWithPenalty(const ReplyPromise& promise, const Error& err, double penalty); +}; diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index aebec03a41..396d4296f8 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -641,7 +641,7 @@ struct BusiestWriteTagContext { busiestWriteTagEventHolder(makeReference(busiestWriteTagTrackingKey)), lastUpdateTime(-1) {} }; -struct StorageServer { +struct StorageServer : public IStorageMetricsService { typedef VersionedMap VersionedData; private: @@ -988,7 +988,6 @@ public: Database cx; ActorCollection actors; - StorageServerMetrics metrics; CoalescedKeyRangeMap> byteSampleClears; AsyncVar byteSampleClearsTooLarge; Future byteSampleRecovery; @@ -1380,7 +1379,7 @@ public: // This is the maximum version that might be read from storage (the minimum version is durableVersion) Version storageVersion() const { return oldestVersion.get(); } - bool isReadable(KeyRangeRef const& keys) { + bool isReadable(KeyRangeRef const& keys) override { auto sh = shards.intersectingRanges(keys); for (auto i = sh.begin(); i != sh.end(); ++i) if (!i->value()->isReadable()) @@ -1409,7 +1408,7 @@ public: Counter::Value queueSize() { return counters.bytesInput.getValue() - counters.bytesDurable.getValue(); } // penalty used by loadBalance() to balance requests among SSes. We prefer SS with less write queue size. 
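
Editor's note: to show how such a penalty gets consumed, here is a simplified, self-contained sketch of penalty-based replica selection. The Replica type and the numbers are invented for illustration; this is not FDB's actual loadBalance():

    #include <algorithm>
    #include <cassert>
    #include <vector>

    // Illustrative only: choose the replica whose penalty (derived from its
    // write queue size) is lowest, mimicking how a balancer biases away from
    // busy servers.
    struct Replica {
        int id;
        double queueBytes;
        double targetBytes; // queue size the server tries to stay under
        double getPenalty() const { return std::max(1.0, queueBytes / targetBytes); }
    };

    int pickReplica(const std::vector<Replica>& rs) {
        return std::min_element(rs.begin(), rs.end(), [](const Replica& a, const Replica& b) {
                   return a.getPenalty() < b.getPenalty();
               })->id;
    }

    int main() {
        std::vector<Replica> rs{ { 1, 1.5e9, 1e9 }, { 2, 2e9, 1e9 }, { 3, 9e8, 1e9 } };
        assert(pickReplica(rs) == 3); // the least-loaded server wins
        return 0;
    }
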
- double getPenalty() { + double getPenalty() override { return std::max(std::max(1.0, (queueSize() - (SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER - 2.0 * SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER)) / @@ -1503,7 +1502,7 @@ public: } } - void getSplitPoints(SplitRangeRequest const& req) { + void getSplitPoints(SplitRangeRequest const& req) override { try { Optional entry = getTenantEntry(version.get(), req.tenantInfo); metrics.getSplitPoints(req, entry.map([](TenantMapEntry e) { return e.prefix; })); @@ -1533,6 +1532,15 @@ public: } return false; } + + Future waitMetricsTenantAware(const WaitMetricsRequest& req) override; + + void addActor(Future future) override { actors.add(future); } + + void getStorageMetrics(const GetStorageMetricsRequest& req) override { + StorageBytes sb = storage.getStorageBytes(); + metrics.getStorageMetrics(req, sb, counters.bytesInput.getRate(), versionLag, lastUpdate); + } }; const StringRef StorageServer::CurrentRunningFetchKeys::emptyString = ""_sr; @@ -10166,7 +10174,7 @@ Future StorageServerMetrics::waitMetrics(WaitMetricsRequest req, Future waitMetricsTenantAware(StorageServer* self, WaitMetricsRequest req) { +ACTOR Future waitMetricsTenantAware_internal(StorageServer* self, WaitMetricsRequest req) { if (req.tenantInfo.present() && req.tenantInfo.get().tenantId != TenantInfo::INVALID_TENANT) { wait(success(waitForVersionNoTooOld(self, latestVersion))); Optional entry = self->getTenantEntry(latestVersion, req.tenantInfo.get()); @@ -10184,6 +10192,10 @@ ACTOR Future waitMetricsTenantAware(StorageServer* self, WaitMetricsReques return Void(); } +Future StorageServer::waitMetricsTenantAware(const WaitMetricsRequest& req) { + return waitMetricsTenantAware_internal(this, req); +} + ACTOR Future metricsCore(StorageServer* self, StorageServerInterface ssi) { state Future doPollMetrics = Void(); @@ -10225,7 +10237,7 @@ ACTOR Future metricsCore(StorageServer* self, StorageServerInterface ssi) CODE_PROBE(true, "waitMetrics immediate wrong_shard_server()"); self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty()); } else { - self->actors.add(waitMetricsTenantAware(self, req)); + self->addActor(self->waitMetricsTenantAware(req)); } } when(SplitMetricsRequest req = waitNext(ssi.splitMetrics.getFuture())) { @@ -10237,9 +10249,7 @@ ACTOR Future metricsCore(StorageServer* self, StorageServerInterface ssi) } } when(GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) { - StorageBytes sb = self->storage.getStorageBytes(); - self->metrics.getStorageMetrics( - req, sb, self->counters.bytesInput.getRate(), self->versionLag, self->lastUpdate); + self->getStorageMetrics(req); } when(ReadHotSubRangeRequest req = waitNext(ssi.getReadHotRanges.getFuture())) { if (!self->isReadable(req.keys)) { From 3c67b7df397c10519502cb9fc953e76bd3f9dfa3 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 21 Oct 2022 10:59:22 -0700 Subject: [PATCH 20/52] extract serveStorageMetricsRequests template function --- fdbserver/ConsistencyScan.actor.cpp | 2 +- fdbserver/MockGlobalState.actor.cpp | 4 -- fdbserver/Resolver.actor.cpp | 2 +- fdbserver/RestoreLoader.actor.cpp | 2 +- fdbserver/StorageMetrics.actor.cpp | 2 +- fdbserver/include/fdbserver/MockGlobalState.h | 2 +- ...torageMetrics.h => StorageMetrics.actor.h} | 58 ++++++++++++++++++- fdbserver/storageserver.actor.cpp | 48 +-------------- .../workloads/ConsistencyCheck.actor.cpp | 2 +- 9 files changed, 66 insertions(+), 56 deletions(-) rename fdbserver/include/fdbserver/{StorageMetrics.h => 
StorageMetrics.actor.h} (75%) diff --git a/fdbserver/ConsistencyScan.actor.cpp b/fdbserver/ConsistencyScan.actor.cpp index f961cb20a0..657da9fda4 100644 --- a/fdbserver/ConsistencyScan.actor.cpp +++ b/fdbserver/ConsistencyScan.actor.cpp @@ -29,7 +29,7 @@ #include "fdbclient/ReadYourWrites.h" #include "fdbclient/TagThrottle.actor.h" #include "fdbserver/Knobs.h" -#include "fdbserver/StorageMetrics.h" +#include "fdbserver/StorageMetrics.actor.h" #include "fdbserver/DataDistribution.actor.h" #include "fdbserver/RatekeeperInterface.h" #include "fdbserver/ServerDBInfo.h" diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index d40d280e78..bdeed264fd 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -203,10 +203,6 @@ uint64_t MockStorageServer::sumRangeSize(KeyRangeRef range) const { return totalSize; } -Future MockStorageServer::serverInterface() const { - return Future(); -} - void MockGlobalState::initializeAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace) { ASSERT(conf.storageTeamSize > 0); configuration = conf; diff --git a/fdbserver/Resolver.actor.cpp b/fdbserver/Resolver.actor.cpp index 61aaed1246..203383cc9f 100644 --- a/fdbserver/Resolver.actor.cpp +++ b/fdbserver/Resolver.actor.cpp @@ -35,7 +35,7 @@ #include "fdbserver/ResolverInterface.h" #include "fdbserver/RestoreUtil.h" #include "fdbserver/ServerDBInfo.h" -#include "fdbserver/StorageMetrics.h" +#include "fdbserver/StorageMetrics.actor.h" #include "fdbserver/WaitFailure.h" #include "fdbserver/WorkerInterface.actor.h" #include "flow/ActorCollection.h" diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index 964045c153..dfcd329cc8 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -30,7 +30,7 @@ #include "fdbserver/RestoreLoader.actor.h" #include "fdbserver/RestoreRoleCommon.actor.h" #include "fdbserver/MutationTracking.h" -#include "fdbserver/StorageMetrics.h" +#include "fdbserver/StorageMetrics.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. diff --git a/fdbserver/StorageMetrics.actor.cpp b/fdbserver/StorageMetrics.actor.cpp index 88bae0695f..ada2301483 100644 --- a/fdbserver/StorageMetrics.actor.cpp +++ b/fdbserver/StorageMetrics.actor.cpp @@ -19,7 +19,7 @@ */ #include "flow/UnitTest.h" -#include "fdbserver/StorageMetrics.h" +#include "fdbserver/StorageMetrics.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. 
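
Editor's note: the rename to a .actor.h suffix follows from the header now carrying ACTOR code, and the ordering comment above is load-bearing. flow/actorcompiler.h defines macros (state, choose, when, and friends) that would mangle any ordinary header included after it, so it must be the last include in a .actor.cpp; a header that needs the macros pairs it with flow/unactorcompiler.h to undo them. An outline of the convention (header names are FDB's, the body is illustrative):

    // my.actor.h
    //   #include "flow/actorcompiler.h"    // enables the ACTOR/state/choose/when macros
    //   ACTOR Future<Void> exampleActor(); // actor declarations and templates go here
    //   #include "flow/unactorcompiler.h"  // undoes the macros for later includers
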
int64_t StorageMetricSample::getEstimate(KeyRangeRef keys) const { diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index a8433d7ce2..f3e5213892 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -21,7 +21,7 @@ #ifndef FOUNDATIONDB_MOCKGLOBALSTATE_H #define FOUNDATIONDB_MOCKGLOBALSTATE_H -#include "StorageMetrics.h" +#include "StorageMetrics.actor.h" #include "fdbclient/KeyRangeMap.h" #include "fdbclient/StorageServerInterface.h" #include "fdbclient/DatabaseConfiguration.h" diff --git a/fdbserver/include/fdbserver/StorageMetrics.h b/fdbserver/include/fdbserver/StorageMetrics.actor.h similarity index 75% rename from fdbserver/include/fdbserver/StorageMetrics.h rename to fdbserver/include/fdbserver/StorageMetrics.actor.h index 98d562d665..7111227f27 100644 --- a/fdbserver/include/fdbserver/StorageMetrics.h +++ b/fdbserver/include/fdbserver/StorageMetrics.actor.h @@ -19,13 +19,18 @@ */ #pragma once - +#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_STORAGEMETRICS_G_H) +#define FDBSERVER_STORAGEMETRICS_G_H +#include "fdbserver/StorageMetrics.actor.g.h" +#elif !defined(FDBSERVER_STORAGEMETRICS_H) +#define FDBSERVER_STORAGEMETRICS_H #include "fdbclient/FDBTypes.h" #include "fdbrpc/simulator.h" #include "flow/UnitTest.h" #include "fdbclient/StorageServerInterface.h" #include "fdbclient/KeyRangeMap.h" #include "fdbserver/Knobs.h" +#include "flow/actorcompiler.h" const StringRef STORAGESERVER_HISTOGRAM_GROUP = "StorageServer"_sr; const StringRef FETCH_KEYS_LATENCY_HISTOGRAM = "FetchKeysLatency"_sr; @@ -174,3 +179,54 @@ public: // template // void sendErrorWithPenalty(const ReplyPromise& promise, const Error& err, double penalty); }; + +ACTOR template +Future serveStorageMetricsRequests(ServiceType* self, StorageServerInterface ssi){ + state Future doPollMetrics = Void(); + loop { + choose { + when(state WaitMetricsRequest req = waitNext(ssi.waitMetrics.getFuture())) { + if (!req.tenantInfo.present() && !self->isReadable(req.keys)) { + CODE_PROBE(true, "waitMetrics immediate wrong_shard_server()"); + self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty()); + } else { + self->addActor(self->waitMetricsTenantAware(req)); + } + } + when(SplitMetricsRequest req = waitNext(ssi.splitMetrics.getFuture())) { + if (!self->isReadable(req.keys)) { + CODE_PROBE(true, "splitMetrics immediate wrong_shard_server()"); + self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty()); + } else { + self->metrics.splitMetrics(req); + } + } + when(GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) { + self->getStorageMetrics(req); + } + when(ReadHotSubRangeRequest req = waitNext(ssi.getReadHotRanges.getFuture())) { + if (!self->isReadable(req.keys)) { + CODE_PROBE(true, "readHotSubRanges immediate wrong_shard_server()", probe::decoration::rare); + self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty()); + } else { + self->metrics.getReadHotRanges(req); + } + } + when(SplitRangeRequest req = waitNext(ssi.getRangeSplitPoints.getFuture())) { + if (!self->isReadable(req.keys)) { + CODE_PROBE(true, "getSplitPoints immediate wrong_shard_server()"); + self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty()); + } else { + self->getSplitPoints(req); + } + } + when(wait(doPollMetrics)) { + self->metrics.poll(); + doPollMetrics = delay(SERVER_KNOBS->STORAGE_SERVER_POLL_METRICS_DELAY); + } + } + } 
+} + +#include "flow/unactorcompiler.h" +#endif // FDBSERVER_STORAGEMETRICS_H \ No newline at end of file diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 396d4296f8..1d8fb40c8f 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -81,7 +81,7 @@ #include "fdbserver/ServerCheckpoint.actor.h" #include "fdbserver/ServerDBInfo.h" #include "fdbserver/SpanContextMessage.h" -#include "fdbserver/StorageMetrics.h" +#include "fdbserver/StorageMetrics.actor.h" #include "fdbserver/TLogInterface.h" #include "fdbserver/TransactionTagCounter.h" #include "fdbserver/WaitFailure.h" @@ -10197,7 +10197,6 @@ Future StorageServer::waitMetricsTenantAware(const WaitMetricsRequest& req } ACTOR Future metricsCore(StorageServer* self, StorageServerInterface ssi) { - state Future doPollMetrics = Void(); wait(self->byteSampleRecovery); TraceEvent("StorageServerRestoreDurableState", self->thisServerID).detail("RestoredBytes", self->bytesRestored); @@ -10230,49 +10229,8 @@ ACTOR Future metricsCore(StorageServer* self, StorageServerInterface ssi) } })); - loop { - choose { - when(state WaitMetricsRequest req = waitNext(ssi.waitMetrics.getFuture())) { - if (!req.tenantInfo.present() && !self->isReadable(req.keys)) { - CODE_PROBE(true, "waitMetrics immediate wrong_shard_server()"); - self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty()); - } else { - self->addActor(self->waitMetricsTenantAware(req)); - } - } - when(SplitMetricsRequest req = waitNext(ssi.splitMetrics.getFuture())) { - if (!self->isReadable(req.keys)) { - CODE_PROBE(true, "splitMetrics immediate wrong_shard_server()"); - self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty()); - } else { - self->metrics.splitMetrics(req); - } - } - when(GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) { - self->getStorageMetrics(req); - } - when(ReadHotSubRangeRequest req = waitNext(ssi.getReadHotRanges.getFuture())) { - if (!self->isReadable(req.keys)) { - CODE_PROBE(true, "readHotSubRanges immediate wrong_shard_server()", probe::decoration::rare); - self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty()); - } else { - self->metrics.getReadHotRanges(req); - } - } - when(SplitRangeRequest req = waitNext(ssi.getRangeSplitPoints.getFuture())) { - if (!self->isReadable(req.keys)) { - CODE_PROBE(true, "getSplitPoints immediate wrong_shard_server()"); - self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty()); - } else { - self->getSplitPoints(req); - } - } - when(wait(doPollMetrics)) { - self->metrics.poll(); - doPollMetrics = delay(SERVER_KNOBS->STORAGE_SERVER_POLL_METRICS_DELAY); - } - } - } + wait(serveStorageMetricsRequests(self, ssi)); + return Void(); } ACTOR Future logLongByteSampleRecovery(Future recovery) { diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 6abeb66e5f..ee9c63bb50 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -30,7 +30,7 @@ #include "flow/IRateControl.h" #include "fdbrpc/simulator.h" #include "fdbserver/Knobs.h" -#include "fdbserver/StorageMetrics.h" +#include "fdbserver/StorageMetrics.actor.h" #include "fdbserver/DataDistribution.actor.h" #include "fdbserver/QuietDatabase.h" #include "fdbserver/TSSMappingUtil.actor.h" From 0d4b4d05e20c320004396f5f0114f1539eb1c44b Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 21 Oct 
2022 16:53:03 -0700 Subject: [PATCH 21/52] implement MSS as IStorageMetricsService and pass the unit test --- fdbserver/DDShardTracker.actor.cpp | 16 +---- fdbserver/MockGlobalState.actor.cpp | 69 +++++++++++++++++++ .../fdbserver/DataDistribution.actor.h | 14 ++++ fdbserver/include/fdbserver/MockGlobalState.h | 34 ++++++++- 4 files changed, 115 insertions(+), 18 deletions(-) diff --git a/fdbserver/DDShardTracker.actor.cpp b/fdbserver/DDShardTracker.actor.cpp index be7343ba4c..7964915217 100644 --- a/fdbserver/DDShardTracker.actor.cpp +++ b/fdbserver/DDShardTracker.actor.cpp @@ -212,7 +212,7 @@ ShardSizeBounds calculateShardSizeBounds(const KeyRange& keys, const Reference>>& shardMetrics, const BandwidthStatus& bandwidthStatus, PromiseStream readHotShard) { - ShardSizeBounds bounds; + ShardSizeBounds bounds = ShardSizeBounds::shardSizeBoundsBeforeTrack(); if (shardMetrics->get().present()) { auto bytes = shardMetrics->get().get().metrics.bytes; auto readBandwidthStatus = getReadBandwidthStatus(shardMetrics->get().get().metrics); @@ -259,21 +259,7 @@ ShardSizeBounds calculateShardSizeBounds(const KeyRange& keys, } else { ASSERT(false); } - } else { - bounds.max.bytes = -1; - bounds.min.bytes = -1; - bounds.permittedError.bytes = -1; - bounds.max.bytesPerKSecond = bounds.max.infinity; - bounds.min.bytesPerKSecond = 0; - bounds.permittedError.bytesPerKSecond = bounds.permittedError.infinity; - bounds.max.bytesReadPerKSecond = bounds.max.infinity; - bounds.min.bytesReadPerKSecond = 0; - bounds.permittedError.bytesReadPerKSecond = bounds.permittedError.infinity; } - - bounds.max.iosPerKSecond = bounds.max.infinity; - bounds.min.iosPerKSecond = 0; - bounds.permittedError.iosPerKSecond = bounds.permittedError.infinity; return bounds; } diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index bdeed264fd..1e03b71e85 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -20,6 +20,7 @@ #include "fdbserver/MockGlobalState.h" #include "fdbserver/workloads/workloads.actor.h" +#include "fdbserver/DataDistribution.actor.h" #include "flow/actorcompiler.h" class MockGlobalStateImpl { @@ -42,6 +43,7 @@ public: UseProvisionalProxies::False, 0) .get(); + TraceEvent(SevDebug, "MGSWaitStorageMetrics").detail("Phase", "GetLocation"); // NOTE(xwang): in native API, there's code handling the non-equal situation, but I think in mock world // there shouldn't have any delay to update the locations. 
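
Editor's note: the surrounding loop mirrors NativeAPI's retry shape: resolve locations, attempt the request, and on a retryable miss back off and try again instead of failing. A self-contained sketch of that shape, with plain functions standing in for ACTORs (tryOnce and its staleness behavior are invented for the example):

    #include <optional>

    struct Metrics { long bytes = 0; };

    // Pretend the shard map is stale for the first two attempts; a stale
    // lookup corresponds to wrong_shard_server / all_alternatives_failed in
    // the real code.
    std::optional<Metrics> tryOnce(int attempt) {
        if (attempt < 2)
            return std::nullopt;
        return Metrics{ 500000 };
    }

    Metrics waitMetricsWithRetry() {
        for (int attempt = 0;; ++attempt) {
            if (auto res = tryOnce(attempt))
                return *res;
            // real code: invalidate cached locations, then delayJittered(...)
            // before retrying
        }
    }

    int main() {
        return waitMetricsWithRetry().bytes == 500000 ? 0 : 1;
    }
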
ASSERT_EQ(expectedShardCount, locations.size());
@@ -92,6 +94,28 @@ public:
 	}
 };
 
+class MockStorageServerImpl {
+public:
+	ACTOR static Future<Void> waitMetricsTenantAware(MockStorageServer* self, WaitMetricsRequest req) {
+		if (req.tenantInfo.present() && req.tenantInfo.get().tenantId != TenantInfo::INVALID_TENANT) {
+			// TODO(xwang) add support for tenant test, search for tenant entry
+			Optional<TenantMapEntry> entry;
+			Optional<Key> tenantPrefix = entry.map([](TenantMapEntry e) { return e.prefix; });
+			if (tenantPrefix.present()) {
+				UNREACHABLE();
+				// req.keys = req.keys.withPrefix(tenantPrefix.get(), req.arena);
+			}
+		}
+
+		if (!self->isReadable(req.keys)) {
+			self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
+		} else {
+			wait(self->metrics.waitMetrics(req, delayJittered(SERVER_KNOBS->STORAGE_METRIC_TIMEOUT)));
+		}
+		return Void();
+	}
+};
+
 bool MockStorageServer::allShardStatusEqual(KeyRangeRef range, MockShardStatus status) {
 	auto ranges = serverKeys.intersectingRanges(range);
 	ASSERT(!ranges.empty()); // at least the range is allKeys
@@ -203,6 +227,22 @@ uint64_t MockStorageServer::sumRangeSize(KeyRangeRef range) const {
 	return totalSize;
 }
 
+void MockStorageServer::addActor(Future<Void> future) {
+	actors.add(future);
+}
+
+void MockStorageServer::getSplitPoints(const SplitRangeRequest& req) {}
+
+Future<Void> MockStorageServer::waitMetricsTenantAware(const WaitMetricsRequest& req) {
+	return MockStorageServerImpl::waitMetricsTenantAware(this, req);
+}
+
+void MockStorageServer::getStorageMetrics(const GetStorageMetricsRequest& req) {}
+
+Future<Void> MockStorageServer::run() {
+	return serveStorageMetricsRequests(this, ssi);
+}
+
 void MockGlobalState::initializeAsEmptyDatabaseMGS(const DatabaseConfiguration& conf, uint64_t defaultDiskSpace) {
 	ASSERT(conf.storageTeamSize > 0);
 	configuration = conf;
@@ -544,3 +584,32 @@ TEST_CASE("/MockGlobalState/MockStorageServer/GetKeyLocations") {
 
 	return Void();
 }
+
+TEST_CASE("/MockGlobalState/MockStorageServer/WaitStorageMetricsRequest") {
+	BasicTestConfig testConfig;
+	testConfig.simpleConfig = true;
+	testConfig.minimumReplication = 1;
+	testConfig.logAntiQuorum = 0;
+	DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig);
+	TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString());
+
+	state std::shared_ptr<MockGlobalState> mgs = std::make_shared<MockGlobalState>();
+	mgs->initializeAsEmptyDatabaseMGS(dbConfig);
+	state ActorCollection actors;
+
+	ActorCollection* ptr = &actors; // get around ACTOR syntax restriction
+	std::for_each(mgs->allServers.begin(), mgs->allServers.end(), [ptr](auto& server) {
+		ptr->add(server.second.run());
+		server.second.metrics.byteSample.sample.insert("something"_sr, 500000);
+	});
+
+	KeyRange testRange = allKeys;
+	ShardSizeBounds bounds = ShardSizeBounds::shardSizeBoundsBeforeTrack();
+	std::pair<Optional<StorageMetrics>, int> res =
+	    wait(mgs->waitStorageMetrics(testRange, bounds.min, bounds.max, bounds.permittedError, 1, 1));
+	// std::cout << "get result " << res.second << "\n";
+	// std::cout << "get byte "<< res.first.get().bytes << "\n";
+	ASSERT_EQ(res.second, -1); // a valid result always returns -1 here; an odd convention, though.
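+	// Each mock server was seeded with a single 500000-byte sample over the test range,
+	// so the aggregated metrics are expected to report exactly those bytes.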
+ ASSERT_EQ(res.first.get().bytes, 500000); + return Void(); +} diff --git a/fdbserver/include/fdbserver/DataDistribution.actor.h b/fdbserver/include/fdbserver/DataDistribution.actor.h index 2389dc0ab6..2e77d07459 100644 --- a/fdbserver/include/fdbserver/DataDistribution.actor.h +++ b/fdbserver/include/fdbserver/DataDistribution.actor.h @@ -476,6 +476,20 @@ struct ShardSizeBounds { bool operator==(ShardSizeBounds const& rhs) const { return max == rhs.max && min == rhs.min && permittedError == rhs.permittedError; } + + static ShardSizeBounds shardSizeBoundsBeforeTrack() { + return ShardSizeBounds{ + .max = StorageMetrics{ .bytes = -1, + .bytesPerKSecond = StorageMetrics::infinity, + .iosPerKSecond = StorageMetrics::infinity, + .bytesReadPerKSecond = StorageMetrics::infinity }, + .min = StorageMetrics{ .bytes = -1, .bytesPerKSecond = 0, .iosPerKSecond = 0, .bytesReadPerKSecond = 0 }, + .permittedError = StorageMetrics{ .bytes = -1, + .bytesPerKSecond = StorageMetrics::infinity, + .iosPerKSecond = StorageMetrics::infinity, + .bytesReadPerKSecond = StorageMetrics::infinity } + }; + } }; // Gets the permitted size and IO bounds for a shard diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index f3e5213892..a404f24027 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -52,9 +52,11 @@ inline bool isStatusTransitionValid(MockShardStatus from, MockShardStatus to) { return false; } -class MockStorageServer { +class MockStorageServer : public IStorageMetricsService { friend struct MockGlobalStateTester; + ActorCollection actors; + public: struct ShardInfo { MockShardStatus status; @@ -74,8 +76,6 @@ public: // size() and nthRange() would use the metrics as index instead KeyRangeMap serverKeys; - // sampled metrics - StorageServerMetrics metrics; CoalescedKeyRangeMap> byteSampleClears; StorageServerInterface ssi; // serve RPC requests @@ -104,6 +104,34 @@ public: uint64_t sumRangeSize(KeyRangeRef range) const; + void addActor(Future future) override; + + void getSplitPoints(SplitRangeRequest const& req) override; + + Future waitMetricsTenantAware(const WaitMetricsRequest& req) override; + + void getStorageMetrics(const GetStorageMetricsRequest& req) override; + + template + using isLoadBalancedReply = std::is_base_of; + + template + typename std::enable_if::value, void>::type + sendErrorWithPenalty(const ReplyPromise& promise, const Error& err, double penalty) { + Reply reply; + reply.error = err; + reply.penalty = penalty; + promise.send(reply); + } + + template + typename std::enable_if::value, void>::type + sendErrorWithPenalty(const ReplyPromise& promise, const Error& err, double) { + promise.sendError(err); + } + + Future run(); + protected: void threeWayShardSplitting(KeyRangeRef outerRange, KeyRangeRef innerRange, From 918018d4928f1563d5c7ed80123485068fd67013 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 24 Oct 2022 10:50:46 -0700 Subject: [PATCH 22/52] format code --- fdbserver/include/fdbserver/StorageMetrics.actor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/include/fdbserver/StorageMetrics.actor.h b/fdbserver/include/fdbserver/StorageMetrics.actor.h index 7111227f27..552db2c6f7 100644 --- a/fdbserver/include/fdbserver/StorageMetrics.actor.h +++ b/fdbserver/include/fdbserver/StorageMetrics.actor.h @@ -181,7 +181,7 @@ public: }; ACTOR template -Future serveStorageMetricsRequests(ServiceType* self, StorageServerInterface ssi){ +Future 
serveStorageMetricsRequests(ServiceType* self, StorageServerInterface ssi) { state Future doPollMetrics = Void(); loop { choose { From dd52d997ba33de0327702a6ff94b63882c775482 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Mon, 24 Oct 2022 11:07:36 -0700 Subject: [PATCH 23/52] blob: rename rowLimit variable for listBlobbifiedRanges() --- fdbclient/NativeAPI.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index bf5483f82c..0f33aefb95 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -10802,9 +10802,9 @@ ACTOR Future>> listBlobbifiedRangesActor(Refer } Future>> DatabaseContext::listBlobbifiedRanges(KeyRange range, - int rowLimit, + int rangeLimit, Optional tenantName) { - return listBlobbifiedRangesActor(Reference::addRef(this), range, rowLimit, tenantName); + return listBlobbifiedRangesActor(Reference::addRef(this), range, rangeLimit, tenantName); } int64_t getMaxKeySize(KeyRef const& key) { From 07f12478853b6a6b987c938a35c25ab6ba18b510 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Tue, 18 Oct 2022 13:24:27 -0700 Subject: [PATCH 24/52] blob: getBlobRanges() use transaction instead of RYWTransaction --- fdbclient/NativeAPI.actor.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 0f33aefb95..7e59f4324d 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -10662,9 +10662,7 @@ Future DatabaseContext::waitPurgeGranulesComplete(Key purgeKey) { return waitPurgeGranulesCompleteActor(Reference::addRef(this), purgeKey); } -ACTOR Future>> getBlobRanges(Reference tr, - KeyRange range, - int batchLimit) { +ACTOR Future>> getBlobRanges(Transaction* tr, KeyRange range, int batchLimit) { state Standalone> blobRanges; state Key beginKey = range.begin; @@ -10716,7 +10714,7 @@ ACTOR Future setBlobRangeActor(Reference cx, range = range.withPrefix(tenantEntry.prefix); } - Standalone> startBlobRanges = wait(getBlobRanges(tr, range, 1)); + Standalone> startBlobRanges = wait(getBlobRanges(&tr->getTransaction(), range, 1)); if (active) { // Idempotent request. 
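A minimal call-site sketch for the new getBlobRanges() signature (hypothetical helper name; assumes an open Database db): because getBlobRanges() drives its own retry loop via tr->onError(), the caller only owns the Transaction's lifetime.

ACTOR Future<Void> printBlobbifiedRanges(Database db) {
	state Transaction tr(db);
	// getBlobRanges() pages through blobRangeKeys with krmGetRangesUnaligned and retries internally.
	Standalone<VectorRef<KeyRangeRef>> ranges = wait(getBlobRanges(&tr, normalKeys, 100));
	for (auto& r : ranges) {
		fmt::print("blobbified: {}\n", r.toString());
	}
	return Void();
}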
@@ -10765,22 +10763,22 @@ ACTOR Future>> listBlobbifiedRangesActor(Refer int rangeLimit, Optional tenantName) { state Database db(cx); - state Reference tr = makeReference(db); + state Transaction tr(db); state TenantMapEntry tme; loop { try { if (tenantName.present()) { - wait(store(tme, blobGranuleGetTenantEntry(&tr->getTransaction(), range.begin, tenantName))); + wait(store(tme, blobGranuleGetTenantEntry(&tr, range.begin, tenantName))); range = range.withPrefix(tme.prefix); } break; } catch (Error& e) { - wait(tr->onError(e)); + wait(tr.onError(e)); } } - state Standalone> blobRanges = wait(getBlobRanges(tr, range, rangeLimit)); + state Standalone> blobRanges = wait(getBlobRanges(&tr, range, rangeLimit)); if (!tenantName.present()) { return blobRanges; } From 48d6e725c2ddbb99053c6cd818901611e7319f67 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Tue, 18 Oct 2022 14:11:07 -0700 Subject: [PATCH 25/52] blob: convert listBlobbifiedRangesActor() to take a Transaction --- fdbclient/NativeAPI.actor.cpp | 135 ++++++++++++++++++---------------- 1 file changed, 73 insertions(+), 62 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 7e59f4324d..1c0a224d18 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -10540,6 +10540,76 @@ Reference DatabaseContext::createTransaction() { } // BlobGranule API. +ACTOR Future>> getBlobRanges(Transaction* tr, KeyRange range, int batchLimit) { + state Standalone> blobRanges; + state Key beginKey = range.begin; + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + state RangeResult results = wait( + krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2)); + + blobRanges.arena().dependsOn(results.arena()); + for (int i = 0; i < results.size() - 1; i++) { + if (results[i].value == blobRangeActive) { + blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key)); + } + if (blobRanges.size() == batchLimit) { + return blobRanges; + } + } + + if (!results.more) { + return blobRanges; + } + beginKey = results.back().key; + } catch (Error& e) { + wait(tr->onError(e)); + } + } +} + +ACTOR Future>> getBlobbifiedRanges(Transaction* tr, + KeyRange range, + int rangeLimit, + Optional tenantName) { + state TenantMapEntry tme; + + loop { + try { + if (tenantName.present()) { + wait(store(tme, blobGranuleGetTenantEntry(tr, range.begin, tenantName))); + range = range.withPrefix(tme.prefix); + } + break; + } catch (Error& e) { + wait(tr->onError(e)); + } + } + + state Standalone> blobRanges = wait(getBlobRanges(tr, range, rangeLimit)); + if (!tenantName.present()) { + return blobRanges; + } + + // Strip tenant prefix out. + state Standalone> tenantBlobRanges; + for (auto& blobRange : blobRanges) { + // Filter out blob ranges that span tenants for some reason. 
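+		// (i.e. a range whose begin and end do not both carry this tenant's prefix cannot be
+		// expressed in tenant-relative keyspace, so it is traced and skipped below)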
+ if (!blobRange.begin.startsWith(tme.prefix) || !blobRange.end.startsWith(tme.prefix)) { + TraceEvent("ListBlobbifiedRangeSpansTenants") + .suppressFor(/*seconds=*/5) + .detail("Tenant", tenantName.get()) + .detail("Range", blobRange); + continue; + } + tenantBlobRanges.push_back_deep(tenantBlobRanges.arena(), blobRange.removePrefix(tme.prefix)); + } + return tenantBlobRanges; +} + ACTOR Future purgeBlobGranulesActor(Reference db, KeyRange range, Version purgeVersion, @@ -10662,37 +10732,6 @@ Future DatabaseContext::waitPurgeGranulesComplete(Key purgeKey) { return waitPurgeGranulesCompleteActor(Reference::addRef(this), purgeKey); } -ACTOR Future>> getBlobRanges(Transaction* tr, KeyRange range, int batchLimit) { - state Standalone> blobRanges; - state Key beginKey = range.begin; - - loop { - try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - - state RangeResult results = wait( - krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2)); - - blobRanges.arena().dependsOn(results.arena()); - for (int i = 0; i < results.size() - 1; i++) { - if (results[i].value == blobRangeActive) { - blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key)); - } - if (blobRanges.size() == batchLimit) { - return blobRanges; - } - } - - if (!results.more) { - return blobRanges; - } - beginKey = results.back().key; - } catch (Error& e) { - wait(tr->onError(e)); - } - } -} - ACTOR Future setBlobRangeActor(Reference cx, KeyRange range, bool active, @@ -10762,41 +10801,13 @@ ACTOR Future>> listBlobbifiedRangesActor(Refer KeyRange range, int rangeLimit, Optional tenantName) { + state Database db(cx); state Transaction tr(db); - state TenantMapEntry tme; - loop { - try { - if (tenantName.present()) { - wait(store(tme, blobGranuleGetTenantEntry(&tr, range.begin, tenantName))); - range = range.withPrefix(tme.prefix); - } - break; - } catch (Error& e) { - wait(tr.onError(e)); - } - } + Standalone> blobbifiedRanges = wait(getBlobbifiedRanges(&tr, range, rangeLimit, tenantName)); - state Standalone> blobRanges = wait(getBlobRanges(&tr, range, rangeLimit)); - if (!tenantName.present()) { - return blobRanges; - } - - // Strip tenant prefix out. - state Standalone> tenantBlobRanges; - for (auto& blobRange : blobRanges) { - // Filter out blob ranges that span tenants for some reason. 
- if (!blobRange.begin.startsWith(tme.prefix) || !blobRange.end.startsWith(tme.prefix)) { - TraceEvent("ListBlobbifiedRangeSpansTenants") - .suppressFor(/*seconds=*/5) - .detail("Tenant", tenantName.get()) - .detail("Range", blobRange); - continue; - } - tenantBlobRanges.push_back_deep(tenantBlobRanges.arena(), blobRange.removePrefix(tme.prefix)); - } - return tenantBlobRanges; + return blobbifiedRanges; } Future>> DatabaseContext::listBlobbifiedRanges(KeyRange range, From 474955b10d11644232d05ed151e18ce95ec51610 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Tue, 18 Oct 2022 14:26:36 -0700 Subject: [PATCH 26/52] blob: allow purge as long as it doesn't start in a range --- fdbclient/NativeAPI.actor.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 1c0a224d18..e44f677ee0 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -10652,10 +10652,13 @@ ACTOR Future purgeBlobGranulesActor(Reference db, } // must be aligned to blob range(s) - state Future> beginPresent = tr.get(purgeRange.begin.withPrefix(blobRangeKeys.begin)); - state Future> endPresent = tr.get(purgeRange.end.withPrefix(blobRangeKeys.begin)); - wait(success(beginPresent) && success(endPresent)); - if (!beginPresent.get().present() || !endPresent.get().present()) { + state Future>> blobbifiedBegin = + getBlobbifiedRanges(&tr, KeyRangeRef(purgeRange.begin, purgeRange.begin), 2, {}); + state Future>> blobbifiedEnd = + getBlobbifiedRanges(&tr, KeyRangeRef(purgeRange.end, purgeRange.end), 2, {}); + wait(success(blobbifiedBegin) && success(blobbifiedEnd)); + if ((!blobbifiedBegin.get().empty() && blobbifiedBegin.get().front().begin < purgeRange.begin) || + (!blobbifiedEnd.get().empty() && blobbifiedEnd.get().back().end > purgeRange.end)) { TraceEvent("UnalignedPurge") .detail("Range", range) .detail("Version", purgeVersion) From 070e4c133e96104e2c5111bc5a8ac6c49d601fea Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Thu, 20 Oct 2022 23:01:38 -0700 Subject: [PATCH 27/52] blob/testing: remove setRange() and call (un)blobbifyRange() directly This also fixes a few wrong setRange(true/false). --- .../BlobGranuleRangesWorkload.actor.cpp | 74 ++++++++----------- 1 file changed, 30 insertions(+), 44 deletions(-) diff --git a/fdbserver/workloads/BlobGranuleRangesWorkload.actor.cpp b/fdbserver/workloads/BlobGranuleRangesWorkload.actor.cpp index 8ac26ccf61..e94a134ae7 100644 --- a/fdbserver/workloads/BlobGranuleRangesWorkload.actor.cpp +++ b/fdbserver/workloads/BlobGranuleRangesWorkload.actor.cpp @@ -105,16 +105,6 @@ struct BlobGranuleRangesWorkload : TestWorkload { } } - ACTOR Future setRange(Database cx, KeyRange range, bool active, Optional tenantName) { - if (active) { - bool success = wait(cx->blobbifyRange(range, tenantName)); - return success; - } else { - bool success = wait(cx->unblobbifyRange(range, tenantName)); - return success; - } - } - ACTOR Future registerNewRange(Database cx, BlobGranuleRangesWorkload* self, Optional tenantName) { std::string nextRangeKey = "R_" + self->newKey(); state KeyRange range(KeyRangeRef(StringRef(nextRangeKey), strinc(StringRef(nextRangeKey)))); @@ -124,8 +114,7 @@ struct BlobGranuleRangesWorkload : TestWorkload { // don't put in active ranges until AFTER set range command succeeds, to avoid checking a range that maybe // wasn't initialized - bool success = - wait(self->setRange(cx, range, true, tenantName.present() ? 
tenantName.get() : self->tenantName)); + bool success = wait(cx->blobbifyRange(range, tenantName.present() ? tenantName.get() : self->tenantName)); ASSERT(success); if (BGRW_DEBUG) { @@ -163,7 +152,7 @@ struct BlobGranuleRangesWorkload : TestWorkload { Key purgeKey = wait(self->versionedForcePurge(cx, range, self->tenantName)); wait(cx->waitPurgeGranulesComplete(purgeKey)); } - bool success = wait(self->setRange(cx, range, false, self->tenantName)); + bool success = wait(cx->unblobbifyRange(range, self->tenantName)); ASSERT(success); if (BGRW_DEBUG) { @@ -356,7 +345,7 @@ struct BlobGranuleRangesWorkload : TestWorkload { // tear down range at end Key purgeKey = wait(self->versionedForcePurge(cx, range, self->tenantName)); wait(cx->waitPurgeGranulesComplete(purgeKey)); - bool success = wait(self->setRange(cx, range, false, self->tenantName)); + bool success = wait(cx->unblobbifyRange(range, self->tenantName)); ASSERT(success); if (BGRW_DEBUG) { @@ -373,7 +362,7 @@ struct BlobGranuleRangesWorkload : TestWorkload { if (BGRW_DEBUG) { fmt::print("VerifyRangeUnit: [{0} - {1})\n", range.begin.printable(), range.end.printable()); } - bool setSuccess = wait(self->setRange(cx, activeRange, true, self->tenantName)); + bool setSuccess = wait(cx->blobbifyRange(activeRange, self->tenantName)); ASSERT(setSuccess); wait(self->checkRange(cx, self, activeRange, true)); @@ -426,7 +415,7 @@ struct BlobGranuleRangesWorkload : TestWorkload { for (i = 0; i < rangeCount; i++) { state KeyRange subRange(KeyRangeRef(boundaries[i], boundaries[i + 1])); if (i != rangeToNotBlobbify) { - bool setSuccess = wait(self->setRange(cx, subRange, true, self->tenantName)); + bool setSuccess = wait(cx->blobbifyRange(subRange, self->tenantName)); ASSERT(setSuccess); wait(self->checkRange(cx, self, subRange, true)); } else { @@ -473,7 +462,7 @@ struct BlobGranuleRangesWorkload : TestWorkload { } ACTOR Future rangesMisalignedUnit(Database cx, BlobGranuleRangesWorkload* self, KeyRange range) { - bool setSuccess = wait(self->setRange(cx, range, true, self->tenantName)); + bool setSuccess = wait(cx->blobbifyRange(range, self->tenantName)); ASSERT(setSuccess); state KeyRange subRange(KeyRangeRef(range.begin.withSuffix("A"_sr), range.begin.withSuffix("B"_sr))); @@ -526,42 +515,42 @@ struct BlobGranuleRangesWorkload : TestWorkload { // unblobbifying range that already doesn't exist should be no-op if (deterministicRandom()->coinflip()) { - bool unblobbifyStartSuccess = wait(self->setRange(cx, activeRange, false, self->tenantName)); + bool unblobbifyStartSuccess = wait(cx->blobbifyRange(activeRange, self->tenantName)); ASSERT(unblobbifyStartSuccess); } - bool success = wait(self->setRange(cx, activeRange, true, self->tenantName)); + bool success = wait(cx->blobbifyRange(activeRange, self->tenantName)); ASSERT(success); wait(self->checkRange(cx, self, activeRange, true)); // check that re-blobbifying same range is successful - bool retrySuccess = wait(self->setRange(cx, activeRange, true, self->tenantName)); + bool retrySuccess = wait(cx->blobbifyRange(activeRange, self->tenantName)); ASSERT(retrySuccess); wait(self->checkRange(cx, self, activeRange, true)); // check that blobbifying range that overlaps but does not match existing blob range fails - bool fail1 = wait(self->setRange(cx, range, true, self->tenantName)); + bool fail1 = wait(cx->blobbifyRange(range, self->tenantName)); ASSERT(!fail1); - bool fail2 = wait(self->setRange(cx, KeyRangeRef(range.begin, activeRange.end), true, self->tenantName)); + bool fail2 = 
wait(cx->blobbifyRange(KeyRangeRef(range.begin, activeRange.end), self->tenantName)); ASSERT(!fail2); - bool fail3 = wait(self->setRange(cx, KeyRangeRef(activeRange.begin, range.end), true, self->tenantName)); + bool fail3 = wait(cx->blobbifyRange(KeyRangeRef(activeRange.begin, range.end), self->tenantName)); ASSERT(!fail3); - bool fail4 = wait(self->setRange(cx, KeyRangeRef(range.begin, middleKey), true, self->tenantName)); + bool fail4 = wait(cx->blobbifyRange(KeyRangeRef(range.begin, middleKey), self->tenantName)); ASSERT(!fail4); - bool fail5 = wait(self->setRange(cx, KeyRangeRef(middleKey, range.end), true, self->tenantName)); + bool fail5 = wait(cx->blobbifyRange(KeyRangeRef(middleKey, range.end), self->tenantName)); ASSERT(!fail5); - bool fail6 = wait(self->setRange(cx, KeyRangeRef(activeRange.begin, middleKey), true, self->tenantName)); + bool fail6 = wait(cx->blobbifyRange(KeyRangeRef(activeRange.begin, middleKey), self->tenantName)); ASSERT(!fail6); - bool fail7 = wait(self->setRange(cx, KeyRangeRef(middleKey, activeRange.end), true, self->tenantName)); + bool fail7 = wait(cx->blobbifyRange(KeyRangeRef(middleKey, activeRange.end), self->tenantName)); ASSERT(!fail7); - bool fail8 = wait(self->setRange(cx, KeyRangeRef(middleKey, middleKey2), true, self->tenantName)); + bool fail8 = wait(cx->blobbifyRange(KeyRangeRef(middleKey, middleKey2), self->tenantName)); ASSERT(!fail8); { @@ -600,41 +589,38 @@ struct BlobGranuleRangesWorkload : TestWorkload { ASSERT(blobRanges.size() == 1); ASSERT(blobRanges[0] == activeRange); - bool unblobbifyFail1 = wait(self->setRange(cx, range, false, self->tenantName)); + bool unblobbifyFail1 = wait(cx->unblobbifyRange(range, self->tenantName)); ASSERT(!unblobbifyFail1); bool unblobbifyFail2 = - wait(self->setRange(cx, KeyRangeRef(range.begin, activeRange.end), false, self->tenantName)); + wait(cx->unblobbifyRange(KeyRangeRef(range.begin, activeRange.end), self->tenantName)); ASSERT(!unblobbifyFail2); bool unblobbifyFail3 = - wait(self->setRange(cx, KeyRangeRef(activeRange.begin, range.end), false, self->tenantName)); + wait(cx->unblobbifyRange(KeyRangeRef(activeRange.begin, range.end), self->tenantName)); ASSERT(!unblobbifyFail3); bool unblobbifyFail4 = - wait(self->setRange(cx, KeyRangeRef(activeRange.begin, middleKey), false, self->tenantName)); + wait(cx->unblobbifyRange(KeyRangeRef(activeRange.begin, middleKey), self->tenantName)); ASSERT(!unblobbifyFail4); - bool unblobbifyFail5 = - wait(self->setRange(cx, KeyRangeRef(middleKey, activeRange.end), false, self->tenantName)); + bool unblobbifyFail5 = wait(cx->unblobbifyRange(KeyRangeRef(middleKey, activeRange.end), self->tenantName)); ASSERT(!unblobbifyFail5); bool unblobbifyFail6 = - wait(self->setRange(cx, KeyRangeRef(activeRange.begin, middleKey), false, self->tenantName)); + wait(cx->unblobbifyRange(KeyRangeRef(activeRange.begin, middleKey), self->tenantName)); ASSERT(!unblobbifyFail6); - bool unblobbifyFail7 = - wait(self->setRange(cx, KeyRangeRef(middleKey, activeRange.end), false, self->tenantName)); + bool unblobbifyFail7 = wait(cx->unblobbifyRange(KeyRangeRef(middleKey, activeRange.end), self->tenantName)); ASSERT(!unblobbifyFail7); - bool unblobbifyFail8 = - wait(self->setRange(cx, KeyRangeRef(middleKey, middleKey2), false, self->tenantName)); + bool unblobbifyFail8 = wait(cx->unblobbifyRange(KeyRangeRef(middleKey, middleKey2), self->tenantName)); ASSERT(!unblobbifyFail8); - bool unblobbifySuccess = wait(self->setRange(cx, activeRange, true, self->tenantName)); + bool unblobbifySuccess = 
wait(cx->unblobbifyRange(activeRange, self->tenantName));
 		ASSERT(unblobbifySuccess);
 
-		bool unblobbifySuccessAgain = wait(self->setRange(cx, activeRange, true, self->tenantName));
+		bool unblobbifySuccessAgain = wait(cx->unblobbifyRange(activeRange, self->tenantName));
 		ASSERT(unblobbifySuccessAgain);
 	}
 
@@ -642,7 +628,7 @@
 	}
 
 	ACTOR Future<Void> reBlobbifyUnit(Database cx, BlobGranuleRangesWorkload* self, KeyRange range) {
-		bool setSuccess = wait(self->setRange(cx, range, true, self->tenantName));
+		bool setSuccess = wait(cx->blobbifyRange(range, self->tenantName));
 		ASSERT(setSuccess);
 		wait(self->checkRange(cx, self, range, true));
 
@@ -651,11 +637,11 @@
 		wait(cx->waitPurgeGranulesComplete(purgeKey));
 		wait(self->checkRange(cx, self, range, false));
 
-		bool unsetSuccess = wait(self->setRange(cx, range, false, self->tenantName));
+		bool unsetSuccess = wait(cx->unblobbifyRange(range, self->tenantName));
 		ASSERT(unsetSuccess);
 		wait(self->checkRange(cx, self, range, false));
 
-		bool reSetSuccess = wait(self->setRange(cx, range, true, self->tenantName));
+		bool reSetSuccess = wait(cx->blobbifyRange(range, self->tenantName));
 		ASSERT(reSetSuccess);
 		wait(self->checkRange(cx, self, range, true));

From 136a325fdc6c1463a8f075759e2a970eea011c10 Mon Sep 17 00:00:00 2001
From: Dennis Zhou
Date: Thu, 20 Oct 2022 23:07:21 -0700
Subject: [PATCH 28/52] blob/testing: randomly purge the whole range instead of just active

---
 fdbserver/workloads/BlobGranuleRangesWorkload.actor.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fdbserver/workloads/BlobGranuleRangesWorkload.actor.cpp b/fdbserver/workloads/BlobGranuleRangesWorkload.actor.cpp
index e94a134ae7..7bb58b77b2 100644
--- a/fdbserver/workloads/BlobGranuleRangesWorkload.actor.cpp
+++ b/fdbserver/workloads/BlobGranuleRangesWorkload.actor.cpp
@@ -571,13 +571,14 @@ struct BlobGranuleRangesWorkload : TestWorkload {
 			}
 		}
 
-		// tear down + check that un-blobbifying at a non-aligned range also doesn't work
-		Key purgeKey = wait(self->versionedForcePurge(cx, activeRange, self->tenantName));
+		state Version purgeVersion = deterministicRandom()->coinflip() ? latestVersion : 1;
+		state KeyRangeRef purgeRange = deterministicRandom()->coinflip() ? activeRange : range;
+		Key purgeKey = wait(cx->purgeBlobGranules(purgeRange, purgeVersion, self->tenantName, true));
 		wait(cx->waitPurgeGranulesComplete(purgeKey));
 		if (deterministicRandom()->coinflip()) {
 			// force purge again and ensure it is idempotent
-			Key purgeKeyAgain = wait(cx->purgeBlobGranules(activeRange, 1, self->tenantName, true));
+			Key purgeKeyAgain = wait(cx->purgeBlobGranules(purgeRange, purgeVersion, self->tenantName, true));
 			wait(cx->waitPurgeGranulesComplete(purgeKeyAgain));
 		}
 	}

From a8f821e1529a9118f27fc070ad2301ea4ddb0d92 Mon Sep 17 00:00:00 2001
From: Jingyu Zhou
Date: Mon, 24 Oct 2022 10:33:30 -0700
Subject: [PATCH 29/52] Fix stack overflows

The loop is transformed by the actor compiler into recursions that may cause
stack overflows. Thus, I added yield() to unwind the stack and refactored the
parsing code so that subsequent files are blocked until previous ones have
finished.
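A minimal illustration of the pattern, with hypothetical names: an actor loop that can `continue` many times without reaching a wait() accumulates recursive continuation frames, and an occasional wait(yield()) returns control to the run loop so the stack can unwind.

ACTOR Future<Void> skipManyRecords(int total) {
	state int i = 0;
	for (; i < total; ++i) {
		// filtering work that usually skips the record would go here
		wait(yield()); // re-enter through the scheduler, bounding the recursion depth
	}
	return Void();
}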
---
 fdbserver/RestoreLoader.actor.cpp | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp
index 964045c153..1786e055b9 100644
--- a/fdbserver/RestoreLoader.actor.cpp
+++ b/fdbserver/RestoreLoader.actor.cpp
@@ -405,10 +405,6 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
 	    .detail("Offset", asset.offset)
 	    .detail("Length", asset.len);
 
-	// Ensure data blocks in the same file are processed in order
-	wait(processedFileOffset->whenAtLeast(asset.offset));
-	ASSERT(processedFileOffset->get() == asset.offset);
-
 	state Arena tempArena;
 	state StringRefReader reader(buf, restore_corrupted_data());
 	try {
@@ -430,8 +426,9 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
 			const uint8_t* message = reader.consume(msgSize);
 
 			// Skip mutations out of the version range
-			if (!asset.isInVersionRange(msgVersion.version))
+			if (!asset.isInVersionRange(msgVersion.version)) {
 				continue;
+			}
 
 			state VersionedMutationsMap::iterator it;
 			bool inserted;
@@ -452,6 +449,7 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
 				// Skip mutation whose commitVersion < range kv's version
 				if (logMutationTooOld(pRangeVersions, mutation, msgVersion.version)) {
 					cc->oldLogMutations += 1;
+					wait(yield()); // avoid potential stack overflows
 					continue;
 				}
 
@@ -459,6 +457,7 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
 				if (mutation.param1 >= asset.range.end ||
 				    (isRangeMutation(mutation) && mutation.param2 < asset.range.begin) ||
 				    (!isRangeMutation(mutation) && mutation.param1 < asset.range.begin)) {
+					wait(yield()); // avoid potential stack overflows
 					continue;
 				}
 
@@ -509,7 +508,6 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
 		    .detail("BlockLen", asset.len);
 		throw;
 	}
-	processedFileOffset->set(asset.offset + asset.len);
 	return Void();
 }
 
@@ -526,8 +524,19 @@ ACTOR static Future<Void> parsePartitionedLogFileOnLoader(
 	state int readFileRetries = 0;
 	loop {
 		try {
+			// Ensure data blocks in the same file are processed in order
+			wait(processedFileOffset->whenAtLeast(asset.offset));
+			ASSERT(processedFileOffset->get() == asset.offset);
+
 			wait(_parsePartitionedLogFileOnLoader(
 			    pRangeVersions, processedFileOffset, kvOpsIter, samplesIter, cc, bc, asset, cx));
+			processedFileOffset->set(asset.offset + asset.len);
+
+			TraceEvent("FastRestoreLoaderDecodingLogFileDone")
+			    .detail("BatchIndex", asset.batchIndex)
+			    .detail("Filename", asset.filename)
+			    .detail("Offset", asset.offset)
+			    .detail("Length", asset.len);
 			break;
 		} catch (Error& e) {
 			if (e.code() == error_code_restore_bad_read || e.code() == error_code_restore_unsupported_file_version ||

From db72a29c06c37163018326ffc42fccbcae5726c1 Mon Sep 17 00:00:00 2001
From: Xiaoxi Wang
Date: Mon, 24 Oct 2022 11:16:23 -0700
Subject: [PATCH 30/52] fix compile error after rebase

---
 fdbserver/include/fdbserver/IKeyValueStore.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdbserver/include/fdbserver/IKeyValueStore.h b/fdbserver/include/fdbserver/IKeyValueStore.h
index 3069527d7a..22fb03ef73 100644
--- a/fdbserver/include/fdbserver/IKeyValueStore.h
+++ b/fdbserver/include/fdbserver/IKeyValueStore.h
@@ -29,7 +29,7 @@
 #include "fdbserver/IClosable.h"
 #include "fdbserver/IPageEncryptionKeyProvider.actor.h"
 #include "fdbserver/ServerDBInfo.h"
-#include "fdbserver/StorageMetrics.h"
+#include "fdbserver/StorageMetrics.actor.h"
 
 struct CheckpointRequest {
 	const Version version; // The FDB version at which the checkpoint is created.
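The ordering idiom that the RestoreLoader change above hoists into the retry loop, reduced to a sketch with hypothetical names: a shared NotifiedVersion acts as a ticket lock keyed by file offset, so blocks parsed by concurrent actors still complete in file order.

ACTOR Future<Void> processBlockInOrder(NotifiedVersion* processed, int64_t offset, int64_t len) {
	wait(processed->whenAtLeast(offset)); // wait until every earlier block has finished
	// ... parse the block at [offset, offset + len) ...
	processed->set(offset + len); // release the next block in the file
	return Void();
}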
From f2289ced27cea5ecd2de15374b6df948274c1dd5 Mon Sep 17 00:00:00 2001 From: Hui Liu Date: Wed, 12 Oct 2022 15:25:37 -0700 Subject: [PATCH 31/52] Add StorageServerInterface for BlobMigrator --- fdbclient/BlobGranuleReader.actor.cpp | 3 +- fdbclient/include/fdbclient/FDBTypes.h | 7 +- fdbserver/BlobManager.actor.cpp | 2 +- fdbserver/BlobManifest.actor.cpp | 105 +++--- fdbserver/BlobMigrator.actor.cpp | 298 ++++++++++++++++-- fdbserver/BlobWorker.actor.cpp | 2 +- fdbserver/ClusterController.actor.cpp | 5 +- .../fdbserver/BlobGranuleServerCommon.actor.h | 18 ++ .../include/fdbserver/BlobMigratorInterface.h | 14 +- fdbserver/storageserver.actor.cpp | 26 +- fdbserver/worker.actor.cpp | 20 +- 11 files changed, 398 insertions(+), 102 deletions(-) diff --git a/fdbclient/BlobGranuleReader.actor.cpp b/fdbclient/BlobGranuleReader.actor.cpp index 9ba1ccffdb..09c2f5a050 100644 --- a/fdbclient/BlobGranuleReader.actor.cpp +++ b/fdbclient/BlobGranuleReader.actor.cpp @@ -142,7 +142,6 @@ bool isRangeFullyCovered(KeyRange range, Standalone& ranges) { ASSERT(std::is_sorted(ranges.begin(), ranges.end(), KeyRangeRef::ArbitraryOrder())); KeyRangeRef clone(begin, end); + for (auto r : ranges) { - if (begin < r.begin) + if (clone.begin < r.begin) return false; // uncovered gap between clone.begin and r.begin - if (end <= r.end) + if (clone.end <= r.end) return true; // range is fully covered - if (end > r.begin) + if (clone.end > r.begin) // {clone.begin, r.end} is covered. need to check coverage for {r.end, clone.end} clone = KeyRangeRef(r.end, clone.end); } diff --git a/fdbserver/BlobManager.actor.cpp b/fdbserver/BlobManager.actor.cpp index ad6051b602..a571b52842 100644 --- a/fdbserver/BlobManager.actor.cpp +++ b/fdbserver/BlobManager.actor.cpp @@ -3537,7 +3537,7 @@ ACTOR Future recoverBlobManager(Reference bmData) { } // skip the rest of the algorithm for the first blob manager - if (bmData->epoch == 1) { + if (bmData->epoch == 1 && !isFullRestoreMode()) { bmData->doneRecovering.send(Void()); return Void(); } diff --git a/fdbserver/BlobManifest.actor.cpp b/fdbserver/BlobManifest.actor.cpp index 7e64130234..e85d774a67 100644 --- a/fdbserver/BlobManifest.actor.cpp +++ b/fdbserver/BlobManifest.actor.cpp @@ -26,6 +26,7 @@ #include "fdbclient/BlobGranuleCommon.h" #include "fdbserver/Knobs.h" #include "flow/FastRef.h" +#include "flow/Trace.h" #include "flow/flow.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/BlobConnectionProvider.h" @@ -189,23 +190,6 @@ private: static const int sMaxCount_{ 5 }; // max number of manifest file to keep }; -// Defines granule info that interests full restore -struct BlobGranuleVersion { - // Two constructors required by VectorRef - BlobGranuleVersion() {} - BlobGranuleVersion(Arena& a, const BlobGranuleVersion& copyFrom) - : granuleID(copyFrom.granuleID), keyRange(a, copyFrom.keyRange), version(copyFrom.version), - sizeInBytes(copyFrom.sizeInBytes) {} - - UID granuleID; - KeyRangeRef keyRange; - Version version; - int64_t sizeInBytes; -}; - -// Defines a vector for BlobGranuleVersion -typedef Standalone> BlobGranuleVersionVector; - // Defines filename, version, size for each granule file that interests full restore struct GranuleFileVersion { Version version; @@ -226,16 +210,53 @@ public: Value data = wait(readFromFile(self)); Standalone manifest = decode(data); wait(writeSystemKeys(self, manifest.rows)); - BlobGranuleVersionVector _ = wait(listGranules(self)); + BlobGranuleRestoreVersionVector _ = wait(listGranules(self)); } catch (Error& e) { dprint("WARNING: 
unexpected manifest loader error {}\n", e.what()); // skip error handling so far } return Void(); } + // Iterate active granules and return their version/sizes + ACTOR static Future listGranules(Reference self) { + state Transaction tr(self->db_); + loop { + state BlobGranuleRestoreVersionVector results; + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + + try { + std::vector granules; + state int i = 0; + auto limit = GetRangeLimits::BYTE_LIMIT_UNLIMITED; + state RangeResult blobRanges = wait(tr.getRange(blobGranuleMappingKeys, limit)); + for (i = 0; i < blobRanges.size() - 1; i++) { + Key startKey = blobRanges[i].key.removePrefix(blobGranuleMappingKeys.begin); + Key endKey = blobRanges[i + 1].key.removePrefix(blobGranuleMappingKeys.begin); + state KeyRange granuleRange = KeyRangeRef(startKey, endKey); + try { + Standalone granule = wait(getGranule(&tr, granuleRange)); + results.push_back_deep(results.arena(), granule); + } catch (Error& e) { + if (e.code() == error_code_restore_missing_data) { + dprint("missing data for key range {} \n", granuleRange.toString()); + TraceEvent("BlobRestoreMissingData").detail("KeyRange", granuleRange.toString()); + } else { + throw; + } + } + } + return results; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + } + // Print out a summary for blob granules ACTOR static Future print(Reference self) { - state BlobGranuleVersionVector granules = wait(listGranules(self)); + state BlobGranuleRestoreVersionVector granules = wait(listGranules(self)); for (auto granule : granules) { wait(checkGranuleFiles(self, granule)); } @@ -285,41 +306,9 @@ private: } } - // Iterate active granules and return their version/sizes - ACTOR static Future listGranules(Reference self) { - state Transaction tr(self->db_); - loop { - state BlobGranuleVersionVector results; - tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); - tr.setOption(FDBTransactionOptions::LOCK_AWARE); - - try { - std::vector granules; - state int i = 0; - auto limit = GetRangeLimits::BYTE_LIMIT_UNLIMITED; - state RangeResult blobRanges = wait(tr.getRange(blobGranuleMappingKeys, limit)); - for (i = 0; i < blobRanges.size() - 1; i++) { - Key startKey = blobRanges[i].key.removePrefix(blobGranuleMappingKeys.begin); - Key endKey = blobRanges[i + 1].key.removePrefix(blobGranuleMappingKeys.begin); - state KeyRange granuleRange = KeyRangeRef(startKey, endKey); - try { - Standalone granule = wait(getGranule(&tr, granuleRange)); - results.push_back_deep(results.arena(), granule); - } catch (Error& e) { - dprint("missing data for key range {} \n", granuleRange.toString()); - } - } - return results; - } catch (Error& e) { - wait(tr.onError(e)); - } - } - } - // Find the newest granule for a key range. 
The newest granule has the max version and relevant files - ACTOR static Future> getGranule(Transaction* tr, KeyRangeRef range) { - state Standalone granuleVersion; + ACTOR static Future> getGranule(Transaction* tr, KeyRangeRef range) { + state Standalone granuleVersion; KeyRange historyKeyRange = blobGranuleHistoryKeyRangeFor(range); // reverse lookup so that the first row is the newest version state RangeResult results = @@ -389,7 +378,7 @@ private: } // Read data from granules and print out summary - ACTOR static Future checkGranuleFiles(Reference self, BlobGranuleVersion granule) { + ACTOR static Future checkGranuleFiles(Reference self, BlobGranuleRestoreVersion granule) { state KeyRangeRef range = granule.keyRange; state Version readVersion = granule.version; state Transaction tr(self->db_); @@ -441,3 +430,11 @@ ACTOR Future printRestoreSummary(Database db, Reference listBlobGranules(Database db, + Reference blobConn) { + Reference loader = makeReference(db, blobConn); + BlobGranuleRestoreVersionVector result = wait(BlobManifestLoader::listGranules(loader)); + return result; +} diff --git a/fdbserver/BlobMigrator.actor.cpp b/fdbserver/BlobMigrator.actor.cpp index 2044b093d4..9be19fa6a4 100644 --- a/fdbserver/BlobMigrator.actor.cpp +++ b/fdbserver/BlobMigrator.actor.cpp @@ -30,54 +30,312 @@ #include "fdbclient/KeyRangeMap.h" #include "fdbclient/SystemData.h" #include "fdbclient/NativeAPI.actor.h" +#include "fdbclient/ManagementAPI.actor.h" #include "fdbserver/ServerDBInfo.actor.h" #include "fdbserver/WaitFailure.h" - +#include "fdbserver/MoveKeys.actor.h" +#include "fdbserver/BlobGranuleServerCommon.actor.h" #include "flow/actorcompiler.h" // has to be last include +#include "flow/network.h" +#include +#include + +#define ENABLE_DEBUG_MG true + +template +static inline void dprint(fmt::format_string fmt, T&&... args) { + if (ENABLE_DEBUG_MG) + fmt::print(fmt, std::forward(args)...); +} // BlobMigrator manages data migration from blob storage to storage server. It implements a minimal set of // StorageServerInterface APIs which are needed for DataDistributor to start data migration. class BlobMigrator : public NonCopyable, public ReferenceCounted { public: BlobMigrator(Reference const> dbInfo, BlobMigratorInterface interf) - : blobMigratorInterf(interf), actors(false) { - if (!blobConn.isValid() && SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") { - blobConn = BlobConnectionProvider::newBlobConnectionProvider(SERVER_KNOBS->BG_URL); + : interf_(interf), actors_(false) { + if (!blobConn_.isValid() && SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") { + blobConn_ = BlobConnectionProvider::newBlobConnectionProvider(SERVER_KNOBS->BG_URL); } - db = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True); + db_ = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True); } ~BlobMigrator() {} + // Start migration ACTOR static Future start(Reference self) { - self->actors.add(waitFailureServer(self->blobMigratorInterf.waitFailure.getFuture())); + if (!isFullRestoreMode()) { + return Void(); + } + wait(delay(10)); // TODO need to wait for a signal for readiness of blob manager + + BlobGranuleRestoreVersionVector granules = wait(listBlobGranules(self->db_, self->blobConn_)); + self->blobGranules_ = granules; + + wait(prepare(self, normalKeys)); + + wait(serverLoop(self)); + return Void(); + } + +private: + // Prepare for data migration for given key range. 
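+	// The sequence below: addStorageServer() registers this migrator's storage server
+	// interface so DataDistributor will treat it as a source; setDDMode(0) pauses data
+	// distribution while keyServers/serverKeys are rewritten to point the range at the
+	// migrator; restoring the previous mode then lets data movement begin.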
+	ACTOR static Future<Void> prepare(Reference<BlobMigrator> self, KeyRangeRef keys) {
+		// Register as a storage server, so that DataDistributor can start data movement afterwards
+		std::pair<Version, Tag> verAndTag = wait(addStorageServer(self->db_, self->interf_.ssi));
+		dprint("Started storage server interface {} {}\n", verAndTag.first, verAndTag.second.toString());
+
+		// Reassign key ranges to the storage server.
+		// This restarts DataDistributor so that internal data structures like ShardTracker and
+		// ShardsAffectedByTeamFailure are re-initialized. Ideally this would happen inside
+		// DataDistributor itself, so that no restart would be needed.
+		state int oldMode = wait(setDDMode(self->db_, 0));
+		wait(unassignServerKeys(self, keys));
+		wait(assignKeysToServer(self, keys, self->interf_.ssi.id()));
+		wait(success(setDDMode(self->db_, oldMode)));
+		return Void();
+	}
+
+	// Assign the given key range to the specified storage server. Subsequent data movement
+	// is then driven by DataDistributor.
+	ACTOR static Future<Void> assignKeysToServer(Reference<BlobMigrator> self, KeyRangeRef keys, UID serverUID) {
+		state Transaction tr(self->db_);
 		loop {
-			choose {
-				when(HaltBlobMigratorRequest req = waitNext(self->blobMigratorInterf.haltBlobMigrator.getFuture())) {
-					req.reply.send(Void());
-					TraceEvent("BlobMigratorHalted", self->blobMigratorInterf.id()).detail("ReqID", req.requesterID);
-					break;
+			tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
+			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
+			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
+			try {
+				state Value value = keyServersValue(std::vector<UID>({ serverUID }), std::vector<UID>(), UID(), UID());
+				wait(krmSetRange(&tr, keyServersPrefix, keys, value));
+				wait(krmSetRange(&tr, serverKeysPrefixFor(serverUID), keys, serverKeysTrue));
+				wait(tr.commit());
+				dprint("Assign {} to server {}\n", keys.toString(), serverUID.toString());
+				return Void();
+			} catch (Error& e) {
+				wait(tr.onError(e));
+			}
+		}
+	}
+
+	// Unassign given key range from its current storage servers
+	ACTOR static Future<Void> unassignServerKeys(Reference<BlobMigrator> self, KeyRangeRef keys) {
+		state Transaction tr(self->db_);
+		loop {
+			tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
+			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
+			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
+			try {
+				state RangeResult serverList = wait(tr.getRange(serverListKeys, CLIENT_KNOBS->TOO_MANY));
+				ASSERT(!serverList.more && serverList.size() < CLIENT_KNOBS->TOO_MANY);
+				for (auto& server : serverList) {
+					state UID id = decodeServerListValue(server.value).id();
+					RangeResult ranges = wait(krmGetRanges(&tr, serverKeysPrefixFor(id), keys));
+					bool owning = false;
+					for (auto& r : ranges) {
+						if (r.value == serverKeysTrue) {
+							owning = true;
+							break;
+						}
+					}
+					if (owning) {
+						dprint("Unassign {} from storage server {}\n", keys.toString(), id.toString());
+						wait(krmSetRange(&tr, serverKeysPrefixFor(id), keys, serverKeysFalse));
+					}
 				}
+				wait(tr.commit());
+				return Void();
+			} catch (Error& e) {
+				wait(tr.onError(e));
+			}
+		}
+	}
+
+	// Main server loop
+	ACTOR static Future<Void> serverLoop(Reference<BlobMigrator> self) {
+		self->actors_.add(waitFailureServer(self->interf_.ssi.waitFailure.getFuture()));
+		self->actors_.add(handleRequest(self));
+		self->actors_.add(handleUnsupportedRequest(self));
+		loop {
+			try {
+				choose {
+					when(HaltBlobMigratorRequest req = waitNext(self->interf_.haltBlobMigrator.getFuture())) {
+						req.reply.send(Void());
+						TraceEvent("BlobMigratorHalted", self->interf_.id()).detail("ReqID", req.requesterID);
+						break;
+					}
when(wait(self->actors_.getResult())) {} + } + } catch (Error& e) { + dprint("Unexpected serverLoop error {}\n", e.what()); + throw; } } return Void(); } + // Handle StorageServerInterface APIs + ACTOR static Future handleRequest(Reference self) { + state StorageServerInterface ssi = self->interf_.ssi; + loop { + try { + choose { + when(GetShardStateRequest req = waitNext(ssi.getShardState.getFuture())) { + dprint("Handle GetShardStateRequest\n"); + Version version = maxVersion(self); + GetShardStateReply rep(version, version); + req.reply.send(rep); // return empty shards + } + when(WaitMetricsRequest req = waitNext(ssi.waitMetrics.getFuture())) { + // dprint("Handle WaitMetricsRequest\n"); + self->actors_.add(processWaitMetricsRequest(self, req)); + } + when(SplitMetricsRequest req = waitNext(ssi.splitMetrics.getFuture())) { + dprint("Handle SplitMetrics {}\n", req.keys.toString()); + SplitMetricsReply rep; + for (auto granule : self->blobGranules_) { + // TODO: Use granule boundary as split point. A better approach is to split by size + if (granule.keyRange.begin > req.keys.begin && granule.keyRange.end < req.keys.end) + rep.splits.push_back_deep(rep.splits.arena(), granule.keyRange.begin); + } + req.reply.send(rep); + } + when(GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) { + fmt::print("Handle GetStorageMetrics\n"); + StorageMetrics metrics; + metrics.bytes = sizeInBytes(self); + GetStorageMetricsReply resp; + resp.load = metrics; + req.reply.send(resp); + } + when(ReplyPromise reply = waitNext(ssi.getKeyValueStoreType.getFuture())) { + dprint("Handle KeyValueStoreType\n"); + reply.send(KeyValueStoreType::MEMORY); + } + } + } catch (Error& e) { + dprint("Unexpected blob migrator request error {}\n", e.what()); + throw; + } + } + } + + // Handle StorageServerInterface APIs that are not supported. 
// Simply log and return an error.
+	ACTOR static Future<Void> handleUnsupportedRequest(Reference<BlobMigrator> self) {
+		state StorageServerInterface ssi = self->interf_.ssi;
+		loop {
+			try {
+				choose {
+					when(SplitRangeRequest req = waitNext(ssi.getRangeSplitPoints.getFuture())) {
+						dprint("Unsupported SplitRangeRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(StorageQueuingMetricsRequest req = waitNext(ssi.getQueuingMetrics.getFuture())) {
+						self->actors_.add(processStorageQueuingMetricsRequest(req));
+					}
+					when(ReadHotSubRangeRequest req = waitNext(ssi.getReadHotRanges.getFuture())) {
+						dprint("Unsupported ReadHotSubRange\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(GetKeyValuesStreamRequest req = waitNext(ssi.getKeyValuesStream.getFuture())) {
+						dprint("Unsupported GetKeyValuesStreamRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(GetKeyRequest req = waitNext(ssi.getKey.getFuture())) {
+						dprint("Unsupported GetKeyRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(GetKeyValuesRequest req = waitNext(ssi.getKeyValues.getFuture())) {
+						/* dprint("Unsupported GetKeyValuesRequest {} - {} @ {}\n",
+						   req.begin.getKey().printable(),
+						   req.end.getKey().printable(),
+						   req.version); */
+						req.reply.sendError(unsupported_operation());
+					}
+					when(GetValueRequest req = waitNext(ssi.getValue.getFuture())) {
+						dprint("Unsupported GetValueRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(GetCheckpointRequest req = waitNext(ssi.checkpoint.getFuture())) {
+						dprint("Unsupported GetCheckpoint\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(FetchCheckpointRequest req = waitNext(ssi.fetchCheckpoint.getFuture())) {
+						dprint("Unsupported FetchCheckpointRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(UpdateCommitCostRequest req = waitNext(ssi.updateCommitCostRequest.getFuture())) {
+						dprint("Unsupported UpdateCommitCostRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+					when(FetchCheckpointKeyValuesRequest req = waitNext(ssi.fetchCheckpointKeyValues.getFuture())) {
+						dprint("Unsupported FetchCheckpointKeyValuesRequest\n");
+						req.reply.sendError(unsupported_operation());
+					}
+				}
+			} catch (Error& e) {
+				dprint("Unexpected request handling error {}\n", e.what());
+				throw;
+			}
+		}
+	}
+
+	ACTOR static Future<Void> processWaitMetricsRequest(Reference<BlobMigrator> self, WaitMetricsRequest req) {
+		state WaitMetricsRequest waitMetricsRequest = req;
+		// FIXME get rid of this delay. It's a temporary solution to avoid starvation in the
+		// scheduling of DD processes.
+		wait(delay(1));
+		StorageMetrics metrics;
+		metrics.bytes = sizeInBytes(self, waitMetricsRequest.keys);
+		waitMetricsRequest.reply.send(metrics);
+		return Void();
+	}
+
+	ACTOR static Future<Void> processStorageQueuingMetricsRequest(StorageQueuingMetricsRequest req) {
+		dprint("Unsupported StorageQueuingMetricsRequest\n");
+		// FIXME get rid of this delay. It's a temporary solution to avoid starvation in the
+		// scheduling of DD processes.
+		wait(delay(1));
+		req.reply.sendError(unsupported_operation());
+		return Void();
+	}
+
+	// Return total storage size in bytes for migration
+	static int64_t sizeInBytes(Reference<BlobMigrator> self) { return sizeInBytes(self, normalKeys); }
+
+	// Return storage size in bytes for given key range
+	static int64_t sizeInBytes(Reference<BlobMigrator> self, KeyRangeRef range) {
+		int64_t bytes = 0;
+		for (auto granule : self->blobGranules_) {
+			if (range.intersects(granule.keyRange))
+				bytes += granule.sizeInBytes;
+		}
+		return bytes;
+	}
+
+	// Return max version for all blob granules
+	static Version maxVersion(Reference<BlobMigrator> self) {
+		Version max = 0;
+		for (auto granule : self->blobGranules_) {
+			max = std::max(granule.version, max);
+		}
+		return max;
+	}
+
 private:
-	Database db;
-	Reference<BlobConnectionProvider> blobConn;
-	BlobMigratorInterface blobMigratorInterf;
-	ActorCollection actors;
+	Database db_;
+	Reference<BlobConnectionProvider> blobConn_;
+	BlobGranuleRestoreVersionVector blobGranules_;
+	BlobMigratorInterface interf_;
+	ActorCollection actors_;
 };
 
 // Main entry point
-ACTOR Future<Void> blobMigrator(BlobMigratorInterface ssi, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
-	fmt::print("Start blob migrator {} \n", ssi.id().toString());
+ACTOR Future<Void> blobMigrator(BlobMigratorInterface interf, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
+	fmt::print("Start blob migrator {} \n", interf.id().toString());
 	try {
-		Reference<BlobMigrator> self = makeReference<BlobMigrator>(dbInfo, ssi);
+		Reference<BlobMigrator> self = makeReference<BlobMigrator>(dbInfo, interf);
 		wait(BlobMigrator::start(self));
 	} catch (Error& e) {
-		fmt::print("unexpected blob migrator error {}\n", e.what());
+		dprint("Unexpected blob migrator error {}\n", e.what());
+		TraceEvent("BlobMigratorError", interf.id()).error(e);
 	}
 	return Void();
 }

diff --git a/fdbserver/BlobWorker.actor.cpp b/fdbserver/BlobWorker.actor.cpp
index a1c76463ff..6d2c0038ae 100644
--- a/fdbserver/BlobWorker.actor.cpp
+++ b/fdbserver/BlobWorker.actor.cpp
@@ -3961,7 +3961,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
 		}
 	}
 
-	if (createChangeFeed) {
+	if (createChangeFeed && !isFullRestoreMode()) {
 		// create new change feed for new version of granule
 		wait(updateChangeFeed(
 		    &tr, granuleIDToCFKey(info.granuleID), ChangeFeedStatus::CHANGE_FEED_CREATE, req.keyRange));

diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp
index c962ca891d..e59eddd8cf 100644
--- a/fdbserver/ClusterController.actor.cpp
+++ b/fdbserver/ClusterController.actor.cpp
@@ -2615,8 +2615,9 @@ ACTOR Future<Void> monitorBlobMigrator(ClusterControllerData* self) {
 	}
 	loop {
 		if (self->db.serverInfo->get().blobMigrator.present() && !self->recruitBlobMigrator.get()) {
-			state Future<Void> wfClient = waitFailureClient(self->db.serverInfo->get().blobMigrator.get().waitFailure,
-			                                                SERVER_KNOBS->BLOB_MIGRATOR_FAILURE_TIME);
+			state Future<Void> wfClient =
+			    waitFailureClient(self->db.serverInfo->get().blobMigrator.get().ssi.waitFailure,
+			                      SERVER_KNOBS->BLOB_MIGRATOR_FAILURE_TIME);
 			loop {
 				choose {
 					when(wait(wfClient)) {

diff --git a/fdbserver/include/fdbserver/BlobGranuleServerCommon.actor.h b/fdbserver/include/fdbserver/BlobGranuleServerCommon.actor.h
index b7510bd0b1..c4a2bc2344 100644
--- a/fdbserver/include/fdbserver/BlobGranuleServerCommon.actor.h
+++ b/fdbserver/include/fdbserver/BlobGranuleServerCommon.actor.h
@@ -140,9 +140,27 @@ private:
 	Future<Void> collection;
 };
 
+// Defines granule info that interests full restore
+struct BlobGranuleRestoreVersion {
+	// Two constructors required by VectorRef
+	BlobGranuleRestoreVersion() {}
+	BlobGranuleRestoreVersion(Arena& a,
const BlobGranuleRestoreVersion& copyFrom) + : granuleID(copyFrom.granuleID), keyRange(a, copyFrom.keyRange), version(copyFrom.version), + sizeInBytes(copyFrom.sizeInBytes) {} + + UID granuleID; + KeyRangeRef keyRange; + Version version; + int64_t sizeInBytes; +}; + +// Defines a vector for BlobGranuleVersion +typedef Standalone> BlobGranuleRestoreVersionVector; + ACTOR Future dumpManifest(Database db, Reference blobConn, int64_t epoch, int64_t seqNo); ACTOR Future loadManifest(Database db, Reference blobConn); ACTOR Future printRestoreSummary(Database db, Reference blobConn); +ACTOR Future listBlobGranules(Database db, Reference blobConn); inline bool isFullRestoreMode() { return SERVER_KNOBS->BLOB_FULL_RESTORE_MODE; }; diff --git a/fdbserver/include/fdbserver/BlobMigratorInterface.h b/fdbserver/include/fdbserver/BlobMigratorInterface.h index 6e3cbe3c7c..5b9cb6b97a 100644 --- a/fdbserver/include/fdbserver/BlobMigratorInterface.h +++ b/fdbserver/include/fdbserver/BlobMigratorInterface.h @@ -30,23 +30,25 @@ struct BlobMigratorInterface { constexpr static FileIdentifier file_identifier = 869199; RequestStream haltBlobMigrator; - RequestStream> waitFailure; LocalityData locality; UID uniqueID; + StorageServerInterface ssi; BlobMigratorInterface() {} - BlobMigratorInterface(const struct LocalityData& l, UID id) : uniqueID(id), locality(l) {} + BlobMigratorInterface(const struct LocalityData& l, UID id) : uniqueID(id), locality(l) { + ssi.locality = l; + ssi.uniqueID = id; + } - void initEndpoints() {} + void initEndpoints() { ssi.initEndpoints(); } UID id() const { return uniqueID; } - NetworkAddress address() const { return waitFailure.getEndpoint().getPrimaryAddress(); } + NetworkAddress address() const { return haltBlobMigrator.getEndpoint().getPrimaryAddress(); } bool operator==(const BlobMigratorInterface& r) const { return id() == r.id(); } bool operator!=(const BlobMigratorInterface& r) const { return !(*this == r); } template void serialize(Archive& ar) { - // StorageServerInterface::serialize(ar); - serializer(ar, waitFailure, haltBlobMigrator, locality, uniqueID); + serializer(ar, locality, uniqueID, haltBlobMigrator); } }; diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index f942d8394f..8a44eaf169 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -86,6 +86,7 @@ #include "fdbserver/TransactionTagCounter.h" #include "fdbserver/WaitFailure.h" #include "fdbserver/WorkerInterface.actor.h" +#include "fdbserver/BlobGranuleServerCommon.actor.h" #include "flow/ActorCollection.h" #include "flow/Arena.h" #include "flow/Error.h" @@ -5976,27 +5977,26 @@ ACTOR Future tryGetRangeFromBlob(PromiseStream results, Reference blobConn) { ASSERT(blobConn.isValid()); try { - state Standalone> chunks = wait(tryReadBlobGranules(tr, keys, fetchVersion)); - if (chunks.size() == 0) { throw blob_granule_transaction_too_old(); // no data on blob } - if (!isRangeFullyCovered(keys, chunks)) { throw blob_granule_transaction_too_old(); } - - for (const BlobGranuleChunkRef& chunk : chunks) { - state KeyRangeRef chunkRange = chunk.keyRange; - state RangeResult rows = wait(readBlobGranule(chunk, keys, 0, fetchVersion, blobConn)); + state int i; + for (i = 0; i < chunks.size(); ++i) { + state KeyRangeRef chunkRange = chunks[i].keyRange; + state RangeResult rows = wait(readBlobGranule(chunks[i], keys, 0, fetchVersion, blobConn)); TraceEvent("ReadBlobData") .detail("Rows", rows.size()) .detail("ChunkRange", chunkRange.toString()) .detail("Keys", 
keys.toString()); - if (rows.size() == 0) { - rows.readThrough = KeyRef(rows.arena(), chunkRange.end); + rows.readThrough = KeyRef(rows.arena(), std::min(chunkRange.end, keys.end)); + } + if (i == chunks.size() - 1) { + rows.readThrough = KeyRef(rows.arena(), keys.end); } results.send(rows); } @@ -6010,7 +6010,7 @@ ACTOR Future tryGetRangeFromBlob(PromiseStream results, tr->reset(); tr->setVersion(fetchVersion); tr->trState->taskID = TaskPriority::FetchKeys; - wait(tryGetRange(results, tr, keys)); // fail back to storage server + throw; } return Void(); } @@ -6798,8 +6798,10 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { // We must also ensure we have fetched all change feed metadata BEFORE changing the phase to fetching to ensure // change feed mutations get applied correctly state std::vector changeFeedsToFetch; - std::vector _cfToFetch = wait(fetchCFMetadata); - changeFeedsToFetch = _cfToFetch; + if (!isFullRestoreMode()) { + std::vector _cfToFetch = wait(fetchCFMetadata); + changeFeedsToFetch = _cfToFetch; + } wait(data->durableVersionLock.take()); shard->phase = AddingShard::Fetching; diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index be77fd8eaf..a6019b6cec 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -2267,7 +2267,25 @@ ACTOR Future workerServer(Reference connRecord, CODE_PROBE(true, "Recruited while already a blob migrator."); } else { startRole(Role::BLOB_MIGRATOR, recruited.id(), interf.id()); - DUMPTOKEN(recruited.waitFailure); + DUMPTOKEN(recruited.haltBlobMigrator); + DUMPTOKEN(recruited.ssi.getValue); + DUMPTOKEN(recruited.ssi.getKey); + DUMPTOKEN(recruited.ssi.getKeyValues); + DUMPTOKEN(recruited.ssi.getMappedKeyValues); + DUMPTOKEN(recruited.ssi.getShardState); + DUMPTOKEN(recruited.ssi.waitMetrics); + DUMPTOKEN(recruited.ssi.splitMetrics); + DUMPTOKEN(recruited.ssi.getReadHotRanges); + DUMPTOKEN(recruited.ssi.getRangeSplitPoints); + DUMPTOKEN(recruited.ssi.getStorageMetrics); + DUMPTOKEN(recruited.ssi.waitFailure); + DUMPTOKEN(recruited.ssi.getQueuingMetrics); + DUMPTOKEN(recruited.ssi.getKeyValueStoreType); + DUMPTOKEN(recruited.ssi.watchValue); + DUMPTOKEN(recruited.ssi.getKeyValuesStream); + DUMPTOKEN(recruited.ssi.changeFeedStream); + DUMPTOKEN(recruited.ssi.changeFeedPop); + DUMPTOKEN(recruited.ssi.changeFeedVersionUpdate); Future blobMigratorProcess = blobMigrator(recruited, dbInfo); errorForwarders.add(forwardError(errors, From 22047385c4f69e3538a0d339a8f951607e86817b Mon Sep 17 00:00:00 2001 From: Zhe Wu Date: Sat, 22 Oct 2022 20:48:58 -0700 Subject: [PATCH 32/52] Count the detailed reason for new physical shard creation during data move --- fdbserver/DDRelocationQueue.actor.cpp | 63 ++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/fdbserver/DDRelocationQueue.actor.cpp b/fdbserver/DDRelocationQueue.actor.cpp index b460e53cf4..8243f973b6 100644 --- a/fdbserver/DDRelocationQueue.actor.cpp +++ b/fdbserver/DDRelocationQueue.actor.cpp @@ -689,6 +689,19 @@ struct DDQueue : public IDDRelocationQueue { int moveReusePhysicalShard; int moveCreateNewPhysicalShard; + enum NewPhysicalShardReason { + None = 0, + PrimaryBestTeamNotReady, + RemoteBestTeamNotReady, + PrimaryNoHealthyTeam, + RemoteNoHealthyTeam, + RemoteTeamIsFull, + RemoteTeamIsNotHealthy, + NoAvailablePhysicalShard, + NumberOfTypes, + }; + // std::unordered_map newPhysicalShardReasonCount; + std::vector newPhysicalShardReasonCount; void startRelocation(int priority, int healthPriority) { // Although 
PRIORITY_TEAM_REDUNDANT has lower priority than split and merge shard movement, @@ -754,7 +767,8 @@ struct DDQueue : public IDDRelocationQueue { suppressIntervals(0), rawProcessingUnhealthy(new AsyncVar(false)), rawProcessingWiggle(new AsyncVar(false)), unhealthyRelocations(0), movedKeyServersEventHolder(makeReference("MovedKeyServers")), moveReusePhysicalShard(0), - moveCreateNewPhysicalShard(0) {} + moveCreateNewPhysicalShard(0), + newPhysicalShardReasonCount(static_cast(NewPhysicalShardReason::NumberOfTypes), 0) {} DDQueue() = default; void validate() { @@ -1467,6 +1481,7 @@ ACTOR Future dataDistributionRelocator(DDQueue* self, loop { destOverloadedCount = 0; stuckCount = 0; + state DDQueue::NewPhysicalShardReason newPhysicalShardReason = DDQueue::NewPhysicalShardReason::None; // state int bestTeamStuckThreshold = 50; loop { state int tciIndex = 0; @@ -1493,10 +1508,16 @@ ACTOR Future dataDistributionRelocator(DDQueue* self, .detail("TeamCollectionIndex", tciIndex) .detail("RestoreDataMoveForDest", describe(tciIndex == 0 ? rd.dataMove->primaryDest : rd.dataMove->remoteDest)); + newPhysicalShardReason = tciIndex == 0 + ? DDQueue::NewPhysicalShardReason::PrimaryBestTeamNotReady + : DDQueue::NewPhysicalShardReason::RemoteBestTeamNotReady; foundTeams = false; break; } if (!bestTeam.first.present() || !bestTeam.first.get()->isHealthy()) { + newPhysicalShardReason = tciIndex == 0 + ? DDQueue::NewPhysicalShardReason::PrimaryNoHealthyTeam + : DDQueue::NewPhysicalShardReason::RemoteNoHealthyTeam; foundTeams = false; break; } @@ -1549,12 +1570,16 @@ ACTOR Future dataDistributionRelocator(DDQueue* self, // getting the destination team or we could miss failure notifications for the storage // servers in the destination team TraceEvent("BestTeamNotReady"); + newPhysicalShardReason = DDQueue::NewPhysicalShardReason::RemoteBestTeamNotReady; foundTeams = false; break; } // If a DC has no healthy team, we stop checking the other DCs until // the unhealthy DC is healthy again or is excluded. if (!bestTeam.first.present()) { + newPhysicalShardReason = tciIndex == 0 + ? DDQueue::NewPhysicalShardReason::PrimaryNoHealthyTeam + : DDQueue::NewPhysicalShardReason::RemoteNoHealthyTeam; foundTeams = false; break; } @@ -1578,6 +1603,7 @@ ACTOR Future dataDistributionRelocator(DDQueue* self, if (tciIndex == 1 && !forceToUseNewPhysicalShard) { bool minAvailableSpaceRatio = bestTeam.first.get()->getMinAvailableSpaceRatio(true); if (minAvailableSpaceRatio < SERVER_KNOBS->TARGET_AVAILABLE_SPACE_RATIO) { + newPhysicalShardReason = DDQueue::NewPhysicalShardReason::RemoteTeamIsFull; foundTeams = false; break; } @@ -1620,6 +1646,7 @@ ACTOR Future dataDistributionRelocator(DDQueue* self, if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD && bestTeams.size() > 1 && !forceToUseNewPhysicalShard) { if (!bestTeams[1].first->isHealthy()) { + newPhysicalShardReason = DDQueue::NewPhysicalShardReason::RemoteTeamIsNotHealthy; foundTeams = false; } } @@ -1684,6 +1711,15 @@ ACTOR Future dataDistributionRelocator(DDQueue* self, self->moveReusePhysicalShard++; } else { self->moveCreateNewPhysicalShard++; + if (newPhysicalShardReason == DDQueue::NewPhysicalShardReason::None) { + // When creating a new physical shard, but the reason is none, this can only happen when + // determinePhysicalShardIDGivenPrimaryTeam() finds that there is no available physical + // shard. 
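The bookkeeping above pairs an enum whose final member, NumberOfTypes, doubles as a count, with a vector sized to that sentinel, so every reason gets one counter slot that can be bumped by a single increment. A minimal standalone sketch of just that pattern (the names here are illustrative stand-ins, not the actual DDQueue members):

    #include <cstddef>
    #include <vector>

    enum RetryReason { None = 0, RemoteBestTeamNotReady, PrimaryNoHealthyTeam, NumberOfTypes };

    struct ReasonCounters {
        std::vector<int> counts;
        // Parentheses matter here: braces would build a two-element
        // initializer list rather than NumberOfTypes zeroed slots.
        ReasonCounters() : counts(static_cast<std::size_t>(NumberOfTypes), 0) {}
        void record(RetryReason r) { ++counts[static_cast<std::size_t>(r)]; }
    };

Because the vector is sized from the sentinel, any reason added before NumberOfTypes automatically gets its own slot.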
+ self->newPhysicalShardReasonCount + [DDQueue::NewPhysicalShardReason::NoAvailablePhysicalShard]++; + } else { + self->newPhysicalShardReasonCount[newPhysicalShardReason]++; + } } rd.dataMoveId = newShardId(physicalShardIDCandidate, AssignEmptyRange::False); auto inFlightRange = self->inFlight.rangeContaining(rd.keys.begin); @@ -2485,9 +2521,32 @@ ACTOR Future dataDistributionQueue(Reference db, if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) { TraceEvent("PhysicalShardMoveStats") .detail("MoveCreateNewPhysicalShard", self.moveCreateNewPhysicalShard) - .detail("MoveReusePhysicalShard", self.moveReusePhysicalShard); + .detail("MoveReusePhysicalShard", self.moveReusePhysicalShard) + .detail("PrimaryBestTeamNotReady", + self.newPhysicalShardReasonCount + [DDQueue::NewPhysicalShardReason::PrimaryBestTeamNotReady]) + .detail("RemoteBestTeamNotReady", + self.newPhysicalShardReasonCount + [DDQueue::NewPhysicalShardReason::RemoteBestTeamNotReady]) + .detail( + "PrimaryNoHealthyTeam", + self.newPhysicalShardReasonCount[DDQueue::NewPhysicalShardReason::PrimaryNoHealthyTeam]) + .detail( + "RemoteNoHealthyTeam", + self.newPhysicalShardReasonCount[DDQueue::NewPhysicalShardReason::RemoteNoHealthyTeam]) + .detail("RemoteTeamIsFull", + self.newPhysicalShardReasonCount[DDQueue::NewPhysicalShardReason::RemoteTeamIsFull]) + .detail("RemoteTeamIsNotHealthy", + self.newPhysicalShardReasonCount + [DDQueue::NewPhysicalShardReason::RemoteTeamIsNotHealthy]) + .detail("NoAvailablePhysicalShard", + self.newPhysicalShardReasonCount + [DDQueue::NewPhysicalShardReason::NoAvailablePhysicalShard]); self.moveCreateNewPhysicalShard = 0; self.moveReusePhysicalShard = 0; + for (int i = 0; i < self.newPhysicalShardReasonCount.size(); ++i) { + self.newPhysicalShardReasonCount[i] = 0; + } } } when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator From fc9295ab663684ef09c7151dadbb4b7ccbfa09c6 Mon Sep 17 00:00:00 2001 From: Zhe Wu Date: Sun, 23 Oct 2022 22:10:01 -0700 Subject: [PATCH 33/52] Address comments --- fdbserver/DDRelocationQueue.actor.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fdbserver/DDRelocationQueue.actor.cpp b/fdbserver/DDRelocationQueue.actor.cpp index 8243f973b6..719de8215e 100644 --- a/fdbserver/DDRelocationQueue.actor.cpp +++ b/fdbserver/DDRelocationQueue.actor.cpp @@ -691,7 +691,6 @@ struct DDQueue : public IDDRelocationQueue { int moveCreateNewPhysicalShard; enum NewPhysicalShardReason { None = 0, - PrimaryBestTeamNotReady, RemoteBestTeamNotReady, PrimaryNoHealthyTeam, RemoteNoHealthyTeam, @@ -700,7 +699,6 @@ struct DDQueue : public IDDRelocationQueue { NoAvailablePhysicalShard, NumberOfTypes, }; - // std::unordered_map newPhysicalShardReasonCount; std::vector newPhysicalShardReasonCount; void startRelocation(int priority, int healthPriority) { @@ -1508,9 +1506,7 @@ ACTOR Future dataDistributionRelocator(DDQueue* self, .detail("TeamCollectionIndex", tciIndex) .detail("RestoreDataMoveForDest", describe(tciIndex == 0 ? rd.dataMove->primaryDest : rd.dataMove->remoteDest)); - newPhysicalShardReason = tciIndex == 0 - ? 
DDQueue::NewPhysicalShardReason::PrimaryBestTeamNotReady - : DDQueue::NewPhysicalShardReason::RemoteBestTeamNotReady; + newPhysicalShardReason = DDQueue::NewPhysicalShardReason::RemoteBestTeamNotReady; foundTeams = false; break; } @@ -2522,9 +2518,6 @@ ACTOR Future dataDistributionQueue(Reference db, TraceEvent("PhysicalShardMoveStats") .detail("MoveCreateNewPhysicalShard", self.moveCreateNewPhysicalShard) .detail("MoveReusePhysicalShard", self.moveReusePhysicalShard) - .detail("PrimaryBestTeamNotReady", - self.newPhysicalShardReasonCount - [DDQueue::NewPhysicalShardReason::PrimaryBestTeamNotReady]) .detail("RemoteBestTeamNotReady", self.newPhysicalShardReasonCount [DDQueue::NewPhysicalShardReason::RemoteBestTeamNotReady]) From 0140991d15cbe6131f80424c936b5ef8a0f38ec6 Mon Sep 17 00:00:00 2001 From: Zhe Wu Date: Mon, 24 Oct 2022 10:39:32 -0700 Subject: [PATCH 34/52] Rename NewPhysicalShardReason to RetryFindDstReason --- fdbserver/DDRelocationQueue.actor.cpp | 63 ++++++++++++--------------- 1 file changed, 28 insertions(+), 35 deletions(-) diff --git a/fdbserver/DDRelocationQueue.actor.cpp b/fdbserver/DDRelocationQueue.actor.cpp index 719de8215e..98fa27689d 100644 --- a/fdbserver/DDRelocationQueue.actor.cpp +++ b/fdbserver/DDRelocationQueue.actor.cpp @@ -689,7 +689,7 @@ struct DDQueue : public IDDRelocationQueue { int moveReusePhysicalShard; int moveCreateNewPhysicalShard; - enum NewPhysicalShardReason { + enum RetryFindDstReason { None = 0, RemoteBestTeamNotReady, PrimaryNoHealthyTeam, @@ -699,7 +699,7 @@ struct DDQueue : public IDDRelocationQueue { NoAvailablePhysicalShard, NumberOfTypes, }; - std::vector newPhysicalShardReasonCount; + std::vector retryFindDstReasonCount; void startRelocation(int priority, int healthPriority) { // Although PRIORITY_TEAM_REDUNDANT has lower priority than split and merge shard movement, @@ -765,8 +765,8 @@ struct DDQueue : public IDDRelocationQueue { suppressIntervals(0), rawProcessingUnhealthy(new AsyncVar(false)), rawProcessingWiggle(new AsyncVar(false)), unhealthyRelocations(0), movedKeyServersEventHolder(makeReference("MovedKeyServers")), moveReusePhysicalShard(0), - moveCreateNewPhysicalShard(0), - newPhysicalShardReasonCount(static_cast(NewPhysicalShardReason::NumberOfTypes), 0) {} + moveCreateNewPhysicalShard(0), retryFindDstReasonCount(static_cast(RetryFindDstReason::NumberOfTypes), 0) { + } DDQueue() = default; void validate() { @@ -1479,7 +1479,7 @@ ACTOR Future dataDistributionRelocator(DDQueue* self, loop { destOverloadedCount = 0; stuckCount = 0; - state DDQueue::NewPhysicalShardReason newPhysicalShardReason = DDQueue::NewPhysicalShardReason::None; + state DDQueue::RetryFindDstReason retryFindDstReason = DDQueue::RetryFindDstReason::None; // state int bestTeamStuckThreshold = 50; loop { state int tciIndex = 0; @@ -1506,14 +1506,13 @@ ACTOR Future dataDistributionRelocator(DDQueue* self, .detail("TeamCollectionIndex", tciIndex) .detail("RestoreDataMoveForDest", describe(tciIndex == 0 ? rd.dataMove->primaryDest : rd.dataMove->remoteDest)); - newPhysicalShardReason = DDQueue::NewPhysicalShardReason::RemoteBestTeamNotReady; + retryFindDstReason = DDQueue::RetryFindDstReason::RemoteBestTeamNotReady; foundTeams = false; break; } if (!bestTeam.first.present() || !bestTeam.first.get()->isHealthy()) { - newPhysicalShardReason = tciIndex == 0 - ? DDQueue::NewPhysicalShardReason::PrimaryNoHealthyTeam - : DDQueue::NewPhysicalShardReason::RemoteNoHealthyTeam; + retryFindDstReason = tciIndex == 0 ? 
DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam + : DDQueue::RetryFindDstReason::RemoteNoHealthyTeam; foundTeams = false; break; } @@ -1566,16 +1565,15 @@ ACTOR Future dataDistributionRelocator(DDQueue* self, // getting the destination team or we could miss failure notifications for the storage // servers in the destination team TraceEvent("BestTeamNotReady"); - newPhysicalShardReason = DDQueue::NewPhysicalShardReason::RemoteBestTeamNotReady; + retryFindDstReason = DDQueue::RetryFindDstReason::RemoteBestTeamNotReady; foundTeams = false; break; } // If a DC has no healthy team, we stop checking the other DCs until // the unhealthy DC is healthy again or is excluded. if (!bestTeam.first.present()) { - newPhysicalShardReason = tciIndex == 0 - ? DDQueue::NewPhysicalShardReason::PrimaryNoHealthyTeam - : DDQueue::NewPhysicalShardReason::RemoteNoHealthyTeam; + retryFindDstReason = tciIndex == 0 ? DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam + : DDQueue::RetryFindDstReason::RemoteNoHealthyTeam; foundTeams = false; break; } @@ -1599,7 +1597,7 @@ ACTOR Future dataDistributionRelocator(DDQueue* self, if (tciIndex == 1 && !forceToUseNewPhysicalShard) { bool minAvailableSpaceRatio = bestTeam.first.get()->getMinAvailableSpaceRatio(true); if (minAvailableSpaceRatio < SERVER_KNOBS->TARGET_AVAILABLE_SPACE_RATIO) { - newPhysicalShardReason = DDQueue::NewPhysicalShardReason::RemoteTeamIsFull; + retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsFull; foundTeams = false; break; } @@ -1642,7 +1640,7 @@ ACTOR Future dataDistributionRelocator(DDQueue* self, if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD && bestTeams.size() > 1 && !forceToUseNewPhysicalShard) { if (!bestTeams[1].first->isHealthy()) { - newPhysicalShardReason = DDQueue::NewPhysicalShardReason::RemoteTeamIsNotHealthy; + retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy; foundTeams = false; } } @@ -1707,14 +1705,13 @@ ACTOR Future dataDistributionRelocator(DDQueue* self, self->moveReusePhysicalShard++; } else { self->moveCreateNewPhysicalShard++; - if (newPhysicalShardReason == DDQueue::NewPhysicalShardReason::None) { + if (retryFindDstReason == DDQueue::RetryFindDstReason::None) { // When creating a new physical shard, but the reason is none, this can only happen when // determinePhysicalShardIDGivenPrimaryTeam() finds that there is no available physical // shard. 
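After the rename, the reporting side keeps the same shape: each slot is dumped into the periodic stats trace event and then zeroed, since the counts are per-interval. A condensed sketch of that report-and-reset cycle, reusing the illustrative ReasonCounters type sketched earlier (the real loop lives in dataDistributionQueue and logs through FDB's TraceEvent):

    void reportAndReset(ReasonCounters& rc) {
        TraceEvent te("PhysicalShardMoveStats");
        te.detail("RemoteBestTeamNotReady", rc.counts[RemoteBestTeamNotReady]);
        te.detail("PrimaryNoHealthyTeam", rc.counts[PrimaryNoHealthyTeam]);
        // The counters are per-interval; zero them once they have been logged.
        for (std::size_t i = 0; i < rc.counts.size(); ++i) {
            rc.counts[i] = 0;
        }
    }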
- self->newPhysicalShardReasonCount - [DDQueue::NewPhysicalShardReason::NoAvailablePhysicalShard]++; + self->retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]++; } else { - self->newPhysicalShardReasonCount[newPhysicalShardReason]++; + self->retryFindDstReasonCount[retryFindDstReason]++; } } rd.dataMoveId = newShardId(physicalShardIDCandidate, AssignEmptyRange::False); @@ -2519,26 +2516,22 @@ ACTOR Future dataDistributionQueue(Reference db, .detail("MoveCreateNewPhysicalShard", self.moveCreateNewPhysicalShard) .detail("MoveReusePhysicalShard", self.moveReusePhysicalShard) .detail("RemoteBestTeamNotReady", - self.newPhysicalShardReasonCount - [DDQueue::NewPhysicalShardReason::RemoteBestTeamNotReady]) - .detail( - "PrimaryNoHealthyTeam", - self.newPhysicalShardReasonCount[DDQueue::NewPhysicalShardReason::PrimaryNoHealthyTeam]) - .detail( - "RemoteNoHealthyTeam", - self.newPhysicalShardReasonCount[DDQueue::NewPhysicalShardReason::RemoteNoHealthyTeam]) + self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteBestTeamNotReady]) + .detail("PrimaryNoHealthyTeam", + self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::PrimaryNoHealthyTeam]) + .detail("RemoteNoHealthyTeam", + self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteNoHealthyTeam]) .detail("RemoteTeamIsFull", - self.newPhysicalShardReasonCount[DDQueue::NewPhysicalShardReason::RemoteTeamIsFull]) + self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteTeamIsFull]) .detail("RemoteTeamIsNotHealthy", - self.newPhysicalShardReasonCount - [DDQueue::NewPhysicalShardReason::RemoteTeamIsNotHealthy]) - .detail("NoAvailablePhysicalShard", - self.newPhysicalShardReasonCount - [DDQueue::NewPhysicalShardReason::NoAvailablePhysicalShard]); + self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy]) + .detail( + "NoAvailablePhysicalShard", + self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]); self.moveCreateNewPhysicalShard = 0; self.moveReusePhysicalShard = 0; - for (int i = 0; i < self.newPhysicalShardReasonCount.size(); ++i) { - self.newPhysicalShardReasonCount[i] = 0; + for (int i = 0; i < self.retryFindDstReasonCount.size(); ++i) { + self.retryFindDstReasonCount[i] = 0; } } } From e32affc9f09a0308f5c2dc71be3b56b1fe163d38 Mon Sep 17 00:00:00 2001 From: Junhyun Shim Date: Tue, 25 Oct 2022 15:53:56 +0200 Subject: [PATCH 35/52] Add TLS and token file option to mako --- bindings/c/test/mako/mako.cpp | 245 ++++++++++++++++++++++------------ bindings/c/test/mako/mako.hpp | 14 +- 2 files changed, 170 insertions(+), 89 deletions(-) diff --git a/bindings/c/test/mako/mako.cpp b/bindings/c/test/mako/mako.cpp index f9f3827a70..4303e5343a 100644 --- a/bindings/c/test/mako/mako.cpp +++ b/bindings/c/test/mako/mako.cpp @@ -59,6 +59,8 @@ #include "shm.hpp" #include "stats.hpp" #include "time.hpp" +#include "rapidjson/document.h" +#include "rapidjson/error/en.h" namespace mako { @@ -88,14 +90,29 @@ Transaction createNewTransaction(Database db, Arguments const& args, int id = -1 } // Create Tenant Transaction int tenant_id = (id == -1) ? 
urand(0, args.active_tenants - 1) : id; + Transaction tr; + std::string tenantStr; // If provided tenants array, use it if (tenants) { - return tenants[tenant_id].createTransaction(); + tr = tenants[tenant_id].createTransaction(); + } else { + tenantStr = "tenant" + std::to_string(tenant_id); + BytesRef tenant_name = toBytesRef(tenantStr); + Tenant t = db.openTenant(tenant_name); + tr = t.createTransaction(); } - std::string tenantStr = "tenant" + std::to_string(tenant_id); - BytesRef tenant_name = toBytesRef(tenantStr); - Tenant t = db.openTenant(tenant_name); - return t.createTransaction(); + if (!args.authorization_tokens.empty()) { + // lookup token based on tenant name and, if found, set authz token to transaction + if (tenantStr.empty()) + tenantStr = "tenant" + std::to_string(tenant_id); + auto tokenMapItr = args.authorization_tokens.find(tenantStr); + if (tokenMapItr != args.authorization_tokens.end()) { + tr.setOption(FDB_TR_OPTION_AUTHORIZATION_TOKEN, tokenMapItr->second); + } else { + logr.warn("Authorization token map is not empty, but could not find token for tenant '{}'", tenantStr); + } + } + return tr; } uint64_t byteswapHelper(uint64_t input) { @@ -815,6 +832,18 @@ int workerProcessMain(Arguments const& args, int worker_id, shared_memory::Acces logr.error("network::setOption(FDB_NET_OPTION_DISTRIBUTED_CLIENT_TRACER): {}", err.what()); } + if (args.tls_certificate_file.has_value()) { + network::setOption(FDB_NET_OPTION_TLS_CERT_PATH, args.tls_certificate_file.value()); + } + + if (args.tls_key_file.has_value()) { + network::setOption(FDB_NET_OPTION_TLS_KEY_PATH, args.tls_key_file.value()); + } + + if (args.tls_ca_file.has_value()) { + network::setOption(FDB_NET_OPTION_TLS_CA_PATH, args.tls_ca_file.value()); + } + /* enable flatbuffers if specified */ if (args.flatbuffers) { #ifdef FDB_NET_OPTION_USE_FLATBUFFERS @@ -982,57 +1011,55 @@ int workerProcessMain(Arguments const& args, int worker_id, shared_memory::Acces } /* initialize the parameters with default values */ -int initArguments(Arguments& args) { - memset(&args, 0, sizeof(Arguments)); /* zero-out everything */ - args.num_fdb_clusters = 0; - args.num_databases = 1; - args.api_version = maxApiVersion(); - args.json = 0; - args.num_processes = 1; - args.num_threads = 1; - args.async_xacts = 0; - args.mode = MODE_INVALID; - args.rows = 100000; - args.load_factor = 1.0; - args.row_digits = digits(args.rows); - args.seconds = 30; - args.iteration = 0; - args.tpsmax = 0; - args.tpsmin = -1; - args.tpsinterval = 10; - args.tpschange = TPS_SIN; - args.sampling = 1000; - args.key_length = 32; - args.value_length = 16; - args.active_tenants = 0; - args.total_tenants = 0; - args.tenant_batch_size = 10000; - args.zipf = 0; - args.commit_get = 0; - args.verbose = 1; - args.flatbuffers = 0; /* internal */ - args.knobs[0] = '\0'; - args.log_group[0] = '\0'; - args.prefixpadding = 0; - args.trace = 0; - args.tracepath[0] = '\0'; - args.traceformat = 0; /* default to client's default (XML) */ - args.streaming_mode = FDB_STREAMING_MODE_WANT_ALL; - args.txntrace = 0; - args.txntagging = 0; - memset(args.txntagging_prefix, 0, TAGPREFIXLENGTH_MAX); +Arguments::Arguments() { + num_fdb_clusters = 0; + num_databases = 1; + api_version = maxApiVersion(); + json = 0; + num_processes = 1; + num_threads = 1; + async_xacts = 0; + mode = MODE_INVALID; + rows = 100000; + load_factor = 1.0; + row_digits = digits(rows); + seconds = 30; + iteration = 0; + tpsmax = 0; + tpsmin = -1; + tpsinterval = 10; + tpschange = TPS_SIN; + sampling = 1000; + 
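The token plumbing added above is a plain map from tenant name to token string, keyed by the same "tenant" + N naming that mako uses when it creates tenants. A self-contained sketch of the flow (the Transaction stub and the constant's numeric value are illustrative; the real option is FDB_TR_OPTION_AUTHORIZATION_TOKEN from the C bindings):

    #include <iostream>
    #include <map>
    #include <string>

    // Minimal stand-in for mako's Transaction wrapper; only setOption matters here.
    struct Transaction {
        void setOption(int option, std::string const& value) { /* would forward to the C API */ }
    };

    constexpr int FDB_TR_OPTION_AUTHORIZATION_TOKEN = 2000; // illustrative value

    void attachToken(Transaction& tr, std::map<std::string, std::string> const& tokens, int tenantId) {
        std::string tenantName = "tenant" + std::to_string(tenantId);
        auto it = tokens.find(tenantName);
        if (it != tokens.end()) {
            tr.setOption(FDB_TR_OPTION_AUTHORIZATION_TOKEN, it->second);
        } else {
            // mako logs a warning in this case rather than failing the transaction
            std::cerr << "no token for tenant " << tenantName << "\n";
        }
    }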
key_length = 32; + value_length = 16; + active_tenants = 0; + total_tenants = 0; + tenant_batch_size = 10000; + zipf = 0; + commit_get = 0; + verbose = 1; + flatbuffers = 0; /* internal */ + knobs[0] = '\0'; + log_group[0] = '\0'; + prefixpadding = 0; + trace = 0; + tracepath[0] = '\0'; + traceformat = 0; /* default to client's default (XML) */ + streaming_mode = FDB_STREAMING_MODE_WANT_ALL; + txntrace = 0; + txntagging = 0; + memset(txntagging_prefix, 0, TAGPREFIXLENGTH_MAX); for (auto i = 0; i < MAX_OP; i++) { - args.txnspec.ops[i][OP_COUNT] = 0; + txnspec.ops[i][OP_COUNT] = 0; } - args.client_threads_per_version = 0; - args.disable_client_bypass = false; - args.disable_ryw = 0; - args.json_output_path[0] = '\0'; - args.stats_export_path[0] = '\0'; - args.bg_materialize_files = false; - args.bg_file_path[0] = '\0'; - args.distributed_tracer_client = 0; - return 0; + client_threads_per_version = 0; + disable_client_bypass = false; + disable_ryw = 0; + json_output_path[0] = '\0'; + stats_export_path[0] = '\0'; + bg_materialize_files = false; + bg_file_path[0] = '\0'; + distributed_tracer_client = 0; } /* parse transaction specification */ @@ -1279,6 +1306,10 @@ int parseArguments(int argc, char* argv[], Arguments& args) { { "bg_file_path", required_argument, NULL, ARG_BG_FILE_PATH }, { "stats_export_path", optional_argument, NULL, ARG_EXPORT_PATH }, { "distributed_tracer_client", required_argument, NULL, ARG_DISTRIBUTED_TRACER_CLIENT }, + { "tls_certificate_file", required_argument, NULL, ARG_TLS_CERTIFICATE_FILE }, + { "tls_key_file", required_argument, NULL, ARG_TLS_KEY_FILE }, + { "tls_ca_file", required_argument, NULL, ARG_TLS_CA_FILE }, + { "authorization_token_file", required_argument, NULL, ARG_AUTHORIZATION_TOKEN_FILE }, { NULL, 0, NULL, 0 } }; idx = 0; @@ -1515,6 +1546,45 @@ int parseArguments(int argc, char* argv[], Arguments& args) { args.distributed_tracer_client = -1; } break; + case ARG_TLS_CERTIFICATE_FILE: + args.tls_certificate_file = std::string(optarg); + break; + case ARG_TLS_KEY_FILE: + args.tls_key_file = std::string(optarg); + break; + case ARG_TLS_CA_FILE: + args.tls_ca_file = std::string(optarg); + break; + case ARG_AUTHORIZATION_TOKEN_FILE: { + std::string tokenFilename(optarg); + std::ifstream ifs(tokenFilename); + std::ostringstream oss; + oss << ifs.rdbuf(); + rapidjson::Document d; + d.Parse(oss.str().c_str()); + if (d.HasParseError()) { + logr.error("Failed to parse authorization token JSON file '{}': {} at offset {}", + tokenFilename, + GetParseError_En(d.GetParseError()), + d.GetErrorOffset()); + return -1; + } else if (!d.IsObject()) { + logr.error("Authorization token JSON file '{}' must contain a JSON object", tokenFilename); + return -1; + } + for (auto itr = d.MemberBegin(); itr != d.MemberEnd(); ++itr) { + if (!itr->value.IsString()) { + logr.error("Token '{}' is not a string", itr->name.GetString()); + return -1; + } + args.authorization_tokens.insert_or_assign( + std::string(itr->name.GetString(), itr->name.GetStringLength()), + std::string(itr->value.GetString(), itr->value.GetStringLength())); + } + logr.info("Added {} tenant authorization tokens to map from file '{}'", + args.authorization_tokens.size(), + tokenFilename); + } break; } } @@ -1525,93 +1595,97 @@ int parseArguments(int argc, char* argv[], Arguments& args) { return 0; } -int validateArguments(Arguments const& args) { - if (args.mode == MODE_INVALID) { +int Arguments::validate() { + if (mode == MODE_INVALID) { logr.error("--mode has to be set"); return -1; } - if (args.verbose < 
VERBOSE_NONE || args.verbose > VERBOSE_DEBUG) { + if (verbose < VERBOSE_NONE || verbose > VERBOSE_DEBUG) { logr.error("--verbose must be between 0 and 3"); return -1; } - if (args.rows <= 0) { + if (rows <= 0) { logr.error("--rows must be a positive integer"); return -1; } - if (args.load_factor <= 0 || args.load_factor > 1) { + if (load_factor <= 0 || load_factor > 1) { logr.error("--load_factor must be in range (0, 1]"); return -1; } - if (args.key_length < 0) { + if (key_length < 0) { logr.error("--keylen must be a positive integer"); return -1; } - if (args.value_length < 0) { + if (value_length < 0) { logr.error("--vallen must be a positive integer"); return -1; } - if (args.num_fdb_clusters > NUM_CLUSTERS_MAX) { + if (num_fdb_clusters > NUM_CLUSTERS_MAX) { logr.error("Mako is not supported to do work to more than {} clusters", NUM_CLUSTERS_MAX); return -1; } - if (args.num_databases > NUM_DATABASES_MAX) { + if (num_databases > NUM_DATABASES_MAX) { logr.error("Mako is not supported to do work to more than {} databases", NUM_DATABASES_MAX); return -1; } - if (args.num_databases < args.num_fdb_clusters) { - logr.error("--num_databases ({}) must be >= number of clusters({})", args.num_databases, args.num_fdb_clusters); + if (num_databases < num_fdb_clusters) { + logr.error("--num_databases ({}) must be >= number of clusters({})", num_databases, num_fdb_clusters); return -1; } - if (args.num_threads < args.num_databases) { - logr.error("--threads ({}) must be >= number of databases ({})", args.num_threads, args.num_databases); + if (num_threads < num_databases) { + logr.error("--threads ({}) must be >= number of databases ({})", num_threads, num_databases); return -1; } - if (args.key_length < 4 /* "mako" */ + args.row_digits) { + if (key_length < 4 /* "mako" */ + row_digits) { logr.error("--keylen must be larger than {} to store \"mako\" prefix " "and maximum row number", - 4 + args.row_digits); + 4 + row_digits); return -1; } - if (args.active_tenants > args.total_tenants) { + if (active_tenants > total_tenants) { logr.error("--active_tenants must be less than or equal to --total_tenants"); return -1; } - if (args.tenant_batch_size < 1) { + if (tenant_batch_size < 1) { logr.error("--tenant_batch_size must be at least 1"); return -1; } - if (args.mode == MODE_RUN) { - if ((args.seconds > 0) && (args.iteration > 0)) { + if (mode == MODE_RUN) { + if ((seconds > 0) && (iteration > 0)) { logr.error("Cannot specify seconds and iteration together"); return -1; } - if ((args.seconds == 0) && (args.iteration == 0)) { + if ((seconds == 0) && (iteration == 0)) { logr.error("Must specify either seconds or iteration"); return -1; } - if (args.txntagging < 0) { + if (txntagging < 0) { logr.error("--txntagging must be a non-negative integer"); return -1; } } // ensure that all of the files provided to mako are valid and exist - if (args.mode == MODE_REPORT) { - if (!args.num_report_files) { + if (mode == MODE_REPORT) { + if (!num_report_files) { logr.error("No files to merge"); } - for (int i = 0; i < args.num_report_files; i++) { + for (int i = 0; i < num_report_files; i++) { struct stat buffer; - if (stat(args.report_files[i], &buffer) != 0) { - logr.error("Couldn't open file {}", args.report_files[i]); + if (stat(report_files[i], &buffer) != 0) { + logr.error("Couldn't open file {}", report_files[i]); return -1; } } } - if (args.distributed_tracer_client < 0) { - logr.error("--disibuted_tracer_client must specify either (disabled, network_lossy, log_file)"); + if (distributed_tracer_client < 0) { + 
logr.error("--distributed_tracer_client must specify either (disabled, network_lossy, log_file)"); return -1; } + + if (!authorization_tokens.empty() && !tls_ca_file.has_value()) { + logr.warn("Authorization tokens are being used without explicit TLS CA file configured"); + } return 0; } @@ -2262,11 +2336,6 @@ int main(int argc, char* argv[]) { auto rc = int{}; auto args = Arguments{}; - rc = initArguments(args); - if (rc < 0) { - logr.error("initArguments failed"); - return -1; - } rc = parseArguments(argc, argv, args); if (rc < 0) { /* usage printed */ @@ -2282,7 +2351,7 @@ int main(int argc, char* argv[]) { args.total_tenants = args.active_tenants; } - rc = validateArguments(args); + rc = args.validate(); if (rc < 0) return -1; logr.setVerbosity(args.verbose); diff --git a/bindings/c/test/mako/mako.hpp b/bindings/c/test/mako/mako.hpp index 952cffc7fa..dee75bad82 100644 --- a/bindings/c/test/mako/mako.hpp +++ b/bindings/c/test/mako/mako.hpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -79,7 +80,11 @@ enum ArgKind { ARG_JSON_REPORT, ARG_BG_FILE_PATH, // if blob granule files are stored locally, mako will read and materialize them if this is set ARG_EXPORT_PATH, - ARG_DISTRIBUTED_TRACER_CLIENT + ARG_DISTRIBUTED_TRACER_CLIENT, + ARG_TLS_CERTIFICATE_FILE, + ARG_TLS_KEY_FILE, + ARG_TLS_CA_FILE, + ARG_AUTHORIZATION_TOKEN_FILE, }; constexpr const int OP_COUNT = 0; @@ -131,6 +136,9 @@ constexpr const int MAX_REPORT_FILES = 200; /* benchmark parameters */ struct Arguments { + Arguments(); + int validate(); + int api_version; int json; int num_processes; @@ -180,6 +188,10 @@ struct Arguments { char report_files[MAX_REPORT_FILES][PATH_MAX]; int num_report_files; int distributed_tracer_client; + std::optional tls_certificate_file; + std::optional tls_key_file; + std::optional tls_ca_file; + std::map authorization_tokens; // maps tenant name to token string }; } // namespace mako From 33c8a8061482518149824b4f0a7cc6d48c7c2ea6 Mon Sep 17 00:00:00 2001 From: Junhyun Shim Date: Tue, 25 Oct 2022 16:13:01 +0200 Subject: [PATCH 36/52] Update Mako documentation for authz/TLS enablement --- bindings/c/test/mako/mako.rst | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/bindings/c/test/mako/mako.rst b/bindings/c/test/mako/mako.rst index 4b3c3d7b26..0e307b3d8a 100644 --- a/bindings/c/test/mako/mako.rst +++ b/bindings/c/test/mako/mako.rst @@ -38,7 +38,7 @@ Arguments | - ``build``: Populate data | - ``run``: Run the benchmark -- | ``-c | --cluster `` +- | ``-c | --cluster `` | FDB cluster files (Required, comma-separated) - | ``-d | --num_databases `` @@ -125,9 +125,21 @@ Arguments | Disable snapshot read-your-writes - | ``--json_report`` defaults to ``mako.json`` - | ``--json_report=PATH`` + | ``--json_report `` | Output stats to the specified json file +- | ``--tls_certificate_file `` + | Use TLS certificate located in ```` + +- | ``--tls_key_file `` + | Use TLS key file located in ```` + +- | ``--tls_ca_file `` + | Use TLS CA file located in ```` + +- | ``--authorization_token_file `` + | Use authorization token JSON file located in ```` + | Expected content is a JSON object where each key is a tenant name and the mapped value is a token string Transaction Specification ========================= From f88b8e2351c1bc5978b1a729d8c45a37f9189ed5 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Tue, 25 Oct 2022 11:11:42 -0600 Subject: [PATCH 37/52] fix summarize bug --- contrib/TestHarness2/test_harness/summarize.py | 18 +++++++++++++++--- 1 
file changed, 15 insertions(+), 3 deletions(-) diff --git a/contrib/TestHarness2/test_harness/summarize.py b/contrib/TestHarness2/test_harness/summarize.py index 39eae8803e..3629fa7d43 100644 --- a/contrib/TestHarness2/test_harness/summarize.py +++ b/contrib/TestHarness2/test_harness/summarize.py @@ -159,13 +159,20 @@ class Parser: pass -class XmlParser(Parser, xml.sax.handler.ContentHandler): +class XmlParser(Parser, xml.sax.handler.ContentHandler, xml.sax.handler.ErrorHandler): def __init__(self): super().__init__() self.handler: ParseHandler | None = None def parse(self, file: TextIO, handler: ParseHandler) -> None: - xml.sax.parse(file, self) + self.handler = handler + xml.sax.parse(file, self, errorHandler=self) + + def error(self, exception): + pass + + def fatalError(self, exception): + pass def startElement(self, name, attrs) -> None: attributes: Dict[str, str] = {} @@ -276,6 +283,7 @@ class TraceFiles: raise StopIteration self.current += 1 return self.trace_files[self.current - 1] + return TraceFilesIterator(self) @@ -426,7 +434,8 @@ class Summary: lines = self.error_out.splitlines() stderr_bytes = 0 for line in lines: - if line.endswith("WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!"): + if line.endswith( + "WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases!"): # When running ASAN we expect to see this message. Boost coroutine should be using the correct asan annotations so that it shouldn't produce any false positives. continue if line.endswith("Warning: unimplemented fcntl command: 1036"): @@ -606,6 +615,7 @@ class Summary: child.attributes['File'] = attrs['File'] child.attributes['Line'] = attrs['Line'] self.out.append(child) + self.handler.add_handler(('Type', 'BuggifySection'), buggify_section) self.handler.add_handler(('Type', 'FaultInjected'), buggify_section) @@ -614,9 +624,11 @@ class Summary: child.attributes['Name'] = attrs['Name'] child.attributes['File'] = attrs['File'] child.attributes['Line'] = attrs['Line'] + self.handler.add_handler(('Type', 'RunningUnitTest'), running_unit_test) def stderr_severity(attrs: Dict[str, str]): if 'NewSeverity' in attrs: self.stderr_severity = attrs['NewSeverity'] + self.handler.add_handler(('Type', 'StderrSeverity'), stderr_severity) From 74212eeacf2d5306dc19ee0e52a42a85cb26e0d4 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Tue, 25 Oct 2022 10:17:15 -0700 Subject: [PATCH 38/52] Encapsulate CounterCollection --- fdbclient/BlobCipher.cpp | 2 +- fdbclient/TaskBucket.actor.cpp | 4 +- .../include/fdbclient/BlobWorkerCommon.h | 4 +- fdbrpc/Stats.actor.cpp | 68 +++++++++++-------- fdbrpc/include/fdbrpc/Stats.h | 50 +++++++++----- fdbserver/BackupWorker.actor.cpp | 4 +- fdbserver/BlobManager.actor.cpp | 2 +- fdbserver/ClusterController.actor.cpp | 9 ++- fdbserver/ConfigBroadcaster.actor.cpp | 4 +- fdbserver/ConfigNode.actor.cpp | 2 +- fdbserver/GrvProxyServer.actor.cpp | 2 +- fdbserver/LocalConfiguration.actor.cpp | 4 +- fdbserver/LogRouter.actor.cpp | 17 +++-- fdbserver/OldTLogServer_4_6.actor.cpp | 46 ++++++------- fdbserver/OldTLogServer_6_0.actor.cpp | 46 ++++++------- fdbserver/OldTLogServer_6_2.actor.cpp | 46 ++++++------- fdbserver/Resolver.actor.cpp | 2 +- fdbserver/SimpleConfigConsumer.actor.cpp | 4 +- fdbserver/StorageCache.actor.cpp | 15 ++-- fdbserver/TLogServer.actor.cpp | 46 ++++++------- .../include/fdbserver/ClusterRecovery.actor.h | 9 ++- 
.../include/fdbserver/ProxyCommitData.actor.h | 2 +- .../include/fdbserver/RestoreApplier.actor.h | 10 +-- .../include/fdbserver/RestoreLoader.actor.h | 10 +-- fdbserver/masterserver.actor.cpp | 2 +- fdbserver/storageserver.actor.cpp | 60 ++++++++-------- 26 files changed, 246 insertions(+), 224 deletions(-) diff --git a/fdbclient/BlobCipher.cpp b/fdbclient/BlobCipher.cpp index e2b4890b24..9dc2c19798 100644 --- a/fdbclient/BlobCipher.cpp +++ b/fdbclient/BlobCipher.cpp @@ -83,7 +83,7 @@ BlobCipherMetrics::BlobCipherMetrics() CounterSet(cc, "Backup"), CounterSet(cc, "Test") }) { specialCounter(cc, "CacheSize", []() { return BlobCipherKeyCache::getInstance()->getSize(); }); - traceFuture = traceCounters("BlobCipherMetrics", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, &cc); + traceFuture = cc.traceCounters("BlobCipherMetrics", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL); } std::string toString(BlobCipherMetrics::UsageType type) { diff --git a/fdbclient/TaskBucket.actor.cpp b/fdbclient/TaskBucket.actor.cpp index 2e72b301c0..347395892b 100644 --- a/fdbclient/TaskBucket.actor.cpp +++ b/fdbclient/TaskBucket.actor.cpp @@ -579,8 +579,8 @@ public: int maxConcurrentTasks) { state Reference> paused = makeReference>(true); state Future watchPausedFuture = watchPaused(cx, taskBucket, paused); - taskBucket->metricLogger = traceCounters( - "TaskBucketMetrics", taskBucket->dbgid, CLIENT_KNOBS->TASKBUCKET_LOGGING_DELAY, &taskBucket->cc); + taskBucket->metricLogger = taskBucket->cc.traceCounters( + "TaskBucketMetrics", taskBucket->dbgid, CLIENT_KNOBS->TASKBUCKET_LOGGING_DELAY); loop { while (paused->get()) { wait(paused->onChange() || watchPausedFuture); diff --git a/fdbclient/include/fdbclient/BlobWorkerCommon.h b/fdbclient/include/fdbclient/BlobWorkerCommon.h index 9539db459b..b4cbbac2a7 100644 --- a/fdbclient/include/fdbclient/BlobWorkerCommon.h +++ b/fdbclient/include/fdbclient/BlobWorkerCommon.h @@ -103,8 +103,8 @@ struct BlobWorkerStats { specialCounter(cc, "DeltaFileWritesActive", [this]() { return this->deltaWritesLock->activePermits(); }); specialCounter(cc, "DeltaFileWritesWaiting", [this]() { return this->deltaWritesLock->waiters(); }); - logger = traceCounters("BlobWorkerMetrics", id, interval, &cc, "BlobWorkerMetrics"); + logger = cc.traceCounters("BlobWorkerMetrics", id, interval, "BlobWorkerMetrics"); } }; -#endif \ No newline at end of file +#endif diff --git a/fdbrpc/Stats.actor.cpp b/fdbrpc/Stats.actor.cpp index 274a4ec92e..065e7622e3 100644 --- a/fdbrpc/Stats.actor.cpp +++ b/fdbrpc/Stats.actor.cpp @@ -24,8 +24,8 @@ Counter::Counter(std::string const& name, CounterCollection& collection) : name(name), interval_start(0), last_event(0), interval_sq_time(0), roughness_interval_start(0), interval_delta(0), interval_start_value(0) { - metric.init(collection.name + "." + (char)toupper(name.at(0)) + name.substr(1), collection.id); - collection.counters.push_back(this); + metric.init(collection.getName() + "." 
+ (char)toupper(name.at(0)) + name.substr(1), collection.getId()); + collection.addCounter(this); } void Counter::operator+=(Value delta) { @@ -88,36 +88,48 @@ void CounterCollection::logToTraceEvent(TraceEvent& te) const { } } -ACTOR Future traceCounters(std::string traceEventName, - UID traceEventID, - double interval, - CounterCollection* counters, - std::string trackLatestName, - std::function decorator) { - wait(delay(0)); // Give an opportunity for all members used in special counters to be initialized +class CounterCollectionImpl { +public: + ACTOR static Future traceCounters(CounterCollection* counters, + std::string traceEventName, + UID traceEventID, + double interval, + std::string trackLatestName, + std::function decorator) { + wait(delay(0)); // Give an opportunity for all members used in special counters to be initialized - for (ICounter* c : counters->counters) - c->resetInterval(); - - state Reference traceEventHolder; - if (!trackLatestName.empty()) { - traceEventHolder = makeReference(trackLatestName); - } - - state double last_interval = now(); - - loop { - TraceEvent te(traceEventName.c_str(), traceEventID); - te.detail("Elapsed", now() - last_interval); - - counters->logToTraceEvent(te); - decorator(te); + for (ICounter* c : counters->counters) + c->resetInterval(); + state Reference traceEventHolder; if (!trackLatestName.empty()) { - te.trackLatest(traceEventHolder->trackingKey); + traceEventHolder = makeReference(trackLatestName); } - last_interval = now(); - wait(delay(interval, TaskPriority::FlushTrace)); + state double last_interval = now(); + + loop { + TraceEvent te(traceEventName.c_str(), traceEventID); + te.detail("Elapsed", now() - last_interval); + + counters->logToTraceEvent(te); + decorator(te); + + if (!trackLatestName.empty()) { + te.trackLatest(traceEventHolder->trackingKey); + } + + last_interval = now(); + wait(delay(interval, TaskPriority::FlushTrace)); + } } +}; + +Future CounterCollection::traceCounters(std::string const& traceEventName, + UID traceEventID, + double interval, + std::string const& trackLatestName, + std::function const& decorator) { + return CounterCollectionImpl::traceCounters( + this, traceEventName, traceEventID, interval, trackLatestName, decorator); } diff --git a/fdbrpc/include/fdbrpc/Stats.h b/fdbrpc/include/fdbrpc/Stats.h index f8a15e7c16..10e430c317 100644 --- a/fdbrpc/include/fdbrpc/Stats.h +++ b/fdbrpc/include/fdbrpc/Stats.h @@ -67,17 +67,39 @@ struct Traceable : std::true_type { } }; -struct CounterCollection { - CounterCollection(std::string name, std::string id = std::string()) : name(name), id(id) {} - std::vector counters, counters_to_remove; - ~CounterCollection() { - for (auto c : counters_to_remove) - c->remove(); - } +class CounterCollectionImpl; + +class CounterCollection { + friend class CounterCollectionImpl; + std::string name; std::string id; + std::vector counters, countersToRemove; + +public: + CounterCollection(std::string const& name, std::string const& id = std::string()) : name(name), id(id) {} + ~CounterCollection() { + for (auto c : countersToRemove) + c->remove(); + } + + void addCounter(ICounter* counter) { counters.push_back(counter); } + + // Call remove method on this counter in ~CounterCollection + void markForRemoval(ICounter* counter) { countersToRemove.push_back(counter); } + + std::string const& getName() const { return name; } + + std::string const& getId() const { return id; } void logToTraceEvent(TraceEvent& te) const; + + Future traceCounters( + std::string const& traceEventName, + UID 
traceEventID, + double interval, + std::string const& trackLatestName = std::string(), + std::function const& decorator = [](auto& te) {}); }; struct Counter final : ICounter, NonCopyable { @@ -131,8 +153,8 @@ struct Traceable : std::true_type { template struct SpecialCounter final : ICounter, FastAllocated>, NonCopyable { SpecialCounter(CounterCollection& collection, std::string const& name, F&& f) : name(name), f(f) { - collection.counters.push_back(this); - collection.counters_to_remove.push_back(this); + collection.addCounter(this); + collection.markForRemoval(this); } void remove() override { delete this; } @@ -162,14 +184,6 @@ static void specialCounter(CounterCollection& collection, std::string const& nam new SpecialCounter(collection, name, std::move(f)); } -Future traceCounters( - std::string const& traceEventName, - UID const& traceEventID, - double const& interval, - CounterCollection* const& counters, - std::string const& trackLatestName = std::string(), - std::function const& decorator = [](TraceEvent& te) {}); - class LatencyBands { public: LatencyBands(std::string name, UID id, double loggingInterval) @@ -180,7 +194,7 @@ public: if (bands.size() == 0) { ASSERT(!cc && !filteredCount); cc = std::make_unique(name, id.toString()); - logger = traceCounters(name, id, loggingInterval, cc.get(), id.toString() + "/" + name); + logger = cc->traceCounters(name, id, loggingInterval, id.toString() + "/" + name); filteredCount = std::make_unique("Filtered", *cc); insertBand(std::numeric_limits::infinity()); } diff --git a/fdbserver/BackupWorker.actor.cpp b/fdbserver/BackupWorker.actor.cpp index 67fc20aff4..488d35b3c3 100644 --- a/fdbserver/BackupWorker.actor.cpp +++ b/fdbserver/BackupWorker.actor.cpp @@ -290,8 +290,8 @@ struct BackupData { specialCounter(cc, "MsgQ", [this]() { return this->messages.size(); }); specialCounter(cc, "BufferedBytes", [this]() { return this->lock->activePermits(); }); specialCounter(cc, "AvailableBytes", [this]() { return this->lock->available(); }); - logger = traceCounters( - "BackupWorkerMetrics", myId, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "BackupWorkerMetrics"); + logger = + cc.traceCounters("BackupWorkerMetrics", myId, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "BackupWorkerMetrics"); } bool pullFinished() const { return endVersion.present() && pulledVersion.get() > endVersion.get(); } diff --git a/fdbserver/BlobManager.actor.cpp b/fdbserver/BlobManager.actor.cpp index ad6051b602..06bc97b4eb 100644 --- a/fdbserver/BlobManager.actor.cpp +++ b/fdbserver/BlobManager.actor.cpp @@ -296,7 +296,7 @@ struct BlobManagerStats { specialCounter(cc, "HardBoundaries", [mergeHardBoundaries]() { return mergeHardBoundaries->size(); }); specialCounter(cc, "SoftBoundaries", [mergeBoundaries]() { return mergeBoundaries->size(); }); specialCounter(cc, "BlockedAssignments", [this]() { return this->blockedAssignments; }); - logger = traceCounters("BlobManagerMetrics", id, interval, &cc, "BlobManagerMetrics"); + logger = cc.traceCounters("BlobManagerMetrics", id, interval, "BlobManagerMetrics"); } }; diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index c962ca891d..f65bf43746 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -3006,11 +3006,10 @@ ACTOR Future clusterControllerCore(ClusterControllerFullInterface interf, self.addActor.send(monitorConsistencyScan(&self)); self.addActor.send(metaclusterMetricsUpdater(&self)); self.addActor.send(dbInfoUpdater(&self)); - 
self.addActor.send(traceCounters("ClusterControllerMetrics", - self.id, - SERVER_KNOBS->STORAGE_LOGGING_DELAY, - &self.clusterControllerMetrics, - self.id.toString() + "/ClusterControllerMetrics")); + self.addActor.send(self.clusterControllerMetrics.traceCounters("ClusterControllerMetrics", + self.id, + SERVER_KNOBS->STORAGE_LOGGING_DELAY, + self.id.toString() + "/ClusterControllerMetrics")); self.addActor.send(traceRole(Role::CLUSTER_CONTROLLER, interf.id())); // printf("%s: I am the cluster controller\n", g_network->getLocalAddress().toString().c_str()); diff --git a/fdbserver/ConfigBroadcaster.actor.cpp b/fdbserver/ConfigBroadcaster.actor.cpp index eafce9c9cb..bf49f8e58a 100644 --- a/fdbserver/ConfigBroadcaster.actor.cpp +++ b/fdbserver/ConfigBroadcaster.actor.cpp @@ -183,8 +183,8 @@ class ConfigBroadcasterImpl { id(deterministicRandom()->randomUniqueID()), cc("ConfigBroadcaster"), compactRequest("CompactRequest", cc), successfulChangeRequest("SuccessfulChangeRequest", cc), failedChangeRequest("FailedChangeRequest", cc), snapshotRequest("SnapshotRequest", cc) { - logger = traceCounters( - "ConfigBroadcasterMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ConfigBroadcasterMetrics"); + logger = cc.traceCounters( + "ConfigBroadcasterMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ConfigBroadcasterMetrics"); } void addChanges(Standalone> const& changes, diff --git a/fdbserver/ConfigNode.actor.cpp b/fdbserver/ConfigNode.actor.cpp index 7c8ce60d04..b67e856be0 100644 --- a/fdbserver/ConfigNode.actor.cpp +++ b/fdbserver/ConfigNode.actor.cpp @@ -812,7 +812,7 @@ public: successfulCommits("SuccessfulCommits", cc), failedCommits("FailedCommits", cc), setMutations("SetMutations", cc), clearMutations("ClearMutations", cc), getValueRequests("GetValueRequests", cc), getGenerationRequests("GetGenerationRequests", cc) { - logger = traceCounters("ConfigNodeMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ConfigNode"); + logger = cc.traceCounters("ConfigNodeMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ConfigNode"); TraceEvent(SevInfo, "StartingConfigNode", id).detail("KVStoreAlreadyExists", kvStore.exists()); } diff --git a/fdbserver/GrvProxyServer.actor.cpp b/fdbserver/GrvProxyServer.actor.cpp index b865b9407b..a55748505f 100644 --- a/fdbserver/GrvProxyServer.actor.cpp +++ b/fdbserver/GrvProxyServer.actor.cpp @@ -154,7 +154,7 @@ struct GrvProxyStats { return int64_t(100 * this->percentageOfBatchGRVQueueProcessed); }); - logger = traceCounters("GrvProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "GrvProxyMetrics"); + logger = cc.traceCounters("GrvProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "GrvProxyMetrics"); for (int i = 0; i < FLOW_KNOBS->BASIC_LOAD_BALANCE_BUCKETS; i++) { requestBuckets.push_back(0); } diff --git a/fdbserver/LocalConfiguration.actor.cpp b/fdbserver/LocalConfiguration.actor.cpp index 7a8e04e76b..c2cdaf7479 100644 --- a/fdbserver/LocalConfiguration.actor.cpp +++ b/fdbserver/LocalConfiguration.actor.cpp @@ -347,8 +347,8 @@ public: Randomize::False, g_network->isSimulated() ? 
IsSimulated::True : IsSimulated::False); } - logger = traceCounters( - "LocalConfigurationMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "LocalConfigurationMetrics"); + logger = cc.traceCounters( + "LocalConfigurationMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "LocalConfigurationMetrics"); } Future addChanges(Standalone> changes, diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index 2ee606e6fd..399d820608 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -190,15 +190,14 @@ struct LogRouterData { }); specialCounter(cc, "Generation", [this]() { return this->generation; }); specialCounter(cc, "ActivePeekStreams", [this]() { return this->activePeekStreams; }); - logger = traceCounters("LogRouterMetrics", - dbgid, - SERVER_KNOBS->WORKER_LOGGING_INTERVAL, - &cc, - "LogRouterMetrics", - [this](TraceEvent& te) { - te.detail("PrimaryPeekLocation", this->primaryPeekLocation); - te.detail("RouterTag", this->routerTag.toString()); - }); + logger = cc.traceCounters("LogRouterMetrics", + dbgid, + SERVER_KNOBS->WORKER_LOGGING_INTERVAL, + "LogRouterMetrics", + [this](TraceEvent& te) { + te.detail("PrimaryPeekLocation", this->primaryPeekLocation); + te.detail("RouterTag", this->routerTag.toString()); + }); } }; diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index bdf0d06bc3..b122fde854 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -447,10 +447,10 @@ struct LogData : NonCopyable, public ReferenceCounted { "Restored"); addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id())); - persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.id); - persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.id); - version.initMetric("TLog.Version"_sr, cc.id); - queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.id); + persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.getId()); + persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.getId()); + version.initMetric("TLog.Version"_sr, cc.getId()); + queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.getId()); specialCounter(cc, "Version", [this]() { return this->version.get(); }); specialCounter(cc, "SharedBytesInput", [tLogData]() { return tLogData->bytesInput; }); @@ -1399,26 +1399,26 @@ ACTOR Future tLogCore(TLogData* self, Reference logData) { logData->addActor.send(waitFailureServer(logData->tli.waitFailure.getFuture())); logData->addActor.send(logData->removed); // FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance - logData->addActor.send(traceCounters("TLogMetrics", - logData->logId, - SERVER_KNOBS->STORAGE_LOGGING_DELAY, - &logData->cc, - logData->logId.toString() + "/TLogMetrics", - [self = self](TraceEvent& te) { - StorageBytes sbTlog = self->persistentData->getStorageBytes(); - te.detail("KvstoreBytesUsed", sbTlog.used); - te.detail("KvstoreBytesFree", sbTlog.free); - te.detail("KvstoreBytesAvailable", sbTlog.available); - te.detail("KvstoreBytesTotal", sbTlog.total); - te.detail("KvstoreBytesTemp", sbTlog.temp); + logData->addActor.send(logData->cc.traceCounters("TLogMetrics", + logData->logId, + SERVER_KNOBS->STORAGE_LOGGING_DELAY, + logData->logId.toString() + "/TLogMetrics", + [self = self](TraceEvent& te) { + StorageBytes sbTlog = self->persistentData->getStorageBytes(); + te.detail("KvstoreBytesUsed", sbTlog.used); + 
te.detail("KvstoreBytesFree", sbTlog.free); + te.detail("KvstoreBytesAvailable", sbTlog.available); + te.detail("KvstoreBytesTotal", sbTlog.total); + te.detail("KvstoreBytesTemp", sbTlog.temp); - StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes(); - te.detail("QueueDiskBytesUsed", sbQueue.used); - te.detail("QueueDiskBytesFree", sbQueue.free); - te.detail("QueueDiskBytesAvailable", sbQueue.available); - te.detail("QueueDiskBytesTotal", sbQueue.total); - te.detail("QueueDiskBytesTemp", sbQueue.temp); - })); + StorageBytes sbQueue = + self->rawPersistentQueue->getStorageBytes(); + te.detail("QueueDiskBytesUsed", sbQueue.used); + te.detail("QueueDiskBytesFree", sbQueue.free); + te.detail("QueueDiskBytesAvailable", sbQueue.available); + te.detail("QueueDiskBytesTotal", sbQueue.total); + te.detail("QueueDiskBytesTemp", sbQueue.temp); + })); logData->addActor.send(serveTLogInterface(self, logData->tli, logData, warningCollectorInput)); diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index fb0d7daacb..a346d214ff 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -533,10 +533,10 @@ struct LogData : NonCopyable, public ReferenceCounted { context); addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id())); - persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.id); - persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.id); - version.initMetric("TLog.Version"_sr, cc.id); - queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.id); + persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.getId()); + persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.getId()); + version.initMetric("TLog.Version"_sr, cc.getId()); + queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.getId()); specialCounter(cc, "Version", [this]() { return this->version.get(); }); specialCounter(cc, "QueueCommittedVersion", [this]() { return this->queueCommittedVersion.get(); }); @@ -2212,26 +2212,26 @@ ACTOR Future tLogCore(TLogData* self, logData->addActor.send(waitFailureServer(tli.waitFailure.getFuture())); logData->addActor.send(logData->removed); // FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance - logData->addActor.send(traceCounters("TLogMetrics", - logData->logId, - SERVER_KNOBS->STORAGE_LOGGING_DELAY, - &logData->cc, - logData->logId.toString() + "/TLogMetrics", - [self = self](TraceEvent& te) { - StorageBytes sbTlog = self->persistentData->getStorageBytes(); - te.detail("KvstoreBytesUsed", sbTlog.used); - te.detail("KvstoreBytesFree", sbTlog.free); - te.detail("KvstoreBytesAvailable", sbTlog.available); - te.detail("KvstoreBytesTotal", sbTlog.total); - te.detail("KvstoreBytesTemp", sbTlog.temp); + logData->addActor.send(logData->cc.traceCounters("TLogMetrics", + logData->logId, + SERVER_KNOBS->STORAGE_LOGGING_DELAY, + logData->logId.toString() + "/TLogMetrics", + [self = self](TraceEvent& te) { + StorageBytes sbTlog = self->persistentData->getStorageBytes(); + te.detail("KvstoreBytesUsed", sbTlog.used); + te.detail("KvstoreBytesFree", sbTlog.free); + te.detail("KvstoreBytesAvailable", sbTlog.available); + te.detail("KvstoreBytesTotal", sbTlog.total); + te.detail("KvstoreBytesTemp", sbTlog.temp); - StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes(); - te.detail("QueueDiskBytesUsed", sbQueue.used); - te.detail("QueueDiskBytesFree", sbQueue.free); 
- te.detail("QueueDiskBytesAvailable", sbQueue.available); - te.detail("QueueDiskBytesTotal", sbQueue.total); - te.detail("QueueDiskBytesTemp", sbQueue.temp); - })); + StorageBytes sbQueue = + self->rawPersistentQueue->getStorageBytes(); + te.detail("QueueDiskBytesUsed", sbQueue.used); + te.detail("QueueDiskBytesFree", sbQueue.free); + te.detail("QueueDiskBytesAvailable", sbQueue.available); + te.detail("QueueDiskBytesTotal", sbQueue.total); + te.detail("QueueDiskBytesTemp", sbQueue.temp); + })); logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput)); logData->addActor.send(cleanupPeekTrackers(logData.getPtr())); diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index e9e52dd21c..c68fb26b26 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -616,10 +616,10 @@ struct LogData : NonCopyable, public ReferenceCounted { context); addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id())); - persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.id); - persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.id); - version.initMetric("TLog.Version"_sr, cc.id); - queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.id); + persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.getId()); + persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.getId()); + version.initMetric("TLog.Version"_sr, cc.getId()); + queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.getId()); specialCounter(cc, "Version", [this]() { return this->version.get(); }); specialCounter(cc, "QueueCommittedVersion", [this]() { return this->queueCommittedVersion.get(); }); @@ -2671,26 +2671,26 @@ ACTOR Future tLogCore(TLogData* self, logData->addActor.send(waitFailureServer(tli.waitFailure.getFuture())); logData->addActor.send(logData->removed); // FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance - logData->addActor.send(traceCounters("TLogMetrics", - logData->logId, - SERVER_KNOBS->STORAGE_LOGGING_DELAY, - &logData->cc, - logData->logId.toString() + "/TLogMetrics", - [self = self](TraceEvent& te) { - StorageBytes sbTlog = self->persistentData->getStorageBytes(); - te.detail("KvstoreBytesUsed", sbTlog.used); - te.detail("KvstoreBytesFree", sbTlog.free); - te.detail("KvstoreBytesAvailable", sbTlog.available); - te.detail("KvstoreBytesTotal", sbTlog.total); - te.detail("KvstoreBytesTemp", sbTlog.temp); + logData->addActor.send(logData->cc.traceCounters("TLogMetrics", + logData->logId, + SERVER_KNOBS->STORAGE_LOGGING_DELAY, + logData->logId.toString() + "/TLogMetrics", + [self = self](TraceEvent& te) { + StorageBytes sbTlog = self->persistentData->getStorageBytes(); + te.detail("KvstoreBytesUsed", sbTlog.used); + te.detail("KvstoreBytesFree", sbTlog.free); + te.detail("KvstoreBytesAvailable", sbTlog.available); + te.detail("KvstoreBytesTotal", sbTlog.total); + te.detail("KvstoreBytesTemp", sbTlog.temp); - StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes(); - te.detail("QueueDiskBytesUsed", sbQueue.used); - te.detail("QueueDiskBytesFree", sbQueue.free); - te.detail("QueueDiskBytesAvailable", sbQueue.available); - te.detail("QueueDiskBytesTotal", sbQueue.total); - te.detail("QueueDiskBytesTemp", sbQueue.temp); - })); + StorageBytes sbQueue = + self->rawPersistentQueue->getStorageBytes(); + te.detail("QueueDiskBytesUsed", sbQueue.used); + 
te.detail("QueueDiskBytesFree", sbQueue.free); + te.detail("QueueDiskBytesAvailable", sbQueue.available); + te.detail("QueueDiskBytesTotal", sbQueue.total); + te.detail("QueueDiskBytesTemp", sbQueue.temp); + })); logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput)); logData->addActor.send(cleanupPeekTrackers(logData.getPtr())); diff --git a/fdbserver/Resolver.actor.cpp b/fdbserver/Resolver.actor.cpp index 61aaed1246..c3a59166a2 100644 --- a/fdbserver/Resolver.actor.cpp +++ b/fdbserver/Resolver.actor.cpp @@ -188,7 +188,7 @@ struct Resolver : ReferenceCounted { specialCounter(cc, "NeededVersion", [this]() { return this->neededVersion.get(); }); specialCounter(cc, "TotalStateBytes", [this]() { return this->totalStateBytes.get(); }); - logger = traceCounters("ResolverMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ResolverMetrics"); + logger = cc.traceCounters("ResolverMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ResolverMetrics"); } ~Resolver() { destroyConflictSet(conflictSet); } }; diff --git a/fdbserver/SimpleConfigConsumer.actor.cpp b/fdbserver/SimpleConfigConsumer.actor.cpp index 7241ffe48d..769bdf142f 100644 --- a/fdbserver/SimpleConfigConsumer.actor.cpp +++ b/fdbserver/SimpleConfigConsumer.actor.cpp @@ -166,8 +166,8 @@ public: successfulChangeRequest("SuccessfulChangeRequest", cc), failedChangeRequest("FailedChangeRequest", cc), snapshotRequest("SnapshotRequest", cc) { cfi = getConfigFollowerInterface(configSource); - logger = traceCounters( - "ConfigConsumerMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ConfigConsumerMetrics"); + logger = cc.traceCounters( + "ConfigConsumerMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ConfigConsumerMetrics"); } Future consume(ConfigBroadcaster& broadcaster) { diff --git a/fdbserver/StorageCache.actor.cpp b/fdbserver/StorageCache.actor.cpp index 0ac15840d2..6cf1c9b934 100644 --- a/fdbserver/StorageCache.actor.cpp +++ b/fdbserver/StorageCache.actor.cpp @@ -248,9 +248,9 @@ public: lastTLogVersion(0), lastVersionWithData(0), peekVersion(0), compactionInProgress(Void()), fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_BYTES), debug_inApplyUpdate(false), debug_lastValidateTime(0), versionLag(0), behind(false), counters(this) { - version.initMetric("StorageCacheData.Version"_sr, counters.cc.id); - desiredOldestVersion.initMetric("StorageCacheData.DesriedOldestVersion"_sr, counters.cc.id); - oldestVersion.initMetric("StorageCacheData.OldestVersion"_sr, counters.cc.id); + version.initMetric("StorageCacheData.Version"_sr, counters.cc.getId()); + desiredOldestVersion.initMetric("StorageCacheData.DesriedOldestVersion"_sr, counters.cc.getId()); + oldestVersion.initMetric("StorageCacheData.OldestVersion"_sr, counters.cc.getId()); newestAvailableVersion.insert(allKeys, invalidVersion); newestDirtyVersion.insert(allKeys, invalidVersion); @@ -2224,11 +2224,10 @@ ACTOR Future storageCacheServer(StorageServerInterface ssi, self.ck = cacheKeysPrefixFor(id).withPrefix(systemKeys.begin); // FFFF/02cacheKeys/[this server]/ actors.add(waitFailureServer(ssi.waitFailure.getFuture())); - actors.add(traceCounters("CacheMetrics", - self.thisServerID, - SERVER_KNOBS->STORAGE_LOGGING_DELAY, - &self.counters.cc, - self.thisServerID.toString() + "/CacheMetrics")); + actors.add(self.counters.cc.traceCounters("CacheMetrics", + self.thisServerID, + SERVER_KNOBS->STORAGE_LOGGING_DELAY, + self.thisServerID.toString() + "/CacheMetrics")); // fetch already cached ranges from the database and apply 
them before proceeding wait(storageCacheStartUpWarmup(&self)); diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 2c8a007fde..1334c33eef 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -652,10 +652,10 @@ struct LogData : NonCopyable, public ReferenceCounted { context); addActor.send(traceRole(Role::TRANSACTION_LOG, interf.id())); - persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.id); - persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.id); - version.initMetric("TLog.Version"_sr, cc.id); - queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.id); + persistentDataVersion.init("TLog.PersistentDataVersion"_sr, cc.getId()); + persistentDataDurableVersion.init("TLog.PersistentDataDurableVersion"_sr, cc.getId()); + version.initMetric("TLog.Version"_sr, cc.getId()); + queueCommittedVersion.initMetric("TLog.QueueCommittedVersion"_sr, cc.getId()); specialCounter(cc, "Version", [this]() { return this->version.get(); }); specialCounter(cc, "QueueCommittedVersion", [this]() { return this->queueCommittedVersion.get(); }); @@ -2930,26 +2930,26 @@ ACTOR Future tLogCore(TLogData* self, logData->addActor.send(waitFailureServer(tli.waitFailure.getFuture())); logData->addActor.send(logData->removed); // FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance - logData->addActor.send(traceCounters("TLogMetrics", - logData->logId, - SERVER_KNOBS->STORAGE_LOGGING_DELAY, - &logData->cc, - logData->logId.toString() + "/TLogMetrics", - [self = self](TraceEvent& te) { - StorageBytes sbTlog = self->persistentData->getStorageBytes(); - te.detail("KvstoreBytesUsed", sbTlog.used); - te.detail("KvstoreBytesFree", sbTlog.free); - te.detail("KvstoreBytesAvailable", sbTlog.available); - te.detail("KvstoreBytesTotal", sbTlog.total); - te.detail("KvstoreBytesTemp", sbTlog.temp); + logData->addActor.send(logData->cc.traceCounters("TLogMetrics", + logData->logId, + SERVER_KNOBS->STORAGE_LOGGING_DELAY, + logData->logId.toString() + "/TLogMetrics", + [self = self](TraceEvent& te) { + StorageBytes sbTlog = self->persistentData->getStorageBytes(); + te.detail("KvstoreBytesUsed", sbTlog.used); + te.detail("KvstoreBytesFree", sbTlog.free); + te.detail("KvstoreBytesAvailable", sbTlog.available); + te.detail("KvstoreBytesTotal", sbTlog.total); + te.detail("KvstoreBytesTemp", sbTlog.temp); - StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes(); - te.detail("QueueDiskBytesUsed", sbQueue.used); - te.detail("QueueDiskBytesFree", sbQueue.free); - te.detail("QueueDiskBytesAvailable", sbQueue.available); - te.detail("QueueDiskBytesTotal", sbQueue.total); - te.detail("QueueDiskBytesTemp", sbQueue.temp); - })); + StorageBytes sbQueue = + self->rawPersistentQueue->getStorageBytes(); + te.detail("QueueDiskBytesUsed", sbQueue.used); + te.detail("QueueDiskBytesFree", sbQueue.free); + te.detail("QueueDiskBytesAvailable", sbQueue.available); + te.detail("QueueDiskBytesTotal", sbQueue.total); + te.detail("QueueDiskBytesTemp", sbQueue.temp); + })); logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput)); logData->addActor.send(cleanupPeekTrackers(logData.getPtr())); diff --git a/fdbserver/include/fdbserver/ClusterRecovery.actor.h b/fdbserver/include/fdbserver/ClusterRecovery.actor.h index aa5e25a46c..eb2c4bf464 100644 --- a/fdbserver/include/fdbserver/ClusterRecovery.actor.h +++ b/fdbserver/include/fdbserver/ClusterRecovery.actor.h 
@@ -289,11 +289,10 @@ struct ClusterRecoveryData : NonCopyable, ReferenceCounted getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_DURATION_EVENT_NAME)); clusterRecoveryAvailableEventHolder = makeReference( getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_AVAILABLE_EVENT_NAME)); - logger = traceCounters(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME), - dbgid, - SERVER_KNOBS->WORKER_LOGGING_INTERVAL, - &cc, - getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME)); + logger = cc.traceCounters(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME), + dbgid, + SERVER_KNOBS->WORKER_LOGGING_INTERVAL, + getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_METRICS_EVENT_NAME)); if (forceRecovery && !controllerData->clusterControllerDcId.present()) { TraceEvent(SevError, "ForcedRecoveryRequiresDcID").log(); forceRecovery = false; diff --git a/fdbserver/include/fdbserver/ProxyCommitData.actor.h b/fdbserver/include/fdbserver/ProxyCommitData.actor.h index f5a8a060f9..7ac83c935a 100644 --- a/fdbserver/include/fdbserver/ProxyCommitData.actor.h +++ b/fdbserver/include/fdbserver/ProxyCommitData.actor.h @@ -156,7 +156,7 @@ struct ProxyStats { specialCounter(cc, "NumTenants", [pTenantMap]() { return pTenantMap ? pTenantMap->size() : 0; }); specialCounter(cc, "MaxCompute", [this]() { return this->getAndResetMaxCompute(); }); specialCounter(cc, "MinCompute", [this]() { return this->getAndResetMinCompute(); }); - logger = traceCounters("ProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ProxyMetrics"); + logger = cc.traceCounters("ProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "ProxyMetrics"); } }; diff --git a/fdbserver/include/fdbserver/RestoreApplier.actor.h b/fdbserver/include/fdbserver/RestoreApplier.actor.h index da370bc62c..d85af1c317 100644 --- a/fdbserver/include/fdbserver/RestoreApplier.actor.h +++ b/fdbserver/include/fdbserver/RestoreApplier.actor.h @@ -284,11 +284,11 @@ struct ApplierBatchData : public ReferenceCounted { : vbState(ApplierVersionBatchState::NOT_INIT), receiveMutationReqs(0), receivedBytes(0), appliedBytes(0), targetWriteRateMB(SERVER_KNOBS->FASTRESTORE_WRITE_BW_MB / SERVER_KNOBS->FASTRESTORE_NUM_APPLIERS), totalBytesToWrite(-1), applyingDataBytes(0), counters(this, nodeID, batchIndex) { - pollMetrics = traceCounters(format("FastRestoreApplierMetrics%d", batchIndex), - nodeID, - SERVER_KNOBS->FASTRESTORE_ROLE_LOGGING_DELAY, - &counters.cc, - nodeID.toString() + "/RestoreApplierMetrics/" + std::to_string(batchIndex)); + pollMetrics = + counters.cc.traceCounters(format("FastRestoreApplierMetrics%d", batchIndex), + nodeID, + SERVER_KNOBS->FASTRESTORE_ROLE_LOGGING_DELAY, + nodeID.toString() + "/RestoreApplierMetrics/" + std::to_string(batchIndex)); TraceEvent("FastRestoreApplierMetricsCreated").detail("Node", nodeID); } ~ApplierBatchData() { diff --git a/fdbserver/include/fdbserver/RestoreLoader.actor.h b/fdbserver/include/fdbserver/RestoreLoader.actor.h index 92b11a5a1c..bd4d361c28 100644 --- a/fdbserver/include/fdbserver/RestoreLoader.actor.h +++ b/fdbserver/include/fdbserver/RestoreLoader.actor.h @@ -93,11 +93,11 @@ struct LoaderBatchData : public ReferenceCounted { explicit LoaderBatchData(UID nodeID, int batchIndex) : vbState(LoaderVersionBatchState::NOT_INIT), loadFileReqs(0), counters(this, nodeID, batchIndex) { - pollMetrics = traceCounters(format("FastRestoreLoaderMetrics%d", batchIndex), - nodeID, - 
SERVER_KNOBS->FASTRESTORE_ROLE_LOGGING_DELAY, - &counters.cc, - nodeID.toString() + "/RestoreLoaderMetrics/" + std::to_string(batchIndex)); + pollMetrics = + counters.cc.traceCounters(format("FastRestoreLoaderMetrics%d", batchIndex), + nodeID, + SERVER_KNOBS->FASTRESTORE_ROLE_LOGGING_DELAY, + nodeID.toString() + "/RestoreLoaderMetrics/" + std::to_string(batchIndex)); TraceEvent("FastRestoreLoaderMetricsCreated").detail("Node", nodeID); } diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index 16486de2e8..69a02e2bc2 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -114,7 +114,7 @@ struct MasterData : NonCopyable, ReferenceCounted { SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, SERVER_KNOBS->LATENCY_SAMPLE_SIZE), addActor(addActor) { - logger = traceCounters("MasterMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "MasterMetrics"); + logger = cc.traceCounters("MasterMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, "MasterMetrics"); if (forceRecovery && !myInterface.locality.dcId().present()) { TraceEvent(SevError, "ForcedRecoveryRequiresDcID").log(); forceRecovery = false; diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 5c4393de87..801cdad98d 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -1308,10 +1308,10 @@ public: storageServerSourceTLogIDEventHolder( makeReference(ssi.id().toString() + "/StorageServerSourceTLogID")) { - version.initMetric("StorageServer.Version"_sr, counters.cc.id); - oldestVersion.initMetric("StorageServer.OldestVersion"_sr, counters.cc.id); - durableVersion.initMetric("StorageServer.DurableVersion"_sr, counters.cc.id); - desiredOldestVersion.initMetric("StorageServer.DesiredOldestVersion"_sr, counters.cc.id); + version.initMetric("StorageServer.Version"_sr, counters.cc.getId()); + oldestVersion.initMetric("StorageServer.OldestVersion"_sr, counters.cc.getId()); + durableVersion.initMetric("StorageServer.DurableVersion"_sr, counters.cc.getId()); + desiredOldestVersion.initMetric("StorageServer.DesiredOldestVersion"_sr, counters.cc.getId()); newestAvailableVersion.insert(allKeys, invalidVersion); newestDirtyVersion.insert(allKeys, invalidVersion); @@ -10188,32 +10188,32 @@ ACTOR Future metricsCore(StorageServer* self, StorageServerInterface ssi) TraceEvent("StorageServerRestoreDurableState", self->thisServerID).detail("RestoredBytes", self->bytesRestored); // Logs all counters in `counters.cc` and reset the interval. 
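Every traceCounters hunk in the patches above, and the storageserver hunk just below, makes the same mechanical change: the free function traceCounters(name, id, interval, &cc, prefix, decorator) becomes a member call cc.traceCounters(name, id, interval, prefix, decorator), so the counter collection no longer has to be passed alongside calls that already name it. A minimal sketch of the shape of that API with simplified stand-in types (an assumption for illustration; the real member also takes the tracing UID and returns a flow Future<Void> rather than running inline):

#include <functional>
#include <string>

struct TraceEvent {}; // stand-in for the real flow trace event type

struct CounterCollection {
    // After the refactor the collection owns its periodic logging loop.
    // The real implementation schedules an actor that, every `interval`
    // seconds, emits one TraceEvent named `name`, applies `decorate` if
    // provided, logs each counter's value and rate, then resets them.
    void traceCounters(const std::string& name,
                       double interval,
                       const std::string& eventPrefix,
                       std::function<void(TraceEvent&)> decorate = {}) {
        (void)name; (void)interval; (void)eventPrefix; (void)decorate;
    }
};

// Call sites shrink accordingly, e.g.:
//   logger = cc.traceCounters("ResolverMetrics", dbgid, interval, "ResolverMetrics");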
- self->actors.add(traceCounters("StorageMetrics", - self->thisServerID, - SERVER_KNOBS->STORAGE_LOGGING_DELAY, - &self->counters.cc, - self->thisServerID.toString() + "/StorageMetrics", - [self = self](TraceEvent& te) { - te.detail("StorageEngine", self->storage.getKeyValueStoreType().toString()); - te.detail("Tag", self->tag.toString()); - StorageBytes sb = self->storage.getStorageBytes(); - te.detail("KvstoreBytesUsed", sb.used); - te.detail("KvstoreBytesFree", sb.free); - te.detail("KvstoreBytesAvailable", sb.available); - te.detail("KvstoreBytesTotal", sb.total); - te.detail("KvstoreBytesTemp", sb.temp); - if (self->isTss()) { - te.detail("TSSPairID", self->tssPairID); - te.detail("TSSJointID", - UID(self->thisServerID.first() ^ self->tssPairID.get().first(), - self->thisServerID.second() ^ self->tssPairID.get().second())); - } else if (self->isSSWithTSSPair()) { - te.detail("SSPairID", self->ssPairID); - te.detail("TSSJointID", - UID(self->thisServerID.first() ^ self->ssPairID.get().first(), - self->thisServerID.second() ^ self->ssPairID.get().second())); - } - })); + self->actors.add(self->counters.cc.traceCounters( + "StorageMetrics", + self->thisServerID, + SERVER_KNOBS->STORAGE_LOGGING_DELAY, + self->thisServerID.toString() + "/StorageMetrics", + [self = self](TraceEvent& te) { + te.detail("StorageEngine", self->storage.getKeyValueStoreType().toString()); + te.detail("Tag", self->tag.toString()); + StorageBytes sb = self->storage.getStorageBytes(); + te.detail("KvstoreBytesUsed", sb.used); + te.detail("KvstoreBytesFree", sb.free); + te.detail("KvstoreBytesAvailable", sb.available); + te.detail("KvstoreBytesTotal", sb.total); + te.detail("KvstoreBytesTemp", sb.temp); + if (self->isTss()) { + te.detail("TSSPairID", self->tssPairID); + te.detail("TSSJointID", + UID(self->thisServerID.first() ^ self->tssPairID.get().first(), + self->thisServerID.second() ^ self->tssPairID.get().second())); + } else if (self->isSSWithTSSPair()) { + te.detail("SSPairID", self->ssPairID); + te.detail("TSSJointID", + UID(self->thisServerID.first() ^ self->ssPairID.get().first(), + self->thisServerID.second() ^ self->ssPairID.get().second())); + } + })); loop { choose { From 5a8adca1f7a7283a4f225f78a3c5b8143c80d535 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 25 Oct 2022 10:45:21 -0700 Subject: [PATCH 39/52] solve review comments: mark const; add comments; template abbreviation --- fdbclient/include/fdbclient/NativeAPI.actor.h | 3 ++- fdbserver/DataDistribution.actor.cpp | 14 ++++++++++++++ .../include/fdbserver/DataDistribution.actor.h | 14 +------------- fdbserver/include/fdbserver/MockGlobalState.h | 9 +++++---- .../fdbserver/ShardsAffectedByTeamFailure.h | 4 ++-- fdbserver/include/fdbserver/StorageMetrics.actor.h | 4 ++-- fdbserver/storageserver.actor.cpp | 10 +++++----- 7 files changed, 31 insertions(+), 27 deletions(-) diff --git a/fdbclient/include/fdbclient/NativeAPI.actor.h b/fdbclient/include/fdbclient/NativeAPI.actor.h index d1f4860f23..642a4e747a 100644 --- a/fdbclient/include/fdbclient/NativeAPI.actor.h +++ b/fdbclient/include/fdbclient/NativeAPI.actor.h @@ -602,7 +602,8 @@ ACTOR Future> waitStorageMetricsWithLocation(TenantInfo StorageMetrics permittedError); // Return the suggested split points from the storage server. The locations tell which interface should -// serve the request. The +// serve the request. `limit` is the current estimated storage metrics of `keys`. The returned points, if present, +// guarantee that the metrics of each split range stay within the limit.
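The comment above documents the contract for splitStorageMetricsWithLocations, whose declaration follows this note: the returned split points must keep each resulting range within `limit`. As an illustration of that contract only, a hedged sketch of greedy split selection over a byte sample; the function name and the flat byte-count model are hypothetical, since the real code works on full StorageMetrics:

#include <cstddef>
#include <cstdint>
#include <vector>

// Given the sizes of consecutive sample chunks, return indices at which to
// start a new range so that no range's total exceeds `limit` bytes.
std::vector<std::size_t> pickSplitPoints(const std::vector<int64_t>& chunkBytes, int64_t limit) {
    std::vector<std::size_t> splits;
    int64_t running = 0;
    for (std::size_t i = 0; i < chunkBytes.size(); ++i) {
        if (running > 0 && running + chunkBytes[i] > limit) {
            splits.push_back(i); // cut before chunk i; it opens a new range
            running = 0;
        }
        running += chunkBytes[i];
    }
    return splits; // empty means the whole range already fits within limit
}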
ACTOR Future>>> splitStorageMetricsWithLocations( std::vector locations, KeyRange keys, diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 9f01a1b2be..10b168b7b3 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -53,6 +53,20 @@ #include "fdbserver/DDSharedContext.h" #include "flow/actorcompiler.h" // This must be the last #include. +ShardSizeBounds ShardSizeBounds::shardSizeBoundsBeforeTrack() { + return ShardSizeBounds{ + .max = StorageMetrics{ .bytes = -1, + .bytesPerKSecond = StorageMetrics::infinity, + .iosPerKSecond = StorageMetrics::infinity, + .bytesReadPerKSecond = StorageMetrics::infinity }, + .min = StorageMetrics{ .bytes = -1, .bytesPerKSecond = 0, .iosPerKSecond = 0, .bytesReadPerKSecond = 0 }, + .permittedError = StorageMetrics{ .bytes = -1, + .bytesPerKSecond = StorageMetrics::infinity, + .iosPerKSecond = StorageMetrics::infinity, + .bytesReadPerKSecond = StorageMetrics::infinity } + }; +} + struct DDAudit { DDAudit(UID id, KeyRange range, AuditType type) : id(id), range(range), type(type), auditMap(AuditPhase::Invalid, allKeys.end), actors(true) {} diff --git a/fdbserver/include/fdbserver/DataDistribution.actor.h b/fdbserver/include/fdbserver/DataDistribution.actor.h index 2e77d07459..ff33386233 100644 --- a/fdbserver/include/fdbserver/DataDistribution.actor.h +++ b/fdbserver/include/fdbserver/DataDistribution.actor.h @@ -477,19 +477,7 @@ struct ShardSizeBounds { return max == rhs.max && min == rhs.min && permittedError == rhs.permittedError; } - static ShardSizeBounds shardSizeBoundsBeforeTrack() { - return ShardSizeBounds{ - .max = StorageMetrics{ .bytes = -1, - .bytesPerKSecond = StorageMetrics::infinity, - .iosPerKSecond = StorageMetrics::infinity, - .bytesReadPerKSecond = StorageMetrics::infinity }, - .min = StorageMetrics{ .bytes = -1, .bytesPerKSecond = 0, .iosPerKSecond = 0, .bytesReadPerKSecond = 0 }, - .permittedError = StorageMetrics{ .bytes = -1, - .bytesPerKSecond = StorageMetrics::infinity, - .iosPerKSecond = StorageMetrics::infinity, - .bytesReadPerKSecond = StorageMetrics::infinity } - }; - } + static ShardSizeBounds shardSizeBoundsBeforeTrack(); }; // Gets the permitted size and IO bounds for a shard diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index a404f24027..5f6109626d 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -113,11 +113,12 @@ public: void getStorageMetrics(const GetStorageMetricsRequest& req) override; template - using isLoadBalancedReply = std::is_base_of; + static constexpr bool isLoadBalancedReply = std::is_base_of_v; template - typename std::enable_if::value, void>::type - sendErrorWithPenalty(const ReplyPromise& promise, const Error& err, double penalty) { + typename std::enable_if_t, void> sendErrorWithPenalty(const ReplyPromise& promise, + const Error& err, + double penalty) { Reply reply; reply.error = err; reply.penalty = penalty; @@ -125,7 +126,7 @@ public: } template - typename std::enable_if::value, void>::type + typename std::enable_if_t, void> sendErrorWithPenalty(const ReplyPromise& promise, const Error& err, double) { promise.sendError(err); } diff --git a/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h b/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h index 9055098bc7..7b674510d4 100644 --- a/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h +++ 
b/fdbserver/include/fdbserver/ShardsAffectedByTeamFailure.h @@ -80,8 +80,8 @@ public: bool hasShards(Team team) const; // The first element of the pair is either the source for non-moving shards or the destination team for in-flight - // shards The second element of the pair is all previous sources for in-flight shards. This function only return the - // teams for the first shard in [keys.begin, keys.end) + // shards. The second element of the pair is all previous sources for in-flight shards. This function only returns + // the teams for the first shard in [keys.begin, keys.end) std::pair, std::vector> getTeamsForFirstShard(KeyRangeRef keys); std::pair, std::vector> getTeamsFor(KeyRef key); diff --git a/fdbserver/include/fdbserver/StorageMetrics.actor.h b/fdbserver/include/fdbserver/StorageMetrics.actor.h index 552db2c6f7..dc518cf318 100644 --- a/fdbserver/include/fdbserver/StorageMetrics.actor.h +++ b/fdbserver/include/fdbserver/StorageMetrics.actor.h @@ -163,9 +163,9 @@ public: StorageServerMetrics metrics; // penalty used by loadBalance() to balance requests among service instances - virtual double getPenalty() { return 1; } + virtual double getPenalty() const { return 1; } - virtual bool isReadable(KeyRangeRef const& keys) { return true; } + virtual bool isReadable(KeyRangeRef const& keys) const { return true; } virtual void addActor(Future future) = 0; diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 1d8fb40c8f..1472b4dcaa 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -807,8 +807,8 @@ public: VersionedData const& data() const { return versionedData; } VersionedData& mutableData() { return versionedData; } - double old_rate = 1.0; - double currentRate() { + mutable double old_rate = 1.0; + double currentRate() const { auto versionLag = version.get() - durableVersion.get(); double res; if (versionLag >= SERVER_KNOBS->STORAGE_DURABILITY_LAG_HARD_MAX) { @@ -1379,7 +1379,7 @@ public: // This is the maximum version that might be read from storage (the minimum version is durableVersion) Version storageVersion() const { return oldestVersion.get(); } - bool isReadable(KeyRangeRef const& keys) override { + bool isReadable(KeyRangeRef const& keys) const override { auto sh = shards.intersectingRanges(keys); for (auto i = sh.begin(); i != sh.end(); ++i) if (!i->value()->isReadable()) @@ -1405,10 +1405,10 @@ public: } } - Counter::Value queueSize() { return counters.bytesInput.getValue() - counters.bytesDurable.getValue(); } + Counter::Value queueSize() const { return counters.bytesInput.getValue() - counters.bytesDurable.getValue(); } // penalty used by loadBalance() to balance requests among SSes. We prefer SS with less write queue size. 
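One detail in the const-correctness hunks above deserves a note: currentRate() becomes const while the old_rate cache it refreshes becomes mutable, so read paths such as the now-const getPenalty() can query the smoothed rate without a non-const reference. A minimal sketch of the idiom; the name and the smoothing formula here are illustrative assumptions, not the storage server's actual lag-based computation:

struct RateSmoother {
    // Logically an implementation detail, not observable state, hence mutable.
    mutable double oldRate = 1.0;

    // Const interface: querying the rate does not change the object's
    // logical state, but the exponentially smoothed value is still cached.
    double currentRate(double instantaneous) const {
        oldRate = 0.9 * oldRate + 0.1 * instantaneous;
        return oldRate;
    }
};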
- double getPenalty() override { + double getPenalty() const override { return std::max(std::max(1.0, (queueSize() - (SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER - 2.0 * SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER)) / From 31a48f404613221e42c9379bc6004377e34eee45 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Tue, 25 Oct 2022 13:10:15 -0700 Subject: [PATCH 40/52] Disable GLOBAL_TAG_THROTTLING by default --- fdbclient/ServerKnobs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 8dda15b584..009891f664 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -727,7 +727,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( TAG_THROTTLE_EXPIRED_CLEANUP_INTERVAL, 30.0 ); if(randomize && BUGGIFY) TAG_THROTTLE_EXPIRED_CLEANUP_INTERVAL = 1.0; init( AUTO_TAG_THROTTLING_ENABLED, true ); if(randomize && BUGGIFY) AUTO_TAG_THROTTLING_ENABLED = false; init( SS_THROTTLE_TAGS_TRACKED, 1 ); if(randomize && BUGGIFY) SS_THROTTLE_TAGS_TRACKED = deterministicRandom()->randomInt(1, 10); - init( GLOBAL_TAG_THROTTLING, true ); if(isSimulated) GLOBAL_TAG_THROTTLING = deterministicRandom()->coinflip(); + init( GLOBAL_TAG_THROTTLING, false ); if(isSimulated) GLOBAL_TAG_THROTTLING = deterministicRandom()->coinflip(); init( ENFORCE_TAG_THROTTLING_ON_PROXIES, GLOBAL_TAG_THROTTLING ); init( GLOBAL_TAG_THROTTLING_MIN_RATE, 1.0 ); init( GLOBAL_TAG_THROTTLING_FOLDING_TIME, 10.0 ); From ca0b068f2b37bb65fbe866ac2a44e81fca28c148 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Tue, 25 Oct 2022 13:14:10 -0700 Subject: [PATCH 41/52] Remove unnecessary forward declaration Co-authored-by: Markus Pilman --- fdbrpc/include/fdbrpc/Stats.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fdbrpc/include/fdbrpc/Stats.h b/fdbrpc/include/fdbrpc/Stats.h index 10e430c317..5e16e1cd4a 100644 --- a/fdbrpc/include/fdbrpc/Stats.h +++ b/fdbrpc/include/fdbrpc/Stats.h @@ -67,8 +67,6 @@ struct Traceable : std::true_type { } }; -class CounterCollectionImpl; - class CounterCollection { friend class CounterCollectionImpl; From ad2888423cae73202892cde9ba464365ce81a198 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Tue, 25 Oct 2022 14:48:04 -0600 Subject: [PATCH 42/52] ignore injected errors in old fdb versions --- contrib/TestHarness2/test_harness/summarize.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/contrib/TestHarness2/test_harness/summarize.py b/contrib/TestHarness2/test_harness/summarize.py index 3629fa7d43..3ecfa2243b 100644 --- a/contrib/TestHarness2/test_harness/summarize.py +++ b/contrib/TestHarness2/test_harness/summarize.py @@ -569,6 +569,9 @@ class Summary: self.handler.add_handler(('Severity', '30'), parse_warning) def parse_error(attrs: Dict[str, str]): + if 'ErrorIsInjectedFault' in attrs and attrs['ErrorIsInjectedFault'].lower() in ['1', 'true']: + # ignore injected errors. 
In newer fdb versions these will have a lower severity + return self.errors += 1 self.error = True if self.errors > config.max_errors: From 36d9de90724b270f73993ffa1f11b7c67c220e95 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 25 Oct 2022 15:43:24 -0700 Subject: [PATCH 43/52] change UNREACHABLE to ASSERT(false); change function name --- fdbclient/NativeAPI.actor.cpp | 1 + fdbserver/DDTxnProcessor.actor.cpp | 2 +- fdbserver/MockGlobalState.actor.cpp | 13 ++++++++++--- fdbserver/include/fdbserver/MockGlobalState.h | 2 +- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 4aece2f05f..84111ecd98 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -7736,6 +7736,7 @@ ACTOR Future> waitStorageMetricsWithLocation(TenantInfo StorageMetrics x = wait(fx); return x; } catch (Error& e) { + TraceEvent(SevDebug, "WaitStorageMetricsError").error(e); if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) { TraceEvent(SevError, "WaitStorageMetricsError").error(e); throw; diff --git a/fdbserver/DDTxnProcessor.actor.cpp b/fdbserver/DDTxnProcessor.actor.cpp index 2c4c695c23..2770345a72 100644 --- a/fdbserver/DDTxnProcessor.actor.cpp +++ b/fdbserver/DDTxnProcessor.actor.cpp @@ -816,7 +816,7 @@ Future DDMockTxnProcessor::removeStorageServer(const UID& serverID, const Optional& tssPairID, const MoveKeysLock& lock, const DDEnabledState* ddEnabledState) const { - ASSERT(mgs->allShardRemovedFromServer(serverID)); + ASSERT(mgs->allShardsRemovedFromServer(serverID)); mgs->allServers.erase(serverID); return Void(); } diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index 1e03b71e85..84b28e6d18 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -240,6 +240,13 @@ Future MockStorageServer::waitMetricsTenantAware(const WaitMetricsRequest& void MockStorageServer::getStorageMetrics(const GetStorageMetricsRequest& req) {} Future MockStorageServer::run() { + ssi.locality = LocalityData(Optional>(), + Standalone(deterministicRandom()->randomUniqueID().toString()), + Standalone(deterministicRandom()->randomUniqueID().toString()), + Optional>()); + ssi.initEndpoints(); + ssi.startAcceptingRequests(); + TraceEvent("MockStorageServerStart").detail("Address", ssi.address()); return serveStorageMetricsRequests(this, ssi); } @@ -298,7 +305,7 @@ bool MockGlobalState::serverIsDestForShard(const UID& serverId, KeyRangeRef shar }); } -bool MockGlobalState::allShardRemovedFromServer(const UID& serverId) { +bool MockGlobalState::allShardsRemovedFromServer(const UID& serverId) { return allServers.count(serverId) && shardMapping->getNumberOfShards(serverId) == 0; } @@ -362,7 +369,7 @@ Future> MockGlobalState::getKeyRangeLocations( if (reverse) { // DD never ask for backward range. 
- UNREACHABLE(); + ASSERT(false); } ASSERT(keys.begin < keys.end); @@ -591,7 +598,7 @@ TEST_CASE("/MockGlobalState/MockStorageServer/WaitStorageMetricsRequest") { testConfig.minimumReplication = 1; testConfig.logAntiQuorum = 0; DatabaseConfiguration dbConfig = generateNormalDatabaseConfiguration(testConfig); - TraceEvent("UnitTestDbConfig").detail("Config", dbConfig.toString()); + TraceEvent("WaitStorageMetricsRequestUnitTestConfig").detail("Config", dbConfig.toString()); state std::shared_ptr mgs = std::make_shared(); mgs->initializeAsEmptyDatabaseMGS(dbConfig); diff --git a/fdbserver/include/fdbserver/MockGlobalState.h b/fdbserver/include/fdbserver/MockGlobalState.h index 5f6109626d..ac984e9069 100644 --- a/fdbserver/include/fdbserver/MockGlobalState.h +++ b/fdbserver/include/fdbserver/MockGlobalState.h @@ -197,7 +197,7 @@ public: * * mgs.shardMapping doesn’t have any information about X * * mgs.allServer[X] is existed */ - bool allShardRemovedFromServer(const UID& serverId); + bool allShardsRemovedFromServer(const UID& serverId); // SOMEDAY: NativeAPI::waitStorageMetrics should share the code in the future, this is a simpler version of it Future, int>> waitStorageMetrics(KeyRange const& keys, From 0a5e59675878bff9277bd9c60667de3dd7208630 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 25 Oct 2022 16:43:00 -0700 Subject: [PATCH 44/52] fix network failure check in unit test --- fdbserver/MockGlobalState.actor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/fdbserver/MockGlobalState.actor.cpp b/fdbserver/MockGlobalState.actor.cpp index 84b28e6d18..8de995fd2b 100644 --- a/fdbserver/MockGlobalState.actor.cpp +++ b/fdbserver/MockGlobalState.actor.cpp @@ -607,6 +607,7 @@ TEST_CASE("/MockGlobalState/MockStorageServer/WaitStorageMetricsRequest") { ActorCollection* ptr = &actors; // get around ACTOR syntax restriction std::for_each(mgs->allServers.begin(), mgs->allServers.end(), [ptr](auto& server) { ptr->add(server.second.run()); + IFailureMonitor::failureMonitor().setStatus(server.second.ssi.address(), FailureStatus(false)); server.second.metrics.byteSample.sample.insert("something"_sr, 500000); }); From e4116f8aee49fe95f4586259b71d04ee584d7576 Mon Sep 17 00:00:00 2001 From: Aaron Molitor Date: Tue, 25 Oct 2022 19:19:54 -0500 Subject: [PATCH 45/52] cleanup shell script, remove set -x, add more detailed logging --- packaging/docker/run_ycsb.sh | 48 ++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/packaging/docker/run_ycsb.sh b/packaging/docker/run_ycsb.sh index deb065a728..bfe3e8df6e 100755 --- a/packaging/docker/run_ycsb.sh +++ b/packaging/docker/run_ycsb.sh @@ -1,22 +1,44 @@ #!/usr/bin/env bash -set -Eeuxo pipefail +set -Eeuo pipefail + +function logg () { + printf "##### $(date +'%Y-%m-%dT%H:%M:%SZ') # %-56.55s #####\n" "${1}" +} + +function error_exit () { + echo "################################################################################" + logg "${0} FAILED" + logg "RUN_ID: ${RUN_ID}" + logg "WORKLOAD: ${WORKLOAD}" + logg "ENVIRONMENT IS:" + env + echo "################################################################################" +} + +trap error_exit ERR namespace=$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace) -POD_NUM=$(echo $POD_NAME | cut -d - -f3) -KEY="ycsb_load_${POD_NUM}_of_${NUM_PODS}_complete" -CLI=$(ls /var/dynamic-conf/bin/*/fdbcli | head -n1) -echo "WAITING FOR ALL PODS TO COME UP" -while [[ $(kubectl get pods -n ${namespace} -l name=ycsb,run=${RUN_ID} --field-selector=status.phase=Running 
| grep -cv NAME) -lt ${NUM_PODS} ]]; do +logg "WAITING FOR ${NUM_PODS} PODS TO COME UP IN ${namespace}" +while [[ $(kubectl get pods -n "${namespace}" -l name=ycsb,run="${RUN_ID}" --field-selector=status.phase=Running | grep -cv NAME) -lt ${NUM_PODS} ]]; do sleep 1 done -echo "ALL PODS ARE UP" +logg "${NUM_PODS} PODS ARE UP IN ${namespace}" -echo "RUNNING YCSB" -./bin/ycsb.sh ${MODE} foundationdb -s -P workloads/${WORKLOAD} ${YCSB_ARGS} -echo "YCSB FINISHED" +logg "RUNNING YCSB ${WORKLOAD}" +set -x +./bin/ycsb.sh "${MODE}" foundationdb -s -P "workloads/${WORKLOAD}" "${YCSB_ARGS}" +set +x +logg "YCSB ${WORKLOAD} FINISHED" -echo "COPYING HISTOGRAMS TO S3" -aws s3 sync --sse aws:kms --exclude "*" --include "histogram.*" /tmp s3://${BUCKET}/ycsb_histograms/${namespace}/${POD_NAME} -echo "COPYING HISTOGRAMS TO S3 FINISHED" +logg "COPYING HISTOGRAMS TO S3" +set -x +aws s3 sync --sse aws:kms --exclude "*" --include "histogram.*" /tmp "s3://${BUCKET}/ycsb_histograms/${namespace}/${POD_NAME}" +set +x +logg "COPYING HISTOGRAMS TO S3 FINISHED" +echo "################################################################################" +logg "COMPLETED ${0}" +logg "RUN_ID: ${RUN_ID}" +logg "WORKLOAD: ${WORKLOAD}" +echo "################################################################################" From 3c5d3f7a94d58639ee8aee84c5cf8216a076f0d5 Mon Sep 17 00:00:00 2001 From: Marian Dvorsky Date: Wed, 26 Oct 2022 16:29:28 +0200 Subject: [PATCH 46/52] Fix SpanContext for GP:getLiveCommittedVersion (#8565) * Fix SpanContext for GP:getLiveCommittedVersion --- fdbclient/include/fdbclient/Tracing.h | 15 +------- fdbserver/GrvProxyServer.actor.cpp | 35 ++++++++++--------- .../GrvProxyTransactionTagThrottler.actor.cpp | 12 +++---- .../GrvProxyTransactionTagThrottler.h | 4 +-- 4 files changed, 28 insertions(+), 38 deletions(-) diff --git a/fdbclient/include/fdbclient/Tracing.h b/fdbclient/include/fdbclient/Tracing.h index 789b346dfd..01ffcaa5dd 100644 --- a/fdbclient/include/fdbclient/Tracing.h +++ b/fdbclient/include/fdbclient/Tracing.h @@ -273,17 +273,4 @@ struct ITracer { virtual void trace(Span const& span) = 0; }; -void openTracer(TracerType type); - -template -struct SpannedDeque : Deque { - Span span; - explicit SpannedDeque(Location loc) : span(loc) {} - SpannedDeque(SpannedDeque&& other) : Deque(std::move(other)), span(std::move(other.span)) {} - SpannedDeque(SpannedDeque const&) = delete; - SpannedDeque& operator=(SpannedDeque const&) = delete; - SpannedDeque& operator=(SpannedDeque&& other) { - *static_cast*>(this) = std::move(other); - span = std::move(other.span); - } -}; +void openTracer(TracerType type); \ No newline at end of file diff --git a/fdbserver/GrvProxyServer.actor.cpp b/fdbserver/GrvProxyServer.actor.cpp index a55748505f..e384678f9d 100644 --- a/fdbserver/GrvProxyServer.actor.cpp +++ b/fdbserver/GrvProxyServer.actor.cpp @@ -459,9 +459,9 @@ void dropRequestFromQueue(Deque* queue, GrvProxyStats* st // Put a GetReadVersion request into the queue corresponding to its priority. 
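The comment above describes the routing performed by queueGetReadVersionRequests, whose signature now takes plain Deques since the SpannedDeque wrapper is gone. A stripped-down sketch of the three-way priority routing, using std::deque and a simplified request type as stand-ins (the real code also does accounting, rejects batch-priority work under load, and tracks queue-size stats):

#include <cstdint>
#include <deque>

enum TransactionPriority { BATCH, DEFAULT, IMMEDIATE };

struct GetReadVersionRequest {
    TransactionPriority priority;
    int64_t transactionCount;
};

void route(const GetReadVersionRequest& req,
           std::deque<GetReadVersionRequest>& systemQueue,
           std::deque<GetReadVersionRequest>& defaultQueue,
           std::deque<GetReadVersionRequest>& batchQueue) {
    if (req.priority >= IMMEDIATE) {
        systemQueue.push_back(req); // system work is never turned away here
    } else if (req.priority >= DEFAULT) {
        defaultQueue.push_back(req);
    } else {
        batchQueue.push_back(req); // drained only when spare capacity exists
    }
}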
ACTOR Future queueGetReadVersionRequests(Reference const> db, - SpannedDeque* systemQueue, - SpannedDeque* defaultQueue, - SpannedDeque* batchQueue, + Deque* systemQueue, + Deque* defaultQueue, + Deque* batchQueue, FutureStream readVersionRequests, PromiseStream GRVTimer, double* lastGRVTime, @@ -531,7 +531,6 @@ ACTOR Future queueGetReadVersionRequests(Reference stats->txnSystemPriorityStartIn += req.transactionCount; ++stats->systemGRVQueueSize; systemQueue->push_back(req); - // systemQueue->span.addParent(req.spanContext); } else if (req.priority >= TransactionPriority::DEFAULT) { ++stats->txnRequestIn; stats->txnStartIn += req.transactionCount; @@ -542,7 +541,6 @@ ACTOR Future queueGetReadVersionRequests(Reference } else { defaultQueue->push_back(req); } - // defaultQueue->span.addParent(req.spanContext); } else { // Return error for batch_priority GRV requests int64_t proxiesCount = std::max((int)db->get().client.grvProxies.size(), 1); @@ -559,7 +557,6 @@ ACTOR Future queueGetReadVersionRequests(Reference } else { batchQueue->push_back(req); } - // batchQueue->span.addParent(req.spanContext); } } } @@ -607,7 +604,7 @@ ACTOR Future lastCommitUpdater(GrvProxyData* self, PromiseStream getLiveCommittedVersion(SpanContext parentSpan, +ACTOR Future getLiveCommittedVersion(std::vector spanContexts, GrvProxyData* grvProxyData, uint32_t flags, Optional debugID, @@ -620,7 +617,10 @@ ACTOR Future getLiveCommittedVersion(SpanContext parentSpan // before the request returns, so it is committed. (2) No proxy on our list reported committed a higher version // before this request was received, because then its committedVersion would have been higher, // and no other proxy could have already committed anything without first ending the epoch - state Span span("GP:getLiveCommittedVersion"_loc, parentSpan); + state Span span("GP:getLiveCommittedVersion"_loc); + for (const SpanContext& spanContext : spanContexts) { + span.addLink(spanContext); + } ++grvProxyData->stats.txnStartBatch; state double grvStart = now(); @@ -826,15 +826,14 @@ ACTOR static Future transactionStarter(GrvProxyInterface proxy, state GrvTransactionRateInfo batchRateInfo(0); state GrvProxyTransactionTagThrottler tagThrottler; - state SpannedDeque systemQueue("GP:transactionStarterSystemQueue"_loc); - state SpannedDeque defaultQueue("GP:transactionStarterDefaultQueue"_loc); - state SpannedDeque batchQueue("GP:transactionStarterBatchQueue"_loc); + state Deque systemQueue; + state Deque defaultQueue; + state Deque batchQueue; state TransactionTagMap transactionTagCounter; state PrioritizedTransactionTagMap clientThrottledTags; state PromiseStream normalGRVLatency; - // state Span span; state int64_t midShardSize = SERVER_KNOBS->MIN_SHARD_BYTES; getCurrentLineage()->modify(&TransactionLineage::operation) = @@ -911,7 +910,7 @@ ACTOR static Future transactionStarter(GrvProxyInterface proxy, uint32_t defaultQueueSize = defaultQueue.size(); uint32_t batchQueueSize = batchQueue.size(); while (requestsToStart < SERVER_KNOBS->START_TRANSACTION_MAX_REQUESTS_TO_START) { - SpannedDeque* transactionQueue; + Deque* transactionQueue; if (!systemQueue.empty()) { transactionQueue = &systemQueue; } else if (!defaultQueue.empty()) { @@ -921,7 +920,6 @@ ACTOR static Future transactionStarter(GrvProxyInterface proxy, } else { break; } - // transactionQueue->span.swap(span); auto& req = transactionQueue->front(); int tc = req.transactionCount; @@ -1017,7 +1015,13 @@ ACTOR static Future transactionStarter(GrvProxyInterface proxy, int batchGRVProcessed = 0; for 
(int i = 0; i < start.size(); i++) { if (start[i].size()) { - Future readVersionReply = getLiveCommittedVersion(SpanContext(), + std::vector spanContexts; + spanContexts.reserve(start[i].size()); + for (const GetReadVersionRequest& request : start[i]) { + spanContexts.push_back(request.spanContext); + } + + Future readVersionReply = getLiveCommittedVersion(spanContexts, grvProxyData, i, debugID, @@ -1041,7 +1045,6 @@ ACTOR static Future transactionStarter(GrvProxyInterface proxy, batchGRVProcessed += batchPriTransactionsStarted[i]; } } - // span = Span(span.location); grvProxyData->stats.percentageOfDefaultGRVQueueProcessed = defaultQueueSize ? (double)defaultGRVProcessed / defaultQueueSize : 1; diff --git a/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp b/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp index d6cf76bc63..772bd24ba7 100644 --- a/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp +++ b/fdbserver/GrvProxyTransactionTagThrottler.actor.cpp @@ -100,8 +100,8 @@ void GrvProxyTransactionTagThrottler::addRequest(GetReadVersionRequest const& re } void GrvProxyTransactionTagThrottler::releaseTransactions(double elapsed, - SpannedDeque& outBatchPriority, - SpannedDeque& outDefaultPriority) { + Deque& outBatchPriority, + Deque& outDefaultPriority) { // Pointer to a TagQueue with some extra metadata stored alongside struct TagQueueHandle { // Store pointers here to avoid frequent std::unordered_map lookups @@ -280,8 +280,8 @@ ACTOR static Future mockFifoClient(GrvProxyTransactionTagThrottler* thrott } ACTOR static Future mockServer(GrvProxyTransactionTagThrottler* throttler) { - state SpannedDeque outBatchPriority("TestGrvProxyTransactionTagThrottler_Batch"_loc); - state SpannedDeque outDefaultPriority("TestGrvProxyTransactionTagThrottler_Default"_loc); + state Deque outBatchPriority; + state Deque outDefaultPriority; loop { state double elapsed = (0.009 + 0.002 * deterministicRandom()->random01()); wait(delay(elapsed)); @@ -404,8 +404,8 @@ TEST_CASE("/GrvProxyTransactionTagThrottler/Cleanup2") { throttler.updateRates(TransactionTagMap{}); ASSERT_EQ(throttler.size(), 1); { - SpannedDeque outBatchPriority("TestGrvProxyTransactionTagThrottler_Batch"_loc); - SpannedDeque outDefaultPriority("TestGrvProxyTransactionTagThrottler_Default"_loc); + Deque outBatchPriority; + Deque outDefaultPriority; throttler.releaseTransactions(0.1, outBatchPriority, outDefaultPriority); } // Calling updates cleans up the queues in throttler diff --git a/fdbserver/include/fdbserver/GrvProxyTransactionTagThrottler.h b/fdbserver/include/fdbserver/GrvProxyTransactionTagThrottler.h index 887ff9ffb0..9575280ea4 100644 --- a/fdbserver/include/fdbserver/GrvProxyTransactionTagThrottler.h +++ b/fdbserver/include/fdbserver/GrvProxyTransactionTagThrottler.h @@ -72,8 +72,8 @@ public: // If a request is ready to be executed, it is sent to the deque // corresponding to its priority. If not, the request remains queued. 
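Worth pausing on the getLiveCommittedVersion change above: instead of inheriting a single parent SpanContext, the GRV span is now created parentless and explicitly linked to the context of every request in the batch, which keeps each client's trace connected to the one batched server-side operation. A hedged sketch of that aggregation with a simplified Span type (the real one lives in fdbclient/Tracing.h):

#include <vector>

struct SpanContext {}; // opaque trace context carried on each request

struct Span {
    std::vector<SpanContext> links;
    void addLink(const SpanContext& ctx) { links.push_back(ctx); }
};

struct GetReadVersionRequest {
    SpanContext spanContext;
};

// One span per GRV batch, linked (not parented) to every request it serves.
Span makeBatchSpan(const std::vector<GetReadVersionRequest>& batch) {
    Span span; // named "GP:getLiveCommittedVersion" in the real code
    for (const auto& req : batch) {
        span.addLink(req.spanContext);
    }
    return span;
}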
void releaseTransactions(double elapsed, - SpannedDeque& outBatchPriority, - SpannedDeque& outDefaultPriority); + Deque& outBatchPriority, + Deque& outDefaultPriority); void addRequest(GetReadVersionRequest const&); From b8b7b46d8f11390ca283ba0a252b60f2d2029b3a Mon Sep 17 00:00:00 2001 From: Aaron Molitor Date: Wed, 26 Oct 2022 08:16:27 -0500 Subject: [PATCH 47/52] update kubectl and awscli --- packaging/docker/Dockerfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packaging/docker/Dockerfile b/packaging/docker/Dockerfile index eddc5b488f..70d488b88c 100644 --- a/packaging/docker/Dockerfile +++ b/packaging/docker/Dockerfile @@ -178,13 +178,13 @@ RUN yum -y install \ rm -rf /var/cache/yum WORKDIR /tmp -RUN curl -Ls https://amazon-eks.s3.amazonaws.com/1.19.6/2021-01-05/bin/linux/amd64/kubectl -o kubectl && \ - echo "08ff68159bbcb844455167abb1d0de75bbfe5ae1b051f81ab060a1988027868a kubectl" > kubectl.txt && \ +RUN curl -Ls https://s3.us-west-2.amazonaws.com/amazon-eks/1.22.6/2022-03-09/bin/linux/amd64/kubectl -o kubectl && \ + echo "860c3d37a5979491895767e7332404d28dc0d7797c7673c33df30ca80e215a07 kubectl" > kubectl.txt && \ sha256sum --quiet -c kubectl.txt && \ mv kubectl /usr/local/bin/kubectl && \ chmod 755 /usr/local/bin/kubectl && \ - curl -Ls https://awscli.amazonaws.com/awscli-exe-linux-x86_64-2.2.43.zip -o "awscliv2.zip" && \ - echo "9a8b3c4e7f72bbcc55e341dce3af42479f2730c225d6d265ee6f9162cfdebdfd awscliv2.zip" > awscliv2.txt && \ + curl -Ls https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m)-2.7.34.zip -o "awscliv2.zip" && \ + echo "daf9253f0071b5cfee9532bc5220bedd7a5d29d4e0f92b42b9e3e4c496341e88 awscliv2.zip" > awscliv2.txt && \ sha256sum --quiet -c awscliv2.txt && \ unzip -qq awscliv2.zip && \ ./aws/install && \ From ab6953be7da4dc652a8abddd21fd2686d9bd16c0 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Wed, 26 Oct 2022 11:02:50 -0500 Subject: [PATCH 48/52] Blob Granule read-driven compaction (#8572) --- fdbclient/ServerKnobs.cpp | 5 + .../include/fdbclient/BlobWorkerCommon.h | 10 +- fdbclient/include/fdbclient/ServerKnobs.h | 4 + fdbserver/BlobWorker.actor.cpp | 187 ++++++++++++++++-- .../workloads/BlobGranuleVerifier.actor.cpp | 10 +- 5 files changed, 197 insertions(+), 19 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 009891f664..68eed2867f 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -967,6 +967,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( BG_CONSISTENCY_CHECK_ENABLED, true ); if (randomize && BUGGIFY) BG_CONSISTENCY_CHECK_ENABLED = false; init( BG_CONSISTENCY_CHECK_TARGET_SPEED_KB, 1000 ); if (randomize && BUGGIFY) BG_CONSISTENCY_CHECK_TARGET_SPEED_KB *= (deterministicRandom()->randomInt(2, 50) / 10); init( BG_KEY_TUPLE_TRUNCATE_OFFSET, 0 ); + init( BG_ENABLE_READ_DRIVEN_COMPACTION, true ); if (randomize && BUGGIFY) BG_ENABLE_READ_DRIVEN_COMPACTION = false; + init( BG_RDC_BYTES_FACTOR, 2 ); if (randomize && BUGGIFY) BG_RDC_BYTES_FACTOR = deterministicRandom()->randomInt(1, 10); + init( BG_RDC_READ_FACTOR, 3 ); if (randomize && BUGGIFY) BG_RDC_READ_FACTOR = deterministicRandom()->randomInt(1, 10); init( BG_ENABLE_MERGING, true ); if (randomize && BUGGIFY) BG_ENABLE_MERGING = false; init( BG_MERGE_CANDIDATE_THRESHOLD_SECONDS, isSimulated ? 
20.0 : 30 * 60 ); if (randomize && BUGGIFY) BG_MERGE_CANDIDATE_THRESHOLD_SECONDS = 5.0; @@ -975,6 +978,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM, 8 ); if( randomize && BUGGIFY ) BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM = 1; init( BLOB_WORKER_RESNAPSHOT_PARALLELISM, 40 ); if( randomize && BUGGIFY ) BLOB_WORKER_RESNAPSHOT_PARALLELISM = deterministicRandom()->randomInt(1, 10); init( BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM, 2000 ); if( randomize && BUGGIFY ) BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM = deterministicRandom()->randomInt(10, 100); + init( BLOB_WORKER_RDC_PARALLELISM, 2 ); if( randomize && BUGGIFY ) BLOB_WORKER_RDC_PARALLELISM = deterministicRandom()->randomInt(1, 6); + init( BLOB_WORKER_TIMEOUT, 10.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_TIMEOUT = 1.0; init( BLOB_WORKER_REQUEST_TIMEOUT, 5.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_REQUEST_TIMEOUT = 1.0; init( BLOB_WORKERLIST_FETCH_INTERVAL, 1.0 ); diff --git a/fdbclient/include/fdbclient/BlobWorkerCommon.h b/fdbclient/include/fdbclient/BlobWorkerCommon.h index b4cbbac2a7..7870bacdb7 100644 --- a/fdbclient/include/fdbclient/BlobWorkerCommon.h +++ b/fdbclient/include/fdbclient/BlobWorkerCommon.h @@ -45,6 +45,7 @@ struct BlobWorkerStats { Counter compressionBytesFinal; Counter fullRejections; Counter forceFlushCleanups; + Counter readDrivenCompactions; int numRangesAssigned; int mutationBytesBuffered; @@ -83,10 +84,11 @@ struct BlobWorkerStats { readRequestsWithBegin("ReadRequestsWithBegin", cc), readRequestsCollapsed("ReadRequestsCollapsed", cc), flushGranuleReqs("FlushGranuleReqs", cc), compressionBytesRaw("CompressionBytesRaw", cc), compressionBytesFinal("CompressionBytesFinal", cc), fullRejections("FullRejections", cc), - forceFlushCleanups("ForceFlushCleanups", cc), numRangesAssigned(0), mutationBytesBuffered(0), - activeReadRequests(0), granulesPendingSplitCheck(0), minimumCFVersion(0), cfVersionLag(0), - notAtLatestChangeFeeds(0), lastResidentMemory(0), estimatedMaxResidentMemory(0), - initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock), deltaWritesLock(deltaWritesLock) { + forceFlushCleanups("ForceFlushCleanups", cc), readDrivenCompactions("ReadDrivenCompactions", cc), + numRangesAssigned(0), mutationBytesBuffered(0), activeReadRequests(0), granulesPendingSplitCheck(0), + minimumCFVersion(0), cfVersionLag(0), notAtLatestChangeFeeds(0), lastResidentMemory(0), + estimatedMaxResidentMemory(0), initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock), + deltaWritesLock(deltaWritesLock) { specialCounter(cc, "NumRangesAssigned", [this]() { return this->numRangesAssigned; }); specialCounter(cc, "MutationBytesBuffered", [this]() { return this->mutationBytesBuffered; }); specialCounter(cc, "ActiveReadRequests", [this]() { return this->activeReadRequests; }); diff --git a/fdbclient/include/fdbclient/ServerKnobs.h b/fdbclient/include/fdbclient/ServerKnobs.h index bfa48e8b09..5a5df49dc8 100644 --- a/fdbclient/include/fdbclient/ServerKnobs.h +++ b/fdbclient/include/fdbclient/ServerKnobs.h @@ -950,10 +950,14 @@ public: int BG_MERGE_CANDIDATE_THRESHOLD_SECONDS; int BG_MERGE_CANDIDATE_DELAY_SECONDS; int BG_KEY_TUPLE_TRUNCATE_OFFSET; + bool BG_ENABLE_READ_DRIVEN_COMPACTION; + int BG_RDC_BYTES_FACTOR; + int BG_RDC_READ_FACTOR; int BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM; int BLOB_WORKER_RESNAPSHOT_PARALLELISM; int BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM; + int BLOB_WORKER_RDC_PARALLELISM; double 
BLOB_WORKER_TIMEOUT; // Blob Manager's reaction time to a blob worker failure double BLOB_WORKER_REQUEST_TIMEOUT; // Blob Worker's server-side request timeout diff --git a/fdbserver/BlobWorker.actor.cpp b/fdbserver/BlobWorker.actor.cpp index 5717472890..cf5b7b1340 100644 --- a/fdbserver/BlobWorker.actor.cpp +++ b/fdbserver/BlobWorker.actor.cpp @@ -84,6 +84,15 @@ struct GranuleStartState { Optional history; }; +// TODO: add more (blob file request cost, in-memory mutations vs blob delta file, etc...) +struct GranuleReadStats { + int64_t deltaBytesRead; + + void reset() { deltaBytesRead = 0; } + + GranuleReadStats() { reset(); } +}; + struct GranuleMetadata : NonCopyable, ReferenceCounted { KeyRange keyRange; @@ -120,11 +129,74 @@ struct GranuleMetadata : NonCopyable, ReferenceCounted { AssignBlobRangeRequest originalReq; + GranuleReadStats readStats; + bool rdcCandidate; + Promise runRDC; + void resume() { if (resumeSnapshot.canBeSet()) { resumeSnapshot.send(Void()); } } + + void resetReadStats() { + rdcCandidate = false; + readStats.reset(); + runRDC.reset(); + } + + // determine eligibility (>1) and priority for re-snapshotting this granule + double weightRDC() { + // ratio of read amp to write amp that would be incurred by re-snapshotting now + int64_t lastSnapshotSize = (files.snapshotFiles.empty()) ? 0 : files.snapshotFiles.back().length; + int64_t minSnapshotSize = SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES / 2; + lastSnapshotSize = std::max(minSnapshotSize, lastSnapshotSize); + + int64_t writeAmp = lastSnapshotSize + bufferedDeltaBytes + bytesInNewDeltaFiles; + // read amp is deltaBytesRead. Read amp must be READ_FACTOR times larger than write amp + return (1.0 * readStats.deltaBytesRead) / (writeAmp * SERVER_KNOBS->BG_RDC_READ_FACTOR); + } + + bool isEligibleRDC() { + // granule should be reasonably read-hot to be eligible + int64_t bytesWritten = bufferedDeltaBytes + bytesInNewDeltaFiles; + return bytesWritten * SERVER_KNOBS->BG_RDC_READ_FACTOR < readStats.deltaBytesRead; + } + + bool updateReadStats(Version readVersion, const BlobGranuleChunkRef& chunk) { + // Only update stats for re-compacting for at-latest reads that have to do snapshot + delta merge + if (!SERVER_KNOBS->BG_ENABLE_READ_DRIVEN_COMPACTION || !chunk.snapshotFile.present() || + pendingSnapshotVersion != durableSnapshotVersion.get() || readVersion <= pendingSnapshotVersion) { + return false; + } + + if (chunk.newDeltas.empty() && chunk.deltaFiles.empty()) { + return false; + } + + readStats.deltaBytesRead += chunk.newDeltas.expectedSize(); + for (auto& it : chunk.deltaFiles) { + readStats.deltaBytesRead += it.length; + } + + if (rdcCandidate) { + return false; + } + + if (isEligibleRDC() && weightRDC() > 1.0) { + rdcCandidate = true; + CODE_PROBE(true, "Granule read triggering read-driven compaction"); + if (BW_DEBUG) { + fmt::print("Triggering read-driven compaction of [{0} - {1})\n", + keyRange.begin.printable(), + keyRange.end.printable()); + } + return true; + } + return false; + } + + inline bool doReadDrivenCompaction() { return runRDC.isSet(); } }; struct GranuleRangeMetadata { @@ -200,6 +272,7 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted { NotifiedVersion grvVersion; Promise fatalError; Promise simInjectFailure; + Promise doReadDrivenCompaction; Reference initialSnapshotLock; Reference resnapshotLock; @@ -293,6 +366,13 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted { return stats.estimatedMaxResidentMemory >= memoryFullThreshold; } + void triggerReadDrivenCompaction() { + Promise doRDC 
= doReadDrivenCompaction; + if (doRDC.canBeSet()) { + doRDC.send(Void()); + } + } + bool maybeInjectTargetedRestart() { // inject a BW restart at most once per test if (g_network->isSimulated() && !g_simulator->speedUpSimulation && @@ -2042,6 +2122,7 @@ ACTOR Future blobGranuleUpdateFiles(Reference bwData, metadata->pendingDeltaVersion = startVersion; metadata->bufferedDeltaVersion = startVersion; metadata->knownCommittedVersion = startVersion; + metadata->resetReadStats(); Reference cfData = makeReference(bwData->db.getPtr()); @@ -2184,6 +2265,10 @@ ACTOR Future blobGranuleUpdateFiles(Reference bwData, } nextForceFlush = metadata->forceFlushVersion.whenAtLeast(lastForceFlushVersion + 1); } + when(wait(metadata->runRDC.getFuture())) { + // return control flow back to the triggering actor before continuing + wait(delay(0)); + } } } catch (Error& e) { // only error we should expect here is when we finish consuming old change feed @@ -2310,6 +2395,7 @@ ACTOR Future blobGranuleUpdateFiles(Reference bwData, startState.granuleID, inFlightFiles.empty() ? Future(Void()) : success(inFlightFiles.back().future)); + metadata->resetReadStats(); } // reset force flush state, requests should retry and add it back once feed is ready forceFlushVersions.clear(); @@ -2418,20 +2504,20 @@ ACTOR Future blobGranuleUpdateFiles(Reference bwData, // The force flush contract is a version cannot be put in forceFlushVersion unless the change feed // is already whenAtLeast that version bool forceFlush = !forceFlushVersions.empty() && forceFlushVersions.back() > metadata->pendingDeltaVersion; + bool doReadDrivenFlush = !metadata->currentDeltas.empty() && metadata->doReadDrivenCompaction(); CODE_PROBE(forceFlush, "Force flushing granule"); - if (metadata->bufferedDeltaBytes >= SERVER_KNOBS->BG_DELTA_FILE_TARGET_BYTES || forceFlush) { + if (metadata->bufferedDeltaBytes >= SERVER_KNOBS->BG_DELTA_FILE_TARGET_BYTES || forceFlush || + doReadDrivenFlush) { TraceEvent(SevDebug, "BlobGranuleDeltaFile", bwData->id) .detail("Granule", metadata->keyRange) .detail("Version", lastDeltaVersion); // sanity check for version order - - if (forceFlush) { + if (forceFlush || doReadDrivenFlush) { if (lastDeltaVersion == invalidVersion) { - lastDeltaVersion = metadata->currentDeltas.empty() ? 
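The section ends as runReadDrivenCompaction begins. The TopKPQ it is built on is a std::priority_queue whose comparator is deliberately reversed, as the comment above notes, so the queue acts as a min-heap: the weakest of the current best k candidates sits on top and is evicted when a heavier one arrives. A self-contained sketch of that top-k idiom with generic names (not the blob worker's actual types):

#include <cstddef>
#include <queue>
#include <vector>

struct Entry {
    double weight;
    int id;
};

// Reversed comparison: greater weight sorts last, so top() is the minimum.
struct MinOnTop {
    bool operator()(const Entry& a, const Entry& b) const { return a.weight > b.weight; }
};

std::vector<Entry> topK(const std::vector<Entry>& in, std::size_t k) {
    std::priority_queue<Entry, std::vector<Entry>, MinOnTop> pq;
    for (const Entry& e : in) {
        if (pq.size() < k) {
            pq.push(e);
        } else if (e.weight > pq.top().weight) {
            pq.pop(); // evict the smallest of the best k seen so far
            pq.push(e);
        }
    }
    std::vector<Entry> out;
    while (!pq.empty()) {
        out.push_back(pq.top());
        pq.pop();
    }
    return out; // ascending by weight
}

For the weights themselves, weightRDC() above makes a granule a candidate once its read amplification outweighs the write amplification of re-snapshotting: with the default BG_RDC_READ_FACTOR of 3, a granule qualifies only after deltaBytesRead exceeds three times the sum of the last snapshot size (floored at half the snapshot target) and the buffered plus newly written delta bytes.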
metadata->pendingDeltaVersion - : metadata->currentDeltas.back().version; + lastDeltaVersion = metadata->bufferedDeltaVersion; } - if (lastDeltaVersion < forceFlushVersions.back()) { + if (!forceFlushVersions.empty() && lastDeltaVersion < forceFlushVersions.back()) { if (BW_DEBUG) { fmt::print("Granule [{0} - {1}) force flushing delta version {2} -> {3}\n", metadata->keyRange.begin.printable(), @@ -2443,13 +2529,6 @@ ACTOR Future blobGranuleUpdateFiles(Reference bwData, } } if (!metadata->currentDeltas.empty()) { - if (lastDeltaVersion < metadata->currentDeltas.back().version) { - fmt::print("Granule [{0} - {1}) LDV {2} < DeltaBack {3}\n", - metadata->keyRange.begin.printable(), - metadata->keyRange.end.printable(), - lastDeltaVersion, - metadata->currentDeltas.back().version); - } ASSERT(lastDeltaVersion >= metadata->currentDeltas.back().version); ASSERT(metadata->pendingDeltaVersion < metadata->currentDeltas.front().version); } else { @@ -2506,6 +2585,7 @@ ACTOR Future blobGranuleUpdateFiles(Reference bwData, // add new pending delta file ASSERT(metadata->pendingDeltaVersion < lastDeltaVersion); metadata->pendingDeltaVersion = lastDeltaVersion; + ASSERT(metadata->bufferedDeltaVersion <= lastDeltaVersion); metadata->bufferedDeltaVersion = lastDeltaVersion; // In case flush was forced at non-mutation version metadata->bytesInNewDeltaFiles += metadata->bufferedDeltaBytes; @@ -2527,6 +2607,9 @@ ACTOR Future blobGranuleUpdateFiles(Reference bwData, // Wait on delta file starting here. If we have too many pending delta file writes, we need to not // continue to consume from the change feed, as that will pile on even more delta files to write wait(startDeltaFileWrite); + } else if (metadata->doReadDrivenCompaction()) { + ASSERT(metadata->currentDeltas.empty()); + snapshotEligible = true; } // FIXME: if we're still reading from old change feed, we should probably compact if we're @@ -2534,7 +2617,8 @@ ACTOR Future blobGranuleUpdateFiles(Reference bwData, // yet // If we have enough delta files, try to re-snapshot - if (snapshotEligible && metadata->bytesInNewDeltaFiles >= SERVER_KNOBS->BG_DELTA_BYTES_BEFORE_COMPACT) { + if (snapshotEligible && (metadata->doReadDrivenCompaction() || + metadata->bytesInNewDeltaFiles >= SERVER_KNOBS->BG_DELTA_BYTES_BEFORE_COMPACT)) { if (BW_DEBUG && !inFlightFiles.empty()) { fmt::print("Granule [{0} - {1}) ready to re-snapshot at {2} after {3} > {4} bytes, " "waiting for " @@ -2582,6 +2666,7 @@ ACTOR Future blobGranuleUpdateFiles(Reference bwData, // reset metadata metadata->bytesInNewDeltaFiles = 0; + metadata->resetReadStats(); // If we have more than one snapshot file and that file is unblocked (committedVersion >= // snapshotVersion), wait for it to finish @@ -3739,6 +3824,11 @@ ACTOR Future doBlobGranuleFileRequest(Reference bwData, Bl } } } + + // don't update read stats on a summarize read + if (metadata->updateReadStats(req.readVersion, chunk)) { + bwData->triggerReadDrivenCompaction(); + } } rep.chunks.push_back(rep.arena, chunk); @@ -4553,6 +4643,74 @@ ACTOR Future runGRVChecks(Reference bwData) { } } +struct RDCEntry { + double weight; + Reference granule; + RDCEntry(double weight, Reference granule) : weight(weight), granule(granule) {} +}; + +// for a top-k algorithm, we actually want a min-heap, so reverse the sort order +struct OrderForTopK { + bool operator()(RDCEntry const& a, RDCEntry const& b) const { return b.weight - a.weight; } +}; + +typedef std::priority_queue, OrderForTopK> TopKPQ; + +ACTOR Future runReadDrivenCompaction(Reference bwData) { 
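+    // The scan below keeps the BLOB_WORKER_RDC_PARALLELISM heaviest candidates in
+    // the min-heap: a granule qualifies once weightRDC() > 1.0, i.e. its delta
+    // bytes read exceed BG_RDC_READ_FACTOR times the bytes a re-snapshot would
+    // rewrite, and the lightest retained entry sits at topK.top() where it can be
+    // evicted when a heavier candidate turns up.
+    // Note: OrderForTopK::operator() above returns the double b.weight - a.weight
+    // implicitly converted to bool, which is true whenever the two weights differ
+    // and so is not a strict weak ordering; a conventional min-heap comparator for
+    // std::priority_queue would be
+    //     bool operator()(RDCEntry const& a, RDCEntry const& b) const { return a.weight > b.weight; }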
+    state bool processedAll = true;
+    loop {
+        if (processedAll) {
+            wait(bwData->doReadDrivenCompaction.getFuture());
+            bwData->doReadDrivenCompaction.reset();
+            wait(delay(0));
+        }
+
+        TopKPQ topK;
+
+        // FIXME: possible to scan candidates instead of all granules?
+        int candidates = 0;
+        auto allRanges = bwData->granuleMetadata.intersectingRanges(normalKeys);
+        for (auto& it : allRanges) {
+            if (it.value().activeMetadata.isValid() && it.value().activeMetadata->cancelled.canBeSet()) {
+                auto metadata = it.value().activeMetadata;
+                if (metadata->rdcCandidate && metadata->isEligibleRDC() && metadata->runRDC.canBeSet() &&
+                    metadata->pendingSnapshotVersion == metadata->durableSnapshotVersion.get()) {
+                    candidates++;
+                    double weight = metadata->weightRDC();
+                    if (weight > 1.0 &&
+                        (topK.size() < SERVER_KNOBS->BLOB_WORKER_RDC_PARALLELISM || weight > topK.top().weight)) {
+                        if (topK.size() == SERVER_KNOBS->BLOB_WORKER_RDC_PARALLELISM) {
+                            topK.pop();
+                        }
+                        topK.push(RDCEntry(weight, metadata));
+                    }
+                }
+            }
+        }
+
+        CODE_PROBE(candidates > topK.size(), "Too many read-driven compaction candidates for one cycle");
+
+        std::vector<Future<Void>> futures;
+        futures.reserve(topK.size());
+        while (!topK.empty()) {
+            ++bwData->stats.readDrivenCompactions;
+            Promise<Void> runRDC = topK.top().granule->runRDC;
+            ASSERT(runRDC.canBeSet());
+            Future<Void> waitForSnapshotComplete = topK.top().granule->durableSnapshotVersion.whenAtLeast(
+                                                       topK.top().granule->durableSnapshotVersion.get() + 1) ||
+                                                   topK.top().granule->cancelled.getFuture();
+            futures.push_back(waitForSnapshotComplete);
+            topK.pop();
+            runRDC.send(Void());
+        }
+        processedAll = futures.empty();
+        if (!futures.empty()) {
+            // wait at least one second to throttle this actor a bit
+            wait(waitForAll(futures) && delay(1.0));
+        }
+    }
+}
+
 // FIXME: better way to do this?
 // monitor system keyspace for new tenants
 ACTOR Future<Void> monitorTenants(Reference<BlobWorkerData> bwData) {
@@ -4890,6 +5048,7 @@ ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf,
     self->addActor.send(waitFailureServer(bwInterf.waitFailure.getFuture()));
     self->addActor.send(runGRVChecks(self));
     self->addActor.send(monitorTenants(self));
+    self->addActor.send(runReadDrivenCompaction(self));
     state Future<Void> selfRemoved = monitorRemoval(self);
     if (g_network->isSimulated() && BUGGIFY_WITH_PROB(0.25)) {
         self->addActor.send(simForceFileWriteContention(self));
diff --git a/fdbserver/workloads/BlobGranuleVerifier.actor.cpp b/fdbserver/workloads/BlobGranuleVerifier.actor.cpp
index 3b87852af9..c904649f56 100644
--- a/fdbserver/workloads/BlobGranuleVerifier.actor.cpp
+++ b/fdbserver/workloads/BlobGranuleVerifier.actor.cpp
@@ -305,6 +305,8 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
     state Version prevPurgeVersion = -1;
     state UID dbgId = debugRandom()->randomUniqueID();
     state Version newPurgeVersion = 0;
+    // usually we want randomness to verify maximum data, but sometimes hotspotting a subset is good too
+    state bool pickGranuleUniform = deterministicRandom()->random01() < 0.1;
     TraceEvent("BlobGranuleVerifierStart");
     if (BGV_DEBUG) {
@@ -458,7 +460,13 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
     }
     // pick a random range
-    int rIndex = deterministicRandom()->randomInt(0, self->granuleRanges.get().size());
+    size_t granuleCount = self->granuleRanges.get().size();
+    size_t rIndex;
+    if (pickGranuleUniform) {
+        rIndex = deterministicRandom()->randomInt(0, granuleCount);
+    } else {
+        rIndex = deterministicRandom()->randomSkewedUInt32(0, granuleCount);
+    }
     state KeyRange range = self->granuleRanges.get()[rIndex];
     state std::pair<RangeResult, Version> fdb = wait(readFromFDB(cx, range));

From 6f37f55917b463bb028a7d08a63ea497d9bda58d Mon Sep 17 00:00:00 2001
From: Nim Wijetunga
Date: Wed, 26 Oct 2022 09:38:27 -0700
Subject: [PATCH 49/52] Restore System Keys First in Backup/Restore Workloads
 (#8475)

* system key restore ordering
* restore system keys before regular data
* atomic restore backup fix
* change testing
* fix compile error
* fix compile issue
* fix compile issues
* Trigger Build
* only split restore if encryption is enabled
* revert knob changes
* Update fdbserver/workloads/AtomicSwitchover.actor.cpp
Co-authored-by: A.J. Beamon
* Update fdbserver/workloads/AtomicSwitchover.actor.cpp
Co-authored-by: A.J. Beamon
* Update fdbserver/workloads/BackupCorrectness.actor.cpp
Co-authored-by: A.J. Beamon
* Update fdbserver/workloads/AtomicRestore.actor.cpp
Co-authored-by: A.J. Beamon
* add todo
* strengthen check
* separate system restore for atomic restore
* address pr comments
* address pr comments

Co-authored-by: A.J.
Beamon --- fdbbackup/backup.actor.cpp | 1 + fdbclient/FileBackupAgent.actor.cpp | 208 +++++++++++++----- .../include/fdbclient/BackupAgent.actor.h | 1 + .../fdbclient/TenantEntryCache.actor.h | 4 + fdbserver/workloads/AtomicRestore.actor.cpp | 5 +- ...kupAndParallelRestoreCorrectness.actor.cpp | 9 +- .../workloads/BackupCorrectness.actor.cpp | 76 ++++++- fdbserver/workloads/BackupToBlob.actor.cpp | 5 +- .../workloads/BackupToDBCorrectness.actor.cpp | 44 +++- .../workloads/IncrementalBackup.actor.cpp | 44 +++- fdbserver/workloads/RestoreBackup.actor.cpp | 46 +++- fdbserver/workloads/RestoreFromBlob.actor.cpp | 21 +- fdbserver/workloads/SubmitBackup.actor.cpp | 7 +- tests/fast/EncryptedBackupCorrectness.toml | 1 - 14 files changed, 391 insertions(+), 81 deletions(-) diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index a55a6f83df..64233b8e74 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -2365,6 +2365,7 @@ ACTOR Future runRestore(Database db, KeyRef(addPrefix), KeyRef(removePrefix), LockDB::True, + UnlockDB::True, onlyApplyMutationLogs, inconsistentSnapshotOnly, beginVersion, diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp index 52bb607d8d..137f401df0 100644 --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -167,6 +167,7 @@ public: KeyBackedProperty removePrefix() { return configSpace.pack(__FUNCTION__sr); } KeyBackedProperty onlyApplyMutationLogs() { return configSpace.pack(__FUNCTION__sr); } KeyBackedProperty inconsistentSnapshotOnly() { return configSpace.pack(__FUNCTION__sr); } + KeyBackedProperty unlockDBAfterRestore() { return configSpace.pack(__FUNCTION__sr); } // XXX: Remove restoreRange() once it is safe to remove. It has been changed to restoreRanges KeyBackedProperty restoreRange() { return configSpace.pack(__FUNCTION__sr); } KeyBackedProperty> restoreRanges() { return configSpace.pack(__FUNCTION__sr); } @@ -591,12 +592,11 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter { } ACTOR static Future decryptImpl(Database cx, - StringRef headerS, + BlobCipherEncryptHeader header, const uint8_t* dataP, int64_t dataLen, Arena* arena) { Reference const> dbInfo = cx->clientInfo; - state BlobCipherEncryptHeader header = BlobCipherEncryptHeader::fromStringRef(headerS); TextAndHeaderCipherKeys cipherKeys = wait(getEncryptCipherKeys(dbInfo, header, BlobCipherMetrics::BACKUP)); ASSERT(cipherKeys.cipherHeaderKey.isValid() && cipherKeys.cipherTextKey.isValid()); validateEncryptionHeader(cipherKeys.cipherHeaderKey, cipherKeys.cipherTextKey, header); @@ -606,7 +606,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter { } static Future decrypt(Database cx, - StringRef headerS, + BlobCipherEncryptHeader headerS, const uint8_t* dataP, int64_t dataLen, Arena* arena) { @@ -651,7 +651,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter { } ACTOR static Future updateEncryptionKeysCtx(EncryptedRangeFileWriter* self, KeyRef key) { - state std::pair curTenantInfo = wait(getEncryptionDomainDetails(key, self)); + state std::pair curTenantInfo = wait(getEncryptionDomainDetails(key, self->tenantCache)); state Reference const> dbInfo = self->cx->clientInfo; // Get text and header cipher key @@ -693,12 +693,13 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter { static bool isSystemKey(KeyRef key) { return key.size() && key[0] == systemKeys.begin[0]; } - ACTOR static Future> - getEncryptionDomainDetailsImpl(KeyRef key, Reference> tenantCache, bool 
useTenantCache) { + ACTOR static Future> getEncryptionDomainDetailsImpl( + KeyRef key, + Reference> tenantCache) { if (isSystemKey(key)) { return std::make_pair(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME); } - if (key.size() < TENANT_PREFIX_SIZE || !useTenantCache) { + if (key.size() < TENANT_PREFIX_SIZE) { return std::make_pair(FDB_DEFAULT_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME); } KeyRef tenantPrefix = KeyRef(key.begin(), TENANT_PREFIX_SIZE); @@ -710,21 +711,10 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter { return std::make_pair(FDB_DEFAULT_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME); } - static Future> getEncryptionDomainDetails(KeyRef key, - EncryptedRangeFileWriter* self) { - // If tenants are disabled on a cluster then don't use the TenantEntryCache as it will result in alot of - // unnecessary cache misses. For a cluster configured in TenantMode::Optional, the backup performance may - // degrade if most of the mutations belong to an invalid tenant - TenantMode mode = self->cx->clientInfo->get().tenantMode; - bool useTenantCache = mode != TenantMode::DISABLED; - if (g_network->isSimulated() && mode == TenantMode::OPTIONAL_TENANT) { - // TODO: Currently simulation tests run with optional tenant mode but most data does not belong to any - // tenant. This results in many timeouts so disable using the tenant cache until optional tenant mode - // support with backups is more performant - useTenantCache = false; - } - CODE_PROBE(useTenantCache, "using tenant cache"); - return getEncryptionDomainDetailsImpl(key, self->tenantCache, useTenantCache); + static Future> getEncryptionDomainDetails( + KeyRef key, + Reference> tenantCache) { + return getEncryptionDomainDetailsImpl(key, tenantCache); } // Handles the first block and internal blocks. Ends current block if needed. @@ -816,6 +806,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter { curKeyTenantInfo.first != FDB_DEFAULT_ENCRYPT_DOMAIN_ID) { endKey = StringRef(k.begin(), TENANT_PREFIX_SIZE); } + state ValueRef newValue = StringRef(); self->lastKey = k; self->lastValue = v; @@ -834,9 +825,9 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter { if (self->lastKey.size() == 0 || k.size() == 0) { return false; } - state std::pair curKeyTenantInfo = wait(getEncryptionDomainDetails(k, self)); - state std::pair prevKeyTenantInfo = wait(getEncryptionDomainDetails(self->lastKey, self)); - // crossing tenant boundaries so finish the current block using only the tenant prefix of the new key + state std::pair curKeyTenantInfo = wait(getEncryptionDomainDetails(k, self->tenantCache)); + state std::pair prevKeyTenantInfo = + wait(getEncryptionDomainDetails(self->lastKey, self->tenantCache)); if (curKeyTenantInfo.first != prevKeyTenantInfo.first) { CODE_PROBE(true, "crossed tenant boundaries"); wait(handleTenantBondary(self, k, v, writeValue, curKeyTenantInfo)); @@ -1040,11 +1031,18 @@ private: Key lastValue; }; -void decodeKVPairs(StringRefReader* reader, Standalone>* results) { +ACTOR static Future decodeKVPairs(StringRefReader* reader, + Standalone>* results, + bool encryptedBlock, + Optional>> tenantCache, + Optional encryptHeader) { // Read begin key, if this fails then block was invalid. 
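 // (Block layout, for orientation: a begin key, then key/value pairs, then an end
 // key, each prefixed by a 32-bit network-order length; any unused tail of the
 // block is 0xFF padding, validated at the end of this function.)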
- uint32_t kLen = reader->consumeNetworkUInt32(); - const uint8_t* k = reader->consume(kLen); + state uint32_t kLen = reader->consumeNetworkUInt32(); + state const uint8_t* k = reader->consume(kLen); results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef())); + state KeyRef prevKey = KeyRef(k, kLen); + state bool done = false; + state Optional> prevTenantInfo; // Read kv pairs and end key while (1) { @@ -1052,6 +1050,35 @@ void decodeKVPairs(StringRefReader* reader, Standalone>* kLen = reader->consumeNetworkUInt32(); k = reader->consume(kLen); + // make sure that all keys in a block belong to exactly one tenant, + // unless its the last key in which case it can be a truncated (different) tenant prefix + if (encryptedBlock && g_network && g_network->isSimulated()) { + ASSERT(tenantCache.present()); + ASSERT(encryptHeader.present()); + state KeyRef curKey = KeyRef(k, kLen); + if (!prevTenantInfo.present()) { + std::pair tenantInfo = + wait(EncryptedRangeFileWriter::getEncryptionDomainDetails(prevKey, tenantCache.get())); + prevTenantInfo = tenantInfo; + } + std::pair curTenantInfo = + wait(EncryptedRangeFileWriter::getEncryptionDomainDetails(curKey, tenantCache.get())); + if (!curKey.empty() && !prevKey.empty() && prevTenantInfo.get().first != curTenantInfo.first) { + ASSERT(!done); + if (curTenantInfo.first != SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID && + curTenantInfo.first != FDB_DEFAULT_ENCRYPT_DOMAIN_ID) { + ASSERT(curKey.size() == TENANT_PREFIX_SIZE); + } + done = true; + } + // make sure that all keys (except possibly the last key) in a block are encrypted using the correct key + if (!prevKey.empty()) { + ASSERT(prevTenantInfo.get().first == encryptHeader.get().cipherTextDetails.encryptDomainId); + } + prevKey = curKey; + prevTenantInfo = curTenantInfo; + } + // If eof reached or first value len byte is 0xFF then a valid block end was reached. 
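 // (0xFF can double as the end sentinel because lengths are written in network
 // byte order: a real length would need to be at least 0xFF000000 bytes, roughly
 // 4 GiB, for its first byte to be 0xFF, far larger than anything a snapshot
 // block holds.)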
if (reader->eof() || *reader->rptr == 0xFF) { results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef())); @@ -1072,6 +1099,8 @@ void decodeKVPairs(StringRefReader* reader, Standalone>* for (auto b : reader->remainder()) if (b != 0xFF) throw restore_corrupted_data_padding(); + + return Void(); } ACTOR Future>> decodeRangeFileBlock(Reference file, @@ -1094,7 +1123,11 @@ ACTOR Future>> decodeRangeFileBlock(Reference< // BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION int32_t file_version = reader.consume(); if (file_version == BACKUP_AGENT_SNAPSHOT_FILE_VERSION) { - decodeKVPairs(&reader, &results); + wait(decodeKVPairs(&reader, + &results, + false, + Optional>>(), + Optional())); } else if (file_version == BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) { CODE_PROBE(true, "decoding encrypted block"); ASSERT(cx.present()); @@ -1108,7 +1141,8 @@ ACTOR Future>> decodeRangeFileBlock(Reference< // read encryption header const uint8_t* headerStart = reader.consume(BlobCipherEncryptHeader::headerSize); - StringRef header = StringRef(headerStart, BlobCipherEncryptHeader::headerSize); + StringRef headerS = StringRef(headerStart, BlobCipherEncryptHeader::headerSize); + state BlobCipherEncryptHeader header = BlobCipherEncryptHeader::fromStringRef(headerS); const uint8_t* dataPayloadStart = headerStart + BlobCipherEncryptHeader::headerSize; // calculate the total bytes read up to (and including) the header int64_t bytesRead = sizeof(int32_t) + sizeof(uint32_t) + optionsLen + BlobCipherEncryptHeader::headerSize; @@ -1117,7 +1151,12 @@ ACTOR Future>> decodeRangeFileBlock(Reference< StringRef decryptedData = wait(EncryptedRangeFileWriter::decrypt(cx.get(), header, dataPayloadStart, dataLen, &results.arena())); reader = StringRefReader(decryptedData, restore_corrupted_data()); - decodeKVPairs(&reader, &results); + state Optional>> tenantCache; + if (g_network && g_simulator->isSimulated()) { + tenantCache = makeReference>(cx.get(), TenantEntryCacheRefreshMode::WATCH); + wait(tenantCache.get()->init()); + } + wait(decodeKVPairs(&reader, &results, true, tenantCache, header)); } else { throw restore_unsupported_file_version(); } @@ -3398,6 +3437,8 @@ struct RestoreCompleteTaskFunc : RestoreTaskFuncBase { state RestoreConfig restore(task); restore.stateEnum().set(tr, ERestoreState::COMPLETED); + state bool unlockDB = wait(restore.unlockDBAfterRestore().getD(tr, Snapshot::False, true)); + tr->atomicOp(metadataVersionKey, metadataVersionRequiredValue, MutationRef::SetVersionstampedValue); // Clear the file map now since it could be huge. 
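 // (The final unlockDatabase() below is now gated on unlockDBAfterRestore, read
 // above with getD() defaulting to true, so restores submitted without the option
 // still unlock at completion; the system-keys restore pass elsewhere in this
 // patch passes UnlockDB::False so the database stays locked until the user
 // ranges are restored.)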
restore.fileSet().clear(tr); @@ -3413,7 +3454,9 @@ struct RestoreCompleteTaskFunc : RestoreTaskFuncBase { restore.clearApplyMutationsKeys(tr); wait(taskBucket->finish(tr, task)); - wait(unlockDatabase(tr, restore.getUid())); + if (unlockDB) { + wait(unlockDatabase(tr, restore.getUid())); + } return Void(); } @@ -5172,6 +5215,7 @@ public: Key addPrefix, Key removePrefix, LockDB lockDB, + UnlockDB unlockDB, OnlyApplyMutationLogs onlyApplyMutationLogs, InconsistentSnapshotOnly inconsistentSnapshotOnly, Version beginVersion, @@ -5245,6 +5289,7 @@ public: restore.onlyApplyMutationLogs().set(tr, onlyApplyMutationLogs); restore.inconsistentSnapshotOnly().set(tr, inconsistentSnapshotOnly); restore.beginVersion().set(tr, beginVersion); + restore.unlockDBAfterRestore().set(tr, unlockDB); if (BUGGIFY && restoreRanges.size() == 1) { restore.restoreRange().set(tr, restoreRanges[0]); } else { @@ -5836,6 +5881,7 @@ public: Key addPrefix, Key removePrefix, LockDB lockDB, + UnlockDB unlockDB, OnlyApplyMutationLogs onlyApplyMutationLogs, InconsistentSnapshotOnly inconsistentSnapshotOnly, Version beginVersion, @@ -5892,6 +5938,7 @@ public: addPrefix, removePrefix, lockDB, + unlockDB, onlyApplyMutationLogs, inconsistentSnapshotOnly, beginVersion, @@ -6017,7 +6064,7 @@ public: } } - Reference bc = wait(backupConfig.backupContainer().getOrThrow(cx.getReference())); + state Reference bc = wait(backupConfig.backupContainer().getOrThrow(cx.getReference())); if (fastRestore) { TraceEvent("AtomicParallelRestoreStartRestore").log(); @@ -6043,24 +6090,80 @@ public: return -1; } else { TraceEvent("AS_StartRestore").log(); - Version ver = wait(restore(backupAgent, - cx, - cx, - tagName, - KeyRef(bc->getURL()), - bc->getProxy(), - ranges, - WaitForComplete::True, - ::invalidVersion, - Verbose::True, - addPrefix, - removePrefix, - LockDB::True, - OnlyApplyMutationLogs::False, - InconsistentSnapshotOnly::False, - ::invalidVersion, - {}, - randomUid)); + state Standalone> restoreRange; + state Standalone> systemRestoreRange; + bool encryptionEnabled = cx->clientInfo->get().isEncryptionEnabled; + for (auto r : ranges) { + if (!encryptionEnabled || !r.intersects(getSystemBackupRanges())) { + restoreRange.push_back_deep(restoreRange.arena(), r); + } else { + KeyRangeRef normalKeyRange = r & normalKeys; + KeyRangeRef systemKeyRange = r & systemKeys; + if (!normalKeyRange.empty()) { + restoreRange.push_back_deep(restoreRange.arena(), normalKeyRange); + } + if (!systemKeyRange.empty()) { + systemRestoreRange.push_back_deep(systemRestoreRange.arena(), systemKeyRange); + } + } + } + if (!systemRestoreRange.empty()) { + // restore system keys + wait(success(restore(backupAgent, + cx, + cx, + "system_restore"_sr, + KeyRef(bc->getURL()), + bc->getProxy(), + systemRestoreRange, + WaitForComplete::True, + ::invalidVersion, + Verbose::True, + addPrefix, + removePrefix, + LockDB::True, + UnlockDB::False, + OnlyApplyMutationLogs::False, + InconsistentSnapshotOnly::False, + ::invalidVersion, + {}, + randomUid))); + state Reference rywTransaction = + Reference(new ReadYourWritesTransaction(cx)); + // clear old restore config associated with system keys + loop { + try { + rywTransaction->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + rywTransaction->setOption(FDBTransactionOptions::LOCK_AWARE); + state RestoreConfig oldRestore(randomUid); + oldRestore.clear(rywTransaction); + wait(rywTransaction->commit()); + break; + } catch (Error& e) { + wait(rywTransaction->onError(e)); + } + } + } + // restore user data + state Version ver = 
wait(restore(backupAgent, + cx, + cx, + tagName, + KeyRef(bc->getURL()), + bc->getProxy(), + restoreRange, + WaitForComplete::True, + ::invalidVersion, + Verbose::True, + addPrefix, + removePrefix, + LockDB::True, + UnlockDB::True, + OnlyApplyMutationLogs::False, + InconsistentSnapshotOnly::False, + ::invalidVersion, + {}, + randomUid)); return ver; } } @@ -6120,6 +6223,7 @@ Future FileBackupAgent::restore(Database cx, Key addPrefix, Key removePrefix, LockDB lockDB, + UnlockDB unlockDB, OnlyApplyMutationLogs onlyApplyMutationLogs, InconsistentSnapshotOnly inconsistentSnapshotOnly, Version beginVersion, @@ -6137,6 +6241,7 @@ Future FileBackupAgent::restore(Database cx, addPrefix, removePrefix, lockDB, + unlockDB, onlyApplyMutationLogs, inconsistentSnapshotOnly, beginVersion, @@ -6178,6 +6283,7 @@ Future FileBackupAgent::restore(Database cx, addPrefix, removePrefix, lockDB, + UnlockDB::True, onlyApplyMutationLogs, inconsistentSnapshotOnly, beginVersion, diff --git a/fdbclient/include/fdbclient/BackupAgent.actor.h b/fdbclient/include/fdbclient/BackupAgent.actor.h index 314f151fd0..de1f5a06f4 100644 --- a/fdbclient/include/fdbclient/BackupAgent.actor.h +++ b/fdbclient/include/fdbclient/BackupAgent.actor.h @@ -196,6 +196,7 @@ public: Key addPrefix = Key(), Key removePrefix = Key(), LockDB = LockDB::True, + UnlockDB = UnlockDB::True, OnlyApplyMutationLogs = OnlyApplyMutationLogs::False, InconsistentSnapshotOnly = InconsistentSnapshotOnly::False, Version beginVersion = ::invalidVersion, diff --git a/fdbclient/include/fdbclient/TenantEntryCache.actor.h b/fdbclient/include/fdbclient/TenantEntryCache.actor.h index 4ff5438d5a..0429494714 100644 --- a/fdbclient/include/fdbclient/TenantEntryCache.actor.h +++ b/fdbclient/include/fdbclient/TenantEntryCache.actor.h @@ -68,6 +68,10 @@ using TenantEntryCachePayloadFunc = std::function(con // 1. Lookup by 'TenantId' // 2. Lookup by 'TenantPrefix' // 3. Lookup by 'TenantName' +// TODO: Currently this cache performs poorly if there are tenant access happening to unknown tenants which happens most +// frequently in optional tenant mode but can also happen in required mode if there are alot of tenants created. Further +// as a consequence of the design we cannot be sure that the state of a given tenant is accurate even if its present in +// the cache. template class TenantEntryCache : public ReferenceCounted>, NonCopyable { diff --git a/fdbserver/workloads/AtomicRestore.actor.cpp b/fdbserver/workloads/AtomicRestore.actor.cpp index 2acc8b092d..6f90f900d7 100644 --- a/fdbserver/workloads/AtomicRestore.actor.cpp +++ b/fdbserver/workloads/AtomicRestore.actor.cpp @@ -18,6 +18,7 @@ * limitations under the License. 
*/ +#include "fdbclient/ManagementAPI.actor.h" #include "fdbrpc/simulator.h" #include "fdbclient/BackupAgent.actor.h" #include "fdbserver/Knobs.h" @@ -95,6 +96,7 @@ struct AtomicRestoreWorkload : TestWorkload { TraceEvent("AtomicRestore_Start").detail("UsePartitionedLog", self->usePartitionedLogs); state std::string backupContainer = "file://simfdb/backups/"; + state DatabaseConfiguration conf = wait(getDatabaseConfiguration(cx)); try { wait(backupAgent.submitBackup(cx, StringRef(backupContainer), @@ -103,7 +105,8 @@ struct AtomicRestoreWorkload : TestWorkload { deterministicRandom()->randomInt(0, 100), BackupAgentBase::getDefaultTagName(), self->backupRanges, - SERVER_KNOBS->ENABLE_ENCRYPTION, + SERVER_KNOBS->ENABLE_ENCRYPTION && + conf.tenantMode != TenantMode::OPTIONAL_TENANT, StopWhenDone::False, self->usePartitionedLogs)); } catch (Error& e) { diff --git a/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp b/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp index 0feee4f55c..4f8dbc23bb 100644 --- a/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp +++ b/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp @@ -215,7 +215,7 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload { state std::string backupContainer = "file://simfdb/backups/"; state Future status = statusLoop(cx, tag.toString()); - + state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx)); try { wait(backupAgent->submitBackup(cx, StringRef(backupContainer), @@ -224,7 +224,8 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload { deterministicRandom()->randomInt(0, 100), tag.toString(), backupRanges, - SERVER_KNOBS->ENABLE_ENCRYPTION, + SERVER_KNOBS->ENABLE_ENCRYPTION && + configuration.tenantMode != TenantMode::OPTIONAL_TENANT, StopWhenDone{ !stopDifferentialDelay }, self->usePartitionedLogs)); } catch (Error& e) { @@ -474,6 +475,7 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload { // Occasionally start yet another backup that might still be running when we restore if (!self->locked && BUGGIFY) { TraceEvent("BARW_SubmitBackup2", randomID).detail("Tag", printable(self->backupTag)); + state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx)); try { // Note the "partitionedLog" must be false, because we change // the configuration to disable backup workers before restore. @@ -484,7 +486,8 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload { deterministicRandom()->randomInt(0, 100), self->backupTag.toString(), self->backupRanges, - SERVER_KNOBS->ENABLE_ENCRYPTION, + SERVER_KNOBS->ENABLE_ENCRYPTION && + configuration.tenantMode != TenantMode::OPTIONAL_TENANT, StopWhenDone::True, UsePartitionedLog::False); } catch (Error& e) { diff --git a/fdbserver/workloads/BackupCorrectness.actor.cpp b/fdbserver/workloads/BackupCorrectness.actor.cpp index 7bd309457e..7e65e60e10 100644 --- a/fdbserver/workloads/BackupCorrectness.actor.cpp +++ b/fdbserver/workloads/BackupCorrectness.actor.cpp @@ -18,6 +18,7 @@ * limitations under the License. 
*/ +#include "fdbclient/ManagementAPI.actor.h" #include "fdbclient/ReadYourWrites.h" #include "fdbrpc/simulator.h" #include "fdbclient/BackupAgent.actor.h" @@ -331,7 +332,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { state std::string backupContainer = "file://simfdb/backups/"; state Future status = statusLoop(cx, tag.toString()); - + state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx)); try { wait(backupAgent->submitBackup(cx, StringRef(backupContainer), @@ -340,7 +341,8 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { deterministicRandom()->randomInt(0, 2000), tag.toString(), backupRanges, - SERVER_KNOBS->ENABLE_ENCRYPTION, + SERVER_KNOBS->ENABLE_ENCRYPTION && + configuration.tenantMode != TenantMode::OPTIONAL_TENANT, StopWhenDone{ !stopDifferentialDelay }, UsePartitionedLog::False, IncrementalBackupOnly::False, @@ -515,6 +517,42 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { return Void(); } + ACTOR static Future clearAndRestoreSystemKeys(Database cx, + BackupAndRestoreCorrectnessWorkload* self, + FileBackupAgent* backupAgent, + Version targetVersion, + Reference lastBackupContainer, + Standalone> systemRestoreRanges) { + // restore system keys before restoring any other ranges + wait(runRYWTransaction(cx, [=](Reference tr) -> Future { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + for (auto& range : systemRestoreRanges) + tr->clear(range); + return Void(); + })); + state Standalone restoreTag(self->backupTag.toString() + "_system"); + printf("BackupCorrectness, backupAgent.restore is called for tag:%s\n", restoreTag.toString().c_str()); + wait(success(backupAgent->restore(cx, + cx, + restoreTag, + KeyRef(lastBackupContainer->getURL()), + lastBackupContainer->getProxy(), + systemRestoreRanges, + WaitForComplete::True, + targetVersion, + Verbose::True, + Key(), + Key(), + self->locked, + UnlockDB::True, + OnlyApplyMutationLogs::False, + InconsistentSnapshotOnly::False, + ::invalidVersion, + self->encryptionKeyFileName))); + printf("BackupCorrectness, backupAgent.restore finished for tag:%s\n", restoreTag.toString().c_str()); + return Void(); + } + ACTOR static Future _start(Database cx, BackupAndRestoreCorrectnessWorkload* self) { state FileBackupAgent backupAgent; state Future extraBackup; @@ -593,6 +631,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { // Occasionally start yet another backup that might still be running when we restore if (!self->locked && BUGGIFY) { TraceEvent("BARW_SubmitBackup2", randomID).detail("Tag", printable(self->backupTag)); + state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx)); try { extraBackup = backupAgent.submitBackup(cx, "file://simfdb/backups/"_sr, @@ -601,7 +640,8 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { deterministicRandom()->randomInt(0, 100), self->backupTag.toString(), self->backupRanges, - SERVER_KNOBS->ENABLE_ENCRYPTION, + SERVER_KNOBS->ENABLE_ENCRYPTION && + configuration.tenantMode != TenantMode::OPTIONAL_TENANT, StopWhenDone::True); } catch (Error& e) { TraceEvent("BARW_SubmitBackup2Exception", randomID) @@ -638,7 +678,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { lastBackupContainer->getEncryptionKeyFileName()); BackupDescription desc = wait(container->describeBackup()); - Version targetVersion = -1; + state Version targetVersion = -1; if (desc.maxRestorableVersion.present()) { if (deterministicRandom()->random01() < 0.1) { targetVersion = desc.minRestorableVersion.get(); 
@@ -656,6 +696,32 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { state std::vector> restoreTags; state bool multipleRangesInOneTag = false; state int restoreIndex = 0; + // make sure system keys are not present in the restoreRanges as they will get restored first separately + // from the rest + Standalone> modifiedRestoreRanges; + Standalone> systemRestoreRanges; + for (int i = 0; i < self->restoreRanges.size(); ++i) { + if (!SERVER_KNOBS->ENABLE_ENCRYPTION || + !self->restoreRanges[i].intersects(getSystemBackupRanges())) { + modifiedRestoreRanges.push_back_deep(modifiedRestoreRanges.arena(), self->restoreRanges[i]); + } else { + KeyRangeRef normalKeyRange = self->restoreRanges[i] & normalKeys; + KeyRangeRef systemKeyRange = self->restoreRanges[i] & systemKeys; + if (!normalKeyRange.empty()) { + modifiedRestoreRanges.push_back_deep(modifiedRestoreRanges.arena(), normalKeyRange); + } + if (!systemKeyRange.empty()) { + systemRestoreRanges.push_back_deep(systemRestoreRanges.arena(), systemKeyRange); + } + } + } + self->restoreRanges = modifiedRestoreRanges; + if (!systemRestoreRanges.empty()) { + // We are able to restore system keys first since we restore an entire cluster at once rather than + // partial key ranges. + wait(clearAndRestoreSystemKeys( + cx, self, &backupAgent, targetVersion, lastBackupContainer, systemRestoreRanges)); + } if (deterministicRandom()->random01() < 0.5) { for (restoreIndex = 0; restoreIndex < self->restoreRanges.size(); restoreIndex++) { auto range = self->restoreRanges[restoreIndex]; @@ -703,6 +769,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { Key(), Key(), self->locked, + UnlockDB::True, OnlyApplyMutationLogs::False, InconsistentSnapshotOnly::False, ::invalidVersion, @@ -735,6 +802,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { Key(), Key(), self->locked, + UnlockDB::True, OnlyApplyMutationLogs::False, InconsistentSnapshotOnly::False, ::invalidVersion, diff --git a/fdbserver/workloads/BackupToBlob.actor.cpp b/fdbserver/workloads/BackupToBlob.actor.cpp index d1995f388b..fa09409280 100644 --- a/fdbserver/workloads/BackupToBlob.actor.cpp +++ b/fdbserver/workloads/BackupToBlob.actor.cpp @@ -21,6 +21,7 @@ #include "fdbrpc/simulator.h" #include "fdbclient/BackupAgent.actor.h" #include "fdbclient/BackupContainer.h" +#include "fdbclient/ManagementAPI.actor.h" #include "fdbserver/Knobs.h" #include "fdbserver/workloads/BlobStoreWorkload.h" #include "fdbserver/workloads/workloads.actor.h" @@ -57,6 +58,7 @@ struct BackupToBlobWorkload : TestWorkload { addDefaultBackupRanges(backupRanges); wait(delay(self->backupAfter)); + state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx)); wait(backupAgent.submitBackup(cx, self->backupURL, {}, @@ -64,7 +66,8 @@ struct BackupToBlobWorkload : TestWorkload { self->snapshotInterval, self->backupTag.toString(), backupRanges, - SERVER_KNOBS->ENABLE_ENCRYPTION)); + SERVER_KNOBS->ENABLE_ENCRYPTION && + configuration.tenantMode != TenantMode::OPTIONAL_TENANT)); EBackupState backupStatus = wait(backupAgent.waitBackup(cx, self->backupTag.toString(), StopWhenDone::True)); TraceEvent("BackupToBlob_BackupStatus").detail("Status", BackupAgentBase::getStateText(backupStatus)); return Void(); diff --git a/fdbserver/workloads/BackupToDBCorrectness.actor.cpp b/fdbserver/workloads/BackupToDBCorrectness.actor.cpp index f56780d4af..02d05ef208 100644 --- a/fdbserver/workloads/BackupToDBCorrectness.actor.cpp +++ b/fdbserver/workloads/BackupToDBCorrectness.actor.cpp @@ -22,6 +22,7 @@ 
#include "fdbclient/BackupAgent.actor.h" #include "fdbclient/ClusterConnectionMemoryRecord.h" #include "fdbclient/TenantManagement.actor.h" +#include "fdbserver/Knobs.h" #include "fdbserver/workloads/workloads.actor.h" #include "fdbserver/workloads/BulkSetup.actor.h" #include "flow/ApiVersion.h" @@ -667,10 +668,47 @@ struct BackupToDBCorrectnessWorkload : TestWorkload { // wait(diffRanges(self->backupRanges, self->backupPrefix, cx, self->extraDB)); state Standalone> restoreRange; + state Standalone> systemRestoreRange; for (auto r : self->backupRanges) { - restoreRange.push_back_deep( - restoreRange.arena(), - KeyRangeRef(r.begin.withPrefix(self->backupPrefix), r.end.withPrefix(self->backupPrefix))); + if (!SERVER_KNOBS->ENABLE_ENCRYPTION || !r.intersects(getSystemBackupRanges())) { + restoreRange.push_back_deep( + restoreRange.arena(), + KeyRangeRef(r.begin.withPrefix(self->backupPrefix), r.end.withPrefix(self->backupPrefix))); + } else { + KeyRangeRef normalKeyRange = r & normalKeys; + KeyRangeRef systemKeyRange = r & systemKeys; + if (!normalKeyRange.empty()) { + restoreRange.push_back_deep(restoreRange.arena(), + KeyRangeRef(normalKeyRange.begin.withPrefix(self->backupPrefix), + normalKeyRange.end.withPrefix(self->backupPrefix))); + } + if (!systemKeyRange.empty()) { + systemRestoreRange.push_back_deep(systemRestoreRange.arena(), systemKeyRange); + } + } + } + + // restore system keys first before restoring user data + if (!systemRestoreRange.empty()) { + state Key systemRestoreTag = "restore_system"_sr; + try { + wait(restoreTool.submitBackup(cx, + systemRestoreTag, + systemRestoreRange, + StopWhenDone::True, + StringRef(), + self->backupPrefix, + self->locked, + DatabaseBackupAgent::PreBackupAction::CLEAR)); + } catch (Error& e) { + TraceEvent("BARW_DoBackupSubmitBackupException", randomID) + .error(e) + .detail("Tag", printable(systemRestoreTag)); + if (e.code() != error_code_backup_unneeded && e.code() != error_code_backup_duplicate) + throw; + } + wait(success(restoreTool.waitBackup(cx, systemRestoreTag))); + wait(restoreTool.unlockBackup(cx, systemRestoreTag)); } try { diff --git a/fdbserver/workloads/IncrementalBackup.actor.cpp b/fdbserver/workloads/IncrementalBackup.actor.cpp index 23cd2b6ca7..d2b984114f 100644 --- a/fdbserver/workloads/IncrementalBackup.actor.cpp +++ b/fdbserver/workloads/IncrementalBackup.actor.cpp @@ -20,6 +20,7 @@ #include "fdbclient/FDBTypes.h" #include "fdbclient/Knobs.h" +#include "fdbclient/ManagementAPI.actor.h" #include "fdbclient/SystemData.h" #include "fdbclient/ReadYourWrites.h" #include "fdbrpc/simulator.h" @@ -150,6 +151,7 @@ struct IncrementalBackupWorkload : TestWorkload { if (self->submitOnly) { TraceEvent("IBackupSubmitAttempt").log(); + state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx)); try { wait(self->backupAgent.submitBackup(cx, self->backupDir, @@ -158,7 +160,8 @@ struct IncrementalBackupWorkload : TestWorkload { 1e8, self->tag.toString(), backupRanges, - SERVER_KNOBS->ENABLE_ENCRYPTION, + SERVER_KNOBS->ENABLE_ENCRYPTION && + configuration.tenantMode != TenantMode::OPTIONAL_TENANT, StopWhenDone::False, UsePartitionedLog::False, IncrementalBackupOnly::True)); @@ -227,19 +230,56 @@ struct IncrementalBackupWorkload : TestWorkload { .detail("Size", containers.size()) .detail("First", containers.front()); state Key backupURL = Key(containers.front()); + + state Standalone> restoreRange; + state Standalone> systemRestoreRange; + for (auto r : backupRanges) { + if (!SERVER_KNOBS->ENABLE_ENCRYPTION || 
!r.intersects(getSystemBackupRanges())) { + restoreRange.push_back_deep(restoreRange.arena(), r); + } else { + KeyRangeRef normalKeyRange = r & normalKeys; + KeyRangeRef systemKeyRange = r & systemKeys; + if (!normalKeyRange.empty()) { + restoreRange.push_back_deep(restoreRange.arena(), normalKeyRange); + } + if (!systemKeyRange.empty()) { + systemRestoreRange.push_back_deep(systemRestoreRange.arena(), systemKeyRange); + } + } + } + if (!systemRestoreRange.empty()) { + TraceEvent("IBackupSystemRestoreAttempt").detail("BeginVersion", beginVersion); + wait(success(self->backupAgent.restore(cx, + cx, + "system_restore"_sr, + backupURL, + {}, + systemRestoreRange, + WaitForComplete::True, + invalidVersion, + Verbose::True, + Key(), + Key(), + LockDB::True, + UnlockDB::True, + OnlyApplyMutationLogs::True, + InconsistentSnapshotOnly::False, + beginVersion))); + } TraceEvent("IBackupRestoreAttempt").detail("BeginVersion", beginVersion); wait(success(self->backupAgent.restore(cx, cx, Key(self->tag.toString()), backupURL, {}, - backupRanges, + restoreRange, WaitForComplete::True, invalidVersion, Verbose::True, Key(), Key(), LockDB::True, + UnlockDB::True, OnlyApplyMutationLogs::True, InconsistentSnapshotOnly::False, beginVersion))); diff --git a/fdbserver/workloads/RestoreBackup.actor.cpp b/fdbserver/workloads/RestoreBackup.actor.cpp index 7254d4b59d..02191ef1ea 100644 --- a/fdbserver/workloads/RestoreBackup.actor.cpp +++ b/fdbserver/workloads/RestoreBackup.actor.cpp @@ -24,6 +24,7 @@ #include "fdbrpc/simulator.h" #include "fdbclient/BackupAgent.actor.h" #include "fdbclient/BackupContainer.h" +#include "fdbserver/Knobs.h" #include "fdbserver/workloads/workloads.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. @@ -113,14 +114,43 @@ struct RestoreBackupWorkload : TestWorkload { wait(delay(self->delayFor)); wait(waitOnBackup(self, cx)); wait(clearDatabase(cx)); - wait(success(self->backupAgent.restore(cx, - cx, - self->tag, - Key(self->backupContainer->getURL()), - self->backupContainer->getProxy(), - WaitForComplete::True, - ::invalidVersion, - Verbose::True))); + if (SERVER_KNOBS->ENABLE_ENCRYPTION) { + // restore system keys + VectorRef systemBackupRanges = getSystemBackupRanges(); + state std::vector> restores; + for (int i = 0; i < systemBackupRanges.size(); i++) { + restores.push_back((self->backupAgent.restore(cx, + cx, + "system_restore"_sr, + Key(self->backupContainer->getURL()), + self->backupContainer->getProxy(), + WaitForComplete::True, + ::invalidVersion, + Verbose::True, + systemBackupRanges[i]))); + } + waitForAll(restores); + // restore non-system keys + wait(success(self->backupAgent.restore(cx, + cx, + self->tag, + Key(self->backupContainer->getURL()), + self->backupContainer->getProxy(), + WaitForComplete::True, + ::invalidVersion, + Verbose::True, + normalKeys))); + } else { + wait(success(self->backupAgent.restore(cx, + cx, + self->tag, + Key(self->backupContainer->getURL()), + self->backupContainer->getProxy(), + WaitForComplete::True, + ::invalidVersion, + Verbose::True))); + } + return Void(); } diff --git a/fdbserver/workloads/RestoreFromBlob.actor.cpp b/fdbserver/workloads/RestoreFromBlob.actor.cpp index afea1585e8..41d3d0f569 100644 --- a/fdbserver/workloads/RestoreFromBlob.actor.cpp +++ b/fdbserver/workloads/RestoreFromBlob.actor.cpp @@ -18,9 +18,11 @@ * limitations under the License. 
*/ +#include "fdbclient/SystemData.h" #include "fdbrpc/simulator.h" #include "fdbclient/BackupAgent.actor.h" #include "fdbclient/BackupContainer.h" +#include "fdbserver/Knobs.h" #include "fdbserver/workloads/BlobStoreWorkload.h" #include "fdbserver/workloads/workloads.actor.h" #include "fdbserver/workloads/BulkSetup.actor.h" @@ -52,13 +54,22 @@ struct RestoreFromBlobWorkload : TestWorkload { ACTOR static Future _start(Database cx, RestoreFromBlobWorkload* self) { state FileBackupAgent backupAgent; - state Standalone> restoreRanges; - - addDefaultBackupRanges(restoreRanges); wait(delay(self->restoreAfter)); - Version v = wait( - backupAgent.restore(cx, {}, self->backupTag, self->backupURL, {}, restoreRanges, self->waitForComplete)); + if (SERVER_KNOBS->ENABLE_ENCRYPTION) { + // restore system keys followed by user keys + wait(success(backupAgent.restore( + cx, {}, self->backupTag, self->backupURL, {}, getSystemBackupRanges(), self->waitForComplete))); + Standalone> restoreRanges; + restoreRanges.push_back_deep(restoreRanges.arena(), normalKeys); + wait(success(backupAgent.restore( + cx, {}, self->backupTag, self->backupURL, {}, restoreRanges, self->waitForComplete))); + } else { + Standalone> restoreRanges; + addDefaultBackupRanges(restoreRanges); + wait(success(backupAgent.restore( + cx, {}, self->backupTag, self->backupURL, {}, restoreRanges, self->waitForComplete))); + } return Void(); } diff --git a/fdbserver/workloads/SubmitBackup.actor.cpp b/fdbserver/workloads/SubmitBackup.actor.cpp index 8cf59652ed..417edcba4a 100644 --- a/fdbserver/workloads/SubmitBackup.actor.cpp +++ b/fdbserver/workloads/SubmitBackup.actor.cpp @@ -19,6 +19,7 @@ */ #include "fdbclient/FDBTypes.h" +#include "fdbclient/ManagementAPI.actor.h" #include "fdbclient/ReadYourWrites.h" #include "fdbrpc/simulator.h" #include "fdbclient/BackupAgent.actor.h" @@ -52,8 +53,9 @@ struct SubmitBackupWorkload : TestWorkload { ACTOR static Future _start(SubmitBackupWorkload* self, Database cx) { wait(delay(self->delayFor)); - Standalone> backupRanges; + state Standalone> backupRanges; addDefaultBackupRanges(backupRanges); + state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx)); try { wait(self->backupAgent.submitBackup(cx, self->backupDir, @@ -62,7 +64,8 @@ struct SubmitBackupWorkload : TestWorkload { self->snapshotInterval, self->tag.toString(), backupRanges, - SERVER_KNOBS->ENABLE_ENCRYPTION, + SERVER_KNOBS->ENABLE_ENCRYPTION && + configuration.tenantMode != TenantMode::OPTIONAL_TENANT, self->stopWhenDone, UsePartitionedLog::False, self->incremental)); diff --git a/tests/fast/EncryptedBackupCorrectness.toml b/tests/fast/EncryptedBackupCorrectness.toml index 597532737e..e674555491 100644 --- a/tests/fast/EncryptedBackupCorrectness.toml +++ b/tests/fast/EncryptedBackupCorrectness.toml @@ -1,5 +1,4 @@ [configuration] -allowDefaultTenant = true tenantModes = ['required'] [[knobs]] From 623e6ef761962034599a13b261a84b8b2366d588 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Wed, 26 Oct 2022 12:22:41 -0500 Subject: [PATCH 50/52] adding delay in bw forced shutdown to prevent crash races (#8552) --- fdbserver/BlobWorker.actor.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fdbserver/BlobWorker.actor.cpp b/fdbserver/BlobWorker.actor.cpp index cf5b7b1340..fc51c8fae0 100644 --- a/fdbserver/BlobWorker.actor.cpp +++ b/fdbserver/BlobWorker.actor.cpp @@ -5182,13 +5182,22 @@ ACTOR Future blobWorker(BlobWorkerInterface bwInterf, ASSERT(false); throw internal_error(); } - 
when(wait(selfRemoved || self->simInjectFailure.getFuture())) { + when(wait(selfRemoved)) { if (BW_DEBUG) { printf("Blob worker detected removal. Exiting...\n"); } TraceEvent("BlobWorkerRemoved", self->id); break; } + when(wait(self->simInjectFailure.getFuture())) { + // wait to let triggering actor finish to prevent weird shutdown races + wait(delay(0)); + if (BW_DEBUG) { + printf("Blob worker simulation injected failure. Exiting...\n"); + } + TraceEvent("BlobWorkerSimRemoved", self->id); + break; + } when(wait(self->fatalError.getFuture())) { TraceEvent(SevError, "BlobWorkerActorCollectionFatalErrorNotError", self->id); ASSERT(false); From f620f391f5ef745e681cf6fe8867c77eacad160c Mon Sep 17 00:00:00 2001 From: Aaron Molitor Date: Wed, 26 Oct 2022 11:07:04 -0500 Subject: [PATCH 51/52] make same change to Dockerfile.eks (from #8583) --- packaging/docker/Dockerfile | 2 +- packaging/docker/Dockerfile.eks | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/packaging/docker/Dockerfile b/packaging/docker/Dockerfile index 70d488b88c..1c19b1d9bd 100644 --- a/packaging/docker/Dockerfile +++ b/packaging/docker/Dockerfile @@ -183,7 +183,7 @@ RUN curl -Ls https://s3.us-west-2.amazonaws.com/amazon-eks/1.22.6/2022-03-09/bin sha256sum --quiet -c kubectl.txt && \ mv kubectl /usr/local/bin/kubectl && \ chmod 755 /usr/local/bin/kubectl && \ - curl -Ls https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m)-2.7.34.zip -o "awscliv2.zip" && \ + curl -Ls https://awscli.amazonaws.com/awscli-exe-linux-x86_64-2.7.34.zip -o "awscliv2.zip" && \ echo "daf9253f0071b5cfee9532bc5220bedd7a5d29d4e0f92b42b9e3e4c496341e88 awscliv2.zip" > awscliv2.txt && \ sha256sum --quiet -c awscliv2.txt && \ unzip -qq awscliv2.zip && \ diff --git a/packaging/docker/Dockerfile.eks b/packaging/docker/Dockerfile.eks index fc037f3bf9..9a3eacb84b 100644 --- a/packaging/docker/Dockerfile.eks +++ b/packaging/docker/Dockerfile.eks @@ -53,13 +53,13 @@ RUN curl -Ls https://github.com/krallin/tini/releases/download/v0.19.0/tini-amd6 mv tini /usr/bin/ && \ rm -rf /tmp/* -RUN curl -Ls https://amazon-eks.s3.amazonaws.com/1.19.6/2021-01-05/bin/linux/amd64/kubectl -o kubectl && \ - echo "08ff68159bbcb844455167abb1d0de75bbfe5ae1b051f81ab060a1988027868a kubectl" > kubectl.txt && \ +RUN curl -Ls https://s3.us-west-2.amazonaws.com/amazon-eks/1.22.6/2022-03-09/bin/linux/amd64/kubectl -o kubectl && \ + echo "860c3d37a5979491895767e7332404d28dc0d7797c7673c33df30ca80e215a07 kubectl" > kubectl.txt && \ sha256sum --quiet -c kubectl.txt && \ mv kubectl /usr/local/bin/kubectl && \ chmod 755 /usr/local/bin/kubectl && \ - curl -Ls https://awscli.amazonaws.com/awscli-exe-linux-x86_64-2.2.43.zip -o "awscliv2.zip" && \ - echo "9a8b3c4e7f72bbcc55e341dce3af42479f2730c225d6d265ee6f9162cfdebdfd awscliv2.zip" > awscliv2.txt && \ + curl -Ls https://awscli.amazonaws.com/awscli-exe-linux-x86_64-2.7.34.zip -o "awscliv2.zip" && \ + echo "daf9253f0071b5cfee9532bc5220bedd7a5d29d4e0f92b42b9e3e4c496341e88 awscliv2.zip" > awscliv2.txt && \ sha256sum --quiet -c awscliv2.txt && \ unzip -qq awscliv2.zip && \ ./aws/install && \ From 989731f7f4d6aa67cd5c40a81ec49958598ea0de Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 26 Oct 2022 11:48:10 -0600 Subject: [PATCH 52/52] Fix AWS SDK build and removed check for old build system --- cmake/FlowCommands.cmake | 38 +++------------------ cmake/awssdk.cmake | 69 ++++++++++++++++++++------------------- fdbmonitor/CMakeLists.txt | 15 ++++----- 3 files changed, 47 insertions(+), 75 deletions(-) diff --git 
a/cmake/FlowCommands.cmake b/cmake/FlowCommands.cmake index 81ea36892d..c12b4dff03 100644 --- a/cmake/FlowCommands.cmake +++ b/cmake/FlowCommands.cmake @@ -76,38 +76,11 @@ function(generate_coverage_xml) add_dependencies(coverage_${target_name} coveragetool) endfunction() -# This function asserts that `versions.h` does not exist in the source -# directory. It does this in the prebuild phase of the target. -# This is an ugly hack that should make sure that cmake isn't used with -# a source directory in which FDB was previously built with `make`. -function(assert_no_version_h target) - - message(STATUS "Check versions.h on ${target}") - set(target_name "${target}_versions_h_check") - - if (DEFINED ENV{VERBOSE}) - add_custom_target("${target_name}" - COMMAND "${CMAKE_COMMAND}" -DFILE="${CMAKE_SOURCE_DIR}/versions.h" - -P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake" - COMMAND echo - "${CMAKE_COMMAND}" -P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake" - -DFILE="${CMAKE_SOURCE_DIR}/versions.h" - COMMENT "Check old build system wasn't used in source dir") - else() - add_custom_target("${target_name}" - COMMAND "${CMAKE_COMMAND}" -DFILE="${CMAKE_SOURCE_DIR}/versions.h" - -P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake" - COMMENT "Check old build system wasn't used in source dir") - endif() - - add_dependencies(${target} ${target_name}) -endfunction() - add_custom_target(strip_targets) add_dependencies(packages strip_targets) function(strip_debug_symbols target) - if (WIN32) + if(WIN32) return() endif() get_target_property(target_type ${target} TYPE) @@ -146,7 +119,7 @@ function(strip_debug_symbols target) COMMAND objcopy --verbose --only-keep-debug $ "${out_file}.debug" COMMAND objcopy --verbose --add-gnu-debuglink="${out_file}.debug" "${out_file}" COMMENT "Copy debug symbols to ${out_name}.debug") - add_custom_target(strip_${target} DEPENDS "${out_file}.debug") + add_custom_target(strip_${target} DEPENDS "${out_file}.debug") else() add_custom_target(strip_${target}) add_dependencies(strip_${target} strip_only_${target}) @@ -171,7 +144,7 @@ function(copy_headers) foreach(f IN LISTS CP_SRCS) is_prefix(bd "${CMAKE_CURRENT_BINARY_DIR}" "${f}") is_prefix(sd "${CMAKE_CURRENT_SOURCE_DIR}" "${f}") - if (bd OR sd) + if(bd OR sd) continue() endif() is_header(hdr "${f}") @@ -180,7 +153,7 @@ function(copy_headers) endif() get_filename_component(fname ${f} NAME) get_filename_component(dname ${f} DIRECTORY) - if (dname) + if(dname) make_directory(${incl_dir}/${dname}) endif() set(fpath "${incl_dir}/${dname}/${fname}") @@ -309,9 +282,6 @@ function(add_flow_target) add_custom_target(${AFT_NAME}_actors DEPENDS ${generated_files}) add_dependencies(${AFT_NAME} ${AFT_NAME}_actors) - if(NOT WIN32) - assert_no_version_h(${AFT_NAME}_actors) - endif() generate_coverage_xml(${AFT_NAME}) if(strip_target) strip_debug_symbols(${AFT_NAME}) diff --git a/cmake/awssdk.cmake b/cmake/awssdk.cmake index ab62f9b6d6..28c81166d3 100644 --- a/cmake/awssdk.cmake +++ b/cmake/awssdk.cmake @@ -8,40 +8,43 @@ endif() include(ExternalProject) ExternalProject_Add(awssdk_project - GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git - GIT_TAG e4b4b310d8631bc7e9a797b6ac03a73c6f210bf6 # v1.9.331 - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src" - BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build" - GIT_CONFIG advice.detachedHead=false - CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF # SDK builds shared libs by default, we want static libs - -DENABLE_TESTING=OFF - -DBUILD_ONLY=core # git repo contains SDK for every AWS 
product, we only want the core auth libraries - -DSIMPLE_INSTALL=ON - -DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path - -DBYO_CRYPTO=ON # we have our own crypto libraries that conflict if we let aws sdk build and link its own - -DBUILD_CURL=ON - -DBUILD_ZLIB=ON - - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_CXX_FLAGS=${AWSSDK_COMPILER_FLAGS} - TEST_COMMAND "" + GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git + GIT_TAG e4b4b310d8631bc7e9a797b6ac03a73c6f210bf6 # v1.9.331 + SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src" + BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build" + GIT_CONFIG advice.detachedHead=false + # it seems advice.detachedHead breaks something which causes aws sdk to always be rebuilt. + # This option forces to cmake to build the aws sdk only once and never attempt to update it + UPDATE_DISCONNECTED ON + CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF # SDK builds shared libs by default, we want static libs + -DENABLE_TESTING=OFF + -DBUILD_ONLY=core # git repo contains SDK for every AWS product, we only want the core auth libraries + -DSIMPLE_INSTALL=ON + -DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path + -DBYO_CRYPTO=ON # we have our own crypto libraries that conflict if we let aws sdk build and link its own + -DBUILD_CURL=ON + -DBUILD_ZLIB=ON + + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_CXX_FLAGS=${AWSSDK_COMPILER_FLAGS} + TEST_COMMAND "" # the sdk build produces a ton of artifacts, with their own dependency tree, so there is a very specific dependency order they must be linked in - BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a" - "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a" - "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a" - "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a" - "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a" - "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a" - "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a" - "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a" - "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a" - "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a" - "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a" - "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a" - "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a" - "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/curl/lib/libcurl.a" - "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/zlib/lib/libz.a" -) + BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a" + 
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/curl/lib/libcurl.a" + "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/zlib/lib/libz.a" + ) add_library(awssdk_core STATIC IMPORTED) add_dependencies(awssdk_core awssdk_project) diff --git a/fdbmonitor/CMakeLists.txt b/fdbmonitor/CMakeLists.txt index 177dde5170..1eb15ac269 100644 --- a/fdbmonitor/CMakeLists.txt +++ b/fdbmonitor/CMakeLists.txt @@ -5,9 +5,8 @@ get_target_property(fdbclient_target_includes fdbclient INCLUDE_DIRECTORIES) target_link_libraries(fdbmonitor PUBLIC SimpleOpt) target_include_directories(fdbmonitor PUBLIC "${fdbclient_target_includes}") strip_debug_symbols(fdbmonitor) -assert_no_version_h(fdbmonitor) if(UNIX AND NOT APPLE) - target_link_libraries(fdbmonitor PRIVATE rt) + target_link_libraries(fdbmonitor PRIVATE rt) endif() # FIXME: This include directory is an ugly hack. We probably want to fix this. # as soon as we get rid of the old build system @@ -17,17 +16,17 @@ target_link_libraries(fdbmonitor PUBLIC Threads::Threads) # appears to change its behavior (it no longer seems to restart killed # processes). fdbmonitor is single-threaded anyway. get_target_property(fdbmonitor_options fdbmonitor COMPILE_OPTIONS) -if (NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND") +if(NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND") list(REMOVE_ITEM fdbmonitor_options "-fsanitize=thread") set_property(TARGET fdbmonitor PROPERTY COMPILE_OPTIONS ${fdbmonitor_options}) -endif () +endif() get_target_property(fdbmonitor_options fdbmonitor LINK_OPTIONS) -if (NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND") +if(NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND") list(REMOVE_ITEM fdbmonitor_options "-fsanitize=thread") set_property(TARGET fdbmonitor PROPERTY LINK_OPTIONS ${fdbmonitor_options}) -endif () +endif() if(GENERATE_DEBUG_PACKAGES) fdb_install(TARGETS fdbmonitor DESTINATION fdbmonitor COMPONENT server) @@ -51,7 +50,7 @@ add_custom_target(clean_sandbox add_custom_target(start_sandbox COMMAND ${CMAKE_BINARY_DIR}/bin/fdbmonitor --conffile ${CMAKE_BINARY_DIR}/sandbox/foundationdb.conf - --lockfile ${CMAKE_BINARY_DIR}/sandbox/fdbmonitor.lock) + --lockfile ${CMAKE_BINARY_DIR}/sandbox/fdbmonitor.lock) add_dependencies(start_sandbox fdbmonitor fdbserver) @@ -61,6 +60,6 @@ if(NOT EXISTS ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh) endif() add_custom_target(generate_profile - COMMAND ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh ${CMAKE_BINARY_DIR}) + COMMAND ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh ${CMAKE_BINARY_DIR}) add_dependencies(generate_profile fdbmonitor fdbserver mako fdbcli)