From eb0de3b937a305a4c0e338989cf2b580e7167ad9 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 18 Feb 2022 10:10:30 -0800 Subject: [PATCH 001/299] skew setting; shard map reading --- fdbclient/SystemData.cpp | 14 ++++ fdbclient/SystemData.h | 2 + fdbserver/tester.actor.cpp | 12 ++++ fdbserver/workloads/ReadWrite.actor.cpp | 94 ++++++++++++++++++++++--- fdbserver/workloads/workloads.actor.h | 1 + 5 files changed, 114 insertions(+), 9 deletions(-) diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index fe10cae868..a0257347ad 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -276,6 +276,7 @@ std::pair>, std::vector> server_id; return server_id; } + +std::pair serverKeysDecodeServerBegin(const KeyRef& key) { + UID server_id; + BinaryReader rd(key.removePrefix(serverKeysPrefix), Unversioned()); + rd >> server_id; + rd.readBytes(1); // skip "/" + std::string bytes; + while(!rd.empty()) { + bytes.push_back((char)*rd.arenaRead(1)); + } + return std::make_pair(server_id, Key(bytes)); +} + bool serverHasKey(ValueRef storedValue) { return storedValue == serverKeysTrue || storedValue == serverKeysTrueEmptyRange; } diff --git a/fdbclient/SystemData.h b/fdbclient/SystemData.h index 14234151a3..88717b9d4c 100644 --- a/fdbclient/SystemData.h +++ b/fdbclient/SystemData.h @@ -91,11 +91,13 @@ void decodeStorageCacheValue(const ValueRef& value, std::vector& serve // Using the serverID as a prefix, then followed by the beginning of the shard range // as the key, the value indicates whether the shard does or does not exist on the server. // These values can be changed as data movement occurs. +extern const KeyRangeRef serverKeysRange; extern const KeyRef serverKeysPrefix; extern const ValueRef serverKeysTrue, serverKeysTrueEmptyRange, serverKeysFalse; const Key serverKeysKey(UID serverID, const KeyRef& keys); const Key serverKeysPrefixFor(UID serverID); UID serverKeysDecodeServer(const KeyRef& key); +std::pair serverKeysDecodeServerBegin(const KeyRef& key) ; bool serverHasKey(ValueRef storedValue); extern const KeyRangeRef conflictingKeysRange; diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 1581732625..d552d50240 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -98,6 +98,18 @@ Key KVWorkload::keyForIndex(uint64_t index) const { } } +// the reverse process of keyForIndex() without division +int64_t KVWorkload::indexForKey(const KeyRef& key) const { + int idx = 0; + if(nodePrefix > 0) { + ASSERT(keyBytes >= 32); + idx += 16; + } + ASSERT(keyBytes >= 16); + int64_t res = *(int64_t*)(key.begin() + idx); + return res; +} + Key KVWorkload::keyForIndex(uint64_t index, bool absent) const { int adjustedKeyBytes = (absent) ? (keyBytes + 1) : keyBytes; Key result = makeString(adjustedKeyBytes); diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp index 4eae887c77..5e37bb37b5 100644 --- a/fdbserver/workloads/ReadWrite.actor.cpp +++ b/fdbserver/workloads/ReadWrite.actor.cpp @@ -30,6 +30,7 @@ #include "fdbserver/workloads/BulkSetup.actor.h" #include "fdbclient/ReadYourWrites.h" #include "flow/TDMetric.actor.h" +#include "fdbclient/RunTransaction.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. 
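// A minimal round-trip sketch (not part of the patch) of the contract the new decoder
// above is meant to satisfy: serverKeysDecodeServerBegin() inverts the existing
// serverKeysKey() encoder. Assumes FDB's usual test utilities (deterministicRandom()):
//
//     UID id = deterministicRandom()->randomUniqueID();
//     Key shardBegin = LiteralStringRef("someShardBegin");
//     Key encoded = serverKeysKey(id, shardBegin); // serverKeysPrefix + id + "/" + shardBegin
//     auto [decodedId, decodedBegin] = serverKeysDecodeServerBegin(encoded);
//     ASSERT(decodedId == id && decodedBegin == shardBegin);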
const int sampleSize = 10000;
@@ -81,6 +82,7 @@ struct ReadWriteWorkload : KVWorkload {
 	int extraReadConflictRangesPerTransaction, extraWriteConflictRangesPerTransaction;
 	double testDuration, transactionsPerSecond, alpha, warmingDelay, loadTime, maxInsertRate, debugInterval, debugTime;
 	double metricsStart, metricsDuration, clientBegin;
+	std::string valueString;
 
 	bool dependentReads;
@@ -92,7 +94,6 @@ struct ReadWriteWorkload : KVWorkload {
 	bool adjacentWrites;
 	bool rampUpLoad;
 	int rampSweepCount;
-	double hotKeyFraction, forceHotProbability;
 	bool rangeReads;
 	bool useRYW;
 	bool rampTransactionType;
@@ -118,6 +119,16 @@ struct ReadWriteWorkload : KVWorkload {
 
 	std::vector<PerfMetric> periodicMetrics;
 
+	double hotKeyFraction, forceHotProbability; // key based hot traffic setting
+
+	// server based hot traffic setting
+	double skewDuration = 0; // skewDuration = ceil(testDuration / skewRound)
+	double hotServerFraction = 0; // set > 0 to issue hot keys based on the shard map
+	double hotServerReadFrac, hotServerWriteFrac; // how much traffic goes to hot servers
+	typedef std::vector<std::pair<int64_t, int64_t>> IndexRangeVec;
+	// keyForIndex() generates a key from an index, so recording the start and end index is enough for a shard range
+	std::vector<std::pair<UID, IndexRangeVec>> serverShards; // storage server and the shards it owns
+
 	bool doSetup;
 
 	ReadWriteWorkload(WorkloadContext const& wcx)
@@ -209,14 +220,24 @@ struct ReadWriteWorkload : KVWorkload {
 		{
 			// with P(hotTrafficFraction) an access is directed to one of a fraction
 			// of hot keys, else it is directed to a disjoint set of cold keys
-			hotKeyFraction = getOption(options, LiteralStringRef("hotKeyFraction"), 0.0);
-			double hotTrafficFraction = getOption(options, LiteralStringRef("hotTrafficFraction"), 0.0);
-			ASSERT(hotKeyFraction >= 0 && hotTrafficFraction <= 1);
-			ASSERT(hotKeyFraction <= hotTrafficFraction); // hot keys should be actually hot!
-			// p(Cold key) = (1-FHP) * (1-hkf)
-			// p(Cold key) = (1-htf)
-			// solving for FHP gives:
-			forceHotProbability = (hotTrafficFraction - hotKeyFraction) / (1 - hotKeyFraction);
+			hotKeyFraction = getOption(options, "hotKeyFraction"_sr, 0.0);
+			hotServerFraction = getOption(options, "hotServerFraction"_sr, 0.0);
+
+			if (hotServerFraction > 0) {
+				int skewRound = getOption(options, "skewRound"_sr, 0);
+				hotServerReadFrac = getOption(options, "hotServerReadFrac"_sr, 0.0);
+				hotServerWriteFrac = getOption(options, "hotServerWriteFrac"_sr, 0.0);
+				ASSERT(hotServerReadFrac >= hotServerFraction && hotServerWriteFrac >= hotServerFraction && skewRound > 0);
+				skewDuration = ceil(testDuration / skewRound);
+			} else if (hotKeyFraction > 0) {
+				double hotTrafficFraction = getOption(options, LiteralStringRef("hotTrafficFraction"), 0.0);
+				ASSERT(hotTrafficFraction <= 1);
+				ASSERT(hotKeyFraction <= hotTrafficFraction); // hot keys should be actually hot!
+ // p(Cold key) = (1-FHP) * (1-hkf) + // p(Cold key) = (1-htf) + // solving for FHP gives: + forceHotProbability = (hotTrafficFraction - hotKeyFraction) / (1 - hotKeyFraction); + } } } @@ -334,6 +355,61 @@ struct ReadWriteWorkload : KVWorkload { } } + ACTOR static Future updateServerShards(Database cx, ReadWriteWorkload* self) { + state RangeResult range = + wait(runRYWTransaction(cx, [](Reference tr) -> Future { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + return tr->getRange(serverKeysRange, CLIENT_KNOBS->TOO_MANY, Snapshot::True); + })); + + // leftEdge < workloadBegin < workloadEnd + Key workloadBegin = self->keyForIndex(0), workloadEnd = self->keyForIndex(self->nodeCount); + Key leftEdge(allKeys.begin); + std::vector leftServer; // left server owns the range [leftEdge, workloadBegin) + KeyRangeRef workloadRange(workloadBegin, workloadEnd); + std::map> beginServers; // begin index to server ID + std::vector keyIndex; // shard boundary by index + + for (auto kv = range.begin(); kv != range.end(); kv++) { + if (serverHasKey(kv->value)) { + auto [id, key] = serverKeysDecodeServerBegin(kv->key); + + if (workloadRange.contains(key)) { + auto idx = self->indexForKey(key); + beginServers[idx].push_back(id); + keyIndex.push_back(idx); + } else if (workloadBegin > key && key > leftEdge) { // update left boundary + leftEdge = key; + leftServer.clear(); + } + + if (key == leftEdge) { + leftServer.push_back(id); + } + } + } + ASSERT(beginServers.begin()->first >= 0); + // handle the left boundary + if (beginServers.begin()->first > 0) { + keyIndex.push_back(0); + beginServers[0] = std::move(leftServer); + } + + // sort shard begin idx + ASSERT(keyIndex.size() == beginServers.size()); + std::sort(keyIndex.begin(), keyIndex.end()); + // build self->serverShards, starting from the left shard + std::unordered_map serverShards; + int i = 0; + for (auto it = beginServers.begin(); i < keyIndex.size() && it != beginServers.end(); ++i, ++it) { + auto shardEnd = i < keyIndex.size() - 1 ? 
keyIndex[i + 1] : self->nodeCount; + for (auto id : it->second) { + serverShards[id].emplace_back(keyIndex[i], shardEnd); + } + } + return Void(); + } + ACTOR static Future tracePeriodically(ReadWriteWorkload* self) { state double start = now(); state double elapsed = 0.0; diff --git a/fdbserver/workloads/workloads.actor.h b/fdbserver/workloads/workloads.actor.h index 1770c7eb52..9515c3df7a 100644 --- a/fdbserver/workloads/workloads.actor.h +++ b/fdbserver/workloads/workloads.actor.h @@ -101,6 +101,7 @@ struct KVWorkload : TestWorkload { Key getRandomKey(bool absent) const; Key keyForIndex(uint64_t index) const; Key keyForIndex(uint64_t index, bool absent) const; + int64_t indexForKey(const KeyRef& key) const; }; struct IWorkloadFactory : ReferenceCounted { From 5eeb3712dc3c7a6f1ca8a2eec3fcab52a69434fc Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 18 Feb 2022 16:36:24 -0800 Subject: [PATCH 002/299] add useful comments --- fdbserver/workloads/ReadWrite.actor.cpp | 148 +++++++++++++----------- tests/rare/ReadSkewReadWrite.toml | 14 +++ 2 files changed, 96 insertions(+), 66 deletions(-) create mode 100644 tests/rare/ReadSkewReadWrite.toml diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp index 5e37bb37b5..8a08a1aa43 100644 --- a/fdbserver/workloads/ReadWrite.actor.cpp +++ b/fdbserver/workloads/ReadWrite.actor.cpp @@ -77,59 +77,63 @@ DESCR struct ReadMetric { }; struct ReadWriteWorkload : KVWorkload { + // general test setting + Standalone descriptionString; + bool doSetup, cancelWorkersAtDuration; + double testDuration, transactionsPerSecond, warmingDelay, maxInsertRate, debugInterval, debugTime; + double metricsStart, metricsDuration; + std::vector insertionCountsToMeasure; // measure the speed of sequential insertion when bulkSetup + + // test log setting + bool enableReadLatencyLogging; + double periodicLoggingInterval; + + // use ReadWrite as a ramp up workload + bool rampUpLoad; // indicate this is a ramp up workload + int rampSweepCount; // how many times of ramp up + bool rampTransactionType; // choose transaction type based on client start time + bool rampUpConcurrency; // control client concurrency + + // transaction setting + bool useRYW; + bool batchPriority; + bool rangeReads; // read operations are all single key range read + bool dependentReads; // read operations are issued sequentially + bool inconsistentReads; // read with previous read version + bool adjacentReads; // keys are adjacent within a transaction + bool adjacentWrites; + double alpha; // probability for run TransactionA type + // two type of transaction int readsPerTransactionA, writesPerTransactionA; int readsPerTransactionB, writesPerTransactionB; int extraReadConflictRangesPerTransaction, extraWriteConflictRangesPerTransaction; - double testDuration, transactionsPerSecond, alpha, warmingDelay, loadTime, maxInsertRate, debugInterval, debugTime; - double metricsStart, metricsDuration, clientBegin; - std::string valueString; - - bool dependentReads; - bool enableReadLatencyLogging; - double periodicLoggingInterval; - bool cancelWorkersAtDuration; - bool inconsistentReads; - bool adjacentReads; - bool adjacentWrites; - bool rampUpLoad; - int rampSweepCount; - bool rangeReads; - bool useRYW; - bool rampTransactionType; - bool rampUpConcurrency; - bool batchPriority; - - Standalone descriptionString; - - Int64MetricHandle totalReadsMetric; - Int64MetricHandle totalRetriesMetric; - EventMetricHandle transactionSuccessMetric; - EventMetricHandle 
<TransactionFailureMetric> transactionFailureMetric;
-	EventMetricHandle<ReadMetric> readMetric;
-
-	std::vector<Future<Void>> clients;
-	PerfIntCounter aTransactions, bTransactions, retries;
-	ContinuousSample<double> latencies, readLatencies, commitLatencies, GRVLatencies, fullReadLatencies;
-	double readLatencyTotal;
-	int readLatencyCount;
-
-	std::vector<uint64_t> insertionCountsToMeasure;
-	std::vector<std::pair<uint64_t, double>> ratesAtKeyCounts;
-
-	std::vector<PerfMetric> periodicMetrics;
-
+	// hot traffic pattern
 	double hotKeyFraction, forceHotProbability; // key based hot traffic setting
-	// server based hot traffic setting
-	double skewDuration = 0; // skewDuration = ceil(testDuration / skewRound)
+	int skewRound = 0; // number of skew rounds; each round runs for ceil(testDuration / skewRound) seconds
 	double hotServerFraction = 0; // set > 0 to issue hot keys based on the shard map
 	double hotServerReadFrac, hotServerWriteFrac; // how much traffic goes to hot servers
 	typedef std::vector<std::pair<int64_t, int64_t>> IndexRangeVec;
 	// keyForIndex() generates a key from an index, so recording the start and end index is enough for a shard range
 	std::vector<std::pair<UID, IndexRangeVec>> serverShards; // storage server and the shards it owns
-	bool doSetup;
 
+	// metric states
+	Int64MetricHandle totalReadsMetric;
+	Int64MetricHandle totalRetriesMetric;
+	EventMetricHandle<TransactionSuccessMetric> transactionSuccessMetric;
+	EventMetricHandle<TransactionFailureMetric> transactionFailureMetric;
+	EventMetricHandle<ReadMetric> readMetric;
+	PerfIntCounter aTransactions, bTransactions, retries;
+	ContinuousSample<double> latencies, readLatencies, commitLatencies, GRVLatencies, fullReadLatencies;
+	double readLatencyTotal;
+	int readLatencyCount;
+	std::vector<PerfMetric> periodicMetrics;
+	std::vector<std::pair<uint64_t, double>> ratesAtKeyCounts; // sequential insertion speed
+
+	// other internal states
+	std::vector<Future<Void>> clients;
+	double loadTime, clientBegin;
 
 	ReadWriteWorkload(WorkloadContext const& wcx)
 	  : KVWorkload(wcx), loadTime(0.0), clientBegin(0), dependentReads(false), adjacentReads(false),
@@ -224,11 +228,11 @@ struct ReadWriteWorkload : KVWorkload {
 			hotServerFraction = getOption(options, "hotServerFraction"_sr, 0.0);
 
 			if (hotServerFraction > 0) {
-				int skewRound = getOption(options, "skewRound"_sr, 0);
+				skewRound = getOption(options, "skewRound"_sr, 0);
 				hotServerReadFrac = getOption(options, "hotServerReadFrac"_sr, 0.0);
 				hotServerWriteFrac = getOption(options, "hotServerWriteFrac"_sr, 0.0);
-				ASSERT(hotServerReadFrac >= hotServerFraction && hotServerWriteFrac >= hotServerFraction && skewRound > 0);
-				skewDuration = ceil(testDuration / skewRound);
+				ASSERT(hotServerReadFrac >= hotServerFraction && hotServerWriteFrac >= hotServerFraction &&
+				       skewRound > 0);
 			} else if (hotKeyFraction > 0) {
 				double hotTrafficFraction = getOption(options, LiteralStringRef("hotTrafficFraction"), 0.0);
 				ASSERT(hotTrafficFraction <= 1);
 				ASSERT(hotKeyFraction <= hotTrafficFraction); // hot keys should be actually hot!
@@ -346,8 +350,6 @@ struct ReadWriteWorkload : KVWorkload {
 			                                    deterministicRandom()->randomInt(minValueBytes, maxValueBytes + 1));
 	}
 
-	Standalone<KeyValueRef> operator()(uint64_t n) { return KeyValueRef(keyForIndex(n, false), randomValue()); }
-
 	template <class Trans>
 	void setupTransaction(Trans* tr) {
 		if (batchPriority) {
@@ -657,14 +659,13 @@ struct ReadWriteWorkload : KVWorkload {
 		return Void();
 	}
 
-	ACTOR Future<Void> _start(Database cx, ReadWriteWorkload* self) {
+	ACTOR static Future<Void> warmCache(Database cx, ReadWriteWorkload* self) {
 		// Read one record from the database to warm the cache of keyServers
 		state std::vector<int64_t> keys;
 		keys.push_back(deterministicRandom()->randomInt64(0, self->nodeCount));
 		state double startTime = now();
 		loop {
 			state Transaction tr(cx);
-
 			try {
 				self->setupTransaction(&tr);
 				wait(self->readOp(&tr, keys, self, false));
@@ -674,30 +675,45 @@ struct ReadWriteWorkload : KVWorkload {
 				wait(tr.onError(e));
 			}
 		}
-		wait(delay(std::max(0.1, 1.0 - (now() - startTime))));
+		return Void();
+	}
 
-		std::vector<Future<Void>> clients;
+	void startReadWriteClients(Database cx, std::vector<Future<Void>>& clients) {
+		clientBegin = now();
+		for (int c = 0; c < actorCount; c++) {
+			Future<Void> worker;
+			if (useRYW)
+				worker =
+				    randomReadWriteClient<ReadYourWritesTransaction>(cx, this, actorCount / transactionsPerSecond, c);
+			else
+				worker = randomReadWriteClient<Transaction>(cx, this, actorCount / transactionsPerSecond, c);
+			clients.push_back(worker);
+		}
+	}
+
+	ACTOR static Future<Void> _start(Database cx, ReadWriteWorkload* self) {
+		wait(warmCache(cx, self));
+
+		state std::vector<Future<Void>> clients;
 		if (self->enableReadLatencyLogging)
 			clients.push_back(tracePeriodically(self));
 
-		self->clientBegin = now();
-		for (int c = 0; c < self->actorCount; c++) {
-			Future<Void> worker;
-			if (self->useRYW)
-				worker = self->randomReadWriteClient<ReadYourWritesTransaction>(
-				    cx, self, self->actorCount / self->transactionsPerSecond, c);
-			else
-				worker = self->randomReadWriteClient<Transaction>(
-				    cx, self, self->actorCount / self->transactionsPerSecond, c);
-			clients.push_back(worker);
+		if (self->skewRound > 0) {
+			while (self->skewRound--) {
+				wait(updateServerShards(cx, self));
+				self->startReadWriteClients(cx, clients);
+				wait(timeout(waitForAll(clients), self->testDuration / self->skewRound, Void()));
+				clients.clear();
+			}
+		} else {
+			self->startReadWriteClients(cx, clients);
+			if (!self->cancelWorkersAtDuration)
+				self->clients = clients; // Don't cancel them until check()
+
+			wait(self->cancelWorkersAtDuration ? timeout(waitForAll(clients), self->testDuration, Void())
+			                                   : delay(self->testDuration));
 		}
-
-		if (!self->cancelWorkersAtDuration)
-			self->clients = clients; // Don't cancel them until check()
-
-		wait(self->cancelWorkersAtDuration ? timeout(waitForAll(clients), self->testDuration, Void())
-		                                   : delay(self->testDuration));
 		return Void();
 	}
 
diff --git a/tests/rare/ReadSkewReadWrite.toml b/tests/rare/ReadSkewReadWrite.toml
new file mode 100644
index 0000000000..51631c9519
--- /dev/null
+++ b/tests/rare/ReadSkewReadWrite.toml
@@ -0,0 +1,14 @@
+[[test]]
+testTitle = 'RandomReadWriteTest'
+simCheckRelocationDuration = true
+connectionFailuresDisableDuration = 100000
+
+    [[test.workload]]
+    testName = 'ReadWrite'
+    testDuration = 300.0
+    skewRound = 1
+    transactionsPerSecond = 2000
+    nodeCount = 150000
+    valueBytes = 128
+    discardEdgeMeasurements = false
+    warmingDelay = 10.0
\ No newline at end of file

From f1d410abc967de23dddf185551f8daa9e00f43e9 Mon Sep 17 00:00:00 2001
From: Xiaoxi Wang
Date: Sat, 19 Feb 2022 00:01:20 -0800
Subject: [PATCH 003/299] fix indexForKey bug

---
 fdbclient/SystemData.cpp                |  1 +
 fdbserver/tester.actor.cpp              |  3 +-
 fdbserver/workloads/ReadWrite.actor.cpp | 41 +++++++++++++++++++------
 tests/CMakeLists.txt                    |  1 +
 tests/rare/ReadSkewReadWrite.toml       |  4 +--
 5 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp
index a0257347ad..af856028aa 100644
--- a/fdbclient/SystemData.cpp
+++ b/fdbclient/SystemData.cpp
@@ -312,6 +312,7 @@ std::pair<UID, Key> serverKeysDecodeServerBegin(const KeyRef& key) {
 	while(!rd.empty()) {
 		bytes.push_back((char)*rd.arenaRead(1));
 	}
+	// std::cout << bytes.size() << " " << bytes << "\n";
 	return std::make_pair(server_id, Key(bytes));
 }
 
diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp
index d552d50240..dd6a106884 100644
--- a/fdbserver/tester.actor.cpp
+++ b/fdbserver/tester.actor.cpp
@@ -106,7 +106,8 @@ int64_t KVWorkload::indexForKey(const KeyRef& key) const {
 		idx += 16;
 	}
 	ASSERT(keyBytes >= 16);
-	int64_t res = *(int64_t*)(key.begin() + idx);
+	std::string str((char*)key.begin()+idx, key.size() - idx);
+	int64_t res = std::stoll(str, nullptr, 16);
 	return res;
 }
 
diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp
index 8a08a1aa43..6cd0a4a107 100644
--- a/fdbserver/workloads/ReadWrite.actor.cpp
+++
b/fdbserver/workloads/ReadWrite.actor.cpp @@ -109,11 +109,13 @@ struct ReadWriteWorkload : KVWorkload { int extraReadConflictRangesPerTransaction, extraWriteConflictRangesPerTransaction; std::string valueString; // hot traffic pattern - double hotKeyFraction, forceHotProbability; // key based hot traffic setting + double hotKeyFraction, forceHotProbability = 0; // key based hot traffic setting // server based hot traffic setting int skewRound = 0; // skewDuration = ceil(testDuration / skewRound) double hotServerFraction = 0; // set > 0 to issue hot key based on shard map double hotServerReadFrac, hotServerWriteFrac; // hot many traffic goes to hot servers + + // hot server state typedef std::vector> IndexRangeVec; // keyForIndex generate key from index. So for a shard range, recording the start and end is enough std::vector> serverShards; // storage server and the shards it owns @@ -136,12 +138,13 @@ struct ReadWriteWorkload : KVWorkload { double loadTime, clientBegin; ReadWriteWorkload(WorkloadContext const& wcx) - : KVWorkload(wcx), loadTime(0.0), clientBegin(0), dependentReads(false), adjacentReads(false), - adjacentWrites(false), totalReadsMetric(LiteralStringRef("RWWorkload.TotalReads")), + : KVWorkload(wcx), dependentReads(false), adjacentReads(false), adjacentWrites(false), + totalReadsMetric(LiteralStringRef("RWWorkload.TotalReads")), totalRetriesMetric(LiteralStringRef("RWWorkload.TotalRetries")), aTransactions("A Transactions"), bTransactions("B Transactions"), retries("Retries"), latencies(sampleSize), readLatencies(sampleSize), commitLatencies(sampleSize), GRVLatencies(sampleSize), fullReadLatencies(sampleSize), readLatencyTotal(0), - readLatencyCount(0) { + readLatencyCount(0), loadTime(0.0), clientBegin(0) { + transactionSuccessMetric.init(LiteralStringRef("RWWorkload.SuccessfulTransaction")); transactionFailureMetric.init(LiteralStringRef("RWWorkload.FailedTransaction")); readMetric.init(LiteralStringRef("RWWorkload.Read")); @@ -226,15 +229,15 @@ struct ReadWriteWorkload : KVWorkload { // of hot keys, else it is directed to a disjoint set of cold keys hotKeyFraction = getOption(options, "hotKeyFraction"_sr, 0.0); hotServerFraction = getOption(options, "hotServerFraction"_sr, 0.0); + skewRound = getOption(options, "skewRound"_sr, 0); + hotServerReadFrac = getOption(options, "hotServerReadFrac"_sr, 0.0); + hotServerWriteFrac = getOption(options, "hotServerWriteFrac"_sr, 0.0); + double hotTrafficFraction = getOption(options, LiteralStringRef("hotTrafficFraction"), 0.0); if (hotServerFraction > 0) { - skewRound = getOption(options, "skewRound"_sr, 0); - hotServerReadFrac = getOption(options, "hotServerReadFrac"_sr, 0.0); - hotServerWriteFrac = getOption(options, "hotServerWriteFrac"_sr, 0.0); ASSERT(hotServerReadFrac >= hotServerFraction && hotServerWriteFrac >= hotServerFraction && skewRound > 0); } else if (hotKeyFraction > 0) { - double hotTrafficFraction = getOption(options, LiteralStringRef("hotTrafficFraction"), 0.0); ASSERT(hotTrafficFraction <= 1); ASSERT(hotKeyFraction <= hotTrafficFraction); // hot keys should be actually hot! 
// p(Cold key) = (1-FHP) * (1-hkf) @@ -350,6 +353,8 @@ struct ReadWriteWorkload : KVWorkload { deterministicRandom()->randomInt(minValueBytes, maxValueBytes + 1)); } + Standalone operator()(uint64_t n) { return KeyValueRef(keyForIndex(n, false), randomValue()); } + template void setupTransaction(Trans* tr) { if (batchPriority) { @@ -357,6 +362,16 @@ struct ReadWriteWorkload : KVWorkload { } } + void debugPrintServerShards() const { + for (auto it : this->serverShards) { + std::cout << it.first.toString() << ": ["; + for (auto p : it.second) { + std::cout << "[" << p.first << "," << p.second << "), "; + } + std::cout << "] \n"; + } + } + ACTOR static Future updateServerShards(Database cx, ReadWriteWorkload* self) { state RangeResult range = wait(runRYWTransaction(cx, [](Reference tr) -> Future { @@ -401,7 +416,7 @@ struct ReadWriteWorkload : KVWorkload { ASSERT(keyIndex.size() == beginServers.size()); std::sort(keyIndex.begin(), keyIndex.end()); // build self->serverShards, starting from the left shard - std::unordered_map serverShards; + std::map serverShards; int i = 0; for (auto it = beginServers.begin(); i < keyIndex.size() && it != beginServers.end(); ++i, ++it) { auto shardEnd = i < keyIndex.size() - 1 ? keyIndex[i + 1] : self->nodeCount; @@ -409,6 +424,11 @@ struct ReadWriteWorkload : KVWorkload { serverShards[id].emplace_back(keyIndex[i], shardEnd); } } + // self->serverShards is ordered by UID + for (auto it : serverShards) { + self->serverShards.emplace_back(it); + } + // self->debugPrintServerShards(); return Void(); } @@ -700,7 +720,8 @@ struct ReadWriteWorkload : KVWorkload { clients.push_back(tracePeriodically(self)); if (self->skewRound > 0) { - while (self->skewRound--) { + state int round = 0; + for (; round < self->skewRound; ++round) { wait(updateServerShards(cx, self)); self->startReadWriteClients(cx, clients); wait(timeout(waitForAll(clients), self->testDuration / self->skewRound, Void())); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index df6cf64466..78c0bdb542 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -181,6 +181,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES noSim/RandomUnitTests.toml UNIT) add_fdb_test(TEST_FILES noSim/KeyValueStoreRocksDBTest.toml IGNORE) # re-enable as needed for RocksDB. Breaks correctness tests if RocksDB is disabled. 
add_fdb_test(TEST_FILES rare/CheckRelocation.toml) + add_fdb_test(TEST_FILES rare/ReadSkewReadWrite.toml) add_fdb_test(TEST_FILES rare/ClogUnclog.toml) add_fdb_test(TEST_FILES rare/CloggedCycleWithKills.toml) add_fdb_test(TEST_FILES rare/ConfigIncrement.toml) diff --git a/tests/rare/ReadSkewReadWrite.toml b/tests/rare/ReadSkewReadWrite.toml index 51631c9519..2e5c3e228d 100644 --- a/tests/rare/ReadSkewReadWrite.toml +++ b/tests/rare/ReadSkewReadWrite.toml @@ -5,8 +5,8 @@ connectionFailuresDisableDuration = 100000 [[test.workload]] testName = 'ReadWrite' - testDuration = 300.0 - skewRound = 1 + testDuration = 30.0 + skewRound = 2 transactionsPerSecond = 2000 nodeCount = 150000 valueBytes = 128 From 7864d0c7b214c6a609dac7415ef66693243373ad Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 22 Feb 2022 09:28:13 -0800 Subject: [PATCH 004/299] finish workload --- fdbserver/workloads/ReadWrite.actor.cpp | 37 +++++++++++++++++++++---- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp index 6cd0a4a107..64546ce877 100644 --- a/fdbserver/workloads/ReadWrite.actor.cpp +++ b/fdbserver/workloads/ReadWrite.actor.cpp @@ -119,6 +119,7 @@ struct ReadWriteWorkload : KVWorkload { typedef std::vector> IndexRangeVec; // keyForIndex generate key from index. So for a shard range, recording the start and end is enough std::vector> serverShards; // storage server and the shards it owns + int hotServerCount = 0; // states of metric Int64MetricHandle totalReadsMetric; @@ -723,6 +724,7 @@ struct ReadWriteWorkload : KVWorkload { state int round = 0; for (; round < self->skewRound; ++round) { wait(updateServerShards(cx, self)); + self->setHotServers(); self->startReadWriteClients(cx, clients); wait(timeout(waitForAll(clients), self->testDuration / self->skewRound, Void())); clients.clear(); @@ -745,12 +747,35 @@ struct ReadWriteWorkload : KVWorkload { return timeSinceStart >= metricsStart && timeSinceStart < (metricsStart + metricsDuration); } - int64_t getRandomKey(uint64_t nodeCount) { - if (forceHotProbability && deterministicRandom()->random01() < forceHotProbability) + // set the last N server in serverShards as hot server + void setHotServers() { + hotServerCount = ceil(hotServerFraction * serverShards.size()); + for (int i = serverShards.size(), j = 0; j < hotServerCount; --i, ++j) { + auto idx = deterministicRandom()->randomInt(0, i); + std::swap(serverShards[idx], serverShards[i - 1]); + } + } + + int64_t getRandomKeyFromHotServer() { + ASSERT(hotServerCount > 0); + int idx, shardIdx; + idx = deterministicRandom()->randomInt(serverShards.size() - hotServerCount, serverShards.size()); + shardIdx = deterministicRandom()->randomInt(0, serverShards[idx].second.size()); + return deterministicRandom()->randomInt64(serverShards[idx].second[shardIdx].first, + serverShards[idx].second[shardIdx].second); + } + + int64_t getRandomKey(uint64_t nodeCount, bool hotServerRead = true) { + auto random = deterministicRandom()->random01(); + if (forceHotProbability && random < forceHotProbability) { return deterministicRandom()->randomInt64(0, nodeCount * hotKeyFraction) / hotKeyFraction; // spread hot keys over keyspace - else - return deterministicRandom()->randomInt64(0, nodeCount); + } else if (hotServerFraction > 0) { + if ((hotServerRead && random < hotServerReadFrac) || (!hotServerRead && random < hotServerWriteFrac)) { + return getRandomKeyFromHotServer(); + } + } + return deterministicRandom()->randomInt64(0, nodeCount); } 
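// A worked example of the two skew paths implemented above (illustrative option values):
//  * Key-based skew: with hotKeyFraction = 0.1 and hotTrafficFraction = 0.9,
//    forceHotProbability = (0.9 - 0.1) / (1 - 0.1) ~= 0.889, so
//    P(hot access) = 0.889 + (1 - 0.889) * 0.1 ~= 0.9, matching the requested 90%.
//  * Server-based skew: with hotServerFraction = 0.1 and hotServerReadFrac = 0.8,
//    setHotServers() moves ceil(0.1 * serverShards.size()) randomly chosen entries to the
//    tail of serverShards (a partial Fisher-Yates shuffle), and ~80% of read keys are then
//    drawn from index ranges owned by those tail servers via getRandomKeyFromHotServer().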
double sweepAlpha(double startTime) { @@ -868,12 +893,12 @@ struct ReadWriteWorkload : KVWorkload { break; if (self->adjacentWrites) { - int64_t startKey = self->getRandomKey(self->nodeCount - writes); + int64_t startKey = self->getRandomKey(self->nodeCount - writes, false); for (int op = 0; op < writes; op++) tr.set(self->keyForIndex(startKey + op, false), values[op]); } else { for (int op = 0; op < writes; op++) - tr.set(self->keyForIndex(self->getRandomKey(self->nodeCount), false), values[op]); + tr.set(self->keyForIndex(self->getRandomKey(self->nodeCount, false), false), values[op]); } for (int op = 0; op < extra_read_conflict_ranges; op++) tr.addReadConflictRange(extra_ranges[op]); From 3766ff259f3532d9b07b4f06432e0228aa146912 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 22 Feb 2022 16:20:15 -0800 Subject: [PATCH 005/299] change keyForIndex --- fdbserver/tester.actor.cpp | 13 ++++--- fdbserver/workloads/ReadWrite.actor.cpp | 49 ++++++++++++++++++------- fdbserver/workloads/workloads.actor.h | 2 +- tests/rare/ReadSkewReadWrite.toml | 5 ++- 4 files changed, 46 insertions(+), 23 deletions(-) diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index dd6a106884..4c609b3bd0 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -99,14 +99,16 @@ Key KVWorkload::keyForIndex(uint64_t index) const { } // the reverse process of keyForIndex() without division -int64_t KVWorkload::indexForKey(const KeyRef& key) const { +int64_t KVWorkload::indexForKey(const KeyRef& key, bool absent) const { int idx = 0; - if(nodePrefix > 0) { + if (nodePrefix > 0) { ASSERT(keyBytes >= 32); idx += 16; } ASSERT(keyBytes >= 16); - std::string str((char*)key.begin()+idx, key.size() - idx); + // extract int64_t index, the reverse process of emplaceIndex() + auto end = key.size() - idx - (absent ? 
1 : 0); + std::string str((char*)key.begin() + idx, end); int64_t res = std::stoll(str, nullptr, 16); return res; } @@ -124,9 +126,8 @@ Key KVWorkload::keyForIndex(uint64_t index, bool absent) const { idx += 16; } ASSERT(keyBytes >= 16); - double d = double(index) / nodeCount; - emplaceIndex(data, idx, *(int64_t*)&d); - + emplaceIndex(data, idx, (int64_t)index); + // ASSERT(indexForKey(result) == (int64_t)index); // debug assert return result; } diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp index 64546ce877..539d8819ad 100644 --- a/fdbserver/workloads/ReadWrite.actor.cpp +++ b/fdbserver/workloads/ReadWrite.actor.cpp @@ -231,12 +231,12 @@ struct ReadWriteWorkload : KVWorkload { hotKeyFraction = getOption(options, "hotKeyFraction"_sr, 0.0); hotServerFraction = getOption(options, "hotServerFraction"_sr, 0.0); skewRound = getOption(options, "skewRound"_sr, 0); - hotServerReadFrac = getOption(options, "hotServerReadFrac"_sr, 0.0); + hotServerReadFrac = getOption(options, "hotServerReadFrac"_sr, 0.8); hotServerWriteFrac = getOption(options, "hotServerWriteFrac"_sr, 0.0); double hotTrafficFraction = getOption(options, LiteralStringRef("hotTrafficFraction"), 0.0); if (hotServerFraction > 0) { - ASSERT(hotServerReadFrac >= hotServerFraction && hotServerWriteFrac >= hotServerFraction && + ASSERT((hotServerReadFrac >= hotServerFraction || hotServerWriteFrac >= hotServerFraction) && skewRound > 0); } else if (hotKeyFraction > 0) { ASSERT(hotTrafficFraction <= 1); @@ -380,13 +380,15 @@ struct ReadWriteWorkload : KVWorkload { return tr->getRange(serverKeysRange, CLIENT_KNOBS->TOO_MANY, Snapshot::True); })); + // clear self->serverShards + self->serverShards.clear(); + // leftEdge < workloadBegin < workloadEnd Key workloadBegin = self->keyForIndex(0), workloadEnd = self->keyForIndex(self->nodeCount); Key leftEdge(allKeys.begin); std::vector leftServer; // left server owns the range [leftEdge, workloadBegin) KeyRangeRef workloadRange(workloadBegin, workloadEnd); std::map> beginServers; // begin index to server ID - std::vector keyIndex; // shard boundary by index for (auto kv = range.begin(); kv != range.end(); kv++) { if (serverHasKey(kv->value)) { @@ -395,7 +397,6 @@ struct ReadWriteWorkload : KVWorkload { if (workloadRange.contains(key)) { auto idx = self->indexForKey(key); beginServers[idx].push_back(id); - keyIndex.push_back(idx); } else if (workloadBegin > key && key > leftEdge) { // update left boundary leftEdge = key; leftServer.clear(); @@ -409,20 +410,21 @@ struct ReadWriteWorkload : KVWorkload { ASSERT(beginServers.begin()->first >= 0); // handle the left boundary if (beginServers.begin()->first > 0) { - keyIndex.push_back(0); - beginServers[0] = std::move(leftServer); + beginServers[0] = leftServer; } // sort shard begin idx - ASSERT(keyIndex.size() == beginServers.size()); - std::sort(keyIndex.begin(), keyIndex.end()); // build self->serverShards, starting from the left shard std::map serverShards; - int i = 0; - for (auto it = beginServers.begin(); i < keyIndex.size() && it != beginServers.end(); ++i, ++it) { - auto shardEnd = i < keyIndex.size() - 1 ? 
keyIndex[i + 1] : self->nodeCount; - for (auto id : it->second) { - serverShards[id].emplace_back(keyIndex[i], shardEnd); + auto nextIt = std::next(beginServers.begin()); + for (auto it : beginServers) { + auto shardEnd = self->nodeCount; + if (nextIt != beginServers.end()) { + shardEnd = nextIt->first; + ++nextIt; + } + for (auto id : it.second) { + serverShards[id].emplace_back(it.first, shardEnd); } } // self->serverShards is ordered by UID @@ -898,7 +900,8 @@ struct ReadWriteWorkload : KVWorkload { tr.set(self->keyForIndex(startKey + op, false), values[op]); } else { for (int op = 0; op < writes; op++) - tr.set(self->keyForIndex(self->getRandomKey(self->nodeCount, false), false), values[op]); + tr.set(self->keyForIndex(self->getRandomKey(self->nodeCount, false), false), + values[op]); } for (int op = 0; op < extra_read_conflict_ranges; op++) tr.addReadConflictRange(extra_ranges[op]); @@ -997,3 +1000,21 @@ ACTOR Future>> trackInsertionCount(Datab } WorkloadFactory ReadWriteWorkloadFactory("ReadWrite"); + +TEST_CASE("/KVWorkload/methods/ParseKeyForIndex") { + auto wk = ReadWriteWorkload(WorkloadContext()); + for (int i = 0; i < 1000; ++i) { + auto idx = deterministicRandom()->randomInt64(0, wk.nodeCount); + Key k = wk.keyForIndex(idx); + auto parse = wk.indexForKey(k); + // std::cout << parse << " " << idx << "\n"; + ASSERT(parse == idx); + } + for (int i = 0; i < 1000; ++i) { + auto idx = deterministicRandom()->randomInt64(0, wk.nodeCount); + Key k = wk.keyForIndex(idx, true); + auto parse = wk.indexForKey(k, true); + ASSERT(parse == idx); + } + return Void(); +} \ No newline at end of file diff --git a/fdbserver/workloads/workloads.actor.h b/fdbserver/workloads/workloads.actor.h index 9515c3df7a..c29b52e5e0 100644 --- a/fdbserver/workloads/workloads.actor.h +++ b/fdbserver/workloads/workloads.actor.h @@ -101,7 +101,7 @@ struct KVWorkload : TestWorkload { Key getRandomKey(bool absent) const; Key keyForIndex(uint64_t index) const; Key keyForIndex(uint64_t index, bool absent) const; - int64_t indexForKey(const KeyRef& key) const; + int64_t indexForKey(const KeyRef& key, bool absent = false) const; }; struct IWorkloadFactory : ReferenceCounted { diff --git a/tests/rare/ReadSkewReadWrite.toml b/tests/rare/ReadSkewReadWrite.toml index 2e5c3e228d..0c95c78c75 100644 --- a/tests/rare/ReadSkewReadWrite.toml +++ b/tests/rare/ReadSkewReadWrite.toml @@ -6,9 +6,10 @@ connectionFailuresDisableDuration = 100000 [[test.workload]] testName = 'ReadWrite' testDuration = 30.0 - skewRound = 2 + skewRound = 1 transactionsPerSecond = 2000 nodeCount = 150000 valueBytes = 128 discardEdgeMeasurements = false - warmingDelay = 10.0 \ No newline at end of file + warmingDelay = 10.0 + hotServerFraction = 0.1 \ No newline at end of file From e683906c7c0bc1c91f4c33f699fa051de464a612 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 22 Feb 2022 17:57:23 -0800 Subject: [PATCH 006/299] fix assertion error --- fdbserver/workloads/ReadWrite.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp index 539d8819ad..6911574006 100644 --- a/fdbserver/workloads/ReadWrite.actor.cpp +++ b/fdbserver/workloads/ReadWrite.actor.cpp @@ -407,9 +407,9 @@ struct ReadWriteWorkload : KVWorkload { } } } - ASSERT(beginServers.begin()->first >= 0); + ASSERT(beginServers.size() == 0 || beginServers.begin()->first >= 0); // handle the left boundary - if (beginServers.begin()->first > 0) { + if (beginServers.size() == 0 || 
beginServers.begin()->first > 0) { beginServers[0] = leftServer; } From a07e413eae991f01bb38ce8c1a484fdca5bdb80a Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 24 Feb 2022 16:41:01 -0800 Subject: [PATCH 007/299] metrics comparator; rebalanceReadLoad() --- fdbclient/StorageServerInterface.h | 1 + fdbserver/DDTeamCollection.actor.cpp | 3 +- fdbserver/DataDistribution.actor.h | 20 ++++++--- fdbserver/DataDistributionQueue.actor.cpp | 48 ++++++++++++++++++++- fdbserver/DataDistributionTracker.actor.cpp | 24 ++++++++--- fdbserver/TCInfo.actor.cpp | 1 + 6 files changed, 82 insertions(+), 15 deletions(-) diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h index 9e40f95e1f..32a992fa3d 100644 --- a/fdbclient/StorageServerInterface.h +++ b/fdbclient/StorageServerInterface.h @@ -481,6 +481,7 @@ struct StorageMetrics { int64_t bytesPerKSecond = 0; // network bandwidth (average over 10s) int64_t iosPerKSecond = 0; int64_t bytesReadPerKSecond = 0; + Optional keys; // this metric belongs to which range static const int64_t infinity = 1LL << 60; diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 8a10576a98..2439cc1dbe 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -148,6 +148,7 @@ public: ACTOR static Future getTeam(DDTeamCollection* self, GetTeamRequest req) { try { wait(self->checkBuildTeams()); + // report the median available space if (now() - self->lastMedianAvailableSpaceUpdate > SERVER_KNOBS->AVAILABLE_SPACE_UPDATE_DELAY) { self->lastMedianAvailableSpaceUpdate = now(); std::vector teamAvailableSpace; @@ -159,7 +160,7 @@ public: } size_t pivot = teamAvailableSpace.size() / 2; - if (teamAvailableSpace.size() > 1) { + if (teamAvailableSpace.size() >= 1) { std::nth_element( teamAvailableSpace.begin(), teamAvailableSpace.begin() + pivot, teamAvailableSpace.end()); self->medianAvailableSpace = diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index 3adca0b20e..ddcc81ff8c 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -77,10 +77,11 @@ struct IDataDistributionTeam { }; struct GetTeamRequest { - bool wantsNewServers; + bool wantsNewServers; // In additional to servers in completeSources, try to find teams with new server bool wantsTrueBest; - bool preferLowerUtilization; + bool preferLowerUtilization; // = false --> higher utilization team will be returned bool teamMustHaveShards; + bool preferLowerReadTraffic; double inflightPenalty; std::vector completeSources; std::vector src; @@ -91,16 +92,18 @@ struct GetTeamRequest { bool wantsTrueBest, bool preferLowerUtilization, bool teamMustHaveShards, + bool preferLowerReadTraffic = false, double inflightPenalty = 1.0) : wantsNewServers(wantsNewServers), wantsTrueBest(wantsTrueBest), preferLowerUtilization(preferLowerUtilization), - teamMustHaveShards(teamMustHaveShards), inflightPenalty(inflightPenalty) {} + teamMustHaveShards(teamMustHaveShards), preferLowerReadTraffic(preferLowerReadTraffic), + inflightPenalty(inflightPenalty) {} std::string getDesc() const { std::stringstream ss; ss << "WantsNewServers:" << wantsNewServers << " WantsTrueBest:" << wantsTrueBest << " PreferLowerUtilization:" << preferLowerUtilization << " teamMustHaveShards:" << teamMustHaveShards - << " inflightPenalty:" << inflightPenalty << ";"; + << " PreferLowerReadTraffic" << preferLowerReadTraffic << " inflightPenalty:" << inflightPenalty << ";"; ss << "CompleteSources:"; for 
(const auto& cs : completeSources) {
 			ss << cs.toString() << ",";
@@ -111,11 +114,16 @@ struct GetTeamRequest {
 };
 
 struct GetMetricsRequest {
-	KeyRange keys;
+	// whether a < b
+	typedef std::function<bool(const StorageMetrics&, const StorageMetrics&)> MetricsComparator;
+	std::vector<KeyRange> keys;
 	Promise<StorageMetrics> reply;
+	Optional<MetricsComparator>
+	    comparator; // if a comparator is assigned, return the largest metrics among keys, otherwise return their sum
 
 	GetMetricsRequest() {}
-	GetMetricsRequest(KeyRange const& keys) : keys(keys) {}
+	GetMetricsRequest(KeyRange const& keys) : keys({ keys }) {}
+	GetMetricsRequest(std::vector<KeyRange> const& keys) : keys(keys) {}
 };
 
 struct GetMetricsListRequest {
diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp
index fcbc73dbed..6e1bdde4d3 100644
--- a/fdbserver/DataDistributionQueue.actor.cpp
+++ b/fdbserver/DataDistributionQueue.actor.cpp
@@ -1021,6 +1021,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd,
 			                        rd.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM,
 			                        true,
 			                        false,
+			                        false,
 			                        inflightPenalty);
 			req.src = rd.src;
 			req.completeSources = rd.completeSources;
@@ -1283,6 +1284,49 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd,
 	}
 }
 
+// Move the shard with the highest read density from sourceTeam to destTeam if sourceTeam carries much more read
+// load than destTeam
+ACTOR Future<bool> rebalanceReadLoad(DDQueueData* self,
+                                     int priority,
+                                     Reference<IDataDistributionTeam> sourceTeam,
+                                     Reference<IDataDistributionTeam> destTeam,
+                                     bool primary,
+                                     TraceEvent* traceEvent) {
+	if (g_network->isSimulated() && g_simulator.speedUpSimulation) {
+		traceEvent->detail("CancelingDueToSimulationSpeedup", true);
+		return false;
+	}
+
+	state std::vector<KeyRange> shards = self->shardsAffectedByTeamFailure->getShardsFor(
+	    ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary));
+	if (!shards.size())
+		return false;
+
+	state GetMetricsRequest req(shards);
+	// pick the shard with the largest read density (read bandwidth per stored byte)
+	req.comparator = [](const StorageMetrics& a, const StorageMetrics& b) {
+		if (a.bytes > 0 && b.bytes > 0) {
+			return ((double)a.bytesReadPerKSecond / a.bytes) < ((double)b.bytesReadPerKSecond / b.bytes);
+		}
+		return a.allLessOrEqual(b);
+	};
+
+	state StorageMetrics metrics = wait(brokenPromiseToNever(self->getShardMetrics.getReply(req)));
+	if(metrics.keys.present() && metrics.bytes > 0) {
+		// Verify the shard is still in ShardsAffectedByTeamFailure
+		shards = self->shardsAffectedByTeamFailure->getShardsFor(
+		    ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary));
+		for (int i = 0; i < shards.size(); i++) {
+			if (metrics.keys == shards[i]) {
+				traceEvent->detail("ShardStillPresent", true);
+				self->output.send(RelocateShard(metrics.keys.get(), priority));
+				return true;
+			}
+		}
+		traceEvent->detail("ShardStillPresent", false);
+	}
+	return false;
+}
+
 // Move a random shard of sourceTeam's to destTeam if sourceTeam has much more data than destTeam
 ACTOR Future<bool> rebalanceTeams(DDQueueData* self,
                                   int priority,
@@ -1504,7 +1548,7 @@ ACTOR Future<Void> BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex)
 			    SERVER_KNOBS->DD_REBALANCE_PARALLELISM) {
 				std::pair<Optional<Reference<IDataDistributionTeam>>, bool> _randomTeam =
 				    wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply(
-				        GetTeamRequest(true, false, false, true))));
+				        GetTeamRequest(true, false, false, true, false))));
 				randomTeam = _randomTeam;
 				traceEvent.detail("SourceTeam",
 				                  printable(randomTeam.first.map(
 				                      [](const Reference<IDataDistributionTeam>& team) { return team->getDesc(); })));
 
 				if (randomTeam.first.present()) {
 					std::pair<Optional<Reference<IDataDistributionTeam>>, bool> unloadedTeam =
 					    wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply(
-					        GetTeamRequest(true, true, true, false))));
+					        GetTeamRequest(true, true, true, false, true))));
 
 					traceEvent.detail(
 					    "DestTeam",
 					    printable(unloadedTeam.first.map(
 					        [](const Reference<IDataDistributionTeam>& team) { return team->getDesc(); })));
diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp
index 11b8cdbf6f..49a51f7c3e 100644
--- a/fdbserver/DataDistributionTracker.actor.cpp
+++ b/fdbserver/DataDistributionTracker.actor.cpp
@@ -837,13 +837,25 @@ ACTOR Future<Void> fetchShardMetrics_impl(DataDistributionTracker* self, GetMetricsRequest req) {
 	loop {
 		Future<Void> onChange;
 		StorageMetrics returnMetrics;
-		for (auto t : self->shards.intersectingRanges(req.keys)) {
-			auto& stats = t.value().stats;
-			if (!stats->get().present()) {
-				onChange = stats->onChange();
-				break;
+		for (auto range : req.keys) {
+			StorageMetrics metrics;
+			for (auto t : self->shards.intersectingRanges(range)) {
+				auto& stats = t.value().stats;
+				if (!stats->get().present()) {
+					onChange = stats->onChange();
+					break;
+				}
+				metrics += t.value().stats->get().get().metrics;
+			}
+
+			if (req.comparator.present()) {
+				if (req.comparator.get()(returnMetrics, metrics)) {
+					returnMetrics = metrics;
+					returnMetrics.keys = range;
+				}
+			} else {
+				returnMetrics += metrics;
 			}
-			returnMetrics += t.value().stats->get().get().metrics;
 		}
 
 		if (!onChange.isValid()) {
diff --git a/fdbserver/TCInfo.actor.cpp b/fdbserver/TCInfo.actor.cpp
index c1371136ee..2718770c6b 100644
--- a/fdbserver/TCInfo.actor.cpp
+++ b/fdbserver/TCInfo.actor.cpp
@@ -357,6 +357,7 @@ int64_t TCTeamInfo::getMinAvailableSpace(bool includeInFlight) const {
 	return minAvailableSpace; // Could be negative
 }
 
+// returns the minimum available-space ratio among the servers in this team
 double TCTeamInfo::getMinAvailableSpaceRatio(bool includeInFlight) const {
 	double minRatio = 1.0;
 	for (const auto& server : servers) {

From 40a1f562a731c5879e8801f016a400bef561b37d Mon Sep 17 00:00:00 2001
From: Xiaoxi Wang
Date: Fri, 25 Feb 2022 11:01:23 -0800
Subject: [PATCH 008/299] temporary change special key for data distributor

---
 fdbcli/DataDistributionCommand.actor.cpp  |  45 ++++---
 fdbclient/SpecialKeySpace.actor.cpp       |  27 ++--
 fdbclient/SystemData.h                    |   1 +
 fdbserver/DataDistributionQueue.actor.cpp | 117 ++++++++++++------
 .../SpecialKeySpaceCorrectness.actor.cpp   |   2 +-
 5 files changed, 130 insertions(+), 62 deletions(-)

diff --git a/fdbcli/DataDistributionCommand.actor.cpp b/fdbcli/DataDistributionCommand.actor.cpp
index a7dc171ef1..399b3836f3 100644
--- a/fdbcli/DataDistributionCommand.actor.cpp
+++ b/fdbcli/DataDistributionCommand.actor.cpp
@@ -61,13 +61,20 @@ ACTOR Future<Void> setDDMode(Reference<IDatabase> db, int mode) {
 	}
 }
 
-ACTOR Future<Void> setDDIgnoreRebalanceSwitch(Reference<IDatabase> db, bool ignoreRebalance) {
+ACTOR Future<Void> setDDIgnoreRebalanceSwitch(Reference<IDatabase> db, int DDIgnoreOption) {
 	state Reference<ITransaction> tr = db->createTransaction();
 	loop {
 		tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
 		try {
-			if (ignoreRebalance) {
-				tr->set(fdb_cli::ddIgnoreRebalanceSpecialKey, ValueRef());
+			if (DDIgnoreOption > 0) {
+				Optional<Value> v = wait(safeThreadFutureToFuture(tr->get(fdb_cli::ddIgnoreRebalanceSpecialKey)));
+				if (v.present() && v.get().size() > 0) {
+					int oldValue = BinaryReader::fromStringRef<int>(v.get(), Unversioned());
+					// merge with any previously ignored options; '&' here would drop them
+					tr->set(fdb_cli::ddIgnoreRebalanceSpecialKey,
+					        BinaryWriter::toValue(DDIgnoreOption | oldValue, Unversioned()));
+				} else {
+					tr->set(fdb_cli::ddIgnoreRebalanceSpecialKey, BinaryWriter::toValue(DDIgnoreOption, Unversioned()));
+				}
+			} else if (DDIgnoreOption < 0) {
+				// a complement mask (~option) clears just that option and keeps the rest
+				Optional<Value> old = wait(safeThreadFutureToFuture(tr->get(fdb_cli::ddIgnoreRebalanceSpecialKey)));
+				int newValue = old.present() && old.get().size() > 0
+				                   ? BinaryReader::fromStringRef<int>(old.get(), Unversioned()) & DDIgnoreOption
+				                   : 0;
+				if (newValue > 0) {
+					tr->set(fdb_cli::ddIgnoreRebalanceSpecialKey, BinaryWriter::toValue(newValue, Unversioned()));
+				} else {
+					tr->clear(fdb_cli::ddIgnoreRebalanceSpecialKey);
+				}
 			} else {
 				tr->clear(fdb_cli::ddIgnoreRebalanceSpecialKey);
 			}
@@ -85,12 +92,13 @@
namespace fdb_cli {
 
 const KeyRef ddModeSpecialKey = LiteralStringRef("\xff\xff/management/data_distribution/mode");
 const KeyRef ddIgnoreRebalanceSpecialKey = LiteralStringRef("\xff\xff/management/data_distribution/rebalance_ignored");
-
+constexpr auto usage =
+    "Usage: datadistribution <on|off|disable <ssfailure|rebalance|rebalance_disk|rebalance_read>|enable "
+    "<ssfailure|rebalance|rebalance_disk|rebalance_read>>\n";
 ACTOR Future<bool> dataDistributionCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens) {
 	state bool result = true;
 	if (tokens.size() != 2 && tokens.size() != 3) {
-		printf("Usage: datadistribution <on|off|disable <ssfailure|rebalance>|enable "
-		       "<ssfailure|rebalance>>\n");
+		printf(usage);
 		result = false;
 	} else {
 		if (tokencmp(tokens[1], "on")) {
@@ -104,11 +112,16 @@ ACTOR Future<bool> dataDistributionCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens) {
 				wait(success((setHealthyZone(db, LiteralStringRef("IgnoreSSFailures"), 0))));
 				printf("Data distribution is disabled for storage server failures.\n");
 			} else if (tokencmp(tokens[2], "rebalance")) {
-				wait(setDDIgnoreRebalanceSwitch(db, true));
+				wait(setDDIgnoreRebalanceSwitch(db, DDIgnore::REBALANCE_DISK | DDIgnore::REBALANCE_READ));
 				printf("Data distribution is disabled for rebalance.\n");
+			} else if (tokencmp(tokens[2], "rebalance_disk")) {
+				wait(setDDIgnoreRebalanceSwitch(db, DDIgnore::REBALANCE_DISK));
+				printf("Data distribution is disabled for rebalance_disk.\n");
+			} else if (tokencmp(tokens[2], "rebalance_read")) {
+				wait(setDDIgnoreRebalanceSwitch(db, DDIgnore::REBALANCE_READ));
+				printf("Data distribution is disabled for rebalance_read.\n");
 			} else {
-				printf("Usage: datadistribution <on|off|disable <ssfailure|rebalance>|enable "
-				       "<ssfailure|rebalance>>\n");
+				printf(usage);
 				result = false;
 			}
 		} else if (tokencmp(tokens[1], "enable")) {
@@ -116,16 +129,20 @@ ACTOR Future<bool> dataDistributionCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens) {
 				wait(success((clearHealthyZone(db, false, true))));
 				printf("Data distribution is enabled for storage server failures.\n");
 			} else if (tokencmp(tokens[2], "rebalance")) {
-				wait(setDDIgnoreRebalanceSwitch(db, false));
+				wait(setDDIgnoreRebalanceSwitch(db, 0));
 				printf("Data distribution is enabled for rebalance.\n");
+			} else if (tokencmp(tokens[2], "rebalance_disk")) {
+				wait(setDDIgnoreRebalanceSwitch(db, ~DDIgnore::REBALANCE_DISK));
+				printf("Data distribution is enabled for rebalance_disk.\n");
+			} else if (tokencmp(tokens[2], "rebalance_read")) {
+				wait(setDDIgnoreRebalanceSwitch(db, ~DDIgnore::REBALANCE_READ));
+				printf("Data distribution is enabled for rebalance_read.\n");
 			} else {
-				printf("Usage: datadistribution <on|off|disable <ssfailure|rebalance>|enable "
-				       "<ssfailure|rebalance>>\n");
+				printf(usage);
 				result = false;
 			}
 		} else {
-			printf("Usage: datadistribution <on|off|disable <ssfailure|rebalance>|enable "
-			       "<ssfailure|rebalance>>\n");
+			printf(usage);
 			result = false;
 		}
 	}
diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp
index 6c1db7639f..bf6d67fc79 100644
--- a/fdbclient/SpecialKeySpace.actor.cpp
+++ b/fdbclient/SpecialKeySpace.actor.cpp
@@ -2440,14 +2440,25 @@ Future<Optional<std::string>> DataDistributionImpl::commit(ReadYourWritesTransaction* ryw) {
 				                                 iter->value().second.get().toString());
 			}
 		} else if (iter->range() == singleKeyRange(rebalanceIgnoredKey)) {
-			if (iter->value().second.get().size())
-				msg =
-				    ManagementAPIError::toJsonString(false,
-				                                     "datadistribution",
-				                                     "Value is unused for the data_distribution/rebalance_ignored "
-				                                     "key, please set it to an empty value");
-			else
-				ryw->getTransaction().set(rebalanceDDIgnoreKey, LiteralStringRef("on"));
+			ValueRef val = iter->value().second.get();
+			try {
+				boost::lexical_cast<int>(iter->value().second.get().toString());
+			} catch (boost::bad_lexical_cast& e) {
+				msg = ManagementAPIError::toJsonString(
+				    false,
+				    "datadistribution",
+				    "Invalid datadistribution rebalance ignore option (int or empty): " +
+				        iter->value().second.get().toString());
+				val = ""_sr;
+			}
+			ryw->getTransaction().set(rebalanceDDIgnoreKey, val);
 		} else {
 			msg = ManagementAPIError::toJsonString(
 			    false,
diff --git a/fdbclient/SystemData.h b/fdbclient/SystemData.h
index 88717b9d4c..48db497a86 100644
--- a/fdbclient/SystemData.h
+++ b/fdbclient/SystemData.h
@@ -486,6 +486,7 @@ extern const KeyRangeRef monitorConfKeys;
 extern const KeyRef healthyZoneKey;
 extern const StringRef ignoreSSFailuresZoneString;
 extern const KeyRef rebalanceDDIgnoreKey;
+enum DDIgnore { REBALANCE_DISK = 1, REBALANCE_READ = 2 };
 
 const Value healthyZoneValue(StringRef const& zoneId, Version version);
 std::pair<Key, Version> decodeHealthyZoneValue(ValueRef const&);
diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp
index 6e1bdde4d3..c53f8f436c 100644
--- a/fdbserver/DataDistributionQueue.actor.cpp
+++ b/fdbserver/DataDistributionQueue.actor.cpp
@@ -1311,7 +1311,7 @@ ACTOR Future<bool> rebalanceReadLoad(DDQueueData* self,
 	};
 
 	state StorageMetrics metrics = wait(brokenPromiseToNever(self->getShardMetrics.getReply(req)));
-	if(metrics.keys.present() && metrics.bytes > 0) {
+	if (metrics.keys.present() && metrics.bytes > 0) {
 		// Verify the shard is still in ShardsAffectedByTeamFailure
 		shards = self->shardsAffectedByTeamFailure->getShardsFor(
 		    ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary));
@@ -1397,15 +1397,49 @@ ACTOR Future<bool> rebalanceTeams(DDQueueData* self,
 	return false;
 }
 
+ACTOR Future<Void> getSrcDestTeams(DDQueueData* self,
+                                   int teamCollectionIndex,
+                                   GetTeamRequest srcReq,
+                                   GetTeamRequest destReq,
+                                   Reference<IDataDistributionTeam>* sourceTeam,
+                                   Reference<IDataDistributionTeam>* destTeam,
+                                   int priority,
+                                   TraceEvent* traceEvent) {
+	// the teams are returned through the two pointers: ACTOR parameters are copied into
+	// the actor's state, so assignments to by-value parameters would be lost to the caller
+	std::pair<Optional<Reference<IDataDistributionTeam>>, bool> randomTeam =
+	    wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply(destReq)));
+	traceEvent->detail("DestTeam",
+	                   printable(randomTeam.first.map(
+	                       [](const Reference<IDataDistributionTeam>& team) { return team->getDesc(); })));
+
+	if (randomTeam.first.present()) {
+		*destTeam = randomTeam.first.get();
+		std::pair<Optional<Reference<IDataDistributionTeam>>, bool> loadedTeam =
+		    wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply(srcReq)));
+
+		traceEvent->detail("SourceTeam",
+		                   printable(loadedTeam.first.map(
+		                       [](const Reference<IDataDistributionTeam>& team) { return team->getDesc(); })));
+
+		if (loadedTeam.first.present())
+			*sourceTeam = loadedTeam.first.get();
+	}
+	return Void();
+}
+
 ACTOR Future<Void> BgDDMountainChopper(DDQueueData* self, int teamCollectionIndex) {
 	state double rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL;
 	state int resetCount = SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT;
 	state Transaction tr(self->cx);
 	state double lastRead = 0;
 	state bool skipCurrentLoop = false;
+	state Reference<IDataDistributionTeam> sourceTeam, destTeam;
 	loop {
-		state std::pair<Optional<Reference<IDataDistributionTeam>>, bool> randomTeam;
 		state bool moved = false;
+		state bool disableReadBalance = false;
+		state bool disableDiskBalance = false;
+		state GetTeamRequest srcReq;
+		state GetTeamRequest destReq;
 		state TraceEvent traceEvent("BgDDMountainChopper", self->distributorId);
 		traceEvent.suppressFor(5.0).detail("PollingInterval", rebalancePollingInterval);
 
@@ -1422,8 +1456,21 @@ ACTOR Future<Void> BgDDMountainChopper(DDQueueData* self, int teamCollectionIndex) {
 				if (skipCurrentLoop && !val.present()) {
 					// reset loop interval
 					rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL;
+				} else if (val.present()) {
+					if (val.get().size() > 0) {
+						int ddIgnore = BinaryReader::fromStringRef<int>(val.get(), Unversioned());
+						if (ddIgnore & DDIgnore::REBALANCE_DISK) {
+							disableDiskBalance = true;
+						}
+						if (ddIgnore & DDIgnore::REBALANCE_READ) {
+							disableReadBalance = true;
+						}
+						skipCurrentLoop = disableReadBalance && disableDiskBalance;
+					} else {
+						skipCurrentLoop = true;
+					}
 				}
-				skipCurrentLoop = val.present();
 			}
 
 			traceEvent.detail("Enabled", !skipCurrentLoop);
@@ -1440,39 +1487,30 @@ ACTOR Future<Void> BgDDMountainChopper(DDQueueData* self, int teamCollectionIndex) {
 			    self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM]);
 			if (self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM] <
 			    SERVER_KNOBS->DD_REBALANCE_PARALLELISM) {
-				std::pair<Optional<Reference<IDataDistributionTeam>>, bool> _randomTeam =
-				    wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply(
-				        GetTeamRequest(true, false, true, false))));
-				randomTeam = _randomTeam;
-				traceEvent.detail("DestTeam",
-				                  printable(randomTeam.first.map(
-				                      [](const Reference<IDataDistributionTeam>& team) { return team->getDesc(); })));
-
-				if (randomTeam.first.present()) {
-					std::pair<Optional<Reference<IDataDistributionTeam>>, bool> loadedTeam =
-					    wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply(
-					        GetTeamRequest(true, true, false, true))));
-
-					traceEvent.detail(
-					    "SourceTeam",
-					    printable(loadedTeam.first.map(
-					        [](const Reference<IDataDistributionTeam>& team) { return team->getDesc(); })));
-
-					if (loadedTeam.first.present()) {
-						bool _moved = wait(rebalanceTeams(self,
-						                                  SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM,
-						                                  loadedTeam.first.get(),
-						                                  randomTeam.first.get(),
-						                                  teamCollectionIndex == 0,
-						                                  &traceEvent));
-						moved = _moved;
-						if (moved) {
-							resetCount = 0;
-						} else {
-							resetCount++;
-						}
+				// FIXME: read balance and disk balance shouldn't be mutually exclusive in the future
+				if (!disableReadBalance) {
+					srcReq = GetTeamRequest(true, true, false, true, false);
+					destReq = GetTeamRequest(true, false, true, false, false);
+				} else {
+					srcReq = GetTeamRequest(true, true, false, true);
+					destReq = GetTeamRequest(true, false, true, false);
+				}
+				// clang-format off
+				wait(getSrcDestTeams(self, teamCollectionIndex, srcReq, destReq, &sourceTeam, &destTeam,
+				                     SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM, &traceEvent));
+				if (sourceTeam.isValid() && destTeam.isValid()) {
+					if (!disableReadBalance) {
+						wait(store(moved, rebalanceReadLoad(self, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM,
+						                                    sourceTeam, destTeam, teamCollectionIndex == 0,
+						                                    &traceEvent)));
+					} else {
+						wait(store(moved, rebalanceTeams(self, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM,
+						                                 sourceTeam, destTeam, teamCollectionIndex == 0,
+						                                 &traceEvent)));
 					}
 				}
+				// clang-format on
+				moved ? resetCount = 0 : resetCount++;
 			}
 
 			if (now() - (*self->lastLimited) < SERVER_KNOBS->BG_DD_SATURATION_DELAY) {
@@ -1658,7 +1696,8 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
 	loop {
 		self.validate();
 
-		// For the given servers that caused us to go around the loop, find the next item(s) that can be launched.
+		// For the given servers that caused us to go around the loop, find the next item(s) that can be
+		// launched.
if (launchData.startTime != -1) { // Launch dataDistributionRelocator actor to relocate the launchData self.launchQueuedWork(launchData, ddEnabledState); @@ -1748,8 +1787,8 @@ ACTOR Future dataDistributionQueue(Database cx, .detail("PriorityTeam0Left", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_0_LEFT]) .detail("PrioritySplitShard", self.priority_relocations[SERVER_KNOBS->PRIORITY_SPLIT_SHARD]) .trackLatest("MovingData"); // This trace event's trackLatest lifetime is controlled by - // DataDistributorData::movingDataEventHolder. The track latest key - // we use here must match the key used in the holder. + // DataDistributorData::movingDataEventHolder. The track latest + // key we use here must match the key used in the holder. } when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator when(wait(waitForAll(balancingFutures))) {} @@ -1759,8 +1798,8 @@ ACTOR Future dataDistributionQueue(Database cx, } } } catch (Error& e) { - if (e.code() != error_code_broken_promise && // FIXME: Get rid of these broken_promise errors every time we are - // killed by the master dying + if (e.code() != error_code_broken_promise && // FIXME: Get rid of these broken_promise errors every time we + // are killed by the master dying e.code() != error_code_movekeys_conflict) TraceEvent(SevError, "DataDistributionQueueError", distributorId).error(e); throw e; diff --git a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp index de8af3c4f1..65e443a333 100644 --- a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp +++ b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp @@ -1399,7 +1399,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { // check DD disabled for rebalance Optional val3 = wait(tx->get(rebalanceDDIgnoreKey)); // default value "on" - ASSERT(val3.present() && val3.get() == LiteralStringRef("on")); + ASSERT(val3.present()); tx->reset(); break; } catch (Error& e) { From 5e74b5006e2f39e48495e7aaf72484a225c1114d Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 28 Feb 2022 10:22:32 -0800 Subject: [PATCH 009/299] fix uninitialized member --- fdbserver/DDTeamCollection.actor.cpp | 4 +- fdbserver/DataDistribution.actor.h | 15 ++-- fdbserver/DataDistributionQueue.actor.cpp | 94 +++++++++++++---------- fdbserver/TCInfo.actor.cpp | 14 ++++ fdbserver/TCInfo.h | 2 + 5 files changed, 82 insertions(+), 47 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 2439cc1dbe..6fee55ca58 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -245,7 +245,9 @@ public: (!req.preferLowerUtilization || self->teams[currentIndex]->hasHealthyAvailableSpace(self->medianAvailableSpace))) { int64_t loadBytes = self->teams[currentIndex]->getLoadBytes(true, req.inflightPenalty); - if ((!bestOption.present() || (req.preferLowerUtilization && loadBytes < bestLoadBytes) || + if ((!bestOption.present() || + ((bool)req.teamSorter && req.teamSorter(bestOption.get(), self->teams[currentIndex])) || + (req.preferLowerUtilization && loadBytes < bestLoadBytes) || (!req.preferLowerUtilization && loadBytes > bestLoadBytes)) && (!req.teamMustHaveShards || self->shardsAffectedByTeamFailure->hasShards(ShardsAffectedByTeamFailure::Team( diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index ddcc81ff8c..b772cb5ef9 100644 --- a/fdbserver/DataDistribution.actor.h +++ 
b/fdbserver/DataDistribution.actor.h @@ -47,6 +47,7 @@ struct IDataDistributionTeam { virtual void addDataInFlightToTeam(int64_t delta) = 0; virtual int64_t getDataInFlightToTeam() const = 0; virtual int64_t getLoadBytes(bool includeInFlight = true, double inflightPenalty = 1.0) const = 0; + virtual double getLoadReadBandwidth() const = 0; virtual int64_t getMinAvailableSpace(bool includeInFlight = true) const = 0; virtual double getMinAvailableSpaceRatio(bool includeInFlight = true) const = 0; virtual bool hasHealthyAvailableSpace(double minRatio) const = 0; @@ -81,29 +82,33 @@ struct GetTeamRequest { bool wantsTrueBest; bool preferLowerUtilization; // = false --> higher utilization team will be returned bool teamMustHaveShards; - bool preferLowerReadTraffic; double inflightPenalty; std::vector completeSources; std::vector src; Promise>, bool>> reply; + // optional + typedef Reference TeamRef; + std::function hardConstraint; + std::function + teamSorter; // => true if a.score < b.score, the reply will choose the largest one + GetTeamRequest() {} GetTeamRequest(bool wantsNewServers, bool wantsTrueBest, bool preferLowerUtilization, bool teamMustHaveShards, - bool preferLowerReadTraffic = false, double inflightPenalty = 1.0) : wantsNewServers(wantsNewServers), wantsTrueBest(wantsTrueBest), preferLowerUtilization(preferLowerUtilization), - teamMustHaveShards(teamMustHaveShards), preferLowerReadTraffic(preferLowerReadTraffic), - inflightPenalty(inflightPenalty) {} + teamMustHaveShards(teamMustHaveShards), inflightPenalty(inflightPenalty) {} std::string getDesc() const { std::stringstream ss; ss << "WantsNewServers:" << wantsNewServers << " WantsTrueBest:" << wantsTrueBest << " PreferLowerUtilization:" << preferLowerUtilization << " teamMustHaveShards:" << teamMustHaveShards - << " PreferLowerReadTraffic" << preferLowerReadTraffic << " inflightPenalty:" << inflightPenalty << ";"; + << " inflightPenalty:" << inflightPenalty << " hardConstraint: " << (bool)hardConstraint + << " teamSorter: " << (bool)teamSorter << ";"; ss << "CompleteSources:"; for (const auto& cs : completeSources) { ss << cs.toString() << ","; diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index c53f8f436c..5333235610 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -174,6 +174,10 @@ public: }); } + double getLoadReadBandwidth() const override { + return sum([](IDataDistributionTeam const& team) { return team.getLoadReadBandwidth(); }); + } + int64_t getMinAvailableSpace(bool includeInFlight = true) const override { int64_t result = std::numeric_limits::max(); for (const auto& team : teams) { @@ -1021,7 +1025,6 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, rd.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, true, false, - false, inflightPenalty); req.src = rd.src; req.completeSources = rd.completeSources; @@ -1427,17 +1430,22 @@ ACTOR Future getSrcDestTeams(DDQueueData* self, return Void(); } +bool greaterReadLoad(Reference a, Reference b) { + return a->getLoadReadBandwidth() > b->getLoadReadBandwidth(); +} + ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionIndex) { state double rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; state int resetCount = SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT; state Transaction tr(self->cx); state double lastRead = 0; state bool skipCurrentLoop = false; - state Reference sourceTeam, destTeam; loop { 
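A sketch of how callers are expected to drive the teamSorter hook added to GetTeamRequest above, assuming the types declared in this patch; the comparator is a strict "less by score" relation and the reply keeps the highest-scoring candidate (the later "fix teamSorter setting" commit arrives at the same shape):

    GetTeamRequest srcReq(/*wantsNewServers=*/true, /*wantsTrueBest=*/true,
                          /*preferLowerUtilization=*/false, /*teamMustHaveShards=*/true);
    srcReq.teamSorter = [](Reference<IDataDistributionTeam> a, Reference<IDataDistributionTeam> b) {
        // lower read bandwidth => lower score, so the reply picks the busiest team as source
        return a->getLoadReadBandwidth() < b->getLoadReadBandwidth();
    };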
state bool moved = false; state bool disableReadBalance = false; state bool disableDiskBalance = false; + state Reference sourceTeam; + state Reference destTeam; state GetTeamRequest srcReq; state GetTeamRequest destReq; state TraceEvent traceEvent("BgDDMountainChopper", self->distributorId); @@ -1457,8 +1465,7 @@ ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionInde // reset loop interval rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; } else if (val.present()) { - if(val.get().size() > 0) { - std::cout << val.get().toString() <<"\n"; + if (val.get().size() > 0) { int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); if (ddIgnore & DDIgnore::REBALANCE_DISK) { disableReadBalance = true; @@ -1488,12 +1495,10 @@ ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionInde if (self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM] < SERVER_KNOBS->DD_REBALANCE_PARALLELISM) { // FIXME: read balance and disk balance shouldn't be mutually exclusive in the future + srcReq = GetTeamRequest(true, true, false, true); + destReq = GetTeamRequest(true, false, true, false); if (!disableReadBalance) { - srcReq = GetTeamRequest(true, true, false, true, false); - destReq = GetTeamRequest(true, false, true, false, false); - } else { - srcReq = GetTeamRequest(true, true, false, true); - destReq = GetTeamRequest(true, false, true, false); + srcReq.teamSorter = greaterReadLoad; } // clang-format off wait(getSrcDestTeams(self, teamCollectionIndex, srcReq, destReq, sourceTeam, destTeam, @@ -1548,8 +1553,13 @@ ACTOR Future BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex) state bool skipCurrentLoop = false; loop { - state std::pair>, bool> randomTeam; state bool moved = false; + state bool disableReadBalance = false; + state bool disableDiskBalance = false; + state Reference sourceTeam; + state Reference destTeam; + state GetTeamRequest srcReq; + state GetTeamRequest destReq; state TraceEvent traceEvent("BgDDValleyFiller", self->distributorId); traceEvent.suppressFor(5.0).detail("PollingInterval", rebalancePollingInterval); @@ -1566,8 +1576,20 @@ ACTOR Future BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex) if (skipCurrentLoop && !val.present()) { // reset loop interval rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; + } else if (val.present()) { + if (val.get().size() > 0) { + int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); + if (ddIgnore & DDIgnore::REBALANCE_DISK) { + disableReadBalance = true; + } + if (ddIgnore & DDIgnore::REBALANCE_READ) { + disableDiskBalance = true; + } + skipCurrentLoop = disableReadBalance && disableDiskBalance; + } else { + skipCurrentLoop = true; + } } - skipCurrentLoop = val.present(); } traceEvent.detail("Enabled", !skipCurrentLoop); @@ -1584,39 +1606,29 @@ ACTOR Future BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex) self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM]); if (self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM] < SERVER_KNOBS->DD_REBALANCE_PARALLELISM) { - std::pair>, bool> _randomTeam = - wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply( - GetTeamRequest(true, false, false, true, false)))); - randomTeam = _randomTeam; - traceEvent.detail("SourceTeam", - printable(randomTeam.first.map( - [](const Reference& team) { return team->getDesc(); }))); + // FIXME: read balance and disk balance shouldn't be 
mutually exclusive in the future + srcReq = GetTeamRequest(true, false, false, true); + destReq = GetTeamRequest(true, true, true, false); + if (!disableReadBalance) { + destReq.teamSorter = greaterReadLoad; + } - if (randomTeam.first.present()) { - std::pair>, bool> unloadedTeam = - wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply( - GetTeamRequest(true, true, true, false, true)))); - - traceEvent.detail( - "DestTeam", - printable(unloadedTeam.first.map( - [](const Reference& team) { return team->getDesc(); }))); - - if (unloadedTeam.first.present()) { - bool _moved = wait(rebalanceTeams(self, - SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, - randomTeam.first.get(), - unloadedTeam.first.get(), - teamCollectionIndex == 0, - &traceEvent)); - moved = _moved; - if (moved) { - resetCount = 0; - } else { - resetCount++; - } + // clang-format off + wait(getSrcDestTeams(self, teamCollectionIndex, srcReq, destReq, sourceTeam, destTeam, + SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM,&traceEvent)); + if (sourceTeam.isValid() && destTeam.isValid()) { + if (!disableReadBalance) { + wait(store(moved,rebalanceReadLoad(self,SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, + sourceTeam, destTeam,teamCollectionIndex == 0, + &traceEvent))); + } else { + wait(store(moved,rebalanceTeams(self,SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, + sourceTeam, destTeam,teamCollectionIndex == 0, + &traceEvent))); } } + // clang-format on + moved ? resetCount = 0 : resetCount++; } if (now() - (*self->lastLimited) < SERVER_KNOBS->BG_DD_SATURATION_DELAY) { diff --git a/fdbserver/TCInfo.actor.cpp b/fdbserver/TCInfo.actor.cpp index 2718770c6b..6732b1fdc1 100644 --- a/fdbserver/TCInfo.actor.cpp +++ b/fdbserver/TCInfo.actor.cpp @@ -336,6 +336,20 @@ int64_t TCTeamInfo::getLoadBytes(bool includeInFlight, double inflightPenalty) c return (physicalBytes + (inflightPenalty * inFlightBytes)) * availableSpaceMultiplier; } +double TCTeamInfo::getLoadReadBandwidth() const { + double sum = 0; + int size = 0; + for (const auto& server : servers) { + if (server->serverMetricsPresent()) { + auto& replyValue = server->getServerMetrics(); + ASSERT(replyValue.load.bytesReadPerKSecond >= 0); + sum += replyValue.load.bytesReadPerKSecond; + size += 1; + } + } + return size == 0 ? 
0 : sum / size; +} + int64_t TCTeamInfo::getMinAvailableSpace(bool includeInFlight) const { int64_t minAvailableSpace = std::numeric_limits::max(); for (const auto& server : servers) { diff --git a/fdbserver/TCInfo.h b/fdbserver/TCInfo.h index f1886273f8..cea4ecf7fd 100644 --- a/fdbserver/TCInfo.h +++ b/fdbserver/TCInfo.h @@ -191,6 +191,8 @@ public: int64_t getLoadBytes(bool includeInFlight = true, double inflightPenalty = 1.0) const override; + double getLoadReadBandwidth() const override; + int64_t getMinAvailableSpace(bool includeInFlight = true) const override; double getMinAvailableSpaceRatio(bool includeInFlight = true) const override; From e0d624c167afe120049eb1f9c6cbd121f54d7c65 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 28 Feb 2022 10:39:29 -0800 Subject: [PATCH 010/299] format --- fdbclient/SpecialKeySpace.actor.cpp | 24 ++++++++++++------------ fdbclient/SystemData.cpp | 2 +- fdbclient/SystemData.h | 4 ++-- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index bf6d67fc79..8ee66b7df7 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -2444,21 +2444,21 @@ Future> DataDistributionImpl::commit(ReadYourWritesTransac try { boost::lexical_cast(iter->value().second.get().toString()); } catch (boost::bad_lexical_cast& e) { - ManagementAPIError::toJsonString(false, - "datadistribution", - "Invalid datadistribution rebalance ignore option (int or empty): " + - iter->value().second.get().toString()); + ManagementAPIError::toJsonString( + false, + "datadistribution", + "Invalid datadistribution rebalance ignore option (int or empty): " + + iter->value().second.get().toString()); val = ""_sr; } ryw->getTransaction().set(rebalanceDDIgnoreKey, iter->value().second.get()); -// if (iter->value().second.get().size()) -// msg = -// ManagementAPIError::toJsonString(false, -// "datadistribution", -// "Value is unused for the data_distribution/rebalance_ignored " -// "key, please set it to an empty value"); -// else -// ryw->getTransaction().set(rebalanceDDIgnoreKey, LiteralStringRef("on")); + // if (iter->value().second.get().size()) + // msg = + // ManagementAPIError::toJsonString(false, + // "datadistribution", + // "Value is unused for the + //data_distribution/rebalance_ignored " "key, please set it to an empty value"); else + // ryw->getTransaction().set(rebalanceDDIgnoreKey, LiteralStringRef("on")); } else { msg = ManagementAPIError::toJsonString( false, diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index af856028aa..96b993f83d 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -309,7 +309,7 @@ std::pair serverKeysDecodeServerBegin(const KeyRef& key) { rd >> server_id; rd.readBytes(1); // skip "/" std::string bytes; - while(!rd.empty()) { + while (!rd.empty()) { bytes.push_back((char)*rd.arenaRead(1)); } // std::cout << bytes.size() << " " < serverKeysDecodeServerBegin(const KeyRef& key) ; +std::pair serverKeysDecodeServerBegin(const KeyRef& key); bool serverHasKey(ValueRef storedValue); extern const KeyRangeRef conflictingKeysRange; @@ -486,7 +486,7 @@ extern const KeyRangeRef monitorConfKeys; extern const KeyRef healthyZoneKey; extern const StringRef ignoreSSFailuresZoneString; extern const KeyRef rebalanceDDIgnoreKey; -enum DDIgnore {REBALANCE_DISK = 1, REBALANCE_READ = 2}; +enum DDIgnore { REBALANCE_DISK = 1, REBALANCE_READ = 2 }; const Value healthyZoneValue(StringRef const& zoneId, Version version); 
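A sketch of the value round-trip behind rebalanceDDIgnoreKey and the DDIgnore enum just above, assuming the usual BinaryWriter/BinaryReader pairing with Unversioned(); the reader side matches the balancer loops in this series, while the writer side here is illustrative rather than the actual setDDIgnoreRebalanceSwitch plumbing:

    Value v = BinaryWriter::toValue<int>(DDIgnore::REBALANCE_READ, Unversioned());
    int mode = BinaryReader::fromStringRef<int>(v, Unversioned());
    ASSERT(mode == DDIgnore::REBALANCE_READ); // read rebalancing ignored, disk rebalancing still on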
std::pair decodeHealthyZoneValue(ValueRef const&); From eb557d59e18eaea26795b068f49aee61f09d6dca Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 1 Mar 2022 15:27:07 -0800 Subject: [PATCH 011/299] make hot server same on multiple clients --- fdbserver/workloads/ReadWrite.actor.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp index 6911574006..9e593ee517 100644 --- a/fdbserver/workloads/ReadWrite.actor.cpp +++ b/fdbserver/workloads/ReadWrite.actor.cpp @@ -119,7 +119,7 @@ struct ReadWriteWorkload : KVWorkload { typedef std::vector> IndexRangeVec; // keyForIndex generate key from index. So for a shard range, recording the start and end is enough std::vector> serverShards; // storage server and the shards it owns - int hotServerCount = 0; + int hotServerCount = 0, currentHotRound = -1; // states of metric Int64MetricHandle totalReadsMetric; @@ -723,8 +723,7 @@ struct ReadWriteWorkload : KVWorkload { clients.push_back(tracePeriodically(self)); if (self->skewRound > 0) { - state int round = 0; - for (; round < self->skewRound; ++round) { + for (self->currentHotRound = 0; self->currentHotRound < self->skewRound; ++self->currentHotRound) { wait(updateServerShards(cx, self)); self->setHotServers(); self->startReadWriteClients(cx, clients); @@ -749,19 +748,17 @@ struct ReadWriteWorkload : KVWorkload { return timeSinceStart >= metricsStart && timeSinceStart < (metricsStart + metricsDuration); } - // set the last N server in serverShards as hot server + // calculate hot server count void setHotServers() { hotServerCount = ceil(hotServerFraction * serverShards.size()); - for (int i = serverShards.size(), j = 0; j < hotServerCount; --i, ++j) { - auto idx = deterministicRandom()->randomInt(0, i); - std::swap(serverShards[idx], serverShards[i - 1]); - } + std::cout << "Choose " << hotServerCount << " hot servers\n"; } int64_t getRandomKeyFromHotServer() { ASSERT(hotServerCount > 0); - int idx, shardIdx; - idx = deterministicRandom()->randomInt(serverShards.size() - hotServerCount, serverShards.size()); + int begin = currentHotRound * hotServerCount; + int shardIdx; + int idx = deterministicRandom()->randomInt(begin, begin + hotServerCount) % serverShards.size(); shardIdx = deterministicRandom()->randomInt(0, serverShards[idx].second.size()); return deterministicRandom()->randomInt64(serverShards[idx].second[shardIdx].first, serverShards[idx].second[shardIdx].second); From f48ba989c38012236c022588f66ff612bb36ce58 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 1 Mar 2022 15:27:27 -0800 Subject: [PATCH 012/299] add status --- fdbclient/Schemas.cpp | 1 + fdbserver/DataDistributionQueue.actor.cpp | 3 ++- fdbserver/Status.actor.cpp | 6 ++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 2e9fcc53a6..5f74b2a502 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -699,6 +699,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "maintenance_seconds_remaining":1.0, "data_distribution_disabled_for_ss_failures":true, "data_distribution_disabled_for_rebalance":true, + "data_distribution_disabled_hex": "1", "data_distribution_disabled":true, "active_primary_dc":"pv", "bounce_impact":{ diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 5333235610..55b3642d07 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ 
b/fdbserver/DataDistributionQueue.actor.cpp @@ -1577,7 +1577,8 @@ ACTOR Future BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex) // reset loop interval rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; } else if (val.present()) { - if (val.get().size() > 0) { + // FIXME: better way for upgrade? for example, using a new key to record mode + if (val.get().size() > sizeof(int)) { int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); if (ddIgnore & DDIgnore::REBALANCE_DISK) { disableReadBalance = true; diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 1c4ab387d8..01904cd00a 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1535,6 +1535,8 @@ struct LoadConfigurationResult { Optional healthyZone; double healthyZoneSeconds; bool rebalanceDDIgnored; + // FIXME: possible convert it to int if upgrade value can be resolved? + std::string rebalanceDDIgnoreHex; // any or combination of 0, 1, 2, see enum DDIgnore; bool dataDistributionDisabled; LoadConfigurationResult() @@ -1609,6 +1611,9 @@ loadConfiguration(Database cx, JsonBuilderArray* messages, std::set } } res.rebalanceDDIgnored = rebalanceDDIgnored.get().present(); + if (res.rebalanceDDIgnored) { + res.rebalanceDDIgnoreHex = rebalanceDDIgnored.get().get().toHexString(); + } if (ddModeKey.get().present()) { BinaryReader rd(ddModeKey.get().get(), Unversioned()); int currentMode; @@ -2955,6 +2960,7 @@ ACTOR Future clusterGetStatus( } if (loadResult.get().rebalanceDDIgnored) { statusObj["data_distribution_disabled_for_rebalance"] = true; + statusObj["data_distribution_disabled_hex"] = loadResult.get().rebalanceDDIgnoreHex; } if (loadResult.get().dataDistributionDisabled) { statusObj["data_distribution_disabled"] = true; From a05f6f851a49fd9bc5544f9860d432f4b956aaac Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 2 Mar 2022 21:56:03 -0800 Subject: [PATCH 013/299] fix reference assign bug --- fdbserver/DataDistributionQueue.actor.cpp | 44 +++++++++++++---------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 55b3642d07..4ab42f6768 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1302,8 +1302,10 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, state std::vector shards = self->shardsAffectedByTeamFailure->getShardsFor( ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary)); - if (!shards.size()) + if (!shards.size()) { + traceEvent->detail("SkipReason", "NoShardOnSource"); return false; + } state GetMetricsRequest req(shards); req.comparator = [](const StorageMetrics& a, const StorageMetrics& b) { @@ -1320,13 +1322,13 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary)); for (int i = 0; i < shards.size(); i++) { if (metrics.keys == shards[i]) { - traceEvent->detail("ShardStillPresent", true); self->output.send(RelocateShard(metrics.keys.get(), priority)); return true; } } - traceEvent->detail("ShardStillPresent", false); - } + traceEvent->detail("SkipReason", "ShardNotPresent"); + } else + traceEvent->detail("SkipReason", metrics.keys.present() ? 
"ShardZeroSize" : "ShardNoKeys"); return false; } @@ -1351,8 +1353,10 @@ ACTOR Future rebalanceTeams(DDQueueData* self, traceEvent->detail("AverageShardBytes", averageShardBytes).detail("ShardsInSource", shards.size()); - if (!shards.size()) + if (!shards.size()) { + traceEvent->detail("SkipReason", "NoShardOnSource"); return false; + } state KeyRange moveShard; state StorageMetrics metrics; @@ -1382,6 +1386,7 @@ ACTOR Future rebalanceTeams(DDQueueData* self, .detail("SourceAndDestTooSimilar", sourceAndDestTooSimilar); if (sourceAndDestTooSimilar || metrics.bytes == 0) { + traceEvent->detail("SkipReason", sourceAndDestTooSimilar ? "TeamTooSimilar" : "ShardZeroSize"); return false; } @@ -1390,13 +1395,12 @@ ACTOR Future rebalanceTeams(DDQueueData* self, ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary)); for (int i = 0; i < shards.size(); i++) { if (moveShard == shards[i]) { - traceEvent->detail("ShardStillPresent", true); self->output.send(RelocateShard(moveShard, priority)); return true; } } - traceEvent->detail("ShardStillPresent", false); + traceEvent->detail("SkipReason", "ShardNotPresent"); return false; } @@ -1404,8 +1408,8 @@ ACTOR Future getSrcDestTeams(DDQueueData* self, int teamCollectionIndex, GetTeamRequest srcReq, GetTeamRequest destReq, - Reference sourceTeam, - Reference destTeam, + Reference* sourceTeam, + Reference* destTeam, int priority, TraceEvent* traceEvent) { @@ -1416,7 +1420,7 @@ ACTOR Future getSrcDestTeams(DDQueueData* self, [](const Reference& team) { return team->getDesc(); }))); if (randomTeam.first.present()) { - destTeam = randomTeam.first.get(); + *destTeam = randomTeam.first.get(); std::pair>, bool> loadedTeam = wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply(srcReq))); @@ -1425,7 +1429,7 @@ ACTOR Future getSrcDestTeams(DDQueueData* self, [](const Reference& team) { return team->getDesc(); }))); if (loadedTeam.first.present()) - sourceTeam = loadedTeam.first.get(); + *sourceTeam = loadedTeam.first.get(); } return Void(); } @@ -1468,10 +1472,10 @@ ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionInde if (val.get().size() > 0) { int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); if (ddIgnore & DDIgnore::REBALANCE_DISK) { - disableReadBalance = true; + disableDiskBalance = true; } if (ddIgnore & DDIgnore::REBALANCE_READ) { - disableDiskBalance = true; + disableReadBalance = true; } skipCurrentLoop = disableReadBalance && disableDiskBalance; } else { @@ -1480,7 +1484,8 @@ ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionInde } } - traceEvent.detail("Enabled", !skipCurrentLoop); + traceEvent.detail("Enabled", + skipCurrentLoop ? "None" : (disableReadBalance ? 
"NoReadBalance" : "NoDiskBalance")); wait(delayF); if (skipCurrentLoop) { @@ -1501,7 +1506,7 @@ ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionInde srcReq.teamSorter = greaterReadLoad; } // clang-format off - wait(getSrcDestTeams(self, teamCollectionIndex, srcReq, destReq, sourceTeam, destTeam, + wait(getSrcDestTeams(self, teamCollectionIndex, srcReq, destReq, &sourceTeam, &destTeam, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM,&traceEvent)); if (sourceTeam.isValid() && destTeam.isValid()) { if (!disableReadBalance) { @@ -1581,10 +1586,10 @@ ACTOR Future BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex) if (val.get().size() > sizeof(int)) { int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); if (ddIgnore & DDIgnore::REBALANCE_DISK) { - disableReadBalance = true; + disableDiskBalance = true; } if (ddIgnore & DDIgnore::REBALANCE_READ) { - disableDiskBalance = true; + disableReadBalance = true; } skipCurrentLoop = disableReadBalance && disableDiskBalance; } else { @@ -1593,7 +1598,8 @@ ACTOR Future BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex) } } - traceEvent.detail("Enabled", !skipCurrentLoop); + traceEvent.detail("Enabled", + skipCurrentLoop ? "None" : (disableReadBalance ? "NoReadBalance" : "NoDiskBalance")); wait(delayF); if (skipCurrentLoop) { @@ -1615,7 +1621,7 @@ ACTOR Future BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex) } // clang-format off - wait(getSrcDestTeams(self, teamCollectionIndex, srcReq, destReq, sourceTeam, destTeam, + wait(getSrcDestTeams(self, teamCollectionIndex, srcReq, destReq, &sourceTeam, &destTeam, SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM,&traceEvent)); if (sourceTeam.isValid() && destTeam.isValid()) { if (!disableReadBalance) { From 87ab7d165b8ad9a377ff751b69ed21e6f3079ff9 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 3 Mar 2022 11:15:11 -0800 Subject: [PATCH 014/299] refactor team compare --- fdbserver/DDTeamCollection.actor.cpp | 13 ++++++------- fdbserver/DataDistribution.actor.h | 23 ++++++++++++++++++++++- fdbserver/workloads/ReadWrite.actor.cpp | 3 +-- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 6fee55ca58..e961f9a7b9 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -245,13 +245,13 @@ public: (!req.preferLowerUtilization || self->teams[currentIndex]->hasHealthyAvailableSpace(self->medianAvailableSpace))) { int64_t loadBytes = self->teams[currentIndex]->getLoadBytes(true, req.inflightPenalty); - if ((!bestOption.present() || - ((bool)req.teamSorter && req.teamSorter(bestOption.get(), self->teams[currentIndex])) || - (req.preferLowerUtilization && loadBytes < bestLoadBytes) || - (!req.preferLowerUtilization && loadBytes > bestLoadBytes)) && + if (req.eligible(self->teams[currentIndex]) && // hard constraints (!req.teamMustHaveShards || self->shardsAffectedByTeamFailure->hasShards(ShardsAffectedByTeamFailure::Team( - self->teams[currentIndex]->getServerIDs(), self->primary)))) { + self->teams[currentIndex]->getServerIDs(), self->primary))) && + // sort conditions + (!bestOption.present() || req.lessCompare(bestOption.get(), self->teams[currentIndex]) || + !req.lessCompareByLoad(loadBytes, bestLoadBytes))) { bestLoadBytes = loadBytes; bestOption = self->teams[currentIndex]; bestIndex = currentIndex; @@ -299,8 +299,7 @@ public: for (int i = 0; i < randomTeams.size(); i++) { int64_t loadBytes = 
randomTeams[i]->getLoadBytes(true, req.inflightPenalty); - if (!bestOption.present() || (req.preferLowerUtilization && loadBytes < bestLoadBytes) || - (!req.preferLowerUtilization && loadBytes > bestLoadBytes)) { + if (!bestOption.present() || !req.lessCompareByLoad(loadBytes, bestLoadBytes)) { bestLoadBytes = loadBytes; bestOption = randomTeams[i]; } diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index b772cb5ef9..bfdbfafebf 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -80,7 +80,7 @@ struct IDataDistributionTeam { struct GetTeamRequest { bool wantsNewServers; // In additional to servers in completeSources, try to find teams with new server bool wantsTrueBest; - bool preferLowerUtilization; // = false --> higher utilization team will be returned + bool preferLowerUtilization; // if true, lower utilized team has higher score bool teamMustHaveShards; double inflightPenalty; std::vector completeSources; @@ -102,6 +102,27 @@ struct GetTeamRequest { : wantsNewServers(wantsNewServers), wantsTrueBest(wantsTrueBest), preferLowerUtilization(preferLowerUtilization), teamMustHaveShards(teamMustHaveShards), inflightPenalty(inflightPenalty) {} + // return true if a.score < b.score + [[nodiscard]] bool lessCompare(TeamRef a, TeamRef b) const { + if (teamSorter) { + return teamSorter(a, b); + } + return false; + } + + // return true if scoreWithLoadBytes < bestScoreWithBestLoadBytes + bool lessCompareByLoad(int64_t loadBytes, int64_t bestLoadBytes) const { + bool lessLoad = loadBytes < bestLoadBytes; + return preferLowerUtilization ? !lessLoad : lessLoad; + } + + bool eligible(TeamRef a) const { + if (hardConstraint) { + return hardConstraint(a); + } + return true; + } + std::string getDesc() const { std::stringstream ss; diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp index 9e593ee517..0c60559215 100644 --- a/fdbserver/workloads/ReadWrite.actor.cpp +++ b/fdbserver/workloads/ReadWrite.actor.cpp @@ -757,9 +757,8 @@ struct ReadWriteWorkload : KVWorkload { int64_t getRandomKeyFromHotServer() { ASSERT(hotServerCount > 0); int begin = currentHotRound * hotServerCount; - int shardIdx; int idx = deterministicRandom()->randomInt(begin, begin + hotServerCount) % serverShards.size(); - shardIdx = deterministicRandom()->randomInt(0, serverShards[idx].second.size()); + int shardIdx = deterministicRandom()->randomInt(0, serverShards[idx].second.size()); return deterministicRandom()->randomInt64(serverShards[idx].second[shardIdx].first, serverShards[idx].second[shardIdx].second); } From 970a84bf63f8b6916d601f28dc1d8b7e819c9546 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 3 Mar 2022 12:58:45 -0800 Subject: [PATCH 015/299] fix teamSorter setting --- fdbserver/DDTeamCollection.actor.cpp | 3 ++- fdbserver/DataDistributionQueue.actor.cpp | 9 ++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index e4400d1271..90afb44cac 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -299,7 +299,8 @@ public: for (int i = 0; i < randomTeams.size(); i++) { int64_t loadBytes = randomTeams[i]->getLoadBytes(true, req.inflightPenalty); - if (!bestOption.present() || !req.lessCompareByLoad(loadBytes, bestLoadBytes)) { + if (!bestOption.present() || req.lessCompare(bestOption.get(), randomTeams[i]) || + !req.lessCompareByLoad(loadBytes, bestLoadBytes)) { 
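A worked check, not from the patch, of the comparator contract both selection loops now share: with preferLowerUtilization == true, lessCompareByLoad(load, best) returns !(load < best), so the "!req.lessCompareByLoad(...)" condition accepts a candidate exactly when it carries strictly less load than the current best (and at least as much load when preferLowerUtilization is false):

    bool preferLowerUtilization = true;
    auto lessCompareByLoad = [&](int64_t loadBytes, int64_t bestLoadBytes) {
        bool lessLoad = loadBytes < bestLoadBytes;
        return preferLowerUtilization ? !lessLoad : lessLoad;
    };
    // with bestLoadBytes = 100: a candidate with loadBytes = 80 is accepted
    // (!lessCompareByLoad(80, 100) == true), one with loadBytes = 120 is rejected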
bestLoadBytes = loadBytes; bestOption = randomTeams[i]; } diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 73ffe8a617..3614042ec5 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1436,9 +1436,14 @@ ACTOR Future getSrcDestTeams(DDQueueData* self, return Void(); } +// return true if a.readload > b.readload bool greaterReadLoad(Reference a, Reference b) { return a->getLoadReadBandwidth() > b->getLoadReadBandwidth(); } +// return true if a.readload < b.readload +bool lessReadLoad(Reference a, Reference b) { + return a->getLoadReadBandwidth() < b->getLoadReadBandwidth(); +} ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionIndex) { state double rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; @@ -1505,7 +1510,8 @@ ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionInde srcReq = GetTeamRequest(true, true, false, true); destReq = GetTeamRequest(true, false, true, false); if (!disableReadBalance) { - srcReq.teamSorter = greaterReadLoad; + srcReq.teamSorter = lessReadLoad; + destReq.teamSorter = greaterReadLoad; } // clang-format off wait(getSrcDestTeams(self, teamCollectionIndex, srcReq, destReq, &sourceTeam, &destTeam, @@ -1619,6 +1625,7 @@ ACTOR Future BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex) srcReq = GetTeamRequest(true, false, false, true); destReq = GetTeamRequest(true, true, true, false); if (!disableReadBalance) { + srcReq.teamSorter = lessReadLoad; // less read load map to lower score destReq.teamSorter = greaterReadLoad; } From dbeee1dcd01e50df064c46b128564f22f2f06297 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 3 Mar 2022 12:59:04 -0800 Subject: [PATCH 016/299] format code --- fdbclient/SpecialKeySpace.actor.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index 5eb8b0ebed..ae1d59349d 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -2466,8 +2466,9 @@ Future> DataDistributionImpl::commit(ReadYourWritesTransac // ManagementAPIError::toJsonString(false, // "datadistribution", // "Value is unused for the - //data_distribution/rebalance_ignored " "key, please set it to an empty value"); else - // ryw->getTransaction().set(rebalanceDDIgnoreKey, LiteralStringRef("on")); + // data_distribution/rebalance_ignored " "key, please set it + // to an empty value"); else ryw->getTransaction().set(rebalanceDDIgnoreKey, + //LiteralStringRef("on")); } else { msg = ManagementAPIError::toJsonString( false, From 01c347d45cd204096e6013b9a511fdb0c004f93e Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 3 Mar 2022 15:38:28 -0800 Subject: [PATCH 017/299] format code; special key space notice update --- fdbcli/DataDistributionCommand.actor.cpp | 4 ++-- fdbclient/SpecialKeySpace.actor.cpp | 8 -------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/fdbcli/DataDistributionCommand.actor.cpp b/fdbcli/DataDistributionCommand.actor.cpp index 399b3836f3..f334935bce 100644 --- a/fdbcli/DataDistributionCommand.actor.cpp +++ b/fdbcli/DataDistributionCommand.actor.cpp @@ -133,10 +133,10 @@ ACTOR Future dataDistributionCommandActor(Reference db, std::ve printf("Data distribution is enabled for rebalance.\n"); } else if (tokencmp(tokens[2], "rebalance_disk")) { wait(setDDIgnoreRebalanceSwitch(db, ~DDIgnore::REBALANCE_DISK)); - printf("Data distribution is 
disabled for rebalance_disk.\n"); + printf("Data distribution is enabled for rebalance_disk.\n"); } else if (tokencmp(tokens[2], "rebalance_read")) { wait(setDDIgnoreRebalanceSwitch(db, ~DDIgnore::REBALANCE_READ)); - printf("Data distribution is disabled for rebalance_read.\n"); + printf("Data distribution is enabled for rebalance_read.\n"); } else { printf(usage); result = false; diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index ae1d59349d..43b24ca1ce 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -2461,14 +2461,6 @@ Future> DataDistributionImpl::commit(ReadYourWritesTransac val = ""_sr; } ryw->getTransaction().set(rebalanceDDIgnoreKey, iter->value().second.get()); - // if (iter->value().second.get().size()) - // msg = - // ManagementAPIError::toJsonString(false, - // "datadistribution", - // "Value is unused for the - // data_distribution/rebalance_ignored " "key, please set it - // to an empty value"); else ryw->getTransaction().set(rebalanceDDIgnoreKey, - //LiteralStringRef("on")); } else { msg = ManagementAPIError::toJsonString( false, From 28063ff7a195a8d5583caafe62a49703c713551e Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 7 Mar 2022 17:36:20 -0800 Subject: [PATCH 018/299] workload print out address --- fdbserver/workloads/ReadWrite.actor.cpp | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp index c74c9f9ee1..1bea43d512 100644 --- a/fdbserver/workloads/ReadWrite.actor.cpp +++ b/fdbserver/workloads/ReadWrite.actor.cpp @@ -119,6 +119,7 @@ struct ReadWriteWorkload : KVWorkload { typedef std::vector> IndexRangeVec; // keyForIndex generate key from index. 
So for a shard range, recording the start and end is enough std::vector> serverShards; // storage server and the shards it owns + std::map serverInterfaces; int hotServerCount = 0, currentHotRound = -1; // states of metric @@ -374,12 +375,23 @@ struct ReadWriteWorkload : KVWorkload { } ACTOR static Future updateServerShards(Database cx, ReadWriteWorkload* self) { + state Future serverList = + runRYWTransaction(cx, [](Reference tr) -> Future { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + return tr->getRange(serverListKeys, CLIENT_KNOBS->TOO_MANY); + }); state RangeResult range = wait(runRYWTransaction(cx, [](Reference tr) -> Future { tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); return tr->getRange(serverKeysRange, CLIENT_KNOBS->TOO_MANY, Snapshot::True); })); - + wait(success(serverList)); + // decode server interfaces + self->serverInterfaces.clear(); + for (int i = 0; i < serverList.get().size(); i++) { + auto ssi = decodeServerListValue(serverList.get()[i].value); + self->serverInterfaces.emplace(ssi.id(), ssi); + } // clear self->serverShards self->serverShards.clear(); @@ -751,7 +763,13 @@ struct ReadWriteWorkload : KVWorkload { // calculate hot server count void setHotServers() { hotServerCount = ceil(hotServerFraction * serverShards.size()); - std::cout << "Choose " << hotServerCount << " hot servers\n"; + std::cout << "Choose " << hotServerCount << " hot servers: ["; + int begin = currentHotRound * hotServerCount; + for (int i = 0; i < hotServerCount; ++i) { + int idx = (begin + i) % serverShards.size(); + std::cout << serverInterfaces.at(serverShards[idx].first).address().toString() << ","; + } + std::cout << "]\n"; } int64_t getRandomKeyFromHotServer() { From 8d20ee84324ed422f1f5544edef5fa13bdb7910b Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 7 Mar 2022 17:36:43 -0800 Subject: [PATCH 019/299] change storage metrics of read sample calculation --- fdbclient/ServerKnobs.cpp | 2 +- fdbserver/DDTeamCollection.actor.cpp | 62 +++++++++++++++++++++ fdbserver/DataDistributionQueue.actor.cpp | 12 ++-- fdbserver/DataDistributionTracker.actor.cpp | 5 ++ fdbserver/StorageMetrics.actor.h | 24 ++++---- 5 files changed, 87 insertions(+), 18 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 7bddd42b2e..17426263ad 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -622,7 +622,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( BYTES_READ_UNITS_PER_SAMPLE, 100000 ); // 100K bytes init( READ_HOT_SUB_RANGE_CHUNK_SIZE, 10000000); // 10MB init( EMPTY_READ_PENALTY, 20 ); // 20 bytes - init( READ_SAMPLING_ENABLED, false ); if ( randomize && BUGGIFY ) READ_SAMPLING_ENABLED = true;// enable/disable read sampling + init( READ_SAMPLING_ENABLED, true ); if ( randomize && BUGGIFY ) READ_SAMPLING_ENABLED = true;// enable/disable read sampling //Storage Server init( STORAGE_LOGGING_DELAY, 5.0 ); diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 90afb44cac..627b7549db 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -5649,6 +5649,64 @@ public: return Void(); } + + ACTOR static Future GetTeam_TrueBestLeastReadBandwidth() { + Reference policy = Reference( + new PolicyAcross(3, "zoneid", Reference(new PolicyOne()))); + state int processSize = 5; + state int teamSize = 3; + state std::unique_ptr collection = testTeamCollection(teamSize, policy, processSize); + GetStorageMetricsReply mid_read; + 
mid_read.capacity.bytes = 1000 * 1024 * 1024; + mid_read.available.bytes = 400 * 1024 * 1024; + mid_read.load.bytes = 200 * 1024 * 1024; + mid_read.load.bytesReadPerKSecond = 200 * 1024 * 1024; + + GetStorageMetricsReply high_read; + high_read.capacity.bytes = 1000 * 1024 * 1024; + high_read.available.bytes = 800 * 1024 * 1024; + high_read.load.bytesReadPerKSecond = 800 * 1024 * 1024; + high_read.load.bytes = 400 * 1024 * 1024; + + collection->addTeam(std::set({ UID(1, 0), UID(2, 0), UID(3, 0) }), true); + collection->addTeam(std::set({ UID(2, 0), UID(3, 0), UID(4, 0) }), true); + collection->disableBuildingTeams(); + collection->setCheckTeamDelay(); + + /* + * Among server teams that have healthy space available, pick the team that is + * least utilized, if the caller says they preferLowerUtilization. + */ + + collection->server_info[UID(1, 0)]->setServerMetrics(mid_read); + collection->server_info[UID(2, 0)]->setServerMetrics(mid_read); + collection->server_info[UID(3, 0)]->setServerMetrics(mid_read); + collection->server_info[UID(4, 0)]->setServerMetrics(high_read); + + bool wantsNewServers = true; + bool wantsTrueBest = true; + bool preferLowerUtilization = true; + bool teamMustHaveShards = false; + std::vector completeSources{ UID(1, 0), UID(2, 0), UID(3, 0) }; + + state GetTeamRequest req(wantsNewServers, wantsTrueBest, preferLowerUtilization, teamMustHaveShards); + req.completeSources = completeSources; + req.teamSorter = [](Reference a, Reference b) { + return a->getLoadReadBandwidth() > b->getLoadReadBandwidth(); + }; + + wait(collection->getTeam(req)); + + std::pair>, bool> resTeam = req.reply.getFuture().get(); + + std::set expectedServers{ UID(1, 0), UID(2, 0), UID(3, 0) }; + ASSERT(resTeam.first.present()); + auto servers = resTeam.first.get()->getServerIDs(); + const std::set selectedServers(servers.begin(), servers.end()); + ASSERT(expectedServers == selectedServers); + + return Void(); + } }; TEST_CASE("DataDistribution/AddTeamsBestOf/UseMachineID") { @@ -5710,3 +5768,7 @@ TEST_CASE("/DataDistribution/GetTeam/ServerUtilizationNearCutoff") { wait(DDTeamCollectionUnitTest::GetTeam_ServerUtilizationNearCutoff()); return Void(); } +TEST_CASE("/DataDistribution/GetTeam/TrueBestLeastReadBandwidth") { + wait(DDTeamCollectionUnitTest::GetTeam_TrueBestLeastReadBandwidth()); + return Void(); +} diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 3614042ec5..5074f9b14e 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1311,10 +1311,8 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, state GetMetricsRequest req(shards); req.comparator = [](const StorageMetrics& a, const StorageMetrics& b) { - if (a.bytes > 0 && b.bytes > 0) { - return ((double)a.bytesReadPerKSecond / a.bytes) < ((double)b.bytesReadPerKSecond / b.bytes); - } - return a.allLessOrEqual(b); + return a.bytesReadPerKSecond / std::max(a.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES) < + b.bytesReadPerKSecond / std::max(b.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES); }; state StorageMetrics metrics = wait(brokenPromiseToNever(self->getShardMetrics.getReply(req))); @@ -1467,7 +1465,8 @@ ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionInde } try { - state Future delayF = delay(rebalancePollingInterval, TaskPriority::DataDistributionLaunch); + // FIXME: change back to BG_REBALANCE_SWITCH_CHECK_INTERVAL after test + state Future delayF = delay(0.1, TaskPriority::DataDistributionLaunch); if 
((now() - lastRead) > SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL) { tr.setOption(FDBTransactionOptions::LOCK_AWARE); Optional val = wait(tr.get(rebalanceDDIgnoreKey)); @@ -1581,7 +1580,8 @@ ACTOR Future BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex) } try { - state Future delayF = delay(rebalancePollingInterval, TaskPriority::DataDistributionLaunch); + // FIXME: change back to BG_REBALANCE_SWITCH_CHECK_INTERVAL after test + state Future delayF = delay(0.1, TaskPriority::DataDistributionLaunch); if ((now() - lastRead) > SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL) { tr.setOption(FDBTransactionOptions::LOCK_AWARE); Optional val = wait(tr.get(rebalanceDDIgnoreKey)); diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index 49a51f7c3e..43c154ceb2 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -848,6 +848,11 @@ ACTOR Future fetchShardMetrics_impl(DataDistributionTracker* self, GetMetr metrics += t.value().stats->get().get().metrics; } + // skip if current stats is invalid + if (onChange.isValid()) { + break; + } + if (req.comparator.present()) { if (req.comparator.get()(returnMetrics, metrics)) { returnMetrics = metrics; diff --git a/fdbserver/StorageMetrics.actor.h b/fdbserver/StorageMetrics.actor.h index 01016a3e8b..15bb33ffb8 100644 --- a/fdbserver/StorageMetrics.actor.h +++ b/fdbserver/StorageMetrics.actor.h @@ -494,19 +494,20 @@ struct StorageServerMetrics { int64_t minShardReadBandwidthPerKSeconds) const { std::vector toReturn; - double shardSize = (double)byteSample.getEstimate(shard); + int64_t shardSize = byteSample.getEstimate(shard); int64_t shardReadBandwidth = bytesReadSample.getEstimate(shard); if (shardReadBandwidth * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS <= minShardReadBandwidthPerKSeconds) { return toReturn; } if (shardSize <= baseChunkSize) { + if (shardSize == 0) + shardSize = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE; // Shard is small, use it as is - if (bytesReadSample.getEstimate(shard) > (readDensityRatio * shardSize)) { + if (shardReadBandwidth > (readDensityRatio * shardSize)) { toReturn.emplace_back(shard, - bytesReadSample.getEstimate(shard) / shardSize, - bytesReadSample.getEstimate(shard) / - SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL); + shardReadBandwidth / (double)shardSize, + shardReadBandwidth / SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL); } return toReturn; } @@ -525,18 +526,19 @@ struct StorageServerMetrics { ++endKey; continue; } - if (bytesReadSample.getEstimate(KeyRangeRef(beginKey, *endKey)) > - (readDensityRatio * std::max(baseChunkSize, byteSample.getEstimate(KeyRangeRef(beginKey, *endKey))))) { + int64_t readBandwidth = bytesReadSample.getEstimate(KeyRangeRef(beginKey, *endKey)); + if (readBandwidth > + readDensityRatio * std::max(baseChunkSize, byteSample.getEstimate(KeyRangeRef(beginKey, *endKey)))) { auto range = KeyRangeRef(beginKey, *endKey); if (!toReturn.empty() && toReturn.back().keys.end == range.begin) { // in case two consecutive chunks both are over the ratio, merge them. 
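A worked example, with illustrative numbers, of the density test driving this merging loop: a chunk whose byte sample estimate is 2MB but whose read sample is 40MB per averaging interval has density 40MB / max(baseChunkSize, 2MB); with the 10MB READ_HOT_SUB_RANGE_CHUNK_SIZE knob above as baseChunkSize that comes to 4.0, so the chunk is emitted as read-hot for any readDensityRatio below 4.0:

    double density = 40e6 / std::max<double>(10e6, 2e6); // == 4.0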
range = KeyRangeRef(toReturn.back().keys.begin, *endKey); toReturn.pop_back(); } - toReturn.emplace_back( - range, - (double)bytesReadSample.getEstimate(range) / std::max(baseChunkSize, byteSample.getEstimate(range)), - bytesReadSample.getEstimate(range) / SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL); + readBandwidth = bytesReadSample.getEstimate(range); + toReturn.emplace_back(range, + (double)readBandwidth / std::max(baseChunkSize, byteSample.getEstimate(range)), + readBandwidth / SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL); } beginKey = *endKey; endKey = byteSample.sample.index(byteSample.sample.sumTo(byteSample.sample.lower_bound(beginKey)) + From dd345dbb61770fd844ba00104057bfca4954c829 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 8 Mar 2022 16:31:49 -0800 Subject: [PATCH 020/299] add test utils --- fdbserver/workloads/BulkSetup.actor.h | 28 +++++++++++++++---------- fdbserver/workloads/ReadWrite.actor.cpp | 6 ++++-- tests/noSim/ReadSkewReadWrite.toml | 23 ++++++++++++++++++++ 3 files changed, 44 insertions(+), 13 deletions(-) create mode 100644 tests/noSim/ReadSkewReadWrite.toml diff --git a/fdbserver/workloads/BulkSetup.actor.h b/fdbserver/workloads/BulkSetup.actor.h index a900842b88..722f5404dd 100644 --- a/fdbserver/workloads/BulkSetup.actor.h +++ b/fdbserver/workloads/BulkSetup.actor.h @@ -158,6 +158,22 @@ ACTOR Future>> trackInsertionCount(Datab std::vector countsOfInterest, double checkInterval); +ACTOR template +Future waitForLowInFlight(Database cx, T* workload) { + loop { + int64_t inFlight = wait(getDataInFlight(cx, workload->dbInfo)); + TraceEvent("DynamicWarming").detail("InFlight", inFlight); + if (inFlight > 1e6) { // Wait for just 1 MB to be in flight + wait(delay(1.0)); + } else { + wait(delay(1.0)); + TraceEvent("DynamicWarmingDone").log(); + break; + } + } + return Void(); +} + ACTOR template Future bulkSetup(Database cx, T* workload, @@ -279,17 +295,7 @@ Future bulkSetup(Database cx, if (postSetupWarming != 0) { try { wait(delay(5.0)); // Wait for the data distribution in a small test to start - loop { - int64_t inFlight = wait(getDataInFlight(cx, workload->dbInfo)); - TraceEvent("DynamicWarming").detail("InFlight", inFlight); - if (inFlight > 1e6) { // Wait for just 1 MB to be in flight - wait(delay(1.0)); - } else { - wait(delay(1.0)); - TraceEvent("DynamicWarmingDone").log(); - break; - } - } + wait(timeoutError(waitForLowInFlight(cx, workload), postSetupWarming)); } catch (Error& e) { if (e.code() == error_code_actor_cancelled) throw; diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp index 1bea43d512..4a14c97e8c 100644 --- a/fdbserver/workloads/ReadWrite.actor.cpp +++ b/fdbserver/workloads/ReadWrite.actor.cpp @@ -366,7 +366,7 @@ struct ReadWriteWorkload : KVWorkload { void debugPrintServerShards() const { for (auto it : this->serverShards) { - std::cout << it.first.toString() << ": ["; + std::cout << serverInterfaces.at(it.first).address().toString() << ": ["; for (auto p : it.second) { std::cout << "[" << p.first << "," << p.second << "), "; } @@ -443,7 +443,9 @@ struct ReadWriteWorkload : KVWorkload { for (auto it : serverShards) { self->serverShards.emplace_back(it); } - // self->debugPrintServerShards(); + // if (self->clientId == 0) { + // self->debugPrintServerShards(); + // } return Void(); } diff --git a/tests/noSim/ReadSkewReadWrite.toml b/tests/noSim/ReadSkewReadWrite.toml new file mode 100644 index 0000000000..1034f01cd7 --- /dev/null +++ b/tests/noSim/ReadSkewReadWrite.toml @@ -0,0 +1,23 
@@ +[[test]] +testTitle = 'RandomReadWriteTest' +connectionFailuresDisableDuration = 100000 +waitForQuiescenceBegin=false +waitForQuiescenceEnd=false +clearAfterTest = true +runSetup = true +timeout = 3600.0 + +[[test.workload]] +testName = 'ReadWrite' +transactionsPerSecond = 100000 +testDuration = 600.0 +skewRound = 1 +nodeCount = 15000000 +valueBytes = 1000 +readsPerTransactionA = 4 +writesPerTransactionA = 0 +alpha = 0 +discardEdgeMeasurements = false +hotServerFraction = 0.02 +hotServerReadFrac = 0.8 +warmingDelay = 180.0 \ No newline at end of file From 3ca14b4b6fe1ec01c38c44919b43f26793d0567c Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 9 Mar 2022 09:51:07 -0800 Subject: [PATCH 021/299] cmakelist --- tests/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ea21f218ca..a7fd13a358 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -179,6 +179,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/WriteDuringRead.toml) add_fdb_test(TEST_FILES fast/WriteDuringReadClean.toml) add_fdb_test(TEST_FILES noSim/RandomUnitTests.toml UNIT) + add_fdb_test(TEST_FILES noSim/ReadSkewReadWrite.toml IGNORE) if (SSD_ROCKSDB_EXPERIMENTAL) add_fdb_test(TEST_FILES noSim/KeyValueStoreRocksDBTest.toml IGNORE) # re-enable as needed for RocksDB. Breaks correctness tests if RocksDB is disabled. endif() From d76c179bd07092ed930f4242e9841cf256c5428a Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 10 Mar 2022 21:40:50 -0800 Subject: [PATCH 022/299] add hex to int --- fdbserver/tester.actor.cpp | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 5907f814a8..789737342b 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -52,6 +52,21 @@ WorkloadContext::WorkloadContext(const WorkloadContext& r) WorkloadContext::~WorkloadContext() {} const char HEX_CHAR_LOOKUP[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }; +// clang-format off +struct HexTable { + long long tab[128]; + constexpr HexTable() : tab {} { + tab['1'] = 1; tab['2'] = 2; tab['3'] = 3; tab['4'] = 4; tab['5'] = 5; tab['6'] = 6; tab['7'] = 7; tab['8'] = 8; + tab['9'] = 9; tab['a'] = 10; tab['A'] = 10; tab['b'] = 11; tab['B'] = 11; tab['c'] = 12; tab['C'] = 12; + tab['d'] = 13; tab['D'] = 13; tab['e'] = 14; tab['E'] = 14; tab['f'] = 15; tab['F'] = 15; + } + constexpr long long operator[](char const idx) const { return tab[(std::size_t) idx]; } +} constexpr hexTable; +// clang-format on + +constexpr int64_t hexToInt(char number) { + return hexTable[(std::size_t)number]; +} void emplaceIndex(uint8_t* data, int offset, int64_t index) { for (int i = 0; i < 16; i++) { @@ -102,10 +117,10 @@ Key KVWorkload::keyForIndex(uint64_t index) const { int64_t KVWorkload::indexForKey(const KeyRef& key, bool absent) const { int idx = 0; if (nodePrefix > 0) { - ASSERT(keyBytes >= 32); + ASSERT(key.size() >= 32); idx += 16; } - ASSERT(keyBytes >= 16); + ASSERT(key.size() >= 16); // extract int64_t index, the reverse process of emplaceIndex() auto end = key.size() - idx - (absent ? 
1 : 0); std::string str((char*)key.begin() + idx, end); From 2a621d1f3e7eaf95f2ed69d1af653a0d1ca27060 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 11 Mar 2022 16:49:52 -0800 Subject: [PATCH 023/299] fix shardmap read --- fdbserver/workloads/BulkSetup.actor.h | 3 +- fdbserver/workloads/ReadWrite.actor.cpp | 75 +++++++++++++++++-------- 2 files changed, 53 insertions(+), 25 deletions(-) diff --git a/fdbserver/workloads/BulkSetup.actor.h b/fdbserver/workloads/BulkSetup.actor.h index 722f5404dd..039e99a9f2 100644 --- a/fdbserver/workloads/BulkSetup.actor.h +++ b/fdbserver/workloads/BulkSetup.actor.h @@ -294,8 +294,7 @@ Future bulkSetup(Database cx, // Here we wait for data in flight to go to 0 (this will not work on a database with other users) if (postSetupWarming != 0) { try { - wait(delay(5.0)); // Wait for the data distribution in a small test to start - wait(timeoutError(waitForLowInFlight(cx, workload), postSetupWarming)); + wait(delay(5.0) >> waitForLowInFlight(cx, workload)); // Wait for the data distribution in a small test to start } catch (Error& e) { if (e.code() == error_code_actor_cancelled) throw; diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp index 4a14c97e8c..bcd84b8522 100644 --- a/fdbserver/workloads/ReadWrite.actor.cpp +++ b/fdbserver/workloads/ReadWrite.actor.cpp @@ -365,15 +365,40 @@ struct ReadWriteWorkload : KVWorkload { } void debugPrintServerShards() const { + std::cout << std::hex; for (auto it : this->serverShards) { std::cout << serverInterfaces.at(it.first).address().toString() << ": ["; for (auto p : it.second) { - std::cout << "[" << p.first << "," << p.second << "), "; + std::cout << "[" << p.first << "," << p.second << "], "; } std::cout << "] \n"; } } + // For each boundary except the last one in boundaries, find the first existing key generated from keyForIndex as + // beginIdx, and the last existing key generated from keyForIndex as endIdx. 
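The helper below leans on indexForKey (revised in the previous commit) being an inverse of keyForIndex, which in turn rests on the constexpr hex table; a quick compile-time spot check of that table (an editorial sketch; '0' is covered by the zero-initialization of tab):

    static_assert(hexTable['0'] == 0 && hexTable['9'] == 9, "decimal digits");
    static_assert(hexTable['b'] == 11 && hexTable['F'] == 15, "hex digits, either case");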
+ ACTOR static Future<IndexRangeVec> convertKeyBoundaryToIndexShard(Database cx,
+ ReadWriteWorkload* self,
+ Standalone<VectorRef<KeyRef>> boundaries) {
+ state IndexRangeVec res;
+ state int i = 0;
+ for (; i < boundaries.size() - 1; ++i) {
+ KeyRangeRef currentShard = KeyRangeRef(boundaries[i], boundaries[i+1]);
+ // std::cout << currentShard.toString() << "\n";
+ std::vector<RangeResult> ranges = wait(runRYWTransaction(cx, [currentShard](Reference<ReadYourWritesTransaction> tr) -> Future<std::vector<RangeResult>> {
+ std::vector<Future<RangeResult>> f;
+ f.push_back(tr->getRange(currentShard, 1, Snapshot::False, Reverse::False));
+ f.push_back(tr->getRange(currentShard, 1, Snapshot::False, Reverse::True));
+ return getAll(f);
+ }));
+ ASSERT(ranges[0].size() == 1 && ranges[1].size() == 1);
+ res.emplace_back(self->indexForKey(ranges[0][0].key), self->indexForKey(ranges[1][0].key));
+ }
+
+ ASSERT(res.size() == boundaries.size() - 1);
+ return res;
+ }
+
 ACTOR static Future<Void> updateServerShards(Database cx, ReadWriteWorkload* self) {
 state Future<RangeResult> serverList = runRYWTransaction(cx, [](Reference<ReadYourWritesTransaction> tr) -> Future<RangeResult> {
@@ -383,7 +408,7 @@ struct ReadWriteWorkload : KVWorkload {
 state RangeResult range = wait(runRYWTransaction(cx, [](Reference<ReadYourWritesTransaction> tr) -> Future<RangeResult> {
 tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
- return tr->getRange(serverKeysRange, CLIENT_KNOBS->TOO_MANY, Snapshot::True);
+ return tr->getRange(serverKeysRange, CLIENT_KNOBS->TOO_MANY);
 }));
 wait(success(serverList));
 // decode server interfaces
@@ -400,15 +425,14 @@ struct ReadWriteWorkload : KVWorkload {
 Key leftEdge(allKeys.begin);
 std::vector<UID> leftServer; // left server owns the range [leftEdge, workloadBegin)
 KeyRangeRef workloadRange(workloadBegin, workloadEnd);
- std::map<int64_t, std::vector<UID>> beginServers; // begin index to server ID
+ state std::map<Key, std::vector<UID>> beginServers; // begin key to server ID
 for (auto kv = range.begin(); kv != range.end(); kv++) {
 if (serverHasKey(kv->value)) {
 auto [id, key] = serverKeysDecodeServerBegin(kv->key);
 if (workloadRange.contains(key)) {
- auto idx = self->indexForKey(key);
- beginServers[idx].push_back(id);
+ beginServers[key].push_back(id);
 } else if (workloadBegin > key && key > leftEdge) { // update left boundary
 leftEdge = key;
 leftServer.clear();
...
 }
 }
 }
- ASSERT(beginServers.size() == 0 || beginServers.begin()->first >= 0);
+ ASSERT(beginServers.size() == 0 || beginServers.begin()->first >= workloadBegin);
 // handle the left boundary
- if (beginServers.size() == 0 || beginServers.begin()->first > 0) {
- beginServers[0] = leftServer;
+ if (beginServers.size() == 0 || beginServers.begin()->first > workloadBegin) {
+ beginServers[workloadBegin] = leftServer;
 }
+ Standalone<VectorRef<KeyRef>> keyBegins;
+ for(auto p = beginServers.begin(); p != beginServers.end(); ++ p) {
+ keyBegins.push_back(keyBegins.arena(), p->first);
+ }
+ // deep copy because wait below will destruct workloadEnd
+ keyBegins.push_back_deep(keyBegins.arena(), workloadEnd);
+ IndexRangeVec indexShards = wait(convertKeyBoundaryToIndexShard(cx, self, keyBegins));
+ ASSERT(beginServers.size() == indexShards.size());
 // sort shard begin idx
 // build self->serverShards, starting from the left shard
 std::map<UID, IndexRangeVec> serverShards;
- auto nextIt = std::next(beginServers.begin());
- for (auto it : beginServers) {
- auto shardEnd = self->nodeCount;
- if (nextIt != beginServers.end()) {
- shardEnd = nextIt->first;
- ++nextIt;
- }
- for (auto id : it.second) {
- serverShards[id].emplace_back(it.first, shardEnd);
+ int i = 0;
+ for(auto p = beginServers.begin(); p != beginServers.end(); ++ p) {
+ for(int j = 0; j < p->second.size(); ++ 
j) { + serverShards[p->second[j]].emplace_back(indexShards[i]); } + ++ i; } // self->serverShards is ordered by UID for (auto it : serverShards) { self->serverShards.emplace_back(it); } - // if (self->clientId == 0) { - // self->debugPrintServerShards(); - // } + if (self->clientId == 0) { + self->debugPrintServerShards(); + } return Void(); } @@ -737,12 +765,13 @@ struct ReadWriteWorkload : KVWorkload { clients.push_back(tracePeriodically(self)); if (self->skewRound > 0) { + wait(updateServerShards(cx, self)); for (self->currentHotRound = 0; self->currentHotRound < self->skewRound; ++self->currentHotRound) { - wait(updateServerShards(cx, self)); self->setHotServers(); self->startReadWriteClients(cx, clients); wait(timeout(waitForAll(clients), self->testDuration / self->skewRound, Void())); clients.clear(); + wait(delay(5.0) >> updateServerShards(cx, self)); } } else { self->startReadWriteClients(cx, clients); @@ -778,9 +807,9 @@ struct ReadWriteWorkload : KVWorkload { ASSERT(hotServerCount > 0); int begin = currentHotRound * hotServerCount; int idx = deterministicRandom()->randomInt(begin, begin + hotServerCount) % serverShards.size(); - int shardIdx = deterministicRandom()->randomInt(0, serverShards[idx].second.size()); + int shardIdx = deterministicRandom()->randomInt(0, serverShards[idx].second.size()); return deterministicRandom()->randomInt64(serverShards[idx].second[shardIdx].first, - serverShards[idx].second[shardIdx].second); + serverShards[idx].second[shardIdx].second + 1); } int64_t getRandomKey(uint64_t nodeCount, bool hotServerRead = true) { From 6ebd84ef3d7b33d489a4995afd7f85e70b165417 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Sat, 12 Mar 2022 23:22:46 -0800 Subject: [PATCH 024/299] Revert "add hex to int" This reverts commit d76c179bd07092ed930f4242e9841cf256c5428a. --- fdbserver/tester.actor.cpp | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 789737342b..5907f814a8 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -52,21 +52,6 @@ WorkloadContext::WorkloadContext(const WorkloadContext& r) WorkloadContext::~WorkloadContext() {} const char HEX_CHAR_LOOKUP[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }; -// clang-format off -struct HexTable { - long long tab[128]; - constexpr HexTable() : tab {} { - tab['1'] = 1; tab['2'] = 2; tab['3'] = 3; tab['4'] = 4; tab['5'] = 5; tab['6'] = 6; tab['7'] = 7; tab['8'] = 8; - tab['9'] = 9; tab['a'] = 10; tab['A'] = 10; tab['b'] = 11; tab['B'] = 11; tab['c'] = 12; tab['C'] = 12; - tab['d'] = 13; tab['D'] = 13; tab['e'] = 14; tab['E'] = 14; tab['f'] = 15; tab['F'] = 15; - } - constexpr long long operator[](char const idx) const { return tab[(std::size_t) idx]; } -} constexpr hexTable; -// clang-format on - -constexpr int64_t hexToInt(char number) { - return hexTable[(std::size_t)number]; -} void emplaceIndex(uint8_t* data, int offset, int64_t index) { for (int i = 0; i < 16; i++) { @@ -117,10 +102,10 @@ Key KVWorkload::keyForIndex(uint64_t index) const { int64_t KVWorkload::indexForKey(const KeyRef& key, bool absent) const { int idx = 0; if (nodePrefix > 0) { - ASSERT(key.size() >= 32); + ASSERT(keyBytes >= 32); idx += 16; } - ASSERT(key.size() >= 16); + ASSERT(keyBytes >= 16); // extract int64_t index, the reverse process of emplaceIndex() auto end = key.size() - idx - (absent ? 
1 : 0); std::string str((char*)key.begin() + idx, end);

From d38b8279066b54ec491a36d031e059235d93be7a Mon Sep 17 00:00:00 2001
From: Xiaoxi Wang
Date: Sun, 13 Mar 2022 23:47:07 -0700
Subject: [PATCH 025/299] fix DD ignore bug

---
 fdbcli/StatusCommand.actor.cpp | 4 +++
 fdbserver/DataDistributionQueue.actor.cpp | 1 +
 fdbserver/workloads/ReadWrite.actor.cpp | 31 ++++++++++++-----------
 3 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/fdbcli/StatusCommand.actor.cpp b/fdbcli/StatusCommand.actor.cpp
index ea6bcb5293..9506867cbc 100644
--- a/fdbcli/StatusCommand.actor.cpp
+++ b/fdbcli/StatusCommand.actor.cpp
@@ -1108,6 +1108,10 @@ void printStatus(StatusObjectReader statusObj,
 if (statusObjCluster.has("data_distribution_disabled_for_rebalance")) {
 outputString += "\n\nWARNING: Data distribution is currently turned on but shard size balancing is "
 "currently disabled.";
+ // data_distribution_disabled_hex carries the DDIgnore bitmask; surface it so operators can see which rebalance types are disabled
+ if (statusObjCluster.has("data_distribution_disabled_hex")) {
+ outputString += " Ignore code: " + statusObjCluster["data_distribution_disabled_hex"].get_str();
+ }
 }
 }

diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp
index 5074f9b14e..ced7696618 100644
--- a/fdbserver/DataDistributionQueue.actor.cpp
+++ b/fdbserver/DataDistributionQueue.actor.cpp
@@ -1474,6 +1474,7 @@ ACTOR Future<Void> BgDDMountainChopper(DDQueueData* self, int teamCollectionInde
 if (skipCurrentLoop && !val.present()) {
 // reset loop interval
 rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL;
+ skipCurrentLoop = false;
 } else if (val.present()) {
 if (val.get().size() > 0) {
 int ddIgnore = BinaryReader::fromStringRef<int>(val.get(), Unversioned());

diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp
index bcd84b8522..d9c2b58ba5 100644
--- a/fdbserver/workloads/ReadWrite.actor.cpp
+++ b/fdbserver/workloads/ReadWrite.actor.cpp
@@ -383,14 +383,15 @@ struct ReadWriteWorkload : KVWorkload {
 state IndexRangeVec res;
 state int i = 0;
 for (; i < boundaries.size() - 1; ++i) {
- KeyRangeRef currentShard = KeyRangeRef(boundaries[i], boundaries[i+1]);
+ KeyRangeRef currentShard = KeyRangeRef(boundaries[i], boundaries[i + 1]);
 // std::cout << currentShard.toString() << "\n";
- std::vector<RangeResult> ranges = wait(runRYWTransaction(cx, [currentShard](Reference<ReadYourWritesTransaction> tr) -> Future<std::vector<RangeResult>> {
- std::vector<Future<RangeResult>> f;
- f.push_back(tr->getRange(currentShard, 1, Snapshot::False, Reverse::False));
- f.push_back(tr->getRange(currentShard, 1, Snapshot::False, Reverse::True));
- return getAll(f);
- }));
+ std::vector<RangeResult> ranges = wait(runRYWTransaction(
+ cx, [currentShard](Reference<ReadYourWritesTransaction> tr) -> Future<std::vector<RangeResult>> {
+ std::vector<Future<RangeResult>> f;
+ f.push_back(tr->getRange(currentShard, 1, Snapshot::False, Reverse::False));
+ f.push_back(tr->getRange(currentShard, 1, Snapshot::False, Reverse::True));
+ return getAll(f);
+ }));
 ASSERT(ranges[0].size() == 1 && ranges[1].size() == 1);
 res.emplace_back(self->indexForKey(ranges[0][0].key), self->indexForKey(ranges[1][0].key));
 }
@@ -449,7 +450,7 @@ struct ReadWriteWorkload : KVWorkload {
 beginServers[workloadBegin] = leftServer;
 }
 Standalone<VectorRef<KeyRef>> keyBegins;
- for(auto p = beginServers.begin(); p != beginServers.end(); ++ p) {
+ for (auto p = beginServers.begin(); p != beginServers.end(); ++p) {
 keyBegins.push_back(keyBegins.arena(), p->first);
 }
 // deep copy because wait below will destruct workloadEnd
@@ -461,19 +462,19 @@ struct ReadWriteWorkload : KVWorkload {
 // build self->serverShards, starting from the left shard
 std::map<UID, IndexRangeVec> serverShards;
 int i = 0;
- for(auto p = beginServers.begin(); p != beginServers.end(); ++ p) { - for(int j = 0; j < p->second.size(); ++ j) { + for (auto p = beginServers.begin(); p != beginServers.end(); ++p) { + for (int j = 0; j < p->second.size(); ++j) { serverShards[p->second[j]].emplace_back(indexShards[i]); } - ++ i; + ++i; } // self->serverShards is ordered by UID for (auto it : serverShards) { self->serverShards.emplace_back(it); } - if (self->clientId == 0) { - self->debugPrintServerShards(); - } + // if (self->clientId == 0) { + // self->debugPrintServerShards(); + // } return Void(); } @@ -807,7 +808,7 @@ struct ReadWriteWorkload : KVWorkload { ASSERT(hotServerCount > 0); int begin = currentHotRound * hotServerCount; int idx = deterministicRandom()->randomInt(begin, begin + hotServerCount) % serverShards.size(); - int shardIdx = deterministicRandom()->randomInt(0, serverShards[idx].second.size()); + int shardIdx = deterministicRandom()->randomInt(0, serverShards[idx].second.size()); return deterministicRandom()->randomInt64(serverShards[idx].second[shardIdx].first, serverShards[idx].second[shardIdx].second + 1); } From 10eb082888a6aefbfcddbaae3c902cb79979a621 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 14 Mar 2022 10:49:19 -0700 Subject: [PATCH 026/299] format code; fix DD ignore option bug" --- fdbserver/DataDistributionQueue.actor.cpp | 47 ++++++++++++----------- fdbserver/workloads/BulkSetup.actor.h | 3 +- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index ced7696618..a7f0ad5287 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1449,10 +1449,11 @@ ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionInde state Transaction tr(self->cx); state double lastRead = 0; state bool skipCurrentLoop = false; + state bool disableReadBalance = false; + state bool disableDiskBalance = false; + loop { state bool moved = false; - state bool disableReadBalance = false; - state bool disableDiskBalance = false; state Reference sourceTeam; state Reference destTeam; state GetTeamRequest srcReq; @@ -1471,19 +1472,19 @@ ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionInde tr.setOption(FDBTransactionOptions::LOCK_AWARE); Optional val = wait(tr.get(rebalanceDDIgnoreKey)); lastRead = now(); - if (skipCurrentLoop && !val.present()) { + if (!val.present()) { // reset loop interval - rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; + if (skipCurrentLoop) { + rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; + } skipCurrentLoop = false; - } else if (val.present()) { + disableReadBalance = false; + disableDiskBalance = false; + } else { if (val.get().size() > 0) { int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); - if (ddIgnore & DDIgnore::REBALANCE_DISK) { - disableDiskBalance = true; - } - if (ddIgnore & DDIgnore::REBALANCE_READ) { - disableReadBalance = true; - } + disableDiskBalance = (ddIgnore & DDIgnore::REBALANCE_DISK) > 0; + disableReadBalance = (ddIgnore & DDIgnore::REBALANCE_READ) > 0; skipCurrentLoop = disableReadBalance && disableDiskBalance; } else { skipCurrentLoop = true; @@ -1564,11 +1565,11 @@ ACTOR Future BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex) state Transaction tr(self->cx); state double lastRead = 0; state bool skipCurrentLoop = false; + state bool disableReadBalance = false; + state bool disableDiskBalance = false; 
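+ // Hoisted out of the loop (mirroring the BgDDMountainChopper change above) so the disable
+ // flags persist across iterations instead of being re-initialized before every poll of
+ // rebalanceDDIgnoreKey; they are now cleared explicitly once the key is found absent.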
loop { state bool moved = false; - state bool disableReadBalance = false; - state bool disableDiskBalance = false; state Reference sourceTeam; state Reference destTeam; state GetTeamRequest srcReq; @@ -1587,19 +1588,19 @@ ACTOR Future BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex) tr.setOption(FDBTransactionOptions::LOCK_AWARE); Optional val = wait(tr.get(rebalanceDDIgnoreKey)); lastRead = now(); - if (skipCurrentLoop && !val.present()) { + if (!val.present()) { // reset loop interval - rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; + if (skipCurrentLoop) { + rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; + } + skipCurrentLoop = false; + disableReadBalance = false; + disableDiskBalance = false; } else if (val.present()) { - // FIXME: better way for upgrade? for example, using a new key to record mode - if (val.get().size() > sizeof(int)) { + if (val.get().size() > 0) { int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); - if (ddIgnore & DDIgnore::REBALANCE_DISK) { - disableDiskBalance = true; - } - if (ddIgnore & DDIgnore::REBALANCE_READ) { - disableReadBalance = true; - } + disableDiskBalance = (ddIgnore & DDIgnore::REBALANCE_DISK) > 0; + disableReadBalance = (ddIgnore & DDIgnore::REBALANCE_READ) > 0; skipCurrentLoop = disableReadBalance && disableDiskBalance; } else { skipCurrentLoop = true; diff --git a/fdbserver/workloads/BulkSetup.actor.h b/fdbserver/workloads/BulkSetup.actor.h index 039e99a9f2..fc7db9cf05 100644 --- a/fdbserver/workloads/BulkSetup.actor.h +++ b/fdbserver/workloads/BulkSetup.actor.h @@ -294,7 +294,8 @@ Future bulkSetup(Database cx, // Here we wait for data in flight to go to 0 (this will not work on a database with other users) if (postSetupWarming != 0) { try { - wait(delay(5.0) >> waitForLowInFlight(cx, workload)); // Wait for the data distribution in a small test to start + wait(delay(5.0) >> + waitForLowInFlight(cx, workload)); // Wait for the data distribution in a small test to start } catch (Error& e) { if (e.code() == error_code_actor_cancelled) throw; From f14884de86c080750c4e6e753398f23741a19727 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 18 Mar 2022 14:21:07 -0700 Subject: [PATCH 027/299] update ReadWrite to control server overlap --- fdbserver/workloads/ReadWrite.actor.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp index d9c2b58ba5..74c33009a8 100644 --- a/fdbserver/workloads/ReadWrite.actor.cpp +++ b/fdbserver/workloads/ReadWrite.actor.cpp @@ -112,8 +112,9 @@ struct ReadWriteWorkload : KVWorkload { double hotKeyFraction, forceHotProbability = 0; // key based hot traffic setting // server based hot traffic setting int skewRound = 0; // skewDuration = ceil(testDuration / skewRound) - double hotServerFraction = 0; // set > 0 to issue hot key based on shard map + double hotServerFraction = 0, hotServerShardFraction = 1.0; // set > 0 to issue hot key based on shard map double hotServerReadFrac, hotServerWriteFrac; // hot many traffic goes to hot servers + double hotReadWriteServerOverlap; // the portion of intersection of write and hot server // hot server state typedef std::vector> IndexRangeVec; @@ -231,6 +232,8 @@ struct ReadWriteWorkload : KVWorkload { // of hot keys, else it is directed to a disjoint set of cold keys hotKeyFraction = getOption(options, "hotKeyFraction"_sr, 0.0); hotServerFraction = getOption(options, "hotServerFraction"_sr, 
0.0); + hotServerShardFraction = getOption(options, "hotServerShardFraction"_sr, 1.0); + hotReadWriteServerOverlap = getOption(options, "hotReadWriteServerOverlap"_sr, 0.0); skewRound = getOption(options, "skewRound"_sr, 0); hotServerReadFrac = getOption(options, "hotServerReadFrac"_sr, 0.8); hotServerWriteFrac = getOption(options, "hotServerWriteFrac"_sr, 0.0); @@ -804,11 +807,16 @@ struct ReadWriteWorkload : KVWorkload { std::cout << "]\n"; } - int64_t getRandomKeyFromHotServer() { + int64_t getRandomKeyFromHotServer(bool hotServerRead = true) { ASSERT(hotServerCount > 0); int begin = currentHotRound * hotServerCount; + if (!hotServerRead) { + begin += hotServerCount * (1.0 - hotReadWriteServerOverlap); // calculate non-overlap part offset + } int idx = deterministicRandom()->randomInt(begin, begin + hotServerCount) % serverShards.size(); - int shardIdx = deterministicRandom()->randomInt(0, serverShards[idx].second.size()); + int shardMax = std::min(serverShards[idx].second.size(), + (size_t)ceil(serverShards[idx].second.size() * hotServerShardFraction)); + int shardIdx = deterministicRandom()->randomInt(0, shardMax); return deterministicRandom()->randomInt64(serverShards[idx].second[shardIdx].first, serverShards[idx].second[shardIdx].second + 1); } @@ -820,7 +828,7 @@ struct ReadWriteWorkload : KVWorkload { hotKeyFraction; // spread hot keys over keyspace } else if (hotServerFraction > 0) { if ((hotServerRead && random < hotServerReadFrac) || (!hotServerRead && random < hotServerWriteFrac)) { - return getRandomKeyFromHotServer(); + return getRandomKeyFromHotServer(hotServerRead); } } return deterministicRandom()->randomInt64(0, nodeCount); From 4b92f8f546974e2c490eddc687f1ed71bf1ed806 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 18 Mar 2022 16:39:31 -0700 Subject: [PATCH 028/299] add relocate reason and set teamSorter in relocator --- fdbserver/DataDistribution.actor.cpp | 6 ++-- fdbserver/DataDistribution.actor.h | 9 ++++-- fdbserver/DataDistributionQueue.actor.cpp | 34 ++++++++++++--------- fdbserver/DataDistributionTracker.actor.cpp | 6 ++-- 4 files changed, 33 insertions(+), 22 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index a012445d52..8dab99d618 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -699,8 +699,10 @@ ACTOR Future dataDistribution(Reference self, if (!unhealthy && configuration.usableRegions > 1) { unhealthy = initData->shards[shard].remoteSrc.size() != configuration.storageTeamSize; } - output.send(RelocateShard( - keys, unhealthy ? SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY : SERVER_KNOBS->PRIORITY_RECOVER_MOVE)); + output.send(RelocateShard(keys, + unhealthy ? SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY + : SERVER_KNOBS->PRIORITY_RECOVER_MOVE, + RelocateReason::OTHER)); } wait(yield(TaskPriority::DataDistribution)); } diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index bfdbfafebf..dbd681028e 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -32,12 +32,15 @@ #include "fdbclient/RunTransaction.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. 
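+// Tags why a shard relocation was requested. OTHER covers recovery moves, splits, and merges;
+// the two REBALANCE_* reasons let the relocator pick a destination-team comparator that matches
+// the balancing goal (disk bytes vs. read bandwidth), as wired up in DataDistributionQueue below.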
+enum class RelocateReason { INVALID = -1, OTHER, REBALANCE_DISK, REBALANCE_READ }; + struct RelocateShard { KeyRange keys; int priority; - - RelocateShard() : priority(0) {} - RelocateShard(KeyRange const& keys, int priority) : keys(keys), priority(priority) {} + RelocateReason reason; + RelocateShard() : priority(0), reason(RelocateReason::INVALID) {} + RelocateShard(KeyRange const& keys, int priority, RelocateReason reason) + : keys(keys), priority(priority), reason(reason) {} }; struct IDataDistributionTeam { diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index a7f0ad5287..01806c144d 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -42,6 +42,7 @@ struct RelocateData { int priority; int boundaryPriority; int healthPriority; + RelocateReason reason; double startTime; UID randomId; @@ -52,11 +53,11 @@ struct RelocateData { TraceInterval interval; RelocateData() - : priority(-1), boundaryPriority(-1), healthPriority(-1), startTime(-1), workFactor(0), wantsNewServers(false), - interval("QueuedRelocation") {} + : priority(-1), boundaryPriority(-1), healthPriority(-1), reason(RelocateReason::INVALID), startTime(-1), + workFactor(0), wantsNewServers(false), interval("QueuedRelocation") {} explicit RelocateData(RelocateShard const& rs) : keys(rs.keys), priority(rs.priority), boundaryPriority(isBoundaryPriority(rs.priority) ? rs.priority : -1), - healthPriority(isHealthPriority(rs.priority) ? rs.priority : -1), startTime(now()), + healthPriority(isHealthPriority(rs.priority) ? rs.priority : -1), reason(rs.reason), startTime(now()), randomId(deterministicRandom()->randomUniqueID()), workFactor(0), wantsNewServers(rs.priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM || rs.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM || @@ -960,6 +961,16 @@ struct DDQueueData { } }; +// return true if a.readload > b.readload +bool greaterReadLoad(Reference a, Reference b) { + return a->getLoadReadBandwidth() > b->getLoadReadBandwidth(); +} + +// return true if a.readload < b.readload +bool lessReadLoad(Reference a, Reference b) { + return a->getLoadReadBandwidth() < b->getLoadReadBandwidth(); +} + // This actor relocates the specified keys to a good place. // The inFlightActor key range map stores the actor for each RelocateData ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, const DDEnabledState* ddEnabledState) { @@ -1028,6 +1039,10 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, inflightPenalty); req.src = rd.src; req.completeSources = rd.completeSources; + + if(rd.reason == RelocateReason::REBALANCE_READ) { + req.teamSorter = greaterReadLoad; + } // bestTeam.second = false if the bestTeam in the teamCollection (in the DC) does not have any // server that hosts the relocateData. This is possible, for example, in a fearless configuration // when the remote DC is just brought up. 
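 // Note on the sorter above: per the teamSorter contract in DataDistribution.actor.h (true when
 // a.score < b.score, with the reply choosing the largest), greaterReadLoad ranks a team "smaller"
 // when it already serves more read bandwidth, so REBALANCE_READ moves land on the candidate
 // team with the least read load.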
@@ -1322,7 +1337,7 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary)); for (int i = 0; i < shards.size(); i++) { if (metrics.keys == shards[i]) { - self->output.send(RelocateShard(metrics.keys.get(), priority)); + self->output.send(RelocateShard(metrics.keys.get(), priority, RelocateReason::REBALANCE_READ)); return true; } } @@ -1395,7 +1410,7 @@ ACTOR Future rebalanceTeams(DDQueueData* self, ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary)); for (int i = 0; i < shards.size(); i++) { if (moveShard == shards[i]) { - self->output.send(RelocateShard(moveShard, priority)); + self->output.send(RelocateShard(moveShard, priority, RelocateReason::REBALANCE_DISK)); return true; } } @@ -1434,15 +1449,6 @@ ACTOR Future getSrcDestTeams(DDQueueData* self, return Void(); } -// return true if a.readload > b.readload -bool greaterReadLoad(Reference a, Reference b) { - return a->getLoadReadBandwidth() > b->getLoadReadBandwidth(); -} -// return true if a.readload < b.readload -bool lessReadLoad(Reference a, Reference b) { - return a->getLoadReadBandwidth() < b->getLoadReadBandwidth(); -} - ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionIndex) { state double rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; state int resetCount = SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT; diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index 43c154ceb2..e03f043bf5 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -525,12 +525,12 @@ ACTOR Future shardSplitter(DataDistributionTracker* self, for (int i = 0; i < skipRange; i++) { KeyRangeRef r(splitKeys[i], splitKeys[i + 1]); self->shardsAffectedByTeamFailure->defineShard(r); - self->output.send(RelocateShard(r, SERVER_KNOBS->PRIORITY_SPLIT_SHARD)); + self->output.send(RelocateShard(r, SERVER_KNOBS->PRIORITY_SPLIT_SHARD, RelocateReason::OTHER)); } for (int i = numShards - 1; i > skipRange; i--) { KeyRangeRef r(splitKeys[i], splitKeys[i + 1]); self->shardsAffectedByTeamFailure->defineShard(r); - self->output.send(RelocateShard(r, SERVER_KNOBS->PRIORITY_SPLIT_SHARD)); + self->output.send(RelocateShard(r, SERVER_KNOBS->PRIORITY_SPLIT_SHARD, RelocateReason::OTHER)); } self->sizeChanges.add(changeSizes(self, keys, shardSize->get().get().metrics.bytes)); @@ -676,7 +676,7 @@ Future shardMerger(DataDistributionTracker* self, } restartShardTrackers(self, mergeRange, ShardMetrics(endingStats, lastLowBandwidthStartTime, shardCount)); self->shardsAffectedByTeamFailure->defineShard(mergeRange); - self->output.send(RelocateShard(mergeRange, SERVER_KNOBS->PRIORITY_MERGE_SHARD)); + self->output.send(RelocateShard(mergeRange, SERVER_KNOBS->PRIORITY_MERGE_SHARD, RelocateReason::OTHER)); // We are about to be cancelled by the call to restartShardTrackers return Void(); From 754f5f2c3a234a92f4d15c308c155b34d176c720 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 21 Mar 2022 15:46:06 -0700 Subject: [PATCH 029/299] set team too similar criteria for read rebalance --- fdbserver/DataDistributionQueue.actor.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 01806c144d..e1cbeca27e 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1040,7 +1040,7 @@ ACTOR Future 
dataDistributionRelocator(DDQueueData* self, RelocateData rd, req.src = rd.src; req.completeSources = rd.completeSources; - if(rd.reason == RelocateReason::REBALANCE_READ) { + if (rd.reason == RelocateReason::REBALANCE_READ) { req.teamSorter = greaterReadLoad; } // bestTeam.second = false if the bestTeam in the teamCollection (in the DC) does not have any @@ -1329,9 +1329,14 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, return a.bytesReadPerKSecond / std::max(a.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES) < b.bytesReadPerKSecond / std::max(b.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES); }; - state StorageMetrics metrics = wait(brokenPromiseToNever(self->getShardMetrics.getReply(req))); if (metrics.keys.present() && metrics.bytes > 0) { + auto srcLoad = sourceTeam->getLoadReadBandwidth(), destLoad = destTeam->getLoadReadBandwidth(); + if (abs(srcLoad - destLoad) <= + 5 * std::max(metrics.bytesReadPerKSecond, SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS)) { + traceEvent->detail("SkipReason", "TeamTooSimilar"); + return false; + } // Verify the shard is still in ShardsAffectedByTeamFailure shards = self->shardsAffectedByTeamFailure->getShardsFor( ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary)); From 0186e00256992bc726bc211d4adc927b1d5c0263 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 21 Mar 2022 15:46:06 -0700 Subject: [PATCH 030/299] set team too similar criteria for read rebalance --- fdbserver/DataDistributionQueue.actor.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 01806c144d..262a9149bf 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1040,7 +1040,7 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, req.src = rd.src; req.completeSources = rd.completeSources; - if(rd.reason == RelocateReason::REBALANCE_READ) { + if (rd.reason == RelocateReason::REBALANCE_READ) { req.teamSorter = greaterReadLoad; } // bestTeam.second = false if the bestTeam in the teamCollection (in the DC) does not have any @@ -1323,15 +1323,20 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, traceEvent->detail("SkipReason", "NoShardOnSource"); return false; } - + // state Future healthMetrics = self->cx->getHealthMetrics(true); state GetMetricsRequest req(shards); req.comparator = [](const StorageMetrics& a, const StorageMetrics& b) { return a.bytesReadPerKSecond / std::max(a.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES) < b.bytesReadPerKSecond / std::max(b.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES); }; - state StorageMetrics metrics = wait(brokenPromiseToNever(self->getShardMetrics.getReply(req))); if (metrics.keys.present() && metrics.bytes > 0) { + auto srcLoad = sourceTeam->getLoadReadBandwidth(), destLoad = destTeam->getLoadReadBandwidth(); + if (abs(srcLoad - destLoad) <= + 3 * std::max(metrics.bytesReadPerKSecond, SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS)) { + traceEvent->detail("SkipReason", "TeamTooSimilar"); + return false; + } // Verify the shard is still in ShardsAffectedByTeamFailure shards = self->shardsAffectedByTeamFailure->getShardsFor( ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary)); From 746778fa0ac9ec3e3979898b2f2aa5ab115b1544 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 23 Mar 2022 11:18:58 -0700 Subject: [PATCH 031/299] CPU reading --- 
fdbserver/DataDistributionQueue.actor.cpp | 32 ++++++++++++++++------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 262a9149bf..de13ca653f 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1040,9 +1040,9 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, req.src = rd.src; req.completeSources = rd.completeSources; - if (rd.reason == RelocateReason::REBALANCE_READ) { - req.teamSorter = greaterReadLoad; - } + // if (rd.reason == RelocateReason::REBALANCE_READ) { + // req.teamSorter = greaterReadLoad; + // } // bestTeam.second = false if the bestTeam in the teamCollection (in the DC) does not have any // server that hosts the relocateData. This is possible, for example, in a fearless configuration // when the remote DC is just brought up. @@ -1304,6 +1304,13 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, } } +inline double getWorstCpu(const HealthMetrics& metrics) { + double cpu = 0; + for (auto p : metrics.storageStats) { + cpu = std::max(cpu, p.second.cpuUsage); + } + return cpu; +} // Move the shard with highest read density of sourceTeam's to destTeam if sourceTeam has much more read load than // destTeam ACTOR Future rebalanceReadLoad(DDQueueData* self, @@ -1323,20 +1330,25 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, traceEvent->detail("SkipReason", "NoShardOnSource"); return false; } - // state Future healthMetrics = self->cx->getHealthMetrics(true); + state Future healthMetrics = self->cx->getHealthMetrics(true); state GetMetricsRequest req(shards); req.comparator = [](const StorageMetrics& a, const StorageMetrics& b) { return a.bytesReadPerKSecond / std::max(a.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES) < b.bytesReadPerKSecond / std::max(b.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES); }; state StorageMetrics metrics = wait(brokenPromiseToNever(self->getShardMetrics.getReply(req))); + wait(ready(healthMetrics)); + if (getWorstCpu(healthMetrics.get()) < 25.0) { // 25% + traceEvent->detail("SkipReason", "LowReadLoad"); + return false; + } if (metrics.keys.present() && metrics.bytes > 0) { - auto srcLoad = sourceTeam->getLoadReadBandwidth(), destLoad = destTeam->getLoadReadBandwidth(); - if (abs(srcLoad - destLoad) <= - 3 * std::max(metrics.bytesReadPerKSecond, SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS)) { - traceEvent->detail("SkipReason", "TeamTooSimilar"); - return false; - } +// auto srcLoad = sourceTeam->getLoadReadBandwidth(), destLoad = destTeam->getLoadReadBandwidth(); +// if (abs(srcLoad - destLoad) <= +// 3 * std::max(metrics.bytesReadPerKSecond, SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS)) { +// traceEvent->detail("SkipReason", "TeamTooSimilar"); +// return false; +// } // Verify the shard is still in ShardsAffectedByTeamFailure shards = self->shardsAffectedByTeamFailure->getShardsFor( ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary)); From e6893ba0b613c4bb15f9e8afb2c96b5fa83b5039 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 24 Mar 2022 13:09:39 -0700 Subject: [PATCH 032/299] change unit test to cover edge case --- fdbserver/DDTeamCollection.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 627b7549db..6cceef448f 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ 
b/fdbserver/DDTeamCollection.actor.cpp @@ -5659,8 +5659,8 @@ public: GetStorageMetricsReply mid_read; mid_read.capacity.bytes = 1000 * 1024 * 1024; mid_read.available.bytes = 400 * 1024 * 1024; - mid_read.load.bytes = 200 * 1024 * 1024; - mid_read.load.bytesReadPerKSecond = 200 * 1024 * 1024; + mid_read.load.bytes = 800 * 1024 * 1024; // high load bytes + mid_read.load.bytesReadPerKSecond = 200 * 1024 * 1024; // low read load GetStorageMetricsReply high_read; high_read.capacity.bytes = 1000 * 1024 * 1024; From b811a62b659de01c029695d71c331f28777cee5f Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 24 Mar 2022 13:16:10 -0700 Subject: [PATCH 033/299] fix teamSorter usage bug --- fdbclient/StorageServerInterface.h | 6 ++--- fdbserver/DDTeamCollection.actor.cpp | 8 +++---- fdbserver/DataDistribution.actor.h | 11 ++++----- fdbserver/DataDistributionQueue.actor.cpp | 27 ++++++++++++----------- 4 files changed, 27 insertions(+), 25 deletions(-) diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h index 32a992fa3d..623272a38d 100644 --- a/fdbclient/StorageServerInterface.h +++ b/fdbclient/StorageServerInterface.h @@ -821,9 +821,9 @@ struct ChangeFeedVersionUpdateRequest { struct GetStorageMetricsReply { constexpr static FileIdentifier file_identifier = 15491478; - StorageMetrics load; - StorageMetrics available; - StorageMetrics capacity; + StorageMetrics load; // sum of key-value metrics (logical bytes) + StorageMetrics available; // physical bytes + StorageMetrics capacity; // physical bytes double bytesInputRate; int64_t versionLag; double lastUpdate; diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 6cceef448f..d892d373d9 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -250,8 +250,8 @@ public: self->shardsAffectedByTeamFailure->hasShards(ShardsAffectedByTeamFailure::Team( self->teams[currentIndex]->getServerIDs(), self->primary))) && // sort conditions - (!bestOption.present() || req.lessCompare(bestOption.get(), self->teams[currentIndex]) || - !req.lessCompareByLoad(loadBytes, bestLoadBytes))) { + (!bestOption.present() || + req.lessCompare(bestOption.get(), self->teams[currentIndex], bestLoadBytes, loadBytes))) { bestLoadBytes = loadBytes; bestOption = self->teams[currentIndex]; bestIndex = currentIndex; @@ -299,8 +299,8 @@ public: for (int i = 0; i < randomTeams.size(); i++) { int64_t loadBytes = randomTeams[i]->getLoadBytes(true, req.inflightPenalty); - if (!bestOption.present() || req.lessCompare(bestOption.get(), randomTeams[i]) || - !req.lessCompareByLoad(loadBytes, bestLoadBytes)) { + if (!bestOption.present() || + req.lessCompare(bestOption.get(), randomTeams[i], bestLoadBytes, loadBytes)) { bestLoadBytes = loadBytes; bestOption = randomTeams[i]; } diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index dbd681028e..a5c3d61b85 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -106,16 +106,17 @@ struct GetTeamRequest { teamMustHaveShards(teamMustHaveShards), inflightPenalty(inflightPenalty) {} // return true if a.score < b.score - [[nodiscard]] bool lessCompare(TeamRef a, TeamRef b) const { + [[nodiscard]] bool lessCompare(TeamRef a, TeamRef b, int64_t aLoadBytes, int64_t bLoadBytes) const { if (teamSorter) { return teamSorter(a, b); } - return false; + return lessCompareByLoad(aLoadBytes, bLoadBytes); } - // return true if scoreWithLoadBytes < bestScoreWithBestLoadBytes - bool 
lessCompareByLoad(int64_t loadBytes, int64_t bestLoadBytes) const { - bool lessLoad = loadBytes < bestLoadBytes; + // return true if preferHigherUtil && aLoadBytes <= bLoadBytes (higher load bytes has larger score) + // or preferLowerUtil && aLoadBytes > bLoadBytes + bool lessCompareByLoad(int64_t aLoadBytes, int64_t bLoadBytes) const { + bool lessLoad = aLoadBytes <= bLoadBytes; return preferLowerUtilization ? !lessLoad : lessLoad; } diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index de13ca653f..136fa3c093 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -86,9 +86,10 @@ struct RelocateData { bool operator==(const RelocateData& rhs) const { return priority == rhs.priority && boundaryPriority == rhs.boundaryPriority && - healthPriority == rhs.healthPriority && keys == rhs.keys && startTime == rhs.startTime && - workFactor == rhs.workFactor && src == rhs.src && completeSources == rhs.completeSources && - wantsNewServers == rhs.wantsNewServers && randomId == rhs.randomId; + healthPriority == rhs.healthPriority && reason == rhs.reason && keys == rhs.keys && + startTime == rhs.startTime && workFactor == rhs.workFactor && src == rhs.src && + completeSources == rhs.completeSources && wantsNewServers == rhs.wantsNewServers && + randomId == rhs.randomId; } bool operator!=(const RelocateData& rhs) const { return !(*this == rhs); } }; @@ -1040,9 +1041,9 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, req.src = rd.src; req.completeSources = rd.completeSources; - // if (rd.reason == RelocateReason::REBALANCE_READ) { - // req.teamSorter = greaterReadLoad; - // } + if (rd.reason == RelocateReason::REBALANCE_READ) { + req.teamSorter = greaterReadLoad; + } // bestTeam.second = false if the bestTeam in the teamCollection (in the DC) does not have any // server that hosts the relocateData. This is possible, for example, in a fearless configuration // when the remote DC is just brought up. 
@@ -1343,13 +1344,13 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, return false; } if (metrics.keys.present() && metrics.bytes > 0) { -// auto srcLoad = sourceTeam->getLoadReadBandwidth(), destLoad = destTeam->getLoadReadBandwidth(); -// if (abs(srcLoad - destLoad) <= -// 3 * std::max(metrics.bytesReadPerKSecond, SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS)) { -// traceEvent->detail("SkipReason", "TeamTooSimilar"); -// return false; -// } - // Verify the shard is still in ShardsAffectedByTeamFailure + // auto srcLoad = sourceTeam->getLoadReadBandwidth(), destLoad = destTeam->getLoadReadBandwidth(); + // if (abs(srcLoad - destLoad) <= + // 3 * std::max(metrics.bytesReadPerKSecond, SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS)) { + // traceEvent->detail("SkipReason", "TeamTooSimilar"); + // return false; + // } + // Verify the shard is still in ShardsAffectedByTeamFailure shards = self->shardsAffectedByTeamFailure->getShardsFor( ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary)); for (int i = 0; i < shards.size(); i++) { From cd5ad654ae8a435847f791aadca6a6158df8c5d1 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 24 Mar 2022 16:18:18 -0700 Subject: [PATCH 034/299] unittest --- fdbclient/FDBTypes.h | 8 +++--- fdbserver/DDTeamCollection.actor.cpp | 42 +++++++++++++--------------- flow/genericactors.actor.h | 8 ++++++ 3 files changed, 32 insertions(+), 26 deletions(-) diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index 8a9bfe6ac0..2004f3b998 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -1014,10 +1014,10 @@ class Database; struct HealthMetrics { struct StorageStats { - int64_t storageQueue; - int64_t storageDurabilityLag; - double diskUsage; - double cpuUsage; + int64_t storageQueue = 0; + int64_t storageDurabilityLag = 0; + double diskUsage = 0.0; + double cpuUsage = 0.0; bool operator==(StorageStats const& r) const { return ((storageQueue == r.storageQueue) && (storageDurabilityLag == r.storageDurabilityLag) && diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index d892d373d9..1ed68c131e 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -5654,35 +5654,33 @@ public: Reference policy = Reference( new PolicyAcross(3, "zoneid", Reference(new PolicyOne()))); state int processSize = 5; - state int teamSize = 3; + state int teamSize = 1; state std::unique_ptr collection = testTeamCollection(teamSize, policy, processSize); - GetStorageMetricsReply mid_read; - mid_read.capacity.bytes = 1000 * 1024 * 1024; - mid_read.available.bytes = 400 * 1024 * 1024; - mid_read.load.bytes = 800 * 1024 * 1024; // high load bytes - mid_read.load.bytesReadPerKSecond = 200 * 1024 * 1024; // low read load - - GetStorageMetricsReply high_read; - high_read.capacity.bytes = 1000 * 1024 * 1024; - high_read.available.bytes = 800 * 1024 * 1024; - high_read.load.bytesReadPerKSecond = 800 * 1024 * 1024; - high_read.load.bytes = 400 * 1024 * 1024; - - collection->addTeam(std::set({ UID(1, 0), UID(2, 0), UID(3, 0) }), true); - collection->addTeam(std::set({ UID(2, 0), UID(3, 0), UID(4, 0) }), true); collection->disableBuildingTeams(); collection->setCheckTeamDelay(); + int64_t capacity = 1000 * 1024 * 1024; + std::vector read_bandwidths{ + 100 * 1024 * 1024, 300 * 1024 * 1024, 500 * 1024 * 1024, 700 * 1024 * 1024, 900 * 1024 * 1024 + }; + std::vector load_bytes{ + 50 * 1024 * 1024, 200 * 1024 * 1024, 400 * 1024 * 1024, 600 * 1024 * 1024, 800 * 1024 * 1024 + }; + 
GetStorageMetricsReply metrics[5]; + for (int i = 0; i < 5; ++i) { + metrics[i].capacity.bytes = capacity; + metrics[i].available.bytes = deterministicRandom()->randomChoice(load_bytes); + metrics[i].load.bytesReadPerKSecond = read_bandwidths[i]; + metrics[i].load.bytes = deterministicRandom()->randomChoice(load_bytes); + collection->addTeam(std::set({ UID(i + 1, 0) }), true); + collection->server_info[UID(i + 1, 0)]->setServerMetrics(metrics[i]); + } + /* * Among server teams that have healthy space available, pick the team that is * least utilized, if the caller says they preferLowerUtilization. */ - collection->server_info[UID(1, 0)]->setServerMetrics(mid_read); - collection->server_info[UID(2, 0)]->setServerMetrics(mid_read); - collection->server_info[UID(3, 0)]->setServerMetrics(mid_read); - collection->server_info[UID(4, 0)]->setServerMetrics(high_read); - bool wantsNewServers = true; bool wantsTrueBest = true; bool preferLowerUtilization = true; @@ -5699,7 +5697,7 @@ public: std::pair>, bool> resTeam = req.reply.getFuture().get(); - std::set expectedServers{ UID(1, 0), UID(2, 0), UID(3, 0) }; + std::set expectedServers{ UID(1, 0) }; ASSERT(resTeam.first.present()); auto servers = resTeam.first.get()->getServerIDs(); const std::set selectedServers(servers.begin(), servers.end()); @@ -5769,6 +5767,6 @@ TEST_CASE("/DataDistribution/GetTeam/ServerUtilizationNearCutoff") { return Void(); } TEST_CASE("/DataDistribution/GetTeam/TrueBestLeastReadBandwidth") { - wait(DDTeamCollectionUnitTest::GetTeam_TrueBestLeastReadBandwidth()); + wait(timeout(recurring(DDTeamCollectionUnitTest::GetTeam_TrueBestLeastReadBandwidth(), 0.1), 10)); return Void(); } diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 8d1465b756..84abded131 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -228,6 +228,14 @@ Future recurring(Func what, double interval, TaskPriority taskID = TaskPri } } +// run what every interval sec +ACTOR Future recurring(Future what, double interval, TaskPriority taskID = TaskPriority::DefaultDelay) { + loop { + wait(what); + wait(delay(interval)); + } +} + ACTOR template Future trigger(Func what, Future signal) { wait(signal); From 510a1875d15aef989bb7063d3b118b108eb6090e Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 24 Mar 2022 18:38:50 -0700 Subject: [PATCH 035/299] random unit test --- fdbserver/DDTeamCollection.actor.cpp | 15 +++++++++------ fdbserver/TCInfo.actor.cpp | 4 ++-- fdbserver/TCInfo.h | 4 ++-- flow/genericactors.actor.cpp | 7 +++++++ flow/genericactors.actor.h | 7 +------ 5 files changed, 21 insertions(+), 16 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 26f92e1d4d..38f2f29376 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -5656,14 +5656,13 @@ public: ACTOR static Future GetTeam_TrueBestLeastReadBandwidth() { Reference policy = Reference( - new PolicyAcross(3, "zoneid", Reference(new PolicyOne()))); + new PolicyAcross(1, "zoneid", Reference(new PolicyOne()))); state int processSize = 5; state int teamSize = 1; state std::unique_ptr collection = testTeamCollection(teamSize, policy, processSize); - collection->disableBuildingTeams(); - collection->setCheckTeamDelay(); - int64_t capacity = 1000 * 1024 * 1024; + + int64_t capacity = 1000 * 1024 * 1024, available = 800*1024*1024; std::vector read_bandwidths{ 100 * 1024 * 1024, 300 * 1024 * 1024, 500 * 1024 * 1024, 700 * 1024 * 1024, 900 * 1024 * 1024 }; @@ -5673,12 
+5672,16 @@ public: GetStorageMetricsReply metrics[5]; for (int i = 0; i < 5; ++i) { metrics[i].capacity.bytes = capacity; - metrics[i].available.bytes = deterministicRandom()->randomChoice(load_bytes); + metrics[i].available.bytes = available; metrics[i].load.bytesReadPerKSecond = read_bandwidths[i]; metrics[i].load.bytes = deterministicRandom()->randomChoice(load_bytes); collection->addTeam(std::set({ UID(i + 1, 0) }), true); collection->server_info[UID(i + 1, 0)]->setMetrics(metrics[i]); } + + collection->disableBuildingTeams(); + collection->setCheckTeamDelay(); + bool wantsNewServers = true; bool wantsTrueBest = true; bool preferLowerUtilization = true; @@ -5815,7 +5818,7 @@ TEST_CASE("/DataDistribution/GetTeam/ServerUtilizationNearCutoff") { return Void(); } TEST_CASE("/DataDistribution/GetTeam/TrueBestLeastReadBandwidth") { - wait(timeout(recurring(DDTeamCollectionUnitTest::GetTeam_TrueBestLeastReadBandwidth(), 0.1), 10)); + Optional res = wait(timeout(recurringFuture(DDTeamCollectionUnitTest::GetTeam_TrueBestLeastReadBandwidth(), 0.1), 10)); return Void(); } diff --git a/fdbserver/TCInfo.actor.cpp b/fdbserver/TCInfo.actor.cpp index dce36189b6..78db521bb7 100644 --- a/fdbserver/TCInfo.actor.cpp +++ b/fdbserver/TCInfo.actor.cpp @@ -378,8 +378,8 @@ double TCTeamInfo::getLoadReadBandwidth() const { double sum = 0; int size = 0; for (const auto& server : servers) { - if (server->serverMetricsPresent()) { - auto& replyValue = server->getServerMetrics(); + if (server->metricsPresent()) { + auto& replyValue = server->getMetrics(); ASSERT(replyValue.load.bytesReadPerKSecond >= 0); sum += replyValue.load.bytesReadPerKSecond; size += 1; diff --git a/fdbserver/TCInfo.h b/fdbserver/TCInfo.h index 961d563d12..111c7a427e 100644 --- a/fdbserver/TCInfo.h +++ b/fdbserver/TCInfo.h @@ -48,8 +48,6 @@ class TCServerInfo : public ReferenceCounted { std::vector> teams; ErrorOr metrics; - GetStorageMetricsReply const& getMetrics() const { return metrics.get(); } - void setMetrics(GetStorageMetricsReply serverMetrics) { this->metrics = serverMetrics; } void markTeamUnhealthy(int teamIndex); @@ -74,6 +72,8 @@ public: Reference storageServerSet, Version addedVersion = 0); + GetStorageMetricsReply const& getMetrics() const { return metrics.get(); } + UID const& getId() const { return id; } bool isInDesiredDC() const { return inDesiredDC; } void updateInDesiredDC(std::vector> const& includedDCs); diff --git a/flow/genericactors.actor.cpp b/flow/genericactors.actor.cpp index 9b7f906713..ddbe665ea5 100644 --- a/flow/genericactors.actor.cpp +++ b/flow/genericactors.actor.cpp @@ -22,6 +22,13 @@ #include "flow/UnitTest.h" #include "flow/actorcompiler.h" // This must be the last #include. 
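+// Note: this actor waits on a single already-constructed future, not a Func that builds a fresh
+// one each pass; once `what` becomes ready, every subsequent wait(what) returns immediately and
+// the loop degenerates into a plain delay(interval) cycle, so the waited work itself runs once.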
+ACTOR Future recurringFuture(Future what, double interval, TaskPriority taskID) { + loop { + wait(what); + wait(delay(interval)); + } +} + ACTOR Future allTrue(std::vector> all) { state int i = 0; while (i != all.size()) { diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 6bfe774591..a100532c3e 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -229,12 +229,7 @@ Future recurring(Func what, double interval, TaskPriority taskID = TaskPri } // run what every interval sec -ACTOR Future recurring(Future what, double interval, TaskPriority taskID = TaskPriority::DefaultDelay) { - loop { - wait(what); - wait(delay(interval)); - } -} +ACTOR Future recurringFuture(Future what, double interval, TaskPriority taskID = TaskPriority::DefaultDelay); ACTOR template Future trigger(Func what, Future signal) { From 9799b3dc9983b7eb34fa9ff4dd07e2cc0fd4e1cd Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 24 Mar 2022 22:20:58 -0700 Subject: [PATCH 036/299] change teamSorter details --- fdbserver/DDTeamCollection.actor.cpp | 20 +++++++++++--------- fdbserver/DataDistribution.actor.h | 9 +++++---- fdbserver/DataDistributionQueue.actor.cpp | 14 ++++++++------ flow/genericactors.actor.h | 4 +++- 4 files changed, 27 insertions(+), 20 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 46eea173b5..7ad73f17ff 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -5667,21 +5667,20 @@ public: state int teamSize = 1; state std::unique_ptr collection = testTeamCollection(teamSize, policy, processSize); - - int64_t capacity = 1000 * 1024 * 1024, available = 800*1024*1024; + int64_t capacity = 1000 * 1024 * 1024, available = 800 * 1024 * 1024; std::vector read_bandwidths{ - 100 * 1024 * 1024, 300 * 1024 * 1024, 500 * 1024 * 1024, 700 * 1024 * 1024, 900 * 1024 * 1024 + 300 * 1024 * 1024, 100 * 1024 * 1024, 500 * 1024 * 1024, 100 * 1024 * 1024, 900 * 1024 * 1024 }; std::vector load_bytes{ - 50 * 1024 * 1024, 200 * 1024 * 1024, 400 * 1024 * 1024, 600 * 1024 * 1024, 800 * 1024 * 1024 + 50 * 1024 * 1024, 600 * 1024 * 1024, 800 * 1024 * 1024, 200 * 1024 * 1024, 100 * 1024 * 1024 }; GetStorageMetricsReply metrics[5]; for (int i = 0; i < 5; ++i) { metrics[i].capacity.bytes = capacity; metrics[i].available.bytes = available; metrics[i].load.bytesReadPerKSecond = read_bandwidths[i]; - metrics[i].load.bytes = deterministicRandom()->randomChoice(load_bytes); - collection->addTeam(std::set({ UID(i + 1, 0) }), true); + metrics[i].load.bytes = load_bytes[i]; + collection->addTeam(std::set({ UID(i + 1, 0) }), IsInitialTeam::True); collection->server_info[UID(i + 1, 0)]->setMetrics(metrics[i]); } @@ -5697,16 +5696,19 @@ public: state GetTeamRequest req(wantsNewServers, wantsTrueBest, preferLowerUtilization, teamMustHaveShards); req.completeSources = completeSources; req.teamSorter = [](Reference a, Reference b) { - return a->getLoadReadBandwidth() > b->getLoadReadBandwidth(); + auto r1 = a->getLoadReadBandwidth(), r2 = b->getLoadReadBandwidth(); + return r1 == r2 ? 0 : (r1 > r2 ? 
-1 : 1); }; wait(collection->getTeam(req)); std::pair>, bool> resTeam = req.reply.getFuture().get(); - std::set expectedServers{ UID(1, 0) }; + std::set expectedServers{ UID(4, 0) }; ASSERT(resTeam.first.present()); auto servers = resTeam.first.get()->getServerIDs(); const std::set selectedServers(servers.begin(), servers.end()); + // for (auto id : selectedServers) + // std::cout << id.toString() << std::endl; ASSERT(expectedServers == selectedServers); return Void(); @@ -5824,7 +5826,7 @@ TEST_CASE("/DataDistribution/GetTeam/ServerUtilizationNearCutoff") { return Void(); } TEST_CASE("/DataDistribution/GetTeam/TrueBestLeastReadBandwidth") { - Optional res = wait(timeout(recurringFuture(DDTeamCollectionUnitTest::GetTeam_TrueBestLeastReadBandwidth(), 0.1), 10)); + wait(DDTeamCollectionUnitTest::GetTeam_TrueBestLeastReadBandwidth()); return Void(); } diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index 0c7ddc972f..fed968c9b1 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -93,8 +93,8 @@ struct GetTeamRequest { // optional typedef Reference TeamRef; std::function hardConstraint; - std::function - teamSorter; // => true if a.score < b.score, the reply will choose the largest one + std::function + teamSorter; // => -1 if a.score < b.score, 0 if equal, 1 if larger, the reply will choose the largest one GetTeamRequest() {} GetTeamRequest(bool wantsNewServers, @@ -107,10 +107,11 @@ struct GetTeamRequest { // return true if a.score < b.score [[nodiscard]] bool lessCompare(TeamRef a, TeamRef b, int64_t aLoadBytes, int64_t bLoadBytes) const { + int res = 0; if (teamSorter) { - return teamSorter(a, b); + res = teamSorter(a, b); } - return lessCompareByLoad(aLoadBytes, bLoadBytes); + return res == 0 ? lessCompareByLoad(aLoadBytes, bLoadBytes) : res < 0; } // return true if preferHigherUtil && aLoadBytes <= bLoadBytes (higher load bytes has larger score) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 7f19a12786..3105e6d303 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1035,14 +1035,16 @@ struct DDQueueData { } }; -// return true if a.readload > b.readload -bool greaterReadLoad(Reference a, Reference b) { - return a->getLoadReadBandwidth() > b->getLoadReadBandwidth(); +// return -1 if a.readload > b.readload +int greaterReadLoad(Reference a, Reference b) { + auto r1 = a->getLoadReadBandwidth(), r2 = b->getLoadReadBandwidth(); + return r1 == r2 ? 0 : (r1 > r2 ? -1 : 1); } -// return true if a.readload < b.readload -bool lessReadLoad(Reference a, Reference b) { - return a->getLoadReadBandwidth() < b->getLoadReadBandwidth(); +// return -1 if a.readload < b.readload +int lessReadLoad(Reference a, Reference b) { + auto r1 = a->getLoadReadBandwidth(), r2 = b->getLoadReadBandwidth(); + return r1 == r2 ? 0 : (r1 < r2 ? 
-1 : 1); } static std::string destServersString(std::vector, bool>> const& bestTeams) { diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 33199f1753..f4772eec7e 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -229,7 +229,9 @@ Future recurring(Func what, double interval, TaskPriority taskID = TaskPri } // run what every interval sec -ACTOR Future recurringFuture(Future what, double interval, TaskPriority taskID = TaskPriority::DefaultDelay); +ACTOR Future recurringFuture(Future what, + double interval, + TaskPriority taskID = TaskPriority::DefaultDelay); ACTOR template Future trigger(Func what, Future signal) { From 25f757ba28e12f61a85878e81fb1bfa77b5f56bc Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 28 Mar 2022 14:13:44 -0700 Subject: [PATCH 037/299] fix merge conflict bug caused by removal of cancellable(true) --- fdbserver/DataDistributionQueue.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 3105e6d303..9dc9f9448b 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -56,7 +56,7 @@ struct RelocateData { RelocateData() : priority(-1), boundaryPriority(-1), healthPriority(-1), reason(RelocateReason::INVALID), startTime(-1), - workFactor(0), wantsNewServers(false), interval("QueuedRelocation") {} + workFactor(0), wantsNewServers(false), cancellable(false), interval("QueuedRelocation") {} explicit RelocateData(RelocateShard const& rs) : keys(rs.keys), priority(rs.priority), boundaryPriority(isBoundaryPriority(rs.priority) ? rs.priority : -1), healthPriority(isHealthPriority(rs.priority) ? rs.priority : -1), reason(rs.reason), startTime(now()), From b15f0eb2b0370cef25d7fccdd911f9e8bf036a6d Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 28 Mar 2022 14:20:07 -0700 Subject: [PATCH 038/299] add ReadInFlight --- fdbserver/DDTeamCollection.actor.cpp | 2 +- fdbserver/DataDistribution.actor.h | 5 +++-- fdbserver/DataDistributionQueue.actor.cpp | 21 ++++++++++++++------- fdbserver/TCInfo.actor.cpp | 17 +++++++++++++---- fdbserver/TCInfo.h | 15 +++++++++++---- 5 files changed, 42 insertions(+), 18 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 7ad73f17ff..0830a697d9 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -164,7 +164,7 @@ public: } size_t pivot = teamAvailableSpace.size() / 2; - if (teamAvailableSpace.size() >= 1) { + if (teamAvailableSpace.size() > 1) { std::nth_element( teamAvailableSpace.begin(), teamAvailableSpace.begin() + pivot, teamAvailableSpace.end()); self->medianAvailableSpace = diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index fed968c9b1..826acc812c 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -47,10 +47,11 @@ struct IDataDistributionTeam { virtual std::vector getLastKnownServerInterfaces() const = 0; virtual int size() const = 0; virtual std::vector const& getServerIDs() const = 0; - virtual void addDataInFlightToTeam(int64_t delta) = 0; + virtual void addDataInFlightToTeam(int64_t delta, int64_t readDelta = 0) = 0; virtual int64_t getDataInFlightToTeam() const = 0; virtual int64_t getLoadBytes(bool includeInFlight = true, double inflightPenalty = 1.0) const = 0; - virtual double getLoadReadBandwidth() const = 0; + virtual int64_t 
getReadInFlightToTeam() const = 0; + virtual double getLoadReadBandwidth(bool includeInFlight = true) const = 0; virtual int64_t getMinAvailableSpace(bool includeInFlight = true) const = 0; virtual double getMinAvailableSpaceRatio(bool includeInFlight = true) const = 0; virtual bool hasHealthyAvailableSpace(double minRatio) const = 0; diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 9dc9f9448b..fb03872052 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -162,9 +162,9 @@ public: return tempServerIDs; } - void addDataInFlightToTeam(int64_t delta) override { + void addDataInFlightToTeam(int64_t delta, int64_t readDelta = 0) override { for (auto& team : teams) { - team->addDataInFlightToTeam(delta); + team->addDataInFlightToTeam(delta, readDelta); } } @@ -178,8 +178,14 @@ public: }); } - double getLoadReadBandwidth() const override { - return sum([](IDataDistributionTeam const& team) { return team.getLoadReadBandwidth(); }); + int64_t getReadInFlightToTeam() const override { + return sum([](IDataDistributionTeam const& team) { return team.getReadInFlightToTeam(); }); + } + + double getLoadReadBandwidth(bool includeInFlight = true) const override { + return sum([includeInFlight](IDataDistributionTeam const& team) { + return team.getLoadReadBandwidth(includeInFlight); + }); } int64_t getMinAvailableSpace(bool includeInFlight = true) const override { @@ -1262,7 +1268,7 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, self->shardsAffectedByTeamFailure->moveShard(rd.keys, destinationTeams); // FIXME: do not add data in flight to servers that were already in the src. - healthyDestinations.addDataInFlightToTeam(+metrics.bytes); + healthyDestinations.addDataInFlightToTeam(+metrics.bytes, +metrics.bytesReadPerKSecond); launchDest(rd, bestTeams, self->destBusymap); @@ -1280,6 +1286,7 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, } else { TraceEvent(relocateShardInterval.severity, "RelocateShardHasDestination", distributorId) .detail("PairId", relocateShardInterval.pairID) + .detail("Priority", rd.priority) .detail("KeyBegin", rd.keys.begin) .detail("KeyEnd", rd.keys.end) .detail("SourceServers", describe(rd.src)) @@ -1366,7 +1373,7 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, } } - healthyDestinations.addDataInFlightToTeam(-metrics.bytes); + healthyDestinations.addDataInFlightToTeam(-metrics.bytes, -metrics.bytesReadPerKSecond); // onFinished.send( rs ); if (!error.code()) { @@ -1399,7 +1406,7 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, } } else { TEST(true); // move to removed server - healthyDestinations.addDataInFlightToTeam(-metrics.bytes); + healthyDestinations.addDataInFlightToTeam(-metrics.bytes, -metrics.bytesReadPerKSecond); wait(delay(SERVER_KNOBS->RETRY_RELOCATESHARD_DELAY, TaskPriority::DataDistributionLaunch)); } } diff --git a/fdbserver/TCInfo.actor.cpp b/fdbserver/TCInfo.actor.cpp index 78db521bb7..e124acf4d2 100644 --- a/fdbserver/TCInfo.actor.cpp +++ b/fdbserver/TCInfo.actor.cpp @@ -341,9 +341,9 @@ std::string TCTeamInfo::getServerIDsStr() const { return std::move(ss).str(); } -void TCTeamInfo::addDataInFlightToTeam(int64_t delta) { +void TCTeamInfo::addDataInFlightToTeam(int64_t delta, int64_t readDelta) { for (int i = 0; i < servers.size(); i++) - servers[i]->incrementDataInFlightToServer(delta); + servers[i]->incrementDataInFlightToServer(delta, 
readDelta); } int64_t TCTeamInfo::getDataInFlightToTeam() const { @@ -354,6 +354,14 @@ int64_t TCTeamInfo::getDataInFlightToTeam() const { return dataInFlight; } +int64_t TCTeamInfo::getReadInFlightToTeam() const { + int64_t inFlight = 0; + for (auto const& server : servers) { + inFlight += server->getReadInFlightToServer(); + } + return inFlight; +} + int64_t TCTeamInfo::getLoadBytes(bool includeInFlight, double inflightPenalty) const { int64_t physicalBytes = getLoadAverage(); double minAvailableSpaceRatio = getMinAvailableSpaceRatio(includeInFlight); @@ -374,7 +382,7 @@ int64_t TCTeamInfo::getLoadBytes(bool includeInFlight, double inflightPenalty) c return (physicalBytes + (inflightPenalty * inFlightBytes)) * availableSpaceMultiplier; } -double TCTeamInfo::getLoadReadBandwidth() const { +double TCTeamInfo::getLoadReadBandwidth(bool includeInFlight) const { double sum = 0; int size = 0; for (const auto& server : servers) { @@ -385,7 +393,8 @@ double TCTeamInfo::getLoadReadBandwidth() const { size += 1; } } - return size == 0 ? 0 : sum / size; + return (size == 0 ? 0 : sum / size) + + (includeInFlight && !servers.empty() ? getReadInFlightToTeam() / servers.size() : 0); } int64_t TCTeamInfo::getMinAvailableSpace(bool includeInFlight) const { diff --git a/fdbserver/TCInfo.h b/fdbserver/TCInfo.h index d8728ad192..643e1a7960 100644 --- a/fdbserver/TCInfo.h +++ b/fdbserver/TCInfo.h @@ -44,7 +44,7 @@ class TCServerInfo : public ReferenceCounted<TCServerInfo> { // To change storeType for an ip:port, we destroy the old one and create a new one. KeyValueStoreType storeType; // Storage engine type - int64_t dataInFlightToServer; + int64_t dataInFlightToServer, readInFlightToServer = 0; std::vector<Reference<TCTeamInfo>> teams; ErrorOr<GetStorageMetricsReply> metrics; @@ -84,7 +84,12 @@ public: Future<Void> updateStoreType(); KeyValueStoreType getStoreType() const { return storeType; } int64_t getDataInFlightToServer() const { return dataInFlightToServer; } - void incrementDataInFlightToServer(int64_t bytes) { dataInFlightToServer += bytes; } + // expected read traffic to the server after data movement + int64_t getReadInFlightToServer() const { return readInFlightToServer; } + void incrementDataInFlightToServer(int64_t bytes, int64_t readBytes = 0) { + dataInFlightToServer += bytes; + readInFlightToServer += readBytes; + } void cancel(); std::vector<Reference<TCTeamInfo>> const& getTeams() const { return teams; } void addTeam(Reference<TCTeamInfo> team) { teams.push_back(team); } @@ -189,13 +194,15 @@ public: std::string getServerIDsStr() const; - void addDataInFlightToTeam(int64_t delta) override; + void addDataInFlightToTeam(int64_t delta, int64_t readDelta = 0) override; int64_t getDataInFlightToTeam() const override; int64_t getLoadBytes(bool includeInFlight = true, double inflightPenalty = 1.0) const override; - double getLoadReadBandwidth() const override; + double getLoadReadBandwidth(bool includeInFlight = true) const override; + + int64_t getReadInFlightToTeam() const override; int64_t getMinAvailableSpace(bool includeInFlight = true) const override; From 2829415f1f51b9d475a1e360f114ab8d5a83122c Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 28 Mar 2022 23:52:40 -0700 Subject: [PATCH 039/299] update unit test --- fdbserver/DDTeamCollection.actor.cpp | 33 ++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 0830a697d9..8d06c5e2b4 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -5700,16 +5700,35 @@ public: return r1 == r2 
? 0 : (r1 > r2 ? -1 : 1); }; - wait(collection->getTeam(req)); - std::pair<Optional<Reference<IDataDistributionTeam>>, bool> resTeam = req.reply.getFuture().get(); - std::set<UID> expectedServers{ UID(4, 0) }; + state GetTeamRequest reqHigh(wantsNewServers, wantsTrueBest, false, teamMustHaveShards); + reqHigh.teamSorter = [](Reference<IDataDistributionTeam> a, Reference<IDataDistributionTeam> b) { + auto r1 = a->getLoadReadBandwidth(), r2 = b->getLoadReadBandwidth(); + return r1 == r2 ? 0 : (r1 < r2 ? -1 : 1); + }; - ASSERT(resTeam.first.present()); - auto servers = resTeam.first.get()->getServerIDs(); - const std::set<UID> selectedServers(servers.begin(), servers.end()); + wait(collection->getTeam(req) && collection->getTeam(reqHigh)); + std::pair<Optional<Reference<IDataDistributionTeam>>, bool> resTeam = req.reply.getFuture().get(), + resTeamHigh = reqHigh.reply.getFuture().get(); + + std::set<UID> expectedServers{ UID(4, 0) }; + std::set<UID> expectedServersHigh{ UID(5, 0) }; + + ASSERT(resTeam.first.present() && resTeamHigh.first.present()); + auto servers = resTeam.first.get()->getServerIDs(), serversHigh = resTeamHigh.first.get()->getServerIDs(); + const std::set<UID> selectedServers(servers.begin(), servers.end()), + selectedServersHigh(serversHigh.begin(), serversHigh.end()); // for (auto id : selectedServers) // std::cout << id.toString() << std::endl; - ASSERT(expectedServers == selectedServers); + ASSERT(expectedServers == selectedServers && expectedServersHigh == selectedServersHigh); + + resTeam.first.get()->addDataInFlightToTeam(50, 50); + req.reply.reset(); + wait(collection->getTeam(req)); + std::pair<Optional<Reference<IDataDistributionTeam>>, bool> resTeam1 = req.reply.getFuture().get(); + std::set<UID> expectedServers1{ UID(2, 0) }; + auto servers1 = resTeam1.first.get()->getServerIDs(); + const std::set<UID> selectedServers1(servers1.begin(), servers1.end()); + ASSERT(expectedServers1 == selectedServers1); return Void(); } From 20a0b8d761ce7c7e76c4ba2134243656ef1683d6 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 31 Mar 2022 09:57:00 -0700 Subject: [PATCH 040/299] enable sim skip; add readInFlight methods and in-flight penalty; add delayed in-flight subtraction --- fdbserver/DDTeamCollection.actor.cpp | 2 +- fdbserver/DataDistribution.actor.h | 5 ++- fdbserver/DataDistributionQueue.actor.cpp | 55 ++++++++++++++------- fdbserver/TCInfo.actor.cpp | 14 ++++-- fdbserver/TCInfo.h | 12 ++--- 5 files changed, 58 insertions(+), 30 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 8d06c5e2b4..ba4db0005f 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -5721,7 +5721,7 @@ public: // std::cout << id.toString() << std::endl; ASSERT(expectedServers == selectedServers && expectedServersHigh == selectedServersHigh); - resTeam.first.get()->addDataInFlightToTeam(50, 50); + resTeam.first.get()->addReadInFlightToTeam(50); req.reply.reset(); wait(collection->getTeam(req)); std::pair<Optional<Reference<IDataDistributionTeam>>, bool> resTeam1 = req.reply.getFuture().get(); diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index 826acc812c..42f64c5227 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -47,11 +47,12 @@ struct IDataDistributionTeam { virtual std::vector<StorageServerInterface> getLastKnownServerInterfaces() const = 0; virtual int size() const = 0; virtual std::vector<UID> const& getServerIDs() const = 0; - virtual void addDataInFlightToTeam(int64_t delta, int64_t readDelta = 0) = 0; + virtual void addDataInFlightToTeam(int64_t delta) = 0; + virtual void addReadInFlightToTeam(int64_t delta) = 0; virtual int64_t getDataInFlightToTeam() const = 0; virtual int64_t 
getLoadBytes(bool includeInFlight = true, double inflightPenalty = 1.0) const = 0; virtual int64_t getReadInFlightToTeam() const = 0; - virtual double getLoadReadBandwidth(bool includeInFlight = true) const = 0; + virtual double getLoadReadBandwidth(bool includeInFlight = true, double inflightPenalty = 1.0) const = 0; virtual int64_t getMinAvailableSpace(bool includeInFlight = true) const = 0; virtual double getMinAvailableSpaceRatio(bool includeInFlight = true) const = 0; virtual bool hasHealthyAvailableSpace(double minRatio) const = 0; diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index fb03872052..78474db1b0 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -130,6 +130,7 @@ class ParallelTCInfo final : public ReferenceCounted, public IDa public: ParallelTCInfo() = default; + explicit ParallelTCInfo(ParallelTCInfo const& info) : teams(info.teams), tempServerIDs(info.tempServerIDs){}; void addTeam(Reference team) { teams.push_back(team); } @@ -162,9 +163,15 @@ public: return tempServerIDs; } - void addDataInFlightToTeam(int64_t delta, int64_t readDelta = 0) override { + void addDataInFlightToTeam(int64_t delta) override { for (auto& team : teams) { - team->addDataInFlightToTeam(delta, readDelta); + team->addDataInFlightToTeam(delta); + } + } + + void addReadInFlightToTeam(int64_t delta) override { + for (auto& team : teams) { + team->addReadInFlightToTeam(delta); } } @@ -182,9 +189,9 @@ public: return sum([](IDataDistributionTeam const& team) { return team.getReadInFlightToTeam(); }); } - double getLoadReadBandwidth(bool includeInFlight = true) const override { - return sum([includeInFlight](IDataDistributionTeam const& team) { - return team.getLoadReadBandwidth(includeInFlight); + double getLoadReadBandwidth(bool includeInFlight = true, double inflightPenalty = 1.0) const override { + return sum([includeInFlight, inflightPenalty](IDataDistributionTeam const& team) { + return team.getLoadReadBandwidth(includeInFlight, inflightPenalty); }); } @@ -450,6 +457,7 @@ struct DDQueueData { PromiseStream dataTransferComplete; PromiseStream relocationComplete; PromiseStream fetchSourceServersComplete; // find source SSs for a relocate range + ActorCollectionNoErrors noErrorActors; PromiseStream output; FutureStream input; @@ -1043,7 +1051,7 @@ struct DDQueueData { // return -1 if a.readload > b.readload int greaterReadLoad(Reference a, Reference b) { - auto r1 = a->getLoadReadBandwidth(), r2 = b->getLoadReadBandwidth(); + auto r1 = a->getLoadReadBandwidth(true, 2), r2 = b->getLoadReadBandwidth(true, 2); return r1 == r2 ? 0 : (r1 > r2 ? -1 : 1); } @@ -1268,7 +1276,8 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, self->shardsAffectedByTeamFailure->moveShard(rd.keys, destinationTeams); // FIXME: do not add data in flight to servers that were already in the src. 
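// [Editorial note] The relocator hunks below charge a destination team with the
// shard's expected read bandwidth when a move launches, and credit it back when the
// move finishes. A minimal, self-contained sketch of that accounting pattern
// (TeamLoad, chargeMove and completeMove are illustrative names, not FDB API):
#include <cstdint>
struct TeamLoad {
    int64_t bytesInFlight = 0; // data still being moved to this team
    int64_t readInFlight = 0; // read bandwidth expected to follow the moved data
    void chargeMove(int64_t bytes, int64_t readBandwidth) {
        bytesInFlight += bytes;
        readInFlight += readBandwidth;
    }
    void completeMove(int64_t bytes, int64_t readBandwidth) {
        bytesInFlight -= bytes;
        // The patch defers this credit by STORAGE_METRICS_AVERAGE_INTERVAL so the
        // sampled read metrics can catch up with the moved shard first.
        readInFlight -= readBandwidth;
    }
};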
- healthyDestinations.addDataInFlightToTeam(+metrics.bytes, +metrics.bytesReadPerKSecond); + healthyDestinations.addDataInFlightToTeam(+metrics.bytes); + healthyDestinations.addReadInFlightToTeam(+metrics.bytesReadPerKSecond); launchDest(rd, bestTeams, self->destBusymap); @@ -1373,7 +1382,12 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, } } - healthyDestinations.addDataInFlightToTeam(-metrics.bytes, -metrics.bytesReadPerKSecond); + healthyDestinations.addDataInFlightToTeam(-metrics.bytes); + auto readLoad = metrics.bytesReadPerKSecond; + auto& destinationRef = healthyDestinations; + self->noErrorActors.add( + trigger([destinationRef, readLoad]() mutable { destinationRef.addDataInFlightToTeam(-readLoad); }, + delay(SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL))); // onFinished.send( rs ); if (!error.code()) { @@ -1406,7 +1420,12 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, } } else { TEST(true); // move to removed server - healthyDestinations.addDataInFlightToTeam(-metrics.bytes, -metrics.bytesReadPerKSecond); + healthyDestinations.addDataInFlightToTeam(-metrics.bytes); + auto readLoad = metrics.bytesReadPerKSecond; + auto& destinationRef = healthyDestinations; + self->noErrorActors.add( + trigger([destinationRef, readLoad]() mutable { destinationRef.addDataInFlightToTeam(-readLoad); }, + delay(SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL))); wait(delay(SERVER_KNOBS->RETRY_RELOCATESHARD_DELAY, TaskPriority::DataDistributionLaunch)); } } @@ -1474,12 +1493,14 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, return false; } if (metrics.keys.present() && metrics.bytes > 0) { - // auto srcLoad = sourceTeam->getLoadReadBandwidth(), destLoad = destTeam->getLoadReadBandwidth(); - // if (abs(srcLoad - destLoad) <= - // 3 * std::max(metrics.bytesReadPerKSecond, SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS)) { - // traceEvent->detail("SkipReason", "TeamTooSimilar"); - // return false; - // } + auto srcLoad = sourceTeam->getLoadReadBandwidth(), destLoad = destTeam->getLoadReadBandwidth(); + if (abs(srcLoad - destLoad) <= + 3 * std::max(metrics.bytesReadPerKSecond, SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS)) { + traceEvent->detail("SkipReason", "TeamTooSimilar") + .detail("ShardReadBandwidth", metrics.bytesReadPerKSecond) + .detail("SrcReadBandwidth", srcLoad); + return false; + } // Verify the shard is still in ShardsAffectedByTeamFailure shards = self->shardsAffectedByTeamFailure->getShardsFor( ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary)); @@ -1865,7 +1886,6 @@ ACTOR Future dataDistributionQueue(Database cx, state std::vector> balancingFutures; - state ActorCollectionNoErrors actors; state PromiseStream rangesComplete; state Future launchQueuedWorkTimeout = Never(); @@ -1921,7 +1941,8 @@ ACTOR Future dataDistributionQueue(Database cx, self.finishRelocation(done.priority, done.healthPriority); self.fetchKeysComplete.erase(done); // self.logRelocation( done, "ShardRelocatorDone" ); - actors.add(tag(delay(0, TaskPriority::DataDistributionLaunch), done.keys, rangesComplete)); + self.noErrorActors.add( + tag(delay(0, TaskPriority::DataDistributionLaunch), done.keys, rangesComplete)); if (g_network->isSimulated() && debug_isCheckRelocationDuration() && now() - done.startTime > 60) { TraceEvent(SevWarnAlways, "RelocationDurationTooLong") .detail("Duration", now() - done.startTime); diff --git a/fdbserver/TCInfo.actor.cpp b/fdbserver/TCInfo.actor.cpp index e124acf4d2..10b3522540 100644 
--- a/fdbserver/TCInfo.actor.cpp +++ b/fdbserver/TCInfo.actor.cpp @@ -341,9 +341,14 @@ std::string TCTeamInfo::getServerIDsStr() const { return std::move(ss).str(); } -void TCTeamInfo::addDataInFlightToTeam(int64_t delta, int64_t readDelta) { +void TCTeamInfo::addDataInFlightToTeam(int64_t delta) { for (int i = 0; i < servers.size(); i++) - servers[i]->incrementDataInFlightToServer(delta, readDelta); + servers[i]->incrementDataInFlightToServer(delta); +} + +void TCTeamInfo::addReadInFlightToTeam(int64_t delta) { + for (int i = 0; i < servers.size(); i++) + servers[i]->incrementReadInFlightToServer(delta); } int64_t TCTeamInfo::getDataInFlightToTeam() const { @@ -382,7 +387,8 @@ int64_t TCTeamInfo::getLoadBytes(bool includeInFlight, double inflightPenalty) c return (physicalBytes + (inflightPenalty * inFlightBytes)) * availableSpaceMultiplier; } -double TCTeamInfo::getLoadReadBandwidth(bool includeInFlight) const { +double TCTeamInfo::getLoadReadBandwidth(bool includeInFlight, double inflightPenalty) const { + // FIXME: consider team load variance double sum = 0; int size = 0; for (const auto& server : servers) { @@ -394,7 +400,7 @@ double TCTeamInfo::getLoadReadBandwidth(bool includeInFlight) const { } } return (size == 0 ? 0 : sum / size) + - (includeInFlight && !servers.empty() ? getReadInFlightToTeam() / servers.size() : 0); + (includeInFlight && !servers.empty() ? inflightPenalty * getReadInFlightToTeam() / servers.size() : 0); } int64_t TCTeamInfo::getMinAvailableSpace(bool includeInFlight) const { diff --git a/fdbserver/TCInfo.h b/fdbserver/TCInfo.h index 643e1a7960..2eb4b46492 100644 --- a/fdbserver/TCInfo.h +++ b/fdbserver/TCInfo.h @@ -86,10 +86,8 @@ public: int64_t getDataInFlightToServer() const { return dataInFlightToServer; } // expected read traffic to the server after data movement int64_t getReadInFlightToServer() const { return readInFlightToServer; } - void incrementDataInFlightToServer(int64_t bytes, int64_t readBytes = 0) { - dataInFlightToServer += bytes; - readInFlightToServer += readBytes; - } + void incrementDataInFlightToServer(int64_t bytes) { dataInFlightToServer += bytes; } + void incrementReadInFlightToServer(int64_t readBytes) { readInFlightToServer += readBytes; } void cancel(); std::vector<Reference<TCTeamInfo>> const& getTeams() const { return teams; } void addTeam(Reference<TCTeamInfo> team) { teams.push_back(team); } @@ -194,13 +192,15 @@ public: std::string getServerIDsStr() const; - void addDataInFlightToTeam(int64_t delta, int64_t readDelta = 0) override; + void addDataInFlightToTeam(int64_t delta) override; + + void addReadInFlightToTeam(int64_t delta) override; int64_t getDataInFlightToTeam() const override; int64_t getLoadBytes(bool includeInFlight = true, double inflightPenalty = 1.0) const override; - double getLoadReadBandwidth(bool includeInFlight = true) const override; + double getLoadReadBandwidth(bool includeInFlight = true, double inflightPenalty = 1.0) const override; int64_t getReadInFlightToTeam() const override; From b6594aebbf408f52af1446884b5e856a0f97f3c8 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 4 Apr 2022 11:34:41 -0700 Subject: [PATCH 041/299] change in-flight subtraction priority (non-negligible performance influence) --- fdbserver/DataDistributionQueue.actor.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 78474db1b0..4a2ceab228 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ 
-1051,7 +1051,7 @@ struct DDQueueData { // return -1 if a.readload > b.readload int greaterReadLoad(Reference a, Reference b) { - auto r1 = a->getLoadReadBandwidth(true, 2), r2 = b->getLoadReadBandwidth(true, 2); + auto r1 = a->getLoadReadBandwidth(true, 10), r2 = b->getLoadReadBandwidth(true, 10); return r1 == r2 ? 0 : (r1 > r2 ? -1 : 1); } @@ -1387,7 +1387,7 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, auto& destinationRef = healthyDestinations; self->noErrorActors.add( trigger([destinationRef, readLoad]() mutable { destinationRef.addDataInFlightToTeam(-readLoad); }, - delay(SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL))); + delay(SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL, TaskPriority::DataDistributionLow))); // onFinished.send( rs ); if (!error.code()) { @@ -1425,7 +1425,7 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, auto& destinationRef = healthyDestinations; self->noErrorActors.add( trigger([destinationRef, readLoad]() mutable { destinationRef.addDataInFlightToTeam(-readLoad); }, - delay(SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL))); + delay(SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL, TaskPriority::DataDistributionLow))); wait(delay(SERVER_KNOBS->RETRY_RELOCATESHARD_DELAY, TaskPriority::DataDistributionLaunch)); } } From 150b4318aafc10adaf329a11133fad014ff4fd96 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 6 Apr 2022 22:10:23 -0700 Subject: [PATCH 042/299] refactor datadistribution command; try dual-mode code --- fdbcli/DataDistributionCommand.actor.cpp | 43 ++++++++----- fdbclient/ServerKnobs.cpp | 2 + fdbclient/ServerKnobs.h | 2 + fdbclient/SystemData.h | 2 +- fdbserver/DataDistributionQueue.actor.cpp | 76 ++++++++++++++--------- 5 files changed, 79 insertions(+), 46 deletions(-) diff --git a/fdbcli/DataDistributionCommand.actor.cpp b/fdbcli/DataDistributionCommand.actor.cpp index d75ad693da..372adeca6e 100644 --- a/fdbcli/DataDistributionCommand.actor.cpp +++ b/fdbcli/DataDistributionCommand.actor.cpp @@ -61,20 +61,25 @@ ACTOR Future setDDMode(Reference db, int mode) { } } -ACTOR Future setDDIgnoreRebalanceSwitch(Reference db, int DDIgnoreOption) { +ACTOR Future setDDIgnoreRebalanceSwitch(Reference db, uint8_t DDIgnoreOptionMask, bool setMaskedBit) { state Reference tr = db->createTransaction(); loop { tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); try { - if (DDIgnoreOption > 0) { - Optional v = wait(safeThreadFutureToFuture(tr->get(fdb_cli::ddIgnoreRebalanceSpecialKey))); - if (v.present() && v.get().size() > 0) { - int oldValue = BinaryReader::fromStringRef(v.get(), Unversioned()); - tr->set(fdb_cli::ddIgnoreRebalanceSpecialKey, - BinaryWriter::toValue(DDIgnoreOption & oldValue, Unversioned())); + Optional v = wait(safeThreadFutureToFuture(tr->get(fdb_cli::ddIgnoreRebalanceSpecialKey))); + uint8_t oldValue = 0; // nothing is disabled + if (v.present()) { + if (v.get().size() > 0) { + oldValue = BinaryReader::fromStringRef(v.get(), Unversioned()); } else { - tr->set(fdb_cli::ddIgnoreRebalanceSpecialKey, BinaryWriter::toValue(DDIgnoreOption, Unversioned())); + // In old version (<= 7.1), the value is an empty string, which means all DD rebalance functions are + // disabled + oldValue = DDIgnore::ALL; } + } + uint8_t newValue = setMaskedBit ? 
(oldValue | DDIgnoreOptionMask) : (oldValue & (~DDIgnoreOptionMask)); + if (newValue > 0) { + tr->set(fdb_cli::ddIgnoreRebalanceSpecialKey, BinaryWriter::toValue(newValue, Unversioned())); } else { tr->clear(fdb_cli::ddIgnoreRebalanceSpecialKey); } @@ -86,6 +91,16 @@ ACTOR Future setDDIgnoreRebalanceSwitch(Reference db, int DDIgn } } +// set masked bit +Future setDDIgnoreRebalanceOn(Reference db, uint8_t DDIgnoreOptionMask) { + return setDDIgnoreRebalanceSwitch(db, DDIgnoreOptionMask, true); +} + +// reset masked bit +Future setDDIgnoreRebalanceOff(Reference db, uint8_t DDIgnoreOptionMask) { + return setDDIgnoreRebalanceSwitch(db, DDIgnoreOptionMask, false); +} + } // namespace namespace fdb_cli { @@ -112,13 +127,13 @@ ACTOR Future dataDistributionCommandActor(Reference db, std::ve wait(success((setHealthyZone(db, LiteralStringRef("IgnoreSSFailures"), 0)))); printf("Data distribution is disabled for storage server failures.\n"); } else if (tokencmp(tokens[2], "rebalance")) { - wait(setDDIgnoreRebalanceSwitch(db, DDIgnore::REBALANCE_DISK | DDIgnore::REBALANCE_READ)); + wait(setDDIgnoreRebalanceOn(db, DDIgnore::REBALANCE_DISK | DDIgnore::REBALANCE_READ)); printf("Data distribution is disabled for rebalance.\n"); } else if (tokencmp(tokens[2], "rebalance_disk")) { - wait(setDDIgnoreRebalanceSwitch(db, DDIgnore::REBALANCE_DISK)); + wait(setDDIgnoreRebalanceOn(db, DDIgnore::REBALANCE_DISK)); printf("Data distribution is disabled for rebalance_disk.\n"); } else if (tokencmp(tokens[2], "rebalance_read")) { - wait(setDDIgnoreRebalanceSwitch(db, DDIgnore::REBALANCE_READ)); + wait(setDDIgnoreRebalanceOn(db, DDIgnore::REBALANCE_READ)); printf("Data distribution is disabled for rebalance_read.\n"); } else { printf(usage); @@ -129,13 +144,13 @@ ACTOR Future dataDistributionCommandActor(Reference db, std::ve wait(success((clearHealthyZone(db, false, true)))); printf("Data distribution is enabled for storage server failures.\n"); } else if (tokencmp(tokens[2], "rebalance")) { - wait(setDDIgnoreRebalanceSwitch(db, 0)); + wait(setDDIgnoreRebalanceOff(db, DDIgnore::REBALANCE_DISK | DDIgnore::REBALANCE_READ)); printf("Data distribution is enabled for rebalance.\n"); } else if (tokencmp(tokens[2], "rebalance_disk")) { - wait(setDDIgnoreRebalanceSwitch(db, ~DDIgnore::REBALANCE_DISK)); + wait(setDDIgnoreRebalanceOff(db, DDIgnore::REBALANCE_DISK)); printf("Data distribution is enabled for rebalance_disk.\n"); } else if (tokencmp(tokens[2], "rebalance_read")) { - wait(setDDIgnoreRebalanceSwitch(db, ~DDIgnore::REBALANCE_READ)); + wait(setDDIgnoreRebalanceOff(db, DDIgnore::REBALANCE_READ)); printf("Data distribution is enabled for rebalance_read.\n"); } else { printf(usage); diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index b1180f5162..ceba68835a 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -135,6 +135,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( PRIORITY_RECOVER_MOVE, 110 ); init( PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, 120 ); init( PRIORITY_REBALANCE_OVERUTILIZED_TEAM, 121 ); + init( PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM, 122 ); + init( PRIORITY_REBALANCE_READ_OVERUTIL_TEAM, 123 ); init( PRIORITY_PERPETUAL_STORAGE_WIGGLE, 139 ); init( PRIORITY_TEAM_HEALTHY, 140 ); init( PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER, 150 ); diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index e15a3100b5..c64fb5f0ef 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -140,6 +140,8 @@ public: int 
PRIORITY_RECOVER_MOVE; int PRIORITY_REBALANCE_UNDERUTILIZED_TEAM; int PRIORITY_REBALANCE_OVERUTILIZED_TEAM; + int PRIORITY_REBALANCE_READ_OVERUTIL_TEAM; + int PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM; int PRIORITY_PERPETUAL_STORAGE_WIGGLE; int PRIORITY_TEAM_HEALTHY; int PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER; diff --git a/fdbclient/SystemData.h b/fdbclient/SystemData.h index 5129d2c111..58973ade52 100644 --- a/fdbclient/SystemData.h +++ b/fdbclient/SystemData.h @@ -494,7 +494,7 @@ extern const KeyRangeRef monitorConfKeys; extern const KeyRef healthyZoneKey; extern const StringRef ignoreSSFailuresZoneString; extern const KeyRef rebalanceDDIgnoreKey; -enum DDIgnore { REBALANCE_DISK = 1, REBALANCE_READ = 2 }; +enum DDIgnore : uint8_t { REBALANCE_DISK = 1, REBALANCE_READ = 2, ALL = 3 }; const Value healthyZoneValue(StringRef const& zoneId, Version version); std::pair decodeHealthyZoneValue(ValueRef const&); diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 5c07112741..816c1ec0c3 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1462,6 +1462,14 @@ inline double getWorstCpu(const HealthMetrics& metrics) { } return cpu; } +inline bool isDiskRebalancePriority(int priority) { + return priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM || + priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM; +} +inline bool isMountainChopperPriority(int priority) { + return priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM || + priority == SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM; +} // Move the shard with highest read density of sourceTeam's to destTeam if sourceTeam has much more read load than // destTeam ACTOR Future rebalanceReadLoad(DDQueueData* self, @@ -1496,7 +1504,7 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, if (metrics.keys.present() && metrics.bytes > 0) { auto srcLoad = sourceTeam->getLoadReadBandwidth(), destLoad = destTeam->getLoadReadBandwidth(); if (abs(srcLoad - destLoad) <= - 3 * std::max(metrics.bytesReadPerKSecond, SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS)) { + 10 * std::max(metrics.bytesReadPerKSecond, SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS)) { traceEvent->detail("SkipReason", "TeamTooSimilar") .detail("ShardReadBandwidth", metrics.bytesReadPerKSecond) .detail("SrcReadBandwidth", srcLoad); @@ -1619,14 +1627,13 @@ ACTOR Future getSrcDestTeams(DDQueueData* self, return Void(); } -ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionIndex) { +ACTOR Future BgDDLoadRebalancer(DDQueueData* self, int teamCollectionIndex, int ddPriority) { state double rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; state int resetCount = SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT; state Transaction tr(self->cx); state double lastRead = 0; state bool skipCurrentLoop = false; - state bool disableReadBalance = false; - state bool disableDiskBalance = false; + state const bool readRebalance = !isDiskRebalancePriority(ddPriority); loop { state bool moved = false; @@ -1634,7 +1641,8 @@ ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionInde state Reference destTeam; state GetTeamRequest srcReq; state GetTeamRequest destReq; - state TraceEvent traceEvent("BgDDMountainChopper", self->distributorId); + state TraceEvent traceEvent(isMountainChopperPriority(ddPriority) ? 
"BgDDMountainChopper" : "BgDDValleyFiller", + self->distributorId); traceEvent.suppressFor(5.0).detail("PollingInterval", rebalancePollingInterval); if (*self->lastLimited > 0) { @@ -1654,14 +1662,14 @@ ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionInde rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; } skipCurrentLoop = false; - disableReadBalance = false; - disableDiskBalance = false; } else { if (val.get().size() > 0) { - int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); - disableDiskBalance = (ddIgnore & DDIgnore::REBALANCE_DISK) > 0; - disableReadBalance = (ddIgnore & DDIgnore::REBALANCE_READ) > 0; - skipCurrentLoop = disableReadBalance && disableDiskBalance; + int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); + if (readRebalance) { + skipCurrentLoop = (ddIgnore & DDIgnore::REBALANCE_READ) > 0; + } else { + skipCurrentLoop = (ddIgnore & DDIgnore::REBALANCE_DISK) > 0; + } } else { skipCurrentLoop = true; } @@ -1669,7 +1677,8 @@ ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionInde } traceEvent.detail("Enabled", - skipCurrentLoop ? "None" : (disableReadBalance ? "NoReadBalance" : "NoDiskBalance")); + readRebalance ? (skipCurrentLoop ? "NoReadRebalance" : "ReadRebalance") + : (skipCurrentLoop ? "NoDiskRebalance" : "DiskRebalance")); wait(delayF); if (skipCurrentLoop) { @@ -1679,29 +1688,27 @@ ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionInde continue; } - traceEvent.detail("QueuedRelocations", - self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM]); - if (self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM] < - SERVER_KNOBS->DD_REBALANCE_PARALLELISM) { - // FIXME: read balance and disk balance shouldn't be mutual exclusive in the future - srcReq = GetTeamRequest(true, true, false, true); - destReq = GetTeamRequest(true, false, true, false); - if (!disableReadBalance) { + traceEvent.detail("QueuedRelocations", self->priority_relocations[ddPriority]); + // FIXME: find a proper number for SERVER_KNOBS->DD_REBALANCE_PARALLELISM + if (self->priority_relocations[ddPriority] < 25) { + if (isMountainChopperPriority(ddPriority)) { + srcReq = GetTeamRequest(true, true, false, true); + destReq = GetTeamRequest(true, false, true, false); + } else { + srcReq = GetTeamRequest(true, false, false, true); + destReq = GetTeamRequest(true, true, true, false); + } + if (readRebalance) { srcReq.teamSorter = lessReadLoad; destReq.teamSorter = greaterReadLoad; } // clang-format off - wait(getSrcDestTeams(self, teamCollectionIndex, srcReq, destReq, &sourceTeam, &destTeam, - SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM,&traceEvent)); + wait(getSrcDestTeams(self, teamCollectionIndex, srcReq, destReq, &sourceTeam, &destTeam,ddPriority,&traceEvent)); if (sourceTeam.isValid() && destTeam.isValid()) { - if (!disableReadBalance) { - wait(store(moved,rebalanceReadLoad(self,SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM, - sourceTeam, destTeam,teamCollectionIndex == 0, - &traceEvent))); + if (readRebalance) { + wait(store(moved,rebalanceReadLoad(self,ddPriority, sourceTeam, destTeam,teamCollectionIndex == 0,&traceEvent))); } else { - wait(store(moved,rebalanceTeams(self,SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM, - sourceTeam, destTeam,teamCollectionIndex == 0, - &traceEvent))); + wait(store(moved,rebalanceTeams(self,ddPriority, sourceTeam, destTeam,teamCollectionIndex == 0,&traceEvent))); } } // clang-format on @@ 
-1891,8 +1898,11 @@ ACTOR Future<Void> dataDistributionQueue(Database cx, state Future<Void> launchQueuedWorkTimeout = Never(); for (int i = 0; i < teamCollections.size(); i++) { - balancingFutures.push_back(BgDDMountainChopper(&self, i)); - balancingFutures.push_back(BgDDValleyFiller(&self, i)); + balancingFutures.push_back(BgDDLoadRebalancer(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM)); + balancingFutures.push_back(BgDDLoadRebalancer(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM)); + balancingFutures.push_back(BgDDLoadRebalancer(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM)); + balancingFutures.push_back(BgDDLoadRebalancer(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM)); + // balancingFutures.push_back(BgDDValleyFiller(&self, i)); } balancingFutures.push_back(delayedAsyncVar(self.rawProcessingUnhealthy, processingUnhealthy, 0)); balancingFutures.push_back(delayedAsyncVar(self.rawProcessingWiggle, processingWiggle, 0)); @@ -1976,6 +1986,10 @@ ACTOR Future<Void> dataDistributionQueue(Database cx, self.priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM]) .detail("PriorityRebalanceOverutilizedTeam", self.priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM]) + .detail("PriorityRebalanceReadUnderutilTeam", + self.priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM]) + .detail("PriorityRebalanceReadOverutilTeam", + self.priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM]) .detail("PriorityStorageWiggle", self.priority_relocations[SERVER_KNOBS->PRIORITY_PERPETUAL_STORAGE_WIGGLE]) .detail("PriorityTeamHealthy", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_HEALTHY]) From 6fa60a2cf9231583498d4b09bf1e680baedea5d6 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 6 Apr 2022 23:03:25 -0700 Subject: [PATCH 043/299] fix subtraction typo --- fdbserver/DataDistributionQueue.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 816c1ec0c3..def7273dd0 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1386,7 +1386,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd, auto readLoad = metrics.bytesReadPerKSecond; auto& destinationRef = healthyDestinations; self->noErrorActors.add( - trigger([destinationRef, readLoad]() mutable { destinationRef.addDataInFlightToTeam(-readLoad); }, + trigger([destinationRef, readLoad]() mutable { destinationRef.addReadInFlightToTeam(-readLoad); }, delay(SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL, TaskPriority::DataDistributionLow))); // onFinished.send( rs ); @@ -1424,7 +1424,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd, auto readLoad = metrics.bytesReadPerKSecond; auto& destinationRef = healthyDestinations; self->noErrorActors.add( - trigger([destinationRef, readLoad]() mutable { destinationRef.addDataInFlightToTeam(-readLoad); }, + trigger([destinationRef, readLoad]() mutable { destinationRef.addReadInFlightToTeam(-readLoad); }, delay(SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL, TaskPriority::DataDistributionLow))); rd.completeDests.clear(); wait(delay(SERVER_KNOBS->RETRY_RELOCATESHARD_DELAY, TaskPriority::DataDistributionLaunch)); } } From c858b0eaa90823d2ee9d32cbee6e9dad63b7ff36 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 7 Apr 2022 11:59:40 -0700 Subject: [PATCH 044/299] update 
penalty, reset sim skip to 3; add read_underutil to relocator priority --- fdbserver/DataDistributionQueue.actor.cpp | 32 +++++++++++++---------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index def7273dd0..efe5d59053 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1051,7 +1051,7 @@ struct DDQueueData { // return -1 if a.readload > b.readload int greaterReadLoad(Reference a, Reference b) { - auto r1 = a->getLoadReadBandwidth(true, 10), r2 = b->getLoadReadBandwidth(true, 10); + auto r1 = a->getLoadReadBandwidth(true, 2.0), r2 = b->getLoadReadBandwidth(true, 2.0); return r1 == r2 ? 0 : (r1 > r2 ? -1 : 1); } @@ -1073,6 +1073,18 @@ static std::string destServersString(std::vectorPRIORITY_REBALANCE_UNDERUTILIZED_TEAM || + priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM; +} +inline bool isMountainChopperPriority(int priority) { + return priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM || + priority == SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM; +} +inline bool isValleyFillerPriority(int priority) { + return priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM || + priority == SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM; +} // This actor relocates the specified keys to a good place. // The inFlightActor key range map stores the actor for each RelocateData ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, const DDEnabledState* ddEnabledState) { @@ -1140,7 +1152,7 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_ONE_LEFT; auto req = GetTeamRequest(rd.wantsNewServers, - rd.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, + isValleyFillerPriority(rd.priority), true, false, inflightPenalty); @@ -1387,7 +1399,7 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, auto& destinationRef = healthyDestinations; self->noErrorActors.add( trigger([destinationRef, readLoad]() mutable { destinationRef.addReadInFlightToTeam(-readLoad); }, - delay(SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL, TaskPriority::DataDistributionLow))); + delay(SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL, TaskPriority::DataDistribution))); // onFinished.send( rs ); if (!error.code()) { @@ -1425,7 +1437,7 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, auto& destinationRef = healthyDestinations; self->noErrorActors.add( trigger([destinationRef, readLoad]() mutable { destinationRef.addReadInFlightToTeam(-readLoad); }, - delay(SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL, TaskPriority::DataDistributionLow))); + delay(SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL, TaskPriority::DataDistribution))); rd.completeDests.clear(); wait(delay(SERVER_KNOBS->RETRY_RELOCATESHARD_DELAY, TaskPriority::DataDistributionLaunch)); } @@ -1462,14 +1474,6 @@ inline double getWorstCpu(const HealthMetrics& metrics) { } return cpu; } -inline bool isDiskRebalancePriority(int priority) { - return priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM || - priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM; -} -inline bool isMountainChopperPriority(int priority) { - return priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM || - priority == SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM; -} // Move the shard with highest read density 
of sourceTeam's to destTeam if sourceTeam has much more read load than // destTeam ACTOR Future<bool> rebalanceReadLoad(DDQueueData* self, @@ -1504,7 +1508,7 @@ ACTOR Future<bool> rebalanceReadLoad(DDQueueData* self, if (metrics.keys.present() && metrics.bytes > 0) { auto srcLoad = sourceTeam->getLoadReadBandwidth(), destLoad = destTeam->getLoadReadBandwidth(); if (abs(srcLoad - destLoad) <= - 10 * std::max(metrics.bytesReadPerKSecond, SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS)) { + 3 * std::max(metrics.bytesReadPerKSecond, SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS)) { traceEvent->detail("SkipReason", "TeamTooSimilar") .detail("ShardReadBandwidth", metrics.bytesReadPerKSecond) .detail("SrcReadBandwidth", srcLoad); @@ -1651,7 +1655,7 @@ ACTOR Future<Void> BgDDLoadRebalancer(DDQueueData* self, int teamCollectionIndex try { // FIXME: change back to BG_REBALANCE_SWITCH_CHECK_INTERVAL after test - state Future<Void> delayF = delay(0.1, TaskPriority::DataDistributionLaunch); + state Future<Void> delayF = delayJittered(0.1, TaskPriority::DataDistributionLaunch); if ((now() - lastRead) > SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL) { tr.setOption(FDBTransactionOptions::LOCK_AWARE); Optional<Value> val = wait(tr.get(rebalanceDDIgnoreKey)); From 7c83f63e46edc6e552e4fcbbf757c73bfc01a9b0 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 8 Apr 2022 10:44:56 -0700 Subject: [PATCH 045/299] fix subtraction bug; tests 1, 2, 3 generate good results --- fdbserver/DataDistributionQueue.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 78474db1b0..f388c5afef 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1386,7 +1386,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd, auto readLoad = metrics.bytesReadPerKSecond; auto& destinationRef = healthyDestinations; self->noErrorActors.add( - trigger([destinationRef, readLoad]() mutable { destinationRef.addDataInFlightToTeam(-readLoad); }, + trigger([destinationRef, readLoad]() mutable { destinationRef.addReadInFlightToTeam(-readLoad); }, delay(SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL))); // onFinished.send( rs ); @@ -1424,7 +1424,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd, auto readLoad = metrics.bytesReadPerKSecond; auto& destinationRef = healthyDestinations; self->noErrorActors.add( - trigger([destinationRef, readLoad]() mutable { destinationRef.addDataInFlightToTeam(-readLoad); }, + trigger([destinationRef, readLoad]() mutable { destinationRef.addReadInFlightToTeam(-readLoad); }, delay(SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL))); wait(delay(SERVER_KNOBS->RETRY_RELOCATESHARD_DELAY, TaskPriority::DataDistributionLaunch)); } From 82e5859e034bdca13b70ab68312a6b45a986211b Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 11 Apr 2022 14:35:12 -0700 Subject: [PATCH 046/299] allow safeThreadFutureToFuture --- fdbcli/DataDistributionCommand.actor.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdbcli/DataDistributionCommand.actor.cpp b/fdbcli/DataDistributionCommand.actor.cpp index d75ad693da..34dc85c6aa 100644 --- a/fdbcli/DataDistributionCommand.actor.cpp +++ b/fdbcli/DataDistributionCommand.actor.cpp @@ -67,7 +67,8 @@ ACTOR Future<Void> setDDIgnoreRebalanceSwitch(Reference<IDatabase> db, int DDIgn tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); try { if (DDIgnoreOption > 0) { - Optional<Value> v = 
wait(safeThreadFutureToFuture(tr->get(fdb_cli::ddIgnoreRebalanceSpecialKey))); + state ThreadFuture> resultFuture = tr->get(fdb_cli::ddIgnoreRebalanceSpecialKey); + Optional v = wait(safeThreadFutureToFuture(resultFuture)); if (v.present() && v.get().size() > 0) { int oldValue = BinaryReader::fromStringRef(v.get(), Unversioned()); tr->set(fdb_cli::ddIgnoreRebalanceSpecialKey, From 61a1f7683b804757126910edf13a885e4262fb4a Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 11 Apr 2022 22:49:21 -0700 Subject: [PATCH 047/299] fix dd command line read special key space error --- fdbcli/DataDistributionCommand.actor.cpp | 6 ++++-- fdbclient/ManagementAPI.actor.cpp | 19 ------------------- fdbclient/ManagementAPI.actor.h | 1 - fdbserver/DataDistributionQueue.actor.cpp | 10 +++++----- 4 files changed, 9 insertions(+), 27 deletions(-) diff --git a/fdbcli/DataDistributionCommand.actor.cpp b/fdbcli/DataDistributionCommand.actor.cpp index dade33e7a1..1cb667c812 100644 --- a/fdbcli/DataDistributionCommand.actor.cpp +++ b/fdbcli/DataDistributionCommand.actor.cpp @@ -65,8 +65,9 @@ ACTOR Future setDDIgnoreRebalanceSwitch(Reference db, uint8_t D state Reference tr = db->createTransaction(); loop { tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); try { - state ThreadFuture> resultFuture = tr->get(fdb_cli::ddIgnoreRebalanceSpecialKey); + state ThreadFuture> resultFuture = tr->get(rebalanceDDIgnoreKey); Optional v = wait(safeThreadFutureToFuture(resultFuture)); uint8_t oldValue = 0; // nothing is disabled if (v.present()) { @@ -77,8 +78,9 @@ ACTOR Future setDDIgnoreRebalanceSwitch(Reference db, uint8_t D // disabled oldValue = DDIgnore::ALL; } + // printf("oldValue: %d Mask: %d V:%d\n", oldValue, DDIgnoreOptionMask, v.get().size()); } - uint8_t newValue = setMaskedBit ? (oldValue | DDIgnoreOptionMask) : (oldValue & (~DDIgnoreOptionMask)); + uint8_t newValue = setMaskedBit ? 
(oldValue | DDIgnoreOptionMask) : (oldValue & ~DDIgnoreOptionMask); if (newValue > 0) { tr->set(fdb_cli::ddIgnoreRebalanceSpecialKey, BinaryWriter::toValue(newValue, Unversioned())); } else { diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index d4633807b8..fe5ffc7a24 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -1788,25 +1788,6 @@ ACTOR Future setHealthyZone(Database cx, StringRef zoneId, double seconds, } } -ACTOR Future setDDIgnoreRebalanceSwitch(Database cx, bool ignoreRebalance) { - state Transaction tr(cx); - loop { - try { - tr.setOption(FDBTransactionOptions::LOCK_AWARE); - tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - if (ignoreRebalance) { - tr.set(rebalanceDDIgnoreKey, LiteralStringRef("on")); - } else { - tr.clear(rebalanceDDIgnoreKey); - } - wait(tr.commit()); - return Void(); - } catch (Error& e) { - wait(tr.onError(e)); - } - } -} - ACTOR Future setDDMode(Database cx, int mode) { state Transaction tr(cx); state int oldMode = -1; diff --git a/fdbclient/ManagementAPI.actor.h b/fdbclient/ManagementAPI.actor.h index 64c54447a7..3fdc35ab2f 100644 --- a/fdbclient/ManagementAPI.actor.h +++ b/fdbclient/ManagementAPI.actor.h @@ -139,7 +139,6 @@ ACTOR Future setDDMode(Database cx, int mode); ACTOR Future forceRecovery(Reference clusterFile, Standalone dcId); ACTOR Future printHealthyZone(Database cx); -ACTOR Future setDDIgnoreRebalanceSwitch(Database cx, bool ignoreRebalance); ACTOR Future clearHealthyZone(Database cx, bool printWarning = false, bool clearSSFailureZoneString = false); ACTOR Future setHealthyZone(Database cx, StringRef zoneId, double seconds, bool printWarning = false); diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 4f3a6a0faf..b5876ded68 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1631,7 +1631,7 @@ ACTOR Future getSrcDestTeams(DDQueueData* self, return Void(); } -ACTOR Future BgDDLoadRebalancer(DDQueueData* self, int teamCollectionIndex, int ddPriority) { +ACTOR Future BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, int ddPriority) { state double rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; state int resetCount = SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT; state Transaction tr(self->cx); @@ -2018,10 +2018,10 @@ ACTOR Future dataDistributionQueue(Database cx, state Future launchQueuedWorkTimeout = Never(); for (int i = 0; i < teamCollections.size(); i++) { - balancingFutures.push_back(BgDDLoadRebalancer(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM)); - balancingFutures.push_back(BgDDLoadRebalancer(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM)); - balancingFutures.push_back(BgDDLoadRebalancer(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM)); - balancingFutures.push_back(BgDDLoadRebalancer(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM)); + balancingFutures.push_back(BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM)); + balancingFutures.push_back(BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM)); + balancingFutures.push_back(BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM)); + balancingFutures.push_back(BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM)); // balancingFutures.push_back(BgDDValleyFiller(&self, i)); } 
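// [Editorial note] setDDIgnoreRebalanceSwitch above now treats the value stored at
// rebalanceDDIgnoreKey as a bitmask rather than an on/off flag. A minimal sketch of
// the mask arithmetic (applyIgnoreMask is an illustrative helper, not FDB API):
#include <cstdint>
enum DDIgnoreBit : uint8_t { IGNORE_DISK = 1, IGNORE_READ = 2, IGNORE_ALL = 3 };
uint8_t applyIgnoreMask(uint8_t oldValue, uint8_t mask, bool disable) {
    // Setting a bit disables that rebalance type; clearing it re-enables it.
    return static_cast<uint8_t>(disable ? (oldValue | mask) : (oldValue & ~mask));
}
// Example: applyIgnoreMask(IGNORE_ALL, IGNORE_READ, false) == IGNORE_DISK, i.e.
// read rebalance is re-enabled while disk rebalance stays disabled.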
balancingFutures.push_back(delayedAsyncVar(self.rawProcessingUnhealthy, processingUnhealthy, 0)); From 5e96bacb5bdde5acaeb29523c980a3edcaa85a77 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 12 Apr 2022 16:22:17 -0700 Subject: [PATCH 048/299] add new priority in RelocateData --- fdbserver/DataDistributionQueue.actor.cpp | 65 ++++++++++++----------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index b5876ded68..d00140f855 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -37,6 +37,19 @@ #define WORK_FULL_UTILIZATION 10000 // This is not a knob; it is a fixed point scaling factor! +inline bool isDiskRebalancePriority(int priority) { + return priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM || + priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM; +} +inline bool isMountainChopperPriority(int priority) { + return priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM || + priority == SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM; +} +inline bool isValleyFillerPriority(int priority) { + return priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM || + priority == SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM; +} + struct RelocateData { KeyRange keys; int priority; @@ -61,8 +74,7 @@ struct RelocateData { : keys(rs.keys), priority(rs.priority), boundaryPriority(isBoundaryPriority(rs.priority) ? rs.priority : -1), healthPriority(isHealthPriority(rs.priority) ? rs.priority : -1), reason(rs.reason), startTime(now()), randomId(deterministicRandom()->randomUniqueID()), workFactor(0), - wantsNewServers(rs.priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM || - rs.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM || + wantsNewServers(isMountainChopperPriority(rs.priority) || isValleyFillerPriority(rs.priority) || rs.priority == SERVER_KNOBS->PRIORITY_SPLIT_SHARD || rs.priority == SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT), cancellable(true), interval("QueuedRelocation") {} @@ -1073,18 +1085,6 @@ static std::string destServersString(std::vectorPRIORITY_REBALANCE_UNDERUTILIZED_TEAM || - priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM; -} -inline bool isMountainChopperPriority(int priority) { - return priority == SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM || - priority == SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM; -} -inline bool isValleyFillerPriority(int priority) { - return priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM || - priority == SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM; -} // This actor relocates the specified keys to a good place. 
// The inFlightActor key range map stores the actor for each RelocateData ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, const DDEnabledState* ddEnabledState) { @@ -1151,11 +1151,8 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_ONE_LEFT; - auto req = GetTeamRequest(rd.wantsNewServers, - isValleyFillerPriority(rd.priority), - true, - false, - inflightPenalty); + auto req = GetTeamRequest( + rd.wantsNewServers, isValleyFillerPriority(rd.priority), true, false, inflightPenalty); req.src = rd.src; req.completeSources = rd.completeSources; @@ -1638,6 +1635,7 @@ ACTOR Future BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, state double lastRead = 0; state bool skipCurrentLoop = false; state const bool readRebalance = !isDiskRebalancePriority(ddPriority); + state const char* eventName = isMountainChopperPriority(ddPriority) ? "BgDDMountainChopper" : "BgDDValleyFiller"; loop { state bool moved = false; @@ -1645,9 +1643,10 @@ ACTOR Future BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, state Reference destTeam; state GetTeamRequest srcReq; state GetTeamRequest destReq; - state TraceEvent traceEvent(isMountainChopperPriority(ddPriority) ? "BgDDMountainChopper" : "BgDDValleyFiller", - self->distributorId); - traceEvent.suppressFor(5.0).detail("PollingInterval", rebalancePollingInterval); + state TraceEvent traceEvent(eventName, self->distributorId); + traceEvent.suppressFor(5.0) + .detail("PollingInterval", rebalancePollingInterval) + .detail("Rebalance", readRebalance ? "Read" : "Disk"); if (*self->lastLimited > 0) { traceEvent.detail("SecondsSinceLastLimited", now() - *self->lastLimited); @@ -1658,6 +1657,7 @@ ACTOR Future BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, state Future delayF = delay(0.1, TaskPriority::DataDistributionLaunch); if ((now() - lastRead) > SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL) { tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); Optional val = wait(tr.get(rebalanceDDIgnoreKey)); lastRead = now(); if (!val.present()) { @@ -1680,9 +1680,7 @@ ACTOR Future BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, } } - traceEvent.detail("Enabled", - readRebalance ? (skipCurrentLoop ? "NoReadRebalance" : "ReadRebalance") - : (skipCurrentLoop ? 
"NoDiskRebalance" : "DiskRebalance")); + traceEvent.detail("Enabled", !skipCurrentLoop); wait(delayF); if (skipCurrentLoop) { @@ -1693,8 +1691,8 @@ ACTOR Future BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, } traceEvent.detail("QueuedRelocations", self->priority_relocations[ddPriority]); - // FIXME: find a proper number for SERVER_KNOBS->DD_REBALANCE_PARALLELISM - if (self->priority_relocations[ddPriority] < 25) { + + if (self->priority_relocations[ddPriority] < SERVER_KNOBS->DD_REBALANCE_PARALLELISM) { if (isMountainChopperPriority(ddPriority)) { srcReq = GetTeamRequest(true, true, false, true); destReq = GetTeamRequest(true, false, true, false); @@ -1785,7 +1783,7 @@ ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionInde disableDiskBalance = false; } else { if (val.get().size() > 0) { - int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); + int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); disableDiskBalance = (ddIgnore & DDIgnore::REBALANCE_DISK) > 0; disableReadBalance = (ddIgnore & DDIgnore::REBALANCE_READ) > 0; skipCurrentLoop = disableReadBalance && disableDiskBalance; @@ -1901,7 +1899,7 @@ ACTOR Future BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex) disableDiskBalance = false; } else if (val.present()) { if (val.get().size() > 0) { - int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); + int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); disableDiskBalance = (ddIgnore & DDIgnore::REBALANCE_DISK) > 0; disableReadBalance = (ddIgnore & DDIgnore::REBALANCE_READ) > 0; skipCurrentLoop = disableReadBalance && disableDiskBalance; @@ -2020,8 +2018,13 @@ ACTOR Future dataDistributionQueue(Database cx, for (int i = 0; i < teamCollections.size(); i++) { balancingFutures.push_back(BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM)); balancingFutures.push_back(BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM)); - balancingFutures.push_back(BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM)); - balancingFutures.push_back(BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM)); + if(SERVER_KNOBS->READ_SAMPLING_ENABLED == true) { + balancingFutures.push_back( + BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM)); + balancingFutures.push_back( + BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM)); + } + // balancingFutures.push_back(BgDDMountainChopper(&self, i)); // balancingFutures.push_back(BgDDValleyFiller(&self, i)); } balancingFutures.push_back(delayedAsyncVar(self.rawProcessingUnhealthy, processingUnhealthy, 0)); From 7b9432bddaeb4df693082446e851a1d5f7ca4432 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 12 Apr 2022 18:01:28 -0700 Subject: [PATCH 049/299] revert to original BgDDValleyFiller and BgDDMountainChopper for disk rebalance --- fdbserver/DataDistributionQueue.actor.cpp | 45 +++++++++++++++++------ 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index d5395a1433..bbf985ce23 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1780,7 +1780,7 @@ ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionInde state std::pair>, bool> randomTeam; state bool moved = false; state TraceEvent traceEvent("BgDDMountainChopper", 
self->distributorId); - traceEvent.suppressFor(5.0).detail("PollingInterval", rebalancePollingInterval); + traceEvent.suppressFor(5.0).detail("PollingInterval", rebalancePollingInterval).detail("Rebalance", "Disk"); if (*self->lastLimited > 0) { traceEvent.detail("SecondsSinceLastLimited", now() - *self->lastLimited); @@ -1790,13 +1790,23 @@ state Future delayF = delay(rebalancePollingInterval, TaskPriority::DataDistributionLaunch); if ((now() - lastRead) > SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL) { tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); Optional val = wait(tr.get(rebalanceDDIgnoreKey)); lastRead = now(); - if (skipCurrentLoop && !val.present()) { + if (!val.present()) { // reset loop interval - rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; + if (skipCurrentLoop) { + rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; + } + skipCurrentLoop = false; + } else { + if (val.get().size() > 0) { + int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); + skipCurrentLoop = (ddIgnore & DDIgnore::REBALANCE_DISK) > 0; + } else { + skipCurrentLoop = true; + } } - skipCurrentLoop = val.present(); } traceEvent.detail("Enabled", !skipCurrentLoop); @@ -1892,7 +1902,7 @@ ACTOR Future BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex) state std::pair>, bool> randomTeam; state bool moved = false; state TraceEvent traceEvent("BgDDValleyFiller", self->distributorId); - traceEvent.suppressFor(5.0).detail("PollingInterval", rebalancePollingInterval); + traceEvent.suppressFor(5.0).detail("PollingInterval", rebalancePollingInterval).detail("Rebalance", "Disk"); if (*self->lastLimited > 0) { traceEvent.detail("SecondsSinceLastLimited", now() - *self->lastLimited); @@ -1902,13 +1912,23 @@ ACTOR Future BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex) state Future delayF = delay(rebalancePollingInterval, TaskPriority::DataDistributionLaunch); if ((now() - lastRead) > SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL) { tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); Optional val = wait(tr.get(rebalanceDDIgnoreKey)); lastRead = now(); - if (skipCurrentLoop && !val.present()) { + if (!val.present()) { // reset loop interval - rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; + if (skipCurrentLoop) { + rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; + } + skipCurrentLoop = false; + } else { + if (val.get().size() > 0) { + int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); + skipCurrentLoop = (ddIgnore & DDIgnore::REBALANCE_DISK) > 0; + } else { + skipCurrentLoop = true; + } } - skipCurrentLoop = val.present(); } traceEvent.detail("Enabled", !skipCurrentLoop); @@ -2032,16 +2052,17 @@ ACTOR Future dataDistributionQueue(Database cx, state Future launchQueuedWorkTimeout = Never(); for (int i = 0; i < teamCollections.size(); i++) { - balancingFutures.push_back(BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM)); - balancingFutures.push_back(BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM)); + // FIXME: Use BgDDLoadRebalance for disk rebalance too once it has been validated in DD simulation tests. 
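+ // (Assumed context: in BgDDLoadRebalance the OVERUTILIZED priority plays the mountain-chopper role and the UNDERUTILIZED priority the valley-filler role, as selected by isMountainChopperPriority(), so the commented-out calls below are the intended replacements for the legacy BgDDMountainChopper/BgDDValleyFiller actors re-enabled further down.)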
+ // balancingFutures.push_back(BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM)); + // balancingFutures.push_back(BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM)); if (SERVER_KNOBS->READ_SAMPLING_ENABLED == true) { balancingFutures.push_back( BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM)); balancingFutures.push_back( BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM)); } - // balancingFutures.push_back(BgDDMountainChopper(&self, i)); - // balancingFutures.push_back(BgDDValleyFiller(&self, i)); + balancingFutures.push_back(BgDDMountainChopper(&self, i)); + balancingFutures.push_back(BgDDValleyFiller(&self, i)); } balancingFutures.push_back(delayedAsyncVar(self.rawProcessingUnhealthy, processingUnhealthy, 0)); balancingFutures.push_back(delayedAsyncVar(self.rawProcessingWiggle, processingWiggle, 0)); From 718119af8377bd0ced5f11be9c656ddd92c57b7b Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Tue, 29 Mar 2022 15:36:54 -0400 Subject: [PATCH 050/299] simplest initial implementation of Java listTenants --- .../apple/foundationdb/TenantManagement.java | 96 +++++++++++++++++++ fdbclient/MultiVersionTransaction.actor.cpp | 2 +- 2 files changed, 97 insertions(+), 1 deletion(-) diff --git a/bindings/java/src/main/com/apple/foundationdb/TenantManagement.java b/bindings/java/src/main/com/apple/foundationdb/TenantManagement.java index 857aeb7f1f..3ae786f464 100644 --- a/bindings/java/src/main/com/apple/foundationdb/TenantManagement.java +++ b/bindings/java/src/main/com/apple/foundationdb/TenantManagement.java @@ -22,6 +22,7 @@ package com.apple.foundationdb; import java.nio.charset.Charset; import java.util.Arrays; +import java.util.List; import java.util.concurrent.CompletableFuture; import java.util.concurrent.CompletionException; import java.util.concurrent.Executor; @@ -210,5 +211,100 @@ public class TenantManagement { return deleteTenant(db, tenantName.pack()); } + + /** + * Lists all tenants in between the range specified. The number of tenants listed can be restricted. + * + * @param db The database used to create a transaction for listing the tenants. + * @param begin The beginning of the range of tenants to list. + * @param end The end of the range of the tenants to list. + * @param limit The maximum number of tenants to return from this request. + * @return an iterator where each item is a byte array with the tenant name and value. 
+ */ + public static CloseableAsyncIterator listTenants(Database db, byte[] begin, byte[] end, int limit) { + return listTenants_internal(db.createTransaction(), begin, end, limit); + } + + public static CloseableAsyncIterator listTenants(Database db, Tuple begin, Tuple end, int limit) { + return listTenants_internal(db.createTransaction(), begin.pack(), end.pack(), limit); + } + + private static CloseableAsyncIterator listTenants_internal(Transaction tr, byte[] begin, byte[] end, + int limit) { + return new TenantAsyncIterator(tr, begin, end, limit); + } + + // Templates taken from BoundaryIterator LocalityUtil.java + static class TenantAsyncIterator implements CloseableAsyncIterator { + Transaction tr; + final byte[] begin; + final byte[] end; + + final AsyncIterable firstGet; + AsyncIterator iter; + private boolean closed; + + TenantAsyncIterator(Transaction tr, byte[] begin, byte[] end, int limit) { + this.tr = tr; + + this.begin = ByteArrayUtil.join(TENANT_MAP_PREFIX, begin); + this.end = ByteArrayUtil.join(TENANT_MAP_PREFIX, end); + + tr.options().setReadSystemKeys(); + tr.options().setLockAware(); + + firstGet = tr.getRange(this.begin, this.end, limit); + iter = firstGet.iterator(); + closed = false; + } + + @Override + public CompletableFuture onHasNext() { + return iter.onHasNext(); + } + + @Override + public boolean hasNext() { + return iter.hasNext(); + } + @Override + public byte[] next() { + KeyValue kv = iter.next(); + byte[] tenant = ByteArrayUtil.replace(kv.getKey(), 0, kv.getKey().length, TENANT_MAP_PREFIX, null); + byte[] value = kv.getValue(); + + List parts = Arrays.asList(tenant, value); + byte[] separator = ": ".getBytes(); + + byte[] result = ByteArrayUtil.join(separator, parts); + return result; + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Tenant lists are read-only"); + } + + @Override + public void close() { + TenantAsyncIterator.this.tr.close(); + closed = true; + } + + @Override + protected void finalize() throws Throwable { + try { + if (FDB.instance().warnOnUnclosed && !closed) { + System.err.println("CloseableAsyncIterator not closed (listTenants)"); + } + if (!closed) { + close(); + } + } finally { + super.finalize(); + } + } + } + private TenantManagement() {} } diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index 20517fc264..5db860b92e 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -980,7 +980,7 @@ ThreadFuture MultiVersionTransaction::getMappedRange(const Ke auto tr = getTransaction(); auto f = tr.transaction ? 
tr.transaction->getMappedRange(begin, end, mapper, limits, snapshot, reverse) : makeTimeout(); - return abortableFuture(f, tr.onChange); + return abortableFuture(f, tr.onChange, cluster_version_changed()); } ThreadFuture> MultiVersionTransaction::getVersionstamp() { From c683795f6b4e3c874d845aa83a1a49db21ebcf95 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Tue, 12 Apr 2022 16:34:54 -0400 Subject: [PATCH 051/299] add python bindings and revise test code --- TenantTest.java | 91 +++++++++++++++++++ .../apple/foundationdb/TenantManagement.java | 29 ++++-- bindings/python/fdb/impl.py | 5 + bindings/python/fdb/tenant_management.py | 41 +++++++++ fdbclient/MultiVersionTransaction.actor.cpp | 2 +- test_tenant.py | 33 +++++++ 6 files changed, 190 insertions(+), 11 deletions(-) create mode 100644 TenantTest.java create mode 100755 test_tenant.py diff --git a/TenantTest.java b/TenantTest.java new file mode 100644 index 0000000000..d9bb02acdd --- /dev/null +++ b/TenantTest.java @@ -0,0 +1,91 @@ +import java.io.UnsupportedEncodingException; + +import java.util.Arrays; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ThreadLocalRandom; + +import com.apple.foundationdb.Database; +import com.apple.foundationdb.FDB; +import com.apple.foundationdb.KeyValue; +import com.apple.foundationdb.Tenant; +import com.apple.foundationdb.Transaction; +import com.apple.foundationdb.tuple.Tuple; +import com.apple.foundationdb.KeyArrayResult; +import com.apple.foundationdb.TenantManagement; +import com.apple.foundationdb.async.AsyncUtil; +import static com.apple.foundationdb.async.AsyncUtil.collectRemaining; +import com.apple.foundationdb.async.CloseableAsyncIterator; + +public class TenantTest { + private FDB fdb; + private Database db; + CloseableAsyncIterator tenants; + + public TenantTest() { + try { + fdb = FDB.selectAPIVersion(710); + fdb.options().setTraceEnable(null); + db = fdb.open(); +///* + Tuple t1 = Tuple.from("tenant"); + Tuple t2 = Tuple.from("tenant2"); + Tuple t3 = Tuple.from("tenant3"); +//*/ +/* + byte[] t1 = Tuple.from("tenant").pack(); + byte[] t2 = Tuple.from("tenant2").pack(); + byte[] t3 = Tuple.from("tenant3").pack(); +*/ + System.out.println(t1); + System.out.println(t2); + System.out.println(t3); + + TenantManagement.createTenant(db, t1).join(); + TenantManagement.createTenant(db, t2).join(); + TenantManagement.createTenant(db, t3).join(); + + tenants = TenantManagement.listTenants(db, Tuple.from("a").pack(), Tuple.from("z").pack(), 100); + + try { +/* + List result = AsyncUtil.collectRemaining(tenants).join(); + System.out.println("Size: " + result.size()); + for(int i = 0; i < result.size(); i++) { + System.out.println(i); + KeyValue res = result.get(i); + System.out.println(new String(res.getKey())); + System.out.println(new String(res.getValue())); + } +*/ +// /* + while (tenants.hasNext()) { + KeyValue res = tenants.next(); + System.out.println(new String(res.getKey())); + System.out.println(new String(res.getValue())); + } +// */ + } + finally { + tenants.close(); + } + TenantManagement.deleteTenant(db, t1).join(); + TenantManagement.deleteTenant(db, t2).join(); + TenantManagement.deleteTenant(db, t3).join(); + } + catch(Exception e) { + e.printStackTrace(); + } + } + + public void close() { + db.close(); + } + + public static void main(String[] args) { + new TenantTest().close(); + } +} + diff --git a/bindings/java/src/main/com/apple/foundationdb/TenantManagement.java 
b/bindings/java/src/main/com/apple/foundationdb/TenantManagement.java index 3ae786f464..d226d8c044 100644 --- a/bindings/java/src/main/com/apple/foundationdb/TenantManagement.java +++ b/bindings/java/src/main/com/apple/foundationdb/TenantManagement.java @@ -219,23 +219,35 @@ public class TenantManagement { * @param begin The beginning of the range of tenants to list. * @param end The end of the range of the tenants to list. * @param limit The maximum number of tenants to return from this request. - * @return an iterator where each item is a byte array with the tenant name and value. + * @return an iterator where each item is a KeyValue object where the key is the tenant name + * and the value is the unprocessed JSON string containing the tenant's metadata */ - public static CloseableAsyncIterator listTenants(Database db, byte[] begin, byte[] end, int limit) { + public static CloseableAsyncIterator listTenants(Database db, byte[] begin, byte[] end, int limit) { return listTenants_internal(db.createTransaction(), begin, end, limit); } - public static CloseableAsyncIterator listTenants(Database db, Tuple begin, Tuple end, int limit) { + /** + * Lists all tenants in between the range specified. The number of tenants listed can be restricted. + * This is a convenience method that generates the begin and end ranges by packing two {@code Tuple}s. + * + * @param db The database used to create a transaction for listing the tenants. + * @param begin The beginning of the range of tenants to list. + * @param end The end of the range of the tenants to list. + * @param limit The maximum number of tenants to return from this request. + * @return an iterator where each item is a KeyValue object where the key is the tenant name + * and the value is the unprocessed JSON string containing the tenant's metadata + */ + public static CloseableAsyncIterator listTenants(Database db, Tuple begin, Tuple end, int limit) { return listTenants_internal(db.createTransaction(), begin.pack(), end.pack(), limit); } - private static CloseableAsyncIterator listTenants_internal(Transaction tr, byte[] begin, byte[] end, + private static CloseableAsyncIterator listTenants_internal(Transaction tr, byte[] begin, byte[] end, int limit) { return new TenantAsyncIterator(tr, begin, end, limit); } // Templates taken from BoundaryIterator LocalityUtil.java - static class TenantAsyncIterator implements CloseableAsyncIterator { + static class TenantAsyncIterator implements CloseableAsyncIterator { Transaction tr; final byte[] begin; final byte[] end; @@ -268,15 +280,12 @@ public class TenantManagement { return iter.hasNext(); } @Override - public byte[] next() { + public KeyValue next() { KeyValue kv = iter.next(); byte[] tenant = ByteArrayUtil.replace(kv.getKey(), 0, kv.getKey().length, TENANT_MAP_PREFIX, null); byte[] value = kv.getValue(); - List parts = Arrays.asList(tenant, value); - byte[] separator = ": ".getBytes(); - - byte[] result = ByteArrayUtil.join(separator, parts); + KeyValue result = new KeyValue(tenant, value); return result; } diff --git a/bindings/python/fdb/impl.py b/bindings/python/fdb/impl.py index 7c32b07e9b..51d67e5162 100644 --- a/bindings/python/fdb/impl.py +++ b/bindings/python/fdb/impl.py @@ -71,6 +71,11 @@ import types import struct +def remove_prefix(text, prefix): + if text.startswith(prefix): + return text[len(prefix):] + return text + def option_wrap(code): def setfunc(self): self._parent._set_option(code, None, 0) diff --git a/bindings/python/fdb/tenant_management.py 
b/bindings/python/fdb/tenant_management.py index b371a34226..3ee43326e4 100644 --- a/bindings/python/fdb/tenant_management.py +++ b/bindings/python/fdb/tenant_management.py @@ -78,6 +78,41 @@ def _delete_tenant_impl(tr, tenant_name, existence_check_marker, force_existence del tr[key] +class FDBTenantList(object): + """Iterates over the results of list_tenants query. Returns + KeyValue objects. + + """ + + def __init__(self, rangeresult): + self._range = rangeresult + self._iter = iter(self._range) + + def to_list(self): + return list(self.__iter__()) + + def __iter__(self, mode=None): + while True: + result = self._iter.__next__() + + tenant_name = _impl.remove_prefix(result.key, _tenant_map_prefix) + yield _impl.KeyValue(tenant_name, result.value) + +# Lists the tenants created in the cluster, specified by the begin and end range. +# Also limited in number of results by the limit parameter. +# Returns an iterable object that yields KeyValue objects +# where the keys are the tenant names and the values are the unprocessed +# JSON strings of the tenant metadata +@_impl.transactional +def _list_tenants_impl(tr, begin, end, limit): + tr.options.set_read_system_keys() + begin_key = b'%s%s' % (_tenant_map_prefix, begin) + end_key = b'%s%s' % (_tenant_map_prefix, end) + + rangeresult = tr.get_range(begin_key, end_key, limit) + + return FDBTenantList(rangeresult) + def create_tenant(db_or_tr, tenant_name): tenant_name = _impl.process_tenant_name(tenant_name) @@ -93,3 +128,9 @@ def delete_tenant(db_or_tr, tenant_name): # Callers using a transaction are expected to check existence themselves if required existence_check_marker = [] if not isinstance(db_or_tr, _impl.TransactionRead) else [None] _delete_tenant_impl(db_or_tr, tenant_name, existence_check_marker) + +def list_tenants(db_or_tr, begin, end, limit): + begin = _impl.process_tenant_name(begin) + end = _impl.process_tenant_name(end) + + return _list_tenants_impl(db_or_tr, begin, end, limit) \ No newline at end of file diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index 5db860b92e..20517fc264 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -980,7 +980,7 @@ ThreadFuture MultiVersionTransaction::getMappedRange(const Ke auto tr = getTransaction(); auto f = tr.transaction ? 
tr.transaction->getMappedRange(begin, end, mapper, limits, snapshot, reverse) : makeTimeout(); - return abortableFuture(f, tr.onChange, cluster_version_changed()); + return abortableFuture(f, tr.onChange); } ThreadFuture> MultiVersionTransaction::getVersionstamp() { diff --git a/test_tenant.py b/test_tenant.py new file mode 100755 index 0000000000..5f3dd9abd6 --- /dev/null +++ b/test_tenant.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 + +import fdb +import sys + +fdb.api_version(710) +db=fdb.open() + +db.options.set_transaction_timeout(2000) + +#tenant = b'tenant' +#tenant2 = b'tenant2' +#tenant3 = b'tenant3' + +tenant = (u"tenant",) +tenant2 = (u"tenant2",) +tenant3 = (u"tenant3",) + +fdb.tenant_management.create_tenant(db, tenant) +fdb.tenant_management.create_tenant(db, tenant2) +fdb.tenant_management.create_tenant(db, tenant3) + +res = fdb.tenant_management.list_tenants(db, (u"a",), (u"z",), 10) +#res = fdb.tenant_management.list_tenants(db, b'a', b'z', 10) +for t in res: + print(t.key.decode()) + print(t.value.decode()) + +fdb.tenant_management.delete_tenant(db, tenant) +fdb.tenant_management.delete_tenant(db, tenant2) +fdb.tenant_management.delete_tenant(db, tenant3) + +sys.exit(0) From c0aa361885718fe026b15aed6b884c46a7567f1d Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Wed, 13 Apr 2022 16:25:01 -0400 Subject: [PATCH 052/299] add TENANT_LIST to existing tests --- .../com/apple/foundationdb/test/AsyncStackTester.java | 8 ++++++++ .../test/com/apple/foundationdb/test/StackOperation.java | 1 + .../src/test/com/apple/foundationdb/test/StackTester.java | 7 +++++++ bindings/python/tests/tenant_tests.py | 4 ++++ 4 files changed, 20 insertions(+) diff --git a/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java b/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java index 70263b510a..a66813524a 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java @@ -483,6 +483,14 @@ public class AsyncStackTester { inst.push(TenantManagement.deleteTenant(inst.context.db, tenantName)); }, FDB.DEFAULT_EXECUTOR); } + else if (op == StackOperation.TENANT_LIST) { + return inst.popParams(3).thenAcceptAsync(params -> { + byte[] begin = (byte[])params.get(0); + byte[] end = (byte[])params.get(1); + int limit = StackUtils.getInt(params.get(2)); + inst.push(TenantManagement.listTenants(inst.context.db, begin, end, limit)); + }, FDB.DEFAULT_EXECUTOR); + } else if (op == StackOperation.TENANT_SET_ACTIVE) { return inst.popParam().thenAcceptAsync(param -> { byte[] tenantName = (byte[])param; diff --git a/bindings/java/src/test/com/apple/foundationdb/test/StackOperation.java b/bindings/java/src/test/com/apple/foundationdb/test/StackOperation.java index 5cd013195d..e67d4cff81 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/StackOperation.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/StackOperation.java @@ -76,6 +76,7 @@ enum StackOperation { // Tenants TENANT_CREATE, TENANT_DELETE, + TENANT_LIST, TENANT_SET_ACTIVE, TENANT_CLEAR_ACTIVE, diff --git a/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java b/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java index 0fc9141c96..002af5e97a 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java @@ -429,6 +429,13 @@ public class StackTester { byte[] tenantName = 
(byte[])inst.popParam().join(); inst.push(TenantManagement.deleteTenant(inst.context.db, tenantName)); } + else if (op == StackOperation.TENANT_LIST) { + List params = inst.popParams(3).join(); + byte[] begin = (byte[])params.get(0); + byte[] end = (byte[])params.get(1); + int limit = StackUtils.getInt(params.get(2)); + inst.push(TenantManagement.listTenants(inst.context.db, begin, end, limit)); + } else if (op == StackOperation.TENANT_SET_ACTIVE) { byte[] tenantName = (byte[])inst.popParam().join(); inst.context.setTenant(Optional.of(tenantName)); diff --git a/bindings/python/tests/tenant_tests.py b/bindings/python/tests/tenant_tests.py index 2af7dd3307..b1d883a5ed 100755 --- a/bindings/python/tests/tenant_tests.py +++ b/bindings/python/tests/tenant_tests.py @@ -59,6 +59,10 @@ def test_tenant_operations(db): fdb.tenant_management.create_tenant(db, b'tenant1') fdb.tenant_management.create_tenant(db, b'tenant2') + tenant_list = fdb.tenant_management.list_tenants(db, b'a', b'z', 10).to_list() + assert tenant_list[0] == b'tenant1' + assert tenant_list[1] == b'tenant2' + tenant1 = db.open_tenant(b'tenant1') tenant2 = db.open_tenant(b'tenant2') From d58f918bba764eb2599f69568bab3523efea1ec6 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 14 Apr 2022 12:02:27 -0700 Subject: [PATCH 053/299] update unittests; add more information in workload --- fdbclient/SystemData.cpp | 3 ++- fdbserver/DDTeamCollection.actor.cpp | 10 +++++----- fdbserver/workloads/ReadWrite.actor.cpp | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 9f70bf9e18..dfbe1db0ec 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -1456,11 +1456,12 @@ TEST_CASE("/SystemData/SerDes/SSI") { ssi.uniqueID = UID(0x1234123412341234, 0x5678567856785678); ssi.locality = localityData; ssi.initEndpoints(); + ssi.startAcceptingRequests(); testSSISerdes(ssi); ssi.tssPairID = UID(0x2345234523452345, 0x1238123812381238); - + ssi.stopAcceptingRequests(); testSSISerdes(ssi); printf("ssi serdes test complete\n"); diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index f241133909..01f6f79de5 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -5676,10 +5676,10 @@ public: collection->disableBuildingTeams(); collection->setCheckTeamDelay(); - bool wantsNewServers = true; - bool wantsTrueBest = true; - bool preferLowerUtilization = true; - bool teamMustHaveShards = false; + auto wantsNewServers = WantNewServers::True; + auto wantsTrueBest = WantTrueBest::True; + auto preferLowerUtilization = PreferLowerUtilization::True; + auto teamMustHaveShards = TeamMustHaveShards::False; std::vector completeSources{ UID(1, 0), UID(2, 0), UID(3, 0) }; state GetTeamRequest req(wantsNewServers, wantsTrueBest, preferLowerUtilization, teamMustHaveShards); @@ -5689,7 +5689,7 @@ public: return r1 == r2 ? 0 : (r1 > r2 ? -1 : 1); }; - state GetTeamRequest reqHigh(wantsNewServers, wantsTrueBest, false, teamMustHaveShards); + state GetTeamRequest reqHigh(wantsNewServers, wantsTrueBest, PreferLowerUtilization::False, teamMustHaveShards); reqHigh.teamSorter = [](Reference a, Reference b) { auto r1 = a->getLoadReadBandwidth(), r2 = b->getLoadReadBandwidth(); return r1 == r2 ? 0 : (r1 < r2 ? 
-1 : 1); diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp index dd8d2822e6..f9c47bb556 100644 --- a/fdbserver/workloads/ReadWrite.actor.cpp +++ b/fdbserver/workloads/ReadWrite.actor.cpp @@ -798,7 +798,7 @@ struct ReadWriteWorkload : KVWorkload { // calculate hot server count void setHotServers() { hotServerCount = ceil(hotServerFraction * serverShards.size()); - std::cout << "Choose " << hotServerCount << " hot servers: ["; + std::cout << "Choose " << hotServerCount <<"/" << serverShards.size() <<" hot servers: ["; int begin = currentHotRound * hotServerCount; for (int i = 0; i < hotServerCount; ++i) { int idx = (begin + i) % serverShards.size(); From a7a9de781e62a7583c8a06c710fe358a1d7e0e93 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Sat, 16 Apr 2022 22:51:55 -0700 Subject: [PATCH 054/299] update unittests --- fdbserver/DDTeamCollection.actor.cpp | 10 ++-------- fdbserver/DataDistribution.actor.h | 5 ++++- fdbserver/DataDistributionQueue.actor.cpp | 6 +++--- fdbserver/TCInfo.actor.cpp | 2 +- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 01f6f79de5..27fe9379be 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -5684,16 +5684,10 @@ public: state GetTeamRequest req(wantsNewServers, wantsTrueBest, preferLowerUtilization, teamMustHaveShards); req.completeSources = completeSources; - req.teamSorter = [](Reference a, Reference b) { - auto r1 = a->getLoadReadBandwidth(), r2 = b->getLoadReadBandwidth(); - return r1 == r2 ? 0 : (r1 > r2 ? -1 : 1); - }; + req.teamSorter = greaterReadLoad; state GetTeamRequest reqHigh(wantsNewServers, wantsTrueBest, PreferLowerUtilization::False, teamMustHaveShards); - reqHigh.teamSorter = [](Reference a, Reference b) { - auto r1 = a->getLoadReadBandwidth(), r2 = b->getLoadReadBandwidth(); - return r1 == r2 ? 0 : (r1 < r2 ? 
-1 : 1); - }; + reqHigh.teamSorter = lessReadLoad; wait(collection->getTeam(req) && collection->getTeam(reqHigh)); std::pair>, bool> resTeam = req.reply.getFuture().get(), diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index 93a856307e..cc8c80c592 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -481,6 +481,9 @@ struct StorageWiggler : ReferenceCounted { ACTOR Future>> getServerListAndProcessClasses( Transaction* tr); - +// return -1 if a.readload > b.readload +int greaterReadLoad(Reference a, Reference b); +// return -1 if a.readload < b.readload +int lessReadLoad(Reference a, Reference b); #include "flow/unactorcompiler.h" #endif diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index bbf985ce23..5f946f5639 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1587,7 +1587,7 @@ ACTOR static Future rebalanceTeams(DDQueueData* self, int64_t destBytes = destTeam->getLoadBytes(); bool sourceAndDestTooSimilar = - sourceBytes - destBytes <= 3 * std::max(SERVER_KNOBS->MIN_SHARD_BYTES, metrics.bytes); + abs(sourceBytes - destBytes) <= 3 * std::max(SERVER_KNOBS->MIN_SHARD_BYTES, metrics.bytes); traceEvent->detail("SourceBytes", sourceBytes) .detail("DestBytes", destBytes) .detail("ShardBytes", metrics.bytes) @@ -1668,7 +1668,7 @@ ACTOR Future BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, try { // FIXME: change back to BG_REBALANCE_SWITCH_CHECK_INTERVAL after test - state Future delayF = delay(0.1, TaskPriority::DataDistributionLaunch); + state Future delayF = delay(rebalancePollingInterval, TaskPriority::DataDistributionLaunch); if ((now() - lastRead) > SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL) { tr.setOption(FDBTransactionOptions::LOCK_AWARE); tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); @@ -2055,7 +2055,7 @@ ACTOR Future dataDistributionQueue(Database cx, // FIXME: Use BgDDLoadBalance for disk rebalance too after DD simulation test proof. // balancingFutures.push_back(BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM)); // balancingFutures.push_back(BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM)); - if (SERVER_KNOBS->READ_SAMPLING_ENABLED == true) { + if (SERVER_KNOBS->READ_SAMPLING_ENABLED) { balancingFutures.push_back( BgDDLoadRebalance(&self, i, SERVER_KNOBS->PRIORITY_REBALANCE_READ_OVERUTIL_TEAM)); balancingFutures.push_back( diff --git a/fdbserver/TCInfo.actor.cpp b/fdbserver/TCInfo.actor.cpp index 10b3522540..b7d819841f 100644 --- a/fdbserver/TCInfo.actor.cpp +++ b/fdbserver/TCInfo.actor.cpp @@ -400,7 +400,7 @@ double TCTeamInfo::getLoadReadBandwidth(bool includeInFlight, double inflightPen } } return (size == 0 ? 0 : sum / size) + - (includeInFlight && !servers.empty() ? inflightPenalty * getReadInFlightToTeam() / servers.size() : 0); + (includeInFlight ? 
inflightPenalty * getReadInFlightToTeam() / servers.size() : 0); } int64_t TCTeamInfo::getMinAvailableSpace(bool includeInFlight) const { From e0ae0942c477ac00d7d64e2150a1287b4dcf439c Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 20 Apr 2022 12:15:40 -0700 Subject: [PATCH 055/299] fix inflight read division; temp destComplete fix; 0.1 constant poll time --- fdbserver/DataDistributionQueue.actor.cpp | 50 +++++++++++++++-------- fdbserver/TCInfo.actor.cpp | 4 +- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 5f946f5639..de1b8edc03 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -416,16 +416,19 @@ void launchDest(RelocateData& relocation, } } } +void completeDest(RelocateData const& relocation, std::map& destBusymap) { + int destWorkFactor = getDestWorkFactor(); + for (UID id : relocation.completeDests) { + destBusymap[id].removeWork(relocation.priority, destWorkFactor); + } +} void complete(RelocateData const& relocation, std::map& busymap, std::map& destBusymap) { ASSERT(relocation.workFactor > 0); for (int i = 0; i < relocation.src.size(); i++) busymap[relocation.src[i]].removeWork(relocation.priority, relocation.workFactor); - int destWorkFactor = getDestWorkFactor(); - for (UID id : relocation.completeDests) { - destBusymap[id].removeWork(relocation.priority, destWorkFactor); - } + completeDest(relocation, destBusymap); } ACTOR Future dataDistributionRelocator(struct DDQueueData* self, @@ -1071,13 +1074,13 @@ struct DDQueueData { } }; -// return -1 if a.readload > b.readload +// return -1 if a.readload > b.readload, usually used to choose a dest team with low read load int greaterReadLoad(Reference a, Reference b) { auto r1 = a->getLoadReadBandwidth(true, 2), r2 = b->getLoadReadBandwidth(true, 2); return r1 == r2 ? 0 : (r1 > r2 ? -1 : 1); } -// return -1 if a.readload < b.readload +// return -1 if a.readload < b.readload, usually used to choose a source team with high read load int lessReadLoad(Reference a, Reference b) { auto r1 = a->getLoadReadBandwidth(), r2 = b->getLoadReadBandwidth(); return r1 == r2 ? 0 : (r1 < r2 ? 
-1 : 1); @@ -1449,7 +1452,10 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, self->noErrorActors.add( trigger([destinationRef, readLoad]() mutable { destinationRef.addReadInFlightToTeam(-readLoad); }, delay(SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL))); - rd.completeDests.clear(); + + // completeDest(rd, self->destBusymap); + // rd.completeDests.clear(); + wait(delay(SERVER_KNOBS->RETRY_RELOCATESHARD_DELAY, TaskPriority::DataDistributionLaunch)); } } @@ -1517,12 +1523,14 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, return false; } if (metrics.keys.present() && metrics.bytes > 0) { - auto srcLoad = sourceTeam->getLoadReadBandwidth(), destLoad = destTeam->getLoadReadBandwidth(); - if (abs(srcLoad - destLoad) <= - 3 * std::max(metrics.bytesReadPerKSecond, SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS)) { - traceEvent->detail("SkipReason", "TeamTooSimilar") - .detail("ShardReadBandwidth", metrics.bytesReadPerKSecond) - .detail("SrcReadBandwidth", srcLoad); + auto srcLoad = sourceTeam->getLoadReadBandwidth(false), destLoad = destTeam->getLoadReadBandwidth(); + traceEvent->detail("ShardReadBandwidth", metrics.bytesReadPerKSecond) + .detail("SrcReadBandwidth", srcLoad) + .detail("DestReadBandwidth", destLoad); + + if (srcLoad - destLoad <= + 5 * std::max(metrics.bytesReadPerKSecond, SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS)) { + traceEvent->detail("SkipReason", "TeamTooSimilar"); return false; } // Verify the shard is still in ShardsAffectedByTeamFailure @@ -1587,7 +1595,7 @@ ACTOR static Future rebalanceTeams(DDQueueData* self, int64_t destBytes = destTeam->getLoadBytes(); bool sourceAndDestTooSimilar = - abs(sourceBytes - destBytes) <= 3 * std::max(SERVER_KNOBS->MIN_SHARD_BYTES, metrics.bytes); + sourceBytes - destBytes <= 3 * std::max(SERVER_KNOBS->MIN_SHARD_BYTES, metrics.bytes); traceEvent->detail("SourceBytes", sourceBytes) .detail("DestBytes", destBytes) .detail("ShardBytes", metrics.bytes) @@ -1648,6 +1656,7 @@ ACTOR Future BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, state Transaction tr(self->cx); state double lastRead = 0; state bool skipCurrentLoop = false; + state Future delayF = Never(); state const bool readRebalance = !isDiskRebalancePriority(ddPriority); state const char* eventName = isMountainChopperPriority(ddPriority) ? "BgDDMountainChopper" : "BgDDValleyFiller"; @@ -1668,7 +1677,7 @@ ACTOR Future BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, try { // FIXME: change back to BG_REBALANCE_SWITCH_CHECK_INTERVAL after test - state Future delayF = delay(rebalancePollingInterval, TaskPriority::DataDistributionLaunch); + delayF = delay(0.1, TaskPriority::DataDistributionLaunch); if ((now() - lastRead) > SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL) { tr.setOption(FDBTransactionOptions::LOCK_AWARE); tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); @@ -1701,6 +1710,7 @@ ACTOR Future BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, // set loop interval to avoid busy wait here. rebalancePollingInterval = std::max(rebalancePollingInterval, SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL); + tr.reset(); continue; } @@ -1816,6 +1826,7 @@ ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionInde // set loop interval to avoid busy wait here. 
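// (Presumed rationale: backing off to at least BG_REBALANCE_SWITCH_CHECK_INTERVAL keeps this loop cheap while rebalancing is disabled, yet still re-reads rebalanceDDIgnoreKey often enough to notice when it is cleared.)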
rebalancePollingInterval = std::max(rebalancePollingInterval, SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL); + tr.reset(); continue; } @@ -1938,6 +1949,7 @@ ACTOR Future BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex) // set loop interval to avoid busy wait here. rebalancePollingInterval = std::max(rebalancePollingInterval, SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL); + tr.reset(); continue; } @@ -2120,7 +2132,9 @@ ACTOR Future dataDistributionQueue(Database cx, debug_setCheckRelocationDuration(false); } } - when(KeyRange done = waitNext(rangesComplete.getFuture())) { keysToLaunchFrom = done; } + when(KeyRange done = waitNext(rangesComplete.getFuture())) { + keysToLaunchFrom = done; + } when(wait(recordMetrics)) { Promise req; getAverageShardBytes.send(req); @@ -2167,7 +2181,9 @@ ACTOR Future dataDistributionQueue(Database cx, } when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator when(wait(waitForAll(balancingFutures))) {} - when(Promise r = waitNext(getUnhealthyRelocationCount)) { r.send(self.unhealthyRelocations); } + when(Promise r = waitNext(getUnhealthyRelocationCount)) { + r.send(self.unhealthyRelocations); + } } } } catch (Error& e) { diff --git a/fdbserver/TCInfo.actor.cpp b/fdbserver/TCInfo.actor.cpp index b7d819841f..a9cf76b95e 100644 --- a/fdbserver/TCInfo.actor.cpp +++ b/fdbserver/TCInfo.actor.cpp @@ -387,6 +387,7 @@ int64_t TCTeamInfo::getLoadBytes(bool includeInFlight, double inflightPenalty) c return (physicalBytes + (inflightPenalty * inFlightBytes)) * availableSpaceMultiplier; } +// average read bandwidth within a team double TCTeamInfo::getLoadReadBandwidth(bool includeInFlight, double inflightPen { // FIXME: consider team load variance double sum = 0; @@ -400,7 +401,8 @@ double TCTeamInfo::getLoadReadBandwidth(bool includeInFlightPen return (size == 0 ? 0 : sum / size) + - (includeInFlight ? inflightPenalty * getReadInFlightToTeam() / servers.size() : 0); + // no need to divide the in-flight bandwidth: when it was added, it was already attributed to a single server + (includeInFlight ? 
inflightPenalty * getReadInFlightToTeam() : 0); } int64_t TCTeamInfo::getMinAvailableSpace(bool includeInFlight) const { From 25e1e75d9edeb7c15b48c1828c7ed015380e51a3 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 20 Apr 2022 13:32:04 -0700 Subject: [PATCH 056/299] enable destComplete --- fdbserver/DataDistributionQueue.actor.cpp | 4 ++-- fdbserver/workloads/ReadWrite.actor.cpp | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index de1b8edc03..e2da233bd6 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1453,8 +1453,8 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, trigger([destinationRef, readLoad]() mutable { destinationRef.addReadInFlightToTeam(-readLoad); }, delay(SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL))); - // completeDest(rd, self->destBusymap); - // rd.completeDests.clear(); + completeDest(rd, self->destBusymap); + rd.completeDests.clear(); wait(delay(SERVER_KNOBS->RETRY_RELOCATESHARD_DELAY, TaskPriority::DataDistributionLaunch)); } diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp index f9c47bb556..e4c6818ca8 100644 --- a/fdbserver/workloads/ReadWrite.actor.cpp +++ b/fdbserver/workloads/ReadWrite.actor.cpp @@ -798,7 +798,8 @@ struct ReadWriteWorkload : KVWorkload { // calculate hot server count void setHotServers() { hotServerCount = ceil(hotServerFraction * serverShards.size()); - std::cout << "Choose " << hotServerCount <<"/" << serverShards.size() <<" hot servers: ["; + std::cout << "Choose " << hotServerCount << "/" << serverShards.size() << "/" << serverInterfaces.size() + << " hot servers: ["; int begin = currentHotRound * hotServerCount; for (int i = 0; i < hotServerCount; ++i) { int idx = (begin + i) % serverShards.size(); From 131adec811c9d1d7261017fc1f02925fe9d2c589 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 20 Apr 2022 15:28:03 -0700 Subject: [PATCH 057/299] add canQueue --- fdbserver/DataDistributionQueue.actor.cpp | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index e2da233bd6..6a95475685 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1072,6 +1072,12 @@ struct DDQueueData { } return highestPriority; } + + bool canQueue(const std::vector& ids) const { + return std::all_of(ids.begin(), ids.end(), [this](const UID& id) { + return this->queue.count(id) == 0 || this->queue.at(id).size() < 3 ; // == RELOCATION_PARALLELISM_PER_SOURCE_SERVER + 1 + }); + } }; // return -1 if a.readload > b.readload, usually for choose dest team with low read load @@ -1742,14 +1748,22 @@ ACTOR Future BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, } // clang-format off wait(getSrcDestTeams(self, teamCollectionIndex, srcReq, destReq, &sourceTeam, &destTeam,ddPriority,&traceEvent)); + // clang-format on if (sourceTeam.isValid() && destTeam.isValid()) { if (readRebalance) { - wait(store(moved,rebalanceReadLoad(self,ddPriority, sourceTeam, destTeam,teamCollectionIndex == 0,&traceEvent))); + // check can queue for src server + if (self->canQueue(sourceTeam->getServerIDs())) { + wait(store( + moved, + rebalanceReadLoad( + self, ddPriority, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent))); + } } else { - 
wait(store(moved,rebalanceTeams(self,ddPriority, sourceTeam, destTeam,teamCollectionIndex == 0,&traceEvent))); + wait(store(moved, + rebalanceTeams( + self, ddPriority, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent))); } } - // clang-format on moved ? resetCount = 0 : resetCount++; } From af9e5ba8852ae31c94ea8d29e369b2f1cbd296e1 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 20 Apr 2022 22:19:56 -0700 Subject: [PATCH 058/299] move canQueue, 60s each source server, random select portion of shards --- fdbserver/DataDistributionQueue.actor.cpp | 34 +++++++++++++---------- flow/IRandom.h | 5 ++-- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 6a95475685..0f75f5fbd8 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -903,6 +903,12 @@ struct DDQueueData { // logRelocation( results, "GotSourceServers" ); fetchingSourcesQueue.erase(results); + + // when doing read rebalance, to avoid the hottest team is chosen many times within 1 traffic sample period, if there are too many shard in the queue or the last shard appending time is less than 1 min, just discard this relocation request + if(results.reason == RelocateReason::REBALANCE_READ && !canQueue(results.src)) { + return; + } + queueMap.insert(results.keys, results); for (int i = 0; i < results.src.size(); i++) { queue[results.src[i]].insert(results); @@ -1075,7 +1081,10 @@ struct DDQueueData { bool canQueue(const std::vector& ids) const { return std::all_of(ids.begin(), ids.end(), [this](const UID& id) { - return this->queue.count(id) == 0 || this->queue.at(id).size() < 3 ; // == RELOCATION_PARALLELISM_PER_SOURCE_SERVER + 1 + if(this->queue.count(id) && this->queue.at(id).size()) { + return now() - this->queue.at(id).rbegin()->startTime >= 60.0; + } + return true; }); } }; @@ -1512,12 +1521,17 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, state std::vector shards = self->shardsAffectedByTeamFailure->getShardsFor( ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary)); - if (!shards.size()) { + if (shards.size() <= 1) { traceEvent->detail("SkipReason", "NoShardOnSource"); return false; } + + // TODO: set 1000 as a knob + // randomly compare a portion of all shards + int shuffleLen = std::min((int)(shards.size() * 0.67), 1000); + deterministicRandom()->randomShuffle(shards, shuffleLen); state Future healthMetrics = self->cx->getHealthMetrics(true); - state GetMetricsRequest req(shards); + state GetMetricsRequest req(std::vector(shards.begin(), shards.begin() + shuffleLen)); req.comparator = [](const StorageMetrics& a, const StorageMetrics& b) { return a.bytesReadPerKSecond / std::max(a.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES) < b.bytesReadPerKSecond / std::max(b.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES); @@ -1748,22 +1762,14 @@ ACTOR Future BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, } // clang-format off wait(getSrcDestTeams(self, teamCollectionIndex, srcReq, destReq, &sourceTeam, &destTeam,ddPriority,&traceEvent)); - // clang-format on if (sourceTeam.isValid() && destTeam.isValid()) { if (readRebalance) { - // check can queue for src server - if (self->canQueue(sourceTeam->getServerIDs())) { - wait(store( - moved, - rebalanceReadLoad( - self, ddPriority, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent))); - } + wait(store(moved,rebalanceReadLoad(self, ddPriority, sourceTeam, destTeam, 
teamCollectionIndex == 0, &traceEvent))); } else { - wait(store(moved, - rebalanceTeams( - self, ddPriority, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent))); + wait(store(moved,rebalanceTeams(self, ddPriority, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent))); } } + // clang-format on moved ? resetCount = 0 : resetCount++; } diff --git a/flow/IRandom.h b/flow/IRandom.h index 87f7f42424..917e556ce0 100644 --- a/flow/IRandom.h +++ b/flow/IRandom.h @@ -156,8 +156,9 @@ public: } template - void randomShuffle(C& container) { - int s = (int)container.size(); + void randomShuffle(C& container, int shuffleLen = -1) { + int s = shuffleLen < 0 ? std::min(shuffleLen, (int)container.size()) : (int)container.size(); + for (int i = 0; i < s; i++) { int j = randomInt(i, s); if (i != j) { From a8bfd0f481180a08dbcc37550a6fe3bb6e19f2d1 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 20 Apr 2022 23:12:34 -0700 Subject: [PATCH 059/299] move canQueue, 60s each source server, random select portion of shards/destServer --- fdbserver/DataDistributionQueue.actor.cpp | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 0f75f5fbd8..0a3047dac1 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -903,12 +903,6 @@ struct DDQueueData { // logRelocation( results, "GotSourceServers" ); fetchingSourcesQueue.erase(results); - - // when doing read rebalance, to avoid the hottest team is chosen many times within 1 traffic sample period, if there are too many shard in the queue or the last shard appending time is less than 1 min, just discard this relocation request - if(results.reason == RelocateReason::REBALANCE_READ && !canQueue(results.src)) { - return; - } - queueMap.insert(results.keys, results); for (int i = 0; i < results.src.size(); i++) { queue[results.src[i]].insert(results); @@ -1081,7 +1075,7 @@ struct DDQueueData { bool canQueue(const std::vector& ids) const { return std::all_of(ids.begin(), ids.end(), [this](const UID& id) { - if(this->queue.count(id) && this->queue.at(id).size()) { + if (this->queue.count(id) && this->queue.at(id).size()) { return now() - this->queue.at(id).rbegin()->startTime >= 60.0; } return true; @@ -1179,11 +1173,12 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_ONE_LEFT; - auto req = GetTeamRequest(WantNewServers(rd.wantsNewServers), - WantTrueBest(isValleyFillerPriority(rd.priority)), - PreferLowerUtilization::True, - TeamMustHaveShards::False, - inflightPenalty); + auto req = + GetTeamRequest(WantNewServers(rd.wantsNewServers), + WantTrueBest(rd.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM), + PreferLowerUtilization::True, + TeamMustHaveShards::False, + inflightPenalty); req.src = rd.src; req.completeSources = rd.completeSources; @@ -1764,7 +1759,9 @@ ACTOR Future BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, wait(getSrcDestTeams(self, teamCollectionIndex, srcReq, destReq, &sourceTeam, &destTeam,ddPriority,&traceEvent)); if (sourceTeam.isValid() && destTeam.isValid()) { if (readRebalance) { - wait(store(moved,rebalanceReadLoad(self, ddPriority, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent))); + if(self->canQueue(sourceTeam->getServerIDs())) { + wait(store(moved,rebalanceReadLoad(self, 
ddPriority, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent))); + } } else { wait(store(moved,rebalanceTeams(self, ddPriority, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent))); } From 9e79ff49a6bbfe549e50d8e0a7148bd019bbfaf3 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Thu, 21 Apr 2022 16:58:32 -0400 Subject: [PATCH 060/299] address code review comments and add more places for test code --- bindings/bindingtester/spec/tenantTester.md | 6 ++++++ bindings/bindingtester/tests/api.py | 4 ++++ .../com/apple/foundationdb/TenantManagement.java | 2 +- bindings/python/fdb/tenant_management.py | 2 +- bindings/python/tests/tenant_tests.py | 14 ++++++++++++-- bindings/python/tests/tester.py | 6 ++++++ 6 files changed, 30 insertions(+), 4 deletions(-) diff --git a/bindings/bindingtester/spec/tenantTester.md b/bindings/bindingtester/spec/tenantTester.md index 2ba54a74c4..df33ef4b0b 100644 --- a/bindings/bindingtester/spec/tenantTester.md +++ b/bindings/bindingtester/spec/tenantTester.md @@ -38,6 +38,12 @@ The tenant API introduces some new operations: Unsets the active tenant. +#### TENANT_LIST + + Pops the top 3 items off of the stack as BEGIN, END, & LIMIT. Returns list + of tenants contained in the range BEGIN to END, numbering LIMIT at most. + May optionally push a future onto the stack. + Updates to Existing Instructions -------------------------------- diff --git a/bindings/bindingtester/tests/api.py b/bindings/bindingtester/tests/api.py index fd495fac76..31ebe473a7 100644 --- a/bindings/bindingtester/tests/api.py +++ b/bindings/bindingtester/tests/api.py @@ -600,6 +600,10 @@ class ApiTest(Test): instructions.append(op) elif op == 'TENANT_CLEAR_ACTIVE': instructions.append(op) + elif op == 'TENANT_LIST': + instructions.push_args(b'', b'\xff', 10) + instructions.append(op) + self.add_strings(1) else: assert False, 'Unknown operation: ' + op diff --git a/bindings/java/src/main/com/apple/foundationdb/TenantManagement.java b/bindings/java/src/main/com/apple/foundationdb/TenantManagement.java index d226d8c044..262ebfef7c 100644 --- a/bindings/java/src/main/com/apple/foundationdb/TenantManagement.java +++ b/bindings/java/src/main/com/apple/foundationdb/TenantManagement.java @@ -282,7 +282,7 @@ public class TenantManagement { @Override public KeyValue next() { KeyValue kv = iter.next(); - byte[] tenant = ByteArrayUtil.replace(kv.getKey(), 0, kv.getKey().length, TENANT_MAP_PREFIX, null); + byte[] tenant = Arrays.copyOfRange(kv.getKey(), TENANT_MAP_PREFIX.length, kv.getKey().length); byte[] value = kv.getValue(); KeyValue result = new KeyValue(tenant, value); diff --git a/bindings/python/fdb/tenant_management.py b/bindings/python/fdb/tenant_management.py index 3ee43326e4..c8b061611f 100644 --- a/bindings/python/fdb/tenant_management.py +++ b/bindings/python/fdb/tenant_management.py @@ -91,7 +91,7 @@ class FDBTenantList(object): def to_list(self): return list(self.__iter__()) - def __iter__(self, mode=None): + def __iter__(self): while True: result = self._iter.__next__() diff --git a/bindings/python/tests/tenant_tests.py b/bindings/python/tests/tenant_tests.py index b1d883a5ed..0570c42537 100755 --- a/bindings/python/tests/tenant_tests.py +++ b/bindings/python/tests/tenant_tests.py @@ -60,8 +60,16 @@ def test_tenant_operations(db): fdb.tenant_management.create_tenant(db, b'tenant2') tenant_list = fdb.tenant_management.list_tenants(db, b'a', b'z', 10).to_list() - assert tenant_list[0] == b'tenant1' - assert tenant_list[1] == b'tenant2' + assert tenant_list[0].key == b'tenant1' + 
assert tenant_list[1].key == b'tenant2' + + t1_entry = tenant_list[0].value + t1_json = json.loads(t1_entry) + p1 = t1_json['prefix'].encode('utf8') + + t2_entry = tenant_list[1].value + t2_json = json.loads(t2_entry) + p2 = t2_json['prefix'].encode('utf8') tenant1 = db.open_tenant(b'tenant1') tenant2 = db.open_tenant(b'tenant2') @@ -73,10 +81,12 @@ def test_tenant_operations(db): tenant1_entry = db[b'\xff\xff/management/tenant_map/tenant1'] tenant1_json = json.loads(tenant1_entry) prefix1 = tenant1_json['prefix'].encode('utf8') + assert prefix1 == p1 tenant2_entry = db[b'\xff\xff/management/tenant_map/tenant2'] tenant2_json = json.loads(tenant2_entry) prefix2 = tenant2_json['prefix'].encode('utf8') + assert prefix2 == p2 assert tenant1[b'tenant_test_key'] == b'tenant1' assert db[prefix1 + b'tenant_test_key'] == b'tenant1' diff --git a/bindings/python/tests/tester.py b/bindings/python/tests/tester.py index 936f7015c0..f392772a31 100644 --- a/bindings/python/tests/tester.py +++ b/bindings/python/tests/tester.py @@ -604,6 +604,12 @@ class Tester: self.tenant = self.db.open_tenant(name) elif inst.op == six.u("TENANT_CLEAR_ACTIVE"): self.tenant = None + elif inst.op == six.u("TENANT_LIST"): + begin = inst.pop() + end = inst.pop() + limit = inst.pop() + tenant_list = fdb.tenant_management.list_tenants(self.db, begin, end, limit) + inst.push(tenant_list) elif inst.op == six.u("UNIT_TESTS"): try: test_db_options(db) From 04311d001e0cacfb055c3dd6e92e9fe478289bca Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 21 Apr 2022 22:37:16 -0700 Subject: [PATCH 061/299] topK shard random selection --- fdbserver/DDTeamCollection.actor.cpp | 4 +- fdbserver/DataDistribution.actor.h | 9 +-- fdbserver/DataDistributionQueue.actor.cpp | 71 ++++++++++++--------- fdbserver/DataDistributionTracker.actor.cpp | 29 ++++++--- 4 files changed, 67 insertions(+), 46 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 27fe9379be..d4b437bad1 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -1473,7 +1473,7 @@ public: wait(delay(SERVER_KNOBS->DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY)); state std::vector shards = self->shardsAffectedByTeamFailure->getShardsFor( ShardsAffectedByTeamFailure::Team(team->getServerIDs(), self->primary)); - state std::vector> sizes; + state std::vector>> sizes; sizes.reserve(shards.size()); for (auto const& shard : shards) { @@ -1488,7 +1488,7 @@ public: int64_t bytesLost = 0; for (auto const& size : sizes) { - bytesLost += size.get().bytes; + bytesLost += size.get()[0].bytes; } TraceEvent(SevWarnAlways, "DDZeroServerLeftInTeam", self->distributorId) diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index cc8c80c592..16c42f23c2 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -155,13 +155,14 @@ struct GetMetricsRequest { // whether a < b typedef std::function MetricsComparator; std::vector keys; - Promise reply; + int topK = 1; // default only return the top 1 shard based on the comparator + Promise> reply; // topK storage metrics Optional - comparator; // if comparator is assigned, return the largest one in keys, otherwise return the sum of metrics + comparator; // if comparator is assigned, return the largest topK in keys, otherwise return the sum of metrics GetMetricsRequest() {} - GetMetricsRequest(KeyRange const& keys) : keys({ keys }) {} - GetMetricsRequest(std::vector const& keys) : keys(keys) {} + 
GetMetricsRequest(KeyRange const& keys, int topK = 1) : keys({ keys }), topK(topK) {} + GetMetricsRequest(std::vector const& keys, int topK = 1) : keys(keys), topK(topK) {} }; struct GetMetricsListRequest { diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 0a3047dac1..82a5082020 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1146,8 +1146,9 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, self->suppressIntervals = 0; } - state StorageMetrics metrics = + std::vector metricsList = wait(brokenPromiseToNever(self->getShardMetrics.getReply(GetMetricsRequest(rd.keys)))); + state StorageMetrics metrics = metricsList[0]; ASSERT(rd.src.size()); loop { @@ -1521,45 +1522,55 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, return false; } - // TODO: set 1000 as a knob - // randomly compare a portion of all shards - int shuffleLen = std::min((int)(shards.size() * 0.67), 1000); - deterministicRandom()->randomShuffle(shards, shuffleLen); + // TODO: set 100 as a knob + // randomly choose topK shards state Future healthMetrics = self->cx->getHealthMetrics(true); - state GetMetricsRequest req(std::vector(shards.begin(), shards.begin() + shuffleLen)); + state GetMetricsRequest req(shards, 100); req.comparator = [](const StorageMetrics& a, const StorageMetrics& b) { return a.bytesReadPerKSecond / std::max(a.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES) < b.bytesReadPerKSecond / std::max(b.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES); }; - state StorageMetrics metrics = wait(brokenPromiseToNever(self->getShardMetrics.getReply(req))); + state std::vector metricsList = wait(brokenPromiseToNever(self->getShardMetrics.getReply(req))); wait(ready(healthMetrics)); if (getWorstCpu(healthMetrics.get()) < 25.0) { // 25% traceEvent->detail("SkipReason", "LowReadLoad"); return false; } - if (metrics.keys.present() && metrics.bytes > 0) { - auto srcLoad = sourceTeam->getLoadReadBandwidth(false), destLoad = destTeam->getLoadReadBandwidth(); - traceEvent->detail("ShardReadBandwidth", metrics.bytesReadPerKSecond) - .detail("SrcReadBandwidth", srcLoad) - .detail("DestReadBandwidth", destLoad); - if (srcLoad - destLoad <= - 5 * std::max(metrics.bytesReadPerKSecond, SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS)) { - traceEvent->detail("SkipReason", "TeamTooSimilar"); - return false; + int chosenIdx = -1; + for (int i = 0; i < SERVER_KNOBS->REBALANCE_MAX_RETRIES; ++i) { + int idx = deterministicRandom()->randomInt(0, metricsList.size()); + if (metricsList[idx].keys.present() && metricsList[i].bytes > 0) { + chosenIdx = idx; + break; } - // Verify the shard is still in ShardsAffectedByTeamFailure - shards = self->shardsAffectedByTeamFailure->getShardsFor( - ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary)); - for (int i = 0; i < shards.size(); i++) { - if (metrics.keys == shards[i]) { - self->output.send(RelocateShard(metrics.keys.get(), priority, RelocateReason::REBALANCE_READ)); - return true; - } + } + if (chosenIdx == -1) { + traceEvent->detail("SkipReason", "NoEligibleShards"); + return false; + } + + auto& metrics = metricsList[chosenIdx]; + auto srcLoad = sourceTeam->getLoadReadBandwidth(false), destLoad = destTeam->getLoadReadBandwidth(); + traceEvent->detail("ShardReadBandwidth", metrics.bytesReadPerKSecond) + .detail("SrcReadBandwidth", srcLoad) + .detail("DestReadBandwidth", destLoad); + + if (srcLoad - destLoad <= + 5 * 
std::max(metrics.bytesReadPerKSecond, SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS)) { + traceEvent->detail("SkipReason", "TeamTooSimilar"); + return false; + } + // Verify the shard is still in ShardsAffectedByTeamFailure + shards = self->shardsAffectedByTeamFailure->getShardsFor( + ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary)); + for (int i = 0; i < shards.size(); i++) { + if (metrics.keys == shards[i]) { + self->output.send(RelocateShard(metrics.keys.get(), priority, RelocateReason::REBALANCE_READ)); + return true; } - traceEvent->detail("SkipReason", "ShardNotPresent"); - } else - traceEvent->detail("SkipReason", metrics.keys.present() ? "ShardZeroSize" : "ShardNoKeys"); + } + traceEvent->detail("SkipReason", "ShardNotPresent"); return false; } @@ -1594,11 +1605,11 @@ ACTOR static Future rebalanceTeams(DDQueueData* self, state int retries = 0; while (retries < SERVER_KNOBS->REBALANCE_MAX_RETRIES) { state KeyRange testShard = deterministicRandom()->randomChoice(shards); - StorageMetrics testMetrics = + std::vector testMetrics = wait(brokenPromiseToNever(self->getShardMetrics.getReply(GetMetricsRequest(testShard)))); - if (testMetrics.bytes > metrics.bytes) { + if (testMetrics[0].bytes > metrics.bytes) { moveShard = testShard; - metrics = testMetrics; + metrics = testMetrics[0]; if (metrics.bytes > averageShardBytes) { break; } diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index 7a9be7ab75..0f8883cccc 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -836,7 +836,9 @@ ACTOR Future fetchShardMetrics_impl(DataDistributionTracker* self, GetMetr try { loop { Future onChange; - StorageMetrics returnMetrics; + std::vector returnMetrics; + if (!req.comparator.present()) + returnMetrics.push_back(StorageMetrics()); for (auto range : req.keys) { StorageMetrics metrics; for (auto t : self->shards.intersectingRanges(range)) { @@ -854,20 +856,25 @@ ACTOR Future fetchShardMetrics_impl(DataDistributionTracker* self, GetMetr } if (req.comparator.present()) { - if (req.comparator.get()(returnMetrics, metrics)) { - returnMetrics = metrics; - returnMetrics.keys = range; - } + returnMetrics.push_back(metrics); } else { - returnMetrics += metrics; + returnMetrics[0] += metrics; } } if (!onChange.isValid()) { - req.reply.send(returnMetrics); + if (req.topK >= returnMetrics.size()) + req.reply.send(returnMetrics); + else if (req.comparator.present()) { + std::nth_element(returnMetrics.begin(), + returnMetrics.end() - req.topK, + returnMetrics.end(), + req.comparator.get()); + req.reply.send( + std::vector(returnMetrics.rbegin(), returnMetrics.rbegin() + req.topK)); + } return Void(); } - wait(onChange); } } catch (Error& e) { @@ -884,7 +891,7 @@ ACTOR Future fetchShardMetrics(DataDistributionTracker* self, GetMetricsRe TEST(true); // DD_SHARD_METRICS_TIMEOUT StorageMetrics largeMetrics; largeMetrics.bytes = getMaxShardSize(self->dbSizeEstimate->get()); - req.reply.send(largeMetrics); + req.reply.send(std::vector(1, largeMetrics)); } } return Void(); @@ -931,7 +938,9 @@ ACTOR Future fetchShardMetricsList_impl(DataDistributionTracker* self, Get ACTOR Future fetchShardMetricsList(DataDistributionTracker* self, GetMetricsListRequest req) { choose { when(wait(fetchShardMetricsList_impl(self, req))) {} - when(wait(delay(SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT))) { req.reply.sendError(timed_out()); } + when(wait(delay(SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT))) { + 
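// A self-contained illustration of the std::nth_element top-K selection used in
// fetchShardMetrics_impl above (plain C++, not FDB code). With a "less" comparator,
// partitioning at end() - K leaves the K largest elements in the tail, which is why
// this version reads the reply back out through rbegin(); a later patch in the series
// flips the comparator to "greater" and partitions at begin() + topK - 1 instead.
#include <algorithm>
#include <vector>

std::vector<int> topKLargest(std::vector<int> v, size_t k) {
    if (k >= v.size())
        return v;
    std::nth_element(v.begin(), v.end() - k, v.end()); // default std::less
    // The tail [end - k, end) now holds the k largest values, in unspecified order.
    return { v.rbegin(), v.rbegin() + k };
}
// e.g. topKLargest({5, 1, 9, 3, 7}, 2) yields {9, 7} here, though the relative
// order within the tail is not guaranteed in general.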
req.reply.sendError(timed_out()); + } } return Void(); } From eb949ee3dc343c0da0613af2490e1fd759053f87 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 22 Apr 2022 10:07:12 -0700 Subject: [PATCH 062/299] change default shard size; enable valley filler best; top 10 random choice --- fdbclient/ServerKnobs.cpp | 2 +- fdbserver/DataDistributionQueue.actor.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 2531805cfc..a1d3c74407 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -163,7 +163,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DATA_DISTRIBUTION_FAILURE_REACTION_TIME, 60.0 ); if( randomize && BUGGIFY ) DATA_DISTRIBUTION_FAILURE_REACTION_TIME = 1.0; bool buggifySmallShards = randomize && BUGGIFY; bool simulationMediumShards = !buggifySmallShards && isSimulated && randomize && !BUGGIFY; // prefer smaller shards in simulation - init( MIN_SHARD_BYTES, 50000000 ); if( buggifySmallShards ) MIN_SHARD_BYTES = 40000; if (simulationMediumShards) MIN_SHARD_BYTES = 200000; //FIXME: data distribution tracker (specifically StorageMetrics) relies on this number being larger than the maximum size of a key value pair + init( MIN_SHARD_BYTES, 500000 ); if( buggifySmallShards ) MIN_SHARD_BYTES = 40000; if (simulationMediumShards) MIN_SHARD_BYTES = 200000; //FIXME: data distribution tracker (specifically StorageMetrics) relies on this number being larger than the maximum size of a key value pair init( SHARD_BYTES_RATIO, 4 ); init( SHARD_BYTES_PER_SQRT_BYTES, 45 ); if( buggifySmallShards ) SHARD_BYTES_PER_SQRT_BYTES = 0;//Approximately 10000 bytes per shard init( MAX_SHARD_BYTES, 500000000 ); diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 82a5082020..0b469200e0 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1176,7 +1176,7 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, auto req = GetTeamRequest(WantNewServers(rd.wantsNewServers), - WantTrueBest(rd.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM), + WantTrueBest(isValleyFillerPriority(rd.priority)), PreferLowerUtilization::True, TeamMustHaveShards::False, inflightPenalty); @@ -1522,10 +1522,10 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, return false; } - // TODO: set 100 as a knob + // TODO: set 10 as a knob // randomly choose topK shards state Future healthMetrics = self->cx->getHealthMetrics(true); - state GetMetricsRequest req(shards, 100); + state GetMetricsRequest req(shards, 10); req.comparator = [](const StorageMetrics& a, const StorageMetrics& b) { return a.bytesReadPerKSecond / std::max(a.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES) < b.bytesReadPerKSecond / std::max(b.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES); @@ -1538,7 +1538,7 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, } int chosenIdx = -1; - for (int i = 0; i < SERVER_KNOBS->REBALANCE_MAX_RETRIES; ++i) { + for (int i = 0; i < 10; ++i) { int idx = deterministicRandom()->randomInt(0, metricsList.size()); if (metricsList[idx].keys.present() && metricsList[i].bytes > 0) { chosenIdx = idx; From 4818f01de90d7cc1fdbc5d525e11c3f83f66a55b Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 22 Apr 2022 14:14:58 -0700 Subject: [PATCH 063/299] fix top10 shard index bug; add event detail; fix merge conflict --- fdbserver/DataDistributionQueue.actor.cpp | 
33 +++++++++++------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp
index 89a33b6b9c..fa73ca45a9 100644
--- a/fdbserver/DataDistributionQueue.actor.cpp
+++ b/fdbserver/DataDistributionQueue.actor.cpp
@@ -423,13 +423,6 @@ void completeDest(RelocateData const& relocation, std::map& destB
 	}
 }

-void completeDest(RelocateData const& relocation, std::map<UID, Busyness>& destBusymap) {
-	int destWorkFactor = getDestWorkFactor();
-	for (UID id : relocation.completeDests) {
-		destBusymap[id].removeWork(relocation.priority, destWorkFactor);
-	}
-}
-
 void complete(RelocateData const& relocation, std::map<UID, Busyness>& busymap, std::map<UID, Busyness>& destBusymap) {
 	ASSERT(relocation.workFactor > 0);
 	for (int i = 0; i < relocation.src.size(); i++)
@@ -470,6 +463,9 @@ struct DDQueueData {
 	KeyRangeActorMap getSourceActors;
 	std::map<UID, std::set<RelocateData, std::greater<RelocateData>>> queue; // Key UID is serverID, value is the serverID's set of RelocateData to relocate
+	// The last time each server was selected as the source team for a read rebalance. We throttle read
+	// rebalancing on a time basis because the read workload sample is only updated some time after the previous move
+	std::map<UID, double> lastAsSource;
 	KeyRangeMap<RelocateData> inFlight;
 	// Track all actors that relocate specified keys to a good place; Key: keyRange; Value: actor
@@ -1181,12 +1177,11 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd,
 	    rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT)
 		inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_ONE_LEFT;

-	auto req =
-	    GetTeamRequest(WantNewServers(rd.wantsNewServers),
-	                   WantTrueBest(isValleyFillerPriority(rd.priority)),
-	                   PreferLowerUtilization::True,
-	                   TeamMustHaveShards::False,
-	                   inflightPenalty);
+	auto req = GetTeamRequest(WantNewServers(rd.wantsNewServers),
+	                          WantTrueBest(isValleyFillerPriority(rd.priority)),
+	                          PreferLowerUtilization::True,
+	                          TeamMustHaveShards::False,
+	                          inflightPenalty);

 	req.src = rd.src;
 	req.completeSources = rd.completeSources;
@@ -1524,6 +1519,10 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self,
 	state std::vector<KeyRange> shards = self->shardsAffectedByTeamFailure->getShardsFor(
 	    ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary));
+	traceEvent->detail("ShardsInSource", shards.size());
+	// For read rebalancing, if there is just one hot shard remaining, moving it to another server won't solve the
+	// problem.
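// A standalone sketch of the per-server source throttling that the lastAsSource map
// added in this patch enables (plain C++; the id type, field names, and the scaling
// factor are illustrative — the factor itself changes over the next few patches):
#include <map>

struct ReadRebalanceThrottle {
    std::map<int, double> lastAsSource; // serverId -> last time it sourced a move
    double sampleInterval;              // storage-metrics averaging interval, in seconds

    // A server is skipped as a rebalance source until enough of the sample interval
    // has elapsed for the read-load sample to reflect the previous move.
    bool throttled(int serverId, double now, double factor = 3.0) const {
        auto it = lastAsSource.find(serverId);
        return it != lastAsSource.end() && (now - it->second) * factor < sampleInterval;
    }
};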
+ // TODO: This situation should be solved by split and merge if (shards.size() <= 1) { traceEvent->detail("SkipReason", "NoShardOnSource"); return false; @@ -1544,11 +1543,11 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, return false; } + deterministicRandom()->randomShuffle(metricsList); int chosenIdx = -1; - for (int i = 0; i < 10; ++i) { - int idx = deterministicRandom()->randomInt(0, metricsList.size()); - if (metricsList[idx].keys.present() && metricsList[i].bytes > 0) { - chosenIdx = idx; + for (int i = 0; i < metricsList.size(); ++i) { + if (metricsList[i].keys.present() && metricsList[i].bytes > 0) { + chosenIdx = i; break; } } From eefd0778c5c04367eb4ff3e0f4ebe93de4b1f4eb Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 22 Apr 2022 15:26:44 -0700 Subject: [PATCH 064/299] change canQueue to timeThrottle() --- fdbserver/DataDistributionQueue.actor.cpp | 31 +++++++++++++++-------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index fa73ca45a9..e1a5b9848f 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -910,6 +910,7 @@ struct DDQueueData { for (int i = 0; i < results.src.size(); i++) { queue[results.src[i]].insert(results); } + updateLastAsSource(results.src); } void logRelocation(const RelocateData& rd, const char* title) { @@ -1076,14 +1077,20 @@ struct DDQueueData { return highestPriority; } - bool canQueue(const std::vector& ids) const { - return std::all_of(ids.begin(), ids.end(), [this](const UID& id) { - if (this->queue.count(id) && this->queue.at(id).size()) { - return now() - this->queue.at(id).rbegin()->startTime >= 60.0; + // return true if the servers are throttled as source for read rebalance + bool timeThrottle(const std::vector& ids) const { + return std::any_of(ids.begin(), ids.end(), [this](const UID& id) { + if (this->lastAsSource.count(id)) { + return (now() - this->lastAsSource.at(id)) * 3.0 < SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL; } - return true; + return false; }); } + + void updateLastAsSource(const std::vector& ids, double t = now()) { + for (auto& id : ids) + lastAsSource[id] = t; + } }; // return -1 if a.readload > b.readload, usually for choose dest team with low read load @@ -1504,8 +1511,9 @@ inline double getWorstCpu(const HealthMetrics& metrics) { } return cpu; } -// Move the shard with highest read density of sourceTeam's to destTeam if sourceTeam has much more read load than -// destTeam + +// Move the shard with the top K highest read density of sourceTeam's to destTeam if sourceTeam has much more read load +// than destTeam ACTOR Future rebalanceReadLoad(DDQueueData* self, int priority, Reference sourceTeam, @@ -1516,6 +1524,10 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, traceEvent->detail("CancelingDueToSimulationSpeedup", true); return false; } + // check lastAsSource + if (self->timeThrottle(sourceTeam->getServerIDs())) { + traceEvent->detail("SkipReason", "SourceTeamThrottle"); + } state std::vector shards = self->shardsAffectedByTeamFailure->getShardsFor( ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary)); @@ -1573,6 +1585,7 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, for (int i = 0; i < shards.size(); i++) { if (metrics.keys == shards[i]) { self->output.send(RelocateShard(metrics.keys.get(), priority, RelocateReason::REBALANCE_READ)); + self->updateLastAsSource(sourceTeam->getServerIDs()); return true; } } @@ -1776,9 +1789,7 
@@ ACTOR Future BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex,
 			wait(getSrcDestTeams(self, teamCollectionIndex, srcReq, destReq, &sourceTeam, &destTeam,ddPriority,&traceEvent));
 			if (sourceTeam.isValid() && destTeam.isValid()) {
 				if (readRebalance) {
-					if(self->canQueue(sourceTeam->getServerIDs())) {
-						wait(store(moved,rebalanceReadLoad(self, ddPriority, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent)));
-					}
+					wait(store(moved,rebalanceReadLoad(self, ddPriority, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent)));
 				} else {
 					wait(store(moved,rebalanceTeams(self, ddPriority, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent)));
 				}

From 99d2335220354e77af3bd744b575c09fd605d709 Mon Sep 17 00:00:00 2001
From: Xiaoxi Wang
Date: Sat, 23 Apr 2022 00:03:57 -0700
Subject: [PATCH 065/299] add storeType to metadata; updateStorageMetadata; combine storeTypeTracker

---
 fdbclient/FDBTypes.h                 | 22 +++++++++++--
 fdbserver/DDTeamCollection.actor.cpp | 48 +++++++++++++++++-----------
 fdbserver/DDTeamCollection.h         |  2 +-
 fdbserver/DataDistribution.actor.cpp |  2 +-
 fdbserver/DataDistribution.actor.h   |  8 ++---
 5 files changed, 56 insertions(+), 26 deletions(-)

diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h
index 9f89237b51..97f8e2bb1c 100644
--- a/fdbclient/FDBTypes.h
+++ b/fdbclient/FDBTypes.h
@@ -1421,16 +1421,34 @@ struct StorageMetadataType {
 	constexpr static FileIdentifier file_identifier = 732123;
 	// when the SS is initialized, in epoch seconds, comes from currentTime()
 	double createdTime;
+	KeyValueStoreType storeType;
+
+	// not serialized; should be assigned after initialization
+	bool wrongConfigured = false;
+
 	StorageMetadataType() : createdTime(0) {}
-	StorageMetadataType(uint64_t t) : createdTime(t) {}
+	StorageMetadataType(uint64_t t, KeyValueStoreType storeType = KeyValueStoreType::END, bool wrongConfigured = false)
+	  : createdTime(t), storeType(storeType), wrongConfigured(wrongConfigured) {}

 	static double currentTime() { return g_network->timer(); }

+	bool operator==(const StorageMetadataType& b) const {
+		return createdTime == b.createdTime && storeType == b.storeType;
+	}
+
+	bool operator<(const StorageMetadataType& b) const {
+		if (wrongConfigured == b.wrongConfigured) {
+			// a younger server (larger createdTime) compares as less
+			return createdTime > b.createdTime;
+		}
+		return wrongConfigured < b.wrongConfigured;
+	}
+
 	// To change this serialization, ProtocolVersion::StorageMetadata must be updated, and downgrades need
 	// to be considered
 	template <class Ar>
 	void serialize(Ar& ar) {
-		serializer(ar, createdTime);
+		serializer(ar, createdTime, storeType);
 	}
 };

diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp
index 95916ee7cd..a3d7a057c4 100644
--- a/fdbserver/DDTeamCollection.actor.cpp
+++ b/fdbserver/DDTeamCollection.actor.cpp
@@ -995,14 +995,12 @@ public:
 		state Future<Void> metricsTracker = server->serverMetricsPolling();
 		state Future<std::pair<StorageServerInterface, ProcessClass>> interfaceChanged = server->onInterfaceChanged;
-
-		state Future<Void> storeTypeTracker = (isTss) ? Never() : keyValueStoreTypeTracker(self, server);
 		state bool hasWrongDC = !self->isCorrectDC(*server);
 		state bool hasInvalidLocality =
 		    !self->isValidLocality(self->configuration.storagePolicy, server->getLastKnownInterface().locality);
 		state int targetTeamNumPerServer =
 		    (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (self->configuration.storageTeamSize + 1)) / 2;
-		state Future<Void> storageMetadataTracker = (isTss) ?
Never() : self->readOrCreateStorageMetadata(server); + state Future storageMetadataTracker = self->updateStorageMetadata(server, isTss); try { loop { status.isUndesired = !self->disableFailingLaggingServers.get() && server->ssVersionTooFarBehind.get(); @@ -1325,8 +1323,8 @@ public: recordTeamCollectionInfo = true; // Restart the storeTracker for the new interface. This will cancel the previous // keyValueStoreTypeTracker - storeTypeTracker = (isTss) ? Never() : keyValueStoreTypeTracker(self, server); - storageMetadataTracker = (isTss) ? Never() : readOrCreateStorageMetadata(self, server); + // storeTypeTracker = (isTss) ? Never() : keyValueStoreTypeTracker(self, server); + storageMetadataTracker = updateStorageMetadata(self, server, isTss); hasWrongDC = !self->isCorrectDC(*server); hasInvalidLocality = !self->isValidLocality(self->configuration.storagePolicy, server->getLastKnownInterface().locality); @@ -1346,7 +1344,7 @@ public: .detail("WrongStoreTypeRemoved", server->wrongStoreTypeToRemove.get()); } when(wait(server->wakeUpTracker.getFuture())) { server->wakeUpTracker = Promise(); } - when(wait(storageMetadataTracker || storeTypeTracker)) {} + when(wait(storageMetadataTracker)) {} when(wait(server->ssVersionTooFarBehind.onChange())) {} when(wait(self->disableFailingLaggingServers.onChange())) {} } @@ -2876,11 +2874,18 @@ public: return Void(); } - ACTOR static Future readOrCreateStorageMetadata(DDTeamCollection* self, TCServerInfo* server) { + ACTOR static Future updateStorageMetadata(DDTeamCollection* self, TCServerInfo* server, bool isTss) { state KeyBackedObjectMap metadataMap( serverMetadataKeys.begin, IncludeVersion()); state Reference tr = makeReference(self->cx); - state StorageMetadataType data(StorageMetadataType::currentTime()); + + // Update server's storeType, especially when it was created + wait(server->updateStoreType()); + state StorageMetadataType data( + StorageMetadataType::currentTime(), + server->getStoreType(), + isTss ? 
!server->isCorrectStoreType(self->configuration.testingStorageServerStoreType)
+		                 : !server->isCorrectStoreType(self->configuration.storageServerStoreType));
 		// printf("------ read metadata %s\n", server->getId().toString().c_str());
 		// read storage metadata
 		loop {
 			try {
@@ -2890,10 +2895,9 @@ public:
 				Optional<StorageMetadataType> metadata = wait(property.get(tr));
 				// NOTE: in upgrade testing, there may not be any metadata
 				if (metadata.present()) {
-					data = metadata.get();
-				} else {
-					metadataMap.set(tr, server->getId(), data);
+					data.createdTime = metadata.get().createdTime;
 				}
+				metadataMap.set(tr, server->getId(), data);
 				wait(tr->commit());
 				break;
 			} catch (Error& e) {
@@ -2901,11 +2905,19 @@ public:
 			}
 		}
-		// add server to wiggler
-		if (self->storageWiggler->contains(server->getId())) {
-			self->storageWiggler->updateMetadata(server->getId(), data);
-		} else {
-			self->storageWiggler->addServer(server->getId(), data);
+		if (!isTss) {
+			// handle a wrong store type
+			if (!server->isCorrectStoreType(self->configuration.storageServerStoreType) &&
+			    self->wrongStoreTypeRemover.isReady()) {
+				self->wrongStoreTypeRemover = removeWrongStoreType(self);
+				self->addActor.send(self->wrongStoreTypeRemover);
+			}
+			// add server to wiggler
+			if (self->storageWiggler->contains(server->getId())) {
+				self->storageWiggler->updateMetadata(server->getId(), data);
+			} else {
+				self->storageWiggler->addServer(server->getId(), data);
+			}
 		}

 		return Never();
@@ -3512,8 +3524,8 @@ Future DDTeamCollection::readStorageWiggleMap() {
 	return DDTeamCollectionImpl::readStorageWiggleMap(this);
 }

-Future<Void> DDTeamCollection::readOrCreateStorageMetadata(TCServerInfo* server) {
-	return DDTeamCollectionImpl::readOrCreateStorageMetadata(this, server);
+Future<Void> DDTeamCollection::updateStorageMetadata(TCServerInfo* server, bool isTss) {
+	return DDTeamCollectionImpl::updateStorageMetadata(this, server, isTss);
 }

 void DDTeamCollection::resetLocalitySet() {

diff --git a/fdbserver/DDTeamCollection.h b/fdbserver/DDTeamCollection.h
index 529d2ead93..99c04b0c2a 100644
--- a/fdbserver/DDTeamCollection.h
+++ b/fdbserver/DDTeamCollection.h
@@ -487,7 +487,7 @@ class DDTeamCollection : public ReferenceCounted {
 	}

 	// Read storage metadata from database, and do necessary updates
-	Future<Void> readOrCreateStorageMetadata(TCServerInfo* server);
+	Future<Void> updateStorageMetadata(TCServerInfo* server, bool isTss);

 	Future<Void> serverGetTeamRequests(TeamCollectionInterface tci);

diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp
index 4d7cd8a507..893a46efbf 100644
--- a/fdbserver/DataDistribution.actor.cpp
+++ b/fdbserver/DataDistribution.actor.cpp
@@ -262,7 +262,7 @@ void StorageWiggler::updateMetadata(const UID& serverId, const StorageMetadataTy
 	// std::cout << "size: " << pq_handles.size() << " update " << serverId.toString()
 	//           << " DC: " << teamCollection->isPrimary() << std::endl;
 	auto handle = pq_handles.at(serverId);
-	if ((*handle).first.createdTime == metadata.createdTime) {
+	if ((*handle).first == metadata) {
 		return;
 	}
 	wiggle_pq.update(handle, std::make_pair(metadata, serverId));

diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h
index fe2daf8c00..2f0231526f 100644
--- a/fdbserver/DataDistribution.actor.h
+++ b/fdbserver/DataDistribution.actor.h
@@ -395,16 +395,16 @@ struct StorageWiggler : ReferenceCounted {
 	// data structures
 	typedef std::pair<StorageMetadataType, UID> MetadataUIDP;
-	// sorted by (createdTime, UID), the least comes first
+	// a 'less' comparator over (metadata, UID) pairs; with the max-heap below, the largest element comes first
 	struct CompPair {
 		bool
operator()(MetadataUIDP const& a, MetadataUIDP const& b) const { - if (a.first.createdTime == b.first.createdTime) { + if (a.first == b.first) { return a.second > b.second; } - // larger createdTime means the age is younger - return a.first.createdTime > b.first.createdTime; + return a.first < b.first; } }; + // max-heap boost::heap::skew_heap, boost::heap::compare> wiggle_pq; std::unordered_map pq_handles; From a0b45c29625cdf9e9364489acf7f7d1c1905f76d Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Sun, 24 Apr 2022 15:51:18 -0700 Subject: [PATCH 066/299] fix throttle bug --- fdbserver/DataDistributionQueue.actor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index e1a5b9848f..9a2fe3e1a5 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1527,6 +1527,7 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, // check lastAsSource if (self->timeThrottle(sourceTeam->getServerIDs())) { traceEvent->detail("SkipReason", "SourceTeamThrottle"); + return false; } state std::vector shards = self->shardsAffectedByTeamFailure->getShardsFor( From cc05f5e9dbe6bf55c516bfbd2ddbbefe72437e2c Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Sun, 24 Apr 2022 17:10:58 -0700 Subject: [PATCH 067/299] fix getMetrics keys bug --- fdbserver/DataDistribution.actor.h | 4 ++-- fdbserver/DataDistributionQueue.actor.cpp | 19 ++++++++++++------- fdbserver/DataDistributionTracker.actor.cpp | 5 +++-- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index 16c42f23c2..66cfef3c48 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -152,13 +152,13 @@ struct GetTeamRequest { }; struct GetMetricsRequest { - // whether a < b + // whether a > b typedef std::function MetricsComparator; std::vector keys; int topK = 1; // default only return the top 1 shard based on the comparator Promise> reply; // topK storage metrics Optional - comparator; // if comparator is assigned, return the largest topK in keys, otherwise return the sum of metrics + comparator; // Return true if a.score > b.score.if comparator is assigned, return the largest topK in keys, otherwise return the sum of metrics GetMetricsRequest() {} GetMetricsRequest(KeyRange const& keys, int topK = 1) : keys({ keys }), topK(topK) {} diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 9a2fe3e1a5..1d61812a37 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1081,7 +1081,7 @@ struct DDQueueData { bool timeThrottle(const std::vector& ids) const { return std::any_of(ids.begin(), ids.end(), [this](const UID& id) { if (this->lastAsSource.count(id)) { - return (now() - this->lastAsSource.at(id)) * 3.0 < SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL; + return (now() - this->lastAsSource.at(id)) * 5.0 < SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL; } return false; }); @@ -1504,10 +1504,15 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, } } -inline double getWorstCpu(const HealthMetrics& metrics) { +inline double getWorstCpu(const HealthMetrics& metrics, const std::vector& ids) { double cpu = 0; - for (auto p : metrics.storageStats) { - cpu = std::max(cpu, p.second.cpuUsage); + for (auto& id : ids) { + if (metrics.storageStats.count(id)) { + cpu = std::max(cpu, 
metrics.storageStats.at(id).cpuUsage); + } else { + // assume the server is too busy to report its stats + cpu = std::max(cpu, 100.0); + } } return cpu; } @@ -1546,12 +1551,12 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, state Future healthMetrics = self->cx->getHealthMetrics(true); state GetMetricsRequest req(shards, 10); req.comparator = [](const StorageMetrics& a, const StorageMetrics& b) { - return a.bytesReadPerKSecond / std::max(a.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES) < + return a.bytesReadPerKSecond / std::max(a.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES) > b.bytesReadPerKSecond / std::max(b.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES); }; state std::vector metricsList = wait(brokenPromiseToNever(self->getShardMetrics.getReply(req))); wait(ready(healthMetrics)); - if (getWorstCpu(healthMetrics.get()) < 25.0) { // 25% + if (getWorstCpu(healthMetrics.get(), sourceTeam->getServerIDs()) < 25.0) { // 25% traceEvent->detail("SkipReason", "LowReadLoad"); return false; } @@ -1559,7 +1564,7 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, deterministicRandom()->randomShuffle(metricsList); int chosenIdx = -1; for (int i = 0; i < metricsList.size(); ++i) { - if (metricsList[i].keys.present() && metricsList[i].bytes > 0) { + if (metricsList[i].keys.present() && metricsList[i].bytesReadPerKSecond > 0) { chosenIdx = i; break; } diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index 0f8883cccc..3c578b3f84 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -856,6 +856,7 @@ ACTOR Future fetchShardMetrics_impl(DataDistributionTracker* self, GetMetr } if (req.comparator.present()) { + metrics.keys = range; returnMetrics.push_back(metrics); } else { returnMetrics[0] += metrics; @@ -867,11 +868,11 @@ ACTOR Future fetchShardMetrics_impl(DataDistributionTracker* self, GetMetr req.reply.send(returnMetrics); else if (req.comparator.present()) { std::nth_element(returnMetrics.begin(), - returnMetrics.end() - req.topK, + returnMetrics.begin() + req.topK - 1, returnMetrics.end(), req.comparator.get()); req.reply.send( - std::vector(returnMetrics.rbegin(), returnMetrics.rbegin() + req.topK)); + std::vector(returnMetrics.begin(), returnMetrics.begin() + req.topK)); } return Void(); } From 269e94cb71e5e137b532dcc36c57f7a665b13423 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Sun, 24 Apr 2022 22:53:06 -0700 Subject: [PATCH 068/299] add store type to status json --- fdbclient/FDBTypes.h | 2 +- fdbclient/Schemas.cpp | 15 ++++++++++++++- fdbserver/Status.actor.cpp | 15 ++++++++------- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index 97f8e2bb1c..13fd1d4a5b 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -1433,7 +1433,7 @@ struct StorageMetadataType { static double currentTime() { return g_network->timer(); } bool operator==(const StorageMetadataType& b) const { - return createdTime == b.createdTime && storeType == b.storeType; + return createdTime == b.createdTime && storeType == b.storeType && wrongConfigured && b.wrongConfigured; } bool operator<(const StorageMetadataType& b) const { diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 6415604d86..0c37fd37ba 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -137,7 +137,20 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( }, "storage_metadata":{ "created_time_datetime":"1970-01-01 
00:00:00.000 +0000", - "created_time_timestamp": 0 + "created_time_timestamp": 0, + "storage_engine":{ + "$enum":[ + "ssd", + "ssd-1", + "ssd-2", + "ssd-redwood-1-experimental", + "ssd-rocksdb-v1", + "ssd-sharded-rocksdb", + "memory", + "memory-1", + "memory-2", + "memory-radixtree-beta" + ]} }, "data_version":12341234, "durable_version":12341234, diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index f30f1638b2..3f7b10a130 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -583,13 +583,12 @@ struct RolesInfo { } } - if (!iface.isTss()) { // only storage server has Metadata field - TraceEventFields const& metadata = metrics.at("Metadata"); - JsonBuilderObject metadataObj; - metadataObj["created_time_datetime"] = metadata.getValue("CreatedTimeDatetime"); - metadataObj["created_time_timestamp"] = metadata.getUint64("CreatedTimeTimestamp"); - obj["storage_metadata"] = metadataObj; - } + TraceEventFields const& metadata = metrics.at("Metadata"); + JsonBuilderObject metadataObj; + metadataObj["created_time_datetime"] = metadata.getValue("CreatedTimeDatetime"); + metadataObj["created_time_timestamp"] = metadata.getUint64("CreatedTimeTimestamp"); + metadataObj["storage_engine"] = metadata.getValue("StoreType"); + obj["storage_metadata"] = metadataObj; } catch (Error& e) { if (e.code() != error_code_attribute_not_found) @@ -1937,11 +1936,13 @@ ACTOR static Future>> ge TraceEventFields metadataField; metadataField.addField("CreatedTimeTimestamp", std::to_string(metadata[i].get().createdTime)); metadataField.addField("CreatedTimeDatetime", epochsToGMTString(metadata[i].get().createdTime)); + metadataField.addField("StoreType", metadata[i].get().storeType.toString()); results[i].second.emplace("Metadata", metadataField); } else if (!servers[i].isTss()) { TraceEventFields metadataField; metadataField.addField("CreatedTimeTimestamp", "0"); metadataField.addField("CreatedTimeDatetime", "[removed]"); + metadataField.addField("StoreType", "[unknown]"); results[i].second.emplace("Metadata", metadataField); } } From 182f244e2e127e7913e8d804015b9ff1d12ba440 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 25 Apr 2022 10:28:45 -0700 Subject: [PATCH 069/299] fix schema format --- fdbclient/Schemas.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index b4f0b39632..a5a451a46e 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -138,19 +138,19 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "storage_metadata":{ "created_time_datetime":"1970-01-01 00:00:00.000 +0000", "created_time_timestamp": 0, - "storage_engine":{ - "$enum":[ - "ssd", - "ssd-1", - "ssd-2", - "ssd-redwood-1-experimental", - "ssd-rocksdb-v1", - "ssd-sharded-rocksdb", - "memory", - "memory-1", - "memory-2", - "memory-radixtree-beta" - ]} + "storage_engine":{ + "$enum":[ + "ssd", + "ssd-1", + "ssd-2", + "ssd-redwood-1-experimental", + "ssd-rocksdb-v1", + "ssd-sharded-rocksdb", + "memory", + "memory-1", + "memory-2", + "memory-radixtree-beta" + ]} }, "data_version":12341234, "durable_version":12341234, From 2ede89b6238b8aa3c3e595bce8cf4087ee635f93 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 25 Apr 2022 14:46:50 -0700 Subject: [PATCH 070/299] fix getUint64 bug --- fdbserver/DDTeamCollection.actor.cpp | 4 ++-- fdbserver/Status.actor.cpp | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp 
b/fdbserver/DDTeamCollection.actor.cpp index 822f277ad7..3ed99066bd 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -2888,7 +2888,7 @@ public: server->getStoreType(), isTss ? !server->isCorrectStoreType(self->configuration.testingStorageServerStoreType) : !server->isCorrectStoreType(self->configuration.storageServerStoreType)); - // printf("------ read metadata %s\n", server->getId().toString().c_str()); + // read storage metadata loop { try { @@ -2906,7 +2906,7 @@ public: wait(tr->onError(e)); } } - + // printf("------ updated metadata %s\n", server->getId().toString().c_str()); if (!isTss) { // wrong store type handler if (!server->isCorrectStoreType(self->configuration.storageServerStoreType) && diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 1e88deab71..29825b34cc 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -587,10 +587,10 @@ struct RolesInfo { TraceEventFields const& metadata = metrics.at("Metadata"); JsonBuilderObject metadataObj; metadataObj["created_time_datetime"] = metadata.getValue("CreatedTimeDatetime"); - metadataObj["created_time_timestamp"] = metadata.getUint64("CreatedTimeTimestamp"); + metadataObj["created_time_timestamp"] = metadata.getDouble("CreatedTimeTimestamp"); metadataObj["storage_engine"] = metadata.getValue("StoreType"); obj["storage_metadata"] = metadataObj; - + // printf("%s\n", metadataObj.getJson().c_str()); } catch (Error& e) { if (e.code() != error_code_attribute_not_found) throw e; @@ -1974,7 +1974,7 @@ ACTOR static Future>> ge metadataField.addField("CreatedTimeDatetime", epochsToGMTString(metadata[i].get().createdTime)); metadataField.addField("StoreType", metadata[i].get().storeType.toString()); results[i].second.emplace("Metadata", metadataField); - } else if (!servers[i].isTss()) { + } else { TraceEventFields metadataField; metadataField.addField("CreatedTimeTimestamp", "0"); metadataField.addField("CreatedTimeDatetime", "[removed]"); From 8cc6d9aaacc172361dcfe7cc1bc017e988c42221 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 25 Apr 2022 16:59:20 -0700 Subject: [PATCH 071/299] determine timeThrottle and topK dynamically on the shard number --- fdbserver/DataDistributionQueue.actor.cpp | 29 +++++++++++------------ 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 1d61812a37..a3bb6e7bdc 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1078,10 +1078,11 @@ struct DDQueueData { } // return true if the servers are throttled as source for read rebalance - bool timeThrottle(const std::vector& ids) const { - return std::any_of(ids.begin(), ids.end(), [this](const UID& id) { + bool timeThrottle(const std::vector& ids, int shardCount) const { + return std::any_of(ids.begin(), ids.end(), [this, shardCount](const UID& id) { if (this->lastAsSource.count(id)) { - return (now() - this->lastAsSource.at(id)) * 5.0 < SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL; + return (now() - this->lastAsSource.at(id)) * shardCount < + SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL; } return false; }); @@ -1529,11 +1530,6 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, traceEvent->detail("CancelingDueToSimulationSpeedup", true); return false; } - // check lastAsSource - if (self->timeThrottle(sourceTeam->getServerIDs())) { - traceEvent->detail("SkipReason", "SourceTeamThrottle"); - return false; - } 
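// A worked example (plain C++; the interval value is illustrative, not the real knob)
// of the dynamic throttle this patch introduces: timeThrottle(ids, shardCount)
// rejects a source while (now - last) * shardCount < interval, i.e. it allows one
// move per interval / shardCount seconds. With the call just below passing
// shardCount = 0.1 * shards.size(), a server owning 50 shards under a 100 s sample
// interval can source at most one move every 100 / 5 = 20 s — roughly 10% of its
// shards per sample period.
#include <algorithm>

double minSecondsBetweenMoves(double sampleInterval, int shardCount) {
    return sampleInterval / std::max(shardCount, 1); // guard against shardCount == 0
}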
state std::vector shards = self->shardsAffectedByTeamFailure->getShardsFor( ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary)); @@ -1546,10 +1542,17 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, return false; } + // check lastAsSource, at most 10% of shards can be moved within a sample period + if (self->timeThrottle(sourceTeam->getServerIDs(), 0.1 * shards.size())) { + traceEvent->detail("SkipReason", "SourceTeamThrottle"); + return false; + } + // TODO: set 10 as a knob // randomly choose topK shards + int topK = std::min(int(0.1 * shards.size()), 10); state Future healthMetrics = self->cx->getHealthMetrics(true); - state GetMetricsRequest req(shards, 10); + state GetMetricsRequest req(shards, topK); req.comparator = [](const StorageMetrics& a, const StorageMetrics& b) { return a.bytesReadPerKSecond / std::max(a.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES) > b.bytesReadPerKSecond / std::max(b.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES); @@ -2183,9 +2186,7 @@ ACTOR Future dataDistributionQueue(Database cx, debug_setCheckRelocationDuration(false); } } - when(KeyRange done = waitNext(rangesComplete.getFuture())) { - keysToLaunchFrom = done; - } + when(KeyRange done = waitNext(rangesComplete.getFuture())) { keysToLaunchFrom = done; } when(wait(recordMetrics)) { Promise req; getAverageShardBytes.send(req); @@ -2232,9 +2233,7 @@ ACTOR Future dataDistributionQueue(Database cx, } when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator when(wait(waitForAll(balancingFutures))) {} - when(Promise r = waitNext(getUnhealthyRelocationCount)) { - r.send(self.unhealthyRelocations); - } + when(Promise r = waitNext(getUnhealthyRelocationCount)) { r.send(self.unhealthyRelocations); } } } } catch (Error& e) { From e9bf7c0851d549d6b7620d9b8d7d701faec7245f Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 25 Apr 2022 16:49:23 -0700 Subject: [PATCH 072/299] fix unknown schema check --- fdbclient/Schemas.cpp | 3 ++- fdbserver/Status.actor.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index a5a451a46e..8efaf244c3 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -149,7 +149,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "memory", "memory-1", "memory-2", - "memory-radixtree-beta" + "memory-radixtree-beta", + "unknown" ]} }, "data_version":12341234, diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 29825b34cc..e988256f50 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1978,7 +1978,7 @@ ACTOR static Future>> ge TraceEventFields metadataField; metadataField.addField("CreatedTimeTimestamp", "0"); metadataField.addField("CreatedTimeDatetime", "[removed]"); - metadataField.addField("StoreType", "[unknown]"); + metadataField.addField("StoreType", getTypeString(KeyValueStoreType::END)); results[i].second.emplace("Metadata", metadataField); } } From a8bc81a09ddc280ad2e539a9cc668a5780093074 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 26 Apr 2022 13:35:15 -0700 Subject: [PATCH 073/299] add getStoreTypeStr method --- fdbclient/FDBTypes.h | 5 +++-- fdbserver/Status.actor.cpp | 2 +- fdbserver/workloads/StatusWorkload.actor.cpp | 4 +++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index 6b42b5c480..2217717df8 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -869,8 +869,8 @@ struct KeyValueStoreType 
{ serializer(ar, type); } - std::string toString() const { - switch (type) { + static std::string getStoreTypeStr(const StoreType& storeType) { + switch (storeType) { case SSD_BTREE_V1: return "ssd-1"; case SSD_BTREE_V2: @@ -889,6 +889,7 @@ struct KeyValueStoreType { return "unknown"; } } + std::string toString() const { return getStoreTypeStr((StoreType)type); } private: uint32_t type; diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index e988256f50..8e0f825376 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1978,7 +1978,7 @@ ACTOR static Future>> ge TraceEventFields metadataField; metadataField.addField("CreatedTimeTimestamp", "0"); metadataField.addField("CreatedTimeDatetime", "[removed]"); - metadataField.addField("StoreType", getTypeString(KeyValueStoreType::END)); + metadataField.addField("StoreType", KeyValueStoreType::getStoreTypeStr(KeyValueStoreType::END)); results[i].second.emplace("Metadata", metadataField); } } diff --git a/fdbserver/workloads/StatusWorkload.actor.cpp b/fdbserver/workloads/StatusWorkload.actor.cpp index 76964d213d..29279fc036 100644 --- a/fdbserver/workloads/StatusWorkload.actor.cpp +++ b/fdbserver/workloads/StatusWorkload.actor.cpp @@ -188,9 +188,11 @@ struct StatusWorkload : TestWorkload { now() - issued); //.detail("Reply", json_spirit::write_string(json_spirit::mValue(result))); std::string errorStr; if (self->parsedSchema.present() && - !schemaMatch(self->parsedSchema.get(), result, errorStr, SevError, true)) + !schemaMatch(self->parsedSchema.get(), result, errorStr, SevError, true)) { + std::cout << errorStr << std::endl; TraceEvent(SevError, "StatusWorkloadValidationFailed") .detail("JSON", json_spirit::write_string(json_spirit::mValue(result))); + } } catch (Error& e) { if (e.code() != error_code_actor_cancelled) { TraceEvent(SevError, "StatusWorkloadError").error(e); From 87d358ce6c444a1dac4ac45b342b2830b560c7b4 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 26 Apr 2022 17:11:38 -0700 Subject: [PATCH 074/299] reset all knobs --- fdbclient/ServerKnobs.cpp | 6 +++--- tests/noSim/ReadSkewReadWrite.toml | 17 +++++++++-------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index e1b5a2e9fd..faf05a801b 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -144,7 +144,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, 120 ); init( PRIORITY_REBALANCE_OVERUTILIZED_TEAM, 121 ); init( PRIORITY_REBALANCE_READ_UNDERUTIL_TEAM, 122 ); - init( PRIORITY_REBALANCE_READ_OVERUTIL_TEAM, 123 ); + init( PRIORITY_REBALANCE_READ_OVERUTIL_TEAM, 123 ); init( PRIORITY_PERPETUAL_STORAGE_WIGGLE, 139 ); init( PRIORITY_TEAM_HEALTHY, 140 ); init( PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER, 150 ); @@ -163,7 +163,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DATA_DISTRIBUTION_FAILURE_REACTION_TIME, 60.0 ); if( randomize && BUGGIFY ) DATA_DISTRIBUTION_FAILURE_REACTION_TIME = 1.0; bool buggifySmallShards = randomize && BUGGIFY; bool simulationMediumShards = !buggifySmallShards && isSimulated && randomize && !BUGGIFY; // prefer smaller shards in simulation - init( MIN_SHARD_BYTES, 500000 ); if( buggifySmallShards ) MIN_SHARD_BYTES = 40000; if (simulationMediumShards) MIN_SHARD_BYTES = 200000; //FIXME: data distribution tracker (specifically StorageMetrics) relies on this number being larger than the maximum size of a key 
value pair + init( MIN_SHARD_BYTES, 5000000 ); if( buggifySmallShards ) MIN_SHARD_BYTES = 40000; if (simulationMediumShards) MIN_SHARD_BYTES = 200000; //FIXME: data distribution tracker (specifically StorageMetrics) relies on this number being larger than the maximum size of a key value pair init( SHARD_BYTES_RATIO, 4 ); init( SHARD_BYTES_PER_SQRT_BYTES, 45 ); if( buggifySmallShards ) SHARD_BYTES_PER_SQRT_BYTES = 0;//Approximately 10000 bytes per shard init( MAX_SHARD_BYTES, 500000000 ); @@ -658,7 +658,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( BYTES_READ_UNITS_PER_SAMPLE, 100000 ); // 100K bytes init( READ_HOT_SUB_RANGE_CHUNK_SIZE, 10000000); // 10MB init( EMPTY_READ_PENALTY, 20 ); // 20 bytes - init( READ_SAMPLING_ENABLED, true ); if ( randomize && BUGGIFY ) READ_SAMPLING_ENABLED = true;// enable/disable read sampling + init( READ_SAMPLING_ENABLED, false ); if ( randomize && BUGGIFY ) READ_SAMPLING_ENABLED = true;// enable/disable read sampling //Storage Server init( STORAGE_LOGGING_DELAY, 5.0 ); diff --git a/tests/noSim/ReadSkewReadWrite.toml b/tests/noSim/ReadSkewReadWrite.toml index 1034f01cd7..4542ef46d3 100644 --- a/tests/noSim/ReadSkewReadWrite.toml +++ b/tests/noSim/ReadSkewReadWrite.toml @@ -1,23 +1,24 @@ [[test]] testTitle = 'RandomReadWriteTest' connectionFailuresDisableDuration = 100000 -waitForQuiescenceBegin=false -waitForQuiescenceEnd=false -clearAfterTest = true -runSetup = true +# waitForQuiescenceBegin= false +# waitForQuiescenceEnd=false +clearAfterTest = false #true +runSetup = true # false timeout = 3600.0 [[test.workload]] testName = 'ReadWrite' transactionsPerSecond = 100000 -testDuration = 600.0 +testDuration = 900.0 skewRound = 1 -nodeCount = 15000000 +nodeCount = 30000000 valueBytes = 1000 -readsPerTransactionA = 4 +readsPerTransactionA = 8 writesPerTransactionA = 0 alpha = 0 discardEdgeMeasurements = false -hotServerFraction = 0.02 +hotServerFraction = 0.2 hotServerReadFrac = 0.8 +# hotServerShardFraction = 0.3 warmingDelay = 180.0 \ No newline at end of file From 9b211c48ee43f84b18374dbd4b74aba3b0bee271 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 26 Apr 2022 17:13:37 -0700 Subject: [PATCH 075/299] format code --- fdbserver/DataDistribution.actor.h | 4 ++-- fdbserver/DataDistributionQueue.actor.cpp | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index 66cfef3c48..4a1a4751df 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -157,8 +157,8 @@ struct GetMetricsRequest { std::vector keys; int topK = 1; // default only return the top 1 shard based on the comparator Promise> reply; // topK storage metrics - Optional - comparator; // Return true if a.score > b.score.if comparator is assigned, return the largest topK in keys, otherwise return the sum of metrics + Optional comparator; // Return true if a.score > b.score.if comparator is assigned, return the + // largest topK in keys, otherwise return the sum of metrics GetMetricsRequest() {} GetMetricsRequest(KeyRange const& keys, int topK = 1) : keys({ keys }), topK(topK) {} diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index a3bb6e7bdc..7e7024208b 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -2186,7 +2186,9 @@ ACTOR Future dataDistributionQueue(Database cx, debug_setCheckRelocationDuration(false); } } - 
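// A standalone sketch (plain C++) of the read-density score that the rebalance
// comparator in these patches is built on; MIN_SHARD_BYTES — retuned in the knob
// change above — acts as a floor so tiny shards do not report inflated densities.
#include <algorithm>

double readDensity(double bytesReadPerKSecond, double shardBytes, double minShardBytes) {
    return bytesReadPerKSecond / std::max(shardBytes, minShardBytes);
}
// e.g. with a 50 MB floor, a 1 KB shard and a 50 MB shard carrying equal read
// bandwidth score the same, so the comparator ranks them by traffic alone.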
when(KeyRange done = waitNext(rangesComplete.getFuture())) { keysToLaunchFrom = done; } + when(KeyRange done = waitNext(rangesComplete.getFuture())) { + keysToLaunchFrom = done; + } when(wait(recordMetrics)) { Promise req; getAverageShardBytes.send(req); @@ -2233,7 +2235,9 @@ ACTOR Future dataDistributionQueue(Database cx, } when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator when(wait(waitForAll(balancingFutures))) {} - when(Promise r = waitNext(getUnhealthyRelocationCount)) { r.send(self.unhealthyRelocations); } + when(Promise r = waitNext(getUnhealthyRelocationCount)) { + r.send(self.unhealthyRelocations); + } } } } catch (Error& e) { From 898a5b86b2601909850e1ef04b5f2eb9ba408baa Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 26 Apr 2022 17:16:55 -0700 Subject: [PATCH 076/299] reset knobs --- fdbclient/ServerKnobs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index faf05a801b..1afa0665f7 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -163,7 +163,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( DATA_DISTRIBUTION_FAILURE_REACTION_TIME, 60.0 ); if( randomize && BUGGIFY ) DATA_DISTRIBUTION_FAILURE_REACTION_TIME = 1.0; bool buggifySmallShards = randomize && BUGGIFY; bool simulationMediumShards = !buggifySmallShards && isSimulated && randomize && !BUGGIFY; // prefer smaller shards in simulation - init( MIN_SHARD_BYTES, 5000000 ); if( buggifySmallShards ) MIN_SHARD_BYTES = 40000; if (simulationMediumShards) MIN_SHARD_BYTES = 200000; //FIXME: data distribution tracker (specifically StorageMetrics) relies on this number being larger than the maximum size of a key value pair + init( MIN_SHARD_BYTES, 50000000 ); if( buggifySmallShards ) MIN_SHARD_BYTES = 40000; if (simulationMediumShards) MIN_SHARD_BYTES = 200000; //FIXME: data distribution tracker (specifically StorageMetrics) relies on this number being larger than the maximum size of a key value pair init( SHARD_BYTES_RATIO, 4 ); init( SHARD_BYTES_PER_SQRT_BYTES, 45 ); if( buggifySmallShards ) SHARD_BYTES_PER_SQRT_BYTES = 0;//Approximately 10000 bytes per shard init( MAX_SHARD_BYTES, 500000000 ); From 73c7240bfd76df19e9bf513db084c5cffdf98fe2 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 26 Apr 2022 17:34:34 -0700 Subject: [PATCH 077/299] code format --- fdbserver/DataDistributionQueue.actor.cpp | 8 ++------ fdbserver/DataDistributionTracker.actor.cpp | 4 +--- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 7e7024208b..a3bb6e7bdc 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -2186,9 +2186,7 @@ ACTOR Future dataDistributionQueue(Database cx, debug_setCheckRelocationDuration(false); } } - when(KeyRange done = waitNext(rangesComplete.getFuture())) { - keysToLaunchFrom = done; - } + when(KeyRange done = waitNext(rangesComplete.getFuture())) { keysToLaunchFrom = done; } when(wait(recordMetrics)) { Promise req; getAverageShardBytes.send(req); @@ -2235,9 +2233,7 @@ ACTOR Future dataDistributionQueue(Database cx, } when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator when(wait(waitForAll(balancingFutures))) {} - when(Promise r = waitNext(getUnhealthyRelocationCount)) { - r.send(self.unhealthyRelocations); - } + when(Promise r = 
waitNext(getUnhealthyRelocationCount)) { r.send(self.unhealthyRelocations); } } } } catch (Error& e) { diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index 3c578b3f84..af0b6e7657 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -939,9 +939,7 @@ ACTOR Future fetchShardMetricsList_impl(DataDistributionTracker* self, Get ACTOR Future fetchShardMetricsList(DataDistributionTracker* self, GetMetricsListRequest req) { choose { when(wait(fetchShardMetricsList_impl(self, req))) {} - when(wait(delay(SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT))) { - req.reply.sendError(timed_out()); - } + when(wait(delay(SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT))) { req.reply.sendError(timed_out()); } } return Void(); } From 101b6717e354e6b2289ac8e3e0cfd56e85def7c8 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 27 Apr 2022 15:54:51 -0700 Subject: [PATCH 078/299] remove TSS storage metadata part --- fdbserver/DDTeamCollection.actor.cpp | 77 ++++++++++++---------------- fdbserver/DDTeamCollection.h | 6 +-- fdbserver/Status.actor.cpp | 19 ++++--- 3 files changed, 46 insertions(+), 56 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 3ed99066bd..d50f6f5a26 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -132,7 +132,9 @@ public: loop { choose { - when(wait(self->buildTeams())) { return Void(); } + when(wait(self->buildTeams())) { + return Void(); + } when(wait(self->restartTeamBuilder.onTrigger())) {} } } @@ -510,7 +512,9 @@ public: while (self->pauseWiggle && !self->pauseWiggle->get() && self->waitUntilRecruited.get()) { choose { when(wait(self->waitUntilRecruited.onChange() || self->pauseWiggle->onChange())) {} - when(wait(delay(SERVER_KNOBS->PERPETUAL_WIGGLE_DELAY, g_network->getCurrentTask()))) { break; } + when(wait(delay(SERVER_KNOBS->PERPETUAL_WIGGLE_DELAY, g_network->getCurrentTask()))) { + break; + } } } @@ -1324,7 +1328,7 @@ public: // Restart the storeTracker for the new interface. This will cancel the previous // keyValueStoreTypeTracker // storeTypeTracker = (isTss) ? 
Never() : keyValueStoreTypeTracker(self, server); - storageMetadataTracker = updateStorageMetadata(self, server, isTss); + storageMetadataTracker = self->updateStorageMetadata(server, isTss); hasWrongDC = !self->isCorrectDC(*server); hasInvalidLocality = !self->isValidLocality(self->configuration.storagePolicy, server->getLastKnownInterface().locality); @@ -1343,7 +1347,9 @@ public: .detail("ConfigStoreType", self->configuration.storageServerStoreType) .detail("WrongStoreTypeRemoved", server->wrongStoreTypeToRemove.get()); } - when(wait(server->wakeUpTracker.getFuture())) { server->wakeUpTracker = Promise(); } + when(wait(server->wakeUpTracker.getFuture())) { + server->wakeUpTracker = Promise(); + } when(wait(storageMetadataTracker)) {} when(wait(server->ssVersionTooFarBehind.onChange())) {} when(wait(self->disableFailingLaggingServers.onChange())) {} @@ -1494,20 +1500,6 @@ public: return Void(); } - ACTOR static Future keyValueStoreTypeTracker(DDTeamCollection* self, TCServerInfo* server) { - // Update server's storeType, especially when it was created - wait(server->updateStoreType()); - - if (server->getStoreType() != self->configuration.storageServerStoreType) { - if (self->wrongStoreTypeRemover.isReady()) { - self->wrongStoreTypeRemover = removeWrongStoreType(self); - self->addActor.send(self->wrongStoreTypeRemover); - } - } - - return Never(); - } - ACTOR static Future storageServerFailureTracker(DDTeamCollection* self, TCServerInfo* server, Database cx, @@ -2067,7 +2059,9 @@ public: .detail("ExtraHealthyTeamCount", extraTeamCount) .detail("HealthyTeamCount", self->healthyTeamCount); } - when(wait(pauseChanged)) { continue; } + when(wait(pauseChanged)) { + continue; + } } } } @@ -2591,7 +2585,9 @@ public: } } } - when(wait(recruitStorage->onChange())) { fCandidateWorker = Future(); } + when(wait(recruitStorage->onChange())) { + fCandidateWorker = Future(); + } when(wait(self->zeroHealthyTeams->onChange())) { if (!pendingTSSCheck && self->zeroHealthyTeams->get() && (self->isTssRecruiting || self->tss_info_by_pair.size() > 0)) { @@ -2876,18 +2872,16 @@ public: return Void(); } - ACTOR static Future updateStorageMetadata(DDTeamCollection* self, TCServerInfo* server, bool isTss) { + ACTOR static Future updateStorageMetadata(DDTeamCollection* self, TCServerInfo* server) { state KeyBackedObjectMap metadataMap( serverMetadataKeys.begin, IncludeVersion()); state Reference tr = makeReference(self->cx); // Update server's storeType, especially when it was created wait(server->updateStoreType()); - state StorageMetadataType data( - StorageMetadataType::currentTime(), - server->getStoreType(), - isTss ? 
!server->isCorrectStoreType(self->configuration.testingStorageServerStoreType) - : !server->isCorrectStoreType(self->configuration.storageServerStoreType)); + state StorageMetadataType data(StorageMetadataType::currentTime(), + server->getStoreType(), + !server->isCorrectStoreType(self->configuration.storageServerStoreType)); // read storage metadata loop { @@ -2907,19 +2901,18 @@ public: } } // printf("------ updated metadata %s\n", server->getId().toString().c_str()); - if (!isTss) { - // wrong store type handler - if (!server->isCorrectStoreType(self->configuration.storageServerStoreType) && - self->wrongStoreTypeRemover.isReady()) { - self->wrongStoreTypeRemover = removeWrongStoreType(self); - self->addActor.send(self->wrongStoreTypeRemover); - } - // add server to wiggler - if (self->storageWiggler->contains(server->getId())) { - self->storageWiggler->updateMetadata(server->getId(), data); - } else { - self->storageWiggler->addServer(server->getId(), data); - } + + // wrong store type handler + if (!server->isCorrectStoreType(self->configuration.storageServerStoreType) && + self->wrongStoreTypeRemover.isReady()) { + self->wrongStoreTypeRemover = removeWrongStoreType(self); + self->addActor.send(self->wrongStoreTypeRemover); + } + // add server to wiggler + if (self->storageWiggler->contains(server->getId())) { + self->storageWiggler->updateMetadata(server->getId(), data); + } else { + self->storageWiggler->addServer(server->getId(), data); } return Never(); @@ -3427,10 +3420,6 @@ Future DDTeamCollection::removeBadTeams() { return DDTeamCollectionImpl::removeBadTeams(this); } -Future DDTeamCollection::keyValueStoreTypeTracker(TCServerInfo* server) { - return DDTeamCollectionImpl::keyValueStoreTypeTracker(this, server); -} - Future DDTeamCollection::storageServerFailureTracker(TCServerInfo* server, Database cx, ServerStatus* status, @@ -3527,7 +3516,7 @@ Future DDTeamCollection::readStorageWiggleMap() { } Future DDTeamCollection::updateStorageMetadata(TCServerInfo* server, bool isTss) { - return DDTeamCollectionImpl::updateStorageMetadata(this, server, isTss); + return isTss ? Never() : DDTeamCollectionImpl::updateStorageMetadata(this, server); } void DDTeamCollection::resetLocalitySet() { diff --git a/fdbserver/DDTeamCollection.h b/fdbserver/DDTeamCollection.h index 99c04b0c2a..4d03ff0f94 100644 --- a/fdbserver/DDTeamCollection.h +++ b/fdbserver/DDTeamCollection.h @@ -432,9 +432,6 @@ class DDTeamCollection : public ReferenceCounted { bool isCorrectDC(TCServerInfo const& server) const; - // Set the server's storeType; Error is caught by the caller - Future keyValueStoreTypeTracker(TCServerInfo* server); - Future storageServerFailureTracker(TCServerInfo* server, Database cx, ServerStatus* status, @@ -486,7 +483,8 @@ class DDTeamCollection : public ReferenceCounted { }); } - // Read storage metadata from database, and do necessary updates + // Read storage metadata from database, get the server's storeType, and do necessary updates. 
Error is caught by the + // caller Future updateStorageMetadata(TCServerInfo* server, bool isTss); Future serverGetTeamRequests(TeamCollectionInterface tci); diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 8e0f825376..32ff6b798a 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -584,13 +584,16 @@ struct RolesInfo { } } - TraceEventFields const& metadata = metrics.at("Metadata"); - JsonBuilderObject metadataObj; - metadataObj["created_time_datetime"] = metadata.getValue("CreatedTimeDatetime"); - metadataObj["created_time_timestamp"] = metadata.getDouble("CreatedTimeTimestamp"); - metadataObj["storage_engine"] = metadata.getValue("StoreType"); - obj["storage_metadata"] = metadataObj; - // printf("%s\n", metadataObj.getJson().c_str()); + if (!iface.isTss()) { // only storage server has Metadata field + TraceEventFields const& metadata = metrics.at("Metadata"); + JsonBuilderObject metadataObj; + metadataObj["created_time_datetime"] = metadata.getValue("CreatedTimeDatetime"); + metadataObj["created_time_timestamp"] = metadata.getDouble("CreatedTimeTimestamp"); + metadataObj["storage_engine"] = metadata.getValue("StoreType"); + obj["storage_metadata"] = metadataObj; + // printf("%s\n", metadataObj.getJson().c_str()); + } + } catch (Error& e) { if (e.code() != error_code_attribute_not_found) throw e; @@ -1974,7 +1977,7 @@ ACTOR static Future>> ge metadataField.addField("CreatedTimeDatetime", epochsToGMTString(metadata[i].get().createdTime)); metadataField.addField("StoreType", metadata[i].get().storeType.toString()); results[i].second.emplace("Metadata", metadataField); - } else { + } else if (!servers[i].isTss()) { TraceEventFields metadataField; metadataField.addField("CreatedTimeTimestamp", "0"); metadataField.addField("CreatedTimeDatetime", "[removed]"); From 4724a099e23e876f869436ca2d934b1c5c7ddf2b Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 27 Apr 2022 20:42:31 -0700 Subject: [PATCH 079/299] format code --- fdbserver/DDTeamCollection.actor.cpp | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index d50f6f5a26..ad5347a812 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -132,9 +132,7 @@ public: loop { choose { - when(wait(self->buildTeams())) { - return Void(); - } + when(wait(self->buildTeams())) { return Void(); } when(wait(self->restartTeamBuilder.onTrigger())) {} } } @@ -512,9 +510,7 @@ public: while (self->pauseWiggle && !self->pauseWiggle->get() && self->waitUntilRecruited.get()) { choose { when(wait(self->waitUntilRecruited.onChange() || self->pauseWiggle->onChange())) {} - when(wait(delay(SERVER_KNOBS->PERPETUAL_WIGGLE_DELAY, g_network->getCurrentTask()))) { - break; - } + when(wait(delay(SERVER_KNOBS->PERPETUAL_WIGGLE_DELAY, g_network->getCurrentTask()))) { break; } } } @@ -1347,9 +1343,7 @@ public: .detail("ConfigStoreType", self->configuration.storageServerStoreType) .detail("WrongStoreTypeRemoved", server->wrongStoreTypeToRemove.get()); } - when(wait(server->wakeUpTracker.getFuture())) { - server->wakeUpTracker = Promise(); - } + when(wait(server->wakeUpTracker.getFuture())) { server->wakeUpTracker = Promise(); } when(wait(storageMetadataTracker)) {} when(wait(server->ssVersionTooFarBehind.onChange())) {} when(wait(self->disableFailingLaggingServers.onChange())) {} @@ -2059,9 +2053,7 @@ public: .detail("ExtraHealthyTeamCount", extraTeamCount) 
.detail("HealthyTeamCount", self->healthyTeamCount); } - when(wait(pauseChanged)) { - continue; - } + when(wait(pauseChanged)) { continue; } } } } @@ -2585,9 +2577,7 @@ public: } } } - when(wait(recruitStorage->onChange())) { - fCandidateWorker = Future(); - } + when(wait(recruitStorage->onChange())) { fCandidateWorker = Future(); } when(wait(self->zeroHealthyTeams->onChange())) { if (!pendingTSSCheck && self->zeroHealthyTeams->get() && (self->isTssRecruiting || self->tss_info_by_pair.size() > 0)) { From 15001f614e11db85e457e059e8b3dbaa42fcf577 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 27 Apr 2022 22:36:16 -0700 Subject: [PATCH 080/299] remove unused code --- flow/IRandom.h | 5 ++--- flow/genericactors.actor.cpp | 7 ------- flow/genericactors.actor.h | 5 ----- 3 files changed, 2 insertions(+), 15 deletions(-) diff --git a/flow/IRandom.h b/flow/IRandom.h index 917e556ce0..87f7f42424 100644 --- a/flow/IRandom.h +++ b/flow/IRandom.h @@ -156,9 +156,8 @@ public: } template - void randomShuffle(C& container, int shuffleLen = -1) { - int s = shuffleLen < 0 ? std::min(shuffleLen, (int)container.size()) : (int)container.size(); - + void randomShuffle(C& container) { + int s = (int)container.size(); for (int i = 0; i < s; i++) { int j = randomInt(i, s); if (i != j) { diff --git a/flow/genericactors.actor.cpp b/flow/genericactors.actor.cpp index e587a19f09..6bb1e3fd8d 100644 --- a/flow/genericactors.actor.cpp +++ b/flow/genericactors.actor.cpp @@ -22,13 +22,6 @@ #include "flow/UnitTest.h" #include "flow/actorcompiler.h" // This must be the last #include. -ACTOR Future recurringFuture(Future what, double interval, TaskPriority taskID) { - loop { - wait(what); - wait(delay(interval)); - } -} - ACTOR Future allTrue(std::vector> all) { state int i = 0; while (i != all.size()) { diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 8731d16f63..d0abeedd6d 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -232,11 +232,6 @@ Future recurring(Func what, double interval, TaskPriority taskID = TaskPri } } -// run what every interval sec -ACTOR Future recurringFuture(Future what, - double interval, - TaskPriority taskID = TaskPriority::DefaultDelay); - ACTOR template Future trigger(Func what, Future signal) { wait(signal); From 00b97ec82946435e85be7352b37df32e6898255d Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 27 Apr 2022 23:37:35 -0700 Subject: [PATCH 081/299] add storage metric compare knob; timeThrottle with constant --- fdbclient/ServerKnobs.cpp | 1 + fdbclient/ServerKnobs.h | 1 + fdbserver/DataDistributionQueue.actor.cpp | 10 +++++----- fdbserver/DataDistributionTracker.actor.cpp | 7 +++++-- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 1afa0665f7..c57eb619ac 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -658,6 +658,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( BYTES_READ_UNITS_PER_SAMPLE, 100000 ); // 100K bytes init( READ_HOT_SUB_RANGE_CHUNK_SIZE, 10000000); // 10MB init( EMPTY_READ_PENALTY, 20 ); // 20 bytes + init( DD_SHARD_COMPARE_LIMIT, 1000 ); init( READ_SAMPLING_ENABLED, false ); if ( randomize && BUGGIFY ) READ_SAMPLING_ENABLED = true;// enable/disable read sampling //Storage Server diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index d193f73fc6..b2f9fa8891 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -607,6 +607,7 @@ public: int64_t 
BYTES_READ_UNITS_PER_SAMPLE; int64_t READ_HOT_SUB_RANGE_CHUNK_SIZE; int64_t EMPTY_READ_PENALTY; + int DD_SHARD_COMPARE_LIMIT; bool READ_SAMPLING_ENABLED; // Storage Server diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index a3bb6e7bdc..376d61d90e 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1078,11 +1078,11 @@ struct DDQueueData { } // return true if the servers are throttled as source for read rebalance - bool timeThrottle(const std::vector& ids, int shardCount) const { - return std::any_of(ids.begin(), ids.end(), [this, shardCount](const UID& id) { + bool timeThrottle(const std::vector& ids) const { + return std::any_of(ids.begin(), ids.end(), [this](const UID& id) { if (this->lastAsSource.count(id)) { - return (now() - this->lastAsSource.at(id)) * shardCount < - SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL; + // TODO: set 5.0 as a knob + return (now() - this->lastAsSource.at(id)) * 5.0 < SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL; } return false; }); @@ -1543,7 +1543,7 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, } // check lastAsSource, at most 10% of shards can be moved within a sample period - if (self->timeThrottle(sourceTeam->getServerIDs(), 0.1 * shards.size())) { + if (self->timeThrottle(sourceTeam->getServerIDs())) { traceEvent->detail("SkipReason", "SourceTeamThrottle"); return false; } diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index af0b6e7657..6b1361101a 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -839,7 +839,10 @@ ACTOR Future fetchShardMetrics_impl(DataDistributionTracker* self, GetMetr std::vector returnMetrics; if (!req.comparator.present()) returnMetrics.push_back(StorageMetrics()); - for (auto range : req.keys) { + + // TODO: shall we do random shuffle to make the selection uniform distributed over the shard space? + for (int i = 0; i < SERVER_KNOBS->DD_SHARD_COMPARE_LIMIT && i < req.keys.size(); ++i) { + auto range = req.keys[i]; StorageMetrics metrics; for (auto t : self->shards.intersectingRanges(range)) { auto& stats = t.value().stats; @@ -864,7 +867,7 @@ ACTOR Future fetchShardMetrics_impl(DataDistributionTracker* self, GetMetr } if (!onChange.isValid()) { - if (req.topK >= returnMetrics.size()) + if (!req.comparator.present() || req.topK >= returnMetrics.size()) req.reply.send(returnMetrics); else if (req.comparator.present()) { std::nth_element(returnMetrics.begin(), From 74abca44d88c5a5c7299ce2f00791486380bf73e Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 28 Apr 2022 09:15:20 -0600 Subject: [PATCH 082/299] Make QuietDatabase more human friendly QuietDatabase will now fail by itself after 1000 seconds instead of relying on the general simulation timeout. Additionally it will print a more human friendly error. 
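The gate-checking pattern introduced here is easiest to see in isolation: every metric is compared against its gate with a comparator, the names of failing gates are collected so a single trace line can report all of them, and the check fails fast once an overall deadline elapses. The sketch below is a minimal, self-contained rendering of that idea, not the actual QuietDatabaseChecker; the GateChecker type and the main() driver are invented for illustration, and the inclusive std::less_equal default matches the comparator fix in PATCH 083 below.

```cpp
#include <cassert>
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Gate-checking sketch: compare each metric against its gate with a
// comparator (inclusive less-or-equal by default), collect the names of
// failed gates, and fail fast once the overall deadline has elapsed.
struct GateChecker {
	double start;
	double deadline; // plays the role of maxDDRunTime
	std::vector<std::string> failReasons;

	template <class T, class Cmp = std::less_equal<T>>
	GateChecker& add(const char* name, T value, T expected, Cmp cmp = Cmp()) {
		if (!cmp(value, expected)) {
			failReasons.push_back(name);
		}
		return *this;
	}

	bool success(double now) {
		if (failReasons.empty()) {
			return true;
		}
		for (const auto& reason : failReasons) {
			std::cout << "gate failed: " << reason << "\n";
		}
		assert(now - start <= deadline); // the analogue of ASSERT(!ddGotStuck)
		failReasons.clear(); // reset for the next iteration
		return false;
	}
};

int main() {
	GateChecker checker{ /*start=*/0.0, /*deadline=*/1000.0 };
	checker.add("DataInFlight", int64_t(5'000'000), int64_t(1'000'000))
	    .add("TeamCollectionValid", false, true, std::equal_to<bool>());
	// Prints both failed gates in one pass, then "not quiet".
	std::cout << (checker.success(/*now=*/42.0) ? "quiet" : "not quiet") << "\n";
}
```

Collecting every failing gate before reporting is what makes the trace output human friendly: one event names all reasons at once instead of forcing the reader to re-run with one gate fixed at a time.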
--- fdbserver/QuietDatabase.actor.cpp | 93 ++++++++++++++++++++++--------- 1 file changed, 66 insertions(+), 27 deletions(-) diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index b69d7930ee..aa2abc6abf 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -20,6 +20,7 @@ #include #include +#include #include "fdbclient/FDBOptions.g.h" #include "fdbclient/SystemData.h" @@ -670,6 +671,49 @@ ACTOR Future reconfigureAfter(Database cx, return Void(); } +struct QuietDatabaseChecker { + double start = now(); + constexpr static double maxDDRunTime = 1000.0; + + struct Impl { + double start; + std::string const& phase; + std::vector failReasons; + + Impl(double start, const std::string& phase) : start(start), phase(phase) {} + + template > + Impl& add(BaseTraceEvent& evt, const char* name, T value, T expected, Comparison const& cmp = std::less<>()) { + std::string k = fmt::format("{}Gate", name); + evt.detail(name, value).detail(k.c_str(), expected); + if (!cmp(value, expected)) { + failReasons.push_back(name); + } + return *this; + } + + bool success() { + bool timedOut = now() - start > maxDDRunTime; + if (!failReasons.empty()) { + std::string traceMessage = fmt::format("QuietDatabase{}Fail", phase); + std::string reasons = fmt::format("{}", fmt::join(failReasons, ", ")); + TraceEvent(timedOut ? SevError : SevWarnAlways, traceMessage.c_str()) + .detail(failReasons.size() == 1 ? "Reason" : "Reasons", reasons) + .detail("FailedAfter", now() - start) + .detail("Timeout", maxDDRunTime); + ASSERT(!timedOut); + return false; + } + return true; + } + }; + + Impl startIteration(std::string const& phase) const { + Impl res(start, phase); + return res; + } +}; + // Waits until a database quiets down (no data in flight, small tlog queue, low SQ, no active data distribution). This // requires the database to be available and healthy in order to succeed. 
ACTOR Future waitForQuietDatabase(Database cx, @@ -681,6 +725,7 @@ ACTOR Future waitForQuietDatabase(Database cx, int64_t maxDataDistributionQueueSize = 0, int64_t maxPoppedVersionLag = 30e6, int64_t maxVersionOffset = 1e6) { + state QuietDatabaseChecker checker; state Future reconfig = reconfigureAfter(cx, 100 + (deterministicRandom()->random01() * 100), dbInfo, "QuietDatabase"); state Future dataInFlight; @@ -733,35 +778,26 @@ ACTOR Future waitForQuietDatabase(Database cx, success(teamCollectionValid) && success(storageQueueSize) && success(dataDistributionActive) && success(storageServersRecruiting) && success(versionOffset)); - TraceEvent(("QuietDatabase" + phase).c_str()) - .detail("DataInFlight", dataInFlight.get()) - .detail("DataInFlightGate", dataInFlightGate) - .detail("MaxTLogQueueSize", tLogQueueInfo.get().first) - .detail("MaxTLogQueueGate", maxTLogQueueGate) - .detail("MaxTLogPoppedVersionLag", tLogQueueInfo.get().second) - .detail("MaxTLogPoppedVersionLagGate", maxPoppedVersionLag) - .detail("DataDistributionQueueSize", dataDistributionQueueSize.get()) - .detail("DataDistributionQueueSizeGate", maxDataDistributionQueueSize) - .detail("TeamCollectionValid", teamCollectionValid.get()) - .detail("MaxStorageQueueSize", storageQueueSize.get()) - .detail("MaxStorageServerQueueGate", maxStorageServerQueueGate) - .detail("DataDistributionActive", dataDistributionActive.get()) - .detail("StorageServersRecruiting", storageServersRecruiting.get()) - .detail("RecoveryCount", dbInfo->get().recoveryCount) - .detail("VersionOffset", versionOffset.get()) - .detail("NumSuccesses", numSuccesses); - maxVersionOffset += dbInfo->get().recoveryCount * SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT; - if (dataInFlight.get() > dataInFlightGate || tLogQueueInfo.get().first > maxTLogQueueGate || - tLogQueueInfo.get().second > maxPoppedVersionLag || - dataDistributionQueueSize.get() > maxDataDistributionQueueSize || - storageQueueSize.get() > maxStorageServerQueueGate || !dataDistributionActive.get() || - storageServersRecruiting.get() || versionOffset.get() > maxVersionOffset || - !teamCollectionValid.get()) { - wait(delay(1.0)); - numSuccesses = 0; - } else { + auto check = checker.startIteration(phase); + + std::string evtType = "QuietDatabase" + phase; + TraceEvent evt(evtType.c_str()); + check.add(evt, "DataInFlight", dataInFlight.get(), dataInFlightGate) + .add(evt, "MaxTLogQueueSize", tLogQueueInfo.get().first, maxTLogQueueGate) + .add(evt, "MaxTLogPoppedVersionLag", tLogQueueInfo.get().second, maxPoppedVersionLag) + .add(evt, "DataDistributionQueueSize", dataDistributionQueueSize.get(), maxDataDistributionQueueSize) + .add(evt, "TeamCollectionValid", teamCollectionValid.get(), true, std::equal_to<>()) + .add(evt, "MaxStorageQueueSize", storageQueueSize.get(), maxStorageServerQueueGate) + .add(evt, "DataDistributionActive", dataDistributionActive.get(), true, std::equal_to<>()) + .add(evt, "StorageServersRecruiting", storageServersRecruiting.get(), false, std::equal_to<>()) + .add(evt, "VersionOffset", versionOffset.get(), maxVersionOffset); + + evt.detail("RecoveryCount", dbInfo->get().recoveryCount).detail("NumSuccesses", numSuccesses); + evt.log(); + + if (check.success()) { if (++numSuccesses == 3) { auto msg = "QuietDatabase" + phase + "Done"; TraceEvent(msg.c_str()).log(); @@ -769,6 +805,9 @@ ACTOR Future waitForQuietDatabase(Database cx, } else { wait(delay(g_network->isSimulated() ? 
2.0 : 30.0)); } + } else { + wait(delay(1.0)); + numSuccesses = 0; } } catch (Error& e) { TraceEvent(("QuietDatabase" + phase + "Error").c_str()).errorUnsuppressed(e); From f959e84b852e0dd688cca503d431ccab210be581 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 28 Apr 2022 09:32:30 -0600 Subject: [PATCH 083/299] fix comparison --- fdbserver/QuietDatabase.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index aa2abc6abf..db49bf7ed2 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -682,8 +682,8 @@ struct QuietDatabaseChecker { Impl(double start, const std::string& phase) : start(start), phase(phase) {} - template > - Impl& add(BaseTraceEvent& evt, const char* name, T value, T expected, Comparison const& cmp = std::less<>()) { + template > + Impl& add(BaseTraceEvent& evt, const char* name, T value, T expected, Comparison const& cmp = std::less_equal<>()) { std::string k = fmt::format("{}Gate", name); evt.detail(name, value).detail(k.c_str(), expected); if (!cmp(value, expected)) { From eb22ac1c1f70ef6f5d8c8f2fa0a9b5d6ef29b9fd Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 28 Apr 2022 10:09:06 -0600 Subject: [PATCH 084/299] Address review comments --- fdbserver/QuietDatabase.actor.cpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index db49bf7ed2..cb6ed40ba1 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -683,7 +683,11 @@ struct QuietDatabaseChecker { Impl(double start, const std::string& phase) : start(start), phase(phase) {} template > - Impl& add(BaseTraceEvent& evt, const char* name, T value, T expected, Comparison const& cmp = std::less_equal<>()) { + Impl& add(BaseTraceEvent& evt, + const char* name, + T value, + T expected, + Comparison const& cmp = std::less_equal<>()) { std::string k = fmt::format("{}Gate", name); evt.detail(name, value).detail(k.c_str(), expected); if (!cmp(value, expected)) { @@ -698,10 +702,17 @@ struct QuietDatabaseChecker { std::string traceMessage = fmt::format("QuietDatabase{}Fail", phase); std::string reasons = fmt::format("{}", fmt::join(failReasons, ", ")); TraceEvent(timedOut ? SevError : SevWarnAlways, traceMessage.c_str()) - .detail(failReasons.size() == 1 ? "Reason" : "Reasons", reasons) + .detail("Reasons", reasons) .detail("FailedAfter", now() - start) .detail("Timeout", maxDDRunTime); - ASSERT(!timedOut); + if (timedOut) { + // this bool is just created to make the assertion more readable + bool ddGotStuck = true; + // This assertion is here to make the test fail more quickly. If quietDatabase takes this + // long without completing, we can assume that the test will eventually time out. However, + // time outs are more annoying to debug. This will hopefully be easier to track down. 
+ ASSERT(!ddGotStuck); + } return false; } return true; } }; From e0cbe74d94a993d75a0d6c36ba2b55b0afdfc4d6 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 28 Apr 2022 11:32:35 -0600 Subject: [PATCH 085/299] Only fail DD early in simulation --- fdbserver/QuietDatabase.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index cb6ed40ba1..2c4eeea2ef 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -711,7 +711,7 @@ struct QuietDatabaseChecker { // This assertion is here to make the test fail more quickly. If quietDatabase takes this // long without completing, we can assume that the test will eventually time out. However, // time outs are more annoying to debug. This will hopefully be easier to track down. - ASSERT(!ddGotStuck); + ASSERT(!ddGotStuck || !g_network->isSimulated()); } return false; } From d1c71a7903f9153e38927099fd13ffc943e9db5c Mon Sep 17 00:00:00 2001 From: "Johannes M. Scheuermann" Date: Fri, 29 Apr 2022 13:10:05 +0100 Subject: [PATCH 086/299] Add sidecar method to check if a file is present --- packaging/docker/sidecar.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/packaging/docker/sidecar.py b/packaging/docker/sidecar.py index 666cb82816..6fb80b4880 100755 --- a/packaging/docker/sidecar.py +++ b/packaging/docker/sidecar.py @@ -518,10 +518,16 @@ class Server(BaseHTTPRequestHandler): return if self.path.startswith("/check_hash/"): try: - self.send_text(check_hash(self.path[12:]), add_newline=False) + self.send_text(check_hash(os.path.basename(self.path)), add_newline=False) except FileNotFoundError: self.send_error(404, "Path not found") self.end_headers() + if self.path.startswith("/is_present/"): + if is_present(os.path.basename(self.path)): + self.send_text("OK") + else: + self.send_error(404, "Path not found") + self.end_headers() elif self.path == "/ready": self.send_text(ready()) elif self.path == "/substitutions": @@ -599,6 +605,10 @@ def check_hash(filename): return m.hexdigest() +def is_present(filename): + return os.path.exists(os.path.join(Config.shared().output_dir, filename)) + + def copy_files(): config = Config.shared() if config.require_not_empty: From 2afaf55a48824c2db5e3add98cfd841adaf42d03 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Fri, 29 Apr 2022 13:16:04 -0400 Subject: [PATCH 087/299] fixed some binding tests and split stack operation between key and value of tenant list --- bindings/bindingtester/tests/api.py | 6 ++--- .../foundationdb/test/AsyncStackTester.java | 22 +++++++++++++++++-- .../foundationdb/test/StackOperation.java | 2 +- .../apple/foundationdb/test/StackTester.java | 21 ++++++++++++++++-- bindings/python/fdb/tenant_management.py | 8 +++---- bindings/python/tests/tester.py | 12 +++++----- 6 files changed, 53 insertions(+), 18 deletions(-) diff --git a/bindings/bindingtester/tests/api.py b/bindings/bindingtester/tests/api.py index 31ebe473a7..12599b1074 100644 --- a/bindings/bindingtester/tests/api.py +++ b/bindings/bindingtester/tests/api.py @@ -165,7 +165,7 @@ class ApiTest(Test): write_conflicts = ['WRITE_CONFLICT_RANGE', 'WRITE_CONFLICT_KEY', 'DISABLE_WRITE_CONFLICT'] txn_sizes = ['GET_APPROXIMATE_SIZE'] storage_metrics = ['GET_ESTIMATED_RANGE_SIZE', 'GET_RANGE_SPLIT_POINTS'] - tenants = ['TENANT_CREATE', 'TENANT_DELETE', 'TENANT_SET_ACTIVE', 'TENANT_CLEAR_ACTIVE',
'TENANT_LIST_NAMES'] op_choices += reads op_choices += mutations @@ -600,8 +600,8 @@ class ApiTest(Test): instructions.append(op) elif op == 'TENANT_CLEAR_ACTIVE': instructions.append(op) - elif op == 'TENANT_LIST': - instructions.push_args(b'', b'\xff', 10) + elif op == 'TENANT_LIST_NAMES': + instructions.push_args(b'', b'\xff', 10000) instructions.append(op) self.add_strings(1) else: diff --git a/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java b/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java index a66813524a..823697b4ff 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java @@ -33,6 +33,8 @@ import java.util.Map; import java.util.Optional; import java.util.concurrent.CompletableFuture; import java.util.function.Function; +import java.io.ByteArrayOutputStream; +import java.io.IOException; import com.apple.foundationdb.Database; import com.apple.foundationdb.FDB; @@ -48,6 +50,7 @@ import com.apple.foundationdb.Transaction; import com.apple.foundationdb.async.AsyncUtil; import com.apple.foundationdb.tuple.ByteArrayUtil; import com.apple.foundationdb.tuple.Tuple; +import com.apple.foundationdb.async.CloseableAsyncIterator; public class AsyncStackTester { static final String DIRECTORY_PREFIX = "DIRECTORY_"; @@ -483,12 +486,27 @@ public class AsyncStackTester { inst.push(TenantManagement.deleteTenant(inst.context.db, tenantName)); }, FDB.DEFAULT_EXECUTOR); } - else if (op == StackOperation.TENANT_LIST) { + else if (op == StackOperation.TENANT_LIST_NAMES) { return inst.popParams(3).thenAcceptAsync(params -> { byte[] begin = (byte[])params.get(0); byte[] end = (byte[])params.get(1); int limit = StackUtils.getInt(params.get(2)); - inst.push(TenantManagement.listTenants(inst.context.db, begin, end, limit)); + CloseableAsyncIterator tenantIter = TenantManagement.listTenants(inst.context.db, begin, end, limit); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + try { + while (tenantIter.hasNext()) { + try { + KeyValue next = tenantIter.next(); + outputStream.write(next.getKey()); + } catch (IOException e) { + continue; + } + } + } finally { + tenantIter.close(); + } + byte[] output = outputStream.toByteArray(); + inst.push(output); }, FDB.DEFAULT_EXECUTOR); } else if (op == StackOperation.TENANT_SET_ACTIVE) { diff --git a/bindings/java/src/test/com/apple/foundationdb/test/StackOperation.java b/bindings/java/src/test/com/apple/foundationdb/test/StackOperation.java index e67d4cff81..f3d667db7c 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/StackOperation.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/StackOperation.java @@ -76,7 +76,7 @@ enum StackOperation { // Tenants TENANT_CREATE, TENANT_DELETE, - TENANT_LIST, + TENANT_LIST_NAMES, TENANT_SET_ACTIVE, TENANT_CLEAR_ACTIVE, diff --git a/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java b/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java index 002af5e97a..4be1c9257b 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java @@ -20,6 +20,8 @@ package com.apple.foundationdb.test; +import java.io.ByteArrayOutputStream; +import java.io.IOException; import java.math.BigInteger; import java.nio.ByteBuffer; import java.nio.ByteOrder; @@ -429,12 +431,27 @@ public class StackTester { byte[] tenantName = 
(byte[])inst.popParam().join(); inst.push(TenantManagement.deleteTenant(inst.context.db, tenantName)); } - else if (op == StackOperation.TENANT_LIST) { + else if (op == StackOperation.TENANT_LIST_NAMES) { List params = inst.popParams(3).join(); byte[] begin = (byte[])params.get(0); byte[] end = (byte[])params.get(1); int limit = StackUtils.getInt(params.get(2)); - inst.push(TenantManagement.listTenants(inst.context.db, begin, end, limit)); + CloseableAsyncIterator tenantIter = TenantManagement.listTenants(inst.context.db, begin, end, limit); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + try { + while (tenantIter.hasNext()) { + try { + KeyValue next = tenantIter.next(); + outputStream.write(next.getKey()); + } catch (IOException e) { + continue; + } + } + } finally { + tenantIter.close(); + } + byte[] output = outputStream.toByteArray(); + inst.push(output); } else if (op == StackOperation.TENANT_SET_ACTIVE) { byte[] tenantName = (byte[])inst.popParam().join(); diff --git a/bindings/python/fdb/tenant_management.py b/bindings/python/fdb/tenant_management.py index c8b061611f..6e571f45a5 100644 --- a/bindings/python/fdb/tenant_management.py +++ b/bindings/python/fdb/tenant_management.py @@ -92,11 +92,9 @@ class FDBTenantList(object): return list(self.__iter__()) def __iter__(self): - while True: - result = self._iter.__next__() - - tenant_name = _impl.remove_prefix(result.key, _tenant_map_prefix) - yield _impl.KeyValue(tenant_name, result.value) + for next_item in self._iter: + tenant_name = _impl.remove_prefix(next_item.key, _tenant_map_prefix) + yield _impl.KeyValue(tenant_name, next_item.value) # Lists the tenants created in the cluster, specified by the begin and end range. # Also limited in number of results by the limit parameter. diff --git a/bindings/python/tests/tester.py b/bindings/python/tests/tester.py index f392772a31..81a731b08d 100644 --- a/bindings/python/tests/tester.py +++ b/bindings/python/tests/tester.py @@ -604,12 +604,14 @@ class Tester: self.tenant = self.db.open_tenant(name) elif inst.op == six.u("TENANT_CLEAR_ACTIVE"): self.tenant = None - elif inst.op == six.u("TENANT_LIST"): - begin = inst.pop() - end = inst.pop() - limit = inst.pop() + elif inst.op == six.u("TENANT_LIST_NAMES"): + begin, end, limit = inst.pop(3) tenant_list = fdb.tenant_management.list_tenants(self.db, begin, end, limit) - inst.push(tenant_list) + result = bytearray() + for tenant in tenant_list: + result += tenant.key + result_bytes = bytes(result) + inst.push(result_bytes) elif inst.op == six.u("UNIT_TESTS"): try: test_db_options(db) From 43c2ca35a5c7f5f9ee35acce260d0c879aebe8cf Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Mon, 2 May 2022 08:39:59 -0700 Subject: [PATCH 088/299] Move fdbcli command and hint generators into the files implementing the command. 
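The refactor relies on a self-registration idiom: each command's translation unit defines a static CommandFactory object whose constructor records that command's completion and hint callbacks in function-local static maps, so fdbcli.actor.cpp can look up tokens[0] in the registry instead of hard-coding per-command branches. Below is a compilable sketch of the idiom under simplified types; CommandRegistrar and CompletionFunc are illustrative stand-ins, not the fdbcli declarations.

```cpp
#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Self-registration sketch: a static object in each command's .cpp file adds
// that command's callback to a registry at program startup.
using CompletionFunc = std::function<std::vector<std::string>(const std::string& prefix)>;

struct CommandRegistrar {
	// The registry lives in a function-local static so its initialization
	// order relative to the registrar objects is well-defined.
	static std::map<std::string, CompletionFunc>& completions() {
		static std::map<std::string, CompletionFunc> m;
		return m;
	}
	CommandRegistrar(const std::string& name, CompletionFunc f) { completions()[name] = std::move(f); }
};

// In KillCommand.cpp (for example): register "kill"'s completions locally.
static CommandRegistrar killRegistrar("kill", [](const std::string& prefix) {
	std::vector<std::string> out;
	for (const char* opt : { "all", "list" }) {
		if (std::string(opt).rfind(prefix, 0) == 0) { // starts-with match
			out.push_back(opt);
		}
	}
	return out;
});

int main() {
	// The REPL side: look the command up instead of branching per command.
	auto itr = CommandRegistrar::completions().find("kill");
	if (itr != CommandRegistrar::completions().end()) {
		for (const auto& c : itr->second("l")) {
			std::cout << c << "\n"; // prints "list"
		}
	}
}
```

Keeping the maps behind accessor functions rather than namespace-scope globals sidesteps the static initialization order problem, which is why the CommandFactory in the diff below exposes commands(), completionGenerators(), and hintGenerators() as functions.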
--- fdbcli/ConfigureCommand.actor.cpp | 33 ++++++- fdbcli/KillCommand.actor.cpp | 11 ++- fdbcli/StatusCommand.actor.cpp | 13 ++- fdbcli/ThrottleCommand.actor.cpp | 96 ++++++++++++++++++- fdbcli/fdbcli.actor.cpp | 153 ++---------------------------- fdbcli/fdbcli.actor.h | 30 +++++- 6 files changed, 186 insertions(+), 150 deletions(-) diff --git a/fdbcli/ConfigureCommand.actor.cpp b/fdbcli/ConfigureCommand.actor.cpp index ebfac8409c..37474242e1 100644 --- a/fdbcli/ConfigureCommand.actor.cpp +++ b/fdbcli/ConfigureCommand.actor.cpp @@ -279,6 +279,36 @@ ACTOR Future configureCommandActor(Reference db, return ret; } +void configureGenerator(const char* text, + const char* line, + std::vector& lc, + std::vector const& tokens) { + const char* opts[] = { "new", + "single", + "double", + "triple", + "three_data_hall", + "three_datacenter", + "ssd", + "ssd-1", + "ssd-2", + "memory", + "memory-1", + "memory-2", + "memory-radixtree-beta", + "commit_proxies=", + "grv_proxies=", + "logs=", + "resolvers=", + "perpetual_storage_wiggle=", + "perpetual_storage_wiggle_locality=", + "storage_migration_type=", + "tenant_mode=", + "blob_granules_enabled=", + nullptr }; + arrayGenerator(text, line, opts, lc); +} + CommandFactory configureFactory( "configure", CommandHelp( @@ -322,6 +352,7 @@ CommandFactory configureFactory( "optional, then transactions can be run with or without specifying tenants. If required, all data must be " "accessed using tenants.\n\n" - "See the FoundationDB Administration Guide for more information.")); + "See the FoundationDB Administration Guide for more information."), + &configureGenerator); } // namespace fdb_cli diff --git a/fdbcli/KillCommand.actor.cpp b/fdbcli/KillCommand.actor.cpp index 4ce89b5919..20e3c9a425 100644 --- a/fdbcli/KillCommand.actor.cpp +++ b/fdbcli/KillCommand.actor.cpp @@ -95,6 +95,14 @@ ACTOR Future killCommandActor(Reference db, return result; } +void killGenerator(const char* text, + const char* line, + std::vector& lc, + std::vector const& tokens) { + const char* opts[] = { "all", "list", nullptr }; + arrayGenerator(text, line, opts, lc); +} + CommandFactory killFactory( "kill", CommandHelp( @@ -103,5 +111,6 @@ CommandFactory killFactory( "If no addresses are specified, populates the list of processes which can be killed. Processes cannot be " "killed before this list has been populated.\n\nIf `all' is specified, attempts to kill all known " "processes.\n\nIf `list' is specified, displays all known processes. This is only useful when the database is " - "unresponsive.\n\nFor each IP:port pair in
<ADDRESS...>, attempt to kill the specified process.")); + "unresponsive.\n\nFor each IP:port pair in <ADDRESS...>
, attempt to kill the specified process."), + &killGenerator); } // namespace fdb_cli diff --git a/fdbcli/StatusCommand.actor.cpp b/fdbcli/StatusCommand.actor.cpp index b9159acd1e..8e6a7d0df8 100644 --- a/fdbcli/StatusCommand.actor.cpp +++ b/fdbcli/StatusCommand.actor.cpp @@ -1246,6 +1246,16 @@ ACTOR Future statusCommandActor(Reference db, return true; } +void statusGenerator(const char* text, + const char* line, + std::vector& lc, + std::vector const& tokens) { + if (tokens.size() == 1) { + const char* opts[] = { "minimal", "details", "json", nullptr }; + arrayGenerator(text, line, opts, lc); + } +} + CommandFactory statusFactory( "status", CommandHelp("status [minimal|details|json]", @@ -1254,5 +1264,6 @@ CommandFactory statusFactory( "what is wrong. If the cluster is running, this command will print cluster " "statistics.\n\nSpecifying `minimal' will provide a minimal description of the status of your " "database.\n\nSpecifying `details' will provide load information for individual " - "workers.\n\nSpecifying `json' will provide status information in a machine readable JSON format.")); + "workers.\n\nSpecifying `json' will provide status information in a machine readable JSON format."), + &statusGenerator); } // namespace fdb_cli diff --git a/fdbcli/ThrottleCommand.actor.cpp b/fdbcli/ThrottleCommand.actor.cpp index bb8b3e778c..abff0e0475 100644 --- a/fdbcli/ThrottleCommand.actor.cpp +++ b/fdbcli/ThrottleCommand.actor.cpp @@ -310,10 +310,104 @@ ACTOR Future throttleCommandActor(Reference db, std::vector& lc, + std::vector const& tokens) { + if (tokens.size() == 1) { + const char* opts[] = { "on tag", "off", "enable auto", "disable auto", "list", nullptr }; + arrayGenerator(text, line, opts, lc); + } else if (tokens.size() >= 2 && tokencmp(tokens[1], "on")) { + if (tokens.size() == 2) { + const char* opts[] = { "tag", nullptr }; + arrayGenerator(text, line, opts, lc); + } else if (tokens.size() == 6) { + const char* opts[] = { "default", "immediate", "batch", nullptr }; + arrayGenerator(text, line, opts, lc); + } + } else if (tokens.size() >= 2 && tokencmp(tokens[1], "off") && !tokencmp(tokens[tokens.size() - 1], "tag")) { + const char* opts[] = { "all", "auto", "manual", "tag", "default", "immediate", "batch", nullptr }; + arrayGenerator(text, line, opts, lc); + } else if (tokens.size() == 2 && (tokencmp(tokens[1], "enable") || tokencmp(tokens[1], "disable"))) { + const char* opts[] = { "auto", nullptr }; + arrayGenerator(text, line, opts, lc); + } else if (tokens.size() >= 2 && tokencmp(tokens[1], "list")) { + if (tokens.size() == 2) { + const char* opts[] = { "throttled", "recommended", "all", nullptr }; + arrayGenerator(text, line, opts, lc); + } else if (tokens.size() == 3) { + const char* opts[] = { "LIMITS", nullptr }; + arrayGenerator(text, line, opts, lc); + } + } +} + +std::vector throttleHintGenerator(std::vector const& tokens, bool inArgument) { + if (tokens.size() == 1) { + return { "", "[ARGS]" }; + } else if (tokencmp(tokens[1], "on")) { + std::vector opts = { "tag", "", "[RATE]", "[DURATION]", "[default|immediate|batch]" }; + if (tokens.size() == 2) { + return opts; + } else if (((tokens.size() == 3 && inArgument) || tokencmp(tokens[2], "tag")) && tokens.size() < 7) { + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } + } else if (tokencmp(tokens[1], "off")) { + if (tokencmp(tokens[tokens.size() - 1], "tag")) { + return { "" }; + } else { + bool hasType = false; + bool hasTag = false; + bool hasPriority = false; + for (int i = 2; i < tokens.size(); 
++i) { + if (tokencmp(tokens[i], "all") || tokencmp(tokens[i], "auto") || tokencmp(tokens[i], "manual")) { + hasType = true; + } else if (tokencmp(tokens[i], "default") || tokencmp(tokens[i], "immediate") || + tokencmp(tokens[i], "batch")) { + hasPriority = true; + } else if (tokencmp(tokens[i], "tag")) { + hasTag = true; + ++i; + } else { + return {}; + } + } + + std::vector options; + if (!hasType) { + options.push_back("[all|auto|manual]"); + } + if (!hasTag) { + options.push_back("[tag ]"); + } + if (!hasPriority) { + options.push_back("[default|immediate|batch]"); + } + + return options; + } + } else if ((tokencmp(tokens[1], "enable") || tokencmp(tokens[1], "disable")) && tokens.size() == 2) { + return { "auto" }; + } else if (tokens.size() >= 2 && tokencmp(tokens[1], "list")) { + if (tokens.size() == 2) { + return { "[throttled|recommended|all]", "[LIMITS]" }; + } else if (tokens.size() == 3 && (tokencmp(tokens[2], "throttled") || tokencmp(tokens[2], "recommended") || + tokencmp(tokens[2], "all"))) { + return { "[LIMITS]" }; + } + } else if (tokens.size() == 2 && inArgument) { + return { "[ARGS]" }; + } + + return std::vector(); +} + CommandFactory throttleFactory( "throttle", CommandHelp("throttle [ARGS]", "view and control throttled tags", "Use `on' and `off' to manually throttle or unthrottle tags. Use `enable auto' or `disable auto' " - "to enable or disable automatic tag throttling. Use `list' to print the list of throttled tags.\n")); + "to enable or disable automatic tag throttling. Use `list' to print the list of throttled tags.\n"), + &throttleGenerator, + &throttleHintGenerator); } // namespace fdb_cli diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 5f5f7d25fc..46442bb923 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -758,6 +758,7 @@ void optionGenerator(const char* text, const char* line, std::vector& lc) { const char** iter = options; int len = strlen(text); @@ -770,81 +771,13 @@ void arrayGenerator(const char* text, const char* line, const char** options, st } } } +} // namespace fdb_cli void onOffGenerator(const char* text, const char* line, std::vector& lc) { const char* opts[] = { "on", "off", nullptr }; arrayGenerator(text, line, opts, lc); } -void configureGenerator(const char* text, const char* line, std::vector& lc) { - const char* opts[] = { "new", - "single", - "double", - "triple", - "three_data_hall", - "three_datacenter", - "ssd", - "ssd-1", - "ssd-2", - "memory", - "memory-1", - "memory-2", - "memory-radixtree-beta", - "commit_proxies=", - "grv_proxies=", - "logs=", - "resolvers=", - "perpetual_storage_wiggle=", - "perpetual_storage_wiggle_locality=", - "storage_migration_type=", - "tenant_mode=", - "blob_granules_enabled=", - nullptr }; - arrayGenerator(text, line, opts, lc); -} - -void statusGenerator(const char* text, const char* line, std::vector& lc) { - const char* opts[] = { "minimal", "details", "json", nullptr }; - arrayGenerator(text, line, opts, lc); -} - -void killGenerator(const char* text, const char* line, std::vector& lc) { - const char* opts[] = { "all", "list", nullptr }; - arrayGenerator(text, line, opts, lc); -} - -void throttleGenerator(const char* text, - const char* line, - std::vector& lc, - std::vector const& tokens) { - if (tokens.size() == 1) { - const char* opts[] = { "on tag", "off", "enable auto", "disable auto", "list", nullptr }; - arrayGenerator(text, line, opts, lc); - } else if (tokens.size() >= 2 && tokencmp(tokens[1], "on")) { - if (tokens.size() == 2) { - const char* opts[] = 
{ "tag", nullptr }; - arrayGenerator(text, line, opts, lc); - } else if (tokens.size() == 6) { - const char* opts[] = { "default", "immediate", "batch", nullptr }; - arrayGenerator(text, line, opts, lc); - } - } else if (tokens.size() >= 2 && tokencmp(tokens[1], "off") && !tokencmp(tokens[tokens.size() - 1], "tag")) { - const char* opts[] = { "all", "auto", "manual", "tag", "default", "immediate", "batch", nullptr }; - arrayGenerator(text, line, opts, lc); - } else if (tokens.size() == 2 && (tokencmp(tokens[1], "enable") || tokencmp(tokens[1], "disable"))) { - const char* opts[] = { "auto", nullptr }; - arrayGenerator(text, line, opts, lc); - } else if (tokens.size() >= 2 && tokencmp(tokens[1], "list")) { - if (tokens.size() == 2) { - const char* opts[] = { "throttled", "recommended", "all", nullptr }; - arrayGenerator(text, line, opts, lc); - } else if (tokens.size() == 3) { - const char* opts[] = { "LIMITS", nullptr }; - arrayGenerator(text, line, opts, lc); - } - } -} - void fdbcliCompCmd(std::string const& text, std::vector& lc) { bool err, partial; std::string whole_line = text; @@ -892,81 +825,10 @@ void fdbcliCompCmd(std::string const& text, std::vector& lc) { onOffGenerator(ntext.c_str(), base_input.c_str(), lc); } - if (tokencmp(tokens[0], "configure")) { - configureGenerator(ntext.c_str(), base_input.c_str(), lc); + auto itr = CommandFactory::completionGenerators().find(tokens[0].toString()); + if (itr != CommandFactory::completionGenerators().end()) { + itr->second(ntext.c_str(), base_input.c_str(), lc, tokens); } - - if (tokencmp(tokens[0], "status") && count == 1) { - statusGenerator(ntext.c_str(), base_input.c_str(), lc); - } - - if (tokencmp(tokens[0], "kill") && count == 1) { - killGenerator(ntext.c_str(), base_input.c_str(), lc); - } - - if (tokencmp(tokens[0], "throttle")) { - throttleGenerator(ntext.c_str(), base_input.c_str(), lc, tokens); - } -} - -std::vector throttleHintGenerator(std::vector const& tokens, bool inArgument) { - if (tokens.size() == 1) { - return { "", "[ARGS]" }; - } else if (tokencmp(tokens[1], "on")) { - std::vector opts = { "tag", "", "[RATE]", "[DURATION]", "[default|immediate|batch]" }; - if (tokens.size() == 2) { - return opts; - } else if (((tokens.size() == 3 && inArgument) || tokencmp(tokens[2], "tag")) && tokens.size() < 7) { - return std::vector(opts.begin() + tokens.size() - 2, opts.end()); - } - } else if (tokencmp(tokens[1], "off")) { - if (tokencmp(tokens[tokens.size() - 1], "tag")) { - return { "" }; - } else { - bool hasType = false; - bool hasTag = false; - bool hasPriority = false; - for (int i = 2; i < tokens.size(); ++i) { - if (tokencmp(tokens[i], "all") || tokencmp(tokens[i], "auto") || tokencmp(tokens[i], "manual")) { - hasType = true; - } else if (tokencmp(tokens[i], "default") || tokencmp(tokens[i], "immediate") || - tokencmp(tokens[i], "batch")) { - hasPriority = true; - } else if (tokencmp(tokens[i], "tag")) { - hasTag = true; - ++i; - } else { - return {}; - } - } - - std::vector options; - if (!hasType) { - options.push_back("[all|auto|manual]"); - } - if (!hasTag) { - options.push_back("[tag ]"); - } - if (!hasPriority) { - options.push_back("[default|immediate|batch]"); - } - - return options; - } - } else if ((tokencmp(tokens[1], "enable") || tokencmp(tokens[1], "disable")) && tokens.size() == 2) { - return { "auto" }; - } else if (tokens.size() >= 2 && tokencmp(tokens[1], "list")) { - if (tokens.size() == 2) { - return { "[throttled|recommended|all]", "[LIMITS]" }; - } else if (tokens.size() == 3 && 
(tokencmp(tokens[2], "throttled") || tokencmp(tokens[2], "recommended") || - tokencmp(tokens[2], "all"))) { - return { "[LIMITS]" }; - } - } else if (tokens.size() == 2 && inArgument) { - return { "[ARGS]" }; - } - - return std::vector(); } void LogCommand(std::string line, UID randomID, std::string errMsg) { @@ -2080,8 +1942,9 @@ ACTOR Future runCli(CLIOptions opt) { bool inArgument = *(line.end() - 1) != ' '; std::string hintLine = inArgument ? " " : ""; - if (tokencmp(command, "throttle")) { - std::vector hintItems = throttleHintGenerator(parsed.back(), inArgument); + auto itr = CommandFactory::hintGenerators().find(command.toString()); + if (itr != CommandFactory::hintGenerators().end()) { + std::vector hintItems = itr->second(parsed.back(), inArgument); if (hintItems.empty()) { return LineNoise::Hint(); } diff --git a/fdbcli/fdbcli.actor.h b/fdbcli/fdbcli.actor.h index 71fe2d4e4a..ea05695e0b 100644 --- a/fdbcli/fdbcli.actor.h +++ b/fdbcli/fdbcli.actor.h @@ -47,8 +47,28 @@ struct CommandHelp { CommandHelp(const char* u, const char* s, const char* l) : usage(u), short_desc(s), long_desc(l) {} }; +void arrayGenerator(const char* text, const char* line, const char** options, std::vector& lc); + struct CommandFactory { - CommandFactory(const char* name, CommandHelp help) { commands()[name] = help; } + typedef void (*CompletionGeneratorFunc)(const char* text, + const char* line, + std::vector& lc, + std::vector const& tokens); + + typedef std::vector (*HintGeneratorFunc)(std::vector const& tokens, bool inArgument); + + CommandFactory(const char* name, + CommandHelp help, + CompletionGeneratorFunc completionFunc = nullptr, + HintGeneratorFunc hintFunc = nullptr) { + commands()[name] = help; + if (completionFunc) { + completionGenerators()[name] = completionFunc; + } + if (hintFunc) { + hintGenerators()[name] = hintFunc; + } + } CommandFactory(const char* name) { hiddenCommands().insert(name); } static std::map& commands() { static std::map helpMap; @@ -58,6 +78,14 @@ struct CommandFactory { static std::set commands; return commands; } + static std::map& completionGenerators() { + static std::map completionMap; + return completionMap; + } + static std::map& hintGenerators() { + static std::map hintMap; + return hintMap; + } }; // Special keys used by fdbcli commands From ff216c2f57294bfa6e72557d6b5e53c71c27cd49 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 2 May 2022 13:42:11 -0400 Subject: [PATCH 089/299] add tenant list metadata to binding tester --- TenantTest.java | 91 ------------------- bindings/bindingtester/spec/tenantTester.md | 10 +- bindings/bindingtester/tests/api.py | 6 +- .../foundationdb/test/AsyncStackTester.java | 35 ++++++- .../foundationdb/test/StackOperation.java | 1 + .../apple/foundationdb/test/StackTester.java | 34 ++++++- bindings/python/fdb/tenant_management.py | 2 +- bindings/python/tests/tester.py | 17 ++++ test_tenant.py | 33 ------- 9 files changed, 97 insertions(+), 132 deletions(-) delete mode 100644 TenantTest.java delete mode 100755 test_tenant.py diff --git a/TenantTest.java b/TenantTest.java deleted file mode 100644 index d9bb02acdd..0000000000 --- a/TenantTest.java +++ /dev/null @@ -1,91 +0,0 @@ -import java.io.UnsupportedEncodingException; - -import java.util.Arrays; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ThreadLocalRandom; - -import com.apple.foundationdb.Database; -import com.apple.foundationdb.FDB; -import 
com.apple.foundationdb.KeyValue; -import com.apple.foundationdb.Tenant; -import com.apple.foundationdb.Transaction; -import com.apple.foundationdb.tuple.Tuple; -import com.apple.foundationdb.KeyArrayResult; -import com.apple.foundationdb.TenantManagement; -import com.apple.foundationdb.async.AsyncUtil; -import static com.apple.foundationdb.async.AsyncUtil.collectRemaining; -import com.apple.foundationdb.async.CloseableAsyncIterator; - -public class TenantTest { - private FDB fdb; - private Database db; - CloseableAsyncIterator tenants; - - public TenantTest() { - try { - fdb = FDB.selectAPIVersion(710); - fdb.options().setTraceEnable(null); - db = fdb.open(); -///* - Tuple t1 = Tuple.from("tenant"); - Tuple t2 = Tuple.from("tenant2"); - Tuple t3 = Tuple.from("tenant3"); -//*/ -/* - byte[] t1 = Tuple.from("tenant").pack(); - byte[] t2 = Tuple.from("tenant2").pack(); - byte[] t3 = Tuple.from("tenant3").pack(); -*/ - System.out.println(t1); - System.out.println(t2); - System.out.println(t3); - - TenantManagement.createTenant(db, t1).join(); - TenantManagement.createTenant(db, t2).join(); - TenantManagement.createTenant(db, t3).join(); - - tenants = TenantManagement.listTenants(db, Tuple.from("a").pack(), Tuple.from("z").pack(), 100); - - try { -/* - List result = AsyncUtil.collectRemaining(tenants).join(); - System.out.println("Size: " + result.size()); - for(int i = 0; i < result.size(); i++) { - System.out.println(i); - KeyValue res = result.get(i); - System.out.println(new String(res.getKey())); - System.out.println(new String(res.getValue())); - } -*/ -// /* - while (tenants.hasNext()) { - KeyValue res = tenants.next(); - System.out.println(new String(res.getKey())); - System.out.println(new String(res.getValue())); - } -// */ - } - finally { - tenants.close(); - } - TenantManagement.deleteTenant(db, t1).join(); - TenantManagement.deleteTenant(db, t2).join(); - TenantManagement.deleteTenant(db, t3).join(); - } - catch(Exception e) { - e.printStackTrace(); - } - } - - public void close() { - db.close(); - } - - public static void main(String[] args) { - new TenantTest().close(); - } -} - diff --git a/bindings/bindingtester/spec/tenantTester.md b/bindings/bindingtester/spec/tenantTester.md index df33ef4b0b..fea7d49070 100644 --- a/bindings/bindingtester/spec/tenantTester.md +++ b/bindings/bindingtester/spec/tenantTester.md @@ -38,10 +38,16 @@ The tenant API introduces some new operations: Unsets the active tenant. -#### TENANT_LIST +#### TENANT_LIST_NAMES Pops the top 3 items off of the stack as BEGIN, END, & LIMIT. Returns list - of tenants contained in the range BEGIN to END, numbering LIMIT at most. + of tenant names contained in the range BEGIN to END, numbering LIMIT at most. + May optionally push a future onto the stack. + +#### TENANT_LIST_METADATA + + Pops the top 3 items off of the stack as BEGIN, END, & LIMIT. Returns list + of tenant metadata contained in the range BEGIN to END, numbering LIMIT at most. May optionally push a future onto the stack. 
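For orientation, the metadata value returned for each tenant is a JSON object expected to carry at least the `id` and `prefix` properties. The Python tester in this patch parses it with json.loads, while the Java testers, lacking a JSON library, settle for plain string checks. A standalone sketch of that heuristic, transliterated to C++ with illustrative names:

```cpp
#include <iostream>
#include <string>

// Heuristic validation of tenant metadata when no JSON parser is available:
// accept the value if it is brace-delimited and mentions both expected
// properties. This mirrors the Java testers' string checks in this patch.
bool looksLikeTenantMetadata(const std::string& metadata) {
	if (metadata.size() < 2 || metadata.front() != '{' || metadata.back() != '}') {
		return false;
	}
	return metadata.find("id") != std::string::npos && metadata.find("prefix") != std::string::npos;
}

int main() {
	std::cout << looksLikeTenantMetadata("{\"id\":1,\"prefix\":\"abc\"}") << "\n"; // 1
	std::cout << looksLikeTenantMetadata("not metadata") << "\n"; // 0
}
```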
Updates to Existing Instructions diff --git a/bindings/bindingtester/tests/api.py b/bindings/bindingtester/tests/api.py index 12599b1074..1e7252b696 100644 --- a/bindings/bindingtester/tests/api.py +++ b/bindings/bindingtester/tests/api.py @@ -165,7 +165,7 @@ class ApiTest(Test): write_conflicts = ['WRITE_CONFLICT_RANGE', 'WRITE_CONFLICT_KEY', 'DISABLE_WRITE_CONFLICT'] txn_sizes = ['GET_APPROXIMATE_SIZE'] storage_metrics = ['GET_ESTIMATED_RANGE_SIZE', 'GET_RANGE_SPLIT_POINTS'] - tenants = ['TENANT_CREATE', 'TENANT_DELETE', 'TENANT_SET_ACTIVE', 'TENANT_CLEAR_ACTIVE', 'TENANT_LIST_NAMES'] + tenants = ['TENANT_CREATE', 'TENANT_DELETE', 'TENANT_SET_ACTIVE', 'TENANT_CLEAR_ACTIVE', 'TENANT_LIST_NAMES', 'TENANT_LIST_METADATA'] op_choices += reads op_choices += mutations @@ -604,6 +604,10 @@ class ApiTest(Test): instructions.push_args(b'', b'\xff', 10000) instructions.append(op) self.add_strings(1) + elif op == 'TENANT_LIST_METADATA': + instructions.push_args(b'', b'\xff', 10000) + instructions.append(op) + self.add_strings(1) else: assert False, 'Unknown operation: ' + op diff --git a/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java b/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java index 823697b4ff..687b635a7e 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java @@ -509,6 +509,37 @@ public class AsyncStackTester { inst.push(output); }, FDB.DEFAULT_EXECUTOR); } + else if (op == StackOperation.TENANT_LIST_METADATA) { + return inst.popParams(3).thenAcceptAsync(params -> { + byte[] begin = (byte[])params.get(0); + byte[] end = (byte[])params.get(1); + int limit = StackUtils.getInt(params.get(2)); + CloseableAsyncIterator tenantIter = + TenantManagement.listTenants(inst.context.db, begin, end, limit); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + boolean validData = true; + try { + while (tenantIter.hasNext()) { + KeyValue next = tenantIter.next(); + String metadata = new String(next.getValue()); + // Without a JSON parsing library, we try to validate that the metadata consists + // of a select few properties using simple string comparison + if (metadata.charAt(0) != '{' || metadata.charAt(metadata.length() - 1) != '}' || + !metadata.contains("id") || !metadata.contains("prefix")) { + validData = false; + break; + } + } + } finally { + tenantIter.close(); + } + if (validData) { + inst.push("VALID_TENANT_METADATA".getBytes()); + } else { + inst.push("INVALID_TENANT_METADATA".getBytes()); + } + }, FDB.DEFAULT_EXECUTOR); + } else if (op == StackOperation.TENANT_SET_ACTIVE) { return inst.popParam().thenAcceptAsync(param -> { byte[] tenantName = (byte[])param; @@ -519,7 +550,7 @@ public class AsyncStackTester { inst.context.setTenant(Optional.empty()); return AsyncUtil.DONE; } - else if(op == StackOperation.UNIT_TESTS) { + else if (op == StackOperation.UNIT_TESTS) { inst.context.db.options().setLocationCacheSize(100001); return inst.context.db.runAsync(tr -> { FDB fdb = FDB.instance(); @@ -594,7 +625,7 @@ public class AsyncStackTester { throw new RuntimeException("Unit tests failed: " + t.getMessage()); }); } - else if(op == StackOperation.LOG_STACK) { + else if (op == StackOperation.LOG_STACK) { return inst.popParam().thenComposeAsync(prefix -> doLogStack(inst, (byte[])prefix), FDB.DEFAULT_EXECUTOR); } diff --git a/bindings/java/src/test/com/apple/foundationdb/test/StackOperation.java 
b/bindings/java/src/test/com/apple/foundationdb/test/StackOperation.java index f3d667db7c..acd0dca676 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/StackOperation.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/StackOperation.java @@ -77,6 +77,7 @@ enum StackOperation { TENANT_CREATE, TENANT_DELETE, TENANT_LIST_NAMES, + TENANT_LIST_METADATA, TENANT_SET_ACTIVE, TENANT_CLEAR_ACTIVE, diff --git a/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java b/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java index 4be1c9257b..e5e581b617 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java @@ -453,6 +453,36 @@ public class StackTester { byte[] output = outputStream.toByteArray(); inst.push(output); } + else if (op == StackOperation.TENANT_LIST_METADATA) { + List params = inst.popParams(3).join(); + byte[] begin = (byte[])params.get(0); + byte[] end = (byte[])params.get(1); + int limit = StackUtils.getInt(params.get(2)); + CloseableAsyncIterator tenantIter = + TenantManagement.listTenants(inst.context.db, begin, end, limit); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + boolean validData = true; + try { + while (tenantIter.hasNext()) { + KeyValue next = tenantIter.next(); + String metadata = new String(next.getValue()); + // Without a JSON parsing library, we try to validate that the metadata consists + // of a select few properties using simple string comparison + if (metadata.charAt(0) != '{' || metadata.charAt(metadata.length() - 1) != '}' || + !metadata.contains("id") || !metadata.contains("prefix")) { + validData = false; + break; + } + } + } finally { + tenantIter.close(); + } + if (validData) { + inst.push("VALID_TENANT_METADATA".getBytes()); + } else { + inst.push("INVALID_TENANT_METADATA".getBytes()); + } + } else if (op == StackOperation.TENANT_SET_ACTIVE) { byte[] tenantName = (byte[])inst.popParam().join(); inst.context.setTenant(Optional.of(tenantName)); @@ -460,7 +490,7 @@ public class StackTester { else if (op == StackOperation.TENANT_CLEAR_ACTIVE) { inst.context.setTenant(Optional.empty()); } - else if(op == StackOperation.UNIT_TESTS) { + else if (op == StackOperation.UNIT_TESTS) { try { inst.context.db.options().setLocationCacheSize(100001); inst.context.db.run(tr -> { @@ -538,7 +568,7 @@ public class StackTester { throw new RuntimeException("Unit tests failed: " + e.getMessage()); } } - else if(op == StackOperation.LOG_STACK) { + else if (op == StackOperation.LOG_STACK) { List params = inst.popParams(1).join(); byte[] prefix = (byte[]) params.get(0); diff --git a/bindings/python/fdb/tenant_management.py b/bindings/python/fdb/tenant_management.py index 6e571f45a5..061c6961b3 100644 --- a/bindings/python/fdb/tenant_management.py +++ b/bindings/python/fdb/tenant_management.py @@ -131,4 +131,4 @@ def list_tenants(db_or_tr, begin, end, limit): begin = _impl.process_tenant_name(begin) end = _impl.process_tenant_name(end) - return _list_tenants_impl(db_or_tr, begin, end, limit) \ No newline at end of file + return _list_tenants_impl(db_or_tr, begin, end, limit) diff --git a/bindings/python/tests/tester.py b/bindings/python/tests/tester.py index 81a731b08d..f190fd1c2f 100644 --- a/bindings/python/tests/tester.py +++ b/bindings/python/tests/tester.py @@ -30,6 +30,7 @@ import time import random import time import traceback +import json sys.path[:0] = [os.path.join(os.path.dirname(__file__), '..')] import fdb 
@@ -612,6 +613,22 @@ class Tester: result += tenant.key result_bytes = bytes(result) inst.push(result_bytes) + elif inst.op == six.u("TENANT_LIST_METADATA"): + begin, end, limit = inst.pop(3) + tenant_list = fdb.tenant_management.list_tenants(self.db, begin, end, limit) + valid_data = True + for tenant in tenant_list: + try: + metadata = json.loads(tenant.value) + id = metadata["id"] + prefix = metadata["prefix"] + except (json.decoder.JSONDecodeError, KeyError) as e: + valid_data = False + break + if valid_data: + inst.push(b"VALID_TENANT_METADATA") + else: + inst.push(b"INVALID_TENANT_METADATA") elif inst.op == six.u("UNIT_TESTS"): try: test_db_options(db) diff --git a/test_tenant.py b/test_tenant.py deleted file mode 100755 index 5f3dd9abd6..0000000000 --- a/test_tenant.py +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env python3 - -import fdb -import sys - -fdb.api_version(710) -db=fdb.open() - -db.options.set_transaction_timeout(2000) - -#tenant = b'tenant' -#tenant2 = b'tenant2' -#tenant3 = b'tenant3' - -tenant = (u"tenant",) -tenant2 = (u"tenant2",) -tenant3 = (u"tenant3",) - -fdb.tenant_management.create_tenant(db, tenant) -fdb.tenant_management.create_tenant(db, tenant2) -fdb.tenant_management.create_tenant(db, tenant3) - -res = fdb.tenant_management.list_tenants(db, (u"a",), (u"z",), 10) -#res = fdb.tenant_management.list_tenants(db, b'a', b'z', 10) -for t in res: - print(t.key.decode()) - print(t.value.decode()) - -fdb.tenant_management.delete_tenant(db, tenant) -fdb.tenant_management.delete_tenant(db, tenant2) -fdb.tenant_management.delete_tenant(db, tenant3) - -sys.exit(0) From 940512f208f4419c7de1943900ed239e49b6c3f6 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 2 May 2022 12:25:02 -0700 Subject: [PATCH 090/299] simplify the storageWiggler ordering; add unittests --- fdbclient/FDBTypes.h | 8 +++++--- fdbserver/DataDistribution.actor.cpp | 17 +++++++++++++++++ fdbserver/DataDistribution.actor.h | 14 +++----------- 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index 2217717df8..033be08499 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -1459,12 +1459,14 @@ struct StorageMetadataType { bool operator<(const StorageMetadataType& b) const { if (wrongConfigured == b.wrongConfigured) { - // the younger, the less - return createdTime > b.createdTime; + // the older SS has smaller createdTime + return createdTime < b.createdTime; } - return wrongConfigured < b.wrongConfigured; + return wrongConfigured > b.wrongConfigured; } + bool operator>(const StorageMetadataType& b) const { return b < *this; } + // To change this serialization, ProtocolVersion::StorageMetadata must be updated, and downgrades need // to be considered template diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 893a46efbf..b5726f4aac 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1344,3 +1344,20 @@ TEST_CASE("/DataDistribution/WaitForMost") { } return Void(); } + +TEST_CASE("/DataDistributor/StorageWiggler/Order") { + StorageWiggler wiggler(nullptr); + wiggler.addServer(UID(1, 0), StorageMetadataType(1, KeyValueStoreType::SSD_BTREE_V2)); + wiggler.addServer(UID(2, 0), StorageMetadataType(2, KeyValueStoreType::MEMORY, true)); + wiggler.addServer(UID(3, 0), StorageMetadataType(3, KeyValueStoreType::SSD_ROCKSDB_V1, true)); + wiggler.addServer(UID(4, 0), StorageMetadataType(4, KeyValueStoreType::SSD_BTREE_V2)); + + std::vector correctOrder{ UID(2, 
0), UID(3, 0), UID(1, 0), UID(4, 0) }; + for (int i = 0; i < correctOrder.size(); ++i) { + auto id = wiggler.getNextServerId(); + std::cout << "Get " << id.get().shortString() << "\n"; + ASSERT(id == correctOrder[i]); + } + ASSERT(!wiggler.getNextServerId().present()); + return Void(); +} diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index 7f144c0785..900af083c1 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -395,17 +395,9 @@ struct StorageWiggler : ReferenceCounted { // data structures typedef std::pair MetadataUIDP; - // less comparator by (metadata, UID), the largest comes first - struct CompPair { - bool operator()(MetadataUIDP const& a, MetadataUIDP const& b) const { - if (a.first == b.first) { - return a.second > b.second; - } - return a.first < b.first; - } - }; - // max-heap - boost::heap::skew_heap, boost::heap::compare> wiggle_pq; + // min-heap + boost::heap::skew_heap, boost::heap::compare>> + wiggle_pq; std::unordered_map pq_handles; AsyncVar nonEmpty; From dc9e782ccc06bd0a852d594081b728d3048f5874 Mon Sep 17 00:00:00 2001 From: Ray Jenkins Date: Mon, 2 May 2022 14:56:51 -0500 Subject: [PATCH 091/299] OpenTelemetry Tracing Perf Fixes (#6990) --- bindings/c/test/unit/unit_tests.cpp | 23 +- fdbclient/CommitProxyInterface.h | 18 +- fdbclient/CommitTransaction.h | 4 +- fdbclient/DatabaseContext.h | 10 +- fdbclient/FDBTypes.h | 20 -- fdbclient/IClientApi.h | 5 +- fdbclient/IConfigTransaction.h | 2 +- fdbclient/ISingleThreadTransaction.h | 2 +- fdbclient/MultiVersionTransaction.actor.cpp | 6 +- fdbclient/MultiVersionTransaction.h | 4 +- fdbclient/NativeAPI.actor.cpp | 152 +++++----- fdbclient/NativeAPI.actor.h | 13 +- fdbclient/ReadYourWrites.actor.cpp | 2 +- fdbclient/ReadYourWrites.h | 4 +- fdbclient/SpecialKeySpace.actor.cpp | 6 +- fdbclient/StorageServerInterface.h | 21 +- fdbclient/ThreadSafeTransaction.cpp | 4 +- fdbclient/ThreadSafeTransaction.h | 2 +- fdbclient/TransactionLineage.h | 5 +- fdbrpc/FlowTests.actor.cpp | 1 + fdbrpc/FlowTransport.actor.cpp | 86 ++++-- fdbrpc/FlowTransport.h | 5 + fdbrpc/sim2.actor.cpp | 1 + fdbserver/ApplyMetadataMutation.cpp | 14 +- fdbserver/ApplyMetadataMutation.h | 6 +- fdbserver/BackupWorker.actor.cpp | 4 + fdbserver/CMakeLists.txt | 1 + fdbserver/ClusterRecovery.actor.cpp | 2 +- fdbserver/CommitProxyServer.actor.cpp | 13 +- fdbserver/GrvProxyServer.actor.cpp | 4 +- fdbserver/LogSystem.cpp | 33 ++- fdbserver/LogSystem.h | 7 +- fdbserver/MasterInterface.h | 4 +- fdbserver/MutationTracking.cpp | 6 + fdbserver/OTELSpanContextMessage.h | 66 +++++ fdbserver/Resolver.actor.cpp | 6 +- fdbserver/ResolverInterface.h | 2 +- fdbserver/StorageCache.actor.cpp | 9 + fdbserver/TLogInterface.h | 4 +- fdbserver/TagPartitionedLogSystem.actor.cpp | 2 +- fdbserver/TagPartitionedLogSystem.actor.h | 2 +- fdbserver/masterserver.actor.cpp | 2 +- fdbserver/storageserver.actor.cpp | 91 +++--- fdbserver/workloads/ApiWorkload.h | 12 +- .../workloads/ConsistencyCheck.actor.cpp | 3 +- fdbserver/workloads/Cycle.actor.cpp | 5 +- fdbserver/workloads/MiniCycle.actor.cpp | 2 +- flow/Net2.actor.cpp | 1 + flow/ProtocolVersion.h | 1 + flow/Tracing.actor.cpp | 175 ++++-------- flow/Tracing.h | 264 +++++++----------- 51 files changed, 586 insertions(+), 551 deletions(-) create mode 100644 fdbserver/OTELSpanContextMessage.h diff --git a/bindings/c/test/unit/unit_tests.cpp b/bindings/c/test/unit/unit_tests.cpp index 4f258eac41..5ec1c6cec2 100644 --- a/bindings/c/test/unit/unit_tests.cpp +++ 
b/bindings/c/test/unit/unit_tests.cpp @@ -44,6 +44,8 @@ #include "fdbclient/Tuple.h" #include "flow/config.h" +#include "flow/DeterministicRandom.h" +#include "flow/IRandom.h" #include "fdb_api.hpp" @@ -2021,15 +2023,17 @@ TEST_CASE("fdb_transaction_add_conflict_range") { TEST_CASE("special-key-space valid transaction ID") { auto value = get_value("\xff\xff/tracing/transaction_id", /* snapshot */ false, {}); REQUIRE(value.has_value()); - uint64_t transaction_id = std::stoul(value.value()); - CHECK(transaction_id > 0); + UID transaction_id = UID::fromString(value.value()); + CHECK(transaction_id.first() > 0); + CHECK(transaction_id.second() > 0); } TEST_CASE("special-key-space custom transaction ID") { fdb::Transaction tr(db); fdb_check(tr.set_option(FDB_TR_OPTION_SPECIAL_KEY_SPACE_ENABLE_WRITES, nullptr, 0)); while (1) { - tr.set("\xff\xff/tracing/transaction_id", std::to_string(ULONG_MAX)); + UID randomTransactionID = UID(deterministicRandom()->randomUInt64(), deterministicRandom()->randomUInt64()); + tr.set("\xff\xff/tracing/transaction_id", randomTransactionID.toString()); fdb::ValueFuture f1 = tr.get("\xff\xff/tracing/transaction_id", /* snapshot */ false); @@ -2046,8 +2050,8 @@ TEST_CASE("special-key-space custom transaction ID") { fdb_check(f1.get(&out_present, (const uint8_t**)&val, &vallen)); REQUIRE(out_present); - uint64_t transaction_id = std::stoul(std::string(val, vallen)); - CHECK(transaction_id == ULONG_MAX); + UID transaction_id = UID::fromString(val); + CHECK(transaction_id == randomTransactionID); break; } } @@ -2074,8 +2078,9 @@ TEST_CASE("special-key-space set transaction ID after write") { fdb_check(f1.get(&out_present, (const uint8_t**)&val, &vallen)); REQUIRE(out_present); - uint64_t transaction_id = std::stoul(std::string(val, vallen)); - CHECK(transaction_id != 0); + UID transaction_id = UID::fromString(val); + CHECK(transaction_id.first() > 0); + CHECK(transaction_id.second() > 0); break; } } @@ -2140,7 +2145,9 @@ TEST_CASE("special-key-space tracing get range") { CHECK(out_count == 2); CHECK(std::string((char*)out_kv[1].key, out_kv[1].key_length) == tracingBegin + "transaction_id"); - CHECK(std::stoul(std::string((char*)out_kv[1].value, out_kv[1].value_length)) > 0); + UID transaction_id = UID::fromString(std::string((char*)out_kv[1].value)); + CHECK(transaction_id.first() > 0); + CHECK(transaction_id.second() > 0); break; } } diff --git a/fdbclient/CommitProxyInterface.h b/fdbclient/CommitProxyInterface.h index 8d068926eb..149e77521d 100644 --- a/fdbclient/CommitProxyInterface.h +++ b/fdbclient/CommitProxyInterface.h @@ -162,7 +162,7 @@ struct CommitTransactionRequest : TimedRequest { bool firstInBatch() const { return (flags & FLAG_FIRST_IN_BATCH) != 0; } Arena arena; - SpanID spanContext; + SpanContext spanContext; CommitTransactionRef transaction; ReplyPromise reply; uint32_t flags; @@ -172,8 +172,8 @@ struct CommitTransactionRequest : TimedRequest { TenantInfo tenantInfo; - CommitTransactionRequest() : CommitTransactionRequest(SpanID()) {} - CommitTransactionRequest(SpanID const& context) : spanContext(context), flags(0) {} + CommitTransactionRequest() : CommitTransactionRequest(SpanContext()) {} + CommitTransactionRequest(SpanContext const& context) : spanContext(context), flags(0) {} template void serialize(Ar& ar) { @@ -242,7 +242,7 @@ struct GetReadVersionRequest : TimedRequest { FLAG_PRIORITY_MASK = PRIORITY_SYSTEM_IMMEDIATE, }; - SpanID spanContext; + SpanContext spanContext; uint32_t transactionCount; uint32_t flags; TransactionPriority priority; @@ 
-255,7 +255,7 @@ struct GetReadVersionRequest : TimedRequest { Version maxVersion; // max version in the client's version vector cache GetReadVersionRequest() : transactionCount(1), flags(0), maxVersion(invalidVersion) {} - GetReadVersionRequest(SpanID spanContext, + GetReadVersionRequest(SpanContext spanContext, uint32_t transactionCount, TransactionPriority priority, Version maxVersion, @@ -325,7 +325,7 @@ struct GetKeyServerLocationsReply { struct GetKeyServerLocationsRequest { constexpr static FileIdentifier file_identifier = 9144680; Arena arena; - SpanID spanContext; + SpanContext spanContext; Optional tenant; KeyRef begin; Optional end; @@ -340,7 +340,7 @@ struct GetKeyServerLocationsRequest { Version minTenantVersion; GetKeyServerLocationsRequest() : limit(0), reverse(false), minTenantVersion(latestVersion) {} - GetKeyServerLocationsRequest(SpanID spanContext, + GetKeyServerLocationsRequest(SpanContext spanContext, Optional const& tenant, KeyRef const& begin, Optional const& end, @@ -378,12 +378,12 @@ struct GetRawCommittedVersionReply { struct GetRawCommittedVersionRequest { constexpr static FileIdentifier file_identifier = 12954034; - SpanID spanContext; + SpanContext spanContext; Optional debugID; ReplyPromise reply; Version maxVersion; // max version in the grv proxy's version vector cache - explicit GetRawCommittedVersionRequest(SpanID spanContext, + explicit GetRawCommittedVersionRequest(SpanContext spanContext, Optional const& debugID = Optional(), Version maxVersion = invalidVersion) : spanContext(spanContext), debugID(debugID), maxVersion(maxVersion) {} diff --git a/fdbclient/CommitTransaction.h b/fdbclient/CommitTransaction.h index 53c87c43bd..91bccaf7ba 100644 --- a/fdbclient/CommitTransaction.h +++ b/fdbclient/CommitTransaction.h @@ -24,6 +24,7 @@ #include "fdbclient/FDBTypes.h" #include "fdbclient/Knobs.h" +#include "flow/Tracing.h" // The versioned message has wire format : -1, version, messages static const int32_t VERSION_HEADER = -1; @@ -77,6 +78,7 @@ struct MutationRef { AndV2, CompareAndClear, Reserved_For_SpanContextMessage /* See fdbserver/SpanContextMessage.h */, + Reserved_For_OTELSpanContextMessage, MAX_ATOMIC_OP }; // This is stored this way for serialization purposes. 
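The Reserved_For_OTELSpanContextMessage slot added to MutationRef::Type above exists because span-context records travel through the same serialized TLog stream as ordinary mutations: a consumer peeks at the leading type code and, when it matches a reserved value, decodes a span-context message instead of a mutation. A minimal sketch of that peek-and-dispatch check, assuming a reader with a peekBytes-style accessor (the helper name isNextOTELSpanContext is illustrative, not part of this patch):

    // Sketch only: test whether the next record in the stream is an OTEL
    // span-context message rather than a mutation, without consuming it.
    // Assumes rd.peekBytes(1) returns a pointer to the next unread byte.
    template <class Reader>
    bool isNextOTELSpanContext(Reader& rd) {
        uint8_t type = *reinterpret_cast<const uint8_t*>(rd.peekBytes(1));
        return type == MutationRef::Reserved_For_OTELSpanContextMessage;
    }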
@@ -190,7 +192,7 @@ struct CommitTransactionRef { Version read_snapshot = 0; bool report_conflicting_keys = false; bool lock_aware = false; // set when metadata mutations are present - Optional spanContext; + Optional spanContext; template force_inline void serialize(Ar& ar) { diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index 532bc1a096..11f5b1beb7 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -141,7 +141,7 @@ struct WatchParameters : public ReferenceCounted { const Version version; const TagSet tags; - const SpanID spanID; + const SpanContext spanContext; const TaskPriority taskID; const Optional debugID; const UseProvisionalProxies useProvisionalProxies; @@ -151,11 +151,11 @@ struct WatchParameters : public ReferenceCounted { Optional value, Version version, TagSet tags, - SpanID spanID, + SpanContext spanContext, TaskPriority taskID, Optional debugID, UseProvisionalProxies useProvisionalProxies) - : tenant(tenant), key(key), value(value), version(version), tags(tags), spanID(spanID), taskID(taskID), + : tenant(tenant), key(key), value(value), version(version), tags(tags), spanContext(spanContext), taskID(taskID), debugID(debugID), useProvisionalProxies(useProvisionalProxies) {} }; @@ -416,12 +416,12 @@ public: Optional defaultTenant; struct VersionRequest { - SpanID spanContext; + SpanContext spanContext; Promise reply; TagSet tags; Optional debugID; - VersionRequest(SpanID spanContext, TagSet tags = TagSet(), Optional debugID = Optional()) + VersionRequest(SpanContext spanContext, TagSet tags = TagSet(), Optional debugID = Optional()) : spanContext(spanContext), tags(tags), debugID(debugID) {} }; diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index 9683c7e27f..9f89237b51 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -29,30 +29,10 @@ #include #include -#include "flow/Arena.h" #include "flow/FastRef.h" #include "flow/ProtocolVersion.h" #include "flow/flow.h" -enum class TraceFlags : uint8_t { unsampled = 0b00000000, sampled = 0b00000001 }; - -inline TraceFlags operator&(TraceFlags lhs, TraceFlags rhs) { - return static_cast(static_cast>(lhs) & - static_cast>(rhs)); -} - -struct SpanContext { - UID traceID; - uint64_t spanID; - TraceFlags m_Flags; - SpanContext() : traceID(UID()), spanID(0), m_Flags(TraceFlags::unsampled) {} - SpanContext(UID traceID, uint64_t spanID, TraceFlags flags) : traceID(traceID), spanID(spanID), m_Flags(flags) {} - SpanContext(UID traceID, uint64_t spanID) : traceID(traceID), spanID(spanID), m_Flags(TraceFlags::unsampled) {} - SpanContext(Arena arena, const SpanContext& span) - : traceID(span.traceID), spanID(span.spanID), m_Flags(span.m_Flags) {} - bool isSampled() const { return (m_Flags & TraceFlags::sampled) == TraceFlags::sampled; } -}; - typedef int64_t Version; typedef uint64_t LogEpoch; typedef uint64_t Sequence; diff --git a/fdbclient/IClientApi.h b/fdbclient/IClientApi.h index 91ef38eeae..e1861432a1 100644 --- a/fdbclient/IClientApi.h +++ b/fdbclient/IClientApi.h @@ -27,6 +27,7 @@ #include "fdbclient/FDBTypes.h" #include "fdbclient/Tenant.h" +#include "flow/Tracing.h" #include "flow/ThreadHelper.actor.h" struct VersionVector; @@ -96,11 +97,11 @@ public: virtual ThreadFuture commit() = 0; virtual Version getCommittedVersion() = 0; - // @todo This API and the "getSpanID()" API may help with debugging simulation + // @todo This API and the "getSpanContext()" API may help with debugging simulation // test failures. (These APIs are not currently invoked anywhere.) 
Remove them // later if they are not really needed. virtual VersionVector getVersionVector() = 0; - virtual UID getSpanID() = 0; + virtual SpanContext getSpanContext() = 0; virtual ThreadFuture getApproximateSize() = 0; virtual void setOption(FDBTransactionOptions::Option option, Optional value = Optional()) = 0; diff --git a/fdbclient/IConfigTransaction.h b/fdbclient/IConfigTransaction.h index 63e058ee4c..8f21679e27 100644 --- a/fdbclient/IConfigTransaction.h +++ b/fdbclient/IConfigTransaction.h @@ -45,7 +45,7 @@ public: // Not implemented: void setVersion(Version) override { throw client_invalid_operation(); } VersionVector getVersionVector() const override { throw client_invalid_operation(); } - UID getSpanID() const override { throw client_invalid_operation(); } + SpanContext getSpanContext() const override { throw client_invalid_operation(); } Future getKey(KeySelector const& key, Snapshot snapshot = Snapshot::False) override { throw client_invalid_operation(); } diff --git a/fdbclient/ISingleThreadTransaction.h b/fdbclient/ISingleThreadTransaction.h index bb5a4913f1..19beb4e5df 100644 --- a/fdbclient/ISingleThreadTransaction.h +++ b/fdbclient/ISingleThreadTransaction.h @@ -95,7 +95,7 @@ public: virtual Future commit() = 0; virtual Version getCommittedVersion() const = 0; virtual VersionVector getVersionVector() const = 0; - virtual UID getSpanID() const = 0; + virtual SpanContext getSpanContext() const = 0; virtual int64_t getApproximateSize() const = 0; virtual Future> getVersionstamp() = 0; virtual void setOption(FDBTransactionOptions::Option option, Optional value = Optional()) = 0; diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index fe18292dde..e281887e11 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -1105,13 +1105,13 @@ VersionVector MultiVersionTransaction::getVersionVector() { return VersionVector(); } -UID MultiVersionTransaction::getSpanID() { +SpanContext MultiVersionTransaction::getSpanContext() { auto tr = getTransaction(); if (tr.transaction) { - return tr.transaction->getSpanID(); + return tr.transaction->getSpanContext(); } - return UID(); + return SpanContext(); } ThreadFuture MultiVersionTransaction::getApproximateSize() { diff --git a/fdbclient/MultiVersionTransaction.h b/fdbclient/MultiVersionTransaction.h index b9d7a20659..1fb5c604ff 100644 --- a/fdbclient/MultiVersionTransaction.h +++ b/fdbclient/MultiVersionTransaction.h @@ -378,7 +378,7 @@ public: ThreadFuture commit() override; Version getCommittedVersion() override; VersionVector getVersionVector() override; - UID getSpanID() override { return UID(); }; + SpanContext getSpanContext() override { return SpanContext(); }; ThreadFuture getApproximateSize() override; void setOption(FDBTransactionOptions::Option option, Optional value = Optional()) override; @@ -567,7 +567,7 @@ public: ThreadFuture commit() override; Version getCommittedVersion() override; VersionVector getVersionVector() override; - UID getSpanID() override; + SpanContext getSpanContext() override; ThreadFuture getApproximateSize() override; void setOption(FDBTransactionOptions::Option option, Optional value = Optional()) override; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index ec8409ac91..c57ed97d3c 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -21,6 +21,7 @@ #include "fdbclient/NativeAPI.actor.h" #include +#include #include #include #include @@ -848,7 +849,9 
@@ ACTOR Future assertFailure(GrvProxyInterface remote, Future attemptGRVFromOldProxies(std::vector oldProxies, std::vector newProxies) { - Span span(deterministicRandom()->randomUniqueID(), "VerifyCausalReadRisky"_loc); + auto debugID = nondeterministicRandom()->randomUniqueID(); + g_traceBatch.addEvent("AttemptGRVFromOldProxyDebug", debugID.first(), "NativeAPI.attemptGRVFromOldProxies.Start"); + Span span("VerifyCausalReadRisky"_loc); std::vector> replies; replies.reserve(oldProxies.size()); GetReadVersionRequest req( @@ -2789,13 +2792,13 @@ void updateTagMappings(Database cx, const GetKeyServerLocationsReply& reply) { ACTOR Future getKeyLocation_internal(Database cx, Optional tenant, Key key, - SpanID spanID, + SpanContext spanContext, Optional debugID, UseProvisionalProxies useProvisionalProxies, Reverse isBackward, Version version) { - state Span span("NAPI:getKeyLocation"_loc, spanID); + state Span span("NAPI:getKeyLocation"_loc, spanContext); if (isBackward) { ASSERT(key != allKeys.begin && key <= allKeys.end); } else { @@ -2883,7 +2886,7 @@ Future getKeyLocation(Database const& cx, Optional const& tenant, Key const& key, F StorageServerInterface::*member, - SpanID spanID, + SpanContext spanContext, Optional debugID, UseProvisionalProxies useProvisionalProxies, Reverse isBackward, @@ -2891,7 +2894,8 @@ Future getKeyLocation(Database const& cx, // we first check whether this range is cached Optional locationInfo = cx->getCachedLocation(tenant, key, isBackward); if (!locationInfo.present()) { - return getKeyLocation_internal(cx, tenant, key, spanID, debugID, useProvisionalProxies, isBackward, version); + return getKeyLocation_internal( + cx, tenant, key, spanContext, debugID, useProvisionalProxies, isBackward, version); } bool onlyEndpointFailedAndNeedRefresh = false; @@ -2905,7 +2909,8 @@ Future getKeyLocation(Database const& cx, cx->invalidateCache(locationInfo.get().tenantEntry.prefix, key); // Refresh the cache with a new getKeyLocations made to proxies. - return getKeyLocation_internal(cx, tenant, key, spanID, debugID, useProvisionalProxies, isBackward, version); + return getKeyLocation_internal( + cx, tenant, key, spanContext, debugID, useProvisionalProxies, isBackward, version); } return locationInfo.get(); @@ -2922,7 +2927,7 @@ Future getKeyLocation(Reference trState, useTenant ? 
trState->tenant() : Optional(), key, member, - trState->spanID, + trState->spanContext, trState->debugID, trState->useProvisionalProxies, isBackward, @@ -2944,11 +2949,11 @@ ACTOR Future> getKeyRangeLocations_internal( KeyRange keys, int limit, Reverse reverse, - SpanID spanID, + SpanContext spanContext, Optional debugID, UseProvisionalProxies useProvisionalProxies, Version version) { - state Span span("NAPI:getKeyRangeLocations"_loc, spanID); + state Span span("NAPI:getKeyRangeLocations"_loc, spanContext); if (debugID.present()) g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getKeyLocations.Before"); @@ -3018,7 +3023,7 @@ Future> getKeyRangeLocations(Database const& c int limit, Reverse reverse, F StorageServerInterface::*member, - SpanID const& spanID, + SpanContext const& spanContext, Optional const& debugID, UseProvisionalProxies useProvisionalProxies, Version version) { @@ -3028,7 +3033,7 @@ Future> getKeyRangeLocations(Database const& c std::vector locations; if (!cx->getCachedLocations(tenant, keys, locations, limit, reverse)) { return getKeyRangeLocations_internal( - cx, tenant, keys, limit, reverse, spanID, debugID, useProvisionalProxies, version); + cx, tenant, keys, limit, reverse, spanContext, debugID, useProvisionalProxies, version); } bool foundFailed = false; @@ -3049,7 +3054,7 @@ Future> getKeyRangeLocations(Database const& c if (foundFailed) { // Refresh the cache with a new getKeyRangeLocations made to proxies. return getKeyRangeLocations_internal( - cx, tenant, keys, limit, reverse, spanID, debugID, useProvisionalProxies, version); + cx, tenant, keys, limit, reverse, spanContext, debugID, useProvisionalProxies, version); } return locations; @@ -3069,7 +3074,7 @@ Future> getKeyRangeLocations(ReferencespanID, + trState->spanContext, trState->debugID, trState->useProvisionalProxies, version); @@ -3098,7 +3103,7 @@ ACTOR Future warmRange_impl(Reference trState, KeyRange keys, CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT, Reverse::False, - trState->spanID, + trState->spanContext, trState->debugID, trState->useProvisionalProxies, version)); @@ -3129,38 +3134,35 @@ ACTOR Future warmRange_impl(Reference trState, KeyRange return Void(); } -SpanID generateSpanID(bool transactionTracingSample, SpanID parentContext = SpanID()) { - uint64_t txnId = deterministicRandom()->randomUInt64(); +SpanContext generateSpanID(bool transactionTracingSample, SpanContext parentContext = SpanContext()) { if (parentContext.isValid()) { - if (parentContext.first() > 0) { - txnId = parentContext.first(); - } - uint64_t tokenId = parentContext.second() > 0 ? deterministicRandom()->randomUInt64() : 0; - return SpanID(txnId, tokenId); - } else if (transactionTracingSample) { - uint64_t tokenId = deterministicRandom()->random01() <= FLOW_KNOBS->TRACING_SAMPLE_RATE - ? deterministicRandom()->randomUInt64() - : 0; - return SpanID(txnId, tokenId); - } else { - return SpanID(txnId, 0); + return SpanContext(parentContext.traceID, deterministicRandom()->randomUInt64(), parentContext.m_Flags); } + if (transactionTracingSample) { + return SpanContext(deterministicRandom()->randomUniqueID(), + deterministicRandom()->randomUInt64(), + deterministicRandom()->random01() <= FLOW_KNOBS->TRACING_SAMPLE_RATE + ? 
TraceFlags::sampled + : TraceFlags::unsampled); + } + return SpanContext( + deterministicRandom()->randomUniqueID(), deterministicRandom()->randomUInt64(), TraceFlags::unsampled); } TransactionState::TransactionState(Database cx, Optional tenant, TaskPriority taskID, - SpanID spanID, + SpanContext spanContext, Reference trLogInfo) - : cx(cx), trLogInfo(trLogInfo), options(cx), taskID(taskID), spanID(spanID), readVersionObtainedFromGrvProxy(true), - tenant_(tenant), tenantSet(tenant.present()) {} + : cx(cx), trLogInfo(trLogInfo), options(cx), taskID(taskID), spanContext(spanContext), + readVersionObtainedFromGrvProxy(true), tenant_(tenant), tenantSet(tenant.present()) {} Reference TransactionState::cloneAndReset(Reference newTrLogInfo, bool generateNewSpan) const { - SpanID newSpanID = generateNewSpan ? generateSpanID(cx->transactionTracingSample) : spanID; + SpanContext newSpanContext = generateNewSpan ? generateSpanID(cx->transactionTracingSample) : spanContext; Reference newState = - makeReference(cx, tenant_, cx->taskID, newSpanID, newTrLogInfo); + makeReference(cx, tenant_, cx->taskID, newSpanContext, newTrLogInfo); if (!cx->apiVersionAtLeast(16)) { newState->options = options; @@ -3218,12 +3220,12 @@ ACTOR Future> getValue(Reference trState, UseTenant useTenant, TransactionRecordLogInfo recordLogInfo) { state Version ver = wait(version); - state Span span("NAPI:getValue"_loc, trState->spanID); + state Span span("NAPI:getValue"_loc, trState->spanContext); if (useTenant && trState->tenant().present()) { - span.addTag("tenant"_sr, trState->tenant().get()); + span.addAttribute("tenant"_sr, trState->tenant().get()); } - span.addTag("key"_sr, key); + span.addAttribute("key"_sr, key); trState->cx->validateVersion(ver); loop { @@ -3349,7 +3351,7 @@ ACTOR Future getKey(Reference trState, wait(success(version)); state Optional getKeyID = Optional(); - state Span span("NAPI:getKey"_loc, trState->spanID); + state Span span("NAPI:getKey"_loc, trState->spanContext); if (trState->debugID.present()) { getKeyID = nondeterministicRandom()->randomUniqueID(); @@ -3448,8 +3450,8 @@ ACTOR Future getKey(Reference trState, } } -ACTOR Future waitForCommittedVersion(Database cx, Version version, SpanID spanContext) { - state Span span("NAPI:waitForCommittedVersion"_loc, { spanContext }); +ACTOR Future waitForCommittedVersion(Database cx, Version version, SpanContext spanContext) { + state Span span("NAPI:waitForCommittedVersion"_loc, spanContext); try { loop { choose { @@ -3483,14 +3485,14 @@ ACTOR Future waitForCommittedVersion(Database cx, Version version, Span } ACTOR Future getRawVersion(Reference trState) { - state Span span("NAPI:getRawVersion"_loc, { trState->spanID }); + state Span span("NAPI:getRawVersion"_loc, trState->spanContext); loop { choose { when(wait(trState->cx->onProxiesChanged())) {} when(GetReadVersionReply v = wait(basicLoadBalance(trState->cx->getGrvProxies(UseProvisionalProxies::False), &GrvProxyInterface::getConsistentReadVersion, - GetReadVersionRequest(trState->spanID, + GetReadVersionRequest(trState->spanContext, 0, TransactionPriority::IMMEDIATE, trState->cx->ssVersionVectorCache.getMaxVersion()), @@ -3512,7 +3514,7 @@ ACTOR Future readVersionBatcher( uint32_t flags); ACTOR Future watchValue(Database cx, Reference parameters) { - state Span span("NAPI:watchValue"_loc, parameters->spanID); + state Span span("NAPI:watchValue"_loc, parameters->spanContext); state Version ver = parameters->version; cx->validateVersion(parameters->version); ASSERT(parameters->version != latestVersion); 
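The rewritten generateSpanID above keeps three cases worth stating explicitly: a valid parent yields a child span in the same trace that inherits the parent's sampling flags; a root created with transactionTracingSample set rolls against FLOW_KNOBS->TRACING_SAMPLE_RATE to decide its sampled flag; everything else is an unsampled root. A quick sanity sketch of those invariants, written against the SpanContext fields this patch introduces (assertion-style illustration, not part of the patch):

    // Sketch: expected behavior of the new generateSpanID().
    SpanContext parent(UID(1, 2), 3, TraceFlags::sampled);
    SpanContext child = generateSpanID(/*transactionTracingSample=*/false, parent);
    ASSERT(child.traceID == parent.traceID); // child stays in the parent's trace
    ASSERT(child.isSampled()); // the parent's sampling decision is inherited
    ASSERT(child.spanID != parent.spanID); // fresh span id, with overwhelming probability

    SpanContext root = generateSpanID(/*transactionTracingSample=*/false);
    ASSERT(!root.isSampled()); // transactions without tracing stay unsampled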
@@ -3522,7 +3524,7 @@ ACTOR Future watchValue(Database cx, Reference p parameters->tenant.name, parameters->key, &StorageServerInterface::watchValue, - parameters->spanID, + parameters->spanContext, parameters->debugID, parameters->useProvisionalProxies, Reverse::False, @@ -3741,15 +3743,15 @@ ACTOR Future watchValueMap(Future version, Optional value, Database cx, TagSet tags, - SpanID spanID, + SpanContext spanContext, TaskPriority taskID, Optional debugID, UseProvisionalProxies useProvisionalProxies) { state Version ver = wait(version); - wait(getWatchFuture( - cx, - makeReference(tenant, key, value, ver, tags, spanID, taskID, debugID, useProvisionalProxies))); + wait(getWatchFuture(cx, + makeReference( + tenant, key, value, ver, tags, spanContext, taskID, debugID, useProvisionalProxies))); return Void(); } @@ -3795,10 +3797,11 @@ Future getExactRange(Reference trState, Reverse reverse, UseTenant useTenant) { state RangeResultFamily output; - state Span span("NAPI:getExactRange"_loc, trState->spanID); + // TODO - ljoswiak parent or link? + state Span span("NAPI:getExactRange"_loc, trState->spanContext); if (useTenant && trState->tenant().present()) { - span.addTag("tenant"_sr, trState->tenant().get()); + span.addAttribute("tenant"_sr, trState->tenant().get()); } // printf("getExactRange( '%s', '%s' )\n", keys.begin.toString().c_str(), keys.end.toString().c_str()); @@ -4155,9 +4158,9 @@ Future getRange(Reference trState, state KeySelector originalBegin = begin; state KeySelector originalEnd = end; state RangeResultFamily output; - state Span span("NAPI:getRange"_loc, trState->spanID); + state Span span("NAPI:getRange"_loc, trState->spanContext); if (useTenant && trState->tenant().present()) { - span.addTag("tenant"_sr, trState->tenant().get()); + span.addAttribute("tenant"_sr, trState->tenant().get()); } try { @@ -4631,7 +4634,7 @@ ACTOR Future getRangeStreamFragment(Reference trState, GetRangeLimits limits, Snapshot snapshot, Reverse reverse, - SpanID spanContext) { + SpanContext spanContext) { loop { state std::vector locations = wait(getKeyRangeLocations(trState, @@ -4924,7 +4927,7 @@ ACTOR Future getRangeStream(Reference trState, // FIXME: better handling to disable row limits ASSERT(!limits.hasRowLimit()); - state Span span("NAPI:getRangeStream"_loc, trState->spanID); + state Span span("NAPI:getRangeStream"_loc, trState->spanContext); state Version version = wait(fVersion); trState->cx->validateVersion(version); @@ -5047,7 +5050,7 @@ Transaction::Transaction(Database const& cx, Optional const& tenant) cx->taskID, generateSpanID(cx->transactionTracingSample), createTrLogInfoProbabilistically(cx))), - span(trState->spanID, "Transaction"_loc), backoff(CLIENT_KNOBS->DEFAULT_BACKOFF), tr(trState->spanID) { + span(trState->spanContext, "Transaction"_loc), backoff(CLIENT_KNOBS->DEFAULT_BACKOFF), tr(trState->spanContext) { if (DatabaseContext::debugUseTags) { debugAddTags(trState); } @@ -5182,7 +5185,7 @@ ACTOR Future watch(Reference watch, Database cx, Future tenant, TagSet tags, - SpanID spanID, + SpanContext spanContext, TaskPriority taskID, Optional debugID, UseProvisionalProxies useProvisionalProxies) { @@ -5210,7 +5213,7 @@ ACTOR Future watch(Reference watch, watch->value, cx, tags, - spanID, + spanContext, taskID, debugID, useProvisionalProxies); @@ -5243,7 +5246,7 @@ Future Transaction::watch(Reference watch) { populateAndGetTenant( trState, watch->key, readVersion.isValid() && readVersion.isReady() ? 
readVersion.get() : latestVersion), trState->options.readTags, - trState->spanID, + trState->spanContext, trState->taskID, trState->debugID, trState->useProvisionalProxies); @@ -5716,7 +5719,7 @@ void TransactionOptions::reset(Database const& cx) { void Transaction::resetImpl(bool generateNewSpan) { flushTrLogsIfEnabled(); trState = trState->cloneAndReset(createTrLogInfoProbabilistically(trState->cx), generateNewSpan); - tr = CommitTransactionRequest(trState->spanID); + tr = CommitTransactionRequest(trState->spanContext); readVersion = Future(); metadataVersion = Promise>(); extraConflictRanges.clear(); @@ -5731,7 +5734,7 @@ void Transaction::reset() { void Transaction::fullReset() { resetImpl(true); - span = Span(trState->spanID, "Transaction"_loc); + span = Span(trState->spanContext, "Transaction"_loc); backoff = CLIENT_KNOBS->DEFAULT_BACKOFF; } @@ -5852,8 +5855,8 @@ ACTOR void checkWrites(Reference trState, ACTOR static Future commitDummyTransaction(Reference trState, KeyRange range) { state Transaction tr(trState->cx); state int retries = 0; - state Span span("NAPI:dummyTransaction"_loc, trState->spanID); - tr.span.addParent(span.context); + state Span span("NAPI:dummyTransaction"_loc, trState->spanContext); + tr.span.setParent(span.context); loop { try { TraceEvent("CommitDummyTransaction").detail("Key", range.begin).detail("Retries", retries); @@ -5896,7 +5899,7 @@ void Transaction::setupWatches() { watches[i]->value, trState->cx, trState->options.readTags, - trState->spanID, + trState->spanContext, trState->taskID, trState->debugID, trState->useProvisionalProxies)); @@ -6019,7 +6022,7 @@ ACTOR static Future tryCommit(Reference trState, Future readVersion) { state TraceInterval interval("TransactionCommit"); state double startTime = now(); - state Span span("NAPI:tryCommit"_loc, trState->spanID); + state Span span("NAPI:tryCommit"_loc, trState->spanContext); state Optional debugID = trState->debugID; if (debugID.present()) { TraceEvent(interval.begin()).detail("Parent", debugID.get()); @@ -6509,10 +6512,11 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optional(value.get(), Unversioned())); + TEST(true); // Adding link in FDBTransactionOptions::SPAN_PARENT + span.setParent(BinaryReader::fromStringRef(value.get(), IncludeVersion())); break; case FDBTransactionOptions::REPORT_CONFLICTING_KEYS: @@ -6555,7 +6559,7 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optional getConsistentReadVersion(SpanID parentSpan, +ACTOR Future getConsistentReadVersion(SpanContext parentSpan, DatabaseContext* cx, uint32_t transactionCount, TransactionPriority priority, @@ -6670,7 +6674,7 @@ ACTOR Future readVersionBatcher(DatabaseContext* cx, } g_traceBatch.addAttach("TransactionAttachID", req.debugID.get().first(), debugID.get().first()); } - span.addParent(req.spanContext); + span.addLink(req.spanContext); requests.push_back(req.reply); for (auto tag : req.tags) { ++tags[tag]; @@ -6726,10 +6730,10 @@ ACTOR Future readVersionBatcher(DatabaseContext* cx, ACTOR Future extractReadVersion(Reference trState, Location location, - SpanID spanContext, + SpanContext spanContext, Future f, Promise> metadataVersion) { - state Span span(spanContext, location, { trState->spanID }); + state Span span(spanContext, location, trState->spanContext); GetReadVersionReply rep = wait(f); double replyTime = now(); double latency = replyTime - trState->startTime; @@ -6902,7 +6906,7 @@ Future Transaction::getReadVersion(uint32_t flags) { } Location location = "NAPI:getReadVersion"_loc; 
- UID spanContext = generateSpanID(trState->cx->transactionTracingSample, trState->spanID); + SpanContext spanContext = generateSpanID(trState->cx->transactionTracingSample, trState->spanContext); auto const req = DatabaseContext::VersionRequest(spanContext, trState->options.tags, trState->debugID); batcher.stream.send(req); trState->startTime = now(); @@ -7392,7 +7396,7 @@ ACTOR Future>> getRangeSplitPoints(ReferencespanID); + state Span span("NAPI:GetRangeSplitPoints"_loc, trState->spanContext); loop { state std::vector locations = @@ -7956,14 +7960,14 @@ Reference Transaction::createTrLogInfoProbabilistically(cons return Reference(); } -void Transaction::setTransactionID(uint64_t id) { +void Transaction::setTransactionID(UID id) { ASSERT(getSize() == 0); - trState->spanID = SpanID(id, trState->spanID.second()); + trState->spanContext = SpanContext(id, trState->spanContext.spanID); } void Transaction::setToken(uint64_t token) { ASSERT(getSize() == 0); - trState->spanID = SpanID(trState->spanID.first(), token); + trState->spanContext = SpanContext(trState->spanContext.traceID, token); } void enableClientInfoLogging() { diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index 988e64aaf1..fe4d578e77 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -243,7 +243,7 @@ struct TransactionState : ReferenceCounted { Optional debugID; TaskPriority taskID; - SpanID spanID; + SpanContext spanContext; UseProvisionalProxies useProvisionalProxies = UseProvisionalProxies::False; bool readVersionObtainedFromGrvProxy; @@ -259,13 +259,14 @@ struct TransactionState : ReferenceCounted { std::shared_ptr> conflictingKeys; // Only available so that Transaction can have a default constructor, for use in state variables - TransactionState(TaskPriority taskID, SpanID spanID) : taskID(taskID), spanID(spanID), tenantSet(false) {} + TransactionState(TaskPriority taskID, SpanContext spanContext) + : taskID(taskID), spanContext(spanContext), tenantSet(false) {} // VERSION_VECTOR changed default values of readVersionObtainedFromGrvProxy TransactionState(Database cx, Optional tenant, TaskPriority taskID, - SpanID spanID, + SpanContext spanContext, Reference trLogInfo); Reference cloneAndReset(Reference newTrLogInfo, bool generateNewSpan) const; @@ -435,7 +436,7 @@ public: void debugTransaction(UID dID) { trState->debugID = dID; } VersionVector getVersionVector() const; - UID getSpanID() const { return trState->spanID; } + SpanContext getSpanContext() const { return trState->spanContext; } Future commitMutations(); void setupWatches(); @@ -447,7 +448,7 @@ public: Database getDatabase() const { return trState->cx; } static Reference createTrLogInfoProbabilistically(const Database& cx); - void setTransactionID(uint64_t id); + void setTransactionID(UID id); void setToken(uint64_t token); const std::vector>>& getExtraReadConflictRanges() const { return extraConflictRanges; } @@ -490,7 +491,7 @@ private: Future committing; }; -ACTOR Future waitForCommittedVersion(Database cx, Version version, SpanID spanContext); +ACTOR Future waitForCommittedVersion(Database cx, Version version, SpanContext spanContext); ACTOR Future>> waitDataDistributionMetricsList(Database cx, KeyRange keys, int shardLimit); diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index c651adad32..6c3fe880c1 100644 --- a/fdbclient/ReadYourWrites.actor.cpp +++ b/fdbclient/ReadYourWrites.actor.cpp @@ -1979,7 +1979,7 @@ void ReadYourWritesTransaction::getWriteConflicts(KeyRangeMap* 
result) { } } -void ReadYourWritesTransaction::setTransactionID(uint64_t id) { +void ReadYourWritesTransaction::setTransactionID(UID id) { tr.setTransactionID(id); } diff --git a/fdbclient/ReadYourWrites.h b/fdbclient/ReadYourWrites.h index 341dc4e2a1..e67b5334f7 100644 --- a/fdbclient/ReadYourWrites.h +++ b/fdbclient/ReadYourWrites.h @@ -140,7 +140,7 @@ public: [[nodiscard]] Future commit() override; Version getCommittedVersion() const override { return tr.getCommittedVersion(); } VersionVector getVersionVector() const override { return tr.getVersionVector(); } - UID getSpanID() const override { return tr.getSpanID(); } + SpanContext getSpanContext() const override { return tr.getSpanContext(); } int64_t getApproximateSize() const override { return approximateSize; } [[nodiscard]] Future> getVersionstamp() override; @@ -177,7 +177,7 @@ public: Reference getTransactionState() const { return tr.trState; } - void setTransactionID(uint64_t id); + void setTransactionID(UID id); void setToken(uint64_t token); // Read from the special key space readConflictRangeKeysRange diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index f018c0fc2b..e24c1829ec 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -1595,10 +1595,10 @@ Future TracingOptionsImpl::getRange(ReadYourWritesTransaction* ryw, if (key.endsWith(kTracingTransactionIdKey)) { result.push_back_deep(result.arena(), - KeyValueRef(key, std::to_string(ryw->getTransactionState()->spanID.first()))); + KeyValueRef(key, ryw->getTransactionState()->spanContext.traceID.toString())); } else if (key.endsWith(kTracingTokenKey)) { result.push_back_deep(result.arena(), - KeyValueRef(key, std::to_string(ryw->getTransactionState()->spanID.second()))); + KeyValueRef(key, std::to_string(ryw->getTransactionState()->spanContext.spanID))); } } return result; @@ -1612,7 +1612,7 @@ void TracingOptionsImpl::set(ReadYourWritesTransaction* ryw, const KeyRef& key, } if (key.endsWith(kTracingTransactionIdKey)) { - ryw->setTransactionID(std::stoul(value.toString())); + ryw->setTransactionID(UID::fromString(value.toString())); } else if (key.endsWith(kTracingTokenKey)) { if (value.toString() == "true") { ryw->setToken(deterministicRandom()->randomUInt64()); diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h index 13ba8f1e18..cda6a32b66 100644 --- a/fdbclient/StorageServerInterface.h +++ b/fdbclient/StorageServerInterface.h @@ -35,6 +35,7 @@ #include "fdbclient/CommitTransaction.h" #include "fdbclient/TagThrottle.actor.h" #include "fdbclient/Tenant.h" +#include "flow/Tracing.h" #include "flow/UnitTest.h" #include "fdbclient/VersionVector.h" @@ -271,7 +272,7 @@ struct GetValueReply : public LoadBalancedReply { struct GetValueRequest : TimedRequest { constexpr static FileIdentifier file_identifier = 8454530; - SpanID spanContext; + SpanContext spanContext; TenantInfo tenantInfo; Key key; Version version; @@ -283,7 +284,7 @@ struct GetValueRequest : TimedRequest { // serve the given key GetValueRequest() {} - GetValueRequest(SpanID spanContext, + GetValueRequest(SpanContext spanContext, const TenantInfo& tenantInfo, const Key& key, Version ver, @@ -315,7 +316,7 @@ struct WatchValueReply { struct WatchValueRequest { constexpr static FileIdentifier file_identifier = 14747733; - SpanID spanContext; + SpanContext spanContext; TenantInfo tenantInfo; Key key; Optional value; @@ -326,7 +327,7 @@ struct WatchValueRequest { WatchValueRequest() {} - WatchValueRequest(SpanID 
spanContext, + WatchValueRequest(SpanContext spanContext, TenantInfo tenantInfo, const Key& key, Optional value, @@ -360,7 +361,7 @@ struct GetKeyValuesReply : public LoadBalancedReply { struct GetKeyValuesRequest : TimedRequest { constexpr static FileIdentifier file_identifier = 6795746; - SpanID spanContext; + SpanContext spanContext; Arena arena; TenantInfo tenantInfo; KeySelectorRef begin, end; @@ -418,7 +419,7 @@ struct GetMappedKeyValuesReply : public LoadBalancedReply { struct GetMappedKeyValuesRequest : TimedRequest { constexpr static FileIdentifier file_identifier = 6795747; - SpanID spanContext; + SpanContext spanContext; Arena arena; TenantInfo tenantInfo; KeySelectorRef begin, end; @@ -483,7 +484,7 @@ struct GetKeyValuesStreamReply : public ReplyPromiseStreamReply { struct GetKeyValuesStreamRequest { constexpr static FileIdentifier file_identifier = 6795746; - SpanID spanContext; + SpanContext spanContext; Arena arena; TenantInfo tenantInfo; KeySelectorRef begin, end; @@ -534,7 +535,7 @@ struct GetKeyReply : public LoadBalancedReply { struct GetKeyRequest : TimedRequest { constexpr static FileIdentifier file_identifier = 10457870; - SpanID spanContext; + SpanContext spanContext; Arena arena; TenantInfo tenantInfo; KeySelectorRef sel; @@ -548,7 +549,7 @@ struct GetKeyRequest : TimedRequest { GetKeyRequest() {} - GetKeyRequest(SpanID spanContext, + GetKeyRequest(SpanContext spanContext, TenantInfo tenantInfo, KeySelectorRef const& sel, Version version, @@ -835,7 +836,7 @@ struct ChangeFeedStreamReply : public ReplyPromiseStreamReply { struct ChangeFeedStreamRequest { constexpr static FileIdentifier file_identifier = 6795746; - SpanID spanContext; + SpanContext spanContext; Arena arena; Key rangeID; Version begin = 0; diff --git a/fdbclient/ThreadSafeTransaction.cpp b/fdbclient/ThreadSafeTransaction.cpp index 84ab49504b..c796f02536 100644 --- a/fdbclient/ThreadSafeTransaction.cpp +++ b/fdbclient/ThreadSafeTransaction.cpp @@ -465,8 +465,8 @@ VersionVector ThreadSafeTransaction::getVersionVector() { return tr->getVersionVector(); } -UID ThreadSafeTransaction::getSpanID() { - return tr->getSpanID(); +SpanContext ThreadSafeTransaction::getSpanContext() { + return tr->getSpanContext(); } ThreadFuture ThreadSafeTransaction::getApproximateSize() { diff --git a/fdbclient/ThreadSafeTransaction.h b/fdbclient/ThreadSafeTransaction.h index 0ace0a2cfe..a187bb2f45 100644 --- a/fdbclient/ThreadSafeTransaction.h +++ b/fdbclient/ThreadSafeTransaction.h @@ -167,7 +167,7 @@ public: ThreadFuture commit() override; Version getCommittedVersion() override; VersionVector getVersionVector() override; - UID getSpanID() override; + SpanContext getSpanContext() override; ThreadFuture getApproximateSize() override; ThreadFuture getProtocolVersion(); diff --git a/fdbclient/TransactionLineage.h b/fdbclient/TransactionLineage.h index 6eed26b805..04492db4ba 100644 --- a/fdbclient/TransactionLineage.h +++ b/fdbclient/TransactionLineage.h @@ -34,10 +34,13 @@ struct TransactionLineage : LineageProperties { GetKeyServersLocations }; static constexpr std::string_view name = "Transaction"sv; - uint64_t txID; + UID txID; Operation operation = Operation::Unset; bool isSet(uint64_t TransactionLineage::*member) const { return this->*member > 0; } + bool isSet(UID TransactionLineage::*member) const { + return static_cast(this->*member).first() > 0 && static_cast(this->*member).second() > 0; + } bool isSet(Operation TransactionLineage::*member) const { return this->*member != Operation::Unset; } }; diff --git 
a/fdbrpc/FlowTests.actor.cpp b/fdbrpc/FlowTests.actor.cpp index 5c592ae0cb..a79d13dd02 100644 --- a/fdbrpc/FlowTests.actor.cpp +++ b/fdbrpc/FlowTests.actor.cpp @@ -20,6 +20,7 @@ // Unit tests for the flow language and libraries +#include "flow/Arena.h" #include "flow/ProtocolVersion.h" #include "flow/UnitTest.h" #include "flow/DeterministicRandom.h" diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 898f4f9204..ad737d3be4 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -19,6 +19,7 @@ */ #include "fdbrpc/FlowTransport.h" +#include "flow/Arena.h" #include "flow/network.h" #include @@ -278,6 +279,33 @@ struct UnauthorizedEndpointReceiver final : NetworkMessageReceiver { bool isPublic() const override { return true; } }; +// NetworkAddressCachedString retains a cached Standalone of +// a NetworkAddressList.address.toString() value. This cached value is useful +// for features in the hot path (i.e. Tracing), which need the String formatted value +// frequently and do not wish to pay the formatting cost. If the underlying NetworkAddressList +// needs to change, do not attempt to update it directly, use the setNetworkAddress API as it +// will ensure the new toString() cached value is updated. +class NetworkAddressCachedString { +public: + NetworkAddressCachedString() { setAddressList(NetworkAddressList()); } + NetworkAddressCachedString(NetworkAddressList const& list) { setAddressList(list); } + NetworkAddressList const& getAddressList() const { return addressList; } + void setAddressList(NetworkAddressList const& list) { + cachedStr = Standalone(StringRef(list.address.toString())); + addressList = list; + } + void setNetworkAddress(NetworkAddress const& addr) { + addressList.address = addr; + setAddressList(addressList); // force the recaching of the string. + } + Standalone getLocalAddressAsString() const { return cachedStr; } + operator NetworkAddressList const&() { return addressList; } + +private: + NetworkAddressList addressList; + Standalone cachedStr; +}; + class TransportData { public: TransportData(uint64_t transportId, int maxWellKnownEndpoints, IPAllowList const* allowList); @@ -299,7 +327,7 @@ public: // Returns true if given network address 'address' is one of the address we are listening on. 
bool isLocalAddress(const NetworkAddress& address) const; - NetworkAddressList localAddresses; + NetworkAddressCachedString localAddresses; std::vector> listeners; std::unordered_map> peers; std::unordered_map> closedPeers; @@ -877,12 +905,12 @@ void Peer::send(PacketBuffer* pb, ReliablePacket* rp, bool firstUnsent) { void Peer::prependConnectPacket() { // Send the ConnectPacket expected at the beginning of a new connection ConnectPacket pkt; - if (transport->localAddresses.address.isTLS() == destination.isTLS()) { - pkt.canonicalRemotePort = transport->localAddresses.address.port; - pkt.setCanonicalRemoteIp(transport->localAddresses.address.ip); - } else if (transport->localAddresses.secondaryAddress.present()) { - pkt.canonicalRemotePort = transport->localAddresses.secondaryAddress.get().port; - pkt.setCanonicalRemoteIp(transport->localAddresses.secondaryAddress.get().ip); + if (transport->localAddresses.getAddressList().address.isTLS() == destination.isTLS()) { + pkt.canonicalRemotePort = transport->localAddresses.getAddressList().address.port; + pkt.setCanonicalRemoteIp(transport->localAddresses.getAddressList().address.ip); + } else if (transport->localAddresses.getAddressList().secondaryAddress.present()) { + pkt.canonicalRemotePort = transport->localAddresses.getAddressList().secondaryAddress.get().port; + pkt.setCanonicalRemoteIp(transport->localAddresses.getAddressList().secondaryAddress.get().ip); } else { // a "mixed" TLS/non-TLS connection is like a client/server connection - there's no way to reverse it pkt.canonicalRemotePort = 0; @@ -919,10 +947,10 @@ void Peer::onIncomingConnection(Reference self, Reference con ++self->connectIncomingCount; if (!destination.isPublic() && !outgoingConnectionIdle) throw address_in_use(); - NetworkAddress compatibleAddr = transport->localAddresses.address; - if (transport->localAddresses.secondaryAddress.present() && - transport->localAddresses.secondaryAddress.get().isTLS() == destination.isTLS()) { - compatibleAddr = transport->localAddresses.secondaryAddress.get(); + NetworkAddress compatibleAddr = transport->localAddresses.getAddressList().address; + if (transport->localAddresses.getAddressList().secondaryAddress.present() && + transport->localAddresses.getAddressList().secondaryAddress.get().isTLS() == destination.isTLS()) { + compatibleAddr = transport->localAddresses.getAddressList().secondaryAddress.get(); } if (!destination.isPublic() || outgoingConnectionIdle || destination > compatibleAddr || @@ -1455,10 +1483,10 @@ ACTOR static Future listen(TransportData* self, NetworkAddress listenAddr) state ActorCollectionNoErrors incoming; // Actors monitoring incoming connections that haven't yet been associated with a peer state Reference listener = INetworkConnections::net()->listen(listenAddr); - if (!g_network->isSimulated() && self->localAddresses.address.port == 0) { + if (!g_network->isSimulated() && self->localAddresses.getAddressList().address.port == 0) { TraceEvent(SevInfo, "UpdatingListenAddress") .detail("AssignedListenAddress", listener->getListenAddress().toString()); - self->localAddresses.address = listener->getListenAddress(); + self->localAddresses.setNetworkAddress(listener->getListenAddress()); } state uint64_t connectionCount = 0; try { @@ -1507,8 +1535,9 @@ Reference TransportData::getOrOpenPeer(NetworkAddress const& address, bool } bool TransportData::isLocalAddress(const NetworkAddress& address) const { - return address == localAddresses.address || - (localAddresses.secondaryAddress.present() && address == 
localAddresses.secondaryAddress.get()); + return address == localAddresses.getAddressList().address || + (localAddresses.getAddressList().secondaryAddress.present() && + address == localAddresses.getAddressList().secondaryAddress.get()); } ACTOR static Future multiVersionCleanupWorker(TransportData* self) { @@ -1554,15 +1583,21 @@ void FlowTransport::initMetrics() { } NetworkAddressList FlowTransport::getLocalAddresses() const { - return self->localAddresses; + return self->localAddresses.getAddressList(); } NetworkAddress FlowTransport::getLocalAddress() const { - return self->localAddresses.address; + return self->localAddresses.getAddressList().address; +} + +Standalone FlowTransport::getLocalAddressAsString() const { + return self->localAddresses.getLocalAddressAsString(); } void FlowTransport::setLocalAddress(NetworkAddress const& address) { - self->localAddresses.address = address; + auto newAddress = self->localAddresses.getAddressList(); + newAddress.address = address; + self->localAddresses.setAddressList(newAddress); } const std::unordered_map>& FlowTransport::getAllPeers() const { @@ -1586,11 +1621,14 @@ Future FlowTransport::onIncompatibleChanged() { Future FlowTransport::bind(NetworkAddress publicAddress, NetworkAddress listenAddress) { ASSERT(publicAddress.isPublic()); - if (self->localAddresses.address == NetworkAddress()) { - self->localAddresses.address = publicAddress; + if (self->localAddresses.getAddressList().address == NetworkAddress()) { + self->localAddresses.setNetworkAddress(publicAddress); } else { - self->localAddresses.secondaryAddress = publicAddress; + auto addrList = self->localAddresses.getAddressList(); + addrList.secondaryAddress = publicAddress; + self->localAddresses.setAddressList(addrList); } + // reformatLocalAddress() TraceEvent("Binding").detail("PublicAddress", publicAddress).detail("ListenAddress", listenAddress); Future listenF = listen(self, listenAddress); @@ -1641,7 +1679,7 @@ void FlowTransport::removePeerReference(const Endpoint& endpoint, bool isStream) void FlowTransport::addEndpoint(Endpoint& endpoint, NetworkMessageReceiver* receiver, TaskPriority taskID) { endpoint.token = deterministicRandom()->randomUniqueID(); if (receiver->isStream()) { - endpoint.addresses = self->localAddresses; + endpoint.addresses = self->localAddresses.getAddressList(); endpoint.token = UID(endpoint.token.first() | TOKEN_STREAM_FLAG, endpoint.token.second()); } else { endpoint.addresses = NetworkAddressList(); @@ -1651,7 +1689,7 @@ void FlowTransport::addEndpoint(Endpoint& endpoint, NetworkMessageReceiver* rece } void FlowTransport::addEndpoints(std::vector> const& streams) { - self->endpoints.insert(self->localAddresses, streams); + self->endpoints.insert(self->localAddresses.getAddressList(), streams); } void FlowTransport::removeEndpoint(const Endpoint& endpoint, NetworkMessageReceiver* receiver) { @@ -1659,7 +1697,7 @@ void FlowTransport::removeEndpoint(const Endpoint& endpoint, NetworkMessageRecei } void FlowTransport::addWellKnownEndpoint(Endpoint& endpoint, NetworkMessageReceiver* receiver, TaskPriority taskID) { - endpoint.addresses = self->localAddresses; + endpoint.addresses = self->localAddresses.getAddressList(); ASSERT(receiver->isStream()); self->endpoints.insertWellKnown(receiver, endpoint.token, taskID); } diff --git a/fdbrpc/FlowTransport.h b/fdbrpc/FlowTransport.h index 8f60a2fc9b..ceaf3e6f35 100644 --- a/fdbrpc/FlowTransport.h +++ b/fdbrpc/FlowTransport.h @@ -20,6 +20,7 @@ #ifndef FLOW_TRANSPORT_H #define FLOW_TRANSPORT_H +#include 
"flow/Arena.h" #pragma once #include @@ -215,6 +216,10 @@ public: // Returns first local NetworkAddress. NetworkAddress getLocalAddress() const; + // Returns first local NetworkAddress as std::string. Caches value + // to avoid unnecessary calls to toString() and fmt overhead. + Standalone getLocalAddressAsString() const; + // Returns first local NetworkAddress. void setLocalAddress(NetworkAddress const&); diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index b991b64a10..065a35d110 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -24,6 +24,7 @@ #include "contrib/fmt-8.1.1/include/fmt/format.h" #include "fdbrpc/simulator.h" +#include "flow/Arena.h" #define BOOST_SYSTEM_NO_LIB #define BOOST_DATE_TIME_NO_LIB #define BOOST_REGEX_NO_LIB diff --git a/fdbserver/ApplyMetadataMutation.cpp b/fdbserver/ApplyMetadataMutation.cpp index 90f987021f..290e83efd4 100644 --- a/fdbserver/ApplyMetadataMutation.cpp +++ b/fdbserver/ApplyMetadataMutation.cpp @@ -53,7 +53,7 @@ namespace { class ApplyMetadataMutationsImpl { public: - ApplyMetadataMutationsImpl(const SpanID& spanContext_, + ApplyMetadataMutationsImpl(const SpanContext& spanContext_, const UID& dbgid_, Arena& arena_, const VectorRef& mutations_, @@ -61,7 +61,7 @@ public: : spanContext(spanContext_), dbgid(dbgid_), arena(arena_), mutations(mutations_), txnStateStore(txnStateStore_), confChange(dummyConfChange) {} - ApplyMetadataMutationsImpl(const SpanID& spanContext_, + ApplyMetadataMutationsImpl(const SpanContext& spanContext_, Arena& arena_, const VectorRef& mutations_, ProxyCommitData& proxyCommitData_, @@ -82,7 +82,7 @@ public: tssMapping(&proxyCommitData_.tssMapping), tenantMap(&proxyCommitData_.tenantMap), initialCommit(initialCommit_) {} - ApplyMetadataMutationsImpl(const SpanID& spanContext_, + ApplyMetadataMutationsImpl(const SpanContext& spanContext_, ResolverData& resolverData_, const VectorRef& mutations_) : spanContext(spanContext_), dbgid(resolverData_.dbgid), arena(resolverData_.arena), mutations(mutations_), @@ -94,7 +94,7 @@ public: private: // The following variables are incoming parameters - const SpanID& spanContext; + const SpanContext& spanContext; const UID& dbgid; @@ -1217,7 +1217,7 @@ public: } // anonymous namespace -void applyMetadataMutations(SpanID const& spanContext, +void applyMetadataMutations(SpanContext const& spanContext, ProxyCommitData& proxyCommitData, Arena& arena, Reference logSystem, @@ -1241,13 +1241,13 @@ void applyMetadataMutations(SpanID const& spanContext, .apply(); } -void applyMetadataMutations(SpanID const& spanContext, +void applyMetadataMutations(SpanContext const& spanContext, ResolverData& resolverData, const VectorRef& mutations) { ApplyMetadataMutationsImpl(spanContext, resolverData, mutations).apply(); } -void applyMetadataMutations(SpanID const& spanContext, +void applyMetadataMutations(SpanContext const& spanContext, const UID& dbgid, Arena& arena, const VectorRef& mutations, diff --git a/fdbserver/ApplyMetadataMutation.h b/fdbserver/ApplyMetadataMutation.h index d4e47e0946..23f9e3a2f9 100644 --- a/fdbserver/ApplyMetadataMutation.h +++ b/fdbserver/ApplyMetadataMutation.h @@ -87,7 +87,7 @@ Reference getStorageInfo(UID id, std::map>* storageCache, IKeyValueStore* txnStateStore); -void applyMetadataMutations(SpanID const& spanContext, +void applyMetadataMutations(SpanContext const& spanContext, ProxyCommitData& proxyCommitData, Arena& arena, Reference logSystem, @@ -97,7 +97,7 @@ void applyMetadataMutations(SpanID const& spanContext, Version version, Version 
popVersion, bool initialCommit); -void applyMetadataMutations(SpanID const& spanContext, +void applyMetadataMutations(SpanContext const& spanContext, const UID& dbgid, Arena& arena, const VectorRef& mutations, @@ -140,7 +140,7 @@ inline bool containsMetadataMutation(const VectorRef& mutations) { } // Resolver's version -void applyMetadataMutations(SpanID const& spanContext, +void applyMetadataMutations(SpanContext const& spanContext, ResolverData& resolverData, const VectorRef& mutations); diff --git a/fdbserver/BackupWorker.actor.cpp b/fdbserver/BackupWorker.actor.cpp index 0ac5b56a7d..8addd89f05 100644 --- a/fdbserver/BackupWorker.actor.cpp +++ b/fdbserver/BackupWorker.actor.cpp @@ -67,6 +67,10 @@ struct VersionedMessage { return false; if (reader.protocolVersion().hasSpanContext() && SpanContextMessage::isNextIn(reader)) return false; + if (reader.protocolVersion().hasOTELSpanContext() && OTELSpanContextMessage::isNextIn(reader)) { + TEST(true); // Returning false for OTELSpanContextMessage + return false; + } reader >> *m; return normalKeys.contains(m->param1) || m->param1 == metadataVersionKey; diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index 01b3cd343e..f6d56ebe41 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -88,6 +88,7 @@ set(FDBSERVER_SRCS OldTLogServer_4_6.actor.cpp OldTLogServer_6_0.actor.cpp OldTLogServer_6_2.actor.cpp + OTELSpanContextMessage.h OnDemandStore.actor.cpp OnDemandStore.h PaxosConfigConsumer.actor.cpp diff --git a/fdbserver/ClusterRecovery.actor.cpp b/fdbserver/ClusterRecovery.actor.cpp index 94d122ad40..a0f5c22875 100644 --- a/fdbserver/ClusterRecovery.actor.cpp +++ b/fdbserver/ClusterRecovery.actor.cpp @@ -1629,7 +1629,7 @@ ACTOR Future clusterRecoveryCore(Reference self) { tr.set(recoveryCommitRequest.arena, clusterIdKey, BinaryWriter::toValue(self->clusterId, Unversioned())); } - applyMetadataMutations(SpanID(), + applyMetadataMutations(SpanContext(), self->dbgid, recoveryCommitRequest.arena, tr.mutations.slice(mmApplied, tr.mutations.size()), diff --git a/fdbserver/CommitProxyServer.actor.cpp b/fdbserver/CommitProxyServer.actor.cpp index 5c8449a332..ca2ffbf3d5 100644 --- a/fdbserver/CommitProxyServer.actor.cpp +++ b/fdbserver/CommitProxyServer.actor.cpp @@ -464,7 +464,7 @@ ACTOR Future addBackupMutations(ProxyCommitData* self, state int yieldBytes = 0; state BinaryWriter valueWriter(Unversioned()); - toCommit->addTransactionInfo(SpanID()); + toCommit->addTransactionInfo(SpanContext()); // Serialize the log range mutations within the map for (; logRangeMutation != logRangeMutations->cend(); ++logRangeMutation) { @@ -731,7 +731,7 @@ void CommitBatchContext::setupTraceBatch() { g_traceBatch.addAttach("CommitAttachID", tr.debugID.get().first(), debugID.get().first()); } - span.addParent(tr.spanContext); + span.addLink(tr.spanContext); } if (debugID.present()) { @@ -960,7 +960,7 @@ void applyMetadataEffect(CommitBatchContext* self) { committed = committed && self->resolution[resolver].stateMutations[versionIndex][transactionIndex].committed; if (committed) { - applyMetadataMutations(SpanID(), + applyMetadataMutations(SpanContext(), *self->pProxyCommitData, self->arena, self->pProxyCommitData->logSystem, @@ -1380,8 +1380,7 @@ ACTOR Future postResolution(CommitBatchContext* self) { // simulation TEST(true); // Semi-committed pipeline limited by MVCC window //TraceEvent("ProxyWaitingForCommitted", pProxyCommitData->dbgid).detail("CommittedVersion", pProxyCommitData->committedVersion.get()).detail("NeedToCommit", 
commitVersion); - waitVersionSpan = Span( - deterministicRandom()->randomUniqueID(), "MP:overMaxReadTransactionLifeVersions"_loc, { span.context }); + waitVersionSpan = Span("MP:overMaxReadTransactionLifeVersions"_loc, span.context); choose { when(wait(pProxyCommitData->committedVersion.whenAtLeast( self->commitVersion - SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS))) { @@ -1777,7 +1776,7 @@ void addTagMapping(GetKeyServerLocationsReply& reply, ProxyCommitData* commitDat ACTOR static Future doKeyServerLocationRequest(GetKeyServerLocationsRequest req, ProxyCommitData* commitData) { // We can't respond to these requests until we have valid txnStateStore getCurrentLineage()->modify(&TransactionLineage::operation) = TransactionLineage::Operation::GetKeyServersLocations; - getCurrentLineage()->modify(&TransactionLineage::txID) = req.spanContext.first(); + getCurrentLineage()->modify(&TransactionLineage::txID) = req.spanContext.traceID; wait(commitData->validState.getFuture()); wait(delay(0, TaskPriority::DefaultEndpoint)); @@ -2297,7 +2296,7 @@ ACTOR Future processCompleteTransactionStateRequest(TransactionStateResolv Arena arena; bool confChanges; - applyMetadataMutations(SpanID(), + applyMetadataMutations(SpanContext(), *pContext->pCommitData, arena, Reference(), diff --git a/fdbserver/GrvProxyServer.actor.cpp b/fdbserver/GrvProxyServer.actor.cpp index 6d0127c431..30ad98bcf1 100644 --- a/fdbserver/GrvProxyServer.actor.cpp +++ b/fdbserver/GrvProxyServer.actor.cpp @@ -542,7 +542,7 @@ ACTOR Future lastCommitUpdater(GrvProxyData* self, PromiseStream getLiveCommittedVersion(SpanID parentSpan, +ACTOR Future getLiveCommittedVersion(SpanContext parentSpan, GrvProxyData* grvProxyData, uint32_t flags, Optional debugID, @@ -945,7 +945,7 @@ ACTOR static Future transactionStarter(GrvProxyInterface proxy, int batchGRVProcessed = 0; for (int i = 0; i < start.size(); i++) { if (start[i].size()) { - Future readVersionReply = getLiveCommittedVersion(UID() /*span.context*/, + Future readVersionReply = getLiveCommittedVersion(SpanContext(), grvProxyData, i, debugID, diff --git a/fdbserver/LogSystem.cpp b/fdbserver/LogSystem.cpp index 1e1189facb..ab8f43cfc5 100644 --- a/fdbserver/LogSystem.cpp +++ b/fdbserver/LogSystem.cpp @@ -19,6 +19,9 @@ */ #include "fdbserver/LogSystem.h" +#include "fdbclient/FDBTypes.h" +#include "fdbserver/OTELSpanContextMessage.h" +#include "fdbserver/SpanContextMessage.h" #include "flow/serialize.h" std::string LogSet::logRouterString() { @@ -277,8 +280,8 @@ void LogPushData::addTxsTag() { } } -void LogPushData::addTransactionInfo(SpanID const& context) { - TEST(!spanContext.isValid()); // addTransactionInfo with invalid SpanID +void LogPushData::addTransactionInfo(SpanContext const& context) { + TEST(!spanContext.isValid()); // addTransactionInfo with invalid SpanContext spanContext = context; writtenLocations.clear(); } @@ -344,13 +347,33 @@ bool LogPushData::writeTransactionInfo(int location, uint32_t subseq) { writtenLocations.insert(location); BinaryWriter& wr = messagesWriter[location]; - SpanContextMessage contextMessage(spanContext); - int offset = wr.getLength(); wr << uint32_t(0) << subseq << uint16_t(prev_tags.size()); for (auto& tag : prev_tags) wr << tag; - wr << contextMessage; + if (logSystem->getTLogVersion() >= TLogVersion::V7) { + OTELSpanContextMessage contextMessage(spanContext); + wr << contextMessage; + } else { + // When we're on a TLog version below 7, but the front end of the system (i.e. 
proxy, sequencer, resolver) + // is using OpenTelemetry tracing (i.e. on or above 7.2), we need to convert the OpenTelemetry Span data model + // i.e. 16 bytes for traceId, 8 bytes for spanId, to the OpenTracing spec, which is 8 bytes for traceId + // and 8 bytes for spanId. That means we need to drop some data. + // + // As a workaround for this special case, what we've decided to drop is the 8 bytes + // for spanId. Therefore we're passing along the full 16-byte traceId to the storage server with 0 for spanID. + // This will result in a follows-from relationship for the storage span within the trace rather than a + // parent->child. + SpanContextMessage contextMessage; + if (spanContext.isSampled()) { + TEST(true); // Converting OTELSpanContextMessage to traced SpanContextMessage + contextMessage = SpanContextMessage(UID(spanContext.traceID.first(), spanContext.traceID.second())); + } else { + TEST(true); // Converting OTELSpanContextMessage to untraced SpanContextMessage + contextMessage = SpanContextMessage(UID(0, 0)); + } + wr << contextMessage; + } int length = wr.getLength() - offset; *(uint32_t*)((uint8_t*)wr.getData() + offset) = length - sizeof(uint32_t); return true; diff --git a/fdbserver/LogSystem.h b/fdbserver/LogSystem.h index e8453184e4..6581457c25 100644 --- a/fdbserver/LogSystem.h +++ b/fdbserver/LogSystem.h @@ -26,6 +26,7 @@ #include #include "fdbserver/SpanContextMessage.h" +#include "fdbserver/OTELSpanContextMessage.h" #include "fdbserver/TLogInterface.h" #include "fdbserver/WorkerInterface.actor.h" #include "fdbclient/DatabaseConfiguration.h" @@ -519,7 +520,7 @@ struct ILogSystem { Version knownCommittedVersion, Version minKnownCommittedVersion, LogPushData& data, - SpanID const& spanContext, + SpanContext const& spanContext, Optional debugID = Optional(), Optional> tpcvMap = Optional>()) = 0; @@ -762,7 +763,7 @@ struct LogPushData : NonCopyable { } // Add transaction info to be written before the first mutation in the transaction. - void addTransactionInfo(SpanID const& context); + void addTransactionInfo(SpanContext const& context); // copy written_tags, after filtering, into given set void saveTags(std::set& filteredTags) const { @@ -832,7 +833,7 @@ private: // field. std::unordered_set writtenLocations; uint32_t subsequence; - SpanID spanContext; + SpanContext spanContext; bool shardChanged = false; // if keyServers has any changes, i.e., shard boundary modifications.
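To make the downgrade described in the comment above easier to follow outside the diff, here is a minimal standalone sketch of the same mapping, assuming only the SpanContext, UID, and SpanContextMessage types that appear in this patch; the helper name is illustrative, not part of the patch:

    // Keep the 16-byte traceID, drop the 8-byte spanID, and encode the
    // sampled flag as an all-zero vs. non-zero legacy UID.
    SpanContextMessage downgradeToOpenTracing(const SpanContext& otel) {
        if (otel.isSampled()) {
            return SpanContextMessage(UID(otel.traceID.first(), otel.traceID.second()));
        }
        return SpanContextMessage(UID(0, 0)); // untraced
    }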
// Writes transaction info to the message stream at the given location if diff --git a/fdbserver/MasterInterface.h b/fdbserver/MasterInterface.h index 73fc6ef114..f9c2c506ad 100644 --- a/fdbserver/MasterInterface.h +++ b/fdbserver/MasterInterface.h @@ -133,14 +133,14 @@ struct GetCommitVersionReply { struct GetCommitVersionRequest { constexpr static FileIdentifier file_identifier = 16683181; - SpanID spanContext; + SpanContext spanContext; uint64_t requestNum; uint64_t mostRecentProcessedRequestNum; UID requestingProxy; ReplyPromise reply; GetCommitVersionRequest() {} - GetCommitVersionRequest(SpanID spanContext, + GetCommitVersionRequest(SpanContext spanContext, uint64_t requestNum, uint64_t mostRecentProcessedRequestNum, UID requestingProxy) diff --git a/fdbserver/MutationTracking.cpp b/fdbserver/MutationTracking.cpp index 9ec17299d5..fd8f55c313 100644 --- a/fdbserver/MutationTracking.cpp +++ b/fdbserver/MutationTracking.cpp @@ -24,6 +24,7 @@ #include "fdbserver/MutationTracking.h" #include "fdbserver/LogProtocolMessage.h" #include "fdbserver/SpanContextMessage.h" +#include "fdbserver/OTELSpanContextMessage.h" #include "fdbclient/SystemData.h" #if defined(FDB_CLEAN_BUILD) && MUTATION_TRACKING_ENABLED #error "You cannot use mutation tracking in a clean/release build." @@ -96,6 +97,11 @@ TraceEvent debugTagsAndMessageEnabled(const char* context, Version version, Stri BinaryReader br(mutationData, AssumeVersion(rdr.protocolVersion())); SpanContextMessage scm; br >> scm; + } else if (OTELSpanContextMessage::startsOTELSpanContextMessage(mutationType)) { + TEST(true); // MutationTracking reading OTELSpanContextMessage + BinaryReader br(mutationData, AssumeVersion(rdr.protocolVersion())); + OTELSpanContextMessage scm; + br >> scm; } else { MutationRef m; BinaryReader br(mutationData, AssumeVersion(rdr.protocolVersion())); diff --git a/fdbserver/OTELSpanContextMessage.h b/fdbserver/OTELSpanContextMessage.h new file mode 100644 index 0000000000..9f6d588fed --- /dev/null +++ b/fdbserver/OTELSpanContextMessage.h @@ -0,0 +1,66 @@ +/* + * OTELSpanContextMessage.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FDBSERVER_OTELSPANCONTEXTMESSAGE_H +#define FDBSERVER_OTELSPANCONTEXTMESSAGE_H +#pragma once + +#include "flow/Tracing.h" +#include "fdbclient/FDBTypes.h" +#include "fdbclient/CommitTransaction.h" + +struct OTELSpanContextMessage { + // This message is pushed into the transaction logs' memory to inform + // them which transaction subsequent mutations were a part of. This allows + // transaction logs and storage servers to associate mutations with a + // transaction identifier, called a span context. + // + // This message is similar to LogProtocolMessage. Storage servers read the + // first byte of this message to uniquely identify it, meaning it will + // never be mistaken for another message.
See LogProtocolMessage.h for more + // information. + + SpanContext spanContext; + + OTELSpanContextMessage() {} + OTELSpanContextMessage(SpanContext const& spanContext) : spanContext(spanContext) {} + + std::string toString() const { + return format("code: %d, span context: %s", + MutationRef::Reserved_For_OTELSpanContextMessage, + spanContext.toString().c_str()); + } + + template + void serialize(Ar& ar) { + uint8_t poly = MutationRef::Reserved_For_OTELSpanContextMessage; + serializer(ar, poly, spanContext); + } + + static bool startsOTELSpanContextMessage(uint8_t byte) { + return byte == MutationRef::Reserved_For_OTELSpanContextMessage; + } + template + static bool isNextIn(Ar& ar) { + return startsOTELSpanContextMessage(*(const uint8_t*)ar.peekBytes(1)); + } +}; + +#endif diff --git a/fdbserver/Resolver.actor.cpp b/fdbserver/Resolver.actor.cpp index d24a0401b5..91449c6923 100644 --- a/fdbserver/Resolver.actor.cpp +++ b/fdbserver/Resolver.actor.cpp @@ -340,8 +340,8 @@ ACTOR Future resolveBatch(Reference self, ResolveTransactionBatc // The condition here must match CommitBatch::applyMetadataToCommittedTransactions() if (reply.committed[t] == ConflictBatch::TransactionCommitted && !self->forceRecovery && SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS && (!isLocked || req.transactions[t].lock_aware)) { - SpanID spanContext = - req.transactions[t].spanContext.present() ? req.transactions[t].spanContext.get() : SpanID(); + SpanContext spanContext = + req.transactions[t].spanContext.present() ? req.transactions[t].spanContext.get() : SpanContext(); applyMetadataMutations(spanContext, resolverData, req.transactions[t].mutations); } @@ -565,7 +565,7 @@ ACTOR Future processCompleteTransactionStateRequest(TransactionStateResolv ResolverData resolverData( pContext->pResolverData->dbgid, pContext->pTxnStateStore, &pContext->pResolverData->keyInfo, confChanges); - applyMetadataMutations(SpanID(), resolverData, mutations); + applyMetadataMutations(SpanContext(), resolverData, mutations); } // loop auto lockedKey = pContext->pTxnStateStore->readValue(databaseLockedKey).get(); diff --git a/fdbserver/ResolverInterface.h b/fdbserver/ResolverInterface.h index 782fa2be88..51110e5c01 100644 --- a/fdbserver/ResolverInterface.h +++ b/fdbserver/ResolverInterface.h @@ -118,7 +118,7 @@ struct ResolveTransactionBatchRequest { constexpr static FileIdentifier file_identifier = 16462858; Arena arena; - SpanID spanContext; + SpanContext spanContext; Version prevVersion; Version version; // FIXME: ? Version lastReceivedVersion; diff --git a/fdbserver/StorageCache.actor.cpp b/fdbserver/StorageCache.actor.cpp index a97931a2a8..8cf24a67d8 100644 --- a/fdbserver/StorageCache.actor.cpp +++ b/fdbserver/StorageCache.actor.cpp @@ -18,6 +18,7 @@ * limitations under the License. 
*/ +#include "fdbserver/OTELSpanContextMessage.h" #include "flow/Arena.h" #include "fdbclient/FDBOptions.g.h" #include "fdbclient/NativeAPI.actor.h" @@ -1897,6 +1898,10 @@ ACTOR Future pullAsyncData(StorageCacheData* data) { SpanContextMessage::isNextIn(cloneReader)) { SpanContextMessage scm; cloneReader >> scm; + } else if (cloneReader.protocolVersion().hasOTELSpanContext() && + OTELSpanContextMessage::isNextIn(cloneReader)) { + OTELSpanContextMessage scm; + cloneReader >> scm; } else { MutationRef msg; cloneReader >> msg; @@ -1975,6 +1980,10 @@ ACTOR Future pullAsyncData(StorageCacheData* data) { } else if (reader.protocolVersion().hasSpanContext() && SpanContextMessage::isNextIn(reader)) { SpanContextMessage scm; reader >> scm; + } else if (reader.protocolVersion().hasOTELSpanContext() && OTELSpanContextMessage::isNextIn(reader)) { + TEST(true); // StorageCache reading OTELSpanContextMessage + OTELSpanContextMessage oscm; + reader >> oscm; } else { MutationRef msg; reader >> msg; diff --git a/fdbserver/TLogInterface.h b/fdbserver/TLogInterface.h index b8ec6899d2..9da4ecedd4 100644 --- a/fdbserver/TLogInterface.h +++ b/fdbserver/TLogInterface.h @@ -296,7 +296,7 @@ struct TLogCommitReply { struct TLogCommitRequest { constexpr static FileIdentifier file_identifier = 4022206; - SpanID spanContext; + SpanContext spanContext; Arena arena; Version prevVersion, version, knownCommittedVersion, minKnownCommittedVersion; @@ -307,7 +307,7 @@ struct TLogCommitRequest { Optional debugID; TLogCommitRequest() {} - TLogCommitRequest(const SpanID& context, + TLogCommitRequest(const SpanContext& context, const Arena& a, Version prevVersion, Version version, diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index a4822a7ef5..70a2293f14 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -507,7 +507,7 @@ Future TagPartitionedLogSystem::push(Version prevVersion, Version knownCommittedVersion, Version minKnownCommittedVersion, LogPushData& data, - SpanID const& spanContext, + SpanContext const& spanContext, Optional debugID, Optional> tpcvMap) { // FIXME: Randomize request order as in LegacyLogSystem? 
diff --git a/fdbserver/TagPartitionedLogSystem.actor.h b/fdbserver/TagPartitionedLogSystem.actor.h index baf1a46711..eb7c389e5b 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.h +++ b/fdbserver/TagPartitionedLogSystem.actor.h @@ -191,7 +191,7 @@ struct TagPartitionedLogSystem final : ILogSystem, ReferenceCounted debugID, Optional> tpcvMap) final; diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index 7cb99d8d21..9bd1660b07 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -120,7 +120,7 @@ struct MasterData : NonCopyable, ReferenceCounted { }; ACTOR Future getVersion(Reference self, GetCommitVersionRequest req) { - state Span span("M:getVersion"_loc, { req.spanContext }); + state Span span("M:getVersion"_loc, req.spanContext); state std::map::iterator proxyItr = self->lastCommitProxyVersionReplies.find(req.requestingProxy); // lastCommitProxyVersionReplies never changes diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index c49f19a384..3631affc4d 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -24,8 +24,10 @@ #include #include "contrib/fmt-8.1.1/include/fmt/format.h" +#include "fdbclient/FDBTypes.h" #include "fdbrpc/fdbrpc.h" #include "fdbrpc/LoadBalance.h" +#include "fdbserver/OTELSpanContextMessage.h" #include "flow/ActorCollection.h" #include "flow/Arena.h" #include "flow/Error.h" @@ -1395,8 +1397,8 @@ void updateProcessStats(StorageServer* self) { #pragma region Queries #endif -ACTOR Future waitForVersionActor(StorageServer* data, Version version, SpanID spanContext) { - state Span span("SS.WaitForVersion"_loc, { spanContext }); +ACTOR Future waitForVersionActor(StorageServer* data, Version version, SpanContext spanContext) { + state Span span("SS.WaitForVersion"_loc, spanContext); choose { when(wait(data->version.whenAtLeast(version))) { // FIXME: A bunch of these can block with or without the following delay 0. 
@@ -1433,7 +1435,7 @@ Version getLatestCommitVersion(VersionVector& ssLatestCommitVersions, Tag& tag) return commitVersion; } -Future waitForVersion(StorageServer* data, Version version, SpanID spanContext) { +Future waitForVersion(StorageServer* data, Version version, SpanContext spanContext) { if (version == latestVersion) { version = std::max(Version(1), data->version.get()); } @@ -1454,7 +1456,10 @@ Future waitForVersion(StorageServer* data, Version version, SpanID span return waitForVersionActor(data, version, spanContext); } -Future waitForVersion(StorageServer* data, Version commitVersion, Version readVersion, SpanID spanContext) { +Future waitForVersion(StorageServer* data, + Version commitVersion, + Version readVersion, + SpanContext spanContext) { ASSERT(commitVersion == invalidVersion || commitVersion < readVersion); if (commitVersion == invalidVersion) { @@ -1528,11 +1533,11 @@ Optional StorageServer::getTenantEntry(Version version, TenantIn ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { state int64_t resultSize = 0; - Span span("SS:getValue"_loc, { req.spanContext }); + Span span("SS:getValue"_loc, req.spanContext); if (req.tenantInfo.name.present()) { - span.addTag("tenant"_sr, req.tenantInfo.name.get()); + span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); } - span.addTag("key"_sr, req.key); + span.addAttribute("key"_sr, req.key); // Temporarily disabled -- this path is hit a lot // getCurrentLineage()->modify(&TransactionLineage::txID) = req.spanContext.first(); @@ -1665,9 +1670,9 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { // must be kept alive until the watch is finished. extern size_t WATCH_OVERHEAD_WATCHQ, WATCH_OVERHEAD_WATCHIMPL; -ACTOR Future watchWaitForValueChange(StorageServer* data, SpanID parent, KeyRef key) { +ACTOR Future watchWaitForValueChange(StorageServer* data, SpanContext parent, KeyRef key) { state Location spanLocation = "SS:watchWaitForValueChange"_loc; - state Span span(spanLocation, { parent }); + state Span span(spanLocation, parent); state Reference metadata = data->getWatchMetadata(key); if (metadata->debugID.present()) @@ -1774,8 +1779,8 @@ void checkCancelWatchImpl(StorageServer* data, WatchValueRequest req) { ACTOR Future watchValueSendReply(StorageServer* data, WatchValueRequest req, Future resp, - SpanID spanContext) { - state Span span("SS:watchValue"_loc, { spanContext }); + SpanContext spanContext) { + state Span span("SS:watchValue"_loc, spanContext); state double startTime = now(); ++data->counters.watchQueries; ++data->numWatches; @@ -2503,7 +2508,7 @@ ACTOR Future stopChangeFeedOnMove(StorageServer* data, ChangeFeedStreamReq } ACTOR Future changeFeedStreamQ(StorageServer* data, ChangeFeedStreamRequest req, UID streamUID) { - state Span span("SS:getChangeFeedStream"_loc, { req.spanContext }); + state Span span("SS:getChangeFeedStream"_loc, req.spanContext); state bool atLatest = false; state bool removeUID = false; state Optional blockedVersion; @@ -2859,7 +2864,7 @@ ACTOR Future readRange(StorageServer* data, KeyRange range, int limit, int* pLimitBytes, - SpanID parentSpan, + SpanContext parentSpan, IKeyValueStore::ReadType type, Optional tenantPrefix) { state GetKeyValuesReply result; @@ -3098,7 +3103,7 @@ ACTOR Future findKey(StorageServer* data, Version version, KeyRange range, int* pOffset, - SpanID parentSpan, + SpanContext parentSpan, IKeyValueStore::ReadType type) // Attempts to find the key indicated by sel in the data at version, within range. 
// Precondition: selectorInRange(sel, range) @@ -3119,7 +3124,7 @@ ACTOR Future findKey(StorageServer* data, state int sign = forward ? +1 : -1; state bool skipEqualKey = sel.orEqual == forward; state int distance = forward ? sel.offset : 1 - sel.offset; - state Span span("SS.findKey"_loc, { parentSpan }); + state Span span("SS.findKey"_loc, parentSpan); // Don't limit the number of bytes if this is a trivial key selector (there will be at most two items returned from // the read range in this case) @@ -3217,16 +3222,16 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) // Throws a wrong_shard_server if the keys in the request or result depend on data outside this server OR if a large // selector offset prevents all data from being read in one range read { - state Span span("SS:getKeyValues"_loc, { req.spanContext }); + state Span span("SS:getKeyValues"_loc, req.spanContext); state int64_t resultSize = 0; state IKeyValueStore::ReadType type = req.isFetchKeys ? IKeyValueStore::ReadType::FETCH : IKeyValueStore::ReadType::NORMAL; if (req.tenantInfo.name.present()) { - span.addTag("tenant"_sr, req.tenantInfo.name.get()); + span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); } - getCurrentLineage()->modify(&TransactionLineage::txID) = req.spanContext.first(); + getCurrentLineage()->modify(&TransactionLineage::txID) = req.spanContext.traceID; ++data->counters.getRangeQueries; ++data->counters.allQueries; @@ -3711,16 +3716,16 @@ ACTOR Future getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe // Throws a wrong_shard_server if the keys in the request or result depend on data outside this server OR if a large // selector offset prevents all data from being read in one range read { - state Span span("SS:getMappedKeyValues"_loc, { req.spanContext }); + state Span span("SS:getMappedKeyValues"_loc, req.spanContext); state int64_t resultSize = 0; state IKeyValueStore::ReadType type = req.isFetchKeys ? IKeyValueStore::ReadType::FETCH : IKeyValueStore::ReadType::NORMAL; if (req.tenantInfo.name.present()) { - span.addTag("tenant"_sr, req.tenantInfo.name.get()); + span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); } - getCurrentLineage()->modify(&TransactionLineage::txID) = req.spanContext.first(); + getCurrentLineage()->modify(&TransactionLineage::txID) = req.spanContext.traceID; ++data->counters.getMappedRangeQueries; ++data->counters.allQueries; @@ -3925,13 +3930,13 @@ ACTOR Future getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe // Throws a wrong_shard_server if the keys in the request or result depend on data outside this server OR if a large // selector offset prevents all data from being read in one range read { - state Span span("SS:getKeyValuesStream"_loc, { req.spanContext }); + state Span span("SS:getKeyValuesStream"_loc, req.spanContext); state int64_t resultSize = 0; state IKeyValueStore::ReadType type = req.isFetchKeys ? 
IKeyValueStore::ReadType::FETCH : IKeyValueStore::ReadType::NORMAL; if (req.tenantInfo.name.present()) { - span.addTag("tenant"_sr, req.tenantInfo.name.get()); + span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); } req.reply.setByteLimit(SERVER_KNOBS->RANGESTREAM_LIMIT_BYTES); @@ -4129,12 +4134,12 @@ ACTOR Future getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe } ACTOR Future getKeyQ(StorageServer* data, GetKeyRequest req) { - state Span span("SS:getKey"_loc, { req.spanContext }); + state Span span("SS:getKey"_loc, req.spanContext); if (req.tenantInfo.name.present()) { - span.addTag("tenant"_sr, req.tenantInfo.name.get()); + span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); } state int64_t resultSize = 0; - getCurrentLineage()->modify(&TransactionLineage::txID) = req.spanContext.first(); + getCurrentLineage()->modify(&TransactionLineage::txID) = req.spanContext.traceID; ++data->counters.getKeyQueries; ++data->counters.allQueries; @@ -6851,6 +6856,10 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { SpanContextMessage::isNextIn(cloneReader)) { SpanContextMessage scm; cloneReader >> scm; + } else if (cloneReader.protocolVersion().hasOTELSpanContext() && + OTELSpanContextMessage::isNextIn(cloneReader)) { + OTELSpanContextMessage scm; + cloneReader >> scm; } else { MutationRef msg; cloneReader >> msg; @@ -6933,7 +6942,7 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { state Version ver = invalidVersion; cloneCursor2->setProtocolVersion(data->logProtocol); - state SpanID spanContext = SpanID(); + state SpanContext spanContext = SpanContext(); state double beforeTLogMsgsUpdates = now(); state std::set updatedChangeFeeds; for (; cloneCursor2->hasMessage(); cloneCursor2->nextMessage()) { @@ -6967,17 +6976,27 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { data->logProtocol = rd.protocolVersion(); data->storage.changeLogProtocol(ver, data->logProtocol); cloneCursor2->setProtocolVersion(rd.protocolVersion()); - spanContext = UID(); + spanContext.traceID = UID(); } else if (rd.protocolVersion().hasSpanContext() && SpanContextMessage::isNextIn(rd)) { SpanContextMessage scm; rd >> scm; + TEST(true); // storageserveractor converting SpanContextMessage into OTEL SpanContext + spanContext = + SpanContext(UID(scm.spanContext.first(), scm.spanContext.second()), + 0, + scm.spanContext.first() != 0 && scm.spanContext.second() != 0 ? TraceFlags::sampled + : TraceFlags::unsampled); + } else if (rd.protocolVersion().hasOTELSpanContext() && OTELSpanContextMessage::isNextIn(rd)) { + TEST(true); // storageserveractor reading OTELSpanContextMessage + OTELSpanContextMessage scm; + rd >> scm; spanContext = scm.spanContext; } else { MutationRef msg; rd >> msg; - Span span("SS:update"_loc, { spanContext }); - span.addTag("key"_sr, msg.param1); + Span span("SS:update"_loc, spanContext); + span.addAttribute("key"_sr, msg.param1); // Drop non-private mutations if TSS fault injection is enabled in simulation, or if this is a TSS in // quarantine. 
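The storage server side performs the inverse of the TLog-side downgrade sketched earlier: when it encounters a legacy SpanContextMessage it promotes it to an OTEL SpanContext, as in the hunk above. A hedged standalone sketch of that mapping, again with an illustrative helper name that is not part of the patch:

    // A legacy context carries only a UID. Treat a fully non-zero UID as a
    // sampled trace, reusing it as the traceID with a zero spanID.
    SpanContext upgradeFromOpenTracing(const SpanContextMessage& scm) {
        bool traced = scm.spanContext.first() != 0 && scm.spanContext.second() != 0;
        return SpanContext(UID(scm.spanContext.first(), scm.spanContext.second()),
                           0,
                           traced ? TraceFlags::sampled : TraceFlags::unsampled);
    }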
@@ -8410,11 +8429,11 @@ ACTOR Future serveGetKeyRequests(StorageServer* self, FutureStream watchValueWaitForVersion(StorageServer* self, WatchValueRequest req, PromiseStream stream) { - state Span span("SS:watchValueWaitForVersion"_loc, { req.spanContext }); + state Span span("SS:watchValueWaitForVersion"_loc, req.spanContext); if (req.tenantInfo.name.present()) { - span.addTag("tenant"_sr, req.tenantInfo.name.get()); + span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); } - getCurrentLineage()->modify(&TransactionLineage::txID) = req.spanContext.first(); + getCurrentLineage()->modify(&TransactionLineage::txID) = req.spanContext.traceID; try { wait(success(waitForVersionNoTooOld(self, req.version))); Optional entry = self->getTenantEntry(latestVersion, req.tenantInfo); @@ -8432,11 +8451,11 @@ ACTOR Future watchValueWaitForVersion(StorageServer* self, ACTOR Future serveWatchValueRequestsImpl(StorageServer* self, FutureStream stream) { loop { - getCurrentLineage()->modify(&TransactionLineage::txID) = 0; + getCurrentLineage()->modify(&TransactionLineage::txID) = UID(); state WatchValueRequest req = waitNext(stream); state Reference metadata = self->getWatchMetadata(req.key.contents()); - state Span span("SS:serveWatchValueRequestsImpl"_loc, { req.spanContext }); - getCurrentLineage()->modify(&TransactionLineage::txID) = req.spanContext.first(); + state Span span("SS:serveWatchValueRequestsImpl"_loc, req.spanContext); + getCurrentLineage()->modify(&TransactionLineage::txID) = req.spanContext.traceID; // case 1: no watch set for the current key if (!metadata.isValid()) { diff --git a/fdbserver/workloads/ApiWorkload.h b/fdbserver/workloads/ApiWorkload.h index 64836e03b6..8f46f7b148 100644 --- a/fdbserver/workloads/ApiWorkload.h +++ b/fdbserver/workloads/ApiWorkload.h @@ -80,8 +80,8 @@ struct TransactionWrapper : public ReferenceCounted { // Gets the version vector cached in a transaction virtual VersionVector getVersionVector() = 0; - // Gets the spanID of a transaction - virtual UID getSpanID() = 0; + // Gets the spanContext of a transaction + virtual SpanContext getSpanContext() = 0; // Prints debugging messages for a transaction; not implemented for all transaction types virtual void debugTransaction(UID debugId) {} @@ -161,8 +161,8 @@ struct FlowTransactionWrapper : public TransactionWrapper { // Gets the version vector cached in a transaction VersionVector getVersionVector() override { return transaction.getVersionVector(); } - // Gets the spanID of a transaction - UID getSpanID() override { return transaction.getSpanID(); } + // Gets the spanContext of a transaction + SpanContext getSpanContext() override { return transaction.getSpanContext(); } // Prints debugging messages for a transaction void debugTransaction(UID debugId) override { transaction.debugTransaction(debugId); } @@ -229,8 +229,8 @@ struct ThreadTransactionWrapper : public TransactionWrapper { // Gets the version vector cached in a transaction VersionVector getVersionVector() override { return transaction->getVersionVector(); } - // Gets the spanID of a transaction - UID getSpanID() override { return transaction->getSpanID(); } + // Gets the spanContext of a transaction + SpanContext getSpanContext() override { return transaction->getSpanContext(); } void addReadConflictRange(KeyRangeRef const& keys) override { transaction->addReadConflictRange(keys); } }; diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 35952cc7c6..f55d8f975b 100644 --- 
a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -873,7 +873,8 @@ struct ConsistencyCheckWorkload : TestWorkload { state Key begin = kr.begin; state Key end = kr.end; state int limitKeyServers = BUGGIFY ? 1 : 100; - state Span span(deterministicRandom()->randomUniqueID(), "WL:ConsistencyCheck"_loc); + state Span span(SpanContext(deterministicRandom()->randomUniqueID(), deterministicRandom()->randomUInt64()), + "WL:ConsistencyCheck"_loc); while (begin < end) { state Reference commitProxyInfo = diff --git a/fdbserver/workloads/Cycle.actor.cpp b/fdbserver/workloads/Cycle.actor.cpp index 1b7131f8e3..a7806ac1c8 100644 --- a/fdbserver/workloads/Cycle.actor.cpp +++ b/fdbserver/workloads/Cycle.actor.cpp @@ -106,10 +106,9 @@ struct CycleWorkload : TestWorkload { state Transaction tr(cx); if (deterministicRandom()->random01() >= self->traceParentProbability) { state Span span("CycleClient"_loc); - // TraceEvent("CycleTracingTransaction", span.context).log(); - TraceEvent("CycleTracingTransaction", span.context).log(); + TraceEvent("CycleTracingTransaction", span.context.traceID).log(); tr.setOption(FDBTransactionOptions::SPAN_PARENT, - BinaryWriter::toValue(span.context, Unversioned())); + BinaryWriter::toValue(span.context, IncludeVersion())); } while (true) { try { diff --git a/fdbserver/workloads/MiniCycle.actor.cpp b/fdbserver/workloads/MiniCycle.actor.cpp index b071902a8c..5b9b48ab2c 100644 --- a/fdbserver/workloads/MiniCycle.actor.cpp +++ b/fdbserver/workloads/MiniCycle.actor.cpp @@ -174,7 +174,7 @@ struct MiniCycleWorkload : TestWorkload { state Transaction tr(cx); if (deterministicRandom()->random01() >= self->traceParentProbability) { state Span span("MiniCycleClient"_loc); - TraceEvent("MiniCycleTracingTransaction", span.context).log(); + TraceEvent("MiniCycleTracingTransaction", span.context.traceID).log(); tr.setOption(FDBTransactionOptions::SPAN_PARENT, BinaryWriter::toValue(span.context, Unversioned())); } diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index 29785e1f39..f83aac02c5 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -21,6 +21,7 @@ #include "boost/asio/buffer.hpp" #include "boost/asio/ip/address.hpp" #include "boost/system/system_error.hpp" +#include "flow/Arena.h" #include "flow/Platform.h" #include "flow/Trace.h" #include diff --git a/flow/ProtocolVersion.h b/flow/ProtocolVersion.h index eabcb38145..9f3e1f5440 100644 --- a/flow/ProtocolVersion.h +++ b/flow/ProtocolVersion.h @@ -170,6 +170,7 @@ public: // introduced features PROTOCOL_VERSION_FEATURE(0x0FDB00B071010000LL, Tenants); PROTOCOL_VERSION_FEATURE(0x0FDB00B071010000LL, StorageInterfaceReadiness); PROTOCOL_VERSION_FEATURE(0x0FDB00B071010000LL, ResolverPrivateMutations); + PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, OTELSpanContext); PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, SWVersionTracking); }; diff --git a/flow/Tracing.actor.cpp b/flow/Tracing.actor.cpp index 144f663b7e..d24673ca84 100644 --- a/flow/Tracing.actor.cpp +++ b/flow/Tracing.actor.cpp @@ -19,6 +19,7 @@ */ #include "flow/Tracing.h" +#include "flow/IRandom.h" #include "flow/UnitTest.h" #include "flow/Knobs.h" #include "flow/network.h" @@ -42,28 +43,11 @@ constexpr float kQueueSizeLogInterval = 5.0; struct NoopTracer : ITracer { TracerType type() const override { return TracerType::DISABLED; } void trace(Span const& span) override {} - void trace(OTELSpan const& span) override {} }; struct LogfileTracer : ITracer { TracerType type() const override { return 
TracerType::LOG_FILE; } void trace(Span const& span) override { - TraceEvent te(SevInfo, "TracingSpan", span.context); - te.detail("Location", span.location.name) - .detail("Begin", format("%.6f", span.begin)) - .detail("End", format("%.6f", span.end)); - if (span.parents.size() == 1) { - te.detail("Parent", *span.parents.begin()); - } else { - for (auto parent : span.parents) { - TraceEvent(SevInfo, "TracingSpanAddParent", span.context).detail("AddParent", parent); - } - } - for (const auto& [key, value] : span.tags) { - TraceEvent(SevInfo, "TracingSpanTag", span.context).detail("Key", key).detail("Value", value); - } - } - void trace(OTELSpan const& span) override { TraceEvent te(SevInfo, "TracingSpan", span.context.traceID); te.detail("SpanID", span.context.spanID) .detail("Location", span.location.name) @@ -183,31 +167,6 @@ struct UDPTracer : public ITracer { // Serializes span fields as an array into the supplied TraceRequest // buffer. void serialize_span(const Span& span, TraceRequest& request) { - // If you change the serialization format here, make sure to update the - // fluentd filter to be able to correctly parse the updated format! See - // the msgpack specification for more info on the bit patterns used - // here. - uint8_t size = 8; - if (span.parents.size() == 0) - --size; - request.write_byte(size | 0b10010000); // write as array - - serialize_string(g_network->getLocalAddress().toString(), request); // ip:port - - serialize_value(span.context.first(), request, 0xcf); // trace id - serialize_value(span.context.second(), request, 0xcf); // token (span id) - - serialize_value(span.begin, request, 0xcb); // start time - serialize_value(span.end - span.begin, request, 0xcb); // duration - - serialize_string(span.location.name.toString(), request); - - serialize_map(span.tags, request); - - serialize_vector(span.parents, request); - } - - void serialize_span(const OTELSpan& span, TraceRequest& request) { uint16_t size = 14; request.write_byte(size | 0b10010000); // write as array serialize_value(span.context.traceID.first(), request, 0xcf); // trace id @@ -274,30 +233,6 @@ private: serialize_string(reinterpret_cast(str.data()), str.size(), request); } - // Writes the given vector of SpanIDs to the request. If the vector is - // empty, the request is not modified. - inline void serialize_vector(const SmallVectorRef& vec, TraceRequest& request) { - int size = vec.size(); - if (size == 0) { - return; - } - if (size <= 15) { - request.write_byte(static_cast(size) | 0b10010000); - } else if (size <= 65535) { - request.write_byte(0xdc); - request.write_byte(reinterpret_cast(&size)[1]); - request.write_byte(reinterpret_cast(&size)[0]); - } else { - TraceEvent(SevWarn, "TracingSpanSerializeVector") - .detail("Failed to MessagePack encode very large vector", size); - ASSERT_WE_THINK(false); - } - - for (const auto& parentContext : vec) { - serialize_value(parentContext.second(), request, 0xcf); - } - } - // Writes the given vector of linked SpanContext's to the request. If the vector is // empty, the request is not modified. inline void serialize_vector(const SmallVectorRef& vec, TraceRequest& request) { @@ -322,7 +257,7 @@ private: // Writes the given vector of linked SpanContext's to the request. If the vector is // empty, the request is not modified. 
- inline void serialize_vector(const SmallVectorRef& vec, TraceRequest& request) { + inline void serialize_vector(const SmallVectorRef& vec, TraceRequest& request) { int size = vec.size(); if (size <= 15) { request.write_byte(static_cast(size) | 0b10010000); @@ -453,12 +388,6 @@ struct FastUDPTracer : public UDPTracer { request_.reset(); } - void trace(OTELSpan const& span) override { - prepare(span.location.name.size()); - serialize_span(span, request_); - write(); - } - void trace(Span const& span) override { prepare(span.location.name.size()); serialize_span(span, request_); @@ -513,28 +442,6 @@ void openTracer(TracerType type) { ITracer::~ITracer() {} Span& Span::operator=(Span&& o) { - if (begin > 0.0 && context.second() > 0) { - end = g_network->now(); - g_tracer->trace(*this); - } - arena = std::move(o.arena); - context = o.context; - begin = o.begin; - end = o.end; - location = o.location; - parents = std::move(o.parents); - o.begin = 0; - return *this; -} - -Span::~Span() { - if (begin > 0.0 && context.second() > 0) { - end = g_network->now(); - g_tracer->trace(*this); - } -} - -OTELSpan& OTELSpan::operator=(OTELSpan&& o) { if (begin > 0.0 && o.context.isSampled() > 0) { end = g_network->now(); g_tracer->trace(*this); @@ -558,7 +465,7 @@ OTELSpan& OTELSpan::operator=(OTELSpan&& o) { return *this; } -OTELSpan::~OTELSpan() { +Span::~Span() { if (begin > 0.0 && context.isSampled()) { end = g_network->now(); g_tracer->trace(*this); @@ -567,16 +474,15 @@ OTELSpan::~OTELSpan() { TEST_CASE("/flow/Tracing/CreateOTELSpan") { // Sampling disabled, no parent. - OTELSpan notSampled("foo"_loc); + Span notSampled("foo"_loc); ASSERT(!notSampled.context.isSampled()); // Force Sampling - OTELSpan sampled("foo"_loc, []() { return 1.0; }); - ASSERT(sampled.context.isSampled()); + // Span sampled("foo"_loc, []() { return 1.0; }); + // ASSERT(sampled.context.isSampled()); // Ensure child traceID matches parent, when parent is sampled. - OTELSpan childTraceIDMatchesParent( - "foo"_loc, []() { return 1.0; }, SpanContext(UID(100, 101), 200, TraceFlags::sampled)); + Span childTraceIDMatchesParent("foo"_loc, SpanContext(UID(100, 101), 200, TraceFlags::sampled)); ASSERT(childTraceIDMatchesParent.context.traceID.first() == childTraceIDMatchesParent.parentContext.traceID.first()); ASSERT(childTraceIDMatchesParent.context.traceID.second() == @@ -584,22 +490,20 @@ TEST_CASE("/flow/Tracing/CreateOTELSpan") { // When the parent isn't sampled AND it has legitimate values we should not sample a child, // even if the child was randomly selected for sampling. - OTELSpan parentNotSampled( - "foo"_loc, []() { return 1.0; }, SpanContext(UID(1, 1), 1, TraceFlags::unsampled)); + Span parentNotSampled("foo"_loc, SpanContext(UID(1, 1), 1, TraceFlags::unsampled)); ASSERT(!parentNotSampled.context.isSampled()); // When the parent isn't sampled AND it has zero values for traceID and spanID this means // we should defer to the child as the new root of the trace as there was no actual parent. // If the child was sampled we should send the child trace with a null parent. - OTELSpan noParent( - "foo"_loc, []() { return 1.0; }, SpanContext(UID(0, 0), 0, TraceFlags::unsampled)); - ASSERT(noParent.context.isSampled()); + // Span noParent("foo"_loc, SpanContext(UID(0, 0), 0, TraceFlags::unsampled)); + // ASSERT(noParent.context.isSampled()); return Void(); }; TEST_CASE("/flow/Tracing/AddEvents") { // Use helper method to add an OTELEventRef to an OTELSpan. 
- OTELSpan span1("span_with_event"_loc); + Span span1("span_with_event"_loc); auto arena = span1.arena; SmallVectorRef attrs; attrs.push_back(arena, KeyValueRef("foo"_sr, "bar"_sr)); @@ -610,14 +514,14 @@ TEST_CASE("/flow/Tracing/AddEvents") { ASSERT(span1.events[0].attributes.begin()->value.toString() == "bar"); // Use helper method to add an OTELEventRef with no attributes to an OTELSpan - OTELSpan span2("span_with_event"_loc); + Span span2("span_with_event"_loc); span2.addEvent(StringRef(span2.arena, LiteralStringRef("commit_succeed")), 1234567.100); ASSERT(span2.events[0].name.toString() == "commit_succeed"); ASSERT(span2.events[0].time == 1234567.100); ASSERT(span2.events[0].attributes.size() == 0); // Add fully constructed OTELEventRef to OTELSpan passed by value. - OTELSpan span3("span_with_event"_loc); + Span span3("span_with_event"_loc); auto s3Arena = span3.arena; SmallVectorRef s3Attrs; s3Attrs.push_back(s3Arena, KeyValueRef("xyz"_sr, "123"_sr)); @@ -636,7 +540,10 @@ TEST_CASE("/flow/Tracing/AddEvents") { }; TEST_CASE("/flow/Tracing/AddAttributes") { - OTELSpan span1("span_with_attrs"_loc); + Span span1("span_with_attrs"_loc, + SpanContext(deterministicRandom()->randomUniqueID(), + deterministicRandom()->randomUInt64(), + TraceFlags::sampled)); auto arena = span1.arena; span1.addAttribute(StringRef(arena, LiteralStringRef("foo")), StringRef(arena, LiteralStringRef("bar"))); span1.addAttribute(StringRef(arena, LiteralStringRef("operation")), StringRef(arena, LiteralStringRef("grv"))); @@ -644,25 +551,34 @@ TEST_CASE("/flow/Tracing/AddAttributes") { ASSERT(span1.attributes[1] == KeyValueRef("foo"_sr, "bar"_sr)); ASSERT(span1.attributes[2] == KeyValueRef("operation"_sr, "grv"_sr)); - OTELSpan span3("span_with_attrs"_loc); - auto s3Arena = span3.arena; - span3.addAttribute(StringRef(s3Arena, LiteralStringRef("a")), StringRef(s3Arena, LiteralStringRef("1"))) - .addAttribute(StringRef(s3Arena, LiteralStringRef("b")), LiteralStringRef("2")) - .addAttribute(StringRef(s3Arena, LiteralStringRef("c")), LiteralStringRef("3")); + Span span2("span_with_attrs"_loc, + SpanContext(deterministicRandom()->randomUniqueID(), + deterministicRandom()->randomUInt64(), + TraceFlags::sampled)); + auto s2Arena = span2.arena; + span2.addAttribute(StringRef(s2Arena, LiteralStringRef("a")), StringRef(s2Arena, LiteralStringRef("1"))) + .addAttribute(StringRef(s2Arena, LiteralStringRef("b")), LiteralStringRef("2")) + .addAttribute(StringRef(s2Arena, LiteralStringRef("c")), LiteralStringRef("3")); - ASSERT_EQ(span3.attributes.size(), 4); // Includes default attribute of "address" - ASSERT(span3.attributes[1] == KeyValueRef("a"_sr, "1"_sr)); - ASSERT(span3.attributes[2] == KeyValueRef("b"_sr, "2"_sr)); - ASSERT(span3.attributes[3] == KeyValueRef("c"_sr, "3"_sr)); + ASSERT_EQ(span2.attributes.size(), 4); // Includes default attribute of "address" + ASSERT(span2.attributes[1] == KeyValueRef("a"_sr, "1"_sr)); + ASSERT(span2.attributes[2] == KeyValueRef("b"_sr, "2"_sr)); + ASSERT(span2.attributes[3] == KeyValueRef("c"_sr, "3"_sr)); return Void(); }; TEST_CASE("/flow/Tracing/AddLinks") { - OTELSpan span1("span_with_links"_loc); + Span span1("span_with_links"_loc); + ASSERT(!span1.context.isSampled()); + ASSERT(!span1.context.isValid()); span1.addLink(SpanContext(UID(100, 101), 200, TraceFlags::sampled)); span1.addLink(SpanContext(UID(200, 201), 300, TraceFlags::unsampled)) .addLink(SpanContext(UID(300, 301), 400, TraceFlags::sampled)); + // Ensure the root span is now sampled and traceID and spanIDs are set. 
+ ASSERT(span1.context.isSampled()); + ASSERT(span1.context.isValid()); + // Ensure links are present. ASSERT(span1.links[0].traceID == UID(100, 101)); ASSERT(span1.links[0].spanID == 200); ASSERT(span1.links[0].m_Flags == TraceFlags::sampled); @@ -673,11 +589,16 @@ TEST_CASE("/flow/Tracing/AddLinks") { ASSERT(span1.links[2].spanID == 400); ASSERT(span1.links[2].m_Flags == TraceFlags::sampled); - OTELSpan span2("span_with_links"_loc); + Span span2("span_with_links"_loc); + ASSERT(!span2.context.isSampled()); + ASSERT(!span2.context.isValid()); auto link1 = SpanContext(UID(1, 1), 1, TraceFlags::sampled); auto link2 = SpanContext(UID(2, 2), 2, TraceFlags::sampled); auto link3 = SpanContext(UID(3, 3), 3, TraceFlags::sampled); span2.addLinks({ link1, link2 }).addLinks({ link3 }); + // Ensure the root span is now sampled and traceID and spanIDs are set. + ASSERT(span2.context.isSampled()); + ASSERT(span2.context.isValid()); ASSERT(span2.links[0].traceID == UID(1, 1)); ASSERT(span2.links[0].spanID == 1); ASSERT(span2.links[0].m_Flags == TraceFlags::sampled); @@ -741,7 +662,7 @@ std::string readMPString(uint8_t* index) { // Windows doesn't like lack of header and declaration of constructor for FastUDPTracer #ifndef WIN32 TEST_CASE("/flow/Tracing/FastUDPMessagePackEncoding") { - OTELSpan span1("encoded_span"_loc); + Span span1("encoded_span"_loc); auto request = TraceRequest{ .buffer = std::make_unique(kTraceBufferSize), .data_size = 0, .buffer_size = kTraceBufferSize }; @@ -753,9 +674,9 @@ TEST_CASE("/flow/Tracing/FastUDPMessagePackEncoding") { // Test - constructor OTELSpan(const Location& location, const SpanContext parent, const SpanContext& link) // Will delegate to other constructors. - OTELSpan span2("encoded_span"_loc, - SpanContext(UID(100, 101), 1, TraceFlags::sampled), - SpanContext(UID(200, 201), 2, TraceFlags::sampled)); + Span span2("encoded_span"_loc, + SpanContext(UID(100, 101), 1, TraceFlags::sampled), + { SpanContext(UID(200, 201), 2, TraceFlags::sampled) }); tracer.serialize_span(span2, request); data = request.buffer.get(); ASSERT(data[0] == 0b10011110); // 14 element array. @@ -801,7 +722,7 @@ TEST_CASE("/flow/Tracing/FastUDPMessagePackEncoding") { request.reset(); // Exercise all fluent interfaces, include links, events, and attributes. 
- OTELSpan span3("encoded_span_3"_loc); + Span span3("encoded_span_3"_loc, SpanContext()); auto s3Arena = span3.arena; SmallVectorRef attrs; attrs.push_back(s3Arena, KeyValueRef("foo"_sr, "bar"_sr)); @@ -870,7 +791,7 @@ TEST_CASE("/flow/Tracing/FastUDPMessagePackEncoding") { "SGKKUrpIb/7zePhBDi+gzUzyAcbQ2zUbFWI1KNi3zQk58uUG6wWJZkw+GCs7Cc3V" "OUxOljwCJkC4QTgdsbbFhxUC+rtoHV5xAqoTQwR0FXnWigUjP7NtdL6huJUr3qRv" "40c4yUI1a4+P5vJa"; - auto span4 = OTELSpan(); + Span span4; auto location = Location(); location.name = StringRef(span4.arena, longString); span4.location = location; diff --git a/flow/Tracing.h b/flow/Tracing.h index c289a73fcc..02b3c8a5b6 100644 --- a/flow/Tracing.h +++ b/flow/Tracing.h @@ -21,6 +21,7 @@ #pragma once #include "fdbclient/FDBTypes.h" +#include "fdbrpc/FlowTransport.h" #include "flow/IRandom.h" #include #include @@ -33,90 +34,43 @@ inline Location operator"" _loc(const char* str, size_t size) { return Location{ StringRef(reinterpret_cast(str), size) }; } -struct Span { - Span(SpanID context, Location location, std::initializer_list const& parents = {}) - : context(context), begin(g_network->now()), location(location), parents(arena, parents.begin(), parents.end()) { - if (parents.size() > 0) { - // If the parents' token is 0 (meaning the trace should not be - // recorded), set the child token to 0 as well. Otherwise, generate - // a new, random token. - uint64_t traceId = 0; - if ((*parents.begin()).second() > 0) { - traceId = deterministicRandom()->randomUInt64(); - } - this->context = SpanID((*parents.begin()).first(), traceId); - } - } - Span(Location location, std::initializer_list const& parents = {}) - : Span(UID(deterministicRandom()->randomUInt64(), - deterministicRandom()->random01() < FLOW_KNOBS->TRACING_SAMPLE_RATE - ? deterministicRandom()->randomUInt64() - : 0), - location, - parents) {} - Span(Location location, SpanID context) : Span(location, { context }) {} - Span(const Span&) = delete; - Span(Span&& o) { - arena = std::move(o.arena); - context = o.context; - begin = o.begin; - end = o.end; - location = o.location; - parents = std::move(o.parents); - o.context = UID(); - o.begin = 0.0; - o.end = 0.0; - } - Span() {} - ~Span(); - Span& operator=(Span&& o); - Span& operator=(const Span&) = delete; - void swap(Span& other) { - std::swap(arena, other.arena); - std::swap(context, other.context); - std::swap(begin, other.begin); - std::swap(end, other.end); - std::swap(location, other.location); - std::swap(parents, other.parents); - } +enum class TraceFlags : uint8_t { unsampled = 0b00000000, sampled = 0b00000001 }; - void addParent(SpanID span) { - if (parents.size() == 0) { - uint64_t traceId = 0; - if (span.second() > 0) { - traceId = context.second() == 0 ? deterministicRandom()->randomUInt64() : context.second(); - } - // Use first parent to set trace ID. This is non-ideal for spans - // with multiple parents, because the trace ID will associate the - // span with only one trace. A workaround is to look at the parent - // relationships instead of the trace ID. Another option in the - // future is to keep a list of trace IDs. 
- context = SpanID(span.first(), traceId); - } - parents.push_back(arena, span); +inline TraceFlags operator&(TraceFlags lhs, TraceFlags rhs) { + return static_cast(static_cast>(lhs) & + static_cast>(rhs)); +} + +struct SpanContext { + UID traceID; + uint64_t spanID; + TraceFlags m_Flags; + SpanContext() : traceID(UID()), spanID(0), m_Flags(TraceFlags::unsampled) {} + SpanContext(UID traceID, uint64_t spanID, TraceFlags flags) : traceID(traceID), spanID(spanID), m_Flags(flags) {} + SpanContext(UID traceID, uint64_t spanID) : traceID(traceID), spanID(spanID), m_Flags(TraceFlags::unsampled) {} + SpanContext(Arena arena, const SpanContext& span) + : traceID(span.traceID), spanID(span.spanID), m_Flags(span.m_Flags) {} + bool isSampled() const { return (m_Flags & TraceFlags::sampled) == TraceFlags::sampled; } + std::string toString() const { return format("%016llx%016llx%016llx", traceID.first(), traceID.second(), spanID); }; + bool isValid() const { return traceID.first() != 0 && traceID.second() != 0 && spanID != 0; } + + template + void serialize(Ar& ar) { + serializer(ar, traceID, spanID, m_Flags); } - - void addTag(const StringRef& key, const StringRef& value) { tags[key] = value; } - - Arena arena; - UID context = UID(); - double begin = 0.0, end = 0.0; - Location location; - SmallVectorRef parents; - std::unordered_map tags; }; -// OTELSpan +// Span // -// OTELSpan is a tracing implementation which, for the most part, complies with the W3C Trace Context specification +// Span is a tracing implementation which, for the most part, complies with the W3C Trace Context specification // https://www.w3.org/TR/trace-context/ and the OpenTelemetry API // https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/api.md. // -// The major differences between OTELSpan and the current Span implementation, which is based off the OpenTracing.io +// The major differences between Span and the 7.0 Span implementation, which is based off the OpenTracing.io // specification https://opentracing.io/ are as follows. // https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/api.md#span // -// OTELSpans have... +// OpenTelemetry Spans have... // 1. A SpanContext which consists of 3 attributes. // // TraceId - A valid trace identifier is a 16-byte array with at least one non-zero byte. @@ -146,82 +100,63 @@ enum class SpanKind : uint8_t { INTERNAL = 0, CLIENT = 1, SERVER = 2, PRODUCER = enum class SpanStatus : uint8_t { UNSET = 0, OK = 1, ERR = 2 }; -struct OTELEventRef { - OTELEventRef() {} - OTELEventRef(const StringRef& name, +struct SpanEventRef { + SpanEventRef() {} + SpanEventRef(const StringRef& name, const double& time, const SmallVectorRef& attributes = SmallVectorRef()) : name(name), time(time), attributes(attributes) {} - OTELEventRef(Arena& arena, const OTELEventRef& other) + SpanEventRef(Arena& arena, const SpanEventRef& other) : name(arena, other.name), time(other.time), attributes(arena, other.attributes) {} StringRef name; double time = 0.0; SmallVectorRef attributes; }; -class OTELSpan { +class Span { public: - OTELSpan(const SpanContext& context, - const Location& location, - const SpanContext& parentContext, - const std::initializer_list& links = {}) + // Construct a Span with a given context, location, parentContext and optional links. + // + // N.B. While this constructor receives a parentContext it does not overwrite the traceId of the Span's context. 
+ // Therefore it is the responsibility of the caller to ensure the traceID and m_Flags of both the context and + // parentContext are identical if the caller wishes to establish a parent/child relationship between these spans. We + // do this to avoid needless comparisons or copies as this constructor is only called once in NativeAPI.actor.cpp + // and from below by the Span(location, parent, links) constructor. The Span(location, parent, links) + // constructor is used broadly and performs the copy of the parent's traceID and m_Flags. + Span(const SpanContext& context, + const Location& location, + const SpanContext& parentContext, + const std::initializer_list& links = {}) : context(context), location(location), parentContext(parentContext), links(arena, links.begin(), links.end()), begin(g_network->now()) { - // We've simplified the logic here, essentially we're now always setting trace and span ids and relying on the - // TraceFlags to determine if we're sampling. Therefore if the parent is sampled, we simply overwrite this - // span's traceID with the parent trace id. - if (parentContext.isSampled()) { - this->context.traceID = UID(parentContext.traceID.first(), parentContext.traceID.second()); - this->context.m_Flags = TraceFlags::sampled; - } else { - // However there are two other cases. - // 1. A legitamite parent span exists but it was not selected for tracing. - // 2. There is no actual parent, just a default arg parent provided by the constructor AND the "child" span - // was selected for sampling. For case 1. we handle below by marking the child as unsampled. For case 2 we - // needn't do anything, and can rely on the values in this OTELSpan - if (parentContext.traceID.first() != 0 && parentContext.traceID.second() != 0 && - parentContext.spanID != 0) { - this->context.m_Flags = TraceFlags::unsampled; - } - } this->kind = SpanKind::SERVER; this->status = SpanStatus::OK; this->attributes.push_back( - this->arena, KeyValueRef("address"_sr, StringRef(this->arena, g_network->getLocalAddress().toString()))); + this->arena, + KeyValueRef("address"_sr, StringRef(this->arena, FlowTransport::transport().getLocalAddressAsString()))); } - OTELSpan(const Location& location, - const SpanContext& parent = SpanContext(), - const std::initializer_list& links = {}) - : OTELSpan( - SpanContext(UID(deterministicRandom()->randomUInt64(), deterministicRandom()->randomUInt64()), // traceID - deterministicRandom()->randomUInt64(), // spanID - deterministicRandom()->random01() < FLOW_KNOBS->TRACING_SAMPLE_RATE // sampled or unsampled - ? TraceFlags::sampled - : TraceFlags::unsampled), - location, - parent, - links) {} + // Construct Span with a location, parent, and optional links. + // This constructor copies the parent's traceID, creating a parent->child relationship between Spans. + // Additionally we inherit the m_Flags of the parent, thus enabling or disabling sampling to match the parent. + Span(const Location& location, const SpanContext& parent, const std::initializer_list& links = {}) + : Span(SpanContext(parent.traceID, deterministicRandom()->randomUInt64(), parent.m_Flags), + location, + parent, + links) {} - OTELSpan(const Location& location, const SpanContext parent, const SpanContext& link) - : OTELSpan(location, parent, { link }) {} + // Construct Span without parent. Used for creating a root span, or when the parent is not known at construction + // time.
+ Span(const SpanContext& context, const Location& location) : Span(context, location, SpanContext()) {} - // NOTE: This constructor is primarly for unit testing until we sort out how to enable/disable a Knob dynamically in - // a test. - OTELSpan(const Location& location, - const std::function& rateProvider, - const SpanContext& parent = SpanContext(), - const std::initializer_list& links = {}) - : OTELSpan(SpanContext(UID(deterministicRandom()->randomUInt64(), deterministicRandom()->randomUInt64()), - deterministicRandom()->randomUInt64(), - deterministicRandom()->random01() < rateProvider() ? TraceFlags::sampled - : TraceFlags::unsampled), - location, - parent, - links) {} + // We've determined that, for the initial tracing release, spans with only a location will not be traced. + // Generally these are for background processes; some are called infrequently, while others may be high volume. + // TODO: review and address in subsequent PRs. + Span(const Location& location) : location(location), begin(g_network->now()) {} - OTELSpan(const OTELSpan&) = delete; - OTELSpan(OTELSpan&& o) { + Span(const Span&) = delete; + Span(Span&& o) { arena = std::move(o.arena); context = o.context; location = o.location; @@ -239,11 +174,11 @@ public: o.end = 0.0; o.status = SpanStatus::UNSET; } - OTELSpan() {} - ~OTELSpan(); - OTELSpan& operator=(OTELSpan&& o); - OTELSpan& operator=(const OTELSpan&) = delete; - void swap(OTELSpan& other) { + Span() {} + ~Span(); + Span& operator=(Span&& o); + Span& operator=(const Span&) = delete; + void swap(Span& other) { std::swap(arena, other.arena); std::swap(context, other.context); std::swap(location, other.location); @@ -256,34 +191,53 @@ public: std::swap(events, other.events); } - OTELSpan& addLink(const SpanContext& linkContext) { + Span& addLink(const SpanContext& linkContext) { links.push_back(arena, linkContext); - return *this; - } - - OTELSpan& addLinks(const std::initializer_list& linkContexts = {}) { - for (auto const& sc : linkContexts) { - links.push_back(arena, sc); + // Check if the link is sampled; if so, sample this span. + if (!context.isSampled() && linkContext.isSampled()) { + context.m_Flags = TraceFlags::sampled; + // If for some reason this span isn't valid, we need to give it a + // traceID and spanID. This case is currently hit in CommitProxyServer + // CommitBatchContext::CommitBatchContext and CommitBatchContext::setupTraceBatch.
+ if (!context.isValid()) { + context.traceID = deterministicRandom()->randomUniqueID(); + context.spanID = deterministicRandom()->randomUInt64(); + } } return *this; } - OTELSpan& addEvent(const OTELEventRef& event) { + Span& addLinks(const std::initializer_list& linkContexts = {}) { + for (auto const& sc : linkContexts) { + addLink(sc); + } + return *this; + } + + Span& addEvent(const SpanEventRef& event) { events.push_back_deep(arena, event); return *this; } - OTELSpan& addEvent(const StringRef& name, - const double& time, - const SmallVectorRef& attrs = SmallVectorRef()) { - return addEvent(OTELEventRef(name, time, attrs)); + Span& addEvent(const StringRef& name, + const double& time, + const SmallVectorRef& attrs = SmallVectorRef()) { + return addEvent(SpanEventRef(name, time, attrs)); } - OTELSpan& addAttribute(const StringRef& key, const StringRef& value) { + Span& addAttribute(const StringRef& key, const StringRef& value) { attributes.push_back_deep(arena, KeyValueRef(key, value)); return *this; } + Span& setParent(const SpanContext& parent) { + parentContext = parent; + context.traceID = parent.traceID; + context.spanID = deterministicRandom()->randomUInt64(); + context.m_Flags = parent.m_Flags; + return *this; + } + Arena arena; SpanContext context; Location location; @@ -292,7 +246,7 @@ public: SmallVectorRef links; double begin = 0.0, end = 0.0; SmallVectorRef attributes; // not necessarily sorted - SmallVectorRef events; + SmallVectorRef events; SpanStatus status; }; @@ -311,7 +265,6 @@ struct ITracer { virtual TracerType type() const = 0; // passed ownership to the tracer virtual void trace(Span const& span) = 0; - virtual void trace(OTELSpan const& span) = 0; }; void openTracer(TracerType type); @@ -328,16 +281,3 @@ struct SpannedDeque : Deque { span = std::move(other.span); } }; - -template -struct OTELSpannedDeque : Deque { - OTELSpan span; - explicit OTELSpannedDeque(Location loc) : span(loc) {} - OTELSpannedDeque(OTELSpannedDeque&& other) : Deque(std::move(other)), span(std::move(other.span)) {} - OTELSpannedDeque(OTELSpannedDeque const&) = delete; - OTELSpannedDeque& operator=(OTELSpannedDeque const&) = delete; - OTELSpannedDeque& operator=(OTELSpannedDeque&& other) { - *static_cast*>(this) = std::move(other); - span = std::move(other.span); - } -}; From cf6e39af799fdcb835f4339d9d280a31ecc5d973 Mon Sep 17 00:00:00 2001 From: Rajiv Ranganath Date: Sat, 30 Apr 2022 13:37:20 +0530 Subject: [PATCH 092/299] docs: add `GET_RANGE_SPLIT_POINTS` Add `GET_RANGE_SPLIT_POINTS` instruction documentation. --- bindings/bindingtester/spec/bindingApiTester.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bindings/bindingtester/spec/bindingApiTester.md b/bindings/bindingtester/spec/bindingApiTester.md index 46dec89409..25b9592ce6 100644 --- a/bindings/bindingtester/spec/bindingApiTester.md +++ b/bindings/bindingtester/spec/bindingApiTester.md @@ -171,6 +171,13 @@ futures must apply the following rules to the result: the language binding. Make sure the API returns without error. Finally push the string "GOT_ESTIMATED_RANGE_SIZE" onto the stack. +#### GET_RANGE_SPLIT_POINTS + + Pops the top three items off of the stack as BEGIN_KEY, END_KEY and + CHUNK_SIZE. Then call the `getRangeSplitPoints` API of the language + binding. Make sure the API returns without error. Finally push the string + "GOT_RANGE_SPLIT_POINTS" onto the stack. 
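+
+    For illustration only (a hypothetical run, not part of the normative
+    instruction set): if the stack holds, top to bottom, ['a', 'z', 1000000, ...],
+    the tester pops BEGIN_KEY='a', END_KEY='z' and CHUNK_SIZE=1000000, calls
+    getRangeSplitPoints('a', 'z', 1000000), waits for the resulting future,
+    and then pushes "GOT_RANGE_SPLIT_POINTS" onto the stack.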
+ #### GET_KEY (_SNAPSHOT, _DATABASE) Pops the top four items off of the stack as KEY, OR_EQUAL, OFFSET, PREFIX From 0a4b364379fd2d257a872bb72b171b23e2756dc1 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Mon, 2 May 2022 13:49:42 -0700 Subject: [PATCH 093/299] Fix operation_failed thrown incorrectly from transactions (#6993) * Add a test demonstrating the issue If you write a versionstamped value after a set, then reading throws operation_failed. * Treat SetVersionstampedValue as independent in coalesce and mutate --- fdbclient/RYWIterator.cpp | 21 +++++++++++++++++++++ fdbclient/WriteMap.h | 8 +++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/fdbclient/RYWIterator.cpp b/fdbclient/RYWIterator.cpp index 3e7e18c1e8..35ffb32cb0 100644 --- a/fdbclient/RYWIterator.cpp +++ b/fdbclient/RYWIterator.cpp @@ -430,6 +430,27 @@ TEST_CASE("/fdbclient/WriteMap/emptiness") { return Void(); } +TEST_CASE("/fdbclient/WriteMap/VersionstampedvalueAfterSet") { + Arena arena = Arena(); + SnapshotCache cache(&arena); + WriteMap writes = WriteMap(&arena); + + ASSERT(writes.empty()); + writes.mutate("apple"_sr, MutationRef::SetValue, "red"_sr, true); + writes.mutate("apple"_sr, MutationRef::SetVersionstampedValue, metadataVersionRequiredValue, true); + + RYWIterator it(&cache, &writes); + it.bypassUnreadableProtection(); + it.skip("apple"_sr); + ASSERT(it.is_unreadable()); + ASSERT(it.is_kv()); + const KeyValueRef* kv = it.kv(arena); + ASSERT(kv->key == "apple"_sr); + ASSERT(kv->value == metadataVersionRequiredValue); + + return Void(); +} + TEST_CASE("/fdbclient/WriteMap/clear") { Arena arena = Arena(); WriteMap writes = WriteMap(&arena); diff --git a/fdbclient/WriteMap.h b/fdbclient/WriteMap.h index 05793313f0..33fb6aee37 100644 --- a/fdbclient/WriteMap.h +++ b/fdbclient/WriteMap.h @@ -231,7 +231,8 @@ public: is_unreadable)); } } else { - if (!it.is_unreadable() && operation == MutationRef::SetValue) { + if (!it.is_unreadable() && + (operation == MutationRef::SetValue || operation == MutationRef::SetVersionstampedValue)) { it.tree.clear(); PTreeImpl::remove(writes, ver, key); PTreeImpl::insert(writes, @@ -523,9 +524,10 @@ public: static RYWMutation coalesce(RYWMutation existingEntry, RYWMutation newEntry, Arena& arena) { ASSERT(newEntry.value.present()); - if (newEntry.type == MutationRef::SetValue) + if (newEntry.type == MutationRef::SetValue || newEntry.type == MutationRef::SetVersionstampedValue) { + // independent mutations return newEntry; - else if (newEntry.type == MutationRef::AddValue) { + } else if (newEntry.type == MutationRef::AddValue) { switch (existingEntry.type) { case MutationRef::SetValue: return RYWMutation(doLittleEndianAdd(existingEntry.value, newEntry.value.get(), arena), From 7ed82c1ac53f909731276dbcb29c0e5d14ec3fcb Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Mon, 2 May 2022 14:24:43 -0700 Subject: [PATCH 094/299] Mac m1 has 16k pages (#7038) Previously the page guard implementation assumed that the page size was 4k. Also check for mmap and mprotect returning errors. 
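For illustration only — a minimal standalone sketch of the pattern adopted
here (runtime page size plus checked mmap/mprotect); the helper name and
error handling below are hypothetical, not the fdbserver implementation:

#include <sys/mman.h>
#include <unistd.h>
#include <cstddef>
#include <cstdio>
#include <cstdlib>

// Allocate `length` usable bytes bracketed by two PROT_NONE guard pages,
// using the runtime page size (16 KiB on Apple Silicon, 4 KiB on most x86)
// instead of a hard-coded 4096.
static void* allocWithGuardPages(size_t length) {
    size_t pageSize = sysconf(_SC_PAGESIZE);
    length = (length + pageSize - 1) & ~(pageSize - 1); // round up to a page multiple
    size_t total = length + 2 * pageSize;
    void* base = mmap(nullptr, total, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (base == MAP_FAILED) { // check the result instead of assuming success
        perror("mmap");
        std::abort();
    }
    // Revoke access to the first and last page; any out-of-bounds touch now faults.
    if (mprotect(base, pageSize, PROT_NONE) != 0 ||
        mprotect((char*)base + total - pageSize, pageSize, PROT_NONE) != 0) {
        perror("mprotect");
        std::abort();
    }
    return (char*)base + pageSize; // caller sees only the usable region
}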
--- flow/Platform.actor.cpp | 41 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp
index 4661c0c6ea..0cff826103 100644
--- a/flow/Platform.actor.cpp
+++ b/flow/Platform.actor.cpp
@@ -2038,16 +2038,47 @@ static void enableLargePages() {
 }

 #ifndef _WIN32
+static void* mmapSafe(void* addr, size_t len, int prot, int flags, int fd, off_t offset) {
+ void* result = mmap(addr, len, prot, flags, fd, offset);
+ if (result == MAP_FAILED) {
+ int err = errno;
+ fprintf(stderr,
+ "Error calling mmap(%p, %zu, %d, %d, %d, %jd): %s\n",
+ addr,
+ len,
+ prot,
+ flags,
+ fd,
+ (intmax_t)offset,
+ strerror(err));
+ fflush(stderr);
+ std::abort();
+ }
+ return result;
+}
+
+static void mprotectSafe(void* p, size_t s, int prot) {
+ if (mprotect(p, s, prot) != 0) {
+ int err = errno;
+ fprintf(stderr, "Error calling mprotect(%p, %zu, %d): %s\n", p, s, prot, strerror(err));
+ fflush(stderr);
+ std::abort();
+ }
+}
+
 static void* mmapInternal(size_t length, int flags, bool guardPages) {
 if (guardPages) {
- constexpr size_t pageSize = 4096;
+ static size_t pageSize = sysconf(_SC_PAGESIZE);
+ length = RightAlign(length, pageSize);
 length += 2 * pageSize; // Map enough for the guard pages
- void* resultWithGuardPages = mmap(nullptr, length, PROT_READ | PROT_WRITE, flags, -1, 0);
- mprotect(resultWithGuardPages, pageSize, PROT_NONE); // left guard page
- mprotect((void*)(uintptr_t(resultWithGuardPages) + length - pageSize), pageSize, PROT_NONE); // right guard page
+ void* resultWithGuardPages = mmapSafe(nullptr, length, PROT_READ | PROT_WRITE, flags, -1, 0);
+ // left guard page
+ mprotectSafe(resultWithGuardPages, pageSize, PROT_NONE);
+ // right guard page
+ mprotectSafe((void*)(uintptr_t(resultWithGuardPages) + length - pageSize), pageSize, PROT_NONE);
 return (void*)(uintptr_t(resultWithGuardPages) + pageSize);
 } else {
- return mmap(nullptr, length, PROT_READ | PROT_WRITE, flags, -1, 0);
+ return mmapSafe(nullptr, length, PROT_READ | PROT_WRITE, flags, -1, 0);
 }
 }
 #endif

From fa2e85f1d3f6f9021a798efc2716c288f985f8b0 Mon Sep 17 00:00:00 2001
From: Hao Fu <77984096+hfu94@users.noreply.github.com>
Date: Mon, 2 May 2022 15:17:14 -0700
Subject: [PATCH 095/299] Add comment about getMappedRange parameters (#7044)

---
 .../java/src/main/com/apple/foundationdb/ReadTransaction.java | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java b/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java
index 417068441d..66ad7a9e80 100644
--- a/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java
+++ b/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java
@@ -433,7 +433,9 @@ public interface ReadTransaction extends ReadTransactionContext {
 *
 * @param begin the beginning of the range (inclusive)
 * @param end the end of the range (exclusive)
- * @param mapper TODO
+ * @param mapper defines how to map a key-value pair (one of the key-value pairs returned
+ * by the first range query) to a GetRange (or GetValue) request.
+ * More details: https://github.com/apple/foundationdb/wiki/Everything-about-GetMappedRange
 * @param limit the maximum number of results to return. Limits results to the
 * first keys in the range. Pass {@link #ROW_LIMIT_UNLIMITED} if this query
 * should not limit the number of results. 
If {@code reverse} is {@code true} rows

From 05e63bc703b33d695ff62a778a8a8d4294ac14b6 Mon Sep 17 00:00:00 2001
From: Jingyu Zhou
Date: Mon, 2 May 2022 17:17:37 -0700
Subject: [PATCH 096/299] Fix orphaned storage server due to force recovery (#6914)

* Fix orphaned storage server due to force recovery

The force recovery can roll back the transaction that adds a storage server.
However, the storage server may now be at version B > A, the recovery version.
As a result, its peek to the buddy TLog won't return TLogPeekReply::popped to
trigger its exit, and instead gets a higher version C > B back. To the storage
server, this means the message is empty, so it does not remove itself and
keeps peeking.

The fix: instead of using the recovery version as the popped version for the
SS, use the recovery transaction version, which is the first transaction after
the recovery. Force recovery bumps this version to a much higher version than
the SS's version, so the TLog sets TLogPeekReply::popped and triggers the
storage server exit.

* Fix tlog peek to disallow returning empty messages between recoveredAt and
the recovery txn version

This contract is not explicitly enforced today and can cause a storage server
to fail with the assertion "rollbackVersion >= data->storageVersion()". This
is because if such an empty version is returned, the SS may advance its
storage version to a value larger than the rollback version set in the
recovery transaction.

The fix is to block the peek reply until the recovery transaction has been
received.

* Move recoveryTxnReceived to be per LogData

This is because a shared TLog can have a first-generation TLog that has
already set the promise, so later generations won't wait for the recovery
version. For the current generation, all peeks need to wait, while for older
generations there is no need to wait (checked by whether they are stopped).

* For the initial commit, poppedVersion needs to be at least 2

To get rid of the previous unsuccessful recovery's recruited seed storage
servers.
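For illustration only, a toy model of the version arithmetic above — the
numbers, helper, and simplified popped rule are hypothetical, not FDB code; it
just shows why popping at the recovery transaction version forces the orphaned
server out:

#include <algorithm>
#include <cassert>
#include <cstdint>

using Version = int64_t;

// Simplified rule from the peek path: the reply reports "popped" only when
// the requested begin version is below the popped version.
bool peekReportsPopped(Version reqBegin, Version poppedVersion) {
    return poppedVersion > reqBegin;
}

int main() {
    Version recoveryVersion = 100;    // A: version force recovery rolled back to
    Version ssVersion = 150;          // B: orphaned SS is already past A
    Version recoveryTxnVersion = 500; // C: first transaction after force recovery

    // Old scheme: pop at recoveredAt + 1. Since B > A, the SS never sees
    // TLogPeekReply::popped and keeps peeking forever.
    assert(!peekReportsPopped(ssVersion, recoveryVersion + 1));

    // New scheme: pop at max(recoveredAt + 1, recoveryTxnVersion). Force
    // recovery bumps C far above B, so the peek reports popped and the SS
    // can exit with worker_removed.
    assert(peekReportsPopped(ssVersion, std::max(recoveryVersion + 1, recoveryTxnVersion)));
}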
--- fdbserver/LogRouter.actor.cpp | 7 ++- fdbserver/LogSystemPeekCursor.actor.cpp | 1 + fdbserver/TLogServer.actor.cpp | 69 ++++++++++++++++++++----- fdbserver/storageserver.actor.cpp | 4 +- 4 files changed, 65 insertions(+), 16 deletions(-) diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index 8684fa5263..6e13e35abf 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -603,7 +603,12 @@ Future logRouterPeekMessages(PromiseType replyPromise, } replyPromise.send(reply); - //TraceEvent("LogRouterPeek4", self->dbgid); + DisabledTraceEvent("LogRouterPeek4", self->dbgid) + .detail("Tag", reqTag.toString()) + .detail("ReqBegin", reqBegin) + .detail("End", reply.end) + .detail("MessageSize", reply.messages.size()) + .detail("PoppedVersion", self->poppedVersion); return Void(); } diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 3f604e12a6..a17556e61c 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -58,6 +58,7 @@ ILogSystem::ServerPeekCursor::ServerPeekCursor(Referenceresults.minKnownCommittedVersion = 0; DisabledTraceEvent(SevDebug, "SPC_Starting", randomID) .detail("Tag", tag.toString()) + .detail("UsePeekStream", usePeekStream) .detail("Begin", begin) .detail("End", end); } diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 749f63949a..4f6c9f17e7 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -567,6 +567,8 @@ struct LogData : NonCopyable, public ReferenceCounted { TLogData* tLogData; Promise recoveryComplete, committingQueue; Version unrecoveredBefore, recoveredAt; + Version recoveryTxnVersion; + Promise recoveryTxnReceived; struct PeekTrackerData { std::map>> @@ -646,10 +648,11 @@ struct LogData : NonCopyable, public ReferenceCounted { blockingPeekTimeouts("BlockingPeekTimeouts", cc), emptyPeeks("EmptyPeeks", cc), nonEmptyPeeks("NonEmptyPeeks", cc), logId(interf.id()), protocolVersion(protocolVersion), newPersistentDataVersion(invalidVersion), tLogData(tLogData), unrecoveredBefore(1), recoveredAt(1), - logSystem(new AsyncVar>()), remoteTag(remoteTag), isPrimary(isPrimary), - logRouterTags(logRouterTags), logRouterPoppedVersion(0), logRouterPopToVersion(0), locality(tagLocalityInvalid), - recruitmentID(recruitmentID), logSpillType(logSpillType), allTags(tags.begin(), tags.end()), - terminated(tLogData->terminated.getFuture()), execOpCommitInProgress(false), txsTags(txsTags) { + recoveryTxnVersion(1), logSystem(new AsyncVar>()), remoteTag(remoteTag), + isPrimary(isPrimary), logRouterTags(logRouterTags), logRouterPoppedVersion(0), logRouterPopToVersion(0), + locality(tagLocalityInvalid), recruitmentID(recruitmentID), logSpillType(logSpillType), + allTags(tags.begin(), tags.end()), terminated(tLogData->terminated.getFuture()), execOpCommitInProgress(false), + txsTags(txsTags) { startRole(Role::TRANSACTION_LOG, interf.id(), tLogData->workerID, @@ -1565,7 +1568,7 @@ Version poppedVersion(Reference self, Tag tag) { if (tag == txsTag || tag.locality == tagLocalityTxs) { return 0; } - return self->recoveredAt + 1; + return std::max(self->recoveredAt + 1, self->recoveryTxnVersion); } return tagData->popped; } @@ -1743,12 +1746,24 @@ Future tLogPeekMessages(PromiseType replyPromise, return Void(); } - //TraceEvent("TLogPeekMessages0", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", 
self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2); + DisabledTraceEvent("TLogPeekMessages0", self->dbgid) + .detail("LogId", logData->logId) + .detail("Tag", reqTag.toString()) + .detail("ReqBegin", reqBegin) + .detail("Version", logData->version.get()) + .detail("RecoveredAt", logData->recoveredAt); // Wait until we have something to return that the caller doesn't already have if (logData->version.get() < reqBegin) { wait(logData->version.whenAtLeast(reqBegin)); wait(delay(SERVER_KNOBS->TLOG_PEEK_DELAY, g_network->getCurrentTask())); } + if (!logData->stopped && reqTag.locality != tagLocalityTxs && reqTag != txsTag) { + // Make sure the peek reply has the recovery txn for the current TLog. + // Older generation TLog has been stopped and doesn't wait here. + // Similarly during recovery, reading transaction state store + // doesn't wait here. + wait(logData->recoveryTxnReceived.getFuture()); + } if (logData->locality != tagLocalitySatellite && reqTag.locality == tagLocalityLogRouter) { wait(self->concurrentLogRouterReads.take()); @@ -1788,6 +1803,11 @@ Future tLogPeekMessages(PromiseType replyPromise, poppedVer = poppedVersion(logData, reqTag); } + DisabledTraceEvent("TLogPeekMessages1", self->dbgid) + .detail("LogId", logData->logId) + .detail("Tag", reqTag.toString()) + .detail("ReqBegin", reqBegin) + .detail("PoppedVer", poppedVer); if (poppedVer > reqBegin) { TLogPeekReply rep; rep.maxKnownVersion = logData->version.get(); @@ -1832,7 +1852,9 @@ Future tLogPeekMessages(PromiseType replyPromise, onlySpilled = false; // grab messages from disk - //TraceEvent("TLogPeekMessages", self->dbgid).detail("ReqBeginEpoch", reqBegin.epoch).detail("ReqBeginSeq", reqBegin.sequence).detail("Epoch", self->epoch()).detail("PersistentDataSeq", self->persistentDataSequence).detail("Tag1", reqTag1).detail("Tag2", reqTag2); + DisabledTraceEvent("TLogPeekMessages2", self->dbgid) + .detail("ReqBegin", reqBegin) + .detail("Tag", reqTag.toString()); if (reqBegin <= logData->persistentDataDurableVersion) { // Just in case the durable version changes while we are waiting for the read, we grab this data from // memory. We may or may not actually send it depending on whether we get enough data from disk. 
SOMEDAY: @@ -1993,13 +2015,12 @@ Future tLogPeekMessages(PromiseType replyPromise, reply.end = endVersion; reply.onlySpilled = onlySpilled; - // TraceEvent("TlogPeek", self->dbgid) - // .detail("LogId", logData->logId) - // .detail("Tag", req.tag.toString()) - // .detail("BeginVer", req.begin) - // .detail("EndVer", reply.end) - // .detail("MsgBytes", reply.messages.expectedSize()) - // .detail("ForAddress", req.reply.getEndpoint().getPrimaryAddress()); + DisabledTraceEvent("TLogPeekMessages4", self->dbgid) + .detail("LogId", logData->logId) + .detail("Tag", reqTag.toString()) + .detail("ReqBegin", reqBegin) + .detail("EndVer", reply.end) + .detail("MsgBytes", reply.messages.expectedSize()); if (reqSequence.present()) { auto& trackerData = logData->peekTracker[peekId]; @@ -2221,6 +2242,9 @@ ACTOR Future tLogCommit(TLogData* self, g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.BeforeWaitForVersion"); } + if (req.prevVersion == logData->recoveredAt) { + logData->recoveryTxnVersion = req.version; + } logData->minKnownCommittedVersion = std::max(logData->minKnownCommittedVersion, req.minKnownCommittedVersion); wait(logData->version.whenAtLeast(req.prevVersion)); @@ -2274,6 +2298,15 @@ ACTOR Future tLogCommit(TLogData* self, } // Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors logData->version.set(req.version); + if (logData->recoveryTxnReceived.canBeSet() && + (req.prevVersion == 0 || req.prevVersion == logData->recoveredAt)) { + TraceEvent("TLogInfo", self->dbgid) + .detail("Log", logData->logId) + .detail("Prev", req.prevVersion) + .detail("RecoveredAt", logData->recoveredAt) + .detail("RecoveryTxnVersion", req.version); + logData->recoveryTxnReceived.send(Void()); + } if (SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) { self->unknownCommittedVersions.push_front(std::make_tuple(req.version, req.tLogCount)); while (!self->unknownCommittedVersions.empty() && @@ -2777,6 +2810,7 @@ ACTOR Future pullAsyncData(TLogData* self, state Version ver = 0; state std::vector messages; + state bool pullingRecoveryData = endVersion.present() && endVersion.get() == logData->recoveredAt; loop { state bool foundMessage = r->hasMessage(); if (!foundMessage || r->version().version != ver) { @@ -2814,6 +2848,13 @@ ACTOR Future pullAsyncData(TLogData* self, // Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages // actors logData->version.set(ver); + if (logData->recoveryTxnReceived.canBeSet() && !pullingRecoveryData && ver > logData->recoveredAt) { + TraceEvent("TLogInfo", self->dbgid) + .detail("Log", logData->logId) + .detail("RecoveredAt", logData->recoveredAt) + .detail("RecoveryTxnVersion", ver); + logData->recoveryTxnReceived.send(Void()); + } wait(yield(TaskPriority::TLogCommit)); } lastVer = ver; diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 3631affc4d..9afc0b5b6b 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -6802,7 +6802,9 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { } data->tlogCursorReadsLatencyHistogram->sampleSeconds(now() - beforeTLogCursorReads); if (cursor->popped() > 0) { - TraceEvent("StorageServerWorkerRemoved", data->thisServerID).detail("Reason", "PeekPoppedTLogData"); + TraceEvent("StorageServerWorkerRemoved", data->thisServerID) + .detail("Reason", "PeekPoppedTLogData") + .detail("Version", cursor->popped()); throw worker_removed(); } From 
90dae38d04bc51240c0a5d5010a65ac8f434cccc Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Mon, 2 May 2022 18:22:59 -0700 Subject: [PATCH 097/299] Update RYWIterator test to match #6993 (#7046) There's a test which checks behavior against a reference implementation, and so the reference implementation needs to be updated as well. --- fdbclient/RYWIterator.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fdbclient/RYWIterator.cpp b/fdbclient/RYWIterator.cpp index 35ffb32cb0..949f164485 100644 --- a/fdbclient/RYWIterator.cpp +++ b/fdbclient/RYWIterator.cpp @@ -676,7 +676,10 @@ TEST_CASE("/fdbclient/WriteMap/random") { KeyRef key = RandomTestImpl::getRandomKey(arena); ValueRef value = RandomTestImpl::getRandomValue(arena); writes.mutate(key, MutationRef::SetVersionstampedValue, value, addConflict); - setMap[key].push(RYWMutation(value, MutationRef::SetVersionstampedValue)); + if (unreadableMap[key]) + setMap[key].push(RYWMutation(value, MutationRef::SetVersionstampedValue)); + else + setMap[key] = OperationStack(RYWMutation(value, MutationRef::SetVersionstampedValue)); if (addConflict) conflictMap.insert(key, true); clearMap.insert(key, false); From 9a279c24aeef9572962f8c04e1c90a033b6cacb2 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 2 May 2022 19:26:44 -0700 Subject: [PATCH 098/299] Bug fix: Redwood shutdown would wait for pending IO success so if any of them failed the shutdown would never complete. --- fdbserver/VersionedBTree.actor.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 9b4b177c2c..6eb2ac0972 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -2108,7 +2108,9 @@ public: return !reading() && !writing(); } - Future onEvictable() const { return ready(readFuture) && writeFuture; } + // Entry is evictable when its write and read futures are ready, even if they are + // errors, so any buffers they hold are no longer needed by the underlying file actors + Future onEvictable() const { return ready(readFuture) && ready(writeFuture); } }; typedef ObjectCache PageCacheT; @@ -3761,7 +3763,9 @@ public: // Must wait for pending operations to complete, canceling them can cause a crash because the underlying // operations may be uncancellable and depend on memory from calling scope's page reference debug_printf("DWALPager(%s) shutdown wait for operations\n", self->filename.c_str()); - wait(waitForAll(self->operations)); + + // Pending ops must be all ready, errors are okay + wait(waitForAllReady(self->operations)); self->operations.clear(); debug_printf("DWALPager(%s) shutdown destroy page cache\n", self->filename.c_str()); From 7f05221cfee9a6069d3341f780b88534ad4f4b56 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Mon, 2 May 2022 22:15:27 -0700 Subject: [PATCH 099/299] Removed TLS_DISABLED macro --- fdbbackup/FileConverter.h | 2 - fdbbackup/FileDecoder.actor.cpp | 7 +- fdbbackup/backup.actor.cpp | 122 ++++-------------- fdbcli/fdbcli.actor.cpp | 16 +-- .../BackupContainerS3BlobStore.actor.cpp | 2 +- fdbclient/md5/md5.c | 2 +- fdbclient/md5/md5.h | 2 +- fdbserver/fdbserver.actor.cpp | 8 -- fdbserver/workloads/UnitTests.actor.cpp | 4 +- flow/BlobCipher.h | 8 -- flow/Net2.actor.cpp | 18 --- flow/Platform.actor.cpp | 4 +- flow/StreamCipher.h | 8 -- flow/TLSConfig.actor.cpp | 9 -- flow/TLSConfig.actor.h | 7 - 15 files changed, 32 insertions(+), 187 deletions(-) diff --git a/fdbbackup/FileConverter.h 
b/fdbbackup/FileConverter.h index 0aa1d105a6..251f8d004a 100644 --- a/fdbbackup/FileConverter.h +++ b/fdbbackup/FileConverter.h @@ -65,9 +65,7 @@ CSimpleOpt::SOption gConverterOptions[] = { { OPT_CONTAINER, "-r", SO_REQ_SEP }, { OPT_INPUT_FILE, "-i", SO_REQ_SEP }, { OPT_INPUT_FILE, "--input", SO_REQ_SEP }, { OPT_BLOB_CREDENTIALS, "--blob-credentials", SO_REQ_SEP }, -#ifndef TLS_DISABLED TLS_OPTION_FLAGS -#endif { OPT_BUILD_FLAGS, "--build-flags", SO_NONE }, { OPT_LIST_ONLY, "--list-only", SO_NONE }, { OPT_KEY_PREFIX, "-k", SO_REQ_SEP }, diff --git a/fdbbackup/FileDecoder.actor.cpp b/fdbbackup/FileDecoder.actor.cpp index 71f6932598..2ad7a55df2 100644 --- a/fdbbackup/FileDecoder.actor.cpp +++ b/fdbbackup/FileDecoder.actor.cpp @@ -75,10 +75,7 @@ void printDecodeUsage() { " --crash Crash on serious error.\n" " --blob-credentials FILE\n" " File containing blob credentials in JSON format.\n" - " The same credential format/file fdbbackup uses.\n" -#ifndef TLS_DISABLED - TLS_HELP -#endif + " The same credential format/file fdbbackup uses.\n" TLS_HELP " --build-flags Print build information and exit.\n" " --list-only Print file list and exit.\n" " -k KEY_PREFIX Use the prefix for filtering mutations\n" @@ -302,7 +299,6 @@ int parseDecodeCommandLine(DecodeParams* param, CSimpleOpt* args) { param->save_file_locally = true; break; -#ifndef TLS_DISABLED case TLSConfig::OPT_TLS_PLUGIN: args->OptionArg(); break; @@ -326,7 +322,6 @@ int parseDecodeCommandLine(DecodeParams* param, CSimpleOpt* args) { case TLSConfig::OPT_TLS_VERIFY_PEERS: param->tlsConfig.tlsVerifyPeers = args->OptionArg(); break; -#endif case OPT_BUILD_FLAGS: printBuildInformation(); diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 40ca160f3e..03f572e340 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -220,10 +220,7 @@ CSimpleOpt::SOption g_rgAgentOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_BLOB_CREDENTIALS, "--blob-credentials", SO_REQ_SEP }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupStartOptions[] = { @@ -269,10 +266,7 @@ CSimpleOpt::SOption g_rgBackupStartOptions[] = { { OPT_BLOB_CREDENTIALS, "--blob-credentials", SO_REQ_SEP }, { OPT_INCREMENTALONLY, "--incremental", SO_NONE }, { OPT_ENCRYPTION_KEY_FILE, "--encryption-key-file", SO_REQ_SEP }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupModifyOptions[] = { @@ -335,10 +329,7 @@ CSimpleOpt::SOption g_rgBackupStatusOptions[] = { { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_JSON, "--json", SO_NONE }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupAbortOptions[] = { @@ -364,10 +355,7 @@ CSimpleOpt::SOption g_rgBackupAbortOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupCleanupOptions[] = { @@ -393,10 +381,7 @@ CSimpleOpt::SOption g_rgBackupCleanupOptions[] = { { OPT_KNOB, "--knob-", SO_REQ_SEP }, { OPT_DELETE_DATA, "--delete-data", SO_NONE }, { OPT_MIN_CLEANUP_SECONDS, "--min-cleanup-seconds", SO_REQ_SEP }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - 
SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupDiscontinueOptions[] = { @@ -424,10 +409,7 @@ CSimpleOpt::SOption g_rgBackupDiscontinueOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupWaitOptions[] = { @@ -455,10 +437,7 @@ CSimpleOpt::SOption g_rgBackupWaitOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupPauseOptions[] = { @@ -482,10 +461,7 @@ CSimpleOpt::SOption g_rgBackupPauseOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupExpireOptions[] = { @@ -521,10 +497,7 @@ CSimpleOpt::SOption g_rgBackupExpireOptions[] = { { OPT_EXPIRE_BEFORE_DATETIME, "--expire-before-timestamp", SO_REQ_SEP }, { OPT_EXPIRE_MIN_RESTORABLE_DAYS, "--min-restorable-days", SO_REQ_SEP }, { OPT_EXPIRE_DELETE_BEFORE_DAYS, "--delete-before-days", SO_REQ_SEP }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupDeleteOptions[] = { @@ -550,10 +523,7 @@ CSimpleOpt::SOption g_rgBackupDeleteOptions[] = { { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_BLOB_CREDENTIALS, "--blob-credentials", SO_REQ_SEP }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupDescribeOptions[] = { @@ -584,10 +554,7 @@ CSimpleOpt::SOption g_rgBackupDescribeOptions[] = { { OPT_DESCRIBE_DEEP, "--deep", SO_NONE }, { OPT_DESCRIBE_TIMESTAMPS, "--version-timestamps", SO_NONE }, { OPT_JSON, "--json", SO_NONE }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupDumpOptions[] = { @@ -616,10 +583,7 @@ CSimpleOpt::SOption g_rgBackupDumpOptions[] = { { OPT_KNOB, "--knob-", SO_REQ_SEP }, { OPT_DUMP_BEGIN, "--begin", SO_REQ_SEP }, { OPT_DUMP_END, "--end", SO_REQ_SEP }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupTagsOptions[] = { @@ -634,10 +598,7 @@ CSimpleOpt::SOption g_rgBackupTagsOptions[] = { { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_QUIET, "-q", SO_NONE }, { OPT_QUIET, "--quiet", SO_NONE }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupListOptions[] = { @@ -662,10 +623,7 @@ CSimpleOpt::SOption g_rgBackupListOptions[] = { { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_BLOB_CREDENTIALS, "--blob-credentials", SO_REQ_SEP }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupQueryOptions[] = { @@ -698,10 +656,7 @@ CSimpleOpt::SOption g_rgBackupQueryOptions[] = { { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_BLOB_CREDENTIALS, "--blob-credentials", SO_REQ_SEP }, { OPT_KNOB, 
"--knob-", SO_REQ_SEP }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; // g_rgRestoreOptions is used by fdbrestore and fastrestore_tool @@ -747,10 +702,7 @@ CSimpleOpt::SOption g_rgRestoreOptions[] = { { OPT_RESTORE_BEGIN_VERSION, "--begin-version", SO_REQ_SEP }, { OPT_RESTORE_INCONSISTENT_SNAPSHOT_ONLY, "--inconsistent-snapshot-only", SO_NONE }, { OPT_ENCRYPTION_KEY_FILE, "--encryption-key-file", SO_REQ_SEP }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgDBAgentOptions[] = { @@ -780,10 +732,7 @@ CSimpleOpt::SOption g_rgDBAgentOptions[] = { { OPT_HELP, "-h", SO_NONE }, { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgDBStartOptions[] = { @@ -813,10 +762,7 @@ CSimpleOpt::SOption g_rgDBStartOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgDBStatusOptions[] = { @@ -846,10 +792,7 @@ CSimpleOpt::SOption g_rgDBStatusOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgDBSwitchOptions[] = { @@ -878,10 +821,7 @@ CSimpleOpt::SOption g_rgDBSwitchOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgDBAbortOptions[] = { @@ -911,10 +851,7 @@ CSimpleOpt::SOption g_rgDBAbortOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgDBPauseOptions[] = { @@ -940,10 +877,7 @@ CSimpleOpt::SOption g_rgDBPauseOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; const KeyRef exeAgent = LiteralStringRef("backup_agent"); @@ -1017,9 +951,7 @@ static void printAgentUsage(bool devhelp) { printf(" -m SIZE, --memory SIZE\n" " Memory limit. The default value is 8GiB. 
When specified\n" " without a unit, MiB is assumed.\n"); -#ifndef TLS_DISABLED printf(TLS_HELP); -#endif printf(" --build-flags Print build information and exit.\n"); printf(" -v, --version Print version information and exit.\n"); printf(" -h, --help Display this help and exit.\n"); @@ -1147,9 +1079,7 @@ static void printBackupUsage(bool devhelp) { "and ignore the range files.\n"); printf(" --encryption-key-file" " The AES-128-GCM key in the provided file is used for encrypting backup files.\n"); -#ifndef TLS_DISABLED printf(TLS_HELP); -#endif printf(" -w, --wait Wait for the backup to complete (allowed with `start' and `discontinue').\n"); printf(" -z, --no-stop-when-done\n" " Do not stop backup when restorable.\n"); @@ -1222,9 +1152,7 @@ static void printRestoreUsage(bool devhelp) { "instead of the entire set.\n"); printf(" --encryption-key-file" " The AES-128-GCM key in the provided file is used for decrypting backup files.\n"); -#ifndef TLS_DISABLED printf(TLS_HELP); -#endif printf(" -v DBVERSION The version at which the database will be restored.\n"); printf(" --timestamp Instead of a numeric version, use this to specify a timestamp in %s\n", BackupAgentBase::timeFormat().c_str()); @@ -1281,9 +1209,7 @@ static void printDBAgentUsage(bool devhelp) { printf(" -m, --memory SIZE\n" " Memory limit. The default value is 8GiB. When specified\n" " without a unit, MiB is assumed.\n"); -#ifndef TLS_DISABLED printf(TLS_HELP); -#endif printf(" --build-flags Print build information and exit.\n"); printf(" -v, --version Print version information and exit.\n"); printf(" -h, --help Display this help and exit.\n"); @@ -1322,9 +1248,7 @@ static void printDBBackupUsage(bool devhelp) { " If not specified, the entire database will be backed up.\n"); printf(" --cleanup Abort will attempt to stop mutation logging on the source cluster.\n"); printf(" --dstonly Abort will not make any changes on the source cluster.\n"); -#ifndef TLS_DISABLED printf(TLS_HELP); -#endif printf(" --log Enables trace file logging for the CLI session.\n" " --logdir PATH Specifes the output directory for trace files. If\n" " unspecified, defaults to the current directory. Has\n" @@ -3793,7 +3717,6 @@ int main(int argc, char* argv[]) { case OPT_BLOB_CREDENTIALS: tlsConfig.blobCredentials.push_back(args->OptionArg()); break; -#ifndef TLS_DISABLED case TLSConfig::OPT_TLS_PLUGIN: args->OptionArg(); break; @@ -3812,7 +3735,6 @@ int main(int argc, char* argv[]) { case TLSConfig::OPT_TLS_VERIFY_PEERS: tlsConfig.tlsVerifyPeers = args->OptionArg(); break; -#endif case OPT_DUMP_BEGIN: dumpBegin = parseVersion(args->OptionArg()); break; diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 5f5f7d25fc..af920e63ba 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -125,12 +125,7 @@ CSimpleOpt::SOption g_rgOptions[] = { { OPT_CONNFILE, "-C", SO_REQ_SEP }, { OPT_DEBUG_TLS, "--debug-tls", SO_NONE }, { OPT_API_VERSION, "--api-version", SO_REQ_SEP }, { OPT_MEMORY, "--memory", SO_REQ_SEP }, - -#ifndef TLS_DISABLED - TLS_OPTION_FLAGS -#endif - - SO_END_OF_OPTIONS }; + TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; void printAtCol(const char* text, int col, FILE* stream = stdout) { const char* iter = text; @@ -449,9 +444,7 @@ static void printProgramUsage(const char* name) { " the CLI.\n" " --api-version APIVERSION\n" " Specifies the version of the API for the CLI to use.\n" -#ifndef TLS_DISABLED TLS_HELP -#endif " --knob-KNOBNAME KNOBVALUE\n" " Changes a knob option. 
KNOBNAME should be lowercase.\n" " --debug-tls Prints the TLS configuration and certificate chain, then exits.\n" @@ -1091,7 +1084,6 @@ struct CLIOptions { case OPT_NO_HINTS: cliHints = false; -#ifndef TLS_DISABLED // TLS Options case TLSConfig::OPT_TLS_PLUGIN: args.OptionArg(); @@ -1111,7 +1103,7 @@ struct CLIOptions { case TLSConfig::OPT_TLS_VERIFY_PEERS: tlsVerifyPeers = args.OptionArg(); break; -#endif + case OPT_HELP: printProgramUsage(program_name.c_str()); return 0; @@ -2224,7 +2216,6 @@ int main(int argc, char** argv) { } if (opt.debugTLS) { -#ifndef TLS_DISABLED // Backdoor into NativeAPI's tlsConfig, which is where the above network option settings ended up. extern TLSConfig tlsConfig; printf("TLS Configuration:\n"); @@ -2241,9 +2232,6 @@ int main(int argc, char** argv) { printf("Use --log and look at the trace logs for more detailed information on the failure.\n"); return 1; } -#else - printf("This fdbcli was built with TLS disabled.\n"); -#endif return 0; } diff --git a/fdbclient/BackupContainerS3BlobStore.actor.cpp b/fdbclient/BackupContainerS3BlobStore.actor.cpp index 2240cc6741..58a436341a 100644 --- a/fdbclient/BackupContainerS3BlobStore.actor.cpp +++ b/fdbclient/BackupContainerS3BlobStore.actor.cpp @@ -20,7 +20,7 @@ #include "fdbclient/AsyncFileS3BlobStore.actor.h" #include "fdbclient/BackupContainerS3BlobStore.h" -#if (!defined(TLS_DISABLED) && !defined(_WIN32)) +#if (!defined(_WIN32)) #include "fdbrpc/AsyncFileEncrypted.h" #endif #include "fdbrpc/AsyncFileReadAhead.actor.h" diff --git a/fdbclient/md5/md5.c b/fdbclient/md5/md5.c index 03810862e3..4c3b21ce97 100644 --- a/fdbclient/md5/md5.c +++ b/fdbclient/md5/md5.c @@ -35,7 +35,7 @@ * compile-time configuration. */ -#if !defined(HAVE_OPENSSL) || defined(TLS_DISABLED) +#if !defined(HAVE_OPENSSL) #include diff --git a/fdbclient/md5/md5.h b/fdbclient/md5/md5.h index 71b1b0456b..09147eb7f1 100644 --- a/fdbclient/md5/md5.h +++ b/fdbclient/md5/md5.h @@ -23,7 +23,7 @@ * See md5.c for more information. 
*/ -#if defined(HAVE_OPENSSL) && !defined(TLS_DISABLED) +#if defined(HAVE_OPENSSL) #if defined(HAVE_WOLFSSL) #include #endif diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index 1a2f244b0b..d3737536a2 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -204,11 +204,7 @@ CSimpleOpt::SOption g_rgOptions[] = { { OPT_FLOW_PROCESS_NAME, "--process-name", SO_REQ_SEP }, { OPT_FLOW_PROCESS_ENDPOINT, "--process-endpoint", SO_REQ_SEP }, { OPT_IP_TRUSTED_MASK, "--trusted-subnet-", SO_REQ_SEP }, - -#ifndef TLS_DISABLED TLS_OPTION_FLAGS -#endif - SO_END_OF_OPTIONS }; @@ -662,9 +658,7 @@ static void printUsage(const char* name, bool devhelp) { " collector -- None or FluentD (FluentD requires collector_endpoint to be set)\n" " collector_endpoint -- IP:PORT of the fluentd server\n" " collector_protocol -- UDP or TCP (default is UDP)"); -#ifndef TLS_DISABLED printf("%s", TLS_HELP); -#endif printOptionUsage("-v, --version", "Print version information and exit."); printOptionUsage("-h, -?, --help", "Display this help and exit."); if (devhelp) { @@ -1621,7 +1615,6 @@ private: printSimTime = true; break; -#ifndef TLS_DISABLED case TLSConfig::OPT_TLS_PLUGIN: args.OptionArg(); break; @@ -1640,7 +1633,6 @@ private: case TLSConfig::OPT_TLS_VERIFY_PEERS: tlsConfig.addVerifyPeers(args.OptionArg()); break; -#endif } } diff --git a/fdbserver/workloads/UnitTests.actor.cpp b/fdbserver/workloads/UnitTests.actor.cpp index 96433181f5..1545f78ab8 100644 --- a/fdbserver/workloads/UnitTests.actor.cpp +++ b/fdbserver/workloads/UnitTests.actor.cpp @@ -28,7 +28,7 @@ void forceLinkFlowTests(); void forceLinkVersionedMapTests(); void forceLinkMemcpyTests(); void forceLinkMemcpyPerfTests(); -#if (!defined(TLS_DISABLED) && !defined(_WIN32)) +#if (!defined(_WIN32)) void forceLinkStreamCipherTests(); void forceLinkBlobCipherTests(); #endif @@ -79,7 +79,7 @@ struct UnitTestWorkload : TestWorkload { forceLinkVersionedMapTests(); forceLinkMemcpyTests(); forceLinkMemcpyPerfTests(); -#if (!defined(TLS_DISABLED) && !defined(_WIN32)) +#if (!defined(_WIN32)) forceLinkStreamCipherTests(); void forceLinkBlobCipherTests(); #endif diff --git a/flow/BlobCipher.h b/flow/BlobCipher.h index 624762caaa..e50d4caab1 100644 --- a/flow/BlobCipher.h +++ b/flow/BlobCipher.h @@ -25,13 +25,7 @@ #include #include -#if (!defined(TLS_DISABLED)) #define ENCRYPTION_ENABLED 1 -#else -#define ENCRYPTION_ENABLED 0 -#endif - -#if ENCRYPTION_ENABLED #include "flow/Arena.h" #include "flow/EncryptUtils.h" @@ -468,5 +462,3 @@ StringRef computeAuthToken(const uint8_t* payload, const uint8_t* key, const int keyLen, Arena& arena); - -#endif // ENCRYPTION_ENABLED diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index f83aac02c5..0778fefde8 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -30,12 +30,10 @@ #define BOOST_DATE_TIME_NO_LIB #define BOOST_REGEX_NO_LIB #include -#ifndef TLS_DISABLED #if defined(HAVE_WOLFSSL) #include #endif #include "boost/asio/ssl.hpp" -#endif #include #include #include @@ -239,12 +237,10 @@ public: // private: ASIOReactor reactor; -#ifndef TLS_DISABLED AsyncVar>> sslContextVar; Reference sslHandshakerPool; int sslHandshakerThreadsStarted; int sslPoolHandshakesInProgress; -#endif TLSConfig tlsConfig; Future backgroundCertRefresh; ETLSInitState tlsInitializedState; @@ -379,14 +375,12 @@ public: { TraceEvent evt(SevWarn, errContext, errID); evt.suppressFor(1.0).detail("ErrorCode", error.value()).detail("Message", error.message()); -#ifndef TLS_DISABLED // There is no function 
in OpenSSL to use to check if an error code is from OpenSSL, // but all OpenSSL errors have a non-zero "library" code set in bits 24-32, and linux // error codes should never go that high. if (error.value() >= (1 << 24L)) { evt.detail("WhichMeans", TLSPolicy::ErrorString(error)); } -#endif } p.sendError(connection_failed()); @@ -793,7 +787,6 @@ private: } }; -#ifndef TLS_DISABLED typedef boost::asio::ssl::stream ssl_socket; struct SSLHandshakerThread final : IThreadPoolReceiver { @@ -1197,7 +1190,6 @@ private: } } }; -#endif struct PromiseTask final : public Task, public FastAllocated { Promise promise; @@ -1214,11 +1206,9 @@ struct PromiseTask final : public Task, public FastAllocated { Net2::Net2(const TLSConfig& tlsConfig, bool useThreadPool, bool useMetrics) : globals(enumGlobal::COUNT), useThreadPool(useThreadPool), reactor(this), -#ifndef TLS_DISABLED sslContextVar({ ReferencedObject::from( boost::asio::ssl::context(boost::asio::ssl::context::tls)) }), sslHandshakerThreadsStarted(0), sslPoolHandshakesInProgress(0), -#endif tlsConfig(tlsConfig), tlsInitializedState(ETLSInitState::NONE), network(this), tscBegin(0), tscEnd(0), taskBegin(0), currentTaskID(TaskPriority::DefaultYield), tasksIssued(0), stopped(false), started(false), numYields(0), lastPriorityStats(nullptr), ready(FLOW_KNOBS->READY_QUEUE_RESERVED_SIZE) { @@ -1243,7 +1233,6 @@ Net2::Net2(const TLSConfig& tlsConfig, bool useThreadPool, bool useMetrics) updateNow(); } -#ifndef TLS_DISABLED ACTOR static Future watchFileForChanges(std::string filename, AsyncTrigger* fileChanged) { if (filename == "") { return Never(); @@ -1320,13 +1309,11 @@ ACTOR static Future reloadCertificatesOnChange( } } } -#endif void Net2::initTLS(ETLSInitState targetState) { if (tlsInitializedState >= targetState) { return; } -#ifndef TLS_DISABLED // Any target state must be higher than NONE so if the current state is NONE // then initialize the TLS config if (tlsInitializedState == ETLSInitState::NONE) { @@ -1380,7 +1367,6 @@ void Net2::initTLS(ETLSInitState targetState) { } } } -#endif tlsInitializedState = targetState; } @@ -1834,12 +1820,10 @@ THREAD_HANDLE Net2::startThread(THREAD_FUNC_RETURN (*func)(void*), void* arg, in } Future> Net2::connect(NetworkAddress toAddr, const std::string& host) { -#ifndef TLS_DISABLED if (toAddr.isTLS()) { initTLS(ETLSInitState::CONNECT); return SSLConnection::connect(&this->reactor.ios, this->sslContextVar.get(), toAddr); } -#endif return Connection::connect(&this->reactor.ios, toAddr); } @@ -1979,12 +1963,10 @@ bool Net2::isAddressOnThisHost(NetworkAddress const& addr) const { Reference Net2::listen(NetworkAddress localAddr) { try { -#ifndef TLS_DISABLED if (localAddr.isTLS()) { initTLS(ETLSInitState::LISTEN); return Reference(new SSLListener(reactor.ios, &this->sslContextVar, localAddr)); } -#endif return Reference(new Listener(reactor.ios, localAddr)); } catch (boost::system::system_error const& e) { Error x; diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp index 0cff826103..d29aea5c22 100644 --- a/flow/Platform.actor.cpp +++ b/flow/Platform.actor.cpp @@ -31,7 +31,7 @@ #include "flow/Platform.actor.h" #include "flow/Arena.h" -#if (!defined(TLS_DISABLED) && !defined(_WIN32)) +#if (!defined(_WIN32)) #include "flow/StreamCipher.h" #include "flow/BlobCipher.h" #endif @@ -3552,7 +3552,7 @@ void crashHandler(int sig) { bool error = (sig != SIGUSR2); -#if (!defined(TLS_DISABLED) && !defined(_WIN32)) +#if (!defined(_WIN32)) StreamCipherKey::cleanup(); StreamCipher::cleanup(); BlobCipherKeyCache::cleanup(); 
diff --git a/flow/StreamCipher.h b/flow/StreamCipher.h index 3e1fe10fe0..c6ed26e563 100644 --- a/flow/StreamCipher.h +++ b/flow/StreamCipher.h @@ -20,13 +20,7 @@ #pragma once -#if (!defined(TLS_DISABLED)) #define ENCRYPTION_ENABLED 1 -#else -#define ENCRYPTION_ENABLED 0 -#endif - -#if ENCRYPTION_ENABLED #include "flow/Arena.h" #include "flow/FastRef.h" @@ -117,5 +111,3 @@ public: }; void applyHmacKeyDerivationFunc(StreamCipherKey* cipherKey, HmacSha256StreamCipher* hmacGenerator, Arena& arena); - -#endif // ENCRYPTION_ENABLED diff --git a/flow/TLSConfig.actor.cpp b/flow/TLSConfig.actor.cpp index c7cad7945d..4d1ed9cf85 100644 --- a/flow/TLSConfig.actor.cpp +++ b/flow/TLSConfig.actor.cpp @@ -25,14 +25,6 @@ // To force typeinfo to only be emitted once. TLSPolicy::~TLSPolicy() {} -#ifdef TLS_DISABLED - -void LoadedTLSConfig::print(FILE* fp) { - fprintf(fp, "Cannot print LoadedTLSConfig. TLS support is not enabled.\n"); -} - -#else // TLS is enabled - #include #include #include @@ -837,4 +829,3 @@ bool TLSPolicy::verify_peer(bool preverified, X509_STORE_CTX* store_ctx) { } return rc; } -#endif diff --git a/flow/TLSConfig.actor.h b/flow/TLSConfig.actor.h index ff3e670f49..ec1e0e7c64 100644 --- a/flow/TLSConfig.actor.h +++ b/flow/TLSConfig.actor.h @@ -37,8 +37,6 @@ #include "flow/Knobs.h" #include "flow/flow.h" -#ifndef TLS_DISABLED - #if defined(HAVE_WOLFSSL) #include #endif @@ -72,7 +70,6 @@ struct Criteria { return criteria == c.criteria && match_type == c.match_type && location == c.location; } }; -#endif #include "flow/actorcompiler.h" // This must be the last #include. @@ -204,7 +201,6 @@ private: TLSEndpointType endpointType = TLSEndpointType::UNSET; }; -#ifndef TLS_DISABLED namespace boost { namespace asio { namespace ssl { @@ -216,7 +212,6 @@ void ConfigureSSLContext( const LoadedTLSConfig& loaded, boost::asio::ssl::context* context, std::function onPolicyFailure = []() {}); -#endif class TLSPolicy : ReferenceCounted { public: @@ -226,7 +221,6 @@ public: virtual void addref() { ReferenceCounted::addref(); } virtual void delref() { ReferenceCounted::delref(); } -#ifndef TLS_DISABLED static std::string ErrorString(boost::system::error_code e); void set_verify_peers(std::vector verify_peers); @@ -248,7 +242,6 @@ public: }; std::vector rules; -#endif bool is_client; }; From 475d66084db63ca4921272362501770453cefe68 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Mon, 2 May 2022 22:26:31 -0700 Subject: [PATCH 100/299] Remove ENCRYPTION_ENABLED macro --- fdbclient/BackupContainerAzureBlobStore.actor.cpp | 2 -- fdbclient/BackupContainerFileSystem.actor.cpp | 10 ---------- fdbclient/BackupContainerS3BlobStore.actor.cpp | 4 ---- fdbrpc/AsyncFileEncrypted.h | 4 ---- fdbrpc/Net2FileSystem.cpp | 2 -- fdbrpc/sim2.actor.cpp | 2 -- fdbserver/workloads/EncryptionOps.actor.cpp | 4 ---- flow/BlobCipher.cpp | 4 ---- flow/BlobCipher.h | 2 -- flow/StreamCipher.h | 2 -- 10 files changed, 36 deletions(-) diff --git a/fdbclient/BackupContainerAzureBlobStore.actor.cpp b/fdbclient/BackupContainerAzureBlobStore.actor.cpp index 1ec987fc56..0c3ac8ba02 100644 --- a/fdbclient/BackupContainerAzureBlobStore.actor.cpp +++ b/fdbclient/BackupContainerAzureBlobStore.actor.cpp @@ -213,9 +213,7 @@ public: // Hack to get around the fact that macros don't work inside actor functions static Reference encryptFile(Reference const& f, AsyncFileEncrypted::Mode mode) { Reference result = f; -#if ENCRYPTION_ENABLED result = makeReference(result, mode); -#endif return result; } diff --git 
a/fdbclient/BackupContainerFileSystem.actor.cpp b/fdbclient/BackupContainerFileSystem.actor.cpp index a4778ecc10..0314a918b3 100644 --- a/fdbclient/BackupContainerFileSystem.actor.cpp +++ b/fdbclient/BackupContainerFileSystem.actor.cpp @@ -1128,7 +1128,6 @@ public: return false; } -#if ENCRYPTION_ENABLED ACTOR static Future createTestEncryptionKeyFile(std::string filename) { state Reference keyFile = wait(IAsyncFileSystem::filesystem()->open( filename, @@ -1164,7 +1163,6 @@ public: ASSERT_EQ(bytesRead, cipherKey->size()); return Void(); } -#endif // ENCRYPTION_ENABLED }; // class BackupContainerFileSystemImpl @@ -1481,19 +1479,11 @@ Future BackupContainerFileSystem::encryptionSetupComplete() const { void BackupContainerFileSystem::setEncryptionKey(Optional const& encryptionKeyFileName) { if (encryptionKeyFileName.present()) { -#if ENCRYPTION_ENABLED encryptionSetupFuture = BackupContainerFileSystemImpl::readEncryptionKey(encryptionKeyFileName.get()); -#else - encryptionSetupFuture = Void(); -#endif } } Future BackupContainerFileSystem::createTestEncryptionKeyFile(std::string const& filename) { -#if ENCRYPTION_ENABLED return BackupContainerFileSystemImpl::createTestEncryptionKeyFile(filename); -#else - return Void(); -#endif } // Get a BackupContainerFileSystem based on a container URL string diff --git a/fdbclient/BackupContainerS3BlobStore.actor.cpp b/fdbclient/BackupContainerS3BlobStore.actor.cpp index 58a436341a..af3fb9d128 100644 --- a/fdbclient/BackupContainerS3BlobStore.actor.cpp +++ b/fdbclient/BackupContainerS3BlobStore.actor.cpp @@ -174,11 +174,9 @@ std::string BackupContainerS3BlobStore::getURLFormat() { Future> BackupContainerS3BlobStore::readFile(const std::string& path) { Reference f = makeReference(m_bstore, m_bucket, dataPath(path)); -#if ENCRYPTION_ENABLED if (usesEncryption()) { f = makeReference(f, AsyncFileEncrypted::Mode::READ_ONLY); } -#endif f = makeReference(f, m_bstore->knobs.read_block_size, m_bstore->knobs.read_ahead_blocks, @@ -194,11 +192,9 @@ Future> BackupContainerS3BlobStore::listURLs(Reference< Future> BackupContainerS3BlobStore::writeFile(const std::string& path) { Reference f = makeReference(m_bstore, m_bucket, dataPath(path)); -#if ENCRYPTION_ENABLED if (usesEncryption()) { f = makeReference(f, AsyncFileEncrypted::Mode::APPEND_ONLY); } -#endif return Future>(makeReference(path, f)); } diff --git a/fdbrpc/AsyncFileEncrypted.h b/fdbrpc/AsyncFileEncrypted.h index a01c32f8cf..36436c9127 100644 --- a/fdbrpc/AsyncFileEncrypted.h +++ b/fdbrpc/AsyncFileEncrypted.h @@ -26,8 +26,6 @@ #include "flow/IRandom.h" #include "flow/StreamCipher.h" -#if ENCRYPTION_ENABLED - #include /* @@ -81,5 +79,3 @@ public: void releaseZeroCopy(void* data, int length, int64_t offset) override; int64_t debugFD() const override; }; - -#endif // ENCRYPTION_ENABLED diff --git a/fdbrpc/Net2FileSystem.cpp b/fdbrpc/Net2FileSystem.cpp index b6460b172c..76c75f211b 100644 --- a/fdbrpc/Net2FileSystem.cpp +++ b/fdbrpc/Net2FileSystem.cpp @@ -79,14 +79,12 @@ Future> Net2FileSystem::open(const std::string& file f = map(f, [=](Reference r) { return Reference(new AsyncFileWriteChecker(r)); }); if (FLOW_KNOBS->ENABLE_CHAOS_FEATURES) f = map(f, [=](Reference r) { return Reference(new AsyncFileChaos(r)); }); -#if ENCRYPTION_ENABLED if (flags & IAsyncFile::OPEN_ENCRYPTED) f = map(f, [flags](Reference r) { auto mode = flags & IAsyncFile::OPEN_READWRITE ? 
AsyncFileEncrypted::Mode::APPEND_ONLY : AsyncFileEncrypted::Mode::READ_ONLY; return Reference(new AsyncFileEncrypted(r, mode)); }); -#endif // ENCRYPTION_ENABLED return f; } diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 065a35d110..f9e6dfab20 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -2594,14 +2594,12 @@ Future> Sim2FileSystem::open(const std::string& file f = map(f, [=](Reference r) { return Reference(new AsyncFileWriteChecker(r)); }); if (FLOW_KNOBS->ENABLE_CHAOS_FEATURES) f = map(f, [=](Reference r) { return Reference(new AsyncFileChaos(r)); }); -#if ENCRYPTION_ENABLED if (flags & IAsyncFile::OPEN_ENCRYPTED) f = map(f, [flags](Reference r) { auto mode = flags & IAsyncFile::OPEN_READWRITE ? AsyncFileEncrypted::Mode::APPEND_ONLY : AsyncFileEncrypted::Mode::READ_ONLY; return Reference(new AsyncFileEncrypted(r, mode)); }); -#endif // ENCRYPTION_ENABLED return f; } else return AsyncFileCached::open(filename, flags, mode); diff --git a/fdbserver/workloads/EncryptionOps.actor.cpp b/fdbserver/workloads/EncryptionOps.actor.cpp index 30567889bd..4062823c98 100644 --- a/fdbserver/workloads/EncryptionOps.actor.cpp +++ b/fdbserver/workloads/EncryptionOps.actor.cpp @@ -34,8 +34,6 @@ #include "flow/actorcompiler.h" // This must be the last #include. -#if ENCRYPTION_ENABLED - #define MEGA_BYTES (1024 * 1024) #define NANO_SECOND (1000 * 1000 * 1000) @@ -379,5 +377,3 @@ struct EncryptionOpsWorkload : TestWorkload { }; WorkloadFactory EncryptionOpsWorkloadFactory("EncryptionOps"); - -#endif // ENCRYPTION_ENABLED diff --git a/flow/BlobCipher.cpp b/flow/BlobCipher.cpp index c14fbacd26..d8895cea26 100644 --- a/flow/BlobCipher.cpp +++ b/flow/BlobCipher.cpp @@ -35,8 +35,6 @@ #include #include -#if ENCRYPTION_ENABLED - namespace { bool isEncryptHeaderAuthTokenModeValid(const EncryptAuthTokenMode mode) { return mode >= ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE && mode < ENCRYPT_HEADER_AUTH_TOKEN_LAST; @@ -1186,5 +1184,3 @@ TEST_CASE("flow/BlobCipher") { TraceEvent("BlobCipherTest_Done").log(); return Void(); } - -#endif // ENCRYPTION_ENABLED diff --git a/flow/BlobCipher.h b/flow/BlobCipher.h index e50d4caab1..1d7a8d8dee 100644 --- a/flow/BlobCipher.h +++ b/flow/BlobCipher.h @@ -25,8 +25,6 @@ #include #include -#define ENCRYPTION_ENABLED 1 - #include "flow/Arena.h" #include "flow/EncryptUtils.h" #include "flow/FastRef.h" diff --git a/flow/StreamCipher.h b/flow/StreamCipher.h index c6ed26e563..cc7df84e47 100644 --- a/flow/StreamCipher.h +++ b/flow/StreamCipher.h @@ -20,8 +20,6 @@ #pragma once -#define ENCRYPTION_ENABLED 1 - #include "flow/Arena.h" #include "flow/FastRef.h" #include "flow/flow.h" From ea00ae3a25433dc89ea2ea500b052f6c9b90f701 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Mon, 2 May 2022 22:31:46 -0700 Subject: [PATCH 101/299] Remove DISABLE_TLS CMake argument --- README.md | 1 - cmake/FDBComponents.cmake | 59 ++++++++++++++++++--------------------- 2 files changed, 27 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index e40bf6ae23..5b1a8d7a45 100755 --- a/README.md +++ b/README.md @@ -126,7 +126,6 @@ You should create a second build-directory which you will use for building and d mkdir .build && cd .build cmake -G Ninja \ -DUSE_CCACHE=on \ - -DDISABLE_TLS=off \ -DUSE_DTRACE=off \ .. 
ninja -j 10 diff --git a/cmake/FDBComponents.cmake b/cmake/FDBComponents.cmake index 998dfaf616..7710e871a5 100644 --- a/cmake/FDBComponents.cmake +++ b/cmake/FDBComponents.cmake @@ -20,40 +20,35 @@ endif() include(CheckSymbolExists) -set(DISABLE_TLS OFF CACHE BOOL "Don't try to find OpenSSL and always build without TLS support") set(USE_WOLFSSL OFF CACHE BOOL "Build against WolfSSL instead of OpenSSL") set(USE_OPENSSL ON CACHE BOOL "Build against OpenSSL") -if(DISABLE_TLS) - set(WITH_TLS OFF) -else() - if(USE_WOLFSSL) - set(WOLFSSL_USE_STATIC_LIBS TRUE) - find_package(WolfSSL) - if(WOLFSSL_FOUND) - set(CMAKE_REQUIRED_INCLUDES ${WOLFSSL_INCLUDE_DIR}) - set(WITH_TLS ON) - add_compile_options(-DHAVE_OPENSSL) - add_compile_options(-DHAVE_WOLFSSL) - else() - message(STATUS "WolfSSL was not found - Will compile without TLS Support") - message(STATUS "You can set WOLFSSL_ROOT_DIR to help cmake find it") - set(WITH_TLS OFF) - endif() - elseif(USE_OPENSSL) - set(OPENSSL_USE_STATIC_LIBS TRUE) - if(WIN32) - set(OPENSSL_MSVC_STATIC_RT ON) - endif() - find_package(OpenSSL) - if(OPENSSL_FOUND) - set(CMAKE_REQUIRED_INCLUDES ${OPENSSL_INCLUDE_DIR}) - set(WITH_TLS ON) - add_compile_options(-DHAVE_OPENSSL) - else() - message(STATUS "OpenSSL was not found - Will compile without TLS Support") - message(STATUS "You can set OPENSSL_ROOT_DIR to help cmake find it") - set(WITH_TLS OFF) - endif() +if(USE_WOLFSSL) + set(WOLFSSL_USE_STATIC_LIBS TRUE) + find_package(WolfSSL) + if(WOLFSSL_FOUND) + set(CMAKE_REQUIRED_INCLUDES ${WOLFSSL_INCLUDE_DIR}) + set(WITH_TLS ON) + add_compile_options(-DHAVE_OPENSSL) + add_compile_options(-DHAVE_WOLFSSL) + else() + message(STATUS "WolfSSL was not found - Will compile without TLS Support") + message(STATUS "You can set WOLFSSL_ROOT_DIR to help cmake find it") + set(WITH_TLS OFF) + endif() +elseif(USE_OPENSSL) + set(OPENSSL_USE_STATIC_LIBS TRUE) + if(WIN32) + set(OPENSSL_MSVC_STATIC_RT ON) + endif() + find_package(OpenSSL) + if(OPENSSL_FOUND) + set(CMAKE_REQUIRED_INCLUDES ${OPENSSL_INCLUDE_DIR}) + set(WITH_TLS ON) + add_compile_options(-DHAVE_OPENSSL) + else() + message(STATUS "OpenSSL was not found - Will compile without TLS Support") + message(STATUS "You can set OPENSSL_ROOT_DIR to help cmake find it") + set(WITH_TLS OFF) endif() endif() From 8ea68154bfeae76a40c9b5aa98f7e9cbd1ddee32 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Mon, 2 May 2022 22:45:00 -0700 Subject: [PATCH 102/299] Remove WITH_TLS CMake variable --- cmake/FDBComponents.cmake | 11 +++++------ fdbrpc/CMakeLists.txt | 7 +------ flow/CMakeLists.txt | 22 ++++++---------------- flowbench/CMakeLists.txt | 7 +------ 4 files changed, 13 insertions(+), 34 deletions(-) diff --git a/cmake/FDBComponents.cmake b/cmake/FDBComponents.cmake index 7710e871a5..f0081df9c9 100644 --- a/cmake/FDBComponents.cmake +++ b/cmake/FDBComponents.cmake @@ -27,13 +27,12 @@ if(USE_WOLFSSL) find_package(WolfSSL) if(WOLFSSL_FOUND) set(CMAKE_REQUIRED_INCLUDES ${WOLFSSL_INCLUDE_DIR}) - set(WITH_TLS ON) add_compile_options(-DHAVE_OPENSSL) add_compile_options(-DHAVE_WOLFSSL) else() message(STATUS "WolfSSL was not found - Will compile without TLS Support") message(STATUS "You can set WOLFSSL_ROOT_DIR to help cmake find it") - set(WITH_TLS OFF) + message(FATAL_ERROR "Unable to find WolfSSL") endif() elseif(USE_OPENSSL) set(OPENSSL_USE_STATIC_LIBS TRUE) @@ -43,13 +42,14 @@ elseif(USE_OPENSSL) find_package(OpenSSL) if(OPENSSL_FOUND) set(CMAKE_REQUIRED_INCLUDES ${OPENSSL_INCLUDE_DIR}) - set(WITH_TLS ON) add_compile_options(-DHAVE_OPENSSL) 
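# Added note (not from the original patches; CMake commentary): this commit
# turns the old "Will compile without TLS Support" status messages into
# configure-time FATAL_ERRORs, so a build host that lacks OpenSSL or WolfSSL
# development files now fails at cmake time instead of silently producing a
# TLS-less binary.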
else() message(STATUS "OpenSSL was not found - Will compile without TLS Support") message(STATUS "You can set OPENSSL_ROOT_DIR to help cmake find it") - set(WITH_TLS OFF) + message(FATAL_ERROR "Unable to find OpenSSL") endif() +else() + message(FATAL_ERROR "Must set USE_WOLFSSL or USE_OPENSSL") endif() ################################################################################ @@ -250,7 +250,6 @@ function(print_components) message(STATUS "Build Java Bindings: ${WITH_JAVA_BINDING}") message(STATUS "Build Go bindings: ${WITH_GO_BINDING}") message(STATUS "Build Ruby bindings: ${WITH_RUBY_BINDING}") - message(STATUS "Build with TLS support: ${WITH_TLS}") message(STATUS "Build Documentation (make html): ${WITH_DOCUMENTATION}") message(STATUS "Build Python sdist (make package): ${WITH_PYTHON_BINDING}") message(STATUS "Configure CTest (depends on Python): ${WITH_PYTHON}") @@ -260,7 +259,7 @@ function(print_components) endfunction() if(FORCE_ALL_COMPONENTS) - if(NOT WITH_C_BINDING OR NOT WITH_JAVA_BINDING OR NOT WITH_TLS OR NOT WITH_GO_BINDING OR NOT WITH_RUBY_BINDING OR NOT WITH_PYTHON_BINDING OR NOT WITH_DOCUMENTATION) + if(NOT WITH_C_BINDING OR NOT WITH_JAVA_BINDING OR NOT WITH_GO_BINDING OR NOT WITH_RUBY_BINDING OR NOT WITH_PYTHON_BINDING OR NOT WITH_DOCUMENTATION) print_components() message(FATAL_ERROR "FORCE_ALL_COMPONENTS is set but not all dependencies could be found") endif() diff --git a/fdbrpc/CMakeLists.txt b/fdbrpc/CMakeLists.txt index 3be4954666..00e13e564d 100644 --- a/fdbrpc/CMakeLists.txt +++ b/fdbrpc/CMakeLists.txt @@ -7,6 +7,7 @@ set(FDBRPC_SRCS AsyncFileReadAhead.actor.h AsyncFileWinASIO.actor.h AsyncFileCached.actor.cpp + AsyncFileEncrypted.actor.cpp AsyncFileNonDurable.actor.cpp AsyncFileWriteChecker.cpp FailureMonitor.actor.cpp @@ -45,12 +46,6 @@ set(FDBRPC_SRCS TraceFileIO.cpp TSSComparison.h) -if(WITH_TLS) - set(FDBRPC_SRCS - ${FDBRPC_SRCS} - AsyncFileEncrypted.actor.cpp) -endif() - set(COMPILE_EIO OFF) if(NOT WIN32) diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index bf390d4b88..7493dfee94 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -58,6 +58,7 @@ set(FLOW_SRCS SignalSafeUnwind.cpp SignalSafeUnwind.h SimpleOpt.h + StreamCipher.cpp StreamCipher.h SystemMonitor.cpp SystemMonitor.h @@ -108,12 +109,6 @@ set(FLOW_SRCS xxhash.c xxhash.h) -if(WITH_TLS) - set(FLOW_SRCS - ${FLOW_SRCS} - StreamCipher.cpp) -endif() - add_library(stacktrace stacktrace.amalgamation.cpp stacktrace.h) if (USE_ASAN) target_compile_definitions(stacktrace PRIVATE ADDRESS_SANITIZER) @@ -170,16 +165,11 @@ if(USE_VALGRIND) target_link_libraries(flow PUBLIC Valgrind) target_link_libraries(flow_sampling PUBLIC Valgrind) endif() -if(NOT WITH_TLS) - target_compile_definitions(flow PUBLIC TLS_DISABLED) - target_compile_definitions(flow_sampling PUBLIC TLS_DISABLED) -else() - target_link_libraries(flow PUBLIC OpenSSL::SSL) - target_link_libraries(flow_sampling PUBLIC OpenSSL::SSL) - if(USE_WOLFSSL) - target_include_directories(flow SYSTEM BEFORE PUBLIC ${WOLFSSL_INCLUDE_DIR}/wolfssl) - target_include_directories(flow_sampling SYSTEM BEFORE PUBLIC ${WOLFSSL_INCLUDE_DIR}/wolfssl) - endif() +target_link_libraries(flow PUBLIC OpenSSL::SSL) +target_link_libraries(flow_sampling PUBLIC OpenSSL::SSL) +if(USE_WOLFSSL) + target_include_directories(flow SYSTEM BEFORE PUBLIC ${WOLFSSL_INCLUDE_DIR}/wolfssl) + target_include_directories(flow_sampling SYSTEM BEFORE PUBLIC ${WOLFSSL_INCLUDE_DIR}/wolfssl) endif() target_link_libraries(flow PUBLIC Threads::Threads ${CMAKE_DL_LIBS}) 
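# Added note (not from the original patches; CMake commentary): in the hunk
# above, "SYSTEM BEFORE" places WolfSSL's OpenSSL-compatibility headers ahead
# of any real OpenSSL installation on the include path, so the same
# #include <openssl/...> directives resolve to WolfSSL when USE_WOLFSSL is on.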
target_link_libraries(flow_sampling PUBLIC Threads::Threads ${CMAKE_DL_LIBS}) diff --git a/flowbench/CMakeLists.txt b/flowbench/CMakeLists.txt index 71509a0404..8378c9c8ab 100644 --- a/flowbench/CMakeLists.txt +++ b/flowbench/CMakeLists.txt @@ -1,6 +1,7 @@ set(FLOWBENCH_SRCS flowbench.actor.cpp BenchCallback.actor.cpp + BenchEncrypt.cpp BenchHash.cpp BenchIterate.cpp BenchIONet2.actor.cpp @@ -16,12 +17,6 @@ set(FLOWBENCH_SRCS GlobalData.h GlobalData.cpp) -if(WITH_TLS AND NOT WIN32) - set(FLOWBENCH_SRCS - ${FLOWBENCH_SRCS} - BenchEncrypt.cpp) -endif() - project (flowbench) # include the configurations from benchmark.cmake configure_file(benchmark.cmake googlebenchmark-download/CMakeLists.txt) From 06825775db6068a5493bbd00186a4091c171ef1f Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Mon, 2 May 2022 22:56:06 -0700 Subject: [PATCH 103/299] Fix formatting of lines with TLS_OPTION_FLAGS --- fdbbackup/FileConverter.h | 2 +- fdbbackup/backup.actor.cpp | 66 +++++++++++++++++++++++------------ fdbcli/fdbcli.actor.cpp | 3 +- fdbserver/fdbserver.actor.cpp | 2 +- flow/TLSConfig.actor.h | 5 +-- 5 files changed, 51 insertions(+), 27 deletions(-) diff --git a/fdbbackup/FileConverter.h b/fdbbackup/FileConverter.h index 251f8d004a..37a5c3f761 100644 --- a/fdbbackup/FileConverter.h +++ b/fdbbackup/FileConverter.h @@ -65,7 +65,7 @@ CSimpleOpt::SOption gConverterOptions[] = { { OPT_CONTAINER, "-r", SO_REQ_SEP }, { OPT_INPUT_FILE, "-i", SO_REQ_SEP }, { OPT_INPUT_FILE, "--input", SO_REQ_SEP }, { OPT_BLOB_CREDENTIALS, "--blob-credentials", SO_REQ_SEP }, - TLS_OPTION_FLAGS + TLS_OPTION_FLAGS, { OPT_BUILD_FLAGS, "--build-flags", SO_NONE }, { OPT_LIST_ONLY, "--list-only", SO_NONE }, { OPT_KEY_PREFIX, "-k", SO_REQ_SEP }, diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 03f572e340..9ce6b03b18 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -220,7 +220,8 @@ CSimpleOpt::SOption g_rgAgentOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_BLOB_CREDENTIALS, "--blob-credentials", SO_REQ_SEP }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupStartOptions[] = { @@ -266,7 +267,8 @@ CSimpleOpt::SOption g_rgBackupStartOptions[] = { { OPT_BLOB_CREDENTIALS, "--blob-credentials", SO_REQ_SEP }, { OPT_INCREMENTALONLY, "--incremental", SO_NONE }, { OPT_ENCRYPTION_KEY_FILE, "--encryption-key-file", SO_REQ_SEP }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupModifyOptions[] = { @@ -329,7 +331,8 @@ CSimpleOpt::SOption g_rgBackupStatusOptions[] = { { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_JSON, "--json", SO_NONE }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupAbortOptions[] = { @@ -355,7 +358,8 @@ CSimpleOpt::SOption g_rgBackupAbortOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupCleanupOptions[] = { @@ -381,7 +385,8 @@ CSimpleOpt::SOption g_rgBackupCleanupOptions[] = { { OPT_KNOB, "--knob-", SO_REQ_SEP }, { OPT_DELETE_DATA, "--delete-data", SO_NONE }, { OPT_MIN_CLEANUP_SECONDS, "--min-cleanup-seconds", SO_REQ_SEP }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; CSimpleOpt::SOption 
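// Added note (not from the original patches; C++ commentary): every hunk in
// this commit is the same mechanical change. TLS_OPTION_FLAGS used to expand
// with its own trailing comma, forcing call sites to write
// "TLS_OPTION_FLAGS SO_END_OF_OPTIONS" on one unformattable line; the macro
// (re-braced in TLSConfig.actor.h at the end of the patch) now ends without
// the comma, so each option table can spell "TLS_OPTION_FLAGS," and let
// clang-format lay it out like the rest of the entries.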
g_rgBackupDiscontinueOptions[] = { @@ -409,7 +414,8 @@ CSimpleOpt::SOption g_rgBackupDiscontinueOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupWaitOptions[] = { @@ -437,7 +443,8 @@ CSimpleOpt::SOption g_rgBackupWaitOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupPauseOptions[] = { @@ -461,7 +468,8 @@ CSimpleOpt::SOption g_rgBackupPauseOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupExpireOptions[] = { @@ -497,7 +505,8 @@ CSimpleOpt::SOption g_rgBackupExpireOptions[] = { { OPT_EXPIRE_BEFORE_DATETIME, "--expire-before-timestamp", SO_REQ_SEP }, { OPT_EXPIRE_MIN_RESTORABLE_DAYS, "--min-restorable-days", SO_REQ_SEP }, { OPT_EXPIRE_DELETE_BEFORE_DAYS, "--delete-before-days", SO_REQ_SEP }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupDeleteOptions[] = { @@ -523,7 +532,8 @@ CSimpleOpt::SOption g_rgBackupDeleteOptions[] = { { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_BLOB_CREDENTIALS, "--blob-credentials", SO_REQ_SEP }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupDescribeOptions[] = { @@ -554,7 +564,8 @@ CSimpleOpt::SOption g_rgBackupDescribeOptions[] = { { OPT_DESCRIBE_DEEP, "--deep", SO_NONE }, { OPT_DESCRIBE_TIMESTAMPS, "--version-timestamps", SO_NONE }, { OPT_JSON, "--json", SO_NONE }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupDumpOptions[] = { @@ -583,7 +594,8 @@ CSimpleOpt::SOption g_rgBackupDumpOptions[] = { { OPT_KNOB, "--knob-", SO_REQ_SEP }, { OPT_DUMP_BEGIN, "--begin", SO_REQ_SEP }, { OPT_DUMP_END, "--end", SO_REQ_SEP }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupTagsOptions[] = { @@ -598,7 +610,8 @@ CSimpleOpt::SOption g_rgBackupTagsOptions[] = { { OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP }, { OPT_QUIET, "-q", SO_NONE }, { OPT_QUIET, "--quiet", SO_NONE }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupListOptions[] = { @@ -623,7 +636,8 @@ CSimpleOpt::SOption g_rgBackupListOptions[] = { { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_BLOB_CREDENTIALS, "--blob-credentials", SO_REQ_SEP }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgBackupQueryOptions[] = { @@ -656,7 +670,8 @@ CSimpleOpt::SOption g_rgBackupQueryOptions[] = { { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_BLOB_CREDENTIALS, "--blob-credentials", SO_REQ_SEP }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; // g_rgRestoreOptions is used by fdbrestore and fastrestore_tool @@ -702,7 +717,8 @@ CSimpleOpt::SOption g_rgRestoreOptions[] = { { OPT_RESTORE_BEGIN_VERSION, "--begin-version", SO_REQ_SEP }, { OPT_RESTORE_INCONSISTENT_SNAPSHOT_ONLY, 
"--inconsistent-snapshot-only", SO_NONE }, { OPT_ENCRYPTION_KEY_FILE, "--encryption-key-file", SO_REQ_SEP }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgDBAgentOptions[] = { @@ -732,7 +748,8 @@ CSimpleOpt::SOption g_rgDBAgentOptions[] = { { OPT_HELP, "-h", SO_NONE }, { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgDBStartOptions[] = { @@ -762,7 +779,8 @@ CSimpleOpt::SOption g_rgDBStartOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgDBStatusOptions[] = { @@ -792,7 +810,8 @@ CSimpleOpt::SOption g_rgDBStatusOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgDBSwitchOptions[] = { @@ -821,7 +840,8 @@ CSimpleOpt::SOption g_rgDBSwitchOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgDBAbortOptions[] = { @@ -851,7 +871,8 @@ CSimpleOpt::SOption g_rgDBAbortOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; CSimpleOpt::SOption g_rgDBPauseOptions[] = { @@ -877,7 +898,8 @@ CSimpleOpt::SOption g_rgDBPauseOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_DEVHELP, "--dev-help", SO_NONE }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; const KeyRef exeAgent = LiteralStringRef("backup_agent"); diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index af920e63ba..1df6662fd9 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -125,7 +125,8 @@ CSimpleOpt::SOption g_rgOptions[] = { { OPT_CONNFILE, "-C", SO_REQ_SEP }, { OPT_DEBUG_TLS, "--debug-tls", SO_NONE }, { OPT_API_VERSION, "--api-version", SO_REQ_SEP }, { OPT_MEMORY, "--memory", SO_REQ_SEP }, - TLS_OPTION_FLAGS SO_END_OF_OPTIONS }; + TLS_OPTION_FLAGS, + SO_END_OF_OPTIONS }; void printAtCol(const char* text, int col, FILE* stream = stdout) { const char* iter = text; diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index d3737536a2..9c9635d20e 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -204,7 +204,7 @@ CSimpleOpt::SOption g_rgOptions[] = { { OPT_FLOW_PROCESS_NAME, "--process-name", SO_REQ_SEP }, { OPT_FLOW_PROCESS_ENDPOINT, "--process-endpoint", SO_REQ_SEP }, { OPT_IP_TRUSTED_MASK, "--trusted-subnet-", SO_REQ_SEP }, - TLS_OPTION_FLAGS + TLS_OPTION_FLAGS, SO_END_OF_OPTIONS }; diff --git a/flow/TLSConfig.actor.h b/flow/TLSConfig.actor.h index ec1e0e7c64..248f9aa3cd 100644 --- a/flow/TLSConfig.actor.h +++ b/flow/TLSConfig.actor.h @@ -257,8 +257,9 @@ public: { TLSConfig::OPT_TLS_CERTIFICATES, TLS_CERTIFICATE_FILE_FLAG, SO_REQ_SEP }, \ { TLSConfig::OPT_TLS_KEY, TLS_KEY_FILE_FLAG, SO_REQ_SEP }, \ { TLSConfig::OPT_TLS_VERIFY_PEERS, TLS_VERIFY_PEERS_FLAG, SO_REQ_SEP }, \ - { TLSConfig::OPT_TLS_PASSWORD, TLS_PASSWORD_FLAG, SO_REQ_SEP }, \ - { 
TLSConfig::OPT_TLS_CA_FILE, TLS_CA_FILE_FLAG, SO_REQ_SEP }, + { TLSConfig::OPT_TLS_PASSWORD, TLS_PASSWORD_FLAG, SO_REQ_SEP }, { \ + TLSConfig::OPT_TLS_CA_FILE, TLS_CA_FILE_FLAG, SO_REQ_SEP \ + } #define TLS_HELP \ " " TLS_CERTIFICATE_FILE_FLAG " CERTFILE\n" \ From 258ba462e17e5b0a8ceb752642fd26ef36c11559 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Tue, 3 May 2022 09:48:24 -0700 Subject: [PATCH 104/299] Remove !defined(_WIN32) guards for encryption code --- fdbclient/BackupContainerS3BlobStore.actor.cpp | 2 -- fdbserver/workloads/UnitTests.actor.cpp | 4 ---- flow/Platform.actor.cpp | 4 ---- 3 files changed, 10 deletions(-) diff --git a/fdbclient/BackupContainerS3BlobStore.actor.cpp b/fdbclient/BackupContainerS3BlobStore.actor.cpp index af3fb9d128..413c8ea09b 100644 --- a/fdbclient/BackupContainerS3BlobStore.actor.cpp +++ b/fdbclient/BackupContainerS3BlobStore.actor.cpp @@ -20,9 +20,7 @@ #include "fdbclient/AsyncFileS3BlobStore.actor.h" #include "fdbclient/BackupContainerS3BlobStore.h" -#if (!defined(_WIN32)) #include "fdbrpc/AsyncFileEncrypted.h" -#endif #include "fdbrpc/AsyncFileReadAhead.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. diff --git a/fdbserver/workloads/UnitTests.actor.cpp b/fdbserver/workloads/UnitTests.actor.cpp index 1545f78ab8..4ad6feab78 100644 --- a/fdbserver/workloads/UnitTests.actor.cpp +++ b/fdbserver/workloads/UnitTests.actor.cpp @@ -28,10 +28,8 @@ void forceLinkFlowTests(); void forceLinkVersionedMapTests(); void forceLinkMemcpyTests(); void forceLinkMemcpyPerfTests(); -#if (!defined(_WIN32)) void forceLinkStreamCipherTests(); void forceLinkBlobCipherTests(); -#endif void forceLinkParallelStreamTests(); void forceLinkSimExternalConnectionTests(); void forceLinkMutationLogReaderTests(); @@ -79,10 +77,8 @@ struct UnitTestWorkload : TestWorkload { forceLinkVersionedMapTests(); forceLinkMemcpyTests(); forceLinkMemcpyPerfTests(); -#if (!defined(_WIN32)) forceLinkStreamCipherTests(); void forceLinkBlobCipherTests(); -#endif forceLinkParallelStreamTests(); forceLinkSimExternalConnectionTests(); forceLinkMutationLogReaderTests(); diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp index d29aea5c22..5003487591 100644 --- a/flow/Platform.actor.cpp +++ b/flow/Platform.actor.cpp @@ -31,10 +31,8 @@ #include "flow/Platform.actor.h" #include "flow/Arena.h" -#if (!defined(_WIN32)) #include "flow/StreamCipher.h" #include "flow/BlobCipher.h" -#endif #include "flow/Trace.h" #include "flow/Error.h" @@ -3552,11 +3550,9 @@ void crashHandler(int sig) { bool error = (sig != SIGUSR2); -#if (!defined(_WIN32)) StreamCipherKey::cleanup(); StreamCipher::cleanup(); BlobCipherKeyCache::cleanup(); -#endif fflush(stdout); { From 225146176d1ef0bd24fab47e6f581d2178cc54a4 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Tue, 3 May 2022 12:13:09 -0700 Subject: [PATCH 105/299] Apply clang-format to fdbcli.actor.cpp and Net2.actor.cpp --- fdbcli/fdbcli.actor.cpp | 3 +-- flow/Net2.actor.cpp | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 1df6662fd9..ad8dcaa6e2 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -444,8 +444,7 @@ static void printProgramUsage(const char* name) { " --no-status Disables the initial status check done when starting\n" " the CLI.\n" " --api-version APIVERSION\n" - " Specifies the version of the API for the CLI to use.\n" - TLS_HELP + " Specifies the version of the API for the CLI to use.\n" TLS_HELP " --knob-KNOBNAME 
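// Added note (not from the original patches; C++ commentary): in the
// UnitTests.actor.cpp hunk of the previous commit, the surviving context line
//     void forceLinkBlobCipherTests();
// inside the UnitTestWorkload function body re-declares the function instead
// of calling it, so the BlobCipher tests are never force-linked there; the
// intended statement is presumably "forceLinkBlobCipherTests();".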
KNOBVALUE\n" " Changes a knob option. KNOBNAME should be lowercase.\n" " --debug-tls Prints the TLS configuration and certificate chain, then exits.\n" diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index 0778fefde8..19cd12c813 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -1208,8 +1208,8 @@ Net2::Net2(const TLSConfig& tlsConfig, bool useThreadPool, bool useMetrics) : globals(enumGlobal::COUNT), useThreadPool(useThreadPool), reactor(this), sslContextVar({ ReferencedObject::from( boost::asio::ssl::context(boost::asio::ssl::context::tls)) }), - sslHandshakerThreadsStarted(0), sslPoolHandshakesInProgress(0), - tlsConfig(tlsConfig), tlsInitializedState(ETLSInitState::NONE), network(this), tscBegin(0), tscEnd(0), taskBegin(0), + sslHandshakerThreadsStarted(0), sslPoolHandshakesInProgress(0), tlsConfig(tlsConfig), + tlsInitializedState(ETLSInitState::NONE), network(this), tscBegin(0), tscEnd(0), taskBegin(0), currentTaskID(TaskPriority::DefaultYield), tasksIssued(0), stopped(false), started(false), numYields(0), lastPriorityStats(nullptr), ready(FLOW_KNOBS->READY_QUEUE_RESERVED_SIZE) { // Until run() is called, yield() will always yield From 484558250bc5bd3a207ba8cd967fb89824669997 Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Tue, 3 May 2022 19:23:37 +0000 Subject: [PATCH 106/299] - Do not consult version vector if the client has obtained the read version from its GRV cache. --- fdbclient/NativeAPI.actor.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index c57ed97d3c..37dfd7300c 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -236,11 +236,14 @@ void DatabaseContext::getLatestCommitVersions(const Reference& loc } if (ssVersionVectorCache.getMaxVersion() != invalidVersion && readVersion > ssVersionVectorCache.getMaxVersion()) { - TraceEvent(SevDebug, "GetLatestCommitVersions") - .detail("ReadVersion", readVersion) - .detail("VersionVector", ssVersionVectorCache.toString()); - ssVersionVectorCache.clear(); - throw stale_version_vector(); // TODO: investigate why + if (!CLIENT_KNOBS->FORCE_GRV_CACHE_OFF && !info->options.skipGrvCache && info->options.useGrvCache) { + return; + } else { + TraceEvent(SevError, "GetLatestCommitVersions") + .detail("ReadVersion", readVersion) + .detail("VersionVector", ssVersionVectorCache.toString()); + ASSERT(false); + } } std::map> versionMap; // order the versions to be returned From 97eb12381baab68b8d58fa9e73410470fba9e123 Mon Sep 17 00:00:00 2001 From: Hao Fu <77984096+hfu94@users.noreply.github.com> Date: Tue, 3 May 2022 12:24:26 -0700 Subject: [PATCH 107/299] implement equals and hashCode in MappedKeyValue (#7041) --- .../apple/foundationdb/MappedKeyValue.java | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/bindings/java/src/main/com/apple/foundationdb/MappedKeyValue.java b/bindings/java/src/main/com/apple/foundationdb/MappedKeyValue.java index 71bad2caa9..3e66a91b84 100644 --- a/bindings/java/src/main/com/apple/foundationdb/MappedKeyValue.java +++ b/bindings/java/src/main/com/apple/foundationdb/MappedKeyValue.java @@ -84,6 +84,27 @@ public class MappedKeyValue extends KeyValue { return b; } + @Override + public boolean equals(Object obj) { + if (obj == null) + return false; + if (obj == this) + return true; + if (!(obj instanceof MappedKeyValue)) + return false; + + MappedKeyValue rhs = (MappedKeyValue) obj; + return Arrays.equals(rangeBegin, rhs.rangeBegin) + && 
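// Added note (not from the original patch; Java commentary): the equals()
// continuing below and the matching hashCode() derive only from rangeBegin,
// rangeEnd and rangeResult -- the key/value fields inherited from KeyValue do
// not participate -- so two MappedKeyValue instances with different keys but
// identical ranges and range results compare equal and share a hash bucket.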
Arrays.equals(rangeEnd, rhs.rangeEnd) + && Objects.equals(rangeResult, rhs.rangeResult); + } + + @Override + public int hashCode() { + int hashForResult = rangeResult == null ? 0 : rangeResult.hashCode(); + return 17 + (29 * hashForResult + 37 * Arrays.hashCode(rangeBegin) + Arrays.hashCode(rangeEnd)); + } + @Override public String toString() { final StringBuilder sb = new StringBuilder("MappedKeyValue{"); From 2102ed1eaa17121335aef310994f0d7d37238350 Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Tue, 3 May 2022 21:56:11 +0000 Subject: [PATCH 108/299] - Remove "stale_version_vector" error code. --- fdbclient/DatabaseContext.h | 1 - fdbclient/NativeAPI.actor.cpp | 24 ++++++++++-------------- flow/error_definitions.h | 1 - 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index 11f5b1beb7..7965b9833d 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -524,7 +524,6 @@ public: Counter transactionsExpensiveClearCostEstCount; Counter transactionGrvFullBatches; Counter transactionGrvTimedOutBatches; - Counter transactionsStaleVersionVectors; ContinuousSample latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, bytesPerCommit, bgLatencies, bgGranulesPerRequest; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 37dfd7300c..57b68282cf 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1447,13 +1447,13 @@ DatabaseContext::DatabaseContext(ReferenceSHARD_STAT_SMOOTH_AMOUNT), + latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), + bytesPerCommit(1000), bgLatencies(1000), bgGranulesPerRequest(1000), outstandingWatches(0), sharedStatePtr(nullptr), + lastGrvTime(0.0), cachedReadVersion(0), lastRkBatchThrottleTime(0.0), lastRkDefaultThrottleTime(0.0), + lastProxyRequestTime(0.0), transactionTracingSample(false), taskID(taskID), clientInfo(clientInfo), + clientInfoMonitor(clientInfoMonitor), coordinator(coordinator), apiVersion(apiVersion), mvCacheInsertLocation(0), + healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0), + smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT), specialKeySpace(std::make_unique(specialKeys.begin, specialKeys.end, /* test */ false)), connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) { dbId = deterministicRandom()->randomUniqueID(); @@ -1720,9 +1720,8 @@ DatabaseContext::DatabaseContext(const Error& err) transactionsProcessBehind("ProcessBehind", cc), transactionsThrottled("Throttled", cc), transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc), transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc), - transactionsStaleVersionVectors("NumStaleVersionVectors", cc), latencies(1000), readLatencies(1000), - commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), bgLatencies(1000), - bgGranulesPerRequest(1000), transactionTracingSample(false), + latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), + bytesPerCommit(1000), bgLatencies(1000), bgGranulesPerRequest(1000), transactionTracingSample(false), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT), connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {} @@ -7070,14 +7069,11 @@ Future Transaction::onError(Error const& e) { 
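// Added note (not from the original patches; C++ commentary): together with
// the GRV-cache change two commits earlier, removing stale_version_vector
// means a too-new read version is no longer a retryable client error: reads
// whose version came from the GRV cache return without consulting the version
// vector, and any other occurrence is treated as an invariant violation
// (SevError plus ASSERT) instead of being thrown to Transaction::onError,
// whose stale-version-vector branch the hunk below deletes.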
reset(); return delay(backoff, trState->taskID); } - if (e.code() == error_code_transaction_too_old || e.code() == error_code_future_version || - e.code() == error_code_stale_version_vector) { + if (e.code() == error_code_transaction_too_old || e.code() == error_code_future_version) { if (e.code() == error_code_transaction_too_old) ++trState->cx->transactionsTooOld; else if (e.code() == error_code_future_version) ++trState->cx->transactionsFutureVersions; - else if (e.code() == error_code_stale_version_vector) - ++trState->cx->transactionsStaleVersionVectors; double maxBackoff = trState->options.maxBackoff; reset(); diff --git a/flow/error_definitions.h b/flow/error_definitions.h index 0fee428000..4dc43e11af 100755 --- a/flow/error_definitions.h +++ b/flow/error_definitions.h @@ -95,7 +95,6 @@ ERROR( page_encoding_not_supported, 1071, "Page encoding type is not supported o ERROR( page_decoding_failed, 1072, "Page content decoding failed" ) ERROR( unexpected_encoding_type, 1073, "Page content decoding failed" ) ERROR( encryption_key_not_found, 1074, "Encryption key not found" ) -ERROR( stale_version_vector, 1075, "Client version vector is stale" ) ERROR( broken_promise, 1100, "Broken promise" ) ERROR( operation_cancelled, 1101, "Asynchronous operation cancelled" ) From 75a90be0ddc08f8e6b39ad66890a3f479687ba44 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 3 May 2022 15:26:24 -0700 Subject: [PATCH 109/299] refactor Status retrieval --- fdbclient/FDBTypes.h | 9 +++ fdbserver/Status.actor.cpp | 126 ++++++++++++++++++------------------- 2 files changed, 71 insertions(+), 64 deletions(-) diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index 033be08499..0e70fbe04a 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -33,6 +33,7 @@ #include "flow/FastRef.h" #include "flow/ProtocolVersion.h" #include "flow/flow.h" +#include "fdbclient/Status.h" enum class TraceFlags : uint8_t { unsampled = 0b00000000, sampled = 0b00000001 }; @@ -1473,6 +1474,14 @@ struct StorageMetadataType { void serialize(Ar& ar) { serializer(ar, createdTime, storeType); } + + StatusObject toJSON() const { + StatusObject result; + result["created_time_timestamp"] = createdTime; + result["created_time_datetime"] = epochsToGMTString(createdTime); + result["storage_engine"] = storeType.toString(); + return result; + } }; // store metadata of wiggle action diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 8d99936596..4340ff53bf 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -103,6 +103,14 @@ extern const char* limitReasonDesc[]; typedef std::map EventMap; +struct StorageServerStatusInfo : public StorageServerInterface { + Optional metadata; + EventMap eventMap; + StorageServerStatusInfo(const StorageServerInterface& interface, + Optional metadata = Optional()) + : StorageServerInterface(interface), metadata(metadata) {} +}; + ACTOR static Future> latestEventOnWorker(WorkerInterface worker, std::string eventName) { try { EventLogRequest req = @@ -468,12 +476,13 @@ struct RolesInfo { obj["role"] = role; return roles.insert(std::make_pair(address, obj))->second; } + JsonBuilderObject& addRole(std::string const& role, - StorageServerInterface& iface, - EventMap const& metrics, + StorageServerStatusInfo& iface, Version maxTLogVersion, double* pDataLagSeconds) { JsonBuilderObject obj; + EventMap const& metrics = iface.eventMap; double dataLagSeconds = -1.0; obj["id"] = iface.id().shortString(); obj["role"] = role; @@ -584,13 +593,8 @@ struct RolesInfo { } } - 
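// Added example (illustrative only, not from the original patch; the field
// values are assumed): for a storage server created at epoch-seconds
// 1651600000 with an ssd-2 store, StorageMetadataType::toJSON() above would
// produce roughly
//   { "created_time_timestamp": 1651600000,
//     "created_time_datetime": <GMT string from epochsToGMTString>,
//     "storage_engine": "ssd-2" }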
if (!iface.isTss()) { // only storage server has Metadata field - TraceEventFields const& metadata = metrics.at("Metadata"); - JsonBuilderObject metadataObj; - metadataObj["created_time_datetime"] = metadata.getValue("CreatedTimeDatetime"); - metadataObj["created_time_timestamp"] = metadata.getDouble("CreatedTimeTimestamp"); - metadataObj["storage_engine"] = metadata.getValue("StoreType"); - obj["storage_metadata"] = metadataObj; + if (iface.metadata.present()) { + obj["storage_metadata"] = iface.metadata.get().toJSON(); // printf("%s\n", metadataObj.getJson().c_str()); } @@ -731,7 +735,7 @@ ACTOR static Future processStatusFetcher( WorkerEvents traceFileOpenErrors, WorkerEvents programStarts, std::map> processIssues, - std::vector> storageServers, + std::vector storageServers, std::vector> tLogs, std::vector> commitProxies, std::vector> grvProxies, @@ -861,13 +865,13 @@ ACTOR static Future processStatusFetcher( wait(yield()); } - state std::vector>::iterator ss; + state std::vector::iterator ss; state std::map ssLag; state double lagSeconds; for (ss = storageServers.begin(); ss != storageServers.end(); ++ss) { - roles.addRole("storage", ss->first, ss->second, maxTLogVersion, &lagSeconds); + roles.addRole("storage", *ss, maxTLogVersion, &lagSeconds); if (lagSeconds != -1.0) { - ssLag[ss->first.address()] = lagSeconds; + ssLag[ss->address()] = lagSeconds; } wait(yield()); } @@ -1919,74 +1923,69 @@ static Future> getServerBusiestWriteTags( } ACTOR -static Future>> getServerMetadata(std::vector servers, - Database cx, - bool use_system_priority) { +static Future> readStorageInterfaceAndMetadata(Database cx, + bool use_system_priority) { state KeyBackedObjectMap metadataMap(serverMetadataKeys.begin, IncludeVersion()); - state std::vector> res(servers.size()); state Reference tr = makeReference(cx); - + state std::vector servers; loop { try { + servers.clear(); tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); if (use_system_priority) { tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); } - state int i = 0; - for (i = 0; i < servers.size(); ++i) { - Optional metadata = wait(metadataMap.get(tr, servers[i].id(), Snapshot::True)); - // TraceEvent(SevDebug, "MetadataAppear", servers[i].id()).detail("Present", metadata.present()); - res[i] = metadata; + state RangeResult serverList = wait(tr->getRange(serverListKeys, CLIENT_KNOBS->TOO_MANY)); + ASSERT(!serverList.more && serverList.size() < CLIENT_KNOBS->TOO_MANY); + + servers.reserve(serverList.size()); + for (int i = 0; i < serverList.size(); i++) { + servers.push_back(StorageServerStatusInfo(decodeServerListValue(serverList[i].value))); } + state std::vector> futures(servers.size()); + for (int i = 0; i < servers.size(); ++i) { + auto& info = servers[i]; + futures[i] = fmap( + [&info](Optional meta) -> Void { + info.metadata = meta; + return Void(); + }, + metadataMap.get(tr, servers[i].id())); + // TraceEvent(SevDebug, "MetadataAppear", servers[i].id()).detail("Present", metadata.present()); + } + wait(waitForAll(futures)); wait(tr->commit()); break; } catch (Error& e) { wait(tr->onError(e)); } } - return res; + return servers; } -ACTOR static Future>> getStorageServersAndMetrics( +ACTOR static Future> getStorageServerStatusInfos( Database cx, std::unordered_map address_workers, WorkerDetails rkWorker) { - state std::vector servers = wait(timeoutError(getStorageServers(cx, true), 5.0)); - state std::vector> results; + state std::vector servers = + 
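// Added note (not from the original patch; C++ commentary):
// readStorageInterfaceAndMetadata above is what retires the FIXME deleted
// further down -- the server list and each server's metadata are now read
// inside a single transaction, so a storage server can no longer be removed
// between the two reads, and the old "[removed]" placeholder fallback for
// missing metadata becomes unnecessary.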
wait(timeoutError(readStorageInterfaceAndMetadata(cx, true), 5.0)); + state std::vector> results; state std::vector busiestWriteTags; - state std::vector> metadata; wait(store(results, getServerMetrics(servers, address_workers, std::vector{ "StorageMetrics", "ReadLatencyMetrics", "ReadLatencyBands", "BusiestReadTag" })) && - store(busiestWriteTags, getServerBusiestWriteTags(servers, address_workers, rkWorker)) && - store(metadata, getServerMetadata(servers, cx, true))); + store(busiestWriteTags, getServerBusiestWriteTags(servers, address_workers, rkWorker))); - ASSERT(busiestWriteTags.size() == results.size() && metadata.size() == results.size()); + ASSERT(busiestWriteTags.size() == results.size()); for (int i = 0; i < results.size(); ++i) { - results[i].second.emplace("BusiestWriteTag", busiestWriteTags[i]); - - // FIXME: it's possible that a SS is removed between `getStorageServers` and `getServerMetadata`. Maybe we can - // read StorageServer and Metadata in an atomic transaction? - if (metadata[i].present()) { - TraceEventFields metadataField; - metadataField.addField("CreatedTimeTimestamp", std::to_string(metadata[i].get().createdTime)); - metadataField.addField("CreatedTimeDatetime", epochsToGMTString(metadata[i].get().createdTime)); - metadataField.addField("StoreType", metadata[i].get().storeType.toString()); - results[i].second.emplace("Metadata", metadataField); - } else if (!servers[i].isTss()) { - TraceEventFields metadataField; - metadataField.addField("CreatedTimeTimestamp", "0"); - metadataField.addField("CreatedTimeDatetime", "[removed]"); - metadataField.addField("StoreType", KeyValueStoreType::getStoreTypeStr(KeyValueStoreType::END)); - results[i].second.emplace("Metadata", metadataField); - } + servers[i].eventMap = std::move(results[i].second); + servers[i].eventMap.emplace("BusiestWriteTag", busiestWriteTags[i]); } - - return results; + return servers; } ACTOR static Future>> getTLogsAndMetrics( @@ -2103,7 +2102,7 @@ ACTOR static Future workloadStatusFetcher( JsonBuilderObject* qos, JsonBuilderObject* data_overlay, std::set* incomplete_reasons, - Future>>> storageServerFuture) { + Future>> storageServerFuture) { state JsonBuilderObject statusObj; state JsonBuilderObject operationsObj; state JsonBuilderObject bytesObj; @@ -2275,7 +2274,7 @@ ACTOR static Future workloadStatusFetcher( // Reads try { - ErrorOr>> storageServers = wait(storageServerFuture); + ErrorOr> storageServers = wait(storageServerFuture); if (!storageServers.present()) { throw storageServers.getError(); } @@ -2287,7 +2286,7 @@ ACTOR static Future workloadStatusFetcher( StatusCounter lowPriorityReads; for (auto& ss : storageServers.get()) { - TraceEventFields const& storageMetrics = ss.second.at("StorageMetrics"); + TraceEventFields const& storageMetrics = ss.eventMap.at("StorageMetrics"); if (storageMetrics.size() > 0) { readRequests.updateValues(StatusCounter(storageMetrics.getValue("QueryQueue"))); @@ -2318,14 +2317,14 @@ ACTOR static Future workloadStatusFetcher( ACTOR static Future clusterSummaryStatisticsFetcher( WorkerEvents pMetrics, - Future>>> storageServerFuture, + Future>> storageServerFuture, Future>>> tlogFuture, std::set* incomplete_reasons) { state JsonBuilderObject statusObj; try { state JsonBuilderObject cacheStatistics; - ErrorOr>> storageServers = wait(storageServerFuture); + ErrorOr> storageServers = wait(storageServerFuture); if (!storageServers.present()) { throw storageServers.getError(); @@ -2335,7 +2334,7 @@ ACTOR static Future clusterSummaryStatisticsFetcher( double 
storageCacheMissesHz = 0; for (auto& ss : storageServers.get()) { - auto processMetrics = pMetrics.find(ss.first.address()); + auto processMetrics = pMetrics.find(ss.address()); if (processMetrics != pMetrics.end()) { int64_t hits = processMetrics->second.getInt64("CacheHits"); int64_t misses = processMetrics->second.getInt64("CacheMisses"); @@ -2947,7 +2946,7 @@ ACTOR Future clusterGetStatus( state std::map> processIssues = getProcessIssuesAsMessages(workerIssues); - state std::vector> storageServers; + state std::vector storageServers; state std::vector> tLogs; state std::vector> commitProxies; state std::vector> grvProxies; @@ -3021,8 +3020,8 @@ ACTOR Future clusterGetStatus( address_workers[worker.interf.address()] = worker.interf; } - state Future>>> storageServerFuture = - errorOr(getStorageServersAndMetrics(cx, address_workers, rkWorker)); + state Future>> storageServerFuture = + errorOr(getStorageServerStatusInfos(cx, address_workers, rkWorker)); state Future>>> tLogFuture = errorOr(getTLogsAndMetrics(db, address_workers)); state Future>>> commitProxyFuture = @@ -3136,8 +3135,7 @@ ACTOR Future clusterGetStatus( } // Need storage servers now for processStatusFetcher() below. - ErrorOr>> _storageServers = - wait(storageServerFuture); + ErrorOr> _storageServers = wait(storageServerFuture); if (_storageServers.present()) { storageServers = _storageServers.get(); } else { @@ -3225,11 +3223,11 @@ ACTOR Future clusterGetStatus( int activeTSSCount = 0; JsonBuilderArray wiggleServerAddress; for (auto& it : storageServers) { - if (it.first.isTss()) { + if (it.isTss()) { activeTSSCount++; } - if (wiggleServers.count(it.first.id())) { - wiggleServerAddress.push_back(it.first.address().toString()); + if (wiggleServers.count(it.id())) { + wiggleServerAddress.push_back(it.address().toString()); } } statusObj["active_tss_count"] = activeTSSCount; From 7c37d172b9b39b9407777af7ff7414a00a0951e0 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 3 May 2022 17:21:08 -0700 Subject: [PATCH 110/299] solve some comments --- fdbcli/DataDistributionCommand.actor.cpp | 2 +- fdbcli/StatusCommand.actor.cpp | 4 +-- fdbclient/ServerKnobs.h | 2 +- fdbclient/SystemData.cpp | 4 +-- fdbclient/SystemData.h | 4 ++- fdbserver/DataDistributionQueue.actor.cpp | 1 + fdbserver/Status.actor.cpp | 2 +- fdbserver/TCInfo.actor.cpp | 2 +- .../SpecialKeySpaceCorrectness.actor.cpp | 14 +++++++-- tests/CMakeLists.txt | 2 +- tests/noSim/ReadSkewReadWrite.toml | 24 -------------- tests/rare/ReadSkewReadWrite.toml | 31 ++++++++++++------- 12 files changed, 44 insertions(+), 48 deletions(-) delete mode 100644 tests/noSim/ReadSkewReadWrite.toml diff --git a/fdbcli/DataDistributionCommand.actor.cpp b/fdbcli/DataDistributionCommand.actor.cpp index 1cb667c812..7000bdf5c7 100644 --- a/fdbcli/DataDistributionCommand.actor.cpp +++ b/fdbcli/DataDistributionCommand.actor.cpp @@ -69,7 +69,7 @@ ACTOR Future setDDIgnoreRebalanceSwitch(Reference db, uint8_t D try { state ThreadFuture> resultFuture = tr->get(rebalanceDDIgnoreKey); Optional v = wait(safeThreadFutureToFuture(resultFuture)); - uint8_t oldValue = 0; // nothing is disabled + uint8_t oldValue = DDIgnore::NONE; // nothing is disabled if (v.present()) { if (v.get().size() > 0) { oldValue = BinaryReader::fromStringRef(v.get(), Unversioned()); diff --git a/fdbcli/StatusCommand.actor.cpp b/fdbcli/StatusCommand.actor.cpp index 98e7bdc377..47bbc6e4f8 100644 --- a/fdbcli/StatusCommand.actor.cpp +++ b/fdbcli/StatusCommand.actor.cpp @@ -1128,8 +1128,8 @@ void printStatus(StatusObjectReader 
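// Added note (not from the original patches; C++ commentary): the rebalance
// ignore value is a bitmask -- DDIgnore::REBALANCE_DISK and
// DDIgnore::REBALANCE_READ can be set independently (the workload change
// later in this commit flips each bit on a separate coinflip) -- which is why
// the warning text continuing below says one or both balancers may be off.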
statusObj, "storage server failures."; } if (statusObjCluster.has("data_distribution_disabled_for_rebalance")) { - outputString += "\n\nWARNING: Data distribution is currently turned on but shard size balancing is " - "currently disabled."; + outputString += "\n\nWARNING: Data distribution is currently turned on but one or both of shard " + "size and read-load based balancing are disabled."; // data_distribution_disabled_hex if (statusObjCluster.has("data_distribution_disabled_hex")) { outputString += " Ignore code: " + statusObjCluster["data_distribution_disabled_hex"].get_str(); diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index 0222cd9736..a969232c85 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -610,7 +610,7 @@ public: int64_t BYTES_READ_UNITS_PER_SAMPLE; int64_t READ_HOT_SUB_RANGE_CHUNK_SIZE; int64_t EMPTY_READ_PENALTY; - int DD_SHARD_COMPARE_LIMIT; + int DD_SHARD_COMPARE_LIMIT; // when read-aware DD is enabled, at most how many shards are compared together bool READ_SAMPLING_ENABLED; // Storage Server diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index bd867665ba..163d5f2862 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -302,8 +302,8 @@ std::pair>, std::vector decodeHealthyZoneValue(ValueRef const&); diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 376d61d90e..b8438a9249 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1513,6 +1513,7 @@ inline double getWorstCpu(const HealthMetrics& metrics, const std::vector& } else { // assume the server is too busy to report its stats cpu = std::max(cpu, 100.0); + break; } } return cpu; diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 4486cc9921..86e145e342 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1570,7 +1570,7 @@ struct LoadConfigurationResult { double healthyZoneSeconds; bool rebalanceDDIgnored; // FIXME: possible convert it to int if upgrade value can be resolved? - std::string rebalanceDDIgnoreHex; // any or combination of 0, 1, 2, see enum DDIgnore; + std::string rebalanceDDIgnoreHex; // any or combination of 0, 1, 2, see DDIgnore; bool dataDistributionDisabled; LoadConfigurationResult() diff --git a/fdbserver/TCInfo.actor.cpp b/fdbserver/TCInfo.actor.cpp index a9cf76b95e..41ba999a93 100644 --- a/fdbserver/TCInfo.actor.cpp +++ b/fdbserver/TCInfo.actor.cpp @@ -402,7 +402,7 @@ double TCTeamInfo::getLoadReadBandwidth(bool includeInFlight, double inflightPen } return (size == 0 ? 0 : sum / size) + // we don't need to divide the inflight bandwidth because when added it the bandwidth is from single server - (includeInFlight ? inflightPenalty * getReadInFlightToTeam() : 0); + (includeInFlight ? 
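// Added note (not from the original patch; C++ commentary): the context
// comment a few lines up ("we don't need to divide the inflight
// bandwidth...") is now stale -- the replacement line continuing below does
// divide the in-flight read bandwidth by servers.size(), averaging it per
// server exactly like the measured load in the first term of the return
// expression.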
inflightPenalty * getReadInFlightToTeam() / servers.size() : 0); } int64_t TCTeamInfo::getMinAvailableSpace(bool includeInFlight) const { diff --git a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp index 7a5fa2d3eb..a2f7893a6c 100644 --- a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp +++ b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp @@ -1275,13 +1275,21 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { } } // set dd mode to 0 and disable DD for rebalance + state uint8_t ddIgnoreValue = DDIgnore::NONE; + if (deterministicRandom()->coinflip()) { + ddIgnoreValue |= DDIgnore::REBALANCE_READ; + } + if (deterministicRandom()->coinflip()) { + ddIgnoreValue |= DDIgnore::REBALANCE_DISK; + } loop { try { tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); KeyRef ddPrefix = SpecialKeySpace::getManagementApiCommandPrefix("datadistribution"); tx->set(LiteralStringRef("mode").withPrefix(ddPrefix), LiteralStringRef("0")); - tx->set(LiteralStringRef("rebalance_ignored").withPrefix(ddPrefix), Value()); + tx->set(LiteralStringRef("rebalance_ignored").withPrefix(ddPrefix), + BinaryWriter::toValue(ddIgnoreValue, Unversioned())); wait(tx->commit()); tx->reset(); break; @@ -1306,8 +1314,8 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { ASSERT(BinaryReader::fromStringRef(val2.get(), Unversioned()) == 0); // check DD disabled for rebalance Optional val3 = wait(tx->get(rebalanceDDIgnoreKey)); - // default value "on" - ASSERT(val3.present()); + ASSERT(val3.present() && + BinaryReader::fromStringRef(val3.get(), Unversioned()) == ddIgnoreValue); tx->reset(); break; } catch (Error& e) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 64cf08cc8c..5426657bb1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -186,7 +186,6 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/WriteDuringRead.toml) add_fdb_test(TEST_FILES fast/WriteDuringReadClean.toml) add_fdb_test(TEST_FILES noSim/RandomUnitTests.toml UNIT) - add_fdb_test(TEST_FILES noSim/ReadSkewReadWrite.toml IGNORE) if (WITH_ROCKSDB_EXPERIMENTAL) add_fdb_test(TEST_FILES noSim/KeyValueStoreRocksDBTest.toml) add_fdb_test(TEST_FILES fast/PhysicalShardMove.toml) @@ -211,6 +210,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES rare/LargeApiCorrectnessStatus.toml) add_fdb_test(TEST_FILES rare/RYWDisable.toml) add_fdb_test(TEST_FILES rare/RandomReadWriteTest.toml) + add_fdb_test(TEST_FILES rare/ReadSkewReadWrite.toml) add_fdb_test(TEST_FILES rare/SpecificUnitTests.toml) add_fdb_test(TEST_FILES rare/SwizzledLargeApiCorrectness.toml) add_fdb_test(TEST_FILES rare/RedwoodCorrectnessBTree.toml) diff --git a/tests/noSim/ReadSkewReadWrite.toml b/tests/noSim/ReadSkewReadWrite.toml deleted file mode 100644 index 4542ef46d3..0000000000 --- a/tests/noSim/ReadSkewReadWrite.toml +++ /dev/null @@ -1,24 +0,0 @@ -[[test]] -testTitle = 'RandomReadWriteTest' -connectionFailuresDisableDuration = 100000 -# waitForQuiescenceBegin= false -# waitForQuiescenceEnd=false -clearAfterTest = false #true -runSetup = true # false -timeout = 3600.0 - -[[test.workload]] -testName = 'ReadWrite' -transactionsPerSecond = 100000 -testDuration = 900.0 -skewRound = 1 -nodeCount = 30000000 -valueBytes = 1000 -readsPerTransactionA = 8 -writesPerTransactionA = 0 -alpha = 0 -discardEdgeMeasurements = false -hotServerFraction = 0.2 -hotServerReadFrac = 0.8 -# hotServerShardFraction = 0.3 -warmingDelay = 180.0 \ No 
newline at end of file diff --git a/tests/rare/ReadSkewReadWrite.toml b/tests/rare/ReadSkewReadWrite.toml index 0c95c78c75..31d037e8a3 100644 --- a/tests/rare/ReadSkewReadWrite.toml +++ b/tests/rare/ReadSkewReadWrite.toml @@ -1,15 +1,24 @@ [[test]] testTitle = 'RandomReadWriteTest' -simCheckRelocationDuration = true connectionFailuresDisableDuration = 100000 +waitForQuiescenceBegin= false +waitForQuiescenceEnd=false +clearAfterTest = true +runSetup = true # false +timeout = 3600.0 - [[test.workload]] - testName = 'ReadWrite' - testDuration = 30.0 - skewRound = 1 - transactionsPerSecond = 2000 - nodeCount = 150000 - valueBytes = 128 - discardEdgeMeasurements = false - warmingDelay = 10.0 - hotServerFraction = 0.1 \ No newline at end of file +[[test.workload]] +testName = 'ReadWrite' +transactionsPerSecond = 100000 +testDuration = 400.0 +skewRound = 1 +nodeCount = 30000 # 30000000 +valueBytes = 100 +readsPerTransactionA = 8 +writesPerTransactionA = 0 +alpha = 0 +discardEdgeMeasurements = false +hotServerFraction = 0.2 +hotServerReadFrac = 0.8 +# hotServerShardFraction = 0.3 +warmingDelay = 180.0 \ No newline at end of file From d848441cdd27f61a064d5f8d4db3821aafecb53e Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 3 May 2022 23:39:05 -0700 Subject: [PATCH 111/299] simulate ReadSkewReadWrite spec --- tests/rare/ReadSkewReadWrite.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/rare/ReadSkewReadWrite.toml b/tests/rare/ReadSkewReadWrite.toml index 31d037e8a3..e6f8cac8ff 100644 --- a/tests/rare/ReadSkewReadWrite.toml +++ b/tests/rare/ReadSkewReadWrite.toml @@ -9,10 +9,10 @@ timeout = 3600.0 [[test.workload]] testName = 'ReadWrite' -transactionsPerSecond = 100000 -testDuration = 400.0 +transactionsPerSecond = 100 +testDuration = 40.0 skewRound = 1 -nodeCount = 30000 # 30000000 +nodeCount = 3000 # 30000000 valueBytes = 100 readsPerTransactionA = 8 writesPerTransactionA = 0 From 1723bee6396a0bd4408e4e6e07fbf38093073fbc Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 3 May 2022 23:42:09 -0700 Subject: [PATCH 112/299] add fetchTopKShardMetrics to dd tracker --- fdbserver/DataDistribution.actor.cpp | 2 + fdbserver/DataDistribution.actor.h | 18 ++++-- fdbserver/DataDistributionTracker.actor.cpp | 61 ++++++++++++++++----- 3 files changed, 62 insertions(+), 19 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index dee6962ef7..3b7b0c8204 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -669,6 +669,7 @@ ACTOR Future dataDistribution(Reference self, state PromiseStream> getAverageShardBytes; state PromiseStream> getUnhealthyRelocationCount; state PromiseStream getShardMetrics; + state PromiseStream getTopKShardMetrics; state Reference> processingUnhealthy(new AsyncVar(false)); state Reference> processingWiggle(new AsyncVar(false)); state Promise readyToStart; @@ -735,6 +736,7 @@ ACTOR Future dataDistribution(Reference self, output, shardsAffectedByTeamFailure, getShardMetrics, + getTopKShardMetrics, getShardMetricsList, getAverageShardBytes.getFuture(), readyToStart, diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index ce38651763..6d364a3f12 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -152,17 +152,22 @@ struct GetTeamRequest { }; struct GetMetricsRequest { + KeyRange keys; + Promise reply; + GetMetricsRequest() {} + GetMetricsRequest(KeyRange const& keys) : keys(keys) {} +}; + 
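// Added sketch (not from the original patch; names come from the struct
// introduced just below, while the shards vector itself is hypothetical):
// a caller ranks shards by read density roughly like so --
//
//   std::vector<KeyRange> shards = /* candidate shard ranges */;
//   GetTopKMetricsRequest req(shards, /*topK=*/3);
//   req.comparator = [](const StorageMetrics& a, const StorageMetrics& b) {
//       return a.bytesReadPerKSecond > b.bytesReadPerKSecond; // "a > b"
//   };
//   // the reply carries the 3 hottest shards; the tracker selects them with
//   // std::nth_element, which only partitions, so the returned topK entries
//   // are not themselves sorted.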
+struct GetTopKMetricsRequest {
 	// whether a > b
 	typedef std::function<bool(const StorageMetrics& a, const StorageMetrics& b)> MetricsComparator;
-	std::vector<KeyRange> keys;
 	int topK = 1; // default only return the top 1 shard based on the comparator
+	MetricsComparator comparator; // Return true if a.score > b.score, return the largest topK in keys
+	std::vector<KeyRange> keys;
 	Promise<std::vector<StorageMetrics>> reply; // topK storage metrics
-	Optional<MetricsComparator> comparator; // Return true if a.score > b.score.if comparator is assigned, return the
-	                                        // largest topK in keys, otherwise return the sum of metrics
 	GetMetricsRequest() {}
-	GetMetricsRequest(KeyRange const& keys, int topK = 1) : keys({ keys }), topK(topK) {}
-	GetMetricsRequest(std::vector<KeyRange> const& keys, int topK = 1) : keys(keys), topK(topK) {}
+	GetTopKMetricsRequest() {}
+	GetTopKMetricsRequest(std::vector<KeyRange> const& keys, int topK = 1) : keys(keys), topK(topK) {}
 };

 struct GetMetricsListRequest {
@@ -296,6 +301,7 @@ ACTOR Future<Void> dataDistributionTracker(Reference<InitialDataDistribution> in
                                            PromiseStream<RelocateShard> output,
                                            Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure,
                                            PromiseStream<GetMetricsRequest> getShardMetrics,
+                                           PromiseStream<GetTopKMetricsRequest> getTopKMetrics,
                                            PromiseStream<GetMetricsListRequest> getShardMetricsList,
                                            FutureStream<Promise<int64_t>> getAverageShardBytes,
                                            Promise<Void> readyToStart,
diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp
index 6b1361101a..2cb279049e 100644
--- a/fdbserver/DataDistributionTracker.actor.cpp
+++ b/fdbserver/DataDistributionTracker.actor.cpp
@@ -832,13 +832,12 @@ ACTOR Future<Void> trackInitialShards(DataDistributionTracker* self, Reference

-ACTOR Future<Void> fetchShardMetrics_impl(DataDistributionTracker* self, GetMetricsRequest req) {
+ACTOR Future<Void> fetchTopKShardMetrics_impl(DataDistributionTracker* self, GetTopKMetricsRequest req) {
+	ASSERT(req.comparator);
 	try {
 		loop {
 			Future<Void> onChange;
 			std::vector<StorageMetrics> returnMetrics;
-			if (!req.comparator.present())
-				returnMetrics.push_back(StorageMetrics());
 			// TODO: shall we do random shuffle to make the selection uniform distributed over the shard space?
 			for (int i = 0; i < SERVER_KNOBS->DD_SHARD_COMPARE_LIMIT && i < req.keys.size(); ++i) {
@@ -858,22 +857,18 @@ ACTOR Future<Void> fetchShardMetrics_impl(DataDistributionTracker* self, GetMetr
 					break;
 				}

-				if (req.comparator.present()) {
-					metrics.keys = range;
-					returnMetrics.push_back(metrics);
-				} else {
-					returnMetrics[0] += metrics;
-				}
+				metrics.keys = range;
+				returnMetrics.push_back(metrics);
 			}

 			if (!onChange.isValid()) {
-				if (!req.comparator.present() || req.topK >= returnMetrics.size())
+				if (req.topK >= returnMetrics.size())
 					req.reply.send(returnMetrics);
-				else if (req.comparator.present()) {
+				else {
 					std::nth_element(returnMetrics.begin(),
 					                 returnMetrics.begin() + req.topK - 1,
 					                 returnMetrics.end(),
-					                 req.comparator.get());
+					                 req.comparator);
 					req.reply.send(
 					    std::vector<StorageMetrics>(returnMetrics.begin(), returnMetrics.begin() + req.topK));
 				}
@@ -888,6 +883,42 @@ ACTOR Future<Void> fetchShardMetrics_impl(DataDistributionTracker* self, GetMetr
 	}
 }

+ACTOR Future<Void> fetchTopKShardMetrics(DataDistributionTracker* self, GetTopKMetricsRequest req) {
+	choose {
+		when(wait(fetchTopKShardMetrics_impl(self, req))) {}
+		when(wait(delay(SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT))) { req.reply.sendError(timed_out()); }
+	}
+	return Void();
+}
+
+ACTOR Future<Void> fetchShardMetrics_impl(DataDistributionTracker* self, GetMetricsRequest req) {
+	try {
+		loop {
+			Future<Void> onChange;
+			StorageMetrics returnMetrics;
+			for (auto t : self->shards.intersectingRanges(req.keys)) {
+				auto& stats = t.value().stats;
+				if (!stats->get().present()) {
+					onChange = stats->onChange();
+					break;
+				}
+				returnMetrics += t.value().stats->get().get().metrics;
+			}
+
+			if (!onChange.isValid()) {
+				req.reply.send(returnMetrics);
+				return Void();
+			}
+
+			wait(onChange);
+		}
+	} catch (Error& e) {
+		if (e.code() != error_code_actor_cancelled && !req.reply.isSet())
+			req.reply.sendError(e);
+		throw;
+	}
+}
+
 ACTOR Future<Void> fetchShardMetrics(DataDistributionTracker* self, GetMetricsRequest req) {
 	choose {
 		when(wait(fetchShardMetrics_impl(self, req))) {}
 		when(wait(delay(SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT))) {
 			TEST(true); // DD_SHARD_METRICS_TIMEOUT
 			StorageMetrics largeMetrics;
 			largeMetrics.bytes = getMaxShardSize(self->dbSizeEstimate->get());
-			req.reply.send(std::vector<StorageMetrics>(1, largeMetrics));
+			req.reply.send(largeMetrics);
 		}
 	}
 	return Void();
@@ -952,6 +983,7 @@ ACTOR Future<Void> dataDistributionTracker(Reference<InitialDataDistribution> in
                                            PromiseStream<RelocateShard> output,
                                            Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure,
                                            PromiseStream<GetMetricsRequest> getShardMetrics,
+                                           PromiseStream<GetTopKMetricsRequest> getTopKMetrics,
                                            PromiseStream<GetMetricsListRequest> getShardMetricsList,
                                            FutureStream<Promise<int64_t>> getAverageShardBytes,
                                            Promise<Void> readyToStart,
@@ -990,6 +1022,9 @@ ACTOR Future<Void> dataDistributionTracker(Reference<InitialDataDistribution> in
 			when(GetMetricsRequest req = waitNext(getShardMetrics.getFuture())) {
 				self.sizeChanges.add(fetchShardMetrics(&self, req));
 			}
+			when(GetTopKMetricsRequest req = waitNext(getTopKMetrics.getFuture())) {
+				self.sizeChanges.add(fetchTopKShardMetrics(&self, req));
+			}
 			when(GetMetricsListRequest req = waitNext(getShardMetricsList.getFuture())) {
 				self.sizeChanges.add(fetchShardMetricsList(&self, req));
 			}

From a3d0b005dcb2c37b4081b401c4b3ea1fb06826fc Mon Sep 17 00:00:00 2001
From: Xiaoxi Wang
Date: Wed, 4 May 2022 00:00:03 -0700
Subject: [PATCH 113/299] Reset several methods to use getShardMetrics

---
 fdbserver/DDTeamCollection.actor.cpp      |  4 ++--
 fdbserver/DataDistribution.actor.cpp      |  1 +
 fdbserver/DataDistribution.actor.h        |  3 ++-
 fdbserver/DataDistributionQueue.actor.cpp | 25 +++++++++++++----------
 4 files changed, 19 insertions(+), 14 deletions(-)

diff
AsyncVar(false)), rawProcessingWiggle(new AsyncVar(false)), + unhealthyRelocations(0), movedKeyServersEventHolder(makeReference("MovedKeyServers")) {} void validate() { if (EXPENSIVE_VALIDATION) { @@ -1157,9 +1159,8 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, self->suppressIntervals = 0; } - std::vector metricsList = + state StorageMetrics metrics = wait(brokenPromiseToNever(self->getShardMetrics.getReply(GetMetricsRequest(rd.keys)))); - state StorageMetrics metrics = metricsList[0]; ASSERT(rd.src.size()); loop { @@ -1553,12 +1554,12 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, // randomly choose topK shards int topK = std::min(int(0.1 * shards.size()), 10); state Future healthMetrics = self->cx->getHealthMetrics(true); - state GetMetricsRequest req(shards, topK); + state GetTopKMetricsRequest req(shards, topK); req.comparator = [](const StorageMetrics& a, const StorageMetrics& b) { return a.bytesReadPerKSecond / std::max(a.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES) > b.bytesReadPerKSecond / std::max(b.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES); }; - state std::vector metricsList = wait(brokenPromiseToNever(self->getShardMetrics.getReply(req))); + state std::vector metricsList = wait(brokenPromiseToNever(self->getTopKMetrics.getReply(req))); wait(ready(healthMetrics)); if (getWorstCpu(healthMetrics.get(), sourceTeam->getServerIDs()) < 25.0) { // 25% traceEvent->detail("SkipReason", "LowReadLoad"); @@ -1634,11 +1635,11 @@ ACTOR static Future rebalanceTeams(DDQueueData* self, state int retries = 0; while (retries < SERVER_KNOBS->REBALANCE_MAX_RETRIES) { state KeyRange testShard = deterministicRandom()->randomChoice(shards); - std::vector testMetrics = + StorageMetrics testMetrics = wait(brokenPromiseToNever(self->getShardMetrics.getReply(GetMetricsRequest(testShard)))); - if (testMetrics[0].bytes > metrics.bytes) { + if (testMetrics.bytes > metrics.bytes) { moveShard = testShard; - metrics = testMetrics[0]; + metrics = testMetrics; if (metrics.bytes > averageShardBytes) { break; } @@ -2084,6 +2085,7 @@ ACTOR Future dataDistributionQueue(Database cx, PromiseStream output, FutureStream input, PromiseStream getShardMetrics, + PromiseStream getTopKMetrics, Reference> processingUnhealthy, Reference> processingWiggle, std::vector teamCollections, @@ -2107,6 +2109,7 @@ ACTOR Future dataDistributionQueue(Database cx, output, input, getShardMetrics, + getTopKMetrics, lastLimited); state std::set serversToLaunchFrom; state KeyRange keysToLaunchFrom; From 96a35264b46069402b010ab7361e1155195fc78e Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Wed, 4 May 2022 12:48:48 -0400 Subject: [PATCH 114/299] return tenant_list as one operation and validate with assertions --- bindings/bindingtester/spec/tenantTester.md | 8 +---- bindings/bindingtester/tests/api.py | 11 +++--- .../foundationdb/test/AsyncStackTester.java | 35 ++----------------- .../foundationdb/test/StackOperation.java | 3 +- .../apple/foundationdb/test/StackTester.java | 34 ++---------------- .../apple/foundationdb/test/StackUtils.java | 7 ++++ bindings/python/tests/tester.py | 18 +++------- 7 files changed, 23 insertions(+), 93 deletions(-) diff --git a/bindings/bindingtester/spec/tenantTester.md b/bindings/bindingtester/spec/tenantTester.md index fea7d49070..719d0a670a 100644 --- a/bindings/bindingtester/spec/tenantTester.md +++ b/bindings/bindingtester/spec/tenantTester.md @@ -38,18 +38,12 @@ The tenant API introduces some new operations: Unsets the active tenant. 
-#### TENANT_LIST_NAMES +#### TENANT_LIST Pops the top 3 items off of the stack as BEGIN, END, & LIMIT. Returns list of tenant names contained in the range BEGIN to END, numbering LIMIT at most. May optionally push a future onto the stack. -#### TENANT_LIST_METADATA - - Pops the top 3 items off of the stack as BEGIN, END, & LIMIT. Returns list - of tenant metadata contained in the range BEGIN to END, numbering LIMIT at most. - May optionally push a future onto the stack. - Updates to Existing Instructions -------------------------------- diff --git a/bindings/bindingtester/tests/api.py b/bindings/bindingtester/tests/api.py index 1e7252b696..d08436e213 100644 --- a/bindings/bindingtester/tests/api.py +++ b/bindings/bindingtester/tests/api.py @@ -165,7 +165,7 @@ class ApiTest(Test): write_conflicts = ['WRITE_CONFLICT_RANGE', 'WRITE_CONFLICT_KEY', 'DISABLE_WRITE_CONFLICT'] txn_sizes = ['GET_APPROXIMATE_SIZE'] storage_metrics = ['GET_ESTIMATED_RANGE_SIZE', 'GET_RANGE_SPLIT_POINTS'] - tenants = ['TENANT_CREATE', 'TENANT_DELETE', 'TENANT_SET_ACTIVE', 'TENANT_CLEAR_ACTIVE', 'TENANT_LIST_NAMES', 'TENANT_LIST_METADATA'] + tenants = ['TENANT_CREATE', 'TENANT_DELETE', 'TENANT_SET_ACTIVE', 'TENANT_CLEAR_ACTIVE', 'TENANT_LIST'] op_choices += reads op_choices += mutations @@ -600,12 +600,9 @@ class ApiTest(Test): instructions.append(op) elif op == 'TENANT_CLEAR_ACTIVE': instructions.append(op) - elif op == 'TENANT_LIST_NAMES': - instructions.push_args(b'', b'\xff', 10000) - instructions.append(op) - self.add_strings(1) - elif op == 'TENANT_LIST_METADATA': - instructions.push_args(b'', b'\xff', 10000) + elif op == 'TENANT_LIST': + instructions.push_args(self.random.random_int()) + self.ensure_string(instructions, 2) instructions.append(op) self.add_strings(1) else: diff --git a/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java b/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java index 687b635a7e..bc2bcf97dd 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java @@ -486,7 +486,7 @@ public class AsyncStackTester { inst.push(TenantManagement.deleteTenant(inst.context.db, tenantName)); }, FDB.DEFAULT_EXECUTOR); } - else if (op == StackOperation.TENANT_LIST_NAMES) { + else if (op == StackOperation.TENANT_LIST) { return inst.popParams(3).thenAcceptAsync(params -> { byte[] begin = (byte[])params.get(0); byte[] end = (byte[])params.get(1); @@ -497,6 +497,8 @@ public class AsyncStackTester { while (tenantIter.hasNext()) { try { KeyValue next = tenantIter.next(); + String metadata = new String(next.getValue()); + assert StackUtils.validTenantMetadata(metadata) : "Invalid Tenant Metadata"; outputStream.write(next.getKey()); } catch (IOException e) { continue; @@ -509,37 +511,6 @@ public class AsyncStackTester { inst.push(output); }, FDB.DEFAULT_EXECUTOR); } - else if (op == StackOperation.TENANT_LIST_METADATA) { - return inst.popParams(3).thenAcceptAsync(params -> { - byte[] begin = (byte[])params.get(0); - byte[] end = (byte[])params.get(1); - int limit = StackUtils.getInt(params.get(2)); - CloseableAsyncIterator tenantIter = - TenantManagement.listTenants(inst.context.db, begin, end, limit); - ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); - boolean validData = true; - try { - while (tenantIter.hasNext()) { - KeyValue next = tenantIter.next(); - String metadata = new String(next.getValue()); - // Without a JSON parsing library, we try to 
validate that the metadata consists - // of a select few properties using simple string comparison - if (metadata.charAt(0) != '{' || metadata.charAt(metadata.length() - 1) != '}' || - !metadata.contains("id") || !metadata.contains("prefix")) { - validData = false; - break; - } - } - } finally { - tenantIter.close(); - } - if (validData) { - inst.push("VALID_TENANT_METADATA".getBytes()); - } else { - inst.push("INVALID_TENANT_METADATA".getBytes()); - } - }, FDB.DEFAULT_EXECUTOR); - } else if (op == StackOperation.TENANT_SET_ACTIVE) { return inst.popParam().thenAcceptAsync(param -> { byte[] tenantName = (byte[])param; diff --git a/bindings/java/src/test/com/apple/foundationdb/test/StackOperation.java b/bindings/java/src/test/com/apple/foundationdb/test/StackOperation.java index acd0dca676..e67d4cff81 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/StackOperation.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/StackOperation.java @@ -76,8 +76,7 @@ enum StackOperation { // Tenants TENANT_CREATE, TENANT_DELETE, - TENANT_LIST_NAMES, - TENANT_LIST_METADATA, + TENANT_LIST, TENANT_SET_ACTIVE, TENANT_CLEAR_ACTIVE, diff --git a/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java b/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java index e5e581b617..358a81647c 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java @@ -431,7 +431,7 @@ public class StackTester { byte[] tenantName = (byte[])inst.popParam().join(); inst.push(TenantManagement.deleteTenant(inst.context.db, tenantName)); } - else if (op == StackOperation.TENANT_LIST_NAMES) { + else if (op == StackOperation.TENANT_LIST) { List params = inst.popParams(3).join(); byte[] begin = (byte[])params.get(0); byte[] end = (byte[])params.get(1); @@ -442,6 +442,8 @@ public class StackTester { while (tenantIter.hasNext()) { try { KeyValue next = tenantIter.next(); + String metadata = new String(next.getValue()); + assert StackUtils.validTenantMetadata(metadata) : "Invalid Tenant Metadata"; outputStream.write(next.getKey()); } catch (IOException e) { continue; @@ -453,36 +455,6 @@ public class StackTester { byte[] output = outputStream.toByteArray(); inst.push(output); } - else if (op == StackOperation.TENANT_LIST_METADATA) { - List params = inst.popParams(3).join(); - byte[] begin = (byte[])params.get(0); - byte[] end = (byte[])params.get(1); - int limit = StackUtils.getInt(params.get(2)); - CloseableAsyncIterator tenantIter = - TenantManagement.listTenants(inst.context.db, begin, end, limit); - ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); - boolean validData = true; - try { - while (tenantIter.hasNext()) { - KeyValue next = tenantIter.next(); - String metadata = new String(next.getValue()); - // Without a JSON parsing library, we try to validate that the metadata consists - // of a select few properties using simple string comparison - if (metadata.charAt(0) != '{' || metadata.charAt(metadata.length() - 1) != '}' || - !metadata.contains("id") || !metadata.contains("prefix")) { - validData = false; - break; - } - } - } finally { - tenantIter.close(); - } - if (validData) { - inst.push("VALID_TENANT_METADATA".getBytes()); - } else { - inst.push("INVALID_TENANT_METADATA".getBytes()); - } - } else if (op == StackOperation.TENANT_SET_ACTIVE) { byte[] tenantName = (byte[])inst.popParam().join(); inst.context.setTenant(Optional.of(tenantName)); diff --git 
a/bindings/java/src/test/com/apple/foundationdb/test/StackUtils.java b/bindings/java/src/test/com/apple/foundationdb/test/StackUtils.java index 11682b356c..36133f2e10 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/StackUtils.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/StackUtils.java @@ -67,6 +67,13 @@ public class StackUtils { return item; } + // Without a JSON parsing library, we try to validate that the metadata consists + // of a select few properties using simple string comparison + static boolean validTenantMetadata(String metadata) { + return (metadata.charAt(0) == '{' && metadata.charAt(metadata.length() - 1) == '}' && metadata.contains("id") && + metadata.contains("prefix")); + } + //////////////////////// // Utilities for forcing Objects into various types //////////////////////// diff --git a/bindings/python/tests/tester.py b/bindings/python/tests/tester.py index f190fd1c2f..699b631396 100644 --- a/bindings/python/tests/tester.py +++ b/bindings/python/tests/tester.py @@ -605,30 +605,20 @@ class Tester: self.tenant = self.db.open_tenant(name) elif inst.op == six.u("TENANT_CLEAR_ACTIVE"): self.tenant = None - elif inst.op == six.u("TENANT_LIST_NAMES"): + elif inst.op == six.u("TENANT_LIST"): begin, end, limit = inst.pop(3) tenant_list = fdb.tenant_management.list_tenants(self.db, begin, end, limit) result = bytearray() for tenant in tenant_list: result += tenant.key - result_bytes = bytes(result) - inst.push(result_bytes) - elif inst.op == six.u("TENANT_LIST_METADATA"): - begin, end, limit = inst.pop(3) - tenant_list = fdb.tenant_management.list_tenants(self.db, begin, end, limit) - valid_data = True - for tenant in tenant_list: try: metadata = json.loads(tenant.value) id = metadata["id"] prefix = metadata["prefix"] except (json.decoder.JSONDecodeError, KeyError) as e: - valid_data = False - break - if valid_data: - inst.push(b"VALID_TENANT_METADATA") - else: - inst.push(b"INVALID_TENANT_METADATA") + assert False, "Invalid Tenant Metadata" + result_bytes = bytes(result) + inst.push(result_bytes) elif inst.op == six.u("UNIT_TESTS"): try: test_db_options(db) From b5556c57f9dbdaae03ceb1bf5f7ad4d2d2ea981e Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Wed, 4 May 2022 13:59:19 -0400 Subject: [PATCH 115/299] adjust instruction ordering in tenant_list api for binding tester --- bindings/bindingtester/tests/api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bindings/bindingtester/tests/api.py b/bindings/bindingtester/tests/api.py index d08436e213..c716e41e6d 100644 --- a/bindings/bindingtester/tests/api.py +++ b/bindings/bindingtester/tests/api.py @@ -601,8 +601,10 @@ class ApiTest(Test): elif op == 'TENANT_CLEAR_ACTIVE': instructions.append(op) elif op == 'TENANT_LIST': - instructions.push_args(self.random.random_int()) self.ensure_string(instructions, 2) + instructions.push_args(self.random.random_int()) + test_util.to_front(instructions, 2) + test_util.to_front(instructions, 2) instructions.append(op) self.add_strings(1) else: From aa3376ab4209f351c7ada853f68db5093e038026 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 4 May 2022 11:45:27 -0700 Subject: [PATCH 116/299] fix comparison typo --- fdbclient/FDBTypes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index 0e70fbe04a..110081570e 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -1455,7 +1455,7 @@ struct StorageMetadataType { static double currentTime() { return g_network->timer(); } bool 
operator==(const StorageMetadataType& b) const { - return createdTime == b.createdTime && storeType == b.storeType && wrongConfigured && b.wrongConfigured; + return createdTime == b.createdTime && storeType == b.storeType && wrongConfigured == b.wrongConfigured; } bool operator<(const StorageMetadataType& b) const { From ae81d0536a99d5d21ff07993ca1377865a538c23 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 4 May 2022 12:13:17 -0700 Subject: [PATCH 117/299] Update tests/CMakeLists.txt --- tests/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 5426657bb1..d5cd4da986 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -210,7 +210,6 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES rare/LargeApiCorrectnessStatus.toml) add_fdb_test(TEST_FILES rare/RYWDisable.toml) add_fdb_test(TEST_FILES rare/RandomReadWriteTest.toml) - add_fdb_test(TEST_FILES rare/ReadSkewReadWrite.toml) add_fdb_test(TEST_FILES rare/SpecificUnitTests.toml) add_fdb_test(TEST_FILES rare/SwizzledLargeApiCorrectness.toml) add_fdb_test(TEST_FILES rare/RedwoodCorrectnessBTree.toml) From a7fdb42db853e769b46795aa9d9af48f2c9add04 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Wed, 4 May 2022 14:54:16 -0500 Subject: [PATCH 118/299] Fix for arm BG tests --- .../c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp index e8221780fa..33d3bf7c60 100644 --- a/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp +++ b/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp @@ -53,9 +53,8 @@ private: [this, begin, end, results, tooOld](auto ctx) { ctx->tx()->setOption(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE); KeyValuesResult res = ctx->tx()->readBlobGranules(begin, end, ctx->getBGBasePath()); - bool more; + bool more = false; (*results) = res.getKeyValues(&more); - ASSERT(!more); if (res.getError() == error_code_blob_granule_transaction_too_old) { info("BlobGranuleCorrectness::randomReadOp bg too old\n"); ASSERT(!seenReadSuccess); @@ -64,6 +63,7 @@ private: } else if (res.getError() != error_code_success) { ctx->onError(res.getError()); } else { + ASSERT(!more); if (!seenReadSuccess) { info("BlobGranuleCorrectness::randomReadOp first success\n"); } From c001d55c246939bfc37fc3bf7923bed30a30e09c Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Wed, 4 May 2022 16:07:06 -0400 Subject: [PATCH 119/299] push tenant_list result as a packed tuple to stay consistent with other operations --- bindings/bindingtester/spec/tenantTester.md | 8 +++++--- .../foundationdb/test/AsyncStackTester.java | 19 ++++++------------- .../apple/foundationdb/test/StackTester.java | 19 ++++++------------- bindings/python/tests/tester.py | 7 +++---- 4 files changed, 20 insertions(+), 33 deletions(-) diff --git a/bindings/bindingtester/spec/tenantTester.md b/bindings/bindingtester/spec/tenantTester.md index 719d0a670a..eae2ab5039 100644 --- a/bindings/bindingtester/spec/tenantTester.md +++ b/bindings/bindingtester/spec/tenantTester.md @@ -40,9 +40,11 @@ The tenant API introduces some new operations: #### TENANT_LIST - Pops the top 3 items off of the stack as BEGIN, END, & LIMIT. Returns list - of tenant names contained in the range BEGIN to END, numbering LIMIT at most. - May optionally push a future onto the stack. 
+ Pops the top 3 items off of the stack as BEGIN, END, & LIMIT. + Performs a range read of the tenant management keyspace in a language-appropriate + way using these parameters. The resulting range of n tenant names are + packed into a tuple as [t1,t2,t3,...,tn], and this single packed value + is pushed onto the stack. Updates to Existing Instructions -------------------------------- diff --git a/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java b/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java index bc2bcf97dd..24e9533a63 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java @@ -33,8 +33,6 @@ import java.util.Map; import java.util.Optional; import java.util.concurrent.CompletableFuture; import java.util.function.Function; -import java.io.ByteArrayOutputStream; -import java.io.IOException; import com.apple.foundationdb.Database; import com.apple.foundationdb.FDB; @@ -492,23 +490,18 @@ public class AsyncStackTester { byte[] end = (byte[])params.get(1); int limit = StackUtils.getInt(params.get(2)); CloseableAsyncIterator tenantIter = TenantManagement.listTenants(inst.context.db, begin, end, limit); - ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + List result = new ArrayList(); try { while (tenantIter.hasNext()) { - try { - KeyValue next = tenantIter.next(); - String metadata = new String(next.getValue()); - assert StackUtils.validTenantMetadata(metadata) : "Invalid Tenant Metadata"; - outputStream.write(next.getKey()); - } catch (IOException e) { - continue; - } + KeyValue next = tenantIter.next(); + String metadata = new String(next.getValue()); + assert StackUtils.validTenantMetadata(metadata) : "Invalid Tenant Metadata"; + result.add(next.getKey()); } } finally { tenantIter.close(); } - byte[] output = outputStream.toByteArray(); - inst.push(output); + inst.push(Tuple.fromItems(result).pack()); }, FDB.DEFAULT_EXECUTOR); } else if (op == StackOperation.TENANT_SET_ACTIVE) { diff --git a/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java b/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java index 358a81647c..88a123f76a 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java @@ -20,8 +20,6 @@ package com.apple.foundationdb.test; -import java.io.ByteArrayOutputStream; -import java.io.IOException; import java.math.BigInteger; import java.nio.ByteBuffer; import java.nio.ByteOrder; @@ -437,23 +435,18 @@ public class StackTester { byte[] end = (byte[])params.get(1); int limit = StackUtils.getInt(params.get(2)); CloseableAsyncIterator tenantIter = TenantManagement.listTenants(inst.context.db, begin, end, limit); - ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + List result = new ArrayList(); try { while (tenantIter.hasNext()) { - try { - KeyValue next = tenantIter.next(); - String metadata = new String(next.getValue()); - assert StackUtils.validTenantMetadata(metadata) : "Invalid Tenant Metadata"; - outputStream.write(next.getKey()); - } catch (IOException e) { - continue; - } + KeyValue next = tenantIter.next(); + String metadata = new String(next.getValue()); + assert StackUtils.validTenantMetadata(metadata) : "Invalid Tenant Metadata"; + result.add(next.getKey()); } } finally { tenantIter.close(); } - byte[] output = outputStream.toByteArray(); - inst.push(output); + 
inst.push(Tuple.fromItems(result).pack()); } else if (op == StackOperation.TENANT_SET_ACTIVE) { byte[] tenantName = (byte[])inst.popParam().join(); diff --git a/bindings/python/tests/tester.py b/bindings/python/tests/tester.py index 699b631396..4392d02015 100644 --- a/bindings/python/tests/tester.py +++ b/bindings/python/tests/tester.py @@ -608,17 +608,16 @@ class Tester: elif inst.op == six.u("TENANT_LIST"): begin, end, limit = inst.pop(3) tenant_list = fdb.tenant_management.list_tenants(self.db, begin, end, limit) - result = bytearray() + result = [] for tenant in tenant_list: - result += tenant.key + result += [tenant.key] try: metadata = json.loads(tenant.value) id = metadata["id"] prefix = metadata["prefix"] except (json.decoder.JSONDecodeError, KeyError) as e: assert False, "Invalid Tenant Metadata" - result_bytes = bytes(result) - inst.push(result_bytes) + inst.push(fdb.tuple.pack(tuple(result))) elif inst.op == six.u("UNIT_TESTS"): try: test_db_options(db) From c1c316591c9aee3fd7232b64cd87a4d4e91b8011 Mon Sep 17 00:00:00 2001 From: Hao Fu <77984096+hfu94@users.noreply.github.com> Date: Wed, 4 May 2022 13:37:05 -0700 Subject: [PATCH 120/299] Clean up constructMappedKey method (#7062) * Clean up constructMappedKey method * fix comments and bugs * Fix search index * Fix bug, if escaped is true after first round, it would skip the second round --- fdbserver/storageserver.actor.cpp | 115 +++++++++++++++--------------- 1 file changed, 59 insertions(+), 56 deletions(-) diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 9afc0b5b6b..e7107176ab 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -3471,6 +3471,56 @@ ACTOR Future quickGetKeyValues( } }; +void unpackKeyTuple(Tuple** referenceTuple, Optional& keyTuple, KeyValueRef* keyValue) { + if (!keyTuple.present()) { + // May throw exception if the key is not parsable as a tuple. + try { + keyTuple = Tuple::unpack(keyValue->key); + } catch (Error& e) { + TraceEvent("KeyNotTuple").error(e).detail("Key", keyValue->key.printable()); + throw key_not_tuple(); + } + } + *referenceTuple = &keyTuple.get(); +} + +void unpackValueTuple(Tuple** referenceTuple, Optional& valueTuple, KeyValueRef* keyValue) { + if (!valueTuple.present()) { + // May throw exception if the value is not parsable as a tuple. + try { + valueTuple = Tuple::unpack(keyValue->value); + } catch (Error& e) { + TraceEvent("ValueNotTuple").error(e).detail("Value", keyValue->value.printable()); + throw value_not_tuple(); + } + } + *referenceTuple = &valueTuple.get(); +} + +bool unescapeLiterals(std::string& s, std::string before, std::string after) { + bool escaped = false; + size_t p = 0; + while (true) { + size_t found = s.find(before, p); + if (found == std::string::npos) { + break; + } + s.replace(found, before.length(), after); + p = found + after.length(); + escaped = true; + } + return escaped; +} + +bool singleKeyOrValue(const std::string& s, size_t sz) { + // format would be {K[??]} or {V[??]} + return sz > 5 && s[0] == '{' && (s[1] == 'K' || s[1] == 'V') && s[2] == '[' && s[sz - 2] == ']' && s[sz - 1] == '}'; +} + +bool rangeQuery(const std::string& s) { + return s == "{...}"; +} + Key constructMappedKey(KeyValueRef* keyValue, Tuple& mappedKeyFormatTuple, bool& isRangeQuery) { // Lazily parse key and/or value to tuple because they may not need to be a tuple if not used. 
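// A hedged illustration of the descriptor syntax the loop below implements
// (example values invented for this note, not taken from the patch): if
// keyValue->key unpacks to the tuple ("idx", "alice", 42) and the mapper is
// ("record", "{K[2]}", "{...}"), the loop emits the literal "record", then
// keyTuple[2] == 42 for {K[2]}, and finally sets isRangeQuery = true for the
// trailing {...} descriptor, so the mapped key is ("record", 42). Doubled
// braces "{{" and "}}" unescape to literal '{' and '}' via unescapeLiterals.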
Optional keyTuple; @@ -3482,78 +3532,31 @@ Key constructMappedKey(KeyValueRef* keyValue, Tuple& mappedKeyFormatTuple, bool& if (type == Tuple::BYTES || type == Tuple::UTF8) { std::string s = mappedKeyFormatTuple.getString(i).toString(); auto sz = s.size(); - - // Handle escape. - bool escaped = false; - size_t p = 0; - while (true) { - size_t found = s.find("{{", p); - if (found == std::string::npos) { - break; - } - s.replace(found, 2, "{"); - p += 1; - escaped = true; - } - p = 0; - while (true) { - size_t found = s.find("}}", p); - if (found == std::string::npos) { - break; - } - s.replace(found, 2, "}"); - p += 1; - escaped = true; - } + bool escaped = unescapeLiterals(s, "{{", "{"); + escaped = unescapeLiterals(s, "}}", "}") || escaped; if (escaped) { - // If the element uses escape, cope the escaped version. mappedKeyTuple.append(s); - } - // {K[??]} or {V[??]} - else if (sz > 5 && s[0] == '{' && (s[1] == 'K' || s[1] == 'V') && s[2] == '[' && s[sz - 2] == ']' && - s[sz - 1] == '}') { + } else if (singleKeyOrValue(s, sz)) { int idx; + Tuple* referenceTuple; try { idx = std::stoi(s.substr(3, sz - 5)); } catch (std::exception& e) { throw mapper_bad_index(); } - Tuple* referenceTuple; if (s[1] == 'K') { - // Use keyTuple as reference. - if (!keyTuple.present()) { - // May throw exception if the key is not parsable as a tuple. - try { - keyTuple = Tuple::unpack(keyValue->key); - } catch (Error& e) { - TraceEvent("KeyNotTuple").error(e).detail("Key", keyValue->key.printable()); - throw key_not_tuple(); - } - } - referenceTuple = &keyTuple.get(); + unpackKeyTuple(&referenceTuple, keyTuple, keyValue); } else if (s[1] == 'V') { - // Use valueTuple as reference. - if (!valueTuple.present()) { - // May throw exception if the value is not parsable as a tuple. - try { - valueTuple = Tuple::unpack(keyValue->value); - } catch (Error& e) { - TraceEvent("ValueNotTuple").error(e).detail("Value", keyValue->value.printable()); - throw value_not_tuple(); - } - } - referenceTuple = &valueTuple.get(); + unpackValueTuple(&referenceTuple, valueTuple, keyValue); } else { ASSERT(false); throw internal_error(); } - if (idx < 0 || idx >= referenceTuple->size()) { throw mapper_bad_index(); } mappedKeyTuple.append(referenceTuple->subTuple(idx, idx + 1)); - } else if (s == "{...}") { - // Range query. + } else if (rangeQuery(s)) { if (i != mappedKeyFormatTuple.size() - 1) { // It must be the last element of the mapper tuple throw mapper_bad_range_decriptor(); @@ -3562,11 +3565,11 @@ Key constructMappedKey(KeyValueRef* keyValue, Tuple& mappedKeyFormatTuple, bool& isRangeQuery = true; // Do not add it to the mapped key. } else { - // If the element is a string but neither escaped nor descriptors, just copy it. + // If the element is a string but neither escaped nor descriptors, add to result. mappedKeyTuple.append(mappedKeyFormatTuple.subTuple(i, i + 1)); } } else { - // If the element not a string, just copy it. + // If the element not a string, add to result. 
mappedKeyTuple.append(mappedKeyFormatTuple.subTuple(i, i + 1)); } } From 66f1c5c85aa8096e7dbe8b6a8afa8790df658a1a Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Wed, 4 May 2022 14:09:31 -0700 Subject: [PATCH 121/299] Small BlobCipher and SimKmsConnector fixes and changes (#6936) * SimKmsConnector fix domain id being unsigned * SimKmsConnector fix returning cipher id 0 as latest key, which is invalid * SimKmsConnector fix keys initialized as c-style strings with incorrect length and uninitialized bytes * SimKmsConnector fix returning different keys for the same id after restart * BlobCipher change APIs to return null reference when key not found * BlobCipher insertCipherKey to return the inserted key --- fdbserver/EncryptKeyProxy.actor.cpp | 1 - fdbserver/SimEncryptKmsProxy.actor.h | 22 +++--- fdbserver/SimKmsConnector.actor.cpp | 18 +++-- flow/BlobCipher.cpp | 113 ++++++++++++++------------- flow/BlobCipher.h | 33 ++++---- 5 files changed, 103 insertions(+), 84 deletions(-) diff --git a/fdbserver/EncryptKeyProxy.actor.cpp b/fdbserver/EncryptKeyProxy.actor.cpp index 22e1dbf1e8..2026d8307c 100644 --- a/fdbserver/EncryptKeyProxy.actor.cpp +++ b/fdbserver/EncryptKeyProxy.actor.cpp @@ -232,7 +232,6 @@ ACTOR Future getLatestCipherKeys(Reference ekpProxyDa EKPGetLatestBaseCipherKeysRequest req) { // Scan the cached cipher-keys and filter our baseCipherIds locally cached // for the rest, reachout to KMS to fetch the required details - state std::vector cachedCipherDetails; state EKPGetLatestBaseCipherKeysRequest latestKeysReq = req; state EKPGetLatestBaseCipherKeysReply latestCipherReply; diff --git a/fdbserver/SimEncryptKmsProxy.actor.h b/fdbserver/SimEncryptKmsProxy.actor.h index fbbcbd51ef..588f6d4cfb 100644 --- a/fdbserver/SimEncryptKmsProxy.actor.h +++ b/fdbserver/SimEncryptKmsProxy.actor.h @@ -29,14 +29,13 @@ #include "fdbclient/FDBTypes.h" #include "fdbrpc/fdbrpc.h" +#include "flow/EncryptUtils.h" #include "flow/FileIdentifier.h" #include "flow/Trace.h" #include "flow/flow.h" #include "flow/network.h" #include "flow/actorcompiler.h" // This must be the last #include. 
-using SimEncryptKeyId = uint64_t; -using SimEncryptDomainId = uint64_t; using SimEncryptKey = std::string; struct SimKmsProxyInterface { @@ -73,12 +72,15 @@ struct SimKmsProxyInterface { struct SimEncryptKeyDetails { constexpr static FileIdentifier file_identifier = 1227025; - SimEncryptDomainId encryptDomainId; - SimEncryptKeyId encryptKeyId; + EncryptCipherDomainId encryptDomainId; + EncryptCipherBaseKeyId encryptKeyId; StringRef encryptKey; SimEncryptKeyDetails() {} - explicit SimEncryptKeyDetails(SimEncryptDomainId domainId, SimEncryptKeyId keyId, StringRef key, Arena& arena) + explicit SimEncryptKeyDetails(EncryptCipherDomainId domainId, + EncryptCipherBaseKeyId keyId, + StringRef key, + Arena& arena) : encryptDomainId(domainId), encryptKeyId(keyId), encryptKey(StringRef(arena, key)) {} template @@ -102,11 +104,12 @@ struct SimGetEncryptKeysByKeyIdsReply { struct SimGetEncryptKeysByKeyIdsRequest { constexpr static FileIdentifier file_identifier = 6913396; - std::vector> encryptKeyIds; + std::vector> encryptKeyIds; ReplyPromise reply; SimGetEncryptKeysByKeyIdsRequest() {} - explicit SimGetEncryptKeysByKeyIdsRequest(const std::vector>& keyIds) + explicit SimGetEncryptKeysByKeyIdsRequest( + const std::vector>& keyIds) : encryptKeyIds(keyIds) {} template @@ -130,11 +133,12 @@ struct SimGetEncryptKeyByDomainIdReply { struct SimGetEncryptKeysByDomainIdsRequest { constexpr static FileIdentifier file_identifier = 9918682; - std::vector encryptDomainIds; + std::vector encryptDomainIds; ReplyPromise reply; SimGetEncryptKeysByDomainIdsRequest() {} - explicit SimGetEncryptKeysByDomainIdsRequest(const std::vector& ids) : encryptDomainIds(ids) {} + explicit SimGetEncryptKeysByDomainIdsRequest(const std::vector& ids) + : encryptDomainIds(ids) {} template void serialize(Ar& ar) { diff --git a/fdbserver/SimKmsConnector.actor.cpp b/fdbserver/SimKmsConnector.actor.cpp index 2ec45f5c8f..958003f7cd 100644 --- a/fdbserver/SimKmsConnector.actor.cpp +++ b/fdbserver/SimKmsConnector.actor.cpp @@ -23,6 +23,7 @@ #include "fdbrpc/sim_validation.h" #include "fdbserver/Knobs.h" #include "flow/ActorCollection.h" +#include "flow/BlobCipher.h" #include "flow/EncryptUtils.h" #include "flow/Error.h" #include "flow/FastRef.h" @@ -42,7 +43,7 @@ struct SimEncryptKeyCtx { EncryptCipherBaseKeyId id; SimEncryptKey key; - explicit SimEncryptKeyCtx(EncryptCipherBaseKeyId kId, const char* data) : id(kId), key(data) {} + explicit SimEncryptKeyCtx(EncryptCipherBaseKeyId kId, const char* data) : id(kId), key(data, AES_256_KEY_LENGTH) {} }; struct SimKmsConnectorContext { @@ -50,13 +51,16 @@ struct SimKmsConnectorContext { std::unordered_map> simEncryptKeyStore; explicit SimKmsConnectorContext(uint32_t keyCount) : maxEncryptionKeys(keyCount) { - uint8_t buffer[AES_256_KEY_LENGTH]; + const unsigned char SHA_KEY[] = "0c39e7906db6d51ac0573d328ce1b6be"; // Construct encryption keyStore. - for (int i = 0; i < maxEncryptionKeys; i++) { - generateRandomData(&buffer[0], AES_256_KEY_LENGTH); - SimEncryptKeyCtx ctx(i, reinterpret_cast(buffer)); - simEncryptKeyStore[i] = std::make_unique(i, reinterpret_cast(buffer)); + // Note the keys generated must be the same after restart. 
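// Editor's aside on the replacement loop that follows (hedged): deriving each
// key as a keyed SHA-256 digest of its index i, via computeAuthToken(&i,
// sizeof(i), SHA_KEY, AES_256_KEY_LENGTH, arena), makes the key store
// deterministic, so a restarted simulation regenerates byte-identical keys.
// The removed generateRandomData() path could not guarantee that, which is
// what the "fix returning different keys for the same id after restart"
// bullet in this commit message refers to.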
+ for (int i = 1; i <= maxEncryptionKeys; i++) { + Arena arena; + StringRef digest = computeAuthToken( + reinterpret_cast(&i), sizeof(i), SHA_KEY, AES_256_KEY_LENGTH, arena); + simEncryptKeyStore[i] = + std::make_unique(i, reinterpret_cast(digest.begin())); } } }; @@ -103,7 +107,7 @@ ACTOR Future simKmsConnectorCore_impl(KmsConnectorInterface interf) { // mean multiple domains gets mapped to the same encryption key which is fine, the EncryptKeyStore // guarantees that keyId -> plaintext encryptKey mapping is idempotent. for (EncryptCipherDomainId domainId : req.encryptDomainIds) { - EncryptCipherBaseKeyId keyId = domainId % SERVER_KNOBS->SIM_KMS_MAX_KEYS; + EncryptCipherBaseKeyId keyId = 1 + abs(domainId) % SERVER_KNOBS->SIM_KMS_MAX_KEYS; const auto& itr = ctx->simEncryptKeyStore.find(keyId); if (itr != ctx->simEncryptKeyStore.end()) { keysByDomainIdRep.cipherKeyDetails.emplace_back( diff --git a/flow/BlobCipher.cpp b/flow/BlobCipher.cpp index d8895cea26..c93f292ae0 100644 --- a/flow/BlobCipher.cpp +++ b/flow/BlobCipher.cpp @@ -114,11 +114,10 @@ void BlobCipherKey::reset() { // BlobKeyIdCache class methods BlobCipherKeyIdCache::BlobCipherKeyIdCache() - : domainId(ENCRYPT_INVALID_DOMAIN_ID), latestBaseCipherKeyId(ENCRYPT_INVALID_CIPHER_KEY_ID), - latestRandomSalt(ENCRYPT_INVALID_RANDOM_SALT) {} + : domainId(ENCRYPT_INVALID_DOMAIN_ID), latestBaseCipherKeyId(), latestRandomSalt() {} BlobCipherKeyIdCache::BlobCipherKeyIdCache(EncryptCipherDomainId dId) - : domainId(dId), latestBaseCipherKeyId(ENCRYPT_INVALID_CIPHER_KEY_ID), latestRandomSalt(ENCRYPT_INVALID_RANDOM_SALT) { + : domainId(dId), latestBaseCipherKeyId(), latestRandomSalt() { TraceEvent("Init_BlobCipherKeyIdCache").detail("DomainId", domainId); } @@ -131,57 +130,45 @@ BlobCipherKeyIdCacheKey BlobCipherKeyIdCache::getCacheKey(const EncryptCipherBas } Reference BlobCipherKeyIdCache::getLatestCipherKey() { - if (keyIdCache.empty()) { - // Cache is empty, nothing more to do. 
- throw encrypt_key_not_found(); + if (!latestBaseCipherKeyId.present()) { + return Reference(); } + ASSERT_NE(latestBaseCipherKeyId.get(), ENCRYPT_INVALID_CIPHER_KEY_ID); + ASSERT(latestRandomSalt.present()); + ASSERT_NE(latestRandomSalt.get(), ENCRYPT_INVALID_RANDOM_SALT); - // Ensure latestCipher details sanity - ASSERT_GT(latestBaseCipherKeyId, ENCRYPT_INVALID_CIPHER_KEY_ID); - ASSERT_GT(latestRandomSalt, ENCRYPT_INVALID_RANDOM_SALT); - - return getCipherByBaseCipherId(latestBaseCipherKeyId, latestRandomSalt); + return getCipherByBaseCipherId(latestBaseCipherKeyId.get(), latestRandomSalt.get()); } Reference BlobCipherKeyIdCache::getCipherByBaseCipherId(const EncryptCipherBaseKeyId& baseCipherKeyId, const EncryptCipherRandomSalt& salt) { BlobCipherKeyIdCacheMapCItr itr = keyIdCache.find(getCacheKey(baseCipherKeyId, salt)); if (itr == keyIdCache.end()) { - TraceEvent("CipherByBaseCipherId_KeyMissing") - .detail("DomainId", domainId) - .detail("BaseCipherId", baseCipherKeyId) - .detail("Salt", salt); - throw encrypt_key_not_found(); + return Reference(); } return itr->second; } -void BlobCipherKeyIdCache::insertBaseCipherKey(const EncryptCipherBaseKeyId& baseCipherId, - const uint8_t* baseCipher, - int baseCipherLen) { +Reference BlobCipherKeyIdCache::insertBaseCipherKey(const EncryptCipherBaseKeyId& baseCipherId, + const uint8_t* baseCipher, + int baseCipherLen) { ASSERT_GT(baseCipherId, ENCRYPT_INVALID_CIPHER_KEY_ID); // BaseCipherKeys are immutable, given the routine invocation updates 'latestCipher', // ensure no key-tampering is done - try { - Reference cipherKey = getLatestCipherKey(); - if (cipherKey.isValid() && cipherKey->getBaseCipherId() == baseCipherId) { - if (memcmp(cipherKey->rawBaseCipher(), baseCipher, baseCipherLen) == 0) { - TraceEvent("InsertBaseCipherKey_AlreadyPresent") - .detail("BaseCipherKeyId", baseCipherId) - .detail("DomainId", domainId); - // Key is already present; nothing more to do. - return; - } else { - TraceEvent("InsertBaseCipherKey_UpdateCipher") - .detail("BaseCipherKeyId", baseCipherId) - .detail("DomainId", domainId); - throw encrypt_update_cipher(); - } - } - } catch (Error& e) { - if (e.code() != error_code_encrypt_key_not_found) { - throw e; + Reference latestCipherKey = getLatestCipherKey(); + if (latestCipherKey.isValid() && latestCipherKey->getBaseCipherId() == baseCipherId) { + if (memcmp(latestCipherKey->rawBaseCipher(), baseCipher, baseCipherLen) == 0) { + TraceEvent("InsertBaseCipherKey_AlreadyPresent") + .detail("BaseCipherKeyId", baseCipherId) + .detail("DomainId", domainId); + // Key is already present; nothing more to do. 
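// Editor's aside (hedged reading of this branch): re-inserting byte-identical
// base cipher material under the same baseCipherId is an idempotent NOP that
// now returns the already-cached key, whereas different bytes under the same
// id still fail with encrypt_update_cipher a few lines below.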
+ return latestCipherKey; + } else { + TraceEvent("InsertBaseCipherKey_UpdateCipher") + .detail("BaseCipherKeyId", baseCipherId) + .detail("DomainId", domainId); + throw encrypt_update_cipher(); } } @@ -193,14 +180,16 @@ void BlobCipherKeyIdCache::insertBaseCipherKey(const EncryptCipherBaseKeyId& bas // Update the latest BaseCipherKeyId for the given encryption domain latestBaseCipherKeyId = baseCipherId; latestRandomSalt = cipherKey->getSalt(); + + return cipherKey; } void BlobCipherKeyIdCache::insertBaseCipherKey(const EncryptCipherBaseKeyId& baseCipherId, const uint8_t* baseCipher, int baseCipherLen, const EncryptCipherRandomSalt& salt) { - ASSERT_GT(baseCipherId, ENCRYPT_INVALID_CIPHER_KEY_ID); - ASSERT_GT(salt, ENCRYPT_INVALID_RANDOM_SALT); + ASSERT_NE(baseCipherId, ENCRYPT_INVALID_CIPHER_KEY_ID); + ASSERT_NE(salt, ENCRYPT_INVALID_RANDOM_SALT); BlobCipherKeyIdCacheKey cacheKey = getCacheKey(baseCipherId, salt); @@ -244,10 +233,10 @@ std::vector> BlobCipherKeyIdCache::getAllCipherKeys() { // BlobCipherKeyCache class methods -void BlobCipherKeyCache::insertCipherKey(const EncryptCipherDomainId& domainId, - const EncryptCipherBaseKeyId& baseCipherId, - const uint8_t* baseCipher, - int baseCipherLen) { +Reference BlobCipherKeyCache::insertCipherKey(const EncryptCipherDomainId& domainId, + const EncryptCipherBaseKeyId& baseCipherId, + const uint8_t* baseCipher, + int baseCipherLen) { if (domainId == ENCRYPT_INVALID_DOMAIN_ID || baseCipherId == ENCRYPT_INVALID_CIPHER_KEY_ID) { throw encrypt_invalid_id(); } @@ -257,12 +246,14 @@ void BlobCipherKeyCache::insertCipherKey(const EncryptCipherDomainId& domainId, if (domainItr == domainCacheMap.end()) { // Add mapping to track new encryption domain Reference keyIdCache = makeReference(domainId); - keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen); + Reference cipherKey = + keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen); domainCacheMap.emplace(domainId, keyIdCache); + return cipherKey; } else { // Track new baseCipher keys Reference keyIdCache = domainItr->second; - keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen); + return keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen); } TraceEvent("InsertCipherKey").detail("DomainId", domainId).detail("BaseCipherKeyId", baseCipherId); @@ -309,19 +300,23 @@ void BlobCipherKeyCache::insertCipherKey(const EncryptCipherDomainId& domainId, } Reference BlobCipherKeyCache::getLatestCipherKey(const EncryptCipherDomainId& domainId) { + if (domainId == ENCRYPT_INVALID_DOMAIN_ID) { + TraceEvent("GetLatestCipherKey_InvalidID").detail("DomainId", domainId); + throw encrypt_invalid_id(); + } auto domainItr = domainCacheMap.find(domainId); if (domainItr == domainCacheMap.end()) { TraceEvent("GetLatestCipherKey_DomainNotFound").detail("DomainId", domainId); - throw encrypt_key_not_found(); + return Reference(); } Reference keyIdCache = domainItr->second; Reference cipherKey = keyIdCache->getLatestCipherKey(); - if ((now() - cipherKey->getCreationTime()) > FLOW_KNOBS->ENCRYPT_CIPHER_KEY_CACHE_TTL) { + if (cipherKey.isValid() && (now() - cipherKey->getCreationTime()) > FLOW_KNOBS->ENCRYPT_CIPHER_KEY_CACHE_TTL) { TraceEvent("GetLatestCipherKey_ExpiredTTL") .detail("DomainId", domainId) .detail("BaseCipherId", cipherKey->getBaseCipherId()); - throw encrypt_key_ttl_expired(); + return Reference(); } return cipherKey; @@ -332,8 +327,7 @@ Reference BlobCipherKeyCache::getCipherKey(const EncryptCipherDom const EncryptCipherRandomSalt& 
salt) { auto domainItr = domainCacheMap.find(domainId); if (domainItr == domainCacheMap.end()) { - TraceEvent("GetCipherKey_MissingDomainId").detail("DomainId", domainId); - throw encrypt_key_not_found(); + return Reference(); } Reference keyIdCache = domainItr->second; @@ -343,7 +337,7 @@ Reference BlobCipherKeyCache::getCipherKey(const EncryptCipherDom void BlobCipherKeyCache::resetEncryptDomainId(const EncryptCipherDomainId domainId) { auto domainItr = domainCacheMap.find(domainId); if (domainItr == domainCacheMap.end()) { - throw encrypt_key_not_found(); + return; } Reference keyIdCache = domainItr->second; @@ -777,9 +771,22 @@ TEST_CASE("flow/BlobCipher") { } ASSERT_EQ(domainKeyMap.size(), maxDomainId); + Reference cipherKeyCache = BlobCipherKeyCache::getInstance(); + + // validate getLatestCipherKey return empty when there's no cipher key + TraceEvent("BlobCipherTest_LatestKeyNotExists").log(); + Reference latestKeyNonexists = + cipherKeyCache->getLatestCipherKey(deterministicRandom()->randomInt(minDomainId, maxDomainId)); + ASSERT(!latestKeyNonexists.isValid()); + try { + cipherKeyCache->getLatestCipherKey(ENCRYPT_INVALID_DOMAIN_ID); + ASSERT(false); // shouldn't get here + } catch (Error& e) { + ASSERT_EQ(e.code(), error_code_encrypt_invalid_id); + } + // insert BlobCipher keys into BlobCipherKeyCache map and validate TraceEvent("BlobCipherTest_InsertKeys").log(); - Reference cipherKeyCache = BlobCipherKeyCache::getInstance(); for (auto& domainItr : domainKeyMap) { for (auto& baseKeyItr : domainItr.second) { Reference baseCipher = baseKeyItr.second; diff --git a/flow/BlobCipher.h b/flow/BlobCipher.h index 1d7a8d8dee..e868f45aff 100644 --- a/flow/BlobCipher.h +++ b/flow/BlobCipher.h @@ -244,12 +244,12 @@ public: const EncryptCipherRandomSalt& salt); // API returns the last inserted cipherKey. - // If none exists, 'encrypt_key_not_found' is thrown. + // If none exists, null reference is returned. Reference getLatestCipherKey(); // API returns cipherKey corresponding to input 'baseCipherKeyId'. - // If none exists, 'encrypt_key_not_found' is thrown. + // If none exists, null reference is returned. Reference getCipherByBaseCipherId(const EncryptCipherBaseKeyId& baseCipherKeyId, const EncryptCipherRandomSalt& salt); @@ -257,17 +257,19 @@ public: // API enables inserting base encryption cipher details to the BlobCipherKeyIdCache. // Given cipherKeys are immutable, attempting to re-insert same 'identical' cipherKey // is treated as a NOP (success), however, an attempt to update cipherKey would throw - // 'encrypt_update_cipher' exception. + // 'encrypt_update_cipher' exception. Returns the inserted cipher key if success. // // API NOTE: Recommended usecase is to update encryption cipher-key is updated the external // keyManagementSolution to limit an encryption key lifetime - void insertBaseCipherKey(const EncryptCipherBaseKeyId& baseCipherId, const uint8_t* baseCipher, int baseCipherLen); + Reference insertBaseCipherKey(const EncryptCipherBaseKeyId& baseCipherId, + const uint8_t* baseCipher, + int baseCipherLen); // API enables inserting base encryption cipher details to the BlobCipherKeyIdCache // Given cipherKeys are immutable, attempting to re-insert same 'identical' cipherKey // is treated as a NOP (success), however, an attempt to update cipherKey would throw - // 'encrypt_update_cipher' exception. + // 'encrypt_update_cipher' exception. Returns the inserted cipher key if sucess. 
// // API NOTE: Recommended usecase is to update encryption cipher-key regeneration while performing // decryption. The encryptionheader would contain relevant details including: 'encryptDomainId', @@ -288,8 +290,8 @@ public: private: EncryptCipherDomainId domainId; BlobCipherKeyIdCacheMap keyIdCache; - EncryptCipherBaseKeyId latestBaseCipherKeyId; - EncryptCipherRandomSalt latestRandomSalt; + Optional latestBaseCipherKeyId; + Optional latestRandomSalt; }; using BlobCipherDomainCacheMap = std::unordered_map>; @@ -305,19 +307,21 @@ public: // The cipherKeys are indexed using 'baseCipherId', given cipherKeys are immutable, // attempting to re-insert same 'identical' cipherKey is treated as a NOP (success), // however, an attempt to update cipherKey would throw 'encrypt_update_cipher' exception. + // Returns the inserted cipher key if success. // - // API NOTE: Recommended usecase is to update encryption cipher-key is updated the external + // API NOTE: Recommended use case is to update encryption cipher-key is updated the external // keyManagementSolution to limit an encryption key lifetime - void insertCipherKey(const EncryptCipherDomainId& domainId, - const EncryptCipherBaseKeyId& baseCipherId, - const uint8_t* baseCipher, - int baseCipherLen); + Reference insertCipherKey(const EncryptCipherDomainId& domainId, + const EncryptCipherBaseKeyId& baseCipherId, + const uint8_t* baseCipher, + int baseCipherLen); // Enable clients to insert base encryption cipher details to the BlobCipherKeyCache. // The cipherKeys are indexed using 'baseCipherId', given cipherKeys are immutable, // attempting to re-insert same 'identical' cipherKey is treated as a NOP (success), // however, an attempt to update cipherKey would throw 'encrypt_update_cipher' exception. + // Returns the inserted cipher key if success. // // API NOTE: Recommended usecase is to update encryption cipher-key regeneration while performing // decryption. The encryptionheader would contain relevant details including: 'encryptDomainId', @@ -331,12 +335,13 @@ public: const EncryptCipherRandomSalt& salt); // API returns the last insert cipherKey for a given encryption domain Id. - // If none exists, it would throw 'encrypt_key_not_found' exception. + // If domain Id is invalid, it would throw 'encrypt_invalid_id' exception, + // otherwise, and if none exists, it would return null reference. Reference getLatestCipherKey(const EncryptCipherDomainId& domainId); // API returns cipherKey corresponding to {encryptionDomainId, baseCipherId} tuple. - // If none exists, it would throw 'encrypt_key_not_found' exception. + // If none exists, it would return null reference. 
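Given the null-reference contract these comments describe, a hedged caller-side sketch of the lookup-then-populate pattern; only getLatestCipherKey, insertCipherKey, and BlobCipherKeyCache::getInstance come from this patch, while fetchFromKms and its result fields are hypothetical stand-ins:

Reference<BlobCipherKeyCache> cipherKeyCache = BlobCipherKeyCache::getInstance();
Reference<BlobCipherKey> cipherKey = cipherKeyCache->getLatestCipherKey(domainId);
if (!cipherKey.isValid()) {
	// A cache miss or TTL expiry now surfaces as an invalid reference rather
	// than an encrypt_key_not_found throw, so refresh from the KMS.
	KmsCipher base = fetchFromKms(domainId); // hypothetical helper and type
	cipherKey = cipherKeyCache->insertCipherKey(domainId, base.baseCipherId, base.bytes, base.len);
}
ASSERT(cipherKey.isValid());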
Reference getCipherKey(const EncryptCipherDomainId& domainId, const EncryptCipherBaseKeyId& baseCipherId, From ae66ed6c16a108d277dccf50db1200283c502d37 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 4 May 2022 14:11:20 -0700 Subject: [PATCH 122/299] fix DataDistributionQueue time_out ; reset the rebalance poll time --- fdbserver/DataDistributionQueue.actor.cpp | 10 +++++++--- fdbserver/DataDistributionTracker.actor.cpp | 9 +++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index dc2b0f330f..90e73c5acc 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1733,7 +1733,7 @@ ACTOR Future BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, try { // FIXME: change back to BG_REBALANCE_SWITCH_CHECK_INTERVAL after test - delayF = delay(0.1, TaskPriority::DataDistributionLaunch); + delayF = delay(rebalancePollingInterval, TaskPriority::DataDistributionLaunch); if ((now() - lastRead) > SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL) { tr.setOption(FDBTransactionOptions::LOCK_AWARE); tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); @@ -2190,7 +2190,9 @@ ACTOR Future dataDistributionQueue(Database cx, debug_setCheckRelocationDuration(false); } } - when(KeyRange done = waitNext(rangesComplete.getFuture())) { keysToLaunchFrom = done; } + when(KeyRange done = waitNext(rangesComplete.getFuture())) { + keysToLaunchFrom = done; + } when(wait(recordMetrics)) { Promise req; getAverageShardBytes.send(req); @@ -2237,7 +2239,9 @@ ACTOR Future dataDistributionQueue(Database cx, } when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator when(wait(waitForAll(balancingFutures))) {} - when(Promise r = waitNext(getUnhealthyRelocationCount)) { r.send(self.unhealthyRelocations); } + when(Promise r = waitNext(getUnhealthyRelocationCount)) { + r.send(self.unhealthyRelocations); + } } } } catch (Error& e) { diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index 2cb279049e..6d85a1ad60 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -886,7 +886,10 @@ ACTOR Future fetchTopKShardMetrics_impl(DataDistributionTracker* self, Get ACTOR Future fetchTopKShardMetrics(DataDistributionTracker* self, GetTopKMetricsRequest req) { choose { when(wait(fetchTopKShardMetrics_impl(self, req))) {} - when(wait(delay(SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT))) { req.reply.sendError(timed_out()); } + when(wait(delay(SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT))) { + TEST(true); // TopK DD_SHARD_METRICS_TIMEOUT + req.reply.send(std::vector(1)); + } } return Void(); } @@ -973,7 +976,9 @@ ACTOR Future fetchShardMetricsList_impl(DataDistributionTracker* self, Get ACTOR Future fetchShardMetricsList(DataDistributionTracker* self, GetMetricsListRequest req) { choose { when(wait(fetchShardMetricsList_impl(self, req))) {} - when(wait(delay(SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT))) { req.reply.sendError(timed_out()); } + when(wait(delay(SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT))) { + req.reply.sendError(timed_out()); + } } return Void(); } From 22eafcf7a2ba709b117b89ef93d6478909cbb47c Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 4 May 2022 14:27:42 -0700 Subject: [PATCH 123/299] rename trace event --- fdbserver/DataDistributionQueue.actor.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 90e73c5acc..0740a87008 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1714,7 +1714,7 @@ ACTOR Future BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, state bool skipCurrentLoop = false; state Future delayF = Never(); state const bool readRebalance = !isDiskRebalancePriority(ddPriority); - state const char* eventName = isMountainChopperPriority(ddPriority) ? "BgDDMountainChopper" : "BgDDValleyFiller"; + state const char* eventName = isMountainChopperPriority(ddPriority) ? "BgDDMountainChopper_New" : "BgDDValleyFiller_New"; loop { state bool moved = false; @@ -1845,7 +1845,7 @@ ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionInde loop { state std::pair>, bool> randomTeam; state bool moved = false; - state TraceEvent traceEvent("BgDDMountainChopper", self->distributorId); + state TraceEvent traceEvent("BgDDMountainChopper_Old", self->distributorId); traceEvent.suppressFor(5.0).detail("PollingInterval", rebalancePollingInterval).detail("Rebalance", "Disk"); if (*self->lastLimited > 0) { @@ -1968,7 +1968,7 @@ ACTOR Future BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex) loop { state std::pair>, bool> randomTeam; state bool moved = false; - state TraceEvent traceEvent("BgDDValleyFiller", self->distributorId); + state TraceEvent traceEvent("BgDDValleyFiller_Old", self->distributorId); traceEvent.suppressFor(5.0).detail("PollingInterval", rebalancePollingInterval).detail("Rebalance", "Disk"); if (*self->lastLimited > 0) { From 964e0cecef2452e4cbd4c6957512d82cee7a9cfd Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 4 May 2022 14:52:39 -0700 Subject: [PATCH 124/299] format --- fdbserver/DataDistributionQueue.actor.cpp | 11 ++++------- fdbserver/DataDistributionTracker.actor.cpp | 4 +--- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 0740a87008..26860b586a 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1714,7 +1714,8 @@ ACTOR Future BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, state bool skipCurrentLoop = false; state Future delayF = Never(); state const bool readRebalance = !isDiskRebalancePriority(ddPriority); - state const char* eventName = isMountainChopperPriority(ddPriority) ? "BgDDMountainChopper_New" : "BgDDValleyFiller_New"; + state const char* eventName = + isMountainChopperPriority(ddPriority) ? 
"BgDDMountainChopper_New" : "BgDDValleyFiller_New"; loop { state bool moved = false; @@ -2190,9 +2191,7 @@ ACTOR Future dataDistributionQueue(Database cx, debug_setCheckRelocationDuration(false); } } - when(KeyRange done = waitNext(rangesComplete.getFuture())) { - keysToLaunchFrom = done; - } + when(KeyRange done = waitNext(rangesComplete.getFuture())) { keysToLaunchFrom = done; } when(wait(recordMetrics)) { Promise req; getAverageShardBytes.send(req); @@ -2239,9 +2238,7 @@ ACTOR Future dataDistributionQueue(Database cx, } when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator when(wait(waitForAll(balancingFutures))) {} - when(Promise r = waitNext(getUnhealthyRelocationCount)) { - r.send(self.unhealthyRelocations); - } + when(Promise r = waitNext(getUnhealthyRelocationCount)) { r.send(self.unhealthyRelocations); } } } } catch (Error& e) { diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index 6d85a1ad60..32d24b0f8e 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -976,9 +976,7 @@ ACTOR Future fetchShardMetricsList_impl(DataDistributionTracker* self, Get ACTOR Future fetchShardMetricsList(DataDistributionTracker* self, GetMetricsListRequest req) { choose { when(wait(fetchShardMetricsList_impl(self, req))) {} - when(wait(delay(SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT))) { - req.reply.sendError(timed_out()); - } + when(wait(delay(SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT))) { req.reply.sendError(timed_out()); } } return Void(); } From 03607675e840764537fe178e1d70a8dda650c630 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 4 May 2022 17:42:49 -0700 Subject: [PATCH 125/299] refactor GetTeamRequest --- fdbserver/DDTeamCollection.actor.cpp | 22 +++++++++---- fdbserver/DataDistribution.actor.h | 38 ++++++++++++----------- fdbserver/DataDistributionQueue.actor.cpp | 38 +++++++++-------------- 3 files changed, 51 insertions(+), 47 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 6d0f8b2ae9..0b04cafcf8 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -44,6 +44,8 @@ FDB_DEFINE_BOOLEAN_PARAM(WantNewServers); FDB_DEFINE_BOOLEAN_PARAM(WantTrueBest); FDB_DEFINE_BOOLEAN_PARAM(PreferLowerUtilization); FDB_DEFINE_BOOLEAN_PARAM(TeamMustHaveShards); +FDB_DEFINE_BOOLEAN_PARAM(ForReadBalance); +FDB_DEFINE_BOOLEAN_PARAM(PreferLowerReadUtil); class DDTeamCollectionImpl { ACTOR static Future checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self) { @@ -255,8 +257,7 @@ public: (!req.preferLowerUtilization || self->teams[currentIndex]->hasHealthyAvailableSpace(self->medianAvailableSpace))) { int64_t loadBytes = self->teams[currentIndex]->getLoadBytes(true, req.inflightPenalty); - if (req.eligible(self->teams[currentIndex]) && // hard constraints - (!req.teamMustHaveShards || + if ((!req.teamMustHaveShards || self->shardsAffectedByTeamFailure->hasShards(ShardsAffectedByTeamFailure::Team( self->teams[currentIndex]->getServerIDs(), self->primary))) && // sort conditions @@ -5688,14 +5689,23 @@ public: auto wantsTrueBest = WantTrueBest::True; auto preferLowerUtilization = PreferLowerUtilization::True; auto teamMustHaveShards = TeamMustHaveShards::False; + auto forReadBalance = ForReadBalance::True; std::vector completeSources{ UID(1, 0), UID(2, 0), UID(3, 0) }; - state GetTeamRequest req(wantsNewServers, wantsTrueBest, preferLowerUtilization, teamMustHaveShards); + 
state GetTeamRequest req(wantsNewServers,
+	                         wantsTrueBest,
+	                         preferLowerUtilization,
+	                         teamMustHaveShards,
+	                         forReadBalance,
+	                         PreferLowerReadUtil::True);
 	req.completeSources = completeSources;
-	req.teamSorter = greaterReadLoad;
-	state GetTeamRequest reqHigh(wantsNewServers, wantsTrueBest, PreferLowerUtilization::False, teamMustHaveShards);
-	reqHigh.teamSorter = lessReadLoad;
+	state GetTeamRequest reqHigh(wantsNewServers,
+	                             wantsTrueBest,
+	                             PreferLowerUtilization::False,
+	                             teamMustHaveShards,
+	                             forReadBalance,
+	                             PreferLowerReadUtil::False);

 	wait(collection->getTeam(req) && collection->getTeam(reqHigh));

 	std::pair<Optional<Reference<IDataDistributionTeam>>, bool> resTeam = req.reply.getFuture().get(),
diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h
index dec08776d7..6e394a38e4 100644
--- a/fdbserver/DataDistribution.actor.h
+++ b/fdbserver/DataDistribution.actor.h
@@ -86,37 +86,40 @@ FDB_DECLARE_BOOLEAN_PARAM(WantNewServers);
 FDB_DECLARE_BOOLEAN_PARAM(WantTrueBest);
 FDB_DECLARE_BOOLEAN_PARAM(PreferLowerUtilization);
 FDB_DECLARE_BOOLEAN_PARAM(TeamMustHaveShards);
+FDB_DECLARE_BOOLEAN_PARAM(ForReadBalance);
+FDB_DECLARE_BOOLEAN_PARAM(PreferLowerReadUtil);

 struct GetTeamRequest {
 	bool wantsNewServers; // In additional to servers in completeSources, try to find teams with new server
 	bool wantsTrueBest;
 	bool preferLowerUtilization; // if true, lower utilized team has higher score
 	bool teamMustHaveShards;
+	bool forReadBalance;
+	bool preferLowerReadUtil; // only makes sense when forReadBalance is true
 	double inflightPenalty;
 	std::vector<UID> completeSources;
 	std::vector<UID> src;
 	Promise<std::pair<Optional<Reference<IDataDistributionTeam>>, bool>> reply;

-	// optional
 	typedef Reference<IDataDistributionTeam> TeamRef;
-	std::function<bool(TeamRef)> hardConstraint;
-	std::function<int(TeamRef, TeamRef)>
-	    teamSorter; // => -1 if a.score < b.score, 0 if equal, 1 if larger, the reply will choose the largest one

 	GetTeamRequest() {}
 	GetTeamRequest(WantNewServers wantsNewServers,
 	               WantTrueBest wantsTrueBest,
 	               PreferLowerUtilization preferLowerUtilization,
 	               TeamMustHaveShards teamMustHaveShards,
+	               ForReadBalance forReadBalance = ForReadBalance::False,
+	               PreferLowerReadUtil preferLowerReadUtil = PreferLowerReadUtil::False,
 	               double inflightPenalty = 1.0)
 	  : wantsNewServers(wantsNewServers), wantsTrueBest(wantsTrueBest), preferLowerUtilization(preferLowerUtilization),
-	    teamMustHaveShards(teamMustHaveShards), inflightPenalty(inflightPenalty) {}
+	    teamMustHaveShards(teamMustHaveShards), forReadBalance(forReadBalance),
+	    preferLowerReadUtil(preferLowerReadUtil), inflightPenalty(inflightPenalty) {}

 	// return true if a.score < b.score
 	[[nodiscard]] bool lessCompare(TeamRef a, TeamRef b, int64_t aLoadBytes, int64_t bLoadBytes) const {
 		int res = 0;
-		if (teamSorter) {
-			res = teamSorter(a, b);
+		if (forReadBalance) {
+			res = preferLowerReadUtil ? greaterReadLoad(a, b) : lessReadLoad(a, b);
 		}
 		return res == 0 ? lessCompareByLoad(aLoadBytes, bLoadBytes) : res < 0;
 	}
@@ -128,11 +131,15 @@ struct GetTeamRequest {
 		return preferLowerUtilization ? !lessLoad : lessLoad;
 	}

-	bool eligible(TeamRef a) const {
-		if (hardConstraint) {
-			return hardConstraint(a);
-		}
-		return true;
+	// return -1 if a.readload > b.readload
+	static int greaterReadLoad(TeamRef a, TeamRef b) {
+		auto r1 = a->getLoadReadBandwidth(true, 2), r2 = b->getLoadReadBandwidth(true, 2);
+		return r1 == r2 ? 0 : (r1 > r2 ? -1 : 1);
+	}
+	// return -1 if a.readload < b.readload
+	static int lessReadLoad(TeamRef a, TeamRef b) {
+		auto r1 = a->getLoadReadBandwidth(), r2 = b->getLoadReadBandwidth();
+		return r1 == r2 ? 0 : (r1 < r2 ? -1 : 1);
+	}

 	std::string getDesc() const {
@@ -140,8 +147,7 @@ struct GetTeamRequest {
 		ss << "WantsNewServers:" << wantsNewServers << " WantsTrueBest:" << wantsTrueBest
 		   << " PreferLowerUtilization:" << preferLowerUtilization << " teamMustHaveShards:" << teamMustHaveShards
-		   << " inflightPenalty:" << inflightPenalty << " hardConstraint: " << (bool)hardConstraint
-		   << " teamSorter: " << (bool)teamSorter << ";";
+		   << " ForReadBalance:" << forReadBalance << " inflightPenalty:" << inflightPenalty << ";";
 		ss << "CompleteSources:";
 		for (const auto& cs : completeSources) {
 			ss << cs.toString() << ",";
@@ -489,9 +495,5 @@ struct StorageWiggler : ReferenceCounted<StorageWiggler> {
 ACTOR Future<std::vector<std::pair<StorageServerInterface, ProcessClass>>> getServerListAndProcessClasses(
     Transaction* tr);

-// return -1 if a.readload > b.readload
-int greaterReadLoad(Reference<IDataDistributionTeam> a, Reference<IDataDistributionTeam> b);
-// return -1 if a.readload < b.readload
-int lessReadLoad(Reference<IDataDistributionTeam> a, Reference<IDataDistributionTeam> b);
 #include "flow/unactorcompiler.h"
 #endif
diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp
index 26860b586a..a00647115e 100644
--- a/fdbserver/DataDistributionQueue.actor.cpp
+++ b/fdbserver/DataDistributionQueue.actor.cpp
@@ -1096,18 +1096,6 @@ struct DDQueueData {
 	}
 };

-// return -1 if a.readload > b.readload, usually for choose dest team with low read load
-int greaterReadLoad(Reference<IDataDistributionTeam> a, Reference<IDataDistributionTeam> b) {
-	auto r1 = a->getLoadReadBandwidth(true, 2), r2 = b->getLoadReadBandwidth(true, 2);
-	return r1 == r2 ? 0 : (r1 > r2 ? -1 : 1);
-}
-
-// return -1 if a.readload < b.readload, usually for choose source team with high read load
-int lessReadLoad(Reference<IDataDistributionTeam> a, Reference<IDataDistributionTeam> b) {
-	auto r1 = a->getLoadReadBandwidth(), r2 = b->getLoadReadBandwidth();
-	return r1 == r2 ? 0 : (r1 < r2 ? -1 : 1);
-}
-
 static std::string destServersString(std::vector<std::pair<Reference<IDataDistributionTeam>, bool>> const& bestTeams) {
 	std::stringstream ss;

@@ -1190,14 +1178,13 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd,
 	                       WantTrueBest(isValleyFillerPriority(rd.priority)),
 	                       PreferLowerUtilization::True,
 	                       TeamMustHaveShards::False,
+	                       ForReadBalance(rd.reason == RelocateReason::REBALANCE_READ),
+	                       PreferLowerReadUtil::True,
 	                       inflightPenalty);

 	req.src = rd.src;
 	req.completeSources = rd.completeSources;

-	if (rd.reason == RelocateReason::REBALANCE_READ) {
-		req.teamSorter = greaterReadLoad;
-	}
 	// bestTeam.second = false if the bestTeam in the teamCollection (in the DC) does not have any
 	// server that hosts the relocateData. This is possible, for example, in a fearless configuration
 	// when the remote DC is just brought up.
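
The hunks above and below complete patch 125's refactor: the ad-hoc std::function members (hardConstraint, teamSorter) disappear, the read-load comparators become static members of GetTeamRequest, and callers state their intent through the new boolean parameters. Sketched with abbreviated call sites (argument lists shortened; the names are the ones introduced by this patch):

	// Before: comparator injected per request via a function member.
	GetTeamRequest req(WantNewServers::True, WantTrueBest::True,
	                   PreferLowerUtilization::True, TeamMustHaveShards::False);
	req.teamSorter = greaterReadLoad; // free function, removed by this patch

	// After: intent carried by flags; lessCompare() picks the comparator itself.
	GetTeamRequest req(WantNewServers::True, WantTrueBest::True,
	                   PreferLowerUtilization::True, TeamMustHaveShards::False,
	                   ForReadBalance::True, PreferLowerReadUtil::True);
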
@@ -1778,25 +1765,30 @@ ACTOR Future<Void> BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex,
 			srcReq = GetTeamRequest(WantNewServers::True,
 			                        WantTrueBest::True,
 			                        PreferLowerUtilization::False,
-			                        TeamMustHaveShards::True);
+			                        TeamMustHaveShards::True,
+			                        ForReadBalance(readRebalance),
+			                        PreferLowerReadUtil::False);
 			destReq = GetTeamRequest(WantNewServers::True,
 			                         WantTrueBest::False,
 			                         PreferLowerUtilization::True,
-			                         TeamMustHaveShards::False);
+			                         TeamMustHaveShards::False,
+			                         ForReadBalance(readRebalance),
+			                         PreferLowerReadUtil::True);
 		} else {
 			srcReq = GetTeamRequest(WantNewServers::True,
 			                        WantTrueBest::False,
 			                        PreferLowerUtilization::False,
-			                        TeamMustHaveShards::True);
+			                        TeamMustHaveShards::True,
+			                        ForReadBalance(readRebalance),
+			                        PreferLowerReadUtil::False);
 			destReq = GetTeamRequest(WantNewServers::True,
 			                        WantTrueBest::True,
 			                        PreferLowerUtilization::True,
-			                        TeamMustHaveShards::False);
-		}
-		if (readRebalance) {
-			srcReq.teamSorter = lessReadLoad;
-			destReq.teamSorter = greaterReadLoad;
+			                        TeamMustHaveShards::False,
+			                        ForReadBalance(readRebalance),
+			                        PreferLowerReadUtil::True);
 		}
+		// clang-format off
 		wait(getSrcDestTeams(self, teamCollectionIndex, srcReq, destReq, &sourceTeam, &destTeam,ddPriority,&traceEvent));
 		if (sourceTeam.isValid() && destTeam.isValid()) {

From b970d507c0747256c1bcc3ce22dbc1cda79fb831 Mon Sep 17 00:00:00 2001
From: Jingyu Zhou
Date: Wed, 4 May 2022 21:19:52 -0700
Subject: [PATCH 126/299] Avoid creating LogPushData when
 PROXY_USE_RESOLVER_PRIVATE_MUTATIONS is off

To save CPU cost, especially for creating LogSystemConfig.
---
 fdbserver/Resolver.actor.cpp | 38 ++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/fdbserver/Resolver.actor.cpp b/fdbserver/Resolver.actor.cpp
index 91449c6923..e088de711a 100644
--- a/fdbserver/Resolver.actor.cpp
+++ b/fdbserver/Resolver.actor.cpp
@@ -18,6 +18,8 @@
  * limitations under the License.
  */

+#include <memory>
+
 #include "fdbclient/NativeAPI.actor.h"
 #include "fdbclient/Notified.h"
 #include "fdbclient/StorageServerInterface.h"
@@ -310,20 +312,22 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self, ResolveTransactionBatc
 	auto& stateTransactions = stateTransactionsPair.second;
 	int64_t stateMutations = 0;
 	int64_t stateBytes = 0;
-	LogPushData toCommit(self->logSystem); // For accumulating private mutations
-	ResolverData resolverData(self->dbgid,
-	                          self->logSystem,
-	                          self->txnStateStore,
-	                          &self->keyInfo,
-	                          &toCommit,
-	                          self->forceRecovery,
-	                          req.version + 1,
-	                          &self->storageCache,
-	                          &self->tssMapping);
+	std::unique_ptr<LogPushData> toCommit(nullptr); // For accumulating private mutations
+	std::unique_ptr<ResolverData> resolverData(nullptr);
 	bool isLocked = false;
 	if (SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS) {
 		auto lockedKey = self->txnStateStore->readValue(databaseLockedKey).get();
 		isLocked = lockedKey.present() && lockedKey.get().size();
+		toCommit.reset(new LogPushData(self->logSystem));
+		resolverData.reset(new ResolverData(self->dbgid,
+		                                    self->logSystem,
+		                                    self->txnStateStore,
+		                                    &self->keyInfo,
+		                                    toCommit.get(),
+		                                    self->forceRecovery,
+		                                    req.version + 1,
+		                                    &self->storageCache,
+		                                    &self->tssMapping));
 	}
 	for (int t : req.txnStateTransactions) {
 		stateMutations += req.transactions[t].mutations.size();
@@ -343,7 +347,7 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self, ResolveTransactionBatc
 			SpanContext spanContext = req.transactions[t].spanContext.present() ?
req.transactions[t].spanContext.get() : SpanContext(); - applyMetadataMutations(spanContext, resolverData, req.transactions[t].mutations); + applyMetadataMutations(spanContext, *resolverData, req.transactions[t].mutations); } TEST(self->forceRecovery); // Resolver detects forced recovery } @@ -361,20 +365,20 @@ ACTOR Future resolveBatch(Reference self, ResolveTransactionBatc // If shardChanged at or before this commit version, the proxy may have computed // the wrong set of groups. Then we need to broadcast to all groups below. - stateTransactionsPair.first = toCommit.isShardChanged(); + stateTransactionsPair.first = toCommit && toCommit->isShardChanged(); bool shardChanged = self->recentStateTransactionsInfo.applyStateTxnsToBatchReply( - &reply, firstUnseenVersion, req.version, toCommit.isShardChanged()); + &reply, firstUnseenVersion, req.version, toCommit && toCommit->isShardChanged()); // Adds private mutation messages to the reply message. if (SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS) { - auto privateMutations = toCommit.getAllMessages(); + auto privateMutations = toCommit->getAllMessages(); for (const auto& mutations : privateMutations) { reply.privateMutations.push_back(reply.arena, mutations); reply.arena.dependsOn(mutations.arena()); } // merge mutation tags with sent client tags - toCommit.saveTags(reply.writtenTags); - reply.privateMutationCount = toCommit.getMutationCount(); + toCommit->saveTags(reply.writtenTags); + reply.privateMutationCount = toCommit->getMutationCount(); } //TraceEvent("ResolveBatch", self->dbgid).detail("PrevVersion", req.prevVersion).detail("Version", req.version).detail("StateTransactionVersions", self->recentStateTransactionsInfo.size()).detail("StateBytes", stateBytes).detail("FirstVersion", self->recentStateTransactionsInfo.empty() ? -1 : self->recentStateTransactionsInfo.firstVersion()).detail("StateMutationsIn", req.txnStateTransactions.size()).detail("StateMutationsOut", reply.stateMutations.size()).detail("From", proxyAddress); @@ -418,7 +422,7 @@ ACTOR Future resolveBatch(Reference self, ResolveTransactionBatc writtenTLogs.insert(i); } } else { - toCommit.getLocations(reply.writtenTags, writtenTLogs); + toCommit->getLocations(reply.writtenTags, writtenTLogs); } if (self->tpcvVector[0] == invalidVersion) { std::fill(self->tpcvVector.begin(), self->tpcvVector.end(), req.prevVersion); From 7ba2be6f3045269516a2f5a3e5df7fb48af61d39 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 5 May 2022 10:03:30 -0700 Subject: [PATCH 127/299] Update simulator to sometimes create tenants it doesn't use. Fix bug in mapped range where it would fail if any tenants existed and transactions were run not using the tenants. 
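
Patch 127's mapped-range fix replaces an inline iterator check in getMappedKeyValuesQ that mishandled range/tenant boundaries: the old code applied lastLessOrEqual to the range's exclusive end, so a tenant whose prefix starts exactly at that end was counted, and a tenantless transaction could be rejected with tenant_name_required() for a range touching no tenant at all. The replacement, rangeIntersectsAnyTenant(), reduces "does [begin, end) overlap any tenant prefix?" to two ordered lookups plus one prefix test. A rough sketch of the same structure over a plain std::map, which here stands in for the versioned TenantPrefixIndex (illustrative only, not the real data structure):

	#include <map>
	#include <string>

	// Does [begin, end) overlap any tenant prefix? "prefixes" maps prefix -> tenant name.
	bool rangeIntersectsAnyTenant(const std::map<std::string, std::string>& prefixes,
	                              const std::string& begin, const std::string& end) {
	    auto beginItr = prefixes.upper_bound(begin); // first prefix > begin
	    auto endItr = prefixes.lower_bound(end);     // first prefix >= end (exclusive end!)
	    if (beginItr != endItr)
	        return true; // some tenant prefix starts strictly inside [begin, end)
	    // Otherwise the range can only intersect the tenant whose prefix precedes begin,
	    // and only if begin actually falls inside that prefix.
	    if (beginItr != prefixes.begin()) {
	        --beginItr;
	        return begin.compare(0, beginItr->first.size(), beginItr->first) == 0;
	    }
	    return false;
	}

The real implementation below uses lastLessOrEqual/lastLess on the versioned map, but the two-lookup shape is the same, and the new unit test walks exactly the boundary cases (before, after, between, and straddling tenants) where the old check went wrong.
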
--- fdbserver/SimulatedCluster.actor.cpp | 18 +++- fdbserver/TesterInterface.actor.h | 20 +++-- fdbserver/storageserver.actor.cpp | 120 +++++++++++++++++++++++++-- fdbserver/tester.actor.cpp | 38 ++++++--- 4 files changed, 167 insertions(+), 29 deletions(-) diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 4b319a0081..3d6961527c 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -2362,6 +2362,7 @@ ACTOR void setupAndRun(std::string dataFolder, allowList.addTrustedSubnet("abcd::/16"sv); state bool allowDefaultTenant = testConfig.allowDefaultTenant; state bool allowDisablingTenants = testConfig.allowDisablingTenants; + state bool allowCreatingTenants = true; // The RocksDB storage engine does not support the restarting tests because you cannot consistently get a clean // snapshot of the storage engine without a snapshotting file system. @@ -2372,6 +2373,7 @@ ACTOR void setupAndRun(std::string dataFolder, // Disable the default tenant in restarting tests for now // TODO: persist the chosen default tenant in the restartInfo.ini file for the second test allowDefaultTenant = false; + allowCreatingTenants = false; } // TODO: Currently backup and restore related simulation tests are failing when run with rocksDB storage engine @@ -2425,9 +2427,11 @@ ACTOR void setupAndRun(std::string dataFolder, TEST(true); // Simulation start state Optional defaultTenant; + state Standalone> tenantsToCreate; state TenantMode tenantMode = TenantMode::DISABLED; if (allowDefaultTenant && deterministicRandom()->random01() < 0.5) { defaultTenant = "SimulatedDefaultTenant"_sr; + tenantsToCreate.push_back_deep(tenantsToCreate.arena(), defaultTenant.get()); if (deterministicRandom()->random01() < 0.9) { tenantMode = TenantMode::REQUIRED; } else { @@ -2437,9 +2441,18 @@ ACTOR void setupAndRun(std::string dataFolder, tenantMode = TenantMode::OPTIONAL_TENANT; } + if (allowCreatingTenants && tenantMode != TenantMode::DISABLED && deterministicRandom()->random01() < 0.5) { + int numTenants = deterministicRandom()->randomInt(1, 6); + for (int i = 0; i < numTenants; ++i) { + tenantsToCreate.push_back_deep(tenantsToCreate.arena(), + TenantNameRef(format("SimulatedExtraTenant%04d", i))); + } + } + TraceEvent("SimulatedClusterTenantMode") .detail("UsingTenant", defaultTenant) - .detail("TenantRequired", tenantMode.toString()); + .detail("TenantRequired", tenantMode.toString()) + .detail("TotalTenants", tenantsToCreate.size()); try { // systemActors.push_back( startSystemMonitor(dataFolder) ); @@ -2481,7 +2494,8 @@ ACTOR void setupAndRun(std::string dataFolder, startingConfiguration, LocalityData(), UnitTestParameters(), - defaultTenant), + defaultTenant, + tenantsToCreate), isBuggifyEnabled(BuggifyType::General) ? 
36000.0 : 5400.0)); } catch (Error& e) { TraceEvent(SevError, "SetupAndRunError").error(e); diff --git a/fdbserver/TesterInterface.actor.h b/fdbserver/TesterInterface.actor.h index a52b11673e..7d049eac3d 100644 --- a/fdbserver/TesterInterface.actor.h +++ b/fdbserver/TesterInterface.actor.h @@ -122,15 +122,17 @@ ACTOR Future testerServerCore(TesterInterface interf, enum test_location_t { TEST_HERE, TEST_ON_SERVERS, TEST_ON_TESTERS }; enum test_type_t { TEST_TYPE_FROM_FILE, TEST_TYPE_CONSISTENCY_CHECK, TEST_TYPE_UNIT_TESTS }; -ACTOR Future runTests(Reference connRecord, - test_type_t whatToRun, - test_location_t whereToRun, - int minTestersExpected, - std::string fileName = std::string(), - StringRef startingConfiguration = StringRef(), - LocalityData locality = LocalityData(), - UnitTestParameters testOptions = UnitTestParameters(), - Optional defaultTenant = Optional()); +ACTOR Future runTests( + Reference connRecord, + test_type_t whatToRun, + test_location_t whereToRun, + int minTestersExpected, + std::string fileName = std::string(), + StringRef startingConfiguration = StringRef(), + LocalityData locality = LocalityData(), + UnitTestParameters testOptions = UnitTestParameters(), + Optional defaultTenant = Optional(), + Standalone> tenantsToCreate = Standalone>()); #include "flow/unactorcompiler.h" #endif diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index e7107176ab..0b329c34f8 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -3713,6 +3713,118 @@ ACTOR Future mapKeyValues(StorageServer* data, return result; } +bool rangeIntersectsAnyTenant(TenantPrefixIndex& prefixIndex, KeyRangeRef range, Version ver) { + auto view = prefixIndex.at(ver); + auto beginItr = view.lastLessOrEqual(range.begin); + auto endItr = view.lastLess(range.end); + + // If the begin and end reference different spots in the tenant index, then the tenant pointed to + // by endItr intersects the range + if (beginItr != endItr) { + return true; + } + + // If the iterators point to the same entry and that entry contains begin, then we are wholly in + // one tenant + if (beginItr != view.end() && range.begin.startsWith(beginItr.key())) { + return true; + } + + return false; +} + +TEST_CASE("/fdbserver/storageserver/rangeIntersectsAnyTenant") { + std::map entries = { std::make_pair("tenant0"_sr, TenantMapEntry(0, ""_sr)), + std::make_pair("tenant2"_sr, TenantMapEntry(2, ""_sr)), + std::make_pair("tenant3"_sr, TenantMapEntry(3, ""_sr)), + std::make_pair("tenant4"_sr, TenantMapEntry(4, ""_sr)), + std::make_pair("tenant6"_sr, TenantMapEntry(6, ""_sr)) }; + TenantPrefixIndex index; + index.createNewVersion(1); + for (auto entry : entries) { + index.insert(entry.second.prefix, entry.first); + } + + // Before all tenants + ASSERT(!rangeIntersectsAnyTenant(index, KeyRangeRef(""_sr, "\x00"_sr), index.getLatestVersion())); + + // After all tenants + ASSERT(!rangeIntersectsAnyTenant(index, KeyRangeRef("\xfe"_sr, "\xff"_sr), index.getLatestVersion())); + + // In between tenants + ASSERT(!rangeIntersectsAnyTenant( + index, + KeyRangeRef(TenantMapEntry::idToPrefix(1), TenantMapEntry::idToPrefix(1).withSuffix("\xff"_sr)), + index.getLatestVersion())); + + // In between tenants with end intersecting tenant start + ASSERT(!rangeIntersectsAnyTenant( + index, KeyRangeRef(TenantMapEntry::idToPrefix(5), entries["tenant6"_sr].prefix), index.getLatestVersion())); + + // Entire tenants + ASSERT(rangeIntersectsAnyTenant( + index, KeyRangeRef(entries["tenant0"_sr].prefix, 
TenantMapEntry::idToPrefix(1)), index.getLatestVersion())); + ASSERT(rangeIntersectsAnyTenant( + index, KeyRangeRef(entries["tenant2"_sr].prefix, entries["tenant3"_sr].prefix), index.getLatestVersion())); + + // Partial tenants + ASSERT(rangeIntersectsAnyTenant( + index, + KeyRangeRef(entries["tenant0"_sr].prefix, entries["tenant0"_sr].prefix.withSuffix("foo"_sr)), + index.getLatestVersion())); + ASSERT(rangeIntersectsAnyTenant( + index, + KeyRangeRef(entries["tenant3"_sr].prefix.withSuffix("foo"_sr), entries["tenant4"_sr].prefix), + index.getLatestVersion())); + ASSERT(rangeIntersectsAnyTenant(index, + KeyRangeRef(entries["tenant4"_sr].prefix.withSuffix("bar"_sr), + entries["tenant4"_sr].prefix.withSuffix("foo"_sr)), + index.getLatestVersion())); + + // Begin outside, end inside tenant + ASSERT(rangeIntersectsAnyTenant( + index, + KeyRangeRef(TenantMapEntry::idToPrefix(1), entries["tenant2"_sr].prefix.withSuffix("foo"_sr)), + index.getLatestVersion())); + ASSERT(rangeIntersectsAnyTenant( + index, + KeyRangeRef(TenantMapEntry::idToPrefix(1), entries["tenant3"_sr].prefix.withSuffix("foo"_sr)), + index.getLatestVersion())); + + // Begin inside, end outside tenant + ASSERT(rangeIntersectsAnyTenant( + index, + KeyRangeRef(entries["tenant3"_sr].prefix.withSuffix("foo"_sr), TenantMapEntry::idToPrefix(5)), + index.getLatestVersion())); + ASSERT(rangeIntersectsAnyTenant( + index, + KeyRangeRef(entries["tenant4"_sr].prefix.withSuffix("foo"_sr), TenantMapEntry::idToPrefix(5)), + index.getLatestVersion())); + + // Both inside different tenants + ASSERT(rangeIntersectsAnyTenant(index, + KeyRangeRef(entries["tenant0"_sr].prefix.withSuffix("foo"_sr), + entries["tenant2"_sr].prefix.withSuffix("foo"_sr)), + index.getLatestVersion())); + ASSERT(rangeIntersectsAnyTenant(index, + KeyRangeRef(entries["tenant0"_sr].prefix.withSuffix("foo"_sr), + entries["tenant3"_sr].prefix.withSuffix("foo"_sr)), + index.getLatestVersion())); + ASSERT(rangeIntersectsAnyTenant(index, + KeyRangeRef(entries["tenant2"_sr].prefix.withSuffix("foo"_sr), + entries["tenant6"_sr].prefix.withSuffix("foo"_sr)), + index.getLatestVersion())); + + // Both outside tenants with tenant in the middle + ASSERT(rangeIntersectsAnyTenant( + index, KeyRangeRef(""_sr, TenantMapEntry::idToPrefix(1).withSuffix("foo"_sr)), index.getLatestVersion())); + ASSERT(rangeIntersectsAnyTenant(index, KeyRangeRef(""_sr, "\xff"_sr), index.getLatestVersion())); + ASSERT(rangeIntersectsAnyTenant( + index, KeyRangeRef(TenantMapEntry::idToPrefix(5).withSuffix("foo"_sr), "\xff"_sr), index.getLatestVersion())); + + return Void(); +} + // Most of the actor is copied from getKeyValuesQ. I tried to use templates but things become nearly impossible after // combining actor shenanigans with template shenanigans. 
ACTOR Future getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRequest req) @@ -3799,13 +3911,7 @@ ACTOR Future getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe throw tenant_name_required(); } - auto view = data->tenantPrefixIndex.at(req.version); - auto beginItr = view.lastLessOrEqual(begin); - if (beginItr != view.end() && !begin.startsWith(beginItr.key())) { - ++beginItr; - } - auto endItr = view.lastLessOrEqual(end); - if (beginItr != endItr) { + if (rangeIntersectsAnyTenant(data->tenantPrefixIndex, KeyRangeRef(begin, end), req.version)) { throw tenant_name_required(); } } diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 64b9924560..11851f100e 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -1534,7 +1534,8 @@ ACTOR Future runTests(Reference tests, StringRef startingConfiguration, LocalityData locality, - Optional defaultTenant) { + Optional defaultTenant, + Standalone> tenantsToCreate) { state Database cx; state Reference> dbInfo(new AsyncVar); state Future ccMonitor = monitorServerDBInfo(cc, LocalityData(), dbInfo); // FIXME: locality @@ -1610,9 +1611,14 @@ ACTOR Future runTests(Reference> tenantFutures; + for (auto tenant : tenantsToCreate) { + TraceEvent("CreatingTenant").detail("Tenant", tenant); + tenantFutures.push_back(ManagementAPI::createTenant(cx.getReference(), tenant)); + } + + wait(waitForAll(tenantFutures)); } if (useDB && waitForQuiescenceBegin) { @@ -1694,7 +1700,8 @@ ACTOR Future runTests(Reference defaultTenant) { + Optional defaultTenant, + Standalone> tenantsToCreate) { state int flags = (at == TEST_ON_SERVERS ? 0 : GetWorkersRequest::TESTER_CLASS_ONLY) | GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY; state Future testerTimeout = delay(600.0); // wait 600 sec for testers to show up @@ -1725,7 +1732,7 @@ ACTOR Future runTests(Reference runTests(Reference connRecord, StringRef startingConfiguration, LocalityData locality, UnitTestParameters testOptions, - Optional defaultTenant) { + Optional defaultTenant, + Standalone> tenantsToCreate) { state TestSet testSet; state std::unique_ptr knobProtectiveGroup(nullptr); auto cc = makeReference>>(); @@ -1847,11 +1855,19 @@ ACTOR Future runTests(Reference connRecord, actors.push_back( reportErrors(monitorServerDBInfo(cc, LocalityData(), db), "MonitorServerDBInfo")); // FIXME: Locality actors.push_back(reportErrors(testerServerCore(iTesters[0], connRecord, db, locality), "TesterServerCore")); - tests = runTests(cc, ci, iTesters, testSet.testSpecs, startingConfiguration, locality, defaultTenant); + tests = runTests( + cc, ci, iTesters, testSet.testSpecs, startingConfiguration, locality, defaultTenant, tenantsToCreate); } else { - tests = reportErrors( - runTests(cc, ci, testSet.testSpecs, at, minTestersExpected, startingConfiguration, locality, defaultTenant), - "RunTests"); + tests = reportErrors(runTests(cc, + ci, + testSet.testSpecs, + at, + minTestersExpected, + startingConfiguration, + locality, + defaultTenant, + tenantsToCreate), + "RunTests"); } choose { From f8d3b20994d7def54e442d640a8e20b98bc91204 Mon Sep 17 00:00:00 2001 From: Hao Fu <77984096+hfu94@users.noreply.github.com> Date: Thu, 5 May 2022 12:52:54 -0700 Subject: [PATCH 128/299] re-use same arena when construct mapped key (#7066) * re-use same arena when construct mapped key ConstructMappedKey seems to be a hotspot, try eliminate unnecessary operations by * re-using the same Tuple * preprocess the formatTuple to get a list of Tuples and strings * throw mapper_bad_range_decriptor when 
range query is not the last element --- fdbclient/Tuple.h | 5 + fdbserver/storageserver.actor.cpp | 176 +++++++++++++++++++++--------- 2 files changed, 127 insertions(+), 54 deletions(-) diff --git a/fdbclient/Tuple.h b/fdbclient/Tuple.h index 5c357af712..2ff31bf990 100644 --- a/fdbclient/Tuple.h +++ b/fdbclient/Tuple.h @@ -56,6 +56,11 @@ struct Tuple { // this is number of elements, not length of data size_t size() const { return offsets.size(); } + void reserve(size_t cap) { offsets.reserve(cap); } + void clear() { + data.clear(); + offsets.clear(); + } ElementType getType(size_t index) const; Standalone getString(size_t index) const; diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index e7107176ab..ef17342bc1 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -3521,12 +3521,17 @@ bool rangeQuery(const std::string& s) { return s == "{...}"; } -Key constructMappedKey(KeyValueRef* keyValue, Tuple& mappedKeyFormatTuple, bool& isRangeQuery) { - // Lazily parse key and/or value to tuple because they may not need to be a tuple if not used. - Optional keyTuple; - Optional valueTuple; +// create a vector of Optional +// in case of a singleKeyOrValue, insert an empty Tuple to vector as placeholder +// in case of a rangeQuery, insert Optional.empty as placeholder +// in other cases, insert the correct Tuple to be used. +void preprocessMappedKey(Tuple& mappedKeyFormatTuple, + std::vector>& vt, + std::vector& strings, + bool& isRangeQuery) { + vt.reserve(mappedKeyFormatTuple.size()); + strings.reserve(mappedKeyFormatTuple.size()); - Tuple mappedKeyTuple; for (int i = 0; i < mappedKeyFormatTuple.size(); i++) { Tuple::ElementType type = mappedKeyFormatTuple.getType(i); if (type == Tuple::BYTES || type == Tuple::UTF8) { @@ -3534,43 +3539,71 @@ Key constructMappedKey(KeyValueRef* keyValue, Tuple& mappedKeyFormatTuple, bool& auto sz = s.size(); bool escaped = unescapeLiterals(s, "{{", "{"); escaped = unescapeLiterals(s, "}}", "}") || escaped; + strings.push_back(s); if (escaped) { - mappedKeyTuple.append(s); + Tuple escapedTuple; + escapedTuple.append(s); + vt.push_back(Optional(escapedTuple)); } else if (singleKeyOrValue(s, sz)) { - int idx; - Tuple* referenceTuple; - try { - idx = std::stoi(s.substr(3, sz - 5)); - } catch (std::exception& e) { - throw mapper_bad_index(); - } - if (s[1] == 'K') { - unpackKeyTuple(&referenceTuple, keyTuple, keyValue); - } else if (s[1] == 'V') { - unpackValueTuple(&referenceTuple, valueTuple, keyValue); - } else { - ASSERT(false); - throw internal_error(); - } - if (idx < 0 || idx >= referenceTuple->size()) { - throw mapper_bad_index(); - } - mappedKeyTuple.append(referenceTuple->subTuple(idx, idx + 1)); + // when it is SingleKeyOrValue, insert an empty Tuple to vector as placeholder + vt.push_back(Optional(Tuple())); } else if (rangeQuery(s)) { if (i != mappedKeyFormatTuple.size() - 1) { // It must be the last element of the mapper tuple throw mapper_bad_range_decriptor(); } - // Every record will try to set it. It's ugly, but not wrong. + // when it is rangeQuery, insert Optional.empty as placeholder + vt.push_back(Optional()); isRangeQuery = true; - // Do not add it to the mapped key. } else { - // If the element is a string but neither escaped nor descriptors, add to result. - mappedKeyTuple.append(mappedKeyFormatTuple.subTuple(i, i + 1)); + vt.push_back(Optional(mappedKeyFormatTuple.subTuple(i, i + 1))); } } else { - // If the element not a string, add to result. 
- mappedKeyTuple.append(mappedKeyFormatTuple.subTuple(i, i + 1)); + vt.push_back(Optional(mappedKeyFormatTuple.subTuple(i, i + 1))); + } + } +} + +Key constructMappedKey(KeyValueRef* keyValue, + std::vector>& vec, + Tuple& mappedKeyTuple, + std::vector& strings) { + // Lazily parse key and/or value to tuple because they may not need to be a tuple if not used. + Optional keyTuple; + Optional valueTuple; + mappedKeyTuple.clear(); + mappedKeyTuple.reserve(vec.size()); + + for (int i = 0; i < vec.size(); i++) { + if (!vec[i].present()) { + // rangeQuery + continue; + } + if (vec[i].get().size()) { + mappedKeyTuple.append(vec[i].get()); + } else { + // singleKeyOrValue is true + std::string s = strings[i]; + auto sz = s.size(); + int idx; + Tuple* referenceTuple; + try { + idx = std::stoi(s.substr(3, sz - 5)); + } catch (std::exception& e) { + throw mapper_bad_index(); + } + if (s[1] == 'K') { + unpackKeyTuple(&referenceTuple, keyTuple, keyValue); + } else if (s[1] == 'V') { + unpackValueTuple(&referenceTuple, valueTuple, keyValue); + } else { + ASSERT(false); + throw internal_error(); + } + if (idx < 0 || idx >= referenceTuple->size()) { + throw mapper_bad_index(); + } + mappedKeyTuple.append(referenceTuple->subTuple(idx, idx + 1)); } } @@ -3582,15 +3615,20 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") { Value value = Tuple().append("value-0"_sr).append("value-1"_sr).append("value-2"_sr).getDataAsStandalone(); state KeyValueRef kvr(key, value); { - Tuple mapperTuple = Tuple() - .append("normal"_sr) - .append("{{escaped}}"_sr) - .append("{K[2]}"_sr) - .append("{V[0]}"_sr) - .append("{...}"_sr); + Tuple mappedKeyFormatTuple = Tuple() + .append("normal"_sr) + .append("{{escaped}}"_sr) + .append("{K[2]}"_sr) + .append("{V[0]}"_sr) + .append("{...}"_sr); + Tuple mappedKeyTuple; + std::vector> vt; + std::vector strings; bool isRangeQuery = false; - Key mappedKey = constructMappedKey(&kvr, mapperTuple, isRangeQuery); + preprocessMappedKey(mappedKeyFormatTuple, vt, strings, isRangeQuery); + + Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, strings); Key expectedMappedKey = Tuple() .append("normal"_sr) @@ -3602,11 +3640,16 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") { ASSERT(mappedKey.compare(expectedMappedKey) == 0); ASSERT(isRangeQuery == true); } - { - Tuple mapperTuple = Tuple().append("{{{{}}"_sr).append("}}"_sr); + { + Tuple mappedKeyFormatTuple = Tuple().append("{{{{}}"_sr).append("}}"_sr); + + Tuple mappedKeyTuple; + std::vector> vt; + std::vector strings; bool isRangeQuery = false; - Key mappedKey = constructMappedKey(&kvr, mapperTuple, isRangeQuery); + preprocessMappedKey(mappedKeyFormatTuple, vt, strings, isRangeQuery); + Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, strings); Key expectedMappedKey = Tuple().append("{{}"_sr).append("}"_sr).getDataAsStandalone(); // std::cout << printable(mappedKey) << " == " << printable(expectedMappedKey) << std::endl; @@ -3614,10 +3657,14 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") { ASSERT(isRangeQuery == false); } { - Tuple mapperTuple = Tuple().append("{{{{}}"_sr).append("}}"_sr); + Tuple mappedKeyFormatTuple = Tuple().append("{{{{}}"_sr).append("}}"_sr); + Tuple mappedKeyTuple; + std::vector> vt; + std::vector strings; bool isRangeQuery = false; - Key mappedKey = constructMappedKey(&kvr, mapperTuple, isRangeQuery); + preprocessMappedKey(mappedKeyFormatTuple, vt, strings, isRangeQuery); + Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, strings); Key expectedMappedKey 
= Tuple().append("{{}"_sr).append("}"_sr).getDataAsStandalone(); // std::cout << printable(mappedKey) << " == " << printable(expectedMappedKey) << std::endl; @@ -3625,11 +3672,16 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") { ASSERT(isRangeQuery == false); } { - Tuple mapperTuple = Tuple().append("{K[100]}"_sr); - bool isRangeQuery = false; + Tuple mappedKeyFormatTuple = Tuple().append("{K[100]}"_sr); state bool throwException = false; try { - Key mappedKey = constructMappedKey(&kvr, mapperTuple, isRangeQuery); + Tuple mappedKeyTuple; + std::vector> vt; + std::vector strings; + bool isRangeQuery = false; + preprocessMappedKey(mappedKeyFormatTuple, vt, strings, isRangeQuery); + + Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, strings); } catch (Error& e) { ASSERT(e.code() == error_code_mapper_bad_index); throwException = true; @@ -3637,11 +3689,16 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") { ASSERT(throwException); } { - Tuple mapperTuple = Tuple().append("{...}"_sr).append("last-element"_sr); - bool isRangeQuery = false; + Tuple mappedKeyFormatTuple = Tuple().append("{...}"_sr).append("last-element"_sr); state bool throwException2 = false; try { - Key mappedKey = constructMappedKey(&kvr, mapperTuple, isRangeQuery); + Tuple mappedKeyTuple; + std::vector> vt; + std::vector strings; + bool isRangeQuery = false; + preprocessMappedKey(mappedKeyFormatTuple, vt, strings, isRangeQuery); + + Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, strings); } catch (Error& e) { ASSERT(e.code() == error_code_mapper_bad_range_decriptor); throwException2 = true; @@ -3649,11 +3706,16 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") { ASSERT(throwException2); } { - Tuple mapperTuple = Tuple().append("{K[not-a-number]}"_sr); - bool isRangeQuery = false; + Tuple mappedKeyFormatTuple = Tuple().append("{K[not-a-number]}"_sr); state bool throwException3 = false; try { - Key mappedKey = constructMappedKey(&kvr, mapperTuple, isRangeQuery); + Tuple mappedKeyTuple; + std::vector> vt; + std::vector strings; + bool isRangeQuery = false; + preprocessMappedKey(mappedKeyFormatTuple, vt, strings, isRangeQuery); + + Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, strings); } catch (Error& e) { ASSERT(e.code() == error_code_mapper_bad_index); throwException3 = true; @@ -3678,6 +3740,8 @@ ACTOR Future mapKeyValues(StorageServer* data, result.data.reserve(result.arena, input.data.size()); state Tuple mappedKeyFormatTuple; + state Tuple mappedKeyTuple; + try { mappedKeyFormatTuple = Tuple::unpack(mapper); } catch (Error& e) { @@ -3685,13 +3749,17 @@ ACTOR Future mapKeyValues(StorageServer* data, throw mapper_not_tuple(); } state KeyValueRef* it = input.data.begin(); + state std::vector> vt; + state std::vector strings; + state bool isRangeQuery = false; + preprocessMappedKey(mappedKeyFormatTuple, vt, strings, isRangeQuery); + for (; it != input.data.end(); it++) { state MappedKeyValueRef kvm; kvm.key = it->key; kvm.value = it->value; - state bool isRangeQuery = false; - state Key mappedKey = constructMappedKey(it, mappedKeyFormatTuple, isRangeQuery); + state Key mappedKey = constructMappedKey(it, vt, mappedKeyTuple, strings); // Make sure the mappedKey is always available, so that it's good even we want to get key asynchronously. 
result.arena.dependsOn(mappedKey.arena()); From 19e7b13eb2bef1ba94a41b40478fc977bdf292b7 Mon Sep 17 00:00:00 2001 From: Aaron Molitor Date: Thu, 5 May 2022 16:34:58 -0500 Subject: [PATCH 129/299] Update Badge URL in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5b1a8d7a45..e872ac2258 100755 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ FoundationDB logo -![Build Status](https://codebuild.us-west-2.amazonaws.com/badges?uuid=eyJlbmNyeXB0ZWREYXRhIjoiZ1FhRlNwU0JXeHVpZkt0a0k0QlNJK3BEUkplTGVRYnk3azBoT1FOazBQbGlIeDgrYmRJZVhuSUI4RTd3RWJWcjVMT3ZPTzV0NXlCTWpPTGlPVlMzckJJPSIsIml2UGFyYW1ldGVyU3BlYyI6IlB0TWVCM0VYdU5PQWtMUFYiLCJtYXRlcmlhbFNldFNlcmlhbCI6MX0%3D&branch=master) +![Build Status](https://codebuild.us-west-2.amazonaws.com/badges?uuid=eyJlbmNyeXB0ZWREYXRhIjoiVjVzb1RQNUZTaGxGNm9iUnk4OUZ1d09GdTMzZnVOT1YzaUU1RU1xR2o2TENRWFZjb3ZrTHJEcngrZVdnNE40bXJJVDErOGVwendIL3lFWFY3Y3oxQmdjPSIsIml2UGFyYW1ldGVyU3BlYyI6IlJUbWhnaUlJVXRORUNJTjQiLCJtYXRlcmlhbFNldFNlcmlhbCI6MX0%3D&branch=main) FoundationDB is a distributed database designed to handle large volumes of structured data across clusters of commodity servers. It organizes data as an ordered key-value store and employs ACID transactions for all operations. It is especially well-suited for read/write workloads but also has excellent performance for write-intensive workloads. Users interact with the database using API language binding. From 93f9ea9a5bad453aeff894f63ad6e5179314c1ff Mon Sep 17 00:00:00 2001 From: Neethu Haneesha Bingi Date: Thu, 5 May 2022 16:28:54 -0400 Subject: [PATCH 130/299] Adding rocksdb throttling counters to trace event. --- fdbserver/KeyValueStoreRocksDB.actor.cpp | 44 ++++++++++++++---------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/fdbserver/KeyValueStoreRocksDB.actor.cpp b/fdbserver/KeyValueStoreRocksDB.actor.cpp index 380b327930..d11f46ecfc 100644 --- a/fdbserver/KeyValueStoreRocksDB.actor.cpp +++ b/fdbserver/KeyValueStoreRocksDB.actor.cpp @@ -332,6 +332,15 @@ rocksdb::ReadOptions getReadOptions() { return options; } +struct Counters { + CounterCollection cc; + Counter immediateThrottle; + Counter failedToAcquire; + + Counters() + : cc("RocksDBThrottle"), immediateThrottle("ImmediateThrottle", cc), failedToAcquire("FailedToAcquire", cc) {} +}; + struct ReadIterator { CF& cf; uint64_t index; // incrementing counter to uniquely identify read iterator. 
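
Patch 130's restructuring, condensed: Counters moves from a nested struct near the bottom of RocksDBKeyValueStore to file scope so that rocksDBMetricLogger, which is started while servicing OpenAction, can take a pointer to it and fold the throttle counters into its periodic metrics trace event. The resulting wiring, roughly (declarations taken from this patch; the logger loop is abbreviated to the one added call):

	// File-scope declaration, as added above; each Counter registers itself with cc.
	struct Counters {
	    CounterCollection cc;
	    Counter immediateThrottle;
	    Counter failedToAcquire;
	    Counters()
	      : cc("RocksDBThrottle"), immediateThrottle("ImmediateThrottle", cc),
	        failedToAcquire("FailedToAcquire", cc) {}
	};

	// Inside rocksDBMetricLogger's loop, after the ticker/property details:
	//     counters->cc.logToTraceEvent(e); // appends every registered counter to the event

Note the accompanying rename from "failedToAcquire" to "FailedToAcquire", which keeps the emitted trace field consistent with the capitalization of the other counters.
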
@@ -730,7 +739,8 @@ ACTOR Future flowLockLogger(const FlowLock* readLock, const FlowLock* fetc ACTOR Future rocksDBMetricLogger(std::shared_ptr statistics, std::shared_ptr perfContextMetrics, rocksdb::DB* db, - std::shared_ptr readIterPool) { + std::shared_ptr readIterPool, + Counters* counters) { state std::vector> tickerStats = { { "StallMicros", rocksdb::STALL_MICROS, 0 }, { "BytesRead", rocksdb::BYTES_READ, 0 }, @@ -769,7 +779,6 @@ ACTOR Future rocksDBMetricLogger(std::shared_ptr stat }; state std::vector> propertyStats = { - { "NumCompactionsRunning", rocksdb::DB::Properties::kNumRunningCompactions }, { "NumImmutableMemtables", rocksdb::DB::Properties::kNumImmutableMemTable }, { "NumImmutableMemtablesFlushed", rocksdb::DB::Properties::kNumImmutableMemTableFlushed }, { "IsMemtableFlushPending", rocksdb::DB::Properties::kMemTableFlushPending }, @@ -829,6 +838,8 @@ ACTOR Future rocksDBMetricLogger(std::shared_ptr stat e.detail("NumTimesReadIteratorsReused", stat - readIteratorPoolStats["NumTimesReadIteratorsReused"]); readIteratorPoolStats["NumTimesReadIteratorsReused"] = stat; + counters->cc.logToTraceEvent(e); + if (SERVER_KNOBS->ROCKSDB_PERFCONTEXT_ENABLE) { perfContextMetrics->log(true); } @@ -907,13 +918,15 @@ struct RocksDBKeyValueStore : IKeyValueStore { const FlowLock* readLock; const FlowLock* fetchLock; std::shared_ptr errorListener; + Counters& counters; OpenAction(std::string path, Optional>& metrics, const FlowLock* readLock, const FlowLock* fetchLock, - std::shared_ptr errorListener) + std::shared_ptr errorListener, + Counters& counters) : path(std::move(path)), metrics(metrics), readLock(readLock), fetchLock(fetchLock), - errorListener(errorListener) {} + errorListener(errorListener), counters(counters) {} double getTimeEstimate() const override { return SERVER_KNOBS->COMMIT_TIME_ESTIMATE; } }; @@ -973,12 +986,14 @@ struct RocksDBKeyValueStore : IKeyValueStore { // The current thread and main thread are same when the code runs in simulation. // blockUntilReady() is getting the thread into deadlock state, so directly calling // the metricsLogger. 
- a.metrics = rocksDBMetricLogger(options.statistics, perfContextMetrics, db, readIterPool) && - flowLockLogger(a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool); + a.metrics = + rocksDBMetricLogger(options.statistics, perfContextMetrics, db, readIterPool, &a.counters) && + flowLockLogger(a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool); } else { onMainThread([&] { - a.metrics = rocksDBMetricLogger(options.statistics, perfContextMetrics, db, readIterPool) && - flowLockLogger(a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool); + a.metrics = + rocksDBMetricLogger(options.statistics, perfContextMetrics, db, readIterPool, &a.counters) && + flowLockLogger(a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool); return Future(true); }).blockUntilReady(); } @@ -1621,16 +1636,6 @@ struct RocksDBKeyValueStore : IKeyValueStore { Future actorErrorListener; Future collection; PromiseStream> addActor; - - struct Counters { - CounterCollection cc; - Counter immediateThrottle; - Counter failedToAcquire; - - Counters() - : cc("RocksDBThrottle"), immediateThrottle("ImmediateThrottle", cc), failedToAcquire("failedToAcquire", cc) {} - }; - Counters counters; explicit RocksDBKeyValueStore(const std::string& path, UID id) @@ -1822,7 +1827,8 @@ struct RocksDBKeyValueStore : IKeyValueStore { if (openFuture.isValid()) { return openFuture; } - auto a = std::make_unique(path, metrics, &readSemaphore, &fetchSemaphore, errorListener); + auto a = std::make_unique( + path, metrics, &readSemaphore, &fetchSemaphore, errorListener, counters); openFuture = a->done.getFuture(); writeThread->post(a.release()); return openFuture; From 7ce53ca164d7c08aa07219c5a344f61653be6130 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 5 May 2022 23:53:51 -0700 Subject: [PATCH 131/299] add SkewReadWriteWorkload --- fdbclient/SystemData.cpp | 17 +- fdbclient/SystemData.h | 2 + fdbserver/CMakeLists.txt | 733 +++++++++-------- fdbserver/tester.actor.cpp | 22 +- fdbserver/workloads/ReadWrite.actor.cpp | 71 +- fdbserver/workloads/SkewedReadWrite.actor.cpp | 778 ++++++++++++++++++ fdbserver/workloads/workloads.actor.h | 2 + 7 files changed, 1223 insertions(+), 402 deletions(-) create mode 100644 fdbserver/workloads/SkewedReadWrite.actor.cpp diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index af9ba32a31..a84eae4f77 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -302,7 +302,8 @@ std::pair>, std::vector> server_id; return server_id; } + +std::pair serverKeysDecodeServerBegin(const KeyRef& key) { + UID server_id; + BinaryReader rd(key.removePrefix(serverKeysPrefix), Unversioned()); + rd >> server_id; + rd.readBytes(1); // skip "/" + std::string bytes; + while (!rd.empty()) { + bytes.push_back((char)*rd.arenaRead(1)); + } + // std::cout << bytes.size() << " " <& serve // Using the serverID as a prefix, then followed by the beginning of the shard range // as the key, the value indicates whether the shard does or does not exist on the server. // These values can be changed as data movement occurs. 
+extern const KeyRangeRef serverKeysRange; extern const KeyRef serverKeysPrefix; extern const ValueRef serverKeysTrue, serverKeysTrueEmptyRange, serverKeysFalse; const Key serverKeysKey(UID serverID, const KeyRef& keys); const Key serverKeysPrefixFor(UID serverID); UID serverKeysDecodeServer(const KeyRef& key); +std::pair serverKeysDecodeServerBegin(const KeyRef& key); bool serverHasKey(ValueRef storedValue); extern const KeyRangeRef conflictingKeysRange; diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index 01b3cd343e..6c6b991301 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -1,388 +1,389 @@ set(FDBSERVER_SRCS - ApplyMetadataMutation.cpp - ApplyMetadataMutation.h - BackupInterface.h - BackupProgress.actor.cpp - BackupProgress.actor.h - BackupWorker.actor.cpp - BlobGranuleServerCommon.actor.cpp - BlobGranuleServerCommon.actor.h - BlobGranuleValidation.actor.cpp - BlobGranuleValidation.actor.h - BlobManager.actor.cpp - BlobManagerInterface.h - BlobWorker.actor.cpp - ClusterController.actor.cpp - ClusterController.actor.h - ClusterRecovery.actor.cpp - ClusterRecovery.actor.h - CommitProxyServer.actor.cpp - ConfigBroadcaster.actor.cpp - ConfigBroadcaster.h - ConfigDatabaseUnitTests.actor.cpp - ConfigFollowerInterface.cpp - ConfigFollowerInterface.h - ConfigNode.actor.cpp - ConfigNode.h - ConflictSet.h - CoordinatedState.actor.cpp - CoordinatedState.h - Coordination.actor.cpp - CoordinationInterface.h - CoroFlow.h - DataDistribution.actor.cpp - DataDistribution.actor.h - DataDistributionQueue.actor.cpp - DataDistributionTracker.actor.cpp - DataDistributorInterface.h - DBCoreState.h - DDTeamCollection.actor.cpp - DDTeamCollection.h - DiskQueue.actor.cpp - EncryptKeyProxy.actor.cpp - EncryptKeyProxyInterface.h - FDBExecHelper.actor.cpp - FDBExecHelper.actor.h - fdbserver.actor.cpp - GrvProxyServer.actor.cpp - IConfigConsumer.cpp - IConfigConsumer.h - IDiskQueue.h - IKeyValueContainer.h - IKeyValueStore.h - IPager.h - KeyValueStoreCompressTestData.actor.cpp - KeyValueStoreMemory.actor.cpp - KeyValueStoreRocksDB.actor.cpp - KeyValueStoreSQLite.actor.cpp - KmsConnector.h - KmsConnectorInterface.h - KnobProtectiveGroups.cpp - KnobProtectiveGroups.h - Knobs.h - LatencyBandConfig.cpp - LatencyBandConfig.h - LeaderElection.actor.cpp - LeaderElection.h - LocalConfiguration.actor.cpp - LocalConfiguration.h - LogProtocolMessage.h - LogRouter.actor.cpp - LogSystem.cpp - LogSystem.h - LogSystemConfig.cpp - LogSystemConfig.h - LogSystemDiskQueueAdapter.actor.cpp - LogSystemDiskQueueAdapter.h - LogSystemPeekCursor.actor.cpp - MasterInterface.h - masterserver.actor.cpp - MetricLogger.actor.cpp - MetricLogger.actor.h - MoveKeys.actor.cpp - MoveKeys.actor.h - MutationTracking.cpp - MutationTracking.h - networktest.actor.cpp - NetworkTest.h - OldTLogServer_4_6.actor.cpp - OldTLogServer_6_0.actor.cpp - OldTLogServer_6_2.actor.cpp - OnDemandStore.actor.cpp - OnDemandStore.h - PaxosConfigConsumer.actor.cpp - PaxosConfigConsumer.h - ProxyCommitData.actor.h - pubsub.actor.cpp - pubsub.h - QuietDatabase.actor.cpp - QuietDatabase.h - RadixTree.h - Ratekeeper.actor.cpp - Ratekeeper.h - RatekeeperInterface.h - RecoveryState.h - RemoteIKeyValueStore.actor.h - RemoteIKeyValueStore.actor.cpp - ResolutionBalancer.actor.cpp - ResolutionBalancer.actor.h - Resolver.actor.cpp - ResolverInterface.h - RestoreApplier.actor.cpp - RestoreApplier.actor.h - RestoreCommon.actor.cpp - RestoreCommon.actor.h - RestoreController.actor.cpp - RestoreController.actor.h - RestoreLoader.actor.cpp - 
RestoreLoader.actor.h - RestoreRoleCommon.actor.cpp - RestoreRoleCommon.actor.h - RestoreUtil.actor.cpp - RestoreUtil.h - RestoreWorker.actor.cpp - RestoreWorker.actor.h - RestoreWorkerInterface.actor.cpp - RestoreWorkerInterface.actor.h - RkTagThrottleCollection.cpp - RkTagThrottleCollection.h - RocksDBCheckpointUtils.actor.cpp - RocksDBCheckpointUtils.actor.h - RoleLineage.actor.cpp - RoleLineage.actor.h - ServerCheckpoint.actor.cpp - ServerCheckpoint.actor.h - ServerDBInfo.actor.h - ServerDBInfo.h - SigStack.cpp - SimKmsConnector.actor.h - SimKmsConnector.actor.cpp - SimpleConfigConsumer.actor.cpp - SimpleConfigConsumer.h - SimulatedCluster.actor.cpp - SimulatedCluster.h - SkipList.cpp - SpanContextMessage.h - Status.actor.cpp - Status.h - StorageCache.actor.cpp - StorageMetrics.actor.h - StorageMetrics.h - storageserver.actor.cpp - TagPartitionedLogSystem.actor.cpp - TagPartitionedLogSystem.actor.h - TagThrottler.actor.cpp - TagThrottler.h - TCInfo.actor.cpp - TCInfo.h - template_fdb.h - tester.actor.cpp - TesterInterface.actor.h - TLogInterface.h - TLogServer.actor.cpp - TransactionTagCounter.cpp - TransactionTagCounter.h - TSSMappingUtil.actor.cpp - TSSMappingUtil.actor.h - VersionedBTree.actor.cpp - VFSAsync.cpp - VFSAsync.h - WaitFailure.actor.cpp - WaitFailure.h - worker.actor.cpp - WorkerInterface.actor.h - workloads/ApiCorrectness.actor.cpp - workloads/ApiWorkload.actor.cpp - workloads/ApiWorkload.h - workloads/AsyncFile.actor.h - workloads/AsyncFile.cpp - workloads/AsyncFileCorrectness.actor.cpp - workloads/AsyncFileRead.actor.cpp - workloads/AsyncFileWrite.actor.cpp - workloads/AtomicOps.actor.cpp - workloads/AtomicOpsApiCorrectness.actor.cpp - workloads/AtomicRestore.actor.cpp - workloads/AtomicSwitchover.actor.cpp - workloads/BackgroundSelectors.actor.cpp - workloads/BackupAndParallelRestoreCorrectness.actor.cpp - workloads/BackupCorrectness.actor.cpp - workloads/BackupToBlob.actor.cpp - workloads/BackupToDBAbort.actor.cpp - workloads/BackupToDBCorrectness.actor.cpp - workloads/BackupToDBUpgrade.actor.cpp - workloads/BlobGranuleCorrectnessWorkload.actor.cpp - workloads/BlobGranuleVerifier.actor.cpp - workloads/BlobStoreWorkload.h - workloads/BulkLoad.actor.cpp - workloads/BulkSetup.actor.h - workloads/Cache.actor.cpp - workloads/ChangeConfig.actor.cpp - workloads/ChangeFeeds.actor.cpp - workloads/ClearSingleRange.actor.cpp - workloads/ClientTransactionProfileCorrectness.actor.cpp - workloads/ClientWorkload.actor.cpp - workloads/ClogSingleConnection.actor.cpp - workloads/CommitBugCheck.actor.cpp - workloads/ConfigIncrement.actor.cpp - workloads/ConfigureDatabase.actor.cpp - workloads/ConflictRange.actor.cpp - workloads/ConsistencyCheck.actor.cpp - workloads/CpuProfiler.actor.cpp - workloads/Cycle.actor.cpp - workloads/DataDistributionMetrics.actor.cpp - workloads/DataLossRecovery.actor.cpp - workloads/DDBalance.actor.cpp - workloads/DDMetrics.actor.cpp - workloads/DDMetricsExclude.actor.cpp - workloads/DifferentClustersSameRV.actor.cpp - workloads/DiskDurability.actor.cpp - workloads/DiskDurabilityTest.actor.cpp - workloads/DiskFailureInjection.actor.cpp - workloads/DummyWorkload.actor.cpp - workloads/EncryptionOps.actor.cpp - workloads/EncryptKeyProxyTest.actor.cpp - workloads/ExternalWorkload.actor.cpp - workloads/FastTriggeredWatches.actor.cpp - workloads/FileSystem.actor.cpp - workloads/Fuzz.cpp - workloads/FuzzApiCorrectness.actor.cpp - workloads/GetMappedRange.actor.cpp - workloads/GetRangeStream.actor.cpp - workloads/HealthMetricsApi.actor.cpp - 
workloads/HighContentionPrefixAllocatorWorkload.actor.cpp - workloads/Increment.actor.cpp - workloads/IncrementalBackup.actor.cpp - workloads/IndexScan.actor.cpp - workloads/Inventory.actor.cpp - workloads/KillRegion.actor.cpp - workloads/KVStoreTest.actor.cpp - workloads/LocalRatekeeper.actor.cpp - workloads/LockDatabase.actor.cpp - workloads/LockDatabaseFrequently.actor.cpp - workloads/LogMetrics.actor.cpp - workloads/LowLatency.actor.cpp - workloads/MachineAttrition.actor.cpp - workloads/Mako.actor.cpp - workloads/MemoryKeyValueStore.cpp - workloads/MemoryKeyValueStore.h - workloads/MemoryLifetime.actor.cpp - workloads/MetricLogging.actor.cpp - workloads/MiniCycle.actor.cpp - workloads/MutationLogReaderCorrectness.actor.cpp - workloads/ParallelRestore.actor.cpp - workloads/Performance.actor.cpp - workloads/PhysicalShardMove.actor.cpp - workloads/Ping.actor.cpp - workloads/PopulateTPCC.actor.cpp - workloads/PrivateEndpoints.actor.cpp - workloads/ProtocolVersion.actor.cpp - workloads/PubSubMultiples.actor.cpp - workloads/QueuePush.actor.cpp - workloads/RandomClogging.actor.cpp - workloads/RandomMoveKeys.actor.cpp - workloads/RandomSelector.actor.cpp - workloads/ReadAfterWrite.actor.cpp - workloads/ReadHotDetection.actor.cpp - workloads/ReadWrite.actor.cpp - workloads/RemoveServersSafely.actor.cpp - workloads/ReportConflictingKeys.actor.cpp - workloads/RestoreBackup.actor.cpp - workloads/RestoreFromBlob.actor.cpp - workloads/Rollback.actor.cpp - workloads/RyowCorrectness.actor.cpp - workloads/RYWDisable.actor.cpp - workloads/RYWPerformance.actor.cpp - workloads/SaveAndKill.actor.cpp - workloads/SelectorCorrectness.actor.cpp - workloads/Serializability.actor.cpp - workloads/Sideband.actor.cpp - workloads/SidebandSingle.actor.cpp - workloads/SimpleAtomicAdd.actor.cpp - workloads/SlowTaskWorkload.actor.cpp - workloads/SnapTest.actor.cpp - workloads/SpecialKeySpaceCorrectness.actor.cpp - workloads/StatusWorkload.actor.cpp - workloads/Storefront.actor.cpp - workloads/StreamingRangeRead.actor.cpp - workloads/StreamingRead.actor.cpp - workloads/SubmitBackup.actor.cpp - workloads/SuspendProcesses.actor.cpp - workloads/TagThrottleApi.actor.cpp - workloads/TargetedKill.actor.cpp - workloads/TaskBucketCorrectness.actor.cpp - workloads/TenantManagement.actor.cpp - workloads/ThreadSafety.actor.cpp - workloads/Throttling.actor.cpp - workloads/Throughput.actor.cpp - workloads/TimeKeeperCorrectness.actor.cpp - workloads/TPCC.actor.cpp - workloads/TPCCWorkload.h - workloads/TriggerRecovery.actor.cpp - workloads/UDPWorkload.actor.cpp - workloads/UnitPerf.actor.cpp - workloads/UnitTests.actor.cpp - workloads/Unreadable.actor.cpp - workloads/VersionStamp.actor.cpp - workloads/WatchAndWait.actor.cpp - workloads/Watches.actor.cpp - workloads/WatchesSameKeyCorrectness.actor.cpp - workloads/WorkerErrors.actor.cpp - workloads/workloads.actor.h - workloads/WriteBandwidth.actor.cpp - workloads/WriteDuringRead.actor.cpp - workloads/WriteTagThrottling.actor.cpp -) + ApplyMetadataMutation.cpp + ApplyMetadataMutation.h + BackupInterface.h + BackupProgress.actor.cpp + BackupProgress.actor.h + BackupWorker.actor.cpp + BlobGranuleServerCommon.actor.cpp + BlobGranuleServerCommon.actor.h + BlobGranuleValidation.actor.cpp + BlobGranuleValidation.actor.h + BlobManager.actor.cpp + BlobManagerInterface.h + BlobWorker.actor.cpp + ClusterController.actor.cpp + ClusterController.actor.h + ClusterRecovery.actor.cpp + ClusterRecovery.actor.h + CommitProxyServer.actor.cpp + ConfigBroadcaster.actor.cpp + ConfigBroadcaster.h + 
ConfigDatabaseUnitTests.actor.cpp + ConfigFollowerInterface.cpp + ConfigFollowerInterface.h + ConfigNode.actor.cpp + ConfigNode.h + ConflictSet.h + CoordinatedState.actor.cpp + CoordinatedState.h + Coordination.actor.cpp + CoordinationInterface.h + CoroFlow.h + DataDistribution.actor.cpp + DataDistribution.actor.h + DataDistributionQueue.actor.cpp + DataDistributionTracker.actor.cpp + DataDistributorInterface.h + DBCoreState.h + DDTeamCollection.actor.cpp + DDTeamCollection.h + DiskQueue.actor.cpp + EncryptKeyProxy.actor.cpp + EncryptKeyProxyInterface.h + FDBExecHelper.actor.cpp + FDBExecHelper.actor.h + fdbserver.actor.cpp + GrvProxyServer.actor.cpp + IConfigConsumer.cpp + IConfigConsumer.h + IDiskQueue.h + IKeyValueContainer.h + IKeyValueStore.h + IPager.h + KeyValueStoreCompressTestData.actor.cpp + KeyValueStoreMemory.actor.cpp + KeyValueStoreRocksDB.actor.cpp + KeyValueStoreSQLite.actor.cpp + KmsConnector.h + KmsConnectorInterface.h + KnobProtectiveGroups.cpp + KnobProtectiveGroups.h + Knobs.h + LatencyBandConfig.cpp + LatencyBandConfig.h + LeaderElection.actor.cpp + LeaderElection.h + LocalConfiguration.actor.cpp + LocalConfiguration.h + LogProtocolMessage.h + LogRouter.actor.cpp + LogSystem.cpp + LogSystem.h + LogSystemConfig.cpp + LogSystemConfig.h + LogSystemDiskQueueAdapter.actor.cpp + LogSystemDiskQueueAdapter.h + LogSystemPeekCursor.actor.cpp + MasterInterface.h + masterserver.actor.cpp + MetricLogger.actor.cpp + MetricLogger.actor.h + MoveKeys.actor.cpp + MoveKeys.actor.h + MutationTracking.cpp + MutationTracking.h + networktest.actor.cpp + NetworkTest.h + OldTLogServer_4_6.actor.cpp + OldTLogServer_6_0.actor.cpp + OldTLogServer_6_2.actor.cpp + OnDemandStore.actor.cpp + OnDemandStore.h + PaxosConfigConsumer.actor.cpp + PaxosConfigConsumer.h + ProxyCommitData.actor.h + pubsub.actor.cpp + pubsub.h + QuietDatabase.actor.cpp + QuietDatabase.h + RadixTree.h + Ratekeeper.actor.cpp + Ratekeeper.h + RatekeeperInterface.h + RecoveryState.h + RemoteIKeyValueStore.actor.h + RemoteIKeyValueStore.actor.cpp + ResolutionBalancer.actor.cpp + ResolutionBalancer.actor.h + Resolver.actor.cpp + ResolverInterface.h + RestoreApplier.actor.cpp + RestoreApplier.actor.h + RestoreCommon.actor.cpp + RestoreCommon.actor.h + RestoreController.actor.cpp + RestoreController.actor.h + RestoreLoader.actor.cpp + RestoreLoader.actor.h + RestoreRoleCommon.actor.cpp + RestoreRoleCommon.actor.h + RestoreUtil.actor.cpp + RestoreUtil.h + RestoreWorker.actor.cpp + RestoreWorker.actor.h + RestoreWorkerInterface.actor.cpp + RestoreWorkerInterface.actor.h + RkTagThrottleCollection.cpp + RkTagThrottleCollection.h + RocksDBCheckpointUtils.actor.cpp + RocksDBCheckpointUtils.actor.h + RoleLineage.actor.cpp + RoleLineage.actor.h + ServerCheckpoint.actor.cpp + ServerCheckpoint.actor.h + ServerDBInfo.actor.h + ServerDBInfo.h + SigStack.cpp + SimKmsConnector.actor.h + SimKmsConnector.actor.cpp + SimpleConfigConsumer.actor.cpp + SimpleConfigConsumer.h + SimulatedCluster.actor.cpp + SimulatedCluster.h + SkipList.cpp + SpanContextMessage.h + Status.actor.cpp + Status.h + StorageCache.actor.cpp + StorageMetrics.actor.h + StorageMetrics.h + storageserver.actor.cpp + TagPartitionedLogSystem.actor.cpp + TagPartitionedLogSystem.actor.h + TagThrottler.actor.cpp + TagThrottler.h + TCInfo.actor.cpp + TCInfo.h + template_fdb.h + tester.actor.cpp + TesterInterface.actor.h + TLogInterface.h + TLogServer.actor.cpp + TransactionTagCounter.cpp + TransactionTagCounter.h + TSSMappingUtil.actor.cpp + TSSMappingUtil.actor.h + 
VersionedBTree.actor.cpp + VFSAsync.cpp + VFSAsync.h + WaitFailure.actor.cpp + WaitFailure.h + worker.actor.cpp + WorkerInterface.actor.h + workloads/ApiCorrectness.actor.cpp + workloads/ApiWorkload.actor.cpp + workloads/ApiWorkload.h + workloads/AsyncFile.actor.h + workloads/AsyncFile.cpp + workloads/AsyncFileCorrectness.actor.cpp + workloads/AsyncFileRead.actor.cpp + workloads/AsyncFileWrite.actor.cpp + workloads/AtomicOps.actor.cpp + workloads/AtomicOpsApiCorrectness.actor.cpp + workloads/AtomicRestore.actor.cpp + workloads/AtomicSwitchover.actor.cpp + workloads/BackgroundSelectors.actor.cpp + workloads/BackupAndParallelRestoreCorrectness.actor.cpp + workloads/BackupCorrectness.actor.cpp + workloads/BackupToBlob.actor.cpp + workloads/BackupToDBAbort.actor.cpp + workloads/BackupToDBCorrectness.actor.cpp + workloads/BackupToDBUpgrade.actor.cpp + workloads/BlobGranuleCorrectnessWorkload.actor.cpp + workloads/BlobGranuleVerifier.actor.cpp + workloads/BlobStoreWorkload.h + workloads/BulkLoad.actor.cpp + workloads/BulkSetup.actor.h + workloads/Cache.actor.cpp + workloads/ChangeConfig.actor.cpp + workloads/ChangeFeeds.actor.cpp + workloads/ClearSingleRange.actor.cpp + workloads/ClientTransactionProfileCorrectness.actor.cpp + workloads/ClientWorkload.actor.cpp + workloads/ClogSingleConnection.actor.cpp + workloads/CommitBugCheck.actor.cpp + workloads/ConfigIncrement.actor.cpp + workloads/ConfigureDatabase.actor.cpp + workloads/ConflictRange.actor.cpp + workloads/ConsistencyCheck.actor.cpp + workloads/CpuProfiler.actor.cpp + workloads/Cycle.actor.cpp + workloads/DataDistributionMetrics.actor.cpp + workloads/DataLossRecovery.actor.cpp + workloads/DDBalance.actor.cpp + workloads/DDMetrics.actor.cpp + workloads/DDMetricsExclude.actor.cpp + workloads/DifferentClustersSameRV.actor.cpp + workloads/DiskDurability.actor.cpp + workloads/DiskDurabilityTest.actor.cpp + workloads/DiskFailureInjection.actor.cpp + workloads/DummyWorkload.actor.cpp + workloads/EncryptionOps.actor.cpp + workloads/EncryptKeyProxyTest.actor.cpp + workloads/ExternalWorkload.actor.cpp + workloads/FastTriggeredWatches.actor.cpp + workloads/FileSystem.actor.cpp + workloads/Fuzz.cpp + workloads/FuzzApiCorrectness.actor.cpp + workloads/GetMappedRange.actor.cpp + workloads/GetRangeStream.actor.cpp + workloads/HealthMetricsApi.actor.cpp + workloads/HighContentionPrefixAllocatorWorkload.actor.cpp + workloads/Increment.actor.cpp + workloads/IncrementalBackup.actor.cpp + workloads/IndexScan.actor.cpp + workloads/Inventory.actor.cpp + workloads/KillRegion.actor.cpp + workloads/KVStoreTest.actor.cpp + workloads/LocalRatekeeper.actor.cpp + workloads/LockDatabase.actor.cpp + workloads/LockDatabaseFrequently.actor.cpp + workloads/LogMetrics.actor.cpp + workloads/LowLatency.actor.cpp + workloads/MachineAttrition.actor.cpp + workloads/Mako.actor.cpp + workloads/MemoryKeyValueStore.cpp + workloads/MemoryKeyValueStore.h + workloads/MemoryLifetime.actor.cpp + workloads/MetricLogging.actor.cpp + workloads/MiniCycle.actor.cpp + workloads/MutationLogReaderCorrectness.actor.cpp + workloads/ParallelRestore.actor.cpp + workloads/Performance.actor.cpp + workloads/PhysicalShardMove.actor.cpp + workloads/Ping.actor.cpp + workloads/PopulateTPCC.actor.cpp + workloads/PrivateEndpoints.actor.cpp + workloads/ProtocolVersion.actor.cpp + workloads/PubSubMultiples.actor.cpp + workloads/QueuePush.actor.cpp + workloads/RandomClogging.actor.cpp + workloads/RandomMoveKeys.actor.cpp + workloads/RandomSelector.actor.cpp + workloads/ReadAfterWrite.actor.cpp + 
workloads/ReadHotDetection.actor.cpp + workloads/ReadWrite.actor.cpp + workloads/RemoveServersSafely.actor.cpp + workloads/ReportConflictingKeys.actor.cpp + workloads/RestoreBackup.actor.cpp + workloads/RestoreFromBlob.actor.cpp + workloads/Rollback.actor.cpp + workloads/RyowCorrectness.actor.cpp + workloads/RYWDisable.actor.cpp + workloads/RYWPerformance.actor.cpp + workloads/SaveAndKill.actor.cpp + workloads/SelectorCorrectness.actor.cpp + workloads/Serializability.actor.cpp + workloads/Sideband.actor.cpp + workloads/SidebandSingle.actor.cpp + workloads/SimpleAtomicAdd.actor.cpp + workloads/SkewedReadWrite.actor.cpp + workloads/SlowTaskWorkload.actor.cpp + workloads/SnapTest.actor.cpp + workloads/SpecialKeySpaceCorrectness.actor.cpp + workloads/StatusWorkload.actor.cpp + workloads/Storefront.actor.cpp + workloads/StreamingRangeRead.actor.cpp + workloads/StreamingRead.actor.cpp + workloads/SubmitBackup.actor.cpp + workloads/SuspendProcesses.actor.cpp + workloads/TagThrottleApi.actor.cpp + workloads/TargetedKill.actor.cpp + workloads/TaskBucketCorrectness.actor.cpp + workloads/TenantManagement.actor.cpp + workloads/ThreadSafety.actor.cpp + workloads/Throttling.actor.cpp + workloads/Throughput.actor.cpp + workloads/TimeKeeperCorrectness.actor.cpp + workloads/TPCC.actor.cpp + workloads/TPCCWorkload.h + workloads/TriggerRecovery.actor.cpp + workloads/UDPWorkload.actor.cpp + workloads/UnitPerf.actor.cpp + workloads/UnitTests.actor.cpp + workloads/Unreadable.actor.cpp + workloads/VersionStamp.actor.cpp + workloads/WatchAndWait.actor.cpp + workloads/Watches.actor.cpp + workloads/WatchesSameKeyCorrectness.actor.cpp + workloads/WorkerErrors.actor.cpp + workloads/workloads.actor.h + workloads/WriteBandwidth.actor.cpp + workloads/WriteDuringRead.actor.cpp + workloads/WriteTagThrottling.actor.cpp + ) -if(${COROUTINE_IMPL} STREQUAL libcoro) - list(APPEND FDBSERVER_SRCS CoroFlowCoro.actor.cpp) -else() - list(APPEND FDBSERVER_SRCS CoroFlow.actor.cpp) -endif() +if (${COROUTINE_IMPL} STREQUAL libcoro) + list(APPEND FDBSERVER_SRCS CoroFlowCoro.actor.cpp) +else () + list(APPEND FDBSERVER_SRCS CoroFlow.actor.cpp) +endif () add_library(fdb_sqlite STATIC - sqlite/btree.h - sqlite/hash.h - sqlite/sqlite3.h - sqlite/sqlite3ext.h - sqlite/sqliteInt.h - sqlite/sqliteLimit.h - sqlite/sqlite3.amalgamation.c) + sqlite/btree.h + sqlite/hash.h + sqlite/sqlite3.h + sqlite/sqlite3ext.h + sqlite/sqliteInt.h + sqlite/sqliteLimit.h + sqlite/sqlite3.amalgamation.c) if (WITH_ROCKSDB_EXPERIMENTAL) - add_definitions(-DSSD_ROCKSDB_EXPERIMENTAL) + add_definitions(-DSSD_ROCKSDB_EXPERIMENTAL) - include(CompileRocksDB) - # CompileRocksDB sets `lz4_LIBRARIES` to be the shared lib, we want to link - # statically, so find the static library here. - find_library(lz4_STATIC_LIBRARIES - NAMES liblz4.a REQUIRED) - if (WITH_LIBURING) - find_package(uring) - endif() -endif() + include(CompileRocksDB) + # CompileRocksDB sets `lz4_LIBRARIES` to be the shared lib, we want to link + # statically, so find the static library here. 
+ find_library(lz4_STATIC_LIBRARIES + NAMES liblz4.a REQUIRED) + if (WITH_LIBURING) + find_package(uring) + endif () +endif () # Suppress warnings in sqlite since it's third party -if(NOT WIN32) - target_compile_definitions(fdb_sqlite PRIVATE $<$:NDEBUG>) - target_compile_options(fdb_sqlite BEFORE PRIVATE -w) # disable warnings for third party -endif() +if (NOT WIN32) + target_compile_definitions(fdb_sqlite PRIVATE $<$:NDEBUG>) + target_compile_options(fdb_sqlite BEFORE PRIVATE -w) # disable warnings for third party +endif () file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/workloads) add_flow_target(EXECUTABLE NAME fdbserver SRCS ${FDBSERVER_SRCS}) target_include_directories(fdbserver PRIVATE - ${CMAKE_SOURCE_DIR}/bindings/c - ${CMAKE_BINARY_DIR}/bindings/c - ${CMAKE_CURRENT_BINARY_DIR}/workloads - ${CMAKE_CURRENT_SOURCE_DIR}/workloads) + ${CMAKE_SOURCE_DIR}/bindings/c + ${CMAKE_BINARY_DIR}/bindings/c + ${CMAKE_CURRENT_BINARY_DIR}/workloads + ${CMAKE_CURRENT_SOURCE_DIR}/workloads) if (WITH_ROCKSDB_EXPERIMENTAL) - add_dependencies(fdbserver rocksdb) - if(WITH_LIBURING) - target_include_directories(fdbserver PRIVATE ${ROCKSDB_INCLUDE_DIR} ${uring_INCLUDE_DIR}) - target_link_libraries(fdbserver PRIVATE fdbclient fdb_sqlite ${ROCKSDB_LIBRARIES} ${uring_LIBRARIES} ${lz4_STATIC_LIBRARIES}) - target_compile_definitions(fdbserver PRIVATE BOOST_ASIO_HAS_IO_URING=1 BOOST_ASIO_DISABLE_EPOLL=1) - else() - target_include_directories(fdbserver PRIVATE ${ROCKSDB_INCLUDE_DIR}) - target_link_libraries(fdbserver PRIVATE fdbclient fdb_sqlite ${ROCKSDB_LIBRARIES} ${lz4_STATIC_LIBRARIES}) - target_compile_definitions(fdbserver PRIVATE) - endif() -else() - target_link_libraries(fdbserver PRIVATE fdbclient fdb_sqlite) -endif() + add_dependencies(fdbserver rocksdb) + if (WITH_LIBURING) + target_include_directories(fdbserver PRIVATE ${ROCKSDB_INCLUDE_DIR} ${uring_INCLUDE_DIR}) + target_link_libraries(fdbserver PRIVATE fdbclient fdb_sqlite ${ROCKSDB_LIBRARIES} ${uring_LIBRARIES} ${lz4_STATIC_LIBRARIES}) + target_compile_definitions(fdbserver PRIVATE BOOST_ASIO_HAS_IO_URING=1 BOOST_ASIO_DISABLE_EPOLL=1) + else () + target_include_directories(fdbserver PRIVATE ${ROCKSDB_INCLUDE_DIR}) + target_link_libraries(fdbserver PRIVATE fdbclient fdb_sqlite ${ROCKSDB_LIBRARIES} ${lz4_STATIC_LIBRARIES}) + target_compile_definitions(fdbserver PRIVATE) + endif () +else () + target_link_libraries(fdbserver PRIVATE fdbclient fdb_sqlite) +endif () target_link_libraries(fdbserver PRIVATE toml11_target jemalloc) # target_compile_definitions(fdbserver PRIVATE -DENABLE_SAMPLING) if (GPERFTOOLS_FOUND) - target_link_libraries(fdbserver PRIVATE gperftools) -endif() + target_link_libraries(fdbserver PRIVATE gperftools) +endif () -if(NOT OPEN_FOR_IDE) - if(GENERATE_DEBUG_PACKAGES) - fdb_install(TARGETS fdbserver DESTINATION sbin COMPONENT server) - else() - add_custom_target(prepare_fdbserver_install ALL DEPENDS strip_only_fdbserver) - fdb_install(PROGRAMS ${CMAKE_BINARY_DIR}/packages/bin/fdbserver DESTINATION sbin COMPONENT server) - endif() -endif() +if (NOT OPEN_FOR_IDE) + if (GENERATE_DEBUG_PACKAGES) + fdb_install(TARGETS fdbserver DESTINATION sbin COMPONENT server) + else () + add_custom_target(prepare_fdbserver_install ALL DEPENDS strip_only_fdbserver) + fdb_install(PROGRAMS ${CMAKE_BINARY_DIR}/packages/bin/fdbserver DESTINATION sbin COMPONENT server) + endif () +endif () diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 64b9924560..d03ea133cd 100644 --- a/fdbserver/tester.actor.cpp +++ 
b/fdbserver/tester.actor.cpp @@ -99,6 +99,20 @@ Key KVWorkload::keyForIndex(uint64_t index) const { } } +int64_t KVWorkload::indexForKey(const KeyRef& key, bool absent) const { + int idx = 0; + if (nodePrefix > 0) { + ASSERT(keyBytes >= 32); + idx += 16; + } + ASSERT(keyBytes >= 16); + // extract int64_t index, the reverse process of emplaceIndex() + auto end = key.size() - idx - (absent ? 1 : 0); + std::string str((char*)key.begin() + idx, end); + int64_t res = std::stoll(str, nullptr, 16); + return res; +} + Key KVWorkload::keyForIndex(uint64_t index, bool absent) const { int adjustedKeyBytes = (absent) ? (keyBytes + 1) : keyBytes; Key result = makeString(adjustedKeyBytes); @@ -112,8 +126,8 @@ Key KVWorkload::keyForIndex(uint64_t index, bool absent) const { idx += 16; } ASSERT(keyBytes >= 16); - double d = double(index) / nodeCount; - emplaceIndex(data, idx, *(int64_t*)&d); + emplaceIndex(data, idx, (int64_t)index); + // ASSERT(indexForKey(result) == (int64_t)index); // debug assert return result; } @@ -1855,7 +1869,9 @@ ACTOR Future runTests(Reference connRecord, } choose { - when(wait(tests)) { return Void(); } + when(wait(tests)) { + return Void(); + } when(wait(quorum(actors, 1))) { ASSERT(false); throw internal_error(); diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp index 4741a01371..1a5c5e3e56 100644 --- a/fdbserver/workloads/ReadWrite.actor.cpp +++ b/fdbserver/workloads/ReadWrite.actor.cpp @@ -76,57 +76,64 @@ DESCR struct ReadMetric { }; struct ReadWriteWorkload : KVWorkload { + // general test setting + Standalone descriptionString; + bool doSetup, cancelWorkersAtDuration; + double testDuration, transactionsPerSecond, warmingDelay, maxInsertRate, debugInterval, debugTime; + double metricsStart, metricsDuration; + std::vector insertionCountsToMeasure; // measure the speed of sequential insertion when bulkSetup + + // test log setting + bool enableReadLatencyLogging; + double periodicLoggingInterval; + + // use ReadWrite as a ramp up workload + bool rampUpLoad; // indicate this is a ramp up workload + int rampSweepCount; // how many times of ramp up + bool rampTransactionType; // choose transaction type based on client start time + bool rampUpConcurrency; // control client concurrency + + // transaction setting + bool useRYW; + bool batchPriority; + bool rangeReads; // read operations are all single key range read + bool dependentReads; // read operations are issued sequentially + bool inconsistentReads; // read with previous read version + bool adjacentReads; // keys are adjacent within a transaction + bool adjacentWrites; + double alpha; // probability for run TransactionA type + // two type of transaction int readsPerTransactionA, writesPerTransactionA; int readsPerTransactionB, writesPerTransactionB; int extraReadConflictRangesPerTransaction, extraWriteConflictRangesPerTransaction; - double testDuration, transactionsPerSecond, alpha, warmingDelay, loadTime, maxInsertRate, debugInterval, debugTime; - double metricsStart, metricsDuration, clientBegin; std::string valueString; + // hot traffic pattern + double hotKeyFraction, forceHotProbability = 0; // key based hot traffic setting - bool dependentReads; - bool enableReadLatencyLogging; - double periodicLoggingInterval; - bool cancelWorkersAtDuration; - bool inconsistentReads; - bool adjacentReads; - bool adjacentWrites; - bool rampUpLoad; - int rampSweepCount; - double hotKeyFraction, forceHotProbability; - bool rangeReads; - bool useRYW; - bool rampTransactionType; - bool 
rampUpConcurrency; - bool batchPriority; - - Standalone descriptionString; - + // metric state Int64MetricHandle totalReadsMetric; Int64MetricHandle totalRetriesMetric; EventMetricHandle transactionSuccessMetric; EventMetricHandle transactionFailureMetric; EventMetricHandle readMetric; - - std::vector> clients; PerfIntCounter aTransactions, bTransactions, retries; ContinuousSample latencies, readLatencies, commitLatencies, GRVLatencies, fullReadLatencies; double readLatencyTotal; int readLatencyCount; - - std::vector insertionCountsToMeasure; - std::vector> ratesAtKeyCounts; - std::vector periodicMetrics; + std::vector> ratesAtKeyCounts; // sequential insertion speed - bool doSetup; + // other internal states + std::vector> clients; + double loadTime, clientBegin; ReadWriteWorkload(WorkloadContext const& wcx) - : KVWorkload(wcx), loadTime(0.0), clientBegin(0), dependentReads(false), adjacentReads(false), - adjacentWrites(false), totalReadsMetric(LiteralStringRef("RWWorkload.TotalReads")), + : KVWorkload(wcx), dependentReads(false), adjacentReads(false), adjacentWrites(false), totalReadsMetric(LiteralStringRef("RWWorkload.TotalReads")), totalRetriesMetric(LiteralStringRef("RWWorkload.TotalRetries")), aTransactions("A Transactions"), - bTransactions("B Transactions"), retries("Retries"), latencies(sampleSize), readLatencies(sampleSize), - commitLatencies(sampleSize), GRVLatencies(sampleSize), fullReadLatencies(sampleSize), readLatencyTotal(0), - readLatencyCount(0) { + bTransactions("B Transactions"), retries("Retries"), + latencies(sampleSize), readLatencies(sampleSize), commitLatencies(sampleSize), GRVLatencies(sampleSize), + fullReadLatencies(sampleSize), readLatencyTotal(0), readLatencyCount(0), loadTime(0.0), + clientBegin(0) { transactionSuccessMetric.init(LiteralStringRef("RWWorkload.SuccessfulTransaction")); transactionFailureMetric.init(LiteralStringRef("RWWorkload.FailedTransaction")); readMetric.init(LiteralStringRef("RWWorkload.Read")); diff --git a/fdbserver/workloads/SkewedReadWrite.actor.cpp b/fdbserver/workloads/SkewedReadWrite.actor.cpp new file mode 100644 index 0000000000..6ea9d9b8aa --- /dev/null +++ b/fdbserver/workloads/SkewedReadWrite.actor.cpp @@ -0,0 +1,778 @@ +/* + * SkewedReadWrite.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "fdbrpc/ContinuousSample.h" +#include "fdbclient/NativeAPI.actor.h" +#include "fdbserver/TesterInterface.actor.h" +#include "fdbserver/WorkerInterface.actor.h" +#include "fdbserver/workloads/workloads.actor.h" +#include "fdbserver/workloads/BulkSetup.actor.h" +#include "fdbclient/ReadYourWrites.h" +#include "flow/TDMetric.actor.h" +#include "fdbclient/RunTransaction.actor.h" +#include "flow/actorcompiler.h" // This must be the last #include.
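Editor's sketch (not part of the patch): indexForKey(), added to tester.actor.cpp earlier in this patch, inverts keyForIndex() by parsing the key body as base-16. The real emplaceIndex() is not shown in this diff, so encodeIndex() below is a stand-in that assumes a fixed-width, zero-padded hex encoding, which is what the base-16 std::stoll in indexForKey() implies.

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <string>

    // Stand-in for emplaceIndex(): write the index as fixed-width, zero-padded hex.
    static std::string encodeIndex(int64_t index, int width = 16) {
        char buf[32];
        std::snprintf(buf, sizeof(buf), "%0*llx", width, (unsigned long long)index);
        return std::string(buf, width);
    }

    // Mirror of indexForKey(): parse the hex body back; no floating-point division.
    static int64_t decodeIndex(const std::string& body) {
        return std::stoll(body, nullptr, 16);
    }

    int main() {
        for (int64_t i : { 0LL, 7LL, 123456789LL }) {
            assert(decodeIndex(encodeIndex(i)) == i); // lossless round trip
        }
        return 0;
    }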
+ +const int sampleSize = 10000; +DESCR struct TransactionSuccessMetric { + int64_t totalLatency; // ns + int64_t startLatency; // ns + int64_t commitLatency; // ns + int64_t retries; // count +}; + +DESCR struct TransactionFailureMetric { + int64_t startLatency; // ns + int64_t errorCode; // flow error code +}; + +DESCR struct ReadMetric { + int64_t readLatency; // ns +}; + +struct SkewedReadWriteWorkload : KVWorkload { + // general test setting + Standalone descriptionString; + bool doSetup, cancelWorkersAtDuration; + double testDuration, transactionsPerSecond, warmingDelay, maxInsertRate, debugInterval, debugTime; + double metricsStart, metricsDuration; + std::vector insertionCountsToMeasure; // measure the speed of sequential insertion during bulkSetup + + // test log setting + bool enableReadLatencyLogging; + double periodicLoggingInterval; + + // transaction setting + bool useRYW; + double alpha; // probability of running a type A transaction + // two types of transactions + int readsPerTransactionA, writesPerTransactionA; + int readsPerTransactionB, writesPerTransactionB; + std::string valueString; + + // server based hot traffic setting + int skewRound = 0; // skewDuration = ceil(testDuration / skewRound) + double hotServerFraction = 0, hotServerShardFraction = 1.0; // set > 0 to issue hot keys based on the shard map + double hotServerReadFrac, hotServerWriteFrac; // how much traffic goes to hot servers + double hotReadWriteServerOverlap; // the fraction of write-hot servers that overlap with read-hot servers + + // hot server state + typedef std::vector> IndexRangeVec; + // keyForIndex generates a key from an index, so recording the start and end index is enough for a shard range + std::vector> serverShards; // storage server and the shards it owns + std::map serverInterfaces; + int hotServerCount = 0, currentHotRound = -1; + + // metric state + Int64MetricHandle totalReadsMetric; + Int64MetricHandle totalRetriesMetric; + EventMetricHandle transactionSuccessMetric; + EventMetricHandle transactionFailureMetric; + EventMetricHandle readMetric; + PerfIntCounter aTransactions, bTransactions, retries; + ContinuousSample latencies, readLatencies, commitLatencies, GRVLatencies, fullReadLatencies; + double readLatencyTotal; + int readLatencyCount; + std::vector periodicMetrics; + std::vector> ratesAtKeyCounts; // sequential insertion speed + + // other internal states + std::vector> clients; + double loadTime, clientBegin; + + SkewedReadWriteWorkload(WorkloadContext const& wcx) + : KVWorkload(wcx), totalReadsMetric(LiteralStringRef("RWWorkload.TotalReads")), + totalRetriesMetric(LiteralStringRef("RWWorkload.TotalRetries")), aTransactions("A Transactions"), + bTransactions("B Transactions"), retries("Retries"), latencies(sampleSize), readLatencies(sampleSize), + commitLatencies(sampleSize), GRVLatencies(sampleSize), fullReadLatencies(sampleSize), readLatencyTotal(0), + readLatencyCount(0), loadTime(0.0), clientBegin(0) { + + transactionSuccessMetric.init(LiteralStringRef("RWWorkload.SuccessfulTransaction")); + transactionFailureMetric.init(LiteralStringRef("RWWorkload.FailedTransaction")); + readMetric.init(LiteralStringRef("RWWorkload.Read")); + + testDuration = getOption(options, LiteralStringRef("testDuration"), 10.0); + transactionsPerSecond = getOption(options, LiteralStringRef("transactionsPerSecond"), 5000.0) / clientCount; + double allowedLatency = getOption(options, LiteralStringRef("allowedLatency"), 0.250); + actorCount = ceil(transactionsPerSecond * allowedLatency); + actorCount = getOption(options,
LiteralStringRef("actorCountPerTester"), actorCount); + + readsPerTransactionA = getOption(options, LiteralStringRef("readsPerTransactionA"), 10); + writesPerTransactionA = getOption(options, LiteralStringRef("writesPerTransactionA"), 0); + readsPerTransactionB = getOption(options, LiteralStringRef("readsPerTransactionB"), 1); + writesPerTransactionB = getOption(options, LiteralStringRef("writesPerTransactionB"), 9); + alpha = getOption(options, LiteralStringRef("alpha"), 0.1); + + valueString = std::string(maxValueBytes, '.'); + if (nodePrefix > 0) { + keyBytes += 16; + } + + metricsStart = getOption(options, LiteralStringRef("metricsStart"), 0.0); + metricsDuration = getOption(options, LiteralStringRef("metricsDuration"), testDuration); + if (getOption(options, LiteralStringRef("discardEdgeMeasurements"), true)) { + // discardEdgeMeasurements keeps the metrics from the middle 3/4 of the test + metricsStart += testDuration * 0.125; + metricsDuration *= 0.75; + } + + warmingDelay = getOption(options, LiteralStringRef("warmingDelay"), 0.0); + maxInsertRate = getOption(options, LiteralStringRef("maxInsertRate"), 1e12); + debugInterval = getOption(options, LiteralStringRef("debugInterval"), 0.0); + debugTime = getOption(options, LiteralStringRef("debugTime"), 0.0); + enableReadLatencyLogging = getOption(options, LiteralStringRef("enableReadLatencyLogging"), false); + periodicLoggingInterval = getOption(options, LiteralStringRef("periodicLoggingInterval"), 5.0); + cancelWorkersAtDuration = getOption(options, LiteralStringRef("cancelWorkersAtDuration"), true); + useRYW = getOption(options, LiteralStringRef("useRYW"), false); + doSetup = getOption(options, LiteralStringRef("setup"), true); + descriptionString = getOption(options, LiteralStringRef("description"), LiteralStringRef("SkewedReadWrite")); + + // Validate that keyForIndex() is monotonic + for (int i = 0; i < 30; i++) { + int64_t a = deterministicRandom()->randomInt64(0, nodeCount); + int64_t b = deterministicRandom()->randomInt64(0, nodeCount); + if (a > b) { + std::swap(a, b); + } + ASSERT(a <= b); + ASSERT((keyForIndex(a, false) <= keyForIndex(b, false))); + } + + std::vector insertionCountsToMeasureString = + getOption(options, LiteralStringRef("insertionCountsToMeasure"), std::vector()); + for (int i = 0; i < insertionCountsToMeasureString.size(); i++) { + try { + uint64_t count = boost::lexical_cast(insertionCountsToMeasureString[i]); + insertionCountsToMeasure.push_back(count); + } catch (...) 
{ + } + } + + { + hotServerFraction = getOption(options, "hotServerFraction"_sr, 0.2); + hotServerShardFraction = getOption(options, "hotServerShardFraction"_sr, 1.0); + hotReadWriteServerOverlap = getOption(options, "hotReadWriteServerOverlap"_sr, 0.0); + skewRound = getOption(options, "skewRound"_sr, 1); + hotServerReadFrac = getOption(options, "hotServerReadFrac"_sr, 0.8); + hotServerWriteFrac = getOption(options, "hotServerWriteFrac"_sr, 0.0); + ASSERT((hotServerReadFrac >= hotServerFraction || hotServerWriteFrac >= hotServerFraction) && + skewRound > 0); + } + } + + std::string description() const override { return descriptionString.toString(); } + Future setup(Database const& cx) override { return _setup(cx, this); } + Future start(Database const& cx) override { return _start(cx, this); } + + ACTOR static Future traceDumpWorkers(Reference const> db) { + try { + loop { + choose { + when(wait(db->onChange())) {} + + when(ErrorOr> workerList = + wait(db->get().clusterInterface.getWorkers.tryGetReply(GetWorkersRequest()))) { + if (workerList.present()) { + std::vector>> dumpRequests; + dumpRequests.reserve(workerList.get().size()); + for (int i = 0; i < workerList.get().size(); i++) + dumpRequests.push_back(workerList.get()[i].interf.traceBatchDumpRequest.tryGetReply( + TraceBatchDumpRequest())); + wait(waitForAll(dumpRequests)); + return true; + } + wait(delay(1.0)); + } + } + } + } catch (Error& e) { + TraceEvent(SevError, "FailedToDumpWorkers").error(e); + throw; + } + } + + Future check(Database const& cx) override { + clients.clear(); + + if (!cancelWorkersAtDuration && now() < metricsStart + metricsDuration) + metricsDuration = now() - metricsStart; + + g_traceBatch.dump(); + if (clientId == 0) + return traceDumpWorkers(dbInfo); + else + return true; + } + + void getMetrics(std::vector& m) override { + double duration = metricsDuration; + int reads = + (aTransactions.getValue() * readsPerTransactionA) + (bTransactions.getValue() * readsPerTransactionB); + int writes = + (aTransactions.getValue() * writesPerTransactionA) + (bTransactions.getValue() * writesPerTransactionB); + m.emplace_back("Measured Duration", duration, Averaged::True); + m.emplace_back( + "Transactions/sec", (aTransactions.getValue() + bTransactions.getValue()) / duration, Averaged::False); + m.emplace_back("Operations/sec", ((reads + writes) / duration), Averaged::False); + m.push_back(aTransactions.getMetric()); + m.push_back(bTransactions.getMetric()); + m.push_back(retries.getMetric()); + m.emplace_back("Mean load time (seconds)", loadTime, Averaged::True); + m.emplace_back("Read rows", reads, Averaged::False); + m.emplace_back("Write rows", writes, Averaged::False); + m.emplace_back("Read rows/sec", reads / duration, Averaged::False); + m.emplace_back("Write rows/sec", writes / duration, Averaged::False); + m.emplace_back( + "Bytes read/sec", (reads * (keyBytes + (minValueBytes + maxValueBytes) * 0.5)) / duration, Averaged::False); + m.emplace_back("Bytes written/sec", + (writes * (keyBytes + (minValueBytes + maxValueBytes) * 0.5)) / duration, + Averaged::False); + m.insert(m.end(), periodicMetrics.begin(), periodicMetrics.end()); + + std::vector>::iterator ratesItr = ratesAtKeyCounts.begin(); + for (; ratesItr != ratesAtKeyCounts.end(); ratesItr++) + m.emplace_back(format("%lld keys imported bytes/sec", ratesItr->first), ratesItr->second, Averaged::False); + } + + Value randomValue() { + return StringRef((uint8_t*)valueString.c_str(), + deterministicRandom()->randomInt(minValueBytes, maxValueBytes + 1)); + } 
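Editor's sketch (not part of the patch): the traffic split driven by the option block above, mirroring the shape of getRandomKey() further down in this file. With the defaults hotServerFraction = 0.2 and hotServerReadFrac = 0.8, roughly 80% of reads land on the ~20% of storage servers currently marked hot. pickHotIndex stands in for getRandomKeyFromHotServer(); the modulo draw is a simplification.

    #include <cstdint>
    #include <functional>
    #include <random>

    // Route one read: with probability hotReadFrac, draw a key index from the
    // ranges owned by the current hot servers; otherwise draw uniformly.
    int64_t pickIndex(double hotServerFraction,
                      double hotReadFrac,
                      int64_t nodeCount,
                      std::mt19937_64& rng,
                      const std::function<int64_t()>& pickHotIndex) {
        std::uniform_real_distribution<double> coin(0.0, 1.0);
        if (hotServerFraction > 0 && coin(rng) < hotReadFrac)
            return pickHotIndex(); // index ranges owned by hot servers
        return (int64_t)(rng() % (uint64_t)nodeCount); // uniform cold traffic
    }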
+ + Standalone operator()(uint64_t n) { return KeyValueRef(keyForIndex(n, false), randomValue()); } + + void debugPrintServerShards() const { + std::cout << std::hex; + for (auto it : this->serverShards) { + std::cout << serverInterfaces.at(it.first).address().toString() << ": ["; + for (auto p : it.second) { + std::cout << "[" << p.first << "," << p.second << "], "; + } + std::cout << "] \n"; + } + } + + // For each consecutive pair of boundaries, find the index of the first existing key generated by keyForIndex as + // beginIdx and the index of the last existing key as endIdx. + ACTOR static Future convertKeyBoundaryToIndexShard(Database cx, + SkewedReadWriteWorkload* self, + Standalone> boundaries) { + state IndexRangeVec res; + state int i = 0; + for (; i < boundaries.size() - 1; ++i) { + KeyRangeRef currentShard = KeyRangeRef(boundaries[i], boundaries[i + 1]); + // std::cout << currentShard.toString() << "\n"; + std::vector ranges = wait(runRYWTransaction( + cx, [currentShard](Reference tr) -> Future> { + std::vector> f; + f.push_back(tr->getRange(currentShard, 1, Snapshot::False, Reverse::False)); + f.push_back(tr->getRange(currentShard, 1, Snapshot::False, Reverse::True)); + return getAll(f); + })); + ASSERT(ranges[0].size() == 1 && ranges[1].size() == 1); + res.emplace_back(self->indexForKey(ranges[0][0].key), self->indexForKey(ranges[1][0].key)); + } + + ASSERT(res.size() == boundaries.size() - 1); + return res; + } + + ACTOR static Future updateServerShards(Database cx, SkewedReadWriteWorkload* self) { + state Future serverList = + runRYWTransaction(cx, [](Reference tr) -> Future { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + return tr->getRange(serverListKeys, CLIENT_KNOBS->TOO_MANY); + }); + state RangeResult range = + wait(runRYWTransaction(cx, [](Reference tr) -> Future { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + return tr->getRange(serverKeysRange, CLIENT_KNOBS->TOO_MANY); + })); + wait(success(serverList)); + // decode server interfaces + self->serverInterfaces.clear(); + for (int i = 0; i < serverList.get().size(); i++) { + auto ssi = decodeServerListValue(serverList.get()[i].value); + self->serverInterfaces.emplace(ssi.id(), ssi); + } + // clear self->serverShards + self->serverShards.clear(); + + // leftEdge < workloadBegin < workloadEnd + Key workloadBegin = self->keyForIndex(0), workloadEnd = self->keyForIndex(self->nodeCount); + Key leftEdge(allKeys.begin); + std::vector leftServer; // left server owns the range [leftEdge, workloadBegin) + KeyRangeRef workloadRange(workloadBegin, workloadEnd); + state std::map> beginServers; // shard begin key -> IDs of the servers owning the shard + + for (auto kv = range.begin(); kv != range.end(); kv++) { + if (serverHasKey(kv->value)) { + auto [id, key] = serverKeysDecodeServerBegin(kv->key); + + if (workloadRange.contains(key)) { + beginServers[key].push_back(id); + } else if (workloadBegin > key && key > leftEdge) { // update left boundary + leftEdge = key; + leftServer.clear(); + } + + if (key == leftEdge) { + leftServer.push_back(id); + } + } + } + ASSERT(beginServers.size() == 0 || beginServers.begin()->first >= workloadBegin); + // handle the left boundary + if (beginServers.size() == 0 || beginServers.begin()->first > workloadBegin) { + beginServers[workloadBegin] = leftServer; + } + Standalone> keyBegins; + for (auto p = beginServers.begin(); p != beginServers.end(); ++p) { + keyBegins.push_back(keyBegins.arena(), p->first); + } + // deep copy because the wait below will destruct workloadEnd + 
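+ // Editor's note (illustration, with hypothetical example data): beginServers maps each
+ // shard's begin key to the IDs of the servers owning that shard, and indexShards[i]
+ // (computed below) is the [beginIdx, endIdx] pair of the i-th shard. The loop that
+ // follows inverts this into per-server shard lists, e.g.
+ //   beginServers = { k0 -> {S1, S2}, k1 -> {S2} }, indexShards = { [0, 9], [10, 19] }
+ //   => serverShards = { S1 -> {[0, 9]}, S2 -> {[0, 9], [10, 19]} }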
keyBegins.push_back_deep(keyBegins.arena(), workloadEnd); + + IndexRangeVec indexShards = wait(convertKeyBoundaryToIndexShard(cx, self, keyBegins)); + ASSERT(beginServers.size() == indexShards.size()); + // sort shard begin idx + // build self->serverShards, starting from the left shard + std::map serverShards; + int i = 0; + for (auto p = beginServers.begin(); p != beginServers.end(); ++p) { + for (int j = 0; j < p->second.size(); ++j) { + serverShards[p->second[j]].emplace_back(indexShards[i]); + } + ++i; + } + // self->serverShards is ordered by UID + for (auto it : serverShards) { + self->serverShards.emplace_back(it); + } + // if (self->clientId == 0) { + // self->debugPrintServerShards(); + // } + return Void(); + } + + ACTOR static Future tracePeriodically(SkewedReadWriteWorkload* self) { + state double start = now(); + state double elapsed = 0.0; + state int64_t last_ops = 0; + + loop { + elapsed += self->periodicLoggingInterval; + wait(delayUntil(start + elapsed)); + + TraceEvent((self->description() + "_RowReadLatency").c_str()) + .detail("Mean", self->readLatencies.mean()) + .detail("Median", self->readLatencies.median()) + .detail("Percentile5", self->readLatencies.percentile(.05)) + .detail("Percentile95", self->readLatencies.percentile(.95)) + .detail("Percentile99", self->readLatencies.percentile(.99)) + .detail("Percentile99_9", self->readLatencies.percentile(.999)) + .detail("Max", self->readLatencies.max()) + .detail("Count", self->readLatencyCount) + .detail("Elapsed", elapsed); + + TraceEvent((self->description() + "_GRVLatency").c_str()) + .detail("Mean", self->GRVLatencies.mean()) + .detail("Median", self->GRVLatencies.median()) + .detail("Percentile5", self->GRVLatencies.percentile(.05)) + .detail("Percentile95", self->GRVLatencies.percentile(.95)) + .detail("Percentile99", self->GRVLatencies.percentile(.99)) + .detail("Percentile99_9", self->GRVLatencies.percentile(.999)) + .detail("Max", self->GRVLatencies.max()); + + TraceEvent((self->description() + "_CommitLatency").c_str()) + .detail("Mean", self->commitLatencies.mean()) + .detail("Median", self->commitLatencies.median()) + .detail("Percentile5", self->commitLatencies.percentile(.05)) + .detail("Percentile95", self->commitLatencies.percentile(.95)) + .detail("Percentile99", self->commitLatencies.percentile(.99)) + .detail("Percentile99_9", self->commitLatencies.percentile(.999)) + .detail("Max", self->commitLatencies.max()); + + TraceEvent((self->description() + "_TotalLatency").c_str()) + .detail("Mean", self->latencies.mean()) + .detail("Median", self->latencies.median()) + .detail("Percentile5", self->latencies.percentile(.05)) + .detail("Percentile95", self->latencies.percentile(.95)) + .detail("Percentile99", self->latencies.percentile(.99)) + .detail("Percentile99_9", self->latencies.percentile(.999)) + .detail("Max", self->latencies.max()); + + int64_t ops = + (self->aTransactions.getValue() * (self->readsPerTransactionA + self->writesPerTransactionA)) + + (self->bTransactions.getValue() * (self->readsPerTransactionB + self->writesPerTransactionB)); + bool recordBegin = self->shouldRecord(std::max(now() - self->periodicLoggingInterval, self->clientBegin)); + bool recordEnd = self->shouldRecord(now()); + if (recordBegin && recordEnd) { + std::string ts = format("T=%04.0fs:", elapsed); + self->periodicMetrics.emplace_back( + ts + "Operations/sec", (ops - last_ops) / self->periodicLoggingInterval, Averaged::False); + + // if(self->rampUpLoad) { + self->periodicMetrics.emplace_back( + ts + "Mean Latency 
(ms)", 1000 * self->latencies.mean(), Averaged::True); + self->periodicMetrics.emplace_back( + ts + "Median Latency (ms, averaged)", 1000 * self->latencies.median(), Averaged::True); + self->periodicMetrics.emplace_back( + ts + "5% Latency (ms, averaged)", 1000 * self->latencies.percentile(.05), Averaged::True); + self->periodicMetrics.emplace_back( + ts + "95% Latency (ms, averaged)", 1000 * self->latencies.percentile(.95), Averaged::True); + + self->periodicMetrics.emplace_back( + ts + "Mean Row Read Latency (ms)", 1000 * self->readLatencies.mean(), Averaged::True); + self->periodicMetrics.emplace_back( + ts + "Median Row Read Latency (ms, averaged)", 1000 * self->readLatencies.median(), Averaged::True); + self->periodicMetrics.emplace_back(ts + "5% Row Read Latency (ms, averaged)", + 1000 * self->readLatencies.percentile(.05), + Averaged::True); + self->periodicMetrics.emplace_back(ts + "95% Row Read Latency (ms, averaged)", + 1000 * self->readLatencies.percentile(.95), + Averaged::True); + + self->periodicMetrics.emplace_back( + ts + "Mean Total Read Latency (ms)", 1000 * self->fullReadLatencies.mean(), Averaged::True); + self->periodicMetrics.emplace_back(ts + "Median Total Read Latency (ms, averaged)", + 1000 * self->fullReadLatencies.median(), + Averaged::True); + self->periodicMetrics.emplace_back(ts + "5% Total Read Latency (ms, averaged)", + 1000 * self->fullReadLatencies.percentile(.05), + Averaged::True); + self->periodicMetrics.emplace_back(ts + "95% Total Read Latency (ms, averaged)", + 1000 * self->fullReadLatencies.percentile(.95), + Averaged::True); + + self->periodicMetrics.emplace_back( + ts + "Mean GRV Latency (ms)", 1000 * self->GRVLatencies.mean(), Averaged::True); + self->periodicMetrics.emplace_back( + ts + "Median GRV Latency (ms, averaged)", 1000 * self->GRVLatencies.median(), Averaged::True); + self->periodicMetrics.emplace_back( + ts + "5% GRV Latency (ms, averaged)", 1000 * self->GRVLatencies.percentile(.05), Averaged::True); + self->periodicMetrics.emplace_back( + ts + "95% GRV Latency (ms, averaged)", 1000 * self->GRVLatencies.percentile(.95), Averaged::True); + + self->periodicMetrics.emplace_back( + ts + "Mean Commit Latency (ms)", 1000 * self->commitLatencies.mean(), Averaged::True); + self->periodicMetrics.emplace_back( + ts + "Median Commit Latency (ms, averaged)", 1000 * self->commitLatencies.median(), Averaged::True); + self->periodicMetrics.emplace_back(ts + "5% Commit Latency (ms, averaged)", + 1000 * self->commitLatencies.percentile(.05), + Averaged::True); + self->periodicMetrics.emplace_back(ts + "95% Commit Latency (ms, averaged)", + 1000 * self->commitLatencies.percentile(.95), + Averaged::True); + //} + + self->periodicMetrics.emplace_back( + ts + "Max Latency (ms, averaged)", 1000 * self->latencies.max(), Averaged::True); + self->periodicMetrics.emplace_back( + ts + "Max Row Read Latency (ms, averaged)", 1000 * self->readLatencies.max(), Averaged::True); + self->periodicMetrics.emplace_back( + ts + "Max Total Read Latency (ms, averaged)", 1000 * self->fullReadLatencies.max(), Averaged::True); + self->periodicMetrics.emplace_back( + ts + "Max GRV Latency (ms, averaged)", 1000 * self->GRVLatencies.max(), Averaged::True); + self->periodicMetrics.emplace_back( + ts + "Max Commit Latency (ms, averaged)", 1000 * self->commitLatencies.max(), Averaged::True); + } + last_ops = ops; + + // if(self->rampUpLoad) { + self->latencies.clear(); + self->readLatencies.clear(); + self->fullReadLatencies.clear(); + self->GRVLatencies.clear(); + 
self->commitLatencies.clear(); + //} + + self->readLatencyTotal = 0.0; + self->readLatencyCount = 0; + } + } + + ACTOR static Future logLatency(Future> f, + ContinuousSample* latencies, + double* totalLatency, + int* latencyCount, + EventMetricHandle readMetric, + bool shouldRecord) { + state double readBegin = now(); + Optional value = wait(f); + + double latency = now() - readBegin; + readMetric->readLatency = latency * 1e9; + readMetric->log(); + + if (shouldRecord) { + *totalLatency += latency; + ++*latencyCount; + latencies->addSample(latency); + } + return Void(); + } + + ACTOR template + Future readOp(Trans* tr, std::vector keys, SkewedReadWriteWorkload* self, bool shouldRecord) { + if (!keys.size()) + return Void(); + + std::vector> readers; + for (int op = 0; op < keys.size(); op++) { + ++self->totalReadsMetric; + readers.push_back(logLatency(tr->get(self->keyForIndex(keys[op])), + &self->readLatencies, + &self->readLatencyTotal, + &self->readLatencyCount, + self->readMetric, + shouldRecord)); + } + + wait(waitForAll(readers)); + return Void(); + } + + ACTOR static Future _setup(Database cx, SkewedReadWriteWorkload* self) { + if (!self->doSetup) + return Void(); + + state Promise loadTime; + state Promise>> ratesAtKeyCounts; + + wait(bulkSetup(cx, + self, + self->nodeCount, + loadTime, + self->insertionCountsToMeasure.empty(), + self->warmingDelay, + self->maxInsertRate, + self->insertionCountsToMeasure, + ratesAtKeyCounts)); + + self->loadTime = loadTime.getFuture().get(); + self->ratesAtKeyCounts = ratesAtKeyCounts.getFuture().get(); + + return Void(); + } + + void startReadWriteClients(Database cx, std::vector>& clients) { + clientBegin = now(); + for (int c = 0; c < actorCount; c++) { + Future worker; + if (useRYW) + worker = + randomReadWriteClient(cx, this, actorCount / transactionsPerSecond, c); + else + worker = randomReadWriteClient(cx, this, actorCount / transactionsPerSecond, c); + clients.push_back(worker); + } + } + + ACTOR static Future _start(Database cx, SkewedReadWriteWorkload* self) { + state std::vector> clients; + if (self->enableReadLatencyLogging) + clients.push_back(tracePeriodically(self)); + + wait(updateServerShards(cx, self)); + for (self->currentHotRound = 0; self->currentHotRound < self->skewRound; ++self->currentHotRound) { + self->setHotServers(); + self->startReadWriteClients(cx, clients); + wait(timeout(waitForAll(clients), self->testDuration / self->skewRound, Void())); + clients.clear(); + wait(delay(5.0) >> updateServerShards(cx, self)); + } + + return Void(); + } + + bool shouldRecord() { return shouldRecord(now()); } + + bool shouldRecord(double checkTime) { + double timeSinceStart = checkTime - clientBegin; + return timeSinceStart >= metricsStart && timeSinceStart < (metricsStart + metricsDuration); + } + + // calculate hot server count + void setHotServers() { + hotServerCount = ceil(hotServerFraction * serverShards.size()); + std::cout << "Choose " << hotServerCount << "/" << serverShards.size() << "/" << serverInterfaces.size() + << " hot servers: ["; + int begin = currentHotRound * hotServerCount; + for (int i = 0; i < hotServerCount; ++i) { + int idx = (begin + i) % serverShards.size(); + std::cout << serverInterfaces.at(serverShards[idx].first).address().toString() << ","; + } + std::cout << "]\n"; + } + + int64_t getRandomKeyFromHotServer(bool hotServerRead = true) { + ASSERT(hotServerCount > 0); + int begin = currentHotRound * hotServerCount; + if (!hotServerRead) { + begin += hotServerCount * (1.0 - hotReadWriteServerOverlap); // 
calculate non-overlap part offset + } + int idx = deterministicRandom()->randomInt(begin, begin + hotServerCount) % serverShards.size(); + int shardMax = std::min(serverShards[idx].second.size(), + (size_t)ceil(serverShards[idx].second.size() * hotServerShardFraction)); + int shardIdx = deterministicRandom()->randomInt(0, shardMax); + return deterministicRandom()->randomInt64(serverShards[idx].second[shardIdx].first, + serverShards[idx].second[shardIdx].second + 1); + } + + int64_t getRandomKey(uint64_t nodeCount, bool hotServerRead = true) { + auto random = deterministicRandom()->random01(); + if (hotServerFraction > 0) { + if ((hotServerRead && random < hotServerReadFrac) || (!hotServerRead && random < hotServerWriteFrac)) { + return getRandomKeyFromHotServer(hotServerRead); + } + } + return deterministicRandom()->randomInt64(0, nodeCount); + } + + ACTOR template + Future randomReadWriteClient(Database cx, SkewedReadWriteWorkload* self, double delay, int clientIndex) { + state double startTime = now(); + state double lastTime = now(); + state double GRVStartTime; + state UID debugID; + + loop { + wait(poisson(&lastTime, delay)); + + state double tstart = now(); + state bool aTransaction = deterministicRandom()->random01() > self->alpha; + + state std::vector keys; + state std::vector values; + state std::vector extra_ranges; + int reads = aTransaction ? self->readsPerTransactionA : self->readsPerTransactionB; + state int writes = aTransaction ? self->writesPerTransactionA : self->writesPerTransactionB; + for (int op = 0; op < reads; op++) + keys.push_back(self->getRandomKey(self->nodeCount)); + + values.reserve(writes); + for (int op = 0; op < writes; op++) + values.push_back(self->randomValue()); + + state Trans tr(cx); + + if (tstart - self->clientBegin > self->debugTime && + tstart - self->clientBegin <= self->debugTime + self->debugInterval) { + debugID = deterministicRandom()->randomUniqueID(); + tr.debugTransaction(debugID); + g_traceBatch.addEvent("TransactionDebug", debugID.first(), "ReadWrite.randomReadWriteClient.Before"); + } else { + debugID = UID(); + } + + self->transactionSuccessMetric->retries = 0; + self->transactionSuccessMetric->commitLatency = -1; + + loop { + try { + GRVStartTime = now(); + self->transactionFailureMetric->startLatency = -1; + + double grvLatency = now() - GRVStartTime; + self->transactionSuccessMetric->startLatency = grvLatency * 1e9; + self->transactionFailureMetric->startLatency = grvLatency * 1e9; + if (self->shouldRecord()) + self->GRVLatencies.addSample(grvLatency); + + state double readStart = now(); + wait(self->readOp(&tr, keys, self, self->shouldRecord())); + + double readLatency = now() - readStart; + if (self->shouldRecord()) + self->fullReadLatencies.addSample(readLatency); + + if (!writes) + break; + + for (int op = 0; op < writes; op++) + tr.set(self->keyForIndex(self->getRandomKey(self->nodeCount, false), false), values[op]); + + state double commitStart = now(); + wait(tr.commit()); + + double commitLatency = now() - commitStart; + self->transactionSuccessMetric->commitLatency = commitLatency * 1e9; + if (self->shouldRecord()) + self->commitLatencies.addSample(commitLatency); + + break; + } catch (Error& e) { + self->transactionFailureMetric->errorCode = e.code(); + self->transactionFailureMetric->log(); + + wait(tr.onError(e)); + + ++self->transactionSuccessMetric->retries; + ++self->totalRetriesMetric; + + if (self->shouldRecord()) + ++self->retries; + } + } + + if (debugID != UID()) + g_traceBatch.addEvent("TransactionDebug", 
debugID.first(), "ReadWrite.randomReadWriteClient.After"); + + tr = Trans(); + + double transactionLatency = now() - tstart; + self->transactionSuccessMetric->totalLatency = transactionLatency * 1e9; + self->transactionSuccessMetric->log(); + + if (self->shouldRecord()) { + if (aTransaction) + ++self->aTransactions; + else + ++self->bTransactions; + + self->latencies.addSample(transactionLatency); + } + } + } +}; + +WorkloadFactory SkewedReadWriteWorkloadFactory("SkewedReadWrite"); + +TEST_CASE("/KVWorkload/methods/ParseKeyForIndex") { + auto wk = SkewedReadWriteWorkload(WorkloadContext()); + for (int i = 0; i < 1000; ++i) { + auto idx = deterministicRandom()->randomInt64(0, wk.nodeCount); + Key k = wk.keyForIndex(idx); + auto parse = wk.indexForKey(k); + // std::cout << parse << " " << idx << "\n"; + ASSERT(parse == idx); + } + for (int i = 0; i < 1000; ++i) { + auto idx = deterministicRandom()->randomInt64(0, wk.nodeCount); + Key k = wk.keyForIndex(idx, true); + auto parse = wk.indexForKey(k, true); + ASSERT(parse == idx); + } + return Void(); +} \ No newline at end of file diff --git a/fdbserver/workloads/workloads.actor.h b/fdbserver/workloads/workloads.actor.h index bdbbd5707c..df08911fc7 100644 --- a/fdbserver/workloads/workloads.actor.h +++ b/fdbserver/workloads/workloads.actor.h @@ -131,6 +131,8 @@ struct KVWorkload : TestWorkload { Key getRandomKey(bool absent) const; Key keyForIndex(uint64_t index) const; Key keyForIndex(uint64_t index, bool absent) const; + // the reverse process of keyForIndex() without division. Set absent=true to ignore the last byte in Key + int64_t indexForKey(const KeyRef& key, bool absent = false) const; }; struct IWorkloadFactory : ReferenceCounted { From 77b930be153e20554a49e9bb43e6f565a9cb144e Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Fri, 6 May 2022 11:32:28 +0200 Subject: [PATCH 132/299] Upgrade Tests: Avoid race conditions when copying library files from a local repo --- tests/TestRunner/upgrade_test.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/TestRunner/upgrade_test.py b/tests/TestRunner/upgrade_test.py index 5250466ac1..569786d072 100755 --- a/tests/TestRunner/upgrade_test.py +++ b/tests/TestRunner/upgrade_test.py @@ -259,10 +259,14 @@ class UpgradeTest: dest_lib_file = self.download_dir.joinpath(version, "libfdb_c.so") if dest_lib_file.exists(): return + # Avoid race conditions in case of parallel test execution by first copying to a temporary file + # and then renaming it atomically + dest_file_tmp = Path("{}.{}".format(str(dest_lib_file), random_secret_string(8))) src_lib_file = self.local_binary_repo.joinpath(version, "lib", "libfdb_c-{}.so".format(version)) assert src_lib_file.exists(), "Missing file {} in the local old binaries repository".format(src_lib_file) self.download_dir.joinpath(version).mkdir(parents=True, exist_ok=True) - shutil.copyfile(src_lib_file, dest_lib_file) + shutil.copyfile(src_lib_file, dest_file_tmp) + os.rename(dest_file_tmp, dest_lib_file) assert dest_lib_file.exists(), "{} does not exist".format(dest_lib_file) # Download all old binaries required for testing the specified upgrade path From 767a37f7d22dd8bf9bb485ddfcfef773081ec7cb Mon Sep 17 00:00:00 2001 From: Junhyun Shim Date: Thu, 5 May 2022 18:06:38 +0200 Subject: [PATCH 133/299] Helper functions to generate certs and keys for TLS testing --- flow/CMakeLists.txt | 2 + flow/MkCert.cpp | 369 ++++++++++++++++++++++++++++++++++++++++++++ flow/MkCert.h | 184 ++++++++++++++++++++++ 3 files changed, 555 insertions(+) create 
mode 100644 flow/MkCert.cpp create mode 100644 flow/MkCert.h diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index bf390d4b88..9548dc81c2 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -46,6 +46,8 @@ set(FLOW_SRCS Knobs.cpp Knobs.h MetricSample.h + MkCert.h + MkCert.cpp Net2.actor.cpp Net2Packet.cpp Net2Packet.h diff --git a/flow/MkCert.cpp b/flow/MkCert.cpp new file mode 100644 index 0000000000..71dd67f88f --- /dev/null +++ b/flow/MkCert.cpp @@ -0,0 +1,369 @@ +/* + * MkCert.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flow/Arena.h" +#include "flow/IRandom.h" +#include "flow/MkCert.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +template +class ExitGuard { + std::decay_t fn; + +public: + ExitGuard(Func&& fn) : fn(std::forward(fn)) {} + + ~ExitGuard() { fn(); } +}; + +[[noreturn]] void traceAndThrow(const char* condition, const char* file, int line) { + fprintf(stderr, "Failed condition check %s at %s:%d\n", condition, file, line); + auto te = TraceEvent(SevWarnAlways, "ErrorTLSKeyOrCertGen"); + te.suppressFor(60) + .detail("File", file) + .detail("Line", line) + .detail("Condition", condition); + if (auto err = ::ERR_get_error()) { + char buf[256]{ + 0, + }; + ::ERR_error_string_n(err, buf, sizeof(buf)); + te.detail("OpenSSLError", buf); + fprintf(stderr, "OpenSSL error: %s\n", buf); + } + throw tls_error(); +} + +} // anonymous namespace + +#define OSSL_ASSERT(condition) \ + do { \ + if (!(condition)) \ + traceAndThrow(#condition, __FILE__, __LINE__); \ + } while (false) + +namespace mkcert { + +// Helper functions working with OpenSSL native types +std::shared_ptr readX509CertPem(StringRef x509CertPem); +std::shared_ptr readPrivateKeyPem(StringRef privateKeyPem); +std::shared_ptr readPrivateKeyPem(StringRef privateKeyPem); +std::shared_ptr makeEllipticCurveKeyPairNative(); +StringRef writeX509CertPem(Arena& arena, const std::shared_ptr& nativeCert); +StringRef writePrivateKeyPem(Arena& arena, const std::shared_ptr& nativePrivateKey); + +struct CertAndKeyNative { + std::shared_ptr cert; + std::shared_ptr privateKey; + bool valid() const noexcept { return cert && privateKey; } + // self-signed cert case + bool null() const noexcept { return !cert && !privateKey; } + using SelfType = CertAndKeyNative; + using PemType = CertAndKeyRef; + + static SelfType fromPem(PemType certAndKey) { + auto ret = SelfType{}; + if (certAndKey.empty()) + return ret; + auto [certPem, keyPem] = certAndKey; + // either both set or both unset + ASSERT(!certPem.empty() && !keyPem.empty()); + ret.cert = readX509CertPem(certPem); + ret.privateKey = readPrivateKeyPem(keyPem); + return ret; + } + + PemType toPem(Arena& arena) { + auto ret = PemType{}; + if (null()) return ret; + ASSERT(valid()); + ret.certPem = writeX509CertPem(arena, cert); + 
ret.privateKeyPem = writePrivateKeyPem(arena, privateKey); + return ret; + } +}; + +CertAndKeyNative makeCertNative(CertSpecRef spec, CertAndKeyNative issuer); + +void printCert(FILE* out, StringRef certPem) { + auto x = readX509CertPem(certPem); + OSSL_ASSERT(0 < ::X509_print_fp(out, x.get())); +} + +void printPrivateKey(FILE* out, StringRef privateKeyPem) { + auto key = readPrivateKeyPem(privateKeyPem); + auto bio = ::BIO_new_fp(out, BIO_NOCLOSE); + OSSL_ASSERT(bio); + auto bioGuard = ExitGuard([bio]() { ::BIO_free(bio); }); + OSSL_ASSERT(0 < ::EVP_PKEY_print_private(bio, key.get(), 0, nullptr)); +} + +std::shared_ptr makeEllipticCurveKeyPairNative() { + auto params = std::add_pointer_t(); + { + auto pctx = ::EVP_PKEY_CTX_new_id(EVP_PKEY_EC, nullptr); + OSSL_ASSERT(pctx); + auto ctxGuard = ExitGuard([pctx]() { ::EVP_PKEY_CTX_free(pctx); }); + OSSL_ASSERT(0 < ::EVP_PKEY_paramgen_init(pctx)); + OSSL_ASSERT(0 < ::EVP_PKEY_CTX_set_ec_paramgen_curve_nid(pctx, NID_X9_62_prime256v1)); + OSSL_ASSERT(0 < ::EVP_PKEY_paramgen(pctx, ¶ms)); + OSSL_ASSERT(params); + } + auto paramsGuard = ExitGuard([params]() { ::EVP_PKEY_free(params); }); + // keygen + auto kctx = ::EVP_PKEY_CTX_new(params, nullptr); + OSSL_ASSERT(kctx); + auto kctxGuard = ExitGuard([kctx]() { ::EVP_PKEY_CTX_free(kctx); }); + auto key = std::add_pointer_t(); + OSSL_ASSERT(0 < ::EVP_PKEY_keygen_init(kctx)); + OSSL_ASSERT(0 < ::EVP_PKEY_keygen(kctx, &key)); + OSSL_ASSERT(key); + return std::shared_ptr(key, &::EVP_PKEY_free); +} + +std::shared_ptr readX509CertPem(StringRef x509CertPem) { + ASSERT(!x509CertPem.empty()); + auto bio_mem = ::BIO_new_mem_buf(x509CertPem.begin(), x509CertPem.size()); + OSSL_ASSERT(bio_mem); + auto bioGuard = ExitGuard([bio_mem]() { ::BIO_free(bio_mem); }); + auto ret = ::PEM_read_bio_X509(bio_mem, nullptr, nullptr, nullptr); + OSSL_ASSERT(ret); + return std::shared_ptr(ret, &::X509_free); +} + +std::shared_ptr readPrivateKeyPem(StringRef privateKeyPem) { + ASSERT(!privateKeyPem.empty()); + auto bio_mem = ::BIO_new_mem_buf(privateKeyPem.begin(), privateKeyPem.size()); + OSSL_ASSERT(bio_mem); + auto bioGuard = ExitGuard([bio_mem]() { ::BIO_free(bio_mem); }); + auto ret = ::PEM_read_bio_PrivateKey(bio_mem, nullptr, nullptr, nullptr); + OSSL_ASSERT(ret); + return std::shared_ptr(ret, &::EVP_PKEY_free); +} + +StringRef writeX509CertPem(Arena& arena, const std::shared_ptr& nativeCert) { + auto mem = ::BIO_new(::BIO_s_secmem()); + OSSL_ASSERT(mem); + auto memGuard = ExitGuard([mem]() { ::BIO_free(mem); }); + OSSL_ASSERT(::PEM_write_bio_X509(mem, nativeCert.get())); + auto bioBuf = std::add_pointer_t{}; + auto const len = ::BIO_get_mem_data(mem, &bioBuf); + ASSERT_GT(len, 0); + auto buf = new (arena) uint8_t[len]; + ::memcpy(buf, bioBuf, len); + return StringRef(buf, static_cast(len)); +} + +StringRef writePrivateKeyPem(Arena& arena, const std::shared_ptr& nativePrivateKey) { + auto mem = ::BIO_new(::BIO_s_secmem()); + OSSL_ASSERT(mem); + auto memGuard = ExitGuard([mem]() { ::BIO_free(mem); }); + OSSL_ASSERT(::PEM_write_bio_PrivateKey(mem, nativePrivateKey.get(), nullptr, nullptr, 0, 0, nullptr)); + auto bioBuf = std::add_pointer_t{}; + auto const len = ::BIO_get_mem_data(mem, &bioBuf); + ASSERT_GT(len, 0); + auto buf = new (arena) uint8_t[len]; + ::memcpy(buf, bioBuf, len); + return StringRef(buf, static_cast(len)); +} + +KeyPairRef KeyPairRef::make(Arena& arena) { + auto keypair = makeEllipticCurveKeyPairNative(); + auto ret = KeyPairRef{}; + { + auto len = 0; + len = ::i2d_PrivateKey(keypair.get(), nullptr); + 
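+ // Editor's note: this is OpenSSL's standard two-pass i2d_* idiom. A call with a
+ // null output pointer returns the required DER length; the second call below
+ // writes the bytes and advances `out` past the encoded data.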
ASSERT_LT(0, len); + auto buf = new (arena) uint8_t[len]; + auto out = std::add_pointer_t(buf); + len = ::i2d_PrivateKey(keypair.get(), &out); + ret.privateKeyDer = StringRef(buf, len); + } + { + auto len = 0; + len = ::i2d_PUBKEY(keypair.get(), nullptr); + ASSERT_LT(0, len); + auto buf = new (arena) uint8_t[len]; + auto out = std::add_pointer_t(buf); + len = ::i2d_PUBKEY(keypair.get(), &out); + ret.publicKeyDer = StringRef(buf, len); + } + return ret; +} + +CertAndKeyNative makeCertNative(CertSpecRef spec, CertAndKeyNative issuer) { + // issuer key/cert must be both set or both null (self-signed case) + ASSERT(issuer.valid() || issuer.null()); + + auto const isSelfSigned = issuer.null(); + auto nativeKeyPair = makeEllipticCurveKeyPairNative(); + auto newX = ::X509_new(); + OSSL_ASSERT(newX); + auto x509Guard = ExitGuard([&newX]() { if (newX) ::X509_free(newX); }); + auto smartX = std::shared_ptr(newX, &::X509_free); + newX = nullptr; + auto x = smartX.get(); + OSSL_ASSERT(0 < ::X509_set_version(x, 2/*X509_VERSION_3*/)); + auto serialPtr = ::X509_get_serialNumber(x); + OSSL_ASSERT(serialPtr); + OSSL_ASSERT(0 < ::ASN1_INTEGER_set(serialPtr, spec.serialNumber)); + auto notBefore = ::X509_getm_notBefore(x); + OSSL_ASSERT(notBefore); + OSSL_ASSERT(::X509_gmtime_adj(notBefore, spec.offsetNotBefore)); + auto notAfter = ::X509_getm_notAfter(x); + OSSL_ASSERT(notAfter); + OSSL_ASSERT(::X509_gmtime_adj(notAfter, spec.offsetNotAfter)); + OSSL_ASSERT(0 < ::X509_set_pubkey(x, nativeKeyPair.get())); + auto subjectName = ::X509_get_subject_name(x); + OSSL_ASSERT(subjectName); + for (const auto& entry : spec.subjectName) { + // field names are expected to null-terminate + auto fieldName = entry.field.toString(); + OSSL_ASSERT(0 < ::X509_NAME_add_entry_by_txt(subjectName, fieldName.c_str(), MBSTRING_ASC, entry.bytes.begin(), entry.bytes.size(), -1, 0)); + } + auto issuerName = ::X509_get_issuer_name(x); + OSSL_ASSERT(issuerName); + OSSL_ASSERT(::X509_set_issuer_name(x, (isSelfSigned ? subjectName : ::X509_get_subject_name(issuer.cert.get())))); + auto ctx = X509V3_CTX{}; + X509V3_set_ctx_nodb(&ctx); + ::X509V3_set_ctx(&ctx, (isSelfSigned ? x : issuer.cert.get()), x, nullptr, nullptr, 0); + for (const auto& entry : spec.extensions) { + // extension field names and values are expected to null-terminate + auto extName = entry.field.toString(); + auto extValue = entry.bytes.toString(); + auto ext = ::X509V3_EXT_conf(nullptr, &ctx, extName.c_str(), extValue.c_str()); + OSSL_ASSERT(ext); + auto extGuard = ExitGuard([ext]() { ::X509_EXTENSION_free(ext); }); + OSSL_ASSERT(::X509_add_ext(x, ext, -1)); + } + OSSL_ASSERT(::X509_sign(x, (isSelfSigned ? 
+	auto ret = CertAndKeyNative{};
+	ret.cert = smartX;
+	ret.privateKey = nativeKeyPair;
+	return ret;
+}
+
+CertAndKeyRef CertAndKeyRef::make(Arena& arena, CertSpecRef spec, CertAndKeyRef issuerPem) {
+	auto issuer = CertAndKeyNative::fromPem(issuerPem);
+	auto newCertAndKey = makeCertNative(spec, issuer);
+	return newCertAndKey.toPem(arena);
+}
+
+CertSpecRef CertSpecRef::make(Arena& arena, CertKind kind) {
+	auto spec = CertSpecRef{};
+	spec.serialNumber = static_cast<long>(deterministicRandom()->randomInt64(0, 1e10));
+	spec.offsetNotBefore = 0; // now
+	spec.offsetNotAfter = 60 * 60 * 24 * 365; // 1 year from now
+	auto& subject = spec.subjectName;
+	subject.push_back(arena, {"countryName"_sr, "DE"_sr});
+	subject.push_back(arena, {"localityName"_sr, "Berlin"_sr});
+	subject.push_back(arena, {"organizationName"_sr, "FoundationDB"_sr});
+	subject.push_back(arena, {"commonName"_sr, kind.getCommonName("FDB Testing Services"_sr, arena)});
+	auto& ext = spec.extensions;
+	if (kind.isCA()) {
+		ext.push_back(arena, {"basicConstraints"_sr, "critical, CA:TRUE"_sr});
+		ext.push_back(arena, {"keyUsage"_sr, "critical, digitalSignature, keyCertSign, cRLSign"_sr});
+	} else {
+		ext.push_back(arena, {"basicConstraints"_sr, "critical, CA:FALSE"_sr});
+		ext.push_back(arena, {"keyUsage"_sr, "critical, digitalSignature, keyEncipherment"_sr});
+		ext.push_back(arena, {"extendedKeyUsage"_sr, "serverAuth, clientAuth"_sr});
+	}
+	ext.push_back(arena, {"subjectKeyIdentifier"_sr, "hash"_sr});
+	if (!kind.isRootCA())
+		ext.push_back(arena, {"authorityKeyIdentifier"_sr, "keyid, issuer"_sr});
+	return spec;
+}
+
+StringRef concatCertChain(Arena& arena, CertChainRef chain) {
+	auto len = 0;
+	for (const auto& entry : chain) {
+		len += entry.certPem.size();
+	}
+	if (len == 0) return StringRef();
+	auto buf = new (arena) uint8_t[len];
+	auto offset = 0;
+	for (auto const& entry : chain) {
+		::memcpy(&buf[offset], entry.certPem.begin(), entry.certPem.size());
+		offset += entry.certPem.size();
+	}
+	UNSTOPPABLE_ASSERT(offset == len);
+	return StringRef(buf, len);
+}
+
+CertChainRef makeCertChain(Arena& arena, VectorRef<CertSpecRef> specs, CertAndKeyRef rootAuthority) {
+	ASSERT_GT(specs.size(), 0);
+	// if rootAuthority is empty, use last element in specs to make root CA
+	auto const needRootCA = rootAuthority.empty();
+	if (needRootCA) {
+		int const chainLength = specs.size();
+		auto chain = new (arena) CertAndKeyRef[chainLength];
+		auto caNative = makeCertNative(specs.back(), CertAndKeyNative{}/* empty issuer == self-signed */);
+		chain[chainLength-1] = caNative.toPem(arena);
+		for (auto i = chainLength - 2; i >= 0; i--) {
+			auto cnkNative = makeCertNative(specs[i], caNative);
+			chain[i] = cnkNative.toPem(arena);
+			caNative = cnkNative;
+		}
+		return CertChainRef(chain, chainLength);
+	} else {
+		int const chainLength = specs.size() + 1; /* account for deep-copied rootAuthority */
+		auto chain = new (arena) CertAndKeyRef[chainLength];
+		auto caNative = CertAndKeyNative::fromPem(rootAuthority);
+		chain[chainLength-1] = rootAuthority.deepCopy(arena);
+		for (auto i = chainLength - 2; i >= 0; i--) {
+			auto cnkNative = makeCertNative(specs[i], caNative);
+			chain[i] = cnkNative.toPem(arena);
+			caNative = cnkNative;
+		}
+		return CertChainRef(chain, chainLength);
+	}
+}
+
+CertChainRef makeCertChain(Arena& arena, unsigned length, ESide side) {
+	if (!length) return {};
+	// temporary arena for writing up specs
+	auto tmpArena = Arena();
+	auto specs = new (tmpArena) CertSpecRef[length];
+	auto const isServerSide = side == ESide::Server;
+	for (auto i = 0u; i < length; i++) {
+		auto kind = CertKind{};
+		if (i == 0u)
+			kind = isServerSide ? CertKind(Server{}) : CertKind(Client{});
+		else if (i == length - 1)
+			kind = isServerSide ? CertKind(ServerRootCA{}) : CertKind(ClientRootCA{});
+		else
+			kind = isServerSide ? CertKind(ServerIntermediateCA{i}) : CertKind(ClientIntermediateCA{i});
+		specs[i] = CertSpecRef::make(tmpArena, kind);
+	}
+	return makeCertChain(arena, VectorRef<CertSpecRef>(specs, length), {}/*root*/);
+}
+
+} // namespace mkcert
diff --git a/flow/MkCert.h b/flow/MkCert.h
new file mode 100644
index 0000000000..da83da40b9
--- /dev/null
+++ b/flow/MkCert.h
@@ -0,0 +1,184 @@
+/*
+ * MkCert.h
+ *
+ * This source file is part of the FoundationDB open source project
+ *
+ * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MKCERT_H
+#define MKCERT_H
+
+#include "flow/Arena.h"
+#include "flow/Error.h"
+#include <fmt/format.h>
+#include <string>
+#include <type_traits>
+#include <variant>
+
+namespace mkcert {
+
+void printCert(FILE* out, StringRef certPem);
+
+void printPrivateKey(FILE* out, StringRef privateKeyPem);
+
+struct KeyPairRef {
+	using SelfType = KeyPairRef;
+
+	// Make new Elliptic Curve private-public key pair in DER
+	static SelfType make(Arena& arena);
+
+	StringRef privateKeyDer;
+	StringRef publicKeyDer;
+};
+
+struct Asn1EntryRef {
+	// field must match one of ASN.1 object short/long names: e.g. "C", "countryName", "CN", "commonName", "subjectAltName", ...
+	StringRef field;
+	StringRef bytes;
+};
+
+struct ServerRootCA {};
+struct ServerIntermediateCA { unsigned level; };
+struct Server {};
+struct ClientRootCA {};
+struct ClientIntermediateCA { unsigned level; };
+struct Client {};
+
+struct CertKind {
+
+	CertKind() noexcept = default;
+
+	template <class Kind>
+	CertKind(Kind kind) noexcept :
+		value(std::in_place_type<Kind>, kind)
+	{}
+
+	template <class Kind>
+	bool is() const noexcept { return std::holds_alternative<Kind>(value); }
+
+	template <class Kind>
+	Kind const& get() const { return std::get<Kind>(value); }
+
+	bool isServerSide() const noexcept {
+		return is<ServerRootCA>() || is<ServerIntermediateCA>() || is<Server>();
+	}
+
+	bool isClientSide() const noexcept {
+		return !isServerSide();
+	}
+
+	bool isRootCA() const noexcept {
+		return is<ServerRootCA>() || is<ClientRootCA>();
+	}
+
+	bool isIntermediateCA() const noexcept {
+		return is<ServerIntermediateCA>() || is<ClientIntermediateCA>();
+	}
+
+	bool isLeaf() const noexcept {
+		return is<Server>() || is<Client>();
+	}
+
+	bool isCA() const noexcept {
+		return !isLeaf();
+	}
+
+	StringRef getCommonName(StringRef prefix, Arena& arena) const {
+		auto const side = std::string(isClientSide() ? " Client" : " Server");
+		if (isIntermediateCA()) {
+			auto const level = isClientSide() ?
get<ClientIntermediateCA>().level
+			                   : get<ServerIntermediateCA>().level;
+			return prefix.withSuffix(
+					fmt::format("{} Intermediate {}", side, level),
+					arena);
+		} else if (isRootCA()) {
+			return prefix.withSuffix(fmt::format("{} Root", side), arena);
+		} else {
+			return prefix.withSuffix(side, arena);
+		}
+	}
+
+	std::variant<ServerRootCA, ServerIntermediateCA, Server, ClientRootCA, ClientIntermediateCA, Client> value;
+};
+
+struct CertSpecRef {
+	using SelfType = CertSpecRef;
+	long serialNumber;
+	// offset in number of seconds relative to now, i.e. cert creation
+	long offsetNotBefore;
+	long offsetNotAfter;
+	VectorRef<Asn1EntryRef> subjectName;
+	// time offset relative to time of cert creation (now)
+	VectorRef<Asn1EntryRef> extensions;
+	// make test-only sample certificate whose fields are inferred from CertKind
+	static SelfType make(Arena& arena, CertKind kind);
+};
+
+struct CertAndKeyRef {
+	using SelfType = CertAndKeyRef;
+	StringRef certPem;
+	StringRef privateKeyPem;
+
+	void printCert(FILE* out) {
+		if (!certPem.empty()) {
+			::mkcert::printCert(out, certPem);
+		}
+	}
+
+	void printPrivateKey(FILE* out) {
+		if (!privateKeyPem.empty()) {
+			::mkcert::printPrivateKey(out, privateKeyPem);
+		}
+	}
+
+	bool empty() const noexcept {
+		return certPem.empty() && privateKeyPem.empty();
+	}
+
+	SelfType deepCopy(Arena& arena) {
+		auto ret = SelfType{};
+		if (!certPem.empty())
+			ret.certPem = StringRef(arena, certPem);
+		if (!privateKeyPem.empty())
+			ret.privateKeyPem = StringRef(arena, privateKeyPem);
+		return ret;
+	}
+
+	// Empty (default) issuer produces a self-signed certificate
+	static SelfType make(Arena& arena, CertSpecRef spec, CertAndKeyRef issuer);
+};
+
+using CertChainRef = VectorRef<CertAndKeyRef>;
+
+// Concatenate chain of PEMs to one StringRef
+StringRef concatCertChain(Arena& arena, CertChainRef chain);
+
+enum class ESide : int {
+	Server,
+	Client
+};
+
+// For empty (default) rootAuthority, the last item in specs is used to generate rootAuthority
+// Otherwise, rootAuthority is deep-copied to first element of returned chain
+CertChainRef makeCertChain(Arena& arena, VectorRef<CertSpecRef> specs, CertAndKeyRef rootAuthority);
+
+// Make stub cert chain of given length inc.
root authority +// Note: side does not imply anything different other than the choice of common names +CertChainRef makeCertChain(Arena& arena, unsigned depth, ESide side); + +} // namespace mkcert + +#endif /*MKCERT_H*/ From fab8f35683257f924667663ccd4d7270af1d144f Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Fri, 29 Apr 2022 16:37:59 +0200 Subject: [PATCH 134/299] Cluster wiggle test --- bindings/c/CMakeLists.txt | 21 ++ .../apitester/TesterTransactionExecutor.cpp | 4 +- tests/TestRunner/local_cluster.py | 192 +++++++++++++++--- tests/TestRunner/upgrade_test.py | 69 ++++--- 4 files changed, 225 insertions(+), 61 deletions(-) diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index 9258d384d9..ec600bf501 100644 --- a/bindings/c/CMakeLists.txt +++ b/bindings/c/CMakeLists.txt @@ -333,6 +333,27 @@ endif() --upgrade-path "7.0.0" "7.2.0" --process-number 3 ) + + add_test(NAME fdb_c_cluster_wiggle + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml + --upgrade-path "7.2.0" "wiggle" + --disable-log-dump + --process-number 3 + --redundancy double + ) + + add_test(NAME fdb_c_wiggle_and_upgrade + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml + --upgrade-path "7.0.0" "wiggle" "7.2.0" + --disable-log-dump + --process-number 3 + --redundancy double + ) + endif() endif() diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.cpp b/bindings/c/test/apitester/TesterTransactionExecutor.cpp index aae7c19b5e..e0cb8f93e2 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.cpp +++ b/bindings/c/test/apitester/TesterTransactionExecutor.cpp @@ -33,8 +33,8 @@ namespace FdbApiTester { -constexpr int LONG_WAIT_TIME_US = 1000000; -constexpr int LARGE_NUMBER_OF_RETRIES = 5; +constexpr int LONG_WAIT_TIME_US = 2000000; +constexpr int LARGE_NUMBER_OF_RETRIES = 10; void TransactionActorBase::complete(fdb_error_t err) { error = err; diff --git a/tests/TestRunner/local_cluster.py b/tests/TestRunner/local_cluster.py index defdead9ec..4e47319b0f 100644 --- a/tests/TestRunner/local_cluster.py +++ b/tests/TestRunner/local_cluster.py @@ -7,6 +7,10 @@ import os import socket import time +CLUSTER_UPDATE_TIMEOUT_SEC = 10 +EXCLUDE_SERVERS_TIMEOUT_SEC = 120 +RETRY_INTERVAL_SEC = 0.5 + def _get_free_port_internal(): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: @@ -94,6 +98,7 @@ logdir = {logdir} port=None, ip_address=None, blob_granules_enabled: bool = False, + redundancy: str = "single" ): self.basedir = Path(basedir) self.etc = self.basedir.joinpath("etc") @@ -110,6 +115,7 @@ logdir = {logdir} self.log.mkdir(exist_ok=True) self.data.mkdir(exist_ok=True) self.process_number = process_number + self.redundancy = redundancy self.ip_address = "127.0.0.1" if ip_address is None else ip_address self.first_port = port self.blob_granules_enabled = blob_granules_enabled @@ -119,7 +125,9 @@ logdir = {logdir} if self.first_port is not None: self.last_used_port = int(self.first_port) - 1 - self.server_ports = [self.__next_port() for _ in range(self.process_number)] + self.server_ports = {server_id: self.__next_port() for server_id in range(self.process_number)} + self.server_by_port = {port: server_id for server_id, port in self.server_ports.items()} + self.next_server_id = 
self.process_number self.cluster_desc = random_secret_string(8) self.cluster_secret = random_secret_string(8) self.env_vars = {} @@ -127,6 +135,8 @@ logdir = {logdir} self.process = None self.fdbmonitor_logfile = None self.use_legacy_conf_syntax = False + self.coordinators = set() + self.active_servers = set(self.server_ports.keys()) if create_config: self.create_cluster_file() @@ -163,11 +173,15 @@ logdir = {logdir} # E.g., port = 4000, process_number = 5 # Then 4000,4001,4002,4003,4004 will be used as ports # If port number is not given, we will randomly pick free ports - for port in self.server_ports: - f.write("[fdbserver.{server_port}]\n".format(server_port=port)) + for server_id in self.active_servers: + f.write("[fdbserver.{server_port}]\n".format(server_port=self.server_ports[server_id])) + if self.use_legacy_conf_syntax: + f.write("machine_id = {}\n".format(server_id)) + else: + f.write("machine-id = {}\n".format(server_id)) if self.blob_granules_enabled: # make last process a blob_worker class - f.write("class = blob_worker") + f.write("class = blob_worker\n") f.flush() os.fsync(f.fileno()) @@ -183,6 +197,7 @@ logdir = {logdir} server_port=self.server_ports[0], ) ) + self.coordinators = {0} def start_cluster(self): assert not self.running, "Can't start a server that is already running" @@ -212,7 +227,8 @@ logdir = {logdir} sec = 0 while sec < timeout_sec: in_use = False - for port in self.server_ports: + for server_id in self.active_servers: + port = self.server_ports[server_id] if is_port_in_use(port): print("Port {} in use. Waiting for it to be released".format(port)) in_use = True @@ -230,37 +246,60 @@ logdir = {logdir} def __exit__(self, xc_type, exc_value, traceback): self.stop_cluster() + def __fdbcli_exec(self, cmd, stdout, stderr, timeout): + args = [self.fdbcli_binary, "-C", self.cluster_file, "--exec", cmd] + res = subprocess.run(args, env=self.process_env(), stderr=stderr, stdout=stdout, timeout=timeout) + assert res.returncode == 0, "fdbcli command {} failed with {}".format(cmd, res.returncode) + return res.stdout + + # Execute a fdbcli command + def fdbcli_exec(self, cmd, timeout=None): + self.__fdbcli_exec(cmd, None, None, timeout) + + # Execute a fdbcli command and return its output + def fdbcli_exec_and_get(self, cmd, timeout=None): + return self.__fdbcli_exec(cmd, subprocess.PIPE, None, timeout) + def create_database(self, storage="ssd", enable_tenants=True): - db_config = "configure new single {}".format(storage) + db_config = "configure new {} {}".format(self.redundancy, storage) if enable_tenants: db_config += " tenant_mode=optional_experimental" if self.blob_granules_enabled: db_config += " blob_granules_enabled:=1" - args = [self.fdbcli_binary, "-C", self.cluster_file, "--exec", db_config] - - res = subprocess.run(args, env=self.process_env()) - assert res.returncode == 0, "Create database failed with {}".format( - res.returncode - ) + self.fdbcli_exec(db_config) if self.blob_granules_enabled: - bg_args = [ - self.fdbcli_binary, - "-C", - self.cluster_file, - "--exec", - "blobrange start \\x00 \\xff", - ] - bg_res = subprocess.run(bg_args, env=self.process_env()) - assert bg_res.returncode == 0, "Start blob granules failed with {}".format( - bg_res.returncode - ) + self.fdbcli_exec("blobrange start \\x00 \\xff") + # Get cluster status using fdbcli def get_status(self): - args = [self.fdbcli_binary, "-C", self.cluster_file, "--exec", "status json"] - res = subprocess.run(args, env=self.process_env(), stdout=subprocess.PIPE) - assert res.returncode == 0, "Get 
status failed with {}".format(res.returncode)
-        return json.loads(res.stdout)
+        status_output = self.fdbcli_exec_and_get("status json")
+        return json.loads(status_output)
+
+    # Get the set of servers from the cluster status matching the given filter
+    def get_servers_from_status(self, filter):
+        status = self.get_status()
+        if "processes" not in status["cluster"]:
+            return set()
+
+        servers_found = set()
+        addresses = [proc_info["address"] for proc_info in status["cluster"]["processes"].values() if filter(proc_info)]
+        for addr in addresses:
+            port = int(addr.split(":", 1)[1])
+            assert port in self.server_by_port, "Unknown server port {}".format(port)
+            servers_found.add(self.server_by_port[port])
+
+        return servers_found
+
+    # Get the set of all servers from the cluster status
+    def get_all_servers_from_status(self):
+        return self.get_servers_from_status(lambda _: True)
+
+    # Get the set of all servers with coordinator role from the cluster status
+    def get_coordinators_from_status(self):
+        def is_coordinator(proc_status):
+            return any(entry["role"] == "coordinator" for entry in proc_status["roles"])
+        return self.get_servers_from_status(is_coordinator)
 
     def process_env(self):
         env = dict(os.environ)
@@ -269,3 +308,102 @@ logdir = {logdir}
 
     def set_env_var(self, var_name, var_val):
         self.env_vars[var_name] = var_val
+
+    # Add a new server process to the cluster and return its ID
+    # Need to call save_config to apply the changes
+    def add_server(self):
+        server_id = self.next_server_id
+        assert server_id not in self.server_ports, "Server ID {} is already in use".format(server_id)
+        self.next_server_id += 1
+        port = self.__next_port()
+        self.server_ports[server_id] = port
+        self.server_by_port[port] = server_id
+        self.active_servers.add(server_id)
+        return server_id
+
+    # Remove the server with the given ID from the cluster
+    # Need to call save_config to apply the changes
+    def remove_server(self, server_id):
+        assert server_id in self.active_servers, "Server {} does not exist".format(server_id)
+        self.active_servers.remove(server_id)
+
+    # Wait until changes to the set of servers (additions & removals) are applied
+    def wait_for_server_update(self, timeout=CLUSTER_UPDATE_TIMEOUT_SEC):
+        time_limit = time.time() + timeout
+        servers_found = set()
+        while time.time() <= time_limit:
+            servers_found = self.get_all_servers_from_status()
+            if servers_found == self.active_servers:
+                break
+            time.sleep(RETRY_INTERVAL_SEC)
+        assert servers_found == self.active_servers, "Failed to apply server changes after {}sec. Expected: {}, Actual: {}".format(
+            timeout, self.active_servers, servers_found)
+
+    # Apply changes to the set of the coordinators, based on the current value of self.coordinators
+    def update_coordinators(self):
+        urls = ["{}:{}".format(self.ip_address, self.server_ports[id]) for id in self.coordinators]
+        self.fdbcli_exec("coordinators {}".format(" ".join(urls)))
+
+    # Wait until the changes to the set of the coordinators are applied
+    def wait_for_coordinator_update(self, timeout=CLUSTER_UPDATE_TIMEOUT_SEC):
+        time_limit = time.time() + timeout
+        coord_found = set()
+        while time.time() <= time_limit:
+            coord_found = self.get_coordinators_from_status()
+            if coord_found == self.coordinators:
+                break
+            time.sleep(RETRY_INTERVAL_SEC)
+        assert coord_found == self.coordinators, "Failed to apply coordinator changes after {}sec.
Expected: {}, Actual: {}".format( + timeout, self.coordinators, coord_found) + # Check if the cluster file was successfully updated too + connection_string = open(self.cluster_file, "r").read() + for server_id in self.coordinators: + assert connection_string.find(str(self.server_ports[server_id])) != -1, \ + "Missing coordinator {} port {} in the cluster file".format(server_id, self.server_ports[server_id]) + + # Exclude the servers with the given ID from the cluster, i.e. move out their data + # The method waits until the changes are applied + def exclude_servers(self, server_ids): + urls = ["{}:{}".format(self.ip_address, self.server_ports[id]) for id in server_ids] + self.fdbcli_exec("exclude FORCE {}".format(" ".join(urls)), timeout=EXCLUDE_SERVERS_TIMEOUT_SEC) + + # Perform a cluster wiggle: replace all servers with new ones + def cluster_wiggle(self): + old_servers = self.active_servers.copy() + new_servers = set() + print("Starting cluster wiggle") + print("Old servers: {} on ports {}".format(old_servers, [ + self.server_ports[server_id] for server_id in old_servers])) + print("Old coordinators: {}".format(self.coordinators)) + + # Step 1: add new servers + start_time = time.time() + for _ in range(len(old_servers)): + new_servers.add(self.add_server()) + print("New servers: {} on ports {}".format(new_servers, [ + self.server_ports[server_id] for server_id in new_servers])) + self.save_config() + self.wait_for_server_update() + print("New servers successfully added to the cluster. Time: {}s".format(time.time()-start_time)) + + # Step 2: change coordinators + start_time = time.time() + new_coordinators = set(random.sample(new_servers, len(self.coordinators))) + print("New coordinators: {}".format(new_coordinators)) + self.coordinators = new_coordinators.copy() + self.update_coordinators() + self.wait_for_coordinator_update() + print("Coordinators successfully changed. Time: {}s".format(time.time()-start_time)) + + # Step 3: exclude old servers from the cluster, i.e. move out their data + start_time = time.time() + self.exclude_servers(old_servers) + print("Old servers successfully excluded from the cluster. Time: {}s".format(time.time()-start_time)) + + # Step 4: remove the old servers + start_time = time.time() + for server_id in old_servers: + self.remove_server(server_id) + self.save_config() + self.wait_for_server_update() + print("Old servers successfully removed from the cluster. 
Time: {}s".format(time.time()-start_time)) diff --git a/tests/TestRunner/upgrade_test.py b/tests/TestRunner/upgrade_test.py index 5250466ac1..d88cf53acb 100755 --- a/tests/TestRunner/upgrade_test.py +++ b/tests/TestRunner/upgrade_test.py @@ -65,6 +65,7 @@ SUPPORTED_VERSIONS = [ "5.1.7", "5.1.6", ] +CLUSTER_ACTIONS = ["wiggle"] FDB_DOWNLOAD_ROOT = "https://github.com/apple/foundationdb/releases/download/" LOCAL_OLD_BINARY_REPO = "/opt/foundationdb/old/" CURRENT_VERSION = "7.2.0" @@ -128,19 +129,15 @@ def read_to_str(filename): class UpgradeTest: def __init__( self, - build_dir: str, - upgrade_path: list, - process_number: int = 1, - port: str = None, + args ): - self.build_dir = Path(build_dir).resolve() - assert self.build_dir.exists(), "{} does not exist".format(build_dir) - assert self.build_dir.is_dir(), "{} is not a directory".format(build_dir) - self.upgrade_path = upgrade_path - for version in upgrade_path: - assert version in SUPPORTED_VERSIONS, "Unsupported version {}".format( - version - ) + self.build_dir = Path(args.build_dir).resolve() + assert self.build_dir.exists(), "{} does not exist".format(args.build_dir) + assert self.build_dir.is_dir(), "{} is not a directory".format(args.build_dir) + self.upgrade_path = args.upgrade_path + self.used_versions = set(self.upgrade_path).difference(set(CLUSTER_ACTIONS)) + for version in self.used_versions: + assert version in SUPPORTED_VERSIONS, "Unsupported version or cluster action {}".format(version) self.platform = platform.machine() assert self.platform in SUPPORTED_PLATFORMS, "Unsupported platform {}".format( self.platform @@ -153,15 +150,15 @@ class UpgradeTest: self.local_binary_repo = None self.download_old_binaries() self.create_external_lib_dir() - init_version = upgrade_path[0] + init_version = self.upgrade_path[0] self.cluster = LocalCluster( self.tmp_dir, self.binary_path(init_version, "fdbserver"), self.binary_path(init_version, "fdbmonitor"), self.binary_path(init_version, "fdbcli"), - process_number, - port=port, + args.process_number, create_config=False, + redundancy=args.redundancy ) self.cluster.create_cluster_file() self.configure_version(init_version) @@ -267,7 +264,7 @@ class UpgradeTest: # Download all old binaries required for testing the specified upgrade path def download_old_binaries(self): - for version in self.upgrade_path: + for version in self.used_versions: if version == CURRENT_VERSION: continue @@ -293,7 +290,7 @@ class UpgradeTest: def create_external_lib_dir(self): self.external_lib_dir = self.tmp_dir.joinpath("client_libs") self.external_lib_dir.mkdir(parents=True) - for version in self.upgrade_path: + for version in self.used_versions: src_file_path = self.lib_dir(version).joinpath("libfdb_c.so") assert src_file_path.exists(), "{} does not exist".format(src_file_path) target_file_path = self.external_lib_dir.joinpath( @@ -313,7 +310,7 @@ class UpgradeTest: time.sleep(1) continue num_proc = len(status["cluster"]["processes"]) - if num_proc < self.cluster.process_number: + if num_proc != self.cluster.process_number: print( "Health check: {} of {} processes found. 
Retrying".format( num_proc, self.cluster.process_number @@ -321,11 +318,6 @@ class UpgradeTest: ) time.sleep(1) continue - assert ( - num_proc == self.cluster.process_number - ), "Number of processes: expected: {}, actual: {}".format( - self.cluster.process_number, num_proc - ) for (_, proc_stat) in status["cluster"]["processes"].items(): proc_ver = proc_stat["version"] assert ( @@ -370,7 +362,7 @@ class UpgradeTest: # Determine FDB API version matching the upgrade path def determine_api_version(self): self.api_version = api_version_from_str(CURRENT_VERSION) - for version in self.upgrade_path: + for version in self.used_versions: self.api_version = min(api_version_from_str(version), self.api_version) # Start the tester to generate the workload specified by the test file @@ -428,7 +420,6 @@ class UpgradeTest: os._exit(1) # Perform a progress check: Trigger it and wait until it is completed - def progress_check(self): self.progress_event.clear() os.write(self.ctrl_pipe, b"CHECK\n") @@ -464,11 +455,15 @@ class UpgradeTest: try: self.health_check() self.progress_check() - for version in self.upgrade_path[1:]: - random_sleep(0.0, 2.0) - self.upgrade_to(version) - self.health_check() - self.progress_check() + random_sleep(0.0, 2.0) + for entry in self.upgrade_path[1:]: + if entry == "wiggle": + self.cluster.cluster_wiggle() + else: + assert entry in self.used_versions, "Unexpected entry in the upgrade path: {}".format(entry) + self.upgrade_to(entry) + self.health_check() + self.progress_check() os.write(self.ctrl_pipe, b"STOP\n") finally: os.close(self.ctrl_pipe) @@ -611,7 +606,8 @@ if __name__ == "__main__": parser.add_argument( "--upgrade-path", nargs="+", - help="Cluster upgrade path: a space separated list of versions", + help="Cluster upgrade path: a space separated list of versions.\n" + + "The list may also contain cluster change actions: {}".format(CLUSTER_ACTIONS), default=[CURRENT_VERSION], ) parser.add_argument( @@ -626,6 +622,12 @@ if __name__ == "__main__": type=int, default=0, ) + parser.add_argument( + "--redundancy", + help="Database redundancy level (default: single)", + type=str, + default="single", + ) parser.add_argument( "--disable-log-dump", help="Do not dump cluster log on error", @@ -639,11 +641,14 @@ if __name__ == "__main__": args.process_number = random.randint(1, 5) print("Testing with {} processes".format(args.process_number)) + assert len(args.upgrade_path) > 0, "Upgrade path must be specified" + assert args.upgrade_path[0] in SUPPORTED_VERSIONS, "Upgrade path begin with a valid version number" + if args.run_with_gdb: RUN_WITH_GDB = True errcode = 1 - with UpgradeTest(args.build_dir, args.upgrade_path, args.process_number) as test: + with UpgradeTest(args) as test: print("log-dir: {}".format(test.log)) print("etc-dir: {}".format(test.etc)) print("data-dir: {}".format(test.data)) From 27c01133057e72cdf4cd883339713f65843502d9 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Fri, 6 May 2022 15:21:57 +0200 Subject: [PATCH 135/299] Upgrade Tests: Including 7.1 into the upgrade paths --- bindings/c/CMakeLists.txt | 8 ++++---- tests/TestRunner/upgrade_test.py | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index 9258d384d9..34ffa96d1f 100644 --- a/bindings/c/CMakeLists.txt +++ b/bindings/c/CMakeLists.txt @@ -306,7 +306,7 @@ endif() COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py --build-dir ${CMAKE_BINARY_DIR} --test-file 
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml
-          --upgrade-path "6.3.23" "7.0.0" "7.2.0"
+          --upgrade-path "6.3.23" "7.0.0" "7.1.3" "7.2.0"
           --process-number 1
       )
 
@@ -314,7 +314,7 @@ endif()
       COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py
           --build-dir ${CMAKE_BINARY_DIR}
           --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadSingleThr.toml
-          --upgrade-path "7.0.0" "7.2.0"
+          --upgrade-path "7.0.0" "7.1.3" "7.2.0"
          --process-number 1
       )
 
@@ -322,7 +322,7 @@ endif()
      COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py
          --build-dir ${CMAKE_BINARY_DIR}
          --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml
-          --upgrade-path "6.3.23" "7.0.0" "7.2.0"
+          --upgrade-path "6.3.23" "7.0.0" "7.1.3" "7.2.0"
          --process-number 3
      )
 
@@ -330,7 +330,7 @@ endif()
      COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py
          --build-dir ${CMAKE_BINARY_DIR}
          --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml
-          --upgrade-path "7.0.0" "7.2.0"
+          --upgrade-path "7.0.0" "7.1.3" "7.2.0"
          --process-number 3
      )
   endif()
diff --git a/tests/TestRunner/upgrade_test.py b/tests/TestRunner/upgrade_test.py
index 5250466ac1..51444bd63b 100755
--- a/tests/TestRunner/upgrade_test.py
+++ b/tests/TestRunner/upgrade_test.py
@@ -21,6 +21,8 @@ from local_cluster import LocalCluster, random_secret_string
 SUPPORTED_PLATFORMS = ["x86_64"]
 SUPPORTED_VERSIONS = [
     "7.2.0",
+    "7.1.3",
+    "7.1.2",
     "7.1.1",
     "7.1.0",
     "7.0.0",

From 24f6276e14fb7f68710109b577bd708fa88f0901 Mon Sep 17 00:00:00 2001
From: hao fu
Date: Thu, 5 May 2022 20:52:07 -0700
Subject: [PATCH 136/299] Eliminate vector<string> during mapped key processing

There was an OOM if we pre-processed all the strings.
---
 fdbserver/storageserver.actor.cpp | 46 ++++++++++++-------------------
 packaging/docker/sidecar.py       |  2 +-
 2 files changed, 18 insertions(+), 30 deletions(-)

diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp
index ef17342bc1..523555a771 100644
--- a/fdbserver/storageserver.actor.cpp
+++ b/fdbserver/storageserver.actor.cpp
@@ -3525,12 +3525,8 @@ bool rangeQuery(const std::string& s) {
 // in case of a singleKeyOrValue, insert an empty Tuple to vector as placeholder
 // in case of a rangeQuery, insert Optional.empty as placeholder
 // in other cases, insert the correct Tuple to be used.
-void preprocessMappedKey(Tuple& mappedKeyFormatTuple,
-                         std::vector<Optional<Tuple>>& vt,
-                         std::vector<std::string>& strings,
-                         bool& isRangeQuery) {
+void preprocessMappedKey(Tuple& mappedKeyFormatTuple, std::vector<Optional<Tuple>>& vt, bool& isRangeQuery) {
 	vt.reserve(mappedKeyFormatTuple.size());
-	strings.reserve(mappedKeyFormatTuple.size());
 
 	for (int i = 0; i < mappedKeyFormatTuple.size(); i++) {
 		Tuple::ElementType type = mappedKeyFormatTuple.getType(i);
@@ -3539,7 +3535,6 @@ void preprocessMappedKey(Tuple& mappedKeyFormatTuple,
 			auto sz = s.size();
 			bool escaped = unescapeLiterals(s, "{{", "{");
 			escaped = unescapeLiterals(s, "}}", "}") || escaped;
-			strings.push_back(s);
 			if (escaped) {
 				Tuple escapedTuple;
 				escapedTuple.append(s);
@@ -3567,7 +3562,7 @@ void preprocessMappedKey(Tuple& mappedKeyFormatTuple,
 Key constructMappedKey(KeyValueRef* keyValue,
                        std::vector<Optional<Tuple>>& vec,
                        Tuple& mappedKeyTuple,
-                       std::vector<std::string>& strings) {
+                       Tuple& mappedKeyFormatTuple) {
 	// Lazily parse key and/or value to tuple because they may not need to be a tuple if not used.
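 	// Worked example (illustrative only, not from the patch): with
 	// key = Tuple{"a", "b"} and value = Tuple{"c"}, a mapper format tuple of
 	//     ("prefix", "{K[1]}", "{V[0]}")
 	// yields the mapped key Tuple{"prefix", "b", "c"}. The "{K[i]}"/"{V[i]}"
 	// placeholders index into the key/value tuples, which is why both are
 	// parsed lazily below: a format with no placeholders needs neither tuple.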
 	Optional<Tuple> keyTuple;
 	Optional<Tuple> valueTuple;
 
@@ -3583,7 +3578,7 @@ Key constructMappedKey(KeyValueRef* keyValue,
 			mappedKeyTuple.append(vec[i].get());
 		} else {
 			// singleKeyOrValue is true
-			std::string s = strings[i];
+			std::string s = mappedKeyFormatTuple.getString(i).toString();
 			auto sz = s.size();
 			int idx;
 			Tuple* referenceTuple;
@@ -3624,11 +3619,10 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") {
 
 		Tuple mappedKeyTuple;
 		std::vector<Optional<Tuple>> vt;
-		std::vector<std::string> strings;
 		bool isRangeQuery = false;
-		preprocessMappedKey(mappedKeyFormatTuple, vt, strings, isRangeQuery);
+		preprocessMappedKey(mappedKeyFormatTuple, vt, isRangeQuery);
 
-		Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, strings);
+		Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, mappedKeyFormatTuple);
 
 		Key expectedMappedKey = Tuple()
 		                            .append("normal"_sr)
@@ -3646,10 +3640,9 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") {
 
 		Tuple mappedKeyTuple;
 		std::vector<Optional<Tuple>> vt;
-		std::vector<std::string> strings;
 		bool isRangeQuery = false;
-		preprocessMappedKey(mappedKeyFormatTuple, vt, strings, isRangeQuery);
+		preprocessMappedKey(mappedKeyFormatTuple, vt, isRangeQuery);
-		Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, strings);
+		Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, mappedKeyFormatTuple);
 
 		Key expectedMappedKey = Tuple().append("{{}"_sr).append("}"_sr).getDataAsStandalone();
 		// std::cout << printable(mappedKey) << " == " << printable(expectedMappedKey) << std::endl;
@@ -3661,10 +3654,9 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") {
 
 		Tuple mappedKeyTuple;
 		std::vector<Optional<Tuple>> vt;
-		std::vector<std::string> strings;
 		bool isRangeQuery = false;
-		preprocessMappedKey(mappedKeyFormatTuple, vt, strings, isRangeQuery);
+		preprocessMappedKey(mappedKeyFormatTuple, vt, isRangeQuery);
-		Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, strings);
+		Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, mappedKeyFormatTuple);
 
 		Key expectedMappedKey = Tuple().append("{{}"_sr).append("}"_sr).getDataAsStandalone();
 		// std::cout << printable(mappedKey) << " == " << printable(expectedMappedKey) << std::endl;
@@ -3677,11 +3669,10 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") {
 		try {
 			Tuple mappedKeyTuple;
 			std::vector<Optional<Tuple>> vt;
-			std::vector<std::string> strings;
 			bool isRangeQuery = false;
-			preprocessMappedKey(mappedKeyFormatTuple, vt, strings, isRangeQuery);
+			preprocessMappedKey(mappedKeyFormatTuple, vt, isRangeQuery);
 
-			Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, strings);
+			Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, mappedKeyFormatTuple);
 		} catch (Error& e) {
 			ASSERT(e.code() == error_code_mapper_bad_index);
 			throwException = true;
@@ -3694,11 +3685,10 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") {
 		try {
 			Tuple mappedKeyTuple;
 			std::vector<Optional<Tuple>> vt;
-			std::vector<std::string> strings;
 			bool isRangeQuery = false;
-			preprocessMappedKey(mappedKeyFormatTuple, vt, strings, isRangeQuery);
+			preprocessMappedKey(mappedKeyFormatTuple, vt, isRangeQuery);
 
-			Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, strings);
+			Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, mappedKeyFormatTuple);
 		} catch (Error& e) {
 			ASSERT(e.code() == error_code_mapper_bad_range_decriptor);
 			throwException2 = true;
@@ -3711,11 +3701,10 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") {
 		try {
 			Tuple mappedKeyTuple;
 			std::vector<Optional<Tuple>> vt;
-			std::vector<std::string> strings;
 			bool isRangeQuery = false;
-			preprocessMappedKey(mappedKeyFormatTuple, vt, strings, isRangeQuery);
+			preprocessMappedKey(mappedKeyFormatTuple, vt, isRangeQuery);
 
-			Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, strings);
+			Key mappedKey = constructMappedKey(&kvr, vt, mappedKeyTuple, mappedKeyFormatTuple);
 		} catch (Error& e) {
 			ASSERT(e.code() == error_code_mapper_bad_index);
 			throwException3 = true;
@@ -3750,16 +3739,15 @@ ACTOR Future<GetMappedKeyValuesReply> mapKeyValues(StorageServer* data,
 	}
 	state KeyValueRef* it = input.data.begin();
 	state std::vector<Optional<Tuple>> vt;
-	state std::vector<std::string> strings;
 	state bool isRangeQuery = false;
-	preprocessMappedKey(mappedKeyFormatTuple, vt, strings, isRangeQuery);
+	preprocessMappedKey(mappedKeyFormatTuple, vt, isRangeQuery);
 
 	for (; it != input.data.end(); it++) {
 		state MappedKeyValueRef kvm;
 		kvm.key = it->key;
 		kvm.value = it->value;
 
-		state Key mappedKey = constructMappedKey(it, vt, mappedKeyTuple, strings);
+		state Key mappedKey = constructMappedKey(it, vt, mappedKeyTuple, mappedKeyFormatTuple);
 		// Make sure the mappedKey is always available, so that it stays valid even if we get the key asynchronously.
 		result.arena.dependsOn(mappedKey.arena());
diff --git a/packaging/docker/sidecar.py b/packaging/docker/sidecar.py
index 6fb80b4880..342d9dc305 100755
--- a/packaging/docker/sidecar.py
+++ b/packaging/docker/sidecar.py
@@ -523,7 +523,7 @@ class Server(BaseHTTPRequestHandler):
             self.send_error(404, "Path not found")
             self.end_headers()
         if self.path.startswith("/is_present/"):
-            if is_present(os.path.basename(self.path))):
+            if is_present(os.path.basename(self.path)):
                 self.send_text("OK")
             else:
                 self.send_error(404, "Path not found")

From 410bcc8d5d72c5d9acceeff7047a2e155f0092a2 Mon Sep 17 00:00:00 2001
From: "Johannes M. Scheuermann"
Date: Fri, 6 May 2022 10:01:22 +0100
Subject: [PATCH 137/299] Format sidecar script

---
 packaging/docker/sidecar.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/packaging/docker/sidecar.py b/packaging/docker/sidecar.py
index 342d9dc305..39c4685bc7 100755
--- a/packaging/docker/sidecar.py
+++ b/packaging/docker/sidecar.py
@@ -518,7 +518,9 @@ class Server(BaseHTTPRequestHandler):
                 return
         if self.path.startswith("/check_hash/"):
             try:
-                self.send_text(check_hash(os.path.basename(self.path)), add_newline=False)
+                self.send_text(
+                    check_hash(os.path.basename(self.path)), add_newline=False
+                )
             except FileNotFoundError:
                 self.send_error(404, "Path not found")
                 self.end_headers()

From a94be36e03d499204933a0439bd6fb1619804b1a Mon Sep 17 00:00:00 2001
From: Xiaoxi Wang
Date: Fri, 6 May 2022 09:22:58 -0700
Subject: [PATCH 138/299] add test spec file

---
 fdbserver/tester.actor.cpp              |  4 +---
 fdbserver/workloads/ReadWrite.actor.cpp | 10 +++++-----
 tests/CMakeLists.txt                    |  1 +
 tests/rare/ReadSkewReadWrite.toml       | 24 ++++++++++++++++++++++++
 4 files changed, 31 insertions(+), 8 deletions(-)
 create mode 100644 tests/rare/ReadSkewReadWrite.toml

diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp
index d03ea133cd..c4380af809 100644
--- a/fdbserver/tester.actor.cpp
+++ b/fdbserver/tester.actor.cpp
@@ -1869,9 +1869,7 @@ ACTOR Future<Void> runTests(Reference<IClusterConnectionRecord> connRecord,
 	}
 
 	choose {
-		when(wait(tests)) {
-			return Void();
-		}
+		when(wait(tests)) { return Void(); }
 		when(wait(quorum(actors, 1))) {
 			ASSERT(false);
 			throw internal_error();
diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp
index 1a5c5e3e56..f149de94ea 100644
--- a/fdbserver/workloads/ReadWrite.actor.cpp
+++ b/fdbserver/workloads/ReadWrite.actor.cpp
@@ -128,12 +128,12 @@ struct ReadWriteWorkload : KVWorkload {
 	double loadTime, clientBegin;
ReadWriteWorkload(WorkloadContext const& wcx) - : KVWorkload(wcx), dependentReads(false), adjacentReads(false), adjacentWrites(false), totalReadsMetric(LiteralStringRef("RWWorkload.TotalReads")), + : KVWorkload(wcx), dependentReads(false), adjacentReads(false), adjacentWrites(false), + totalReadsMetric(LiteralStringRef("RWWorkload.TotalReads")), totalRetriesMetric(LiteralStringRef("RWWorkload.TotalRetries")), aTransactions("A Transactions"), - bTransactions("B Transactions"), retries("Retries"), - latencies(sampleSize), readLatencies(sampleSize), commitLatencies(sampleSize), GRVLatencies(sampleSize), - fullReadLatencies(sampleSize), readLatencyTotal(0), readLatencyCount(0), loadTime(0.0), - clientBegin(0) { + bTransactions("B Transactions"), retries("Retries"), latencies(sampleSize), readLatencies(sampleSize), + commitLatencies(sampleSize), GRVLatencies(sampleSize), fullReadLatencies(sampleSize), readLatencyTotal(0), + readLatencyCount(0), loadTime(0.0), clientBegin(0) { transactionSuccessMetric.init(LiteralStringRef("RWWorkload.SuccessfulTransaction")); transactionFailureMetric.init(LiteralStringRef("RWWorkload.FailedTransaction")); readMetric.init(LiteralStringRef("RWWorkload.Read")); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6006da60c1..1b9089078f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -195,6 +195,7 @@ if(WITH_PYTHON) endif() add_fdb_test(TEST_FILES rare/CheckRelocation.toml) add_fdb_test(TEST_FILES rare/ClogUnclog.toml) + add_fdb_test(TEST_FILES rare/ReadSkewReadWrite.toml) add_fdb_test(TEST_FILES rare/CloggedCycleWithKills.toml) add_fdb_test(TEST_FILES rare/ConfigIncrement.toml) add_fdb_test(TEST_FILES rare/ConfigIncrementWithKills.toml) diff --git a/tests/rare/ReadSkewReadWrite.toml b/tests/rare/ReadSkewReadWrite.toml new file mode 100644 index 0000000000..207ecd6c63 --- /dev/null +++ b/tests/rare/ReadSkewReadWrite.toml @@ -0,0 +1,24 @@ +[[test]] +testTitle = 'SkewedReadWriteTest' +connectionFailuresDisableDuration = 100000 +# waitForQuiescenceBegin= false +# waitForQuiescenceEnd=false +clearAfterTest = true +runSetup = true # false +timeout = 3600.0 + +[[test.workload]] +testName = 'SkewedReadWrite' +transactionsPerSecond = 100 +testDuration = 40.0 +skewRound = 1 +nodeCount = 3000 # 30000000 +valueBytes = 100 +readsPerTransactionA = 8 +writesPerTransactionA = 0 +alpha = 0 +discardEdgeMeasurements = false +hotServerFraction = 0.2 +hotServerReadFrac = 0.8 +# hotServerShardFraction = 0.3 +warmingDelay = 180.0 From b6a200bd2ccfd5523330d64343c1415919bd7f0c Mon Sep 17 00:00:00 2001 From: Junhyun Shim Date: Fri, 6 May 2022 13:00:04 +0200 Subject: [PATCH 139/299] Set up a unit test to find the right setup for selective mTLS To be modified or removed once implementation is complete --- flow/CMakeLists.txt | 8 ++ flow/TLSTest.cpp | 247 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 255 insertions(+) create mode 100644 flow/TLSTest.cpp diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index 9548dc81c2..906f05f085 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -203,3 +203,11 @@ if(APPLE) target_link_libraries(flow PRIVATE ${IO_KIT} ${CORE_FOUNDATION}) target_link_libraries(flow_sampling PRIVATE ${IO_KIT} ${CORE_FOUNDATION}) endif() + +add_executable(tls_poc TLSTest.cpp) + +if(USE_SANITIZER) + target_link_libraries(tls_poc PUBLIC fmt::fmt flow boost_asan) +else() + target_link_libraries(tls_poc PUBLIC fmt::fmt flow boost_target) +endif() diff --git a/flow/TLSTest.cpp b/flow/TLSTest.cpp new file mode 100644 index 
0000000000..6241519404
--- /dev/null
+++ b/flow/TLSTest.cpp
@@ -0,0 +1,247 @@
+/*
+ * TLSTest.cpp
+ *
+ * This source file is part of the FoundationDB open source project
+ *
+ * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <boost/asio.hpp>
+#include <boost/asio/ssl.hpp>
+#include <fmt/format.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string_view>
+#include "flow/Arena.h"
+#include "flow/MkCert.h"
+
+std::FILE* outp = stderr;
+
+template <class... Args>
+void log(Args&& ... args) {
+	auto buf = fmt::memory_buffer{};
+	fmt::format_to(std::back_inserter(buf), std::forward<Args>(args)...);
+	fmt::print(outp, "{}\n", std::string_view(buf.data(), buf.size()));
+}
+
+template <class... Args>
+void logc(Args&& ... args) {
+	auto buf = fmt::memory_buffer{};
+	fmt::format_to(std::back_inserter(buf), "[CLIENT] ");
+	fmt::format_to(std::back_inserter(buf), std::forward<Args>(args)...);
+	fmt::print(outp, "{}\n", std::string_view(buf.data(), buf.size()));
+}
+
+template <class... Args>
+void logs(Args&& ... args) {
+	auto buf = fmt::memory_buffer{};
+	fmt::format_to(std::back_inserter(buf), "[SERVER] ");
+	fmt::format_to(std::back_inserter(buf), std::forward<Args>(args)...);
+	fmt::print(outp, "{}\n", std::string_view(buf.data(), buf.size()));
+}
+
+using namespace boost::asio;
+using ip::tcp;
+
+using ec_type = boost::system::error_code;
+
+using socket_type = ssl::stream<tcp::socket&>;
+using work_guard_type = executor_work_guard<io_context::executor_type>;
+
+auto client_ssl = ssl::context(ssl::context::tls);
+auto server_ssl = ssl::context(ssl::context::tls);
+
+mkcert::CertChainRef server_chain;
+mkcert::CertChainRef client_chain;
+
+void trust_root_cacert(ssl::context& ctx, StringRef certPem) {
+	ctx.add_certificate_authority(const_buffer(certPem.begin(), certPem.size()));
+}
+
+void use_chain(ssl::context& ctx, mkcert::CertChainRef chain) {
+	auto arena = Arena();
+	auto chain_str = concatCertChain(arena, chain);
+	ctx.use_certificate_chain(const_buffer(chain_str.begin(), chain_str.size()));
+	auto keyPem = chain.front().privateKeyPem;
+	ctx.use_private_key(const_buffer(keyPem.begin(), keyPem.size()), ssl::context::pem);
+}
+
+void init_certs(ssl::context& ctx, mkcert::CertChainRef my_chain, StringRef peerRootPem) {
+	if (!peerRootPem.empty())
+		trust_root_cacert(ctx, peerRootPem);
+	if (my_chain.size() > 1) my_chain.pop_back();
+	if (my_chain.size() > 0)
+		use_chain(ctx, my_chain);
+}
+
+void init_client_ssl_context() {
+	auto& ctx = client_ssl;
+	ctx.set_options(ssl::context::default_workarounds);
+	ctx.set_verify_mode(ssl::context::verify_peer | ssl::verify_fail_if_no_peer_cert);
+	ctx.set_verify_callback([](bool preverify, ssl::verify_context&) {
+	    logc("context preverify: {}", preverify);
+	    return preverify;
+    });
+	init_certs(ctx, client_chain, server_chain.empty() ? StringRef() : server_chain.back().certPem);
+}
+
+void init_server_ssl_context() {
+	auto& ctx = server_ssl;
+	ctx.set_options(ssl::context::default_workarounds);
+	ctx.set_verify_mode(ssl::context::verify_peer | (client_chain.empty() ? 0 : ssl::verify_fail_if_no_peer_cert));
+	ctx.set_verify_callback([](bool preverify, ssl::verify_context&) {
+	    logs("context preverify: {}", preverify);
+	    return preverify;
+    });
+	init_certs(ctx, server_chain, client_chain.empty() ? StringRef() : client_chain.back().certPem);
+}
+
+template <> struct fmt::formatter<tcp::endpoint> {
+	constexpr auto parse(format_parse_context& ctx) -> decltype(ctx.begin()) {
+		return ctx.begin();
+	}
+
+	template <class FormatContext>
+	auto format(const tcp::endpoint& ep, FormatContext& ctx) -> decltype(ctx.out()) {
+		return fmt::format_to(ctx.out(), "{}:{}", ep.address().to_string(), ep.port());
+	}
+};
+
+int main(int argc, char** argv) {
+	auto const server_chain_len = (argc > 1 ? std::strtoul(argv[1], nullptr, 10) : 3ul);
+	auto const client_chain_len = (argc > 2 ? std::strtoul(argv[2], nullptr, 10) : 3ul);
+	auto const expect_trusted = client_chain_len != 0;
+	log("cert chain length: server {}, client {}", server_chain_len, client_chain_len);
+	[[maybe_unused]] auto print_chain = [](mkcert::CertChainRef chain) -> void {
+		if (chain.empty()) {
+			log("EMPTY");
+			return;
+		}
+		for (auto certAndKey : chain) {
+			certAndKey.printCert(outp);
+			log("===========");
+			certAndKey.printPrivateKey(outp);
+			log("===========");
+		}
+	};
+	auto arena = Arena();
+	if (server_chain_len > 0)
+		server_chain = mkcert::makeCertChain(arena, server_chain_len, mkcert::ESide::Server);
+	if (client_chain_len > 0)
+		client_chain = mkcert::makeCertChain(arena, client_chain_len, mkcert::ESide::Client);
+	/*
+	log("=========== SERVER CHAIN");
+	print_chain(server_chain);
+	auto concat = concatCertChain(arena, server_chain);
+	if (!concat.empty())
+	    log(concat.toString());
+	log("=========== CLIENT CHAIN");
+	print_chain(client_chain);
+	concat = concatCertChain(arena, client_chain);
+	if (!concat.empty())
+	    log(concat.toString());
+	*/
+	init_client_ssl_context();
+	log("client SSL contexts initialized");
+	init_server_ssl_context();
+	log("server SSL contexts initialized");
+	auto io = io_context();
+	auto wg_server = work_guard_type(io.get_executor());
+	auto wg_client = work_guard_type(io.get_executor());
+	auto const ip = ip::address::from_string("127.0.0.1");
+	auto acceptor = tcp::acceptor(io, tcp::endpoint(ip, 0));
+	auto const server_addr = acceptor.local_endpoint();
+	logs("server listening at {}", server_addr);
+	auto server_sock = tcp::socket(io);
+	auto server_ssl_sock = socket_type(server_sock, server_ssl);
+	enum class ESockState { AssumedUntrusted, Trusted };
+	auto server_sock_state = ESockState::AssumedUntrusted;
+	auto client_sock_state = ESockState::AssumedUntrusted;
+	server_ssl_sock.set_verify_callback([&server_sock_state](bool preverify, ssl::verify_context&) {
+		logs("client preverify: {}", preverify);
+		switch (server_sock_state) {
+		case ESockState::AssumedUntrusted:
+			if (!preverify) return false;
+			server_sock_state = ESockState::Trusted;
+			break;
+		case ESockState::Trusted:
+			if (!preverify) return false;
+			break;
+		default:
+			break;
+		}
+		// if untrusted connection passes preverify, they are considered trusted
+		return true;
+	});
+	acceptor.async_accept(server_sock, [&server_ssl_sock, &wg_server](const ec_type& ec) {
+		if (ec) {
+			logs("accept error: {}", ec.message());
+			wg_server.reset();
+		} else {
+			logs("accepted connection from {}", server_ssl_sock.next_layer().remote_endpoint());
+			server_ssl_sock.async_handshake(
+			    ssl::stream_base::handshake_type::server,
+			    [&wg_server](const ec_type& ec) {
+				    if (ec) {
+					    logs("server handshake returned {}", ec.message());
+				    } else {
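+					    // Reaching this branch means the handshake, including the
+					    // verify callback installed above, succeeded; a client that
+					    // sent no certificate leaves server_sock_state at
+					    // AssumedUntrusted. A sketch of inspecting the peer cert at
+					    // this point (illustrative only, not part of this patch):
+					    //
+					    //   if (X509* peer = ::SSL_get_peer_certificate(server_ssl_sock.native_handle())) {
+					    //       ::X509_print_fp(outp, peer); // dump the client certificate
+					    //       ::X509_free(peer); // SSL_get_peer_certificate returns an owned copy
+					    //   }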
+					    logs("handshake OK");
+				    }
+				    wg_server.reset();
+			    });
+		}
+	});
+	auto client_sock = tcp::socket(io);
+	auto client_ssl_sock = socket_type(client_sock, client_ssl);
+	client_ssl_sock.set_verify_callback([&client_sock_state](bool preverify, ssl::verify_context&) {
+		logc("server preverify: {}", preverify);
+		switch (client_sock_state) {
+		case ESockState::AssumedUntrusted:
+			if (!preverify) return false;
+			client_sock_state = ESockState::Trusted;
+			break;
+		case ESockState::Trusted:
+			if (!preverify) return false;
+			break;
+		default:
+			break;
+		}
+		// if untrusted connection passes preverify, they are considered trusted
+		return true;
+	});
+	client_sock.async_connect(server_addr, [&wg_client, &client_sock, &client_ssl_sock](const ec_type& ec) {
+		if (ec) {
+			logc("connect error: {}", ec.message());
+			wg_client.reset();
+		} else {
+			logc("connected to {}", client_sock.remote_endpoint());
+			client_ssl_sock.async_handshake(
+			    ssl::stream_base::handshake_type::client,
+			    [&wg_client](const ec_type& ec) {
+				    if (ec) {
+					    logc("client handshake returned {}", ec.message());
+				    } else {
+					    logc("handshake OK");
+				    }
+				    wg_client.reset();
+			    });
+		}
+	});
+	io.run();
+	ASSERT_EQ(expect_trusted, (server_sock_state == ESockState::Trusted));
+	log("Test OK: Connection considered {}", server_sock_state == ESockState::Trusted ? "trusted" : "untrusted");
+	return 0;
+}

From e5f039acf8e1a275045ee19525060f7dd1ff9220 Mon Sep 17 00:00:00 2001
From: Junhyun Shim
Date: Fri, 6 May 2022 19:10:42 +0200
Subject: [PATCH 140/299] Apply clang format

---
 flow/MkCert.cpp  | 67 +++++++++++++++++++++++---------------------
 flow/MkCert.h    | 63 +++++++++++++++++------------------------
 flow/TLSTest.cpp | 72 ++++++++++++++++++------------------------
 3 files changed, 97 insertions(+), 105 deletions(-)

diff --git a/flow/MkCert.cpp b/flow/MkCert.cpp
index 71dd67f88f..96d00676a5 100644
--- a/flow/MkCert.cpp
+++ b/flow/MkCert.cpp
@@ -47,10 +47,7 @@ public:
 [[noreturn]] void traceAndThrow(const char* condition, const char* file, int line) {
 	fprintf(stderr, "Failed condition check %s at %s:%d\n", condition, file, line);
 	auto te = TraceEvent(SevWarnAlways, "ErrorTLSKeyOrCertGen");
-	te.suppressFor(60)
-	    .detail("File", file)
-	    .detail("Line", line)
-	    .detail("Condition", condition);
+	te.suppressFor(60).detail("File", file).detail("Line", line).detail("Condition", condition);
 	if (auto err = ::ERR_get_error()) {
 		char buf[256]{
 			0,
@@ -64,10 +61,10 @@
 
 } // anonymous namespace
 
-#define OSSL_ASSERT(condition) \
-	do { \
-		if (!(condition)) \
-			traceAndThrow(#condition, __FILE__, __LINE__); \
+#define OSSL_ASSERT(condition)                                                                                         \
+	do {                                                                                                               \
+		if (!(condition))                                                                                              \
+			traceAndThrow(#condition, __FILE__, __LINE__);                                                             \
 	} while (false)
 
 namespace mkcert {
@@ -103,7 +100,8 @@ struct CertAndKeyNative {
 
 	PemType toPem(Arena& arena) {
 		auto ret = PemType{};
-		if (null()) return ret;
+		if (null())
+			return ret;
 		ASSERT(valid());
 		ret.certPem = writeX509CertPem(arena, cert);
 		ret.privateKeyPem = writePrivateKeyPem(arena, privateKey);
@@ -227,11 +225,14 @@ CertAndKeyNative makeCertNative(CertSpecRef spec, CertAndKeyNative issuer) {
 	auto nativeKeyPair = makeEllipticCurveKeyPairNative();
 	auto newX = ::X509_new();
 	OSSL_ASSERT(newX);
-	auto x509Guard = ExitGuard([&newX]() { if (newX) ::X509_free(newX); });
+	auto x509Guard = ExitGuard([&newX]() {
+		if (newX)
+			::X509_free(newX);
+	});
 	auto smartX = std::shared_ptr<X509>(newX, &::X509_free);
 	newX = nullptr;
 	auto x = smartX.get();
-	OSSL_ASSERT(0 < ::X509_set_version(x, 2/*X509_VERSION_3*/));
+	OSSL_ASSERT(0 < ::X509_set_version(x, 2 /*X509_VERSION_3*/));
 	auto serialPtr = ::X509_get_serialNumber(x);
 	OSSL_ASSERT(serialPtr);
 	OSSL_ASSERT(0 < ::ASN1_INTEGER_set(serialPtr, spec.serialNumber));
@@ -247,7 +248,9 @@ CertAndKeyNative makeCertNative(CertSpecRef spec, CertAndKeyNative issuer) {
 	for (const auto& entry : spec.subjectName) {
 		// field names are expected to null-terminate
 		auto fieldName = entry.field.toString();
-		OSSL_ASSERT(0 < ::X509_NAME_add_entry_by_txt(subjectName, fieldName.c_str(), MBSTRING_ASC, entry.bytes.begin(), entry.bytes.size(), -1, 0));
+		OSSL_ASSERT(0 <
+		            ::X509_NAME_add_entry_by_txt(
+		                subjectName, fieldName.c_str(), MBSTRING_ASC, entry.bytes.begin(), entry.bytes.size(), -1, 0));
 	}
 	auto issuerName = ::X509_get_issuer_name(x);
 	OSSL_ASSERT(issuerName);
@@ -283,22 +286,22 @@ CertSpecRef CertSpecRef::make(Arena& arena, CertKind kind) {
 	spec.offsetNotBefore = 0; // now
 	spec.offsetNotAfter = 60 * 60 * 24 * 365; // 1 year from now
 	auto& subject = spec.subjectName;
-	subject.push_back(arena, {"countryName"_sr, "DE"_sr});
-	subject.push_back(arena, {"localityName"_sr, "Berlin"_sr});
-	subject.push_back(arena, {"organizationName"_sr, "FoundationDB"_sr});
-	subject.push_back(arena, {"commonName"_sr, kind.getCommonName("FDB Testing Services"_sr, arena)});
+	subject.push_back(arena, { "countryName"_sr, "DE"_sr });
+	subject.push_back(arena, { "localityName"_sr, "Berlin"_sr });
+	subject.push_back(arena, { "organizationName"_sr, "FoundationDB"_sr });
+	subject.push_back(arena, { "commonName"_sr, kind.getCommonName("FDB Testing Services"_sr, arena) });
 	auto& ext = spec.extensions;
 	if (kind.isCA()) {
-		ext.push_back(arena, {"basicConstraints"_sr, "critical, CA:TRUE"_sr});
-		ext.push_back(arena, {"keyUsage"_sr, "critical, digitalSignature, keyCertSign, cRLSign"_sr});
+		ext.push_back(arena, { "basicConstraints"_sr, "critical, CA:TRUE"_sr });
+		ext.push_back(arena, { "keyUsage"_sr, "critical, digitalSignature, keyCertSign, cRLSign"_sr });
 	} else {
-		ext.push_back(arena, {"basicConstraints"_sr, "critical, CA:FALSE"_sr});
-		ext.push_back(arena, {"keyUsage"_sr, "critical, digitalSignature, keyEncipherment"_sr});
-		ext.push_back(arena, {"extendedKeyUsage"_sr, "serverAuth, clientAuth"_sr});
+		ext.push_back(arena, { "basicConstraints"_sr, "critical, CA:FALSE"_sr });
+		ext.push_back(arena, { "keyUsage"_sr, "critical, digitalSignature, keyEncipherment"_sr });
+		ext.push_back(arena, { "extendedKeyUsage"_sr, "serverAuth, clientAuth"_sr });
 	}
-	ext.push_back(arena, {"subjectKeyIdentifier"_sr, "hash"_sr});
+	ext.push_back(arena, { "subjectKeyIdentifier"_sr, "hash"_sr });
 	if (!kind.isRootCA())
-		ext.push_back(arena, {"authorityKeyIdentifier"_sr, "keyid, issuer"_sr});
+		ext.push_back(arena, { "authorityKeyIdentifier"_sr, "keyid, issuer"_sr });
 	return spec;
 }
 
@@ -307,7 +310,8 @@ StringRef concatCertChain(Arena& arena, CertChainRef chain) {
 	for (const auto& entry : chain) {
 		len += entry.certPem.size();
 	}
-	if (len == 0) return StringRef();
+	if (len == 0)
+		return StringRef();
 	auto buf = new (arena) uint8_t[len];
 	auto offset = 0;
 	for (auto const& entry : chain) {
 		::memcpy(&buf[offset], entry.certPem.begin(), entry.certPem.size());
 		offset += entry.certPem.size();
 	}
 	UNSTOPPABLE_ASSERT(offset == len);
 	return StringRef(buf, len);
 }
 
@@ -325,8 +329,8 @@ CertChainRef makeCertChain(Arena& arena, VectorRef<CertSpecRef> specs, CertAndKe
 	if (needRootCA) {
 		int const chainLength = specs.size();
 		auto chain = new (arena) CertAndKeyRef[chainLength];
-		auto caNative = makeCertNative(specs.back(), CertAndKeyNative{}/* empty issuer == self-signed */);
-		chain[chainLength-1] = caNative.toPem(arena);
+		auto caNative = makeCertNative(specs.back(), CertAndKeyNative{} /* empty issuer == self-signed */);
+		chain[chainLength - 1] = caNative.toPem(arena);
 		for (auto i = chainLength - 2; i >= 0; i--) {
 			auto cnkNative = makeCertNative(specs[i], caNative);
 			chain[i] = cnkNative.toPem(arena);
@@ -335,9 +339,9 @@ CertChainRef makeCertChain(Arena& arena, VectorRef<CertSpecRef> specs, CertAndKe
 		return CertChainRef(chain, chainLength);
 	} else {
 		int const chainLength = specs.size() + 1; /* account for deep-copied rootAuthority */
-		auto chain = new (arena) CertAndKeyRef[chainLength];
+		auto chain = new (arena) CertAndKeyRef[chainLength];
 		auto caNative = CertAndKeyNative::fromPem(rootAuthority);
-		chain[chainLength-1] = rootAuthority.deepCopy(arena);
+		chain[chainLength - 1] = rootAuthority.deepCopy(arena);
 		for (auto i = chainLength - 2; i >= 0; i--) {
 			auto cnkNative = makeCertNative(specs[i], caNative);
 			chain[i] = cnkNative.toPem(arena);
@@ -348,7 +352,8 @@ CertChainRef makeCertChain(Arena& arena, VectorRef<CertSpecRef> specs, CertAndKe
 }
 
 CertChainRef makeCertChain(Arena& arena, unsigned length, ESide side) {
-	if (!length) return {};
+	if (!length)
+		return {};
 	// temporary arena for writing up specs
 	auto tmpArena = Arena();
 	auto specs = new (tmpArena) CertSpecRef[length];
@@ -360,10 +365,10 @@ CertChainRef makeCertChain(Arena& arena, unsigned length, ESide side) {
 		else if (i == length - 1)
 			kind = isServerSide ? CertKind(ServerRootCA{}) : CertKind(ClientRootCA{});
 		else
-			kind = isServerSide ? CertKind(ServerIntermediateCA{i}) : CertKind(ClientIntermediateCA{i});
+			kind = isServerSide ? CertKind(ServerIntermediateCA{ i }) : CertKind(ClientIntermediateCA{ i });
 		specs[i] = CertSpecRef::make(tmpArena, kind);
 	}
-	return makeCertChain(arena, VectorRef<CertSpecRef>(specs, length), {}/*root*/);
+	return makeCertChain(arena, VectorRef<CertSpecRef>(specs, length), {} /*root*/);
 }
 
 } // namespace mkcert
diff --git a/flow/MkCert.h b/flow/MkCert.h
index da83da40b9..c5f23cde76 100644
--- a/flow/MkCert.h
+++ b/flow/MkCert.h
@@ -45,16 +45,21 @@ struct KeyPairRef {
 };
 
 struct Asn1EntryRef {
-	// field must match one of ASN.1 object short/long names: e.g. "C", "countryName", "CN", "commonName", "subjectAltName", ...
+	// field must match one of ASN.1 object short/long names: e.g. "C", "countryName", "CN", "commonName",
+	// "subjectAltName", ...
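+	// Example (illustrative only, not part of this patch): a subject-alternative-name
+	// entry would be written with the same OpenSSL config-string conventions, e.g.
+	//     Asn1EntryRef{ "subjectAltName"_sr, "DNS:localhost,IP:127.0.0.1"_sr };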
StringRef field; StringRef bytes; }; struct ServerRootCA {}; -struct ServerIntermediateCA { unsigned level; }; +struct ServerIntermediateCA { + unsigned level; +}; struct Server {}; struct ClientRootCA {}; -struct ClientIntermediateCA { unsigned level; }; +struct ClientIntermediateCA { + unsigned level; +}; struct Client {}; struct CertKind { @@ -62,48 +67,35 @@ struct CertKind { CertKind() noexcept = default; template - CertKind(Kind kind) noexcept : - value(std::in_place_type, kind) - {} + CertKind(Kind kind) noexcept : value(std::in_place_type, kind) {} template - bool is() const noexcept { return std::holds_alternative(value); } + bool is() const noexcept { + return std::holds_alternative(value); + } template - Kind const& get() const { return std::get(value); } - - bool isServerSide() const noexcept { - return is() || is() || is(); + Kind const& get() const { + return std::get(value); } - bool isClientSide() const noexcept { - return !isServerSide(); - } + bool isServerSide() const noexcept { return is() || is() || is(); } - bool isRootCA() const noexcept { - return is() || is(); - } + bool isClientSide() const noexcept { return !isServerSide(); } - bool isIntermediateCA() const noexcept { - return is() || is(); - } + bool isRootCA() const noexcept { return is() || is(); } - bool isLeaf() const noexcept { - return is() || is(); - } + bool isIntermediateCA() const noexcept { return is() || is(); } - bool isCA() const noexcept { - return !isLeaf(); - } + bool isLeaf() const noexcept { return is() || is(); } + + bool isCA() const noexcept { return !isLeaf(); } StringRef getCommonName(StringRef prefix, Arena& arena) const { auto const side = std::string(isClientSide() ? " Client" : " Server"); if (isIntermediateCA()) { - auto const level = isClientSide() ? get().level - : get().level; - return prefix.withSuffix( - fmt::format("{} Intermediate {}", side, level), - arena); + auto const level = isClientSide() ? get().level : get().level; + return prefix.withSuffix(fmt::format("{} Intermediate {}", side, level), arena); } else if (isRootCA()) { return prefix.withSuffix(fmt::format("{} Root", side), arena); } else { @@ -144,9 +136,7 @@ struct CertAndKeyRef { } } - bool empty() const noexcept { - return certPem.empty() && privateKeyPem.empty(); - } + bool empty() const noexcept { return certPem.empty() && privateKeyPem.empty(); } SelfType deepCopy(Arena& arena) { auto ret = SelfType{}; @@ -166,10 +156,7 @@ using CertChainRef = VectorRef; // Concatenate chain of PEMs to one StringRef StringRef concatCertChain(Arena& arena, CertChainRef chain); -enum class ESide : int { - Server, - Client -}; +enum class ESide : int { Server, Client }; // For empty (default) rootAuthority, the last item in specs is used to generate rootAuthority // Otherwise, rootAuthority is deep-copied to first element of returned chain diff --git a/flow/TLSTest.cpp b/flow/TLSTest.cpp index 6241519404..670e01efb5 100644 --- a/flow/TLSTest.cpp +++ b/flow/TLSTest.cpp @@ -30,23 +30,23 @@ std::FILE* outp = stderr; -template -void log(Args&& ... args) { +template +void log(Args&&... args) { auto buf = fmt::memory_buffer{}; fmt::format_to(std::back_inserter(buf), std::forward(args)...); fmt::print(outp, "{}\n", std::string_view(buf.data(), buf.size())); } -template -void logc(Args&& ... args) { +template +void logc(Args&&... 
args) { auto buf = fmt::memory_buffer{}; fmt::format_to(std::back_inserter(buf), "[CLIENT] "); fmt::format_to(std::back_inserter(buf), std::forward(args)...); fmt::print(outp, "{}\n", std::string_view(buf.data(), buf.size())); } -template -void logs(Args&& ... args) { +template +void logs(Args&&... args) { auto buf = fmt::memory_buffer{}; fmt::format_to(std::back_inserter(buf), "[SERVER] "); fmt::format_to(std::back_inserter(buf), std::forward(args)...); @@ -82,7 +82,8 @@ void use_chain(ssl::context& ctx, mkcert::CertChainRef chain) { void init_certs(ssl::context& ctx, mkcert::CertChainRef my_chain, StringRef peerRootPem) { if (!peerRootPem.empty()) trust_root_cacert(ctx, peerRootPem); - if (my_chain.size() > 1) my_chain.pop_back(); + if (my_chain.size() > 1) + my_chain.pop_back(); if (my_chain.size() > 0) use_chain(ctx, my_chain); } @@ -92,9 +93,9 @@ void init_client_ssl_context() { ctx.set_options(ssl::context::default_workarounds); ctx.set_verify_mode(ssl::context::verify_peer | ssl::verify_fail_if_no_peer_cert); ctx.set_verify_callback([](bool preverify, ssl::verify_context&) { - logc("context preverify: {}", preverify); - return preverify; - }); + logc("context preverify: {}", preverify); + return preverify; + }); init_certs(ctx, client_chain, server_chain.empty() ? StringRef() : server_chain.back().certPem); } @@ -103,16 +104,15 @@ void init_server_ssl_context() { ctx.set_options(ssl::context::default_workarounds); ctx.set_verify_mode(ssl::context::verify_peer | (client_chain.empty() ? 0 : ssl::verify_fail_if_no_peer_cert)); ctx.set_verify_callback([](bool preverify, ssl::verify_context&) { - logs("context preverify: {}", preverify); - return preverify; - }); + logs("context preverify: {}", preverify); + return preverify; + }); init_certs(ctx, server_chain, client_chain.empty() ? 
StringRef() : client_chain.back().certPem); } -template <> struct fmt::formatter { - constexpr auto parse(format_parse_context& ctx) -> decltype(ctx.begin()) { - return ctx.begin(); - } +template <> +struct fmt::formatter { + constexpr auto parse(format_parse_context& ctx) -> decltype(ctx.begin()) { return ctx.begin(); } template auto format(const tcp::endpoint& ep, FormatContext& ctx) -> decltype(ctx.out()) { @@ -147,12 +147,12 @@ int main(int argc, char** argv) { print_chain(server_chain); auto concat = concatCertChain(arena, server_chain); if (!concat.empty()) - log(concat.toString()); + log(concat.toString()); log("=========== CLIENT CHAIN"); print_chain(client_chain); concat = concatCertChain(arena, client_chain); if (!concat.empty()) - log(concat.toString()); + log(concat.toString()); */ init_client_ssl_context(); log("client SSL contexts initialized"); @@ -174,11 +174,13 @@ int main(int argc, char** argv) { logs("client preverify: {}", preverify); switch (server_sock_state) { case ESockState::AssumedUntrusted: - if (!preverify) return false; + if (!preverify) + return false; server_sock_state = ESockState::Trusted; break; case ESockState::Trusted: - if (!preverify) return false; + if (!preverify) + return false; break; default: break; @@ -192,9 +194,7 @@ int main(int argc, char** argv) { wg_server.reset(); } else { logs("accepted connection from {}", server_ssl_sock.next_layer().remote_endpoint()); - server_ssl_sock.async_handshake( - ssl::stream_base::handshake_type::server, - [&wg_server](const ec_type& ec) { + server_ssl_sock.async_handshake(ssl::stream_base::handshake_type::server, [&wg_server](const ec_type& ec) { if (ec) { logs("server handshake returned {}", ec.message()); } else { @@ -210,11 +210,13 @@ int main(int argc, char** argv) { logc("server preverify: {}", preverify); switch (client_sock_state) { case ESockState::AssumedUntrusted: - if (!preverify) return false; + if (!preverify) + return false; client_sock_state = ESockState::Trusted; break; case ESockState::Trusted: - if (!preverify) return false; + if (!preverify) + return false; break; default: break; @@ -228,16 +230,14 @@ int main(int argc, char** argv) { wg_client.reset(); } else { logc("connected to {}", client_sock.remote_endpoint()); - client_ssl_sock.async_handshake( - ssl::stream_base::handshake_type::client, - [&wg_client](const ec_type& ec) { - if (ec) { - logc("client handshake returned {}", ec.message()); - } else { - logc("handshake OK"); - } - wg_client.reset(); - }); + client_ssl_sock.async_handshake(ssl::stream_base::handshake_type::client, [&wg_client](const ec_type& ec) { + if (ec) { + logc("client handshake returned {}", ec.message()); + } else { + logc("handshake OK"); + } + wg_client.reset(); + }); } }); io.run(); From 5bb70dda915e5d560428ddc9581f25c73b548ca5 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 6 May 2022 10:39:45 -0700 Subject: [PATCH 141/299] add remainedBytes method --- fdbclient/SystemData.cpp | 10 ++++------ flow/serialize.h | 9 +++++++-- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index a84eae4f77..73baffd127 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -335,12 +335,10 @@ std::pair serverKeysDecodeServerBegin(const KeyRef& key) { BinaryReader rd(key.removePrefix(serverKeysPrefix), Unversioned()); rd >> server_id; rd.readBytes(1); // skip "/" - std::string bytes; - while (!rd.empty()) { - bytes.push_back((char)*rd.arenaRead(1)); - } - // std::cout << bytes.size() << " " < { 
From 154ae7559e46016cec1018e34068d8986eb2e901 Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Fri, 6 May 2022 18:10:37 +0000 Subject: [PATCH 142/299] - If getRange() is called with "latestVersion" as the read version then both GetKeyValuesFamilyRequest and getRangeFallback should do reads at the same version. --- fdbclient/NativeAPI.actor.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 57b68282cf..3c06c0f506 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -4376,11 +4376,10 @@ Future getRange(Reference trState, TEST(true); // !GetKeyValuesFamilyReply.more and modifiedSelectors in getRange if (!rep.data.size()) { - // VERSION_VECTOR change version to readVersion in getRangeFallback RangeResultFamily result = wait( getRangeFallback( trState, - version, + readVersion, originalBegin, originalEnd, mapper, From 0cce5feae3e6251e69130467f0ad6450046d8ea0 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Thu, 5 May 2022 09:52:20 -0700 Subject: [PATCH 143/299] Optimize LogPushData to avoid constructing LogSystemConfig Which seems to be a CPU hot spot in our testing. --- fdbserver/CommitProxyServer.actor.cpp | 16 ++++++---------- fdbserver/LogSystem.cpp | 9 +++++++++ fdbserver/LogSystem.h | 15 +++------------ fdbserver/ProxyCommitData.actor.h | 1 + fdbserver/Resolver.actor.cpp | 4 +++- 5 files changed, 22 insertions(+), 23 deletions(-) diff --git a/fdbserver/CommitProxyServer.actor.cpp b/fdbserver/CommitProxyServer.actor.cpp index ca2ffbf3d5..04b96ee002 100644 --- a/fdbserver/CommitProxyServer.actor.cpp +++ b/fdbserver/CommitProxyServer.actor.cpp @@ -698,16 +698,11 @@ std::set CommitBatchContext::getWrittenTagsPreResolution() { CommitBatchContext::CommitBatchContext(ProxyCommitData* const pProxyCommitData_, const std::vector* trs_, const int currentBatchMemBytesCount) - : - - pProxyCommitData(pProxyCommitData_), trs(std::move(*const_cast*>(trs_))), - currentBatchMemBytesCount(currentBatchMemBytesCount), - - startTime(g_network->now()), - - localBatchNumber(++pProxyCommitData->localCommitBatchesStarted), toCommit(pProxyCommitData->logSystem), - - span("MP:commitBatch"_loc), committed(trs.size()) { + : pProxyCommitData(pProxyCommitData_), trs(std::move(*const_cast*>(trs_))), + currentBatchMemBytesCount(currentBatchMemBytesCount), startTime(g_network->now()), + localBatchNumber(++pProxyCommitData->localCommitBatchesStarted), + toCommit(pProxyCommitData->logSystem, pProxyCommitData->localTLogs), span("MP:commitBatch"_loc), + committed(trs.size()) { evaluateBatchSize(); @@ -2408,6 +2403,7 @@ ACTOR Future commitProxyServerCore(CommitProxyInterface proxy, //TraceEvent("ProxyInit3", proxy.id()); commitData.resolvers = commitData.db->get().resolvers; + commitData.localTLogs = commitData.db->get().logSystemConfig.numLogs(); ASSERT(commitData.resolvers.size() != 0); for (int i = 0; i < commitData.resolvers.size(); ++i) { commitData.stats.resolverDist.push_back( diff --git a/fdbserver/LogSystem.cpp b/fdbserver/LogSystem.cpp index ab8f43cfc5..6444b23234 100644 --- a/fdbserver/LogSystem.cpp +++ b/fdbserver/LogSystem.cpp @@ -272,6 +272,15 @@ void LogSet::getPushLocations(VectorRef tags, std::vector& locations, // .detail("Included", alsoServers.size()).detail("Duration", timer() - t); } +LogPushData::LogPushData(Reference logSystem, int tlogCount) : logSystem(logSystem), subsequence(1) { + ASSERT(tlogCount > 0); + messagesWriter.reserve(tlogCount); + for (int i = 0; i < tlogCount; i++) { 
+ messagesWriter.emplace_back(AssumeVersion(g_network->protocolVersion())); + } + messagesWritten = std::vector(tlogCount, false); +} + void LogPushData::addTxsTag() { if (logSystem->getTLogVersion() >= TLogVersion::V4) { next_message_tags.push_back(logSystem->getRandomTxsTag()); diff --git a/fdbserver/LogSystem.h b/fdbserver/LogSystem.h index 6581457c25..aaf40def20 100644 --- a/fdbserver/LogSystem.h +++ b/fdbserver/LogSystem.h @@ -652,14 +652,14 @@ struct ILogSystem { // Returns an ILogSystem representing a new epoch immediately following this one. The new epoch is only provisional // until the caller updates the coordinated DBCoreState - virtual LogSystemConfig getLogSystemConfig() const = 0; // Returns the physical configuration of this LogSystem, that could be used to construct an equivalent LogSystem // using fromLogSystemConfig() + virtual LogSystemConfig getLogSystemConfig() const = 0; virtual Standalone getLogsValue() const = 0; - virtual Future onLogSystemConfigChange() = 0; // Returns when the log system configuration has changed due to a tlog rejoin. + virtual Future onLogSystemConfigChange() = 0; virtual void getPushLocations(VectorRef tags, std::vector& locations, @@ -741,16 +741,7 @@ struct LogPushData : NonCopyable { // Log subsequences have to start at 1 (the MergedPeekCursor relies on this to make sure we never have !hasMessage() // in the middle of data for a version - explicit LogPushData(Reference logSystem) : logSystem(logSystem), subsequence(1) { - for (auto& log : logSystem->getLogSystemConfig().tLogs) { - if (log.isLocal) { - for (int i = 0; i < log.tLogs.size(); i++) { - messagesWriter.push_back(BinaryWriter(AssumeVersion(g_network->protocolVersion()))); - } - } - } - messagesWritten = std::vector(messagesWriter.size(), false); - } + explicit LogPushData(Reference logSystem, int tlogCount); void addTxsTag(); diff --git a/fdbserver/ProxyCommitData.actor.h b/fdbserver/ProxyCommitData.actor.h index 5b9315da59..97e07b4670 100644 --- a/fdbserver/ProxyCommitData.actor.h +++ b/fdbserver/ProxyCommitData.actor.h @@ -228,6 +228,7 @@ struct ProxyCommitData { double lastResolverReset; std::map tenantMap; + int localTLogs = -1; // The tag related to a storage server rarely change, so we keep a vector of tags for each key range to be slightly // more CPU efficient. 
When a tag related to a storage server does change, we empty out all of these vectors to diff --git a/fdbserver/Resolver.actor.cpp b/fdbserver/Resolver.actor.cpp index e088de711a..173e081c1c 100644 --- a/fdbserver/Resolver.actor.cpp +++ b/fdbserver/Resolver.actor.cpp @@ -138,6 +138,7 @@ struct Resolver : ReferenceCounted { LogSystemDiskQueueAdapter* logAdapter = nullptr; Reference logSystem; IKeyValueStore* txnStateStore = nullptr; + int localTLogs = -1; std::map> storageCache; KeyRangeMap keyInfo; // keyrange -> all storage servers in all DCs for the keyrange @@ -318,7 +319,7 @@ ACTOR Future resolveBatch(Reference self, ResolveTransactionBatc if (SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS) { auto lockedKey = self->txnStateStore->readValue(databaseLockedKey).get(); isLocked = lockedKey.present() && lockedKey.get().size(); - toCommit.reset(new LogPushData(self->logSystem)); + toCommit.reset(new LogPushData(self->logSystem, self->localTLogs)); resolverData.reset(new ResolverData(self->dbgid, self->logSystem, self->txnStateStore, @@ -647,6 +648,7 @@ ACTOR Future resolverCore(ResolverInterface resolver, // Initialize txnStateStore self->logSystem = ILogSystem::fromServerDBInfo(resolver.id(), db->get(), false, addActor); + self->localTLogs = db->get().logSystemConfig.numLogs(); state PromiseStream> addActor; state Future onError = transformError(actorCollection(addActor.getFuture()), broken_promise(), resolver_failed()); From 3caa3bc59503db7c684d3107824b3f4668128fd1 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 6 May 2022 11:19:37 -0700 Subject: [PATCH 144/299] Rename variable names --- fdbserver/CommitProxyServer.actor.cpp | 4 ++-- fdbserver/ProxyCommitData.actor.h | 2 +- fdbserver/Resolver.actor.cpp | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fdbserver/CommitProxyServer.actor.cpp b/fdbserver/CommitProxyServer.actor.cpp index 04b96ee002..141adc296c 100644 --- a/fdbserver/CommitProxyServer.actor.cpp +++ b/fdbserver/CommitProxyServer.actor.cpp @@ -701,7 +701,7 @@ CommitBatchContext::CommitBatchContext(ProxyCommitData* const pProxyCommitData_, : pProxyCommitData(pProxyCommitData_), trs(std::move(*const_cast*>(trs_))), currentBatchMemBytesCount(currentBatchMemBytesCount), startTime(g_network->now()), localBatchNumber(++pProxyCommitData->localCommitBatchesStarted), - toCommit(pProxyCommitData->logSystem, pProxyCommitData->localTLogs), span("MP:commitBatch"_loc), + toCommit(pProxyCommitData->logSystem, pProxyCommitData->localTLogCount), span("MP:commitBatch"_loc), committed(trs.size()) { evaluateBatchSize(); @@ -2403,7 +2403,7 @@ ACTOR Future commitProxyServerCore(CommitProxyInterface proxy, //TraceEvent("ProxyInit3", proxy.id()); commitData.resolvers = commitData.db->get().resolvers; - commitData.localTLogs = commitData.db->get().logSystemConfig.numLogs(); + commitData.localTLogCount = commitData.db->get().logSystemConfig.numLogs(); ASSERT(commitData.resolvers.size() != 0); for (int i = 0; i < commitData.resolvers.size(); ++i) { commitData.stats.resolverDist.push_back( diff --git a/fdbserver/ProxyCommitData.actor.h b/fdbserver/ProxyCommitData.actor.h index 97e07b4670..c5e523dd3e 100644 --- a/fdbserver/ProxyCommitData.actor.h +++ b/fdbserver/ProxyCommitData.actor.h @@ -228,7 +228,7 @@ struct ProxyCommitData { double lastResolverReset; std::map tenantMap; - int localTLogs = -1; + int localTLogCount = -1; // The tag related to a storage server rarely change, so we keep a vector of tags for each key range to be slightly // more CPU efficient. 
When a tag related to a storage server does change, we empty out all of these vectors to diff --git a/fdbserver/Resolver.actor.cpp b/fdbserver/Resolver.actor.cpp index 173e081c1c..5357b32822 100644 --- a/fdbserver/Resolver.actor.cpp +++ b/fdbserver/Resolver.actor.cpp @@ -138,7 +138,7 @@ struct Resolver : ReferenceCounted { LogSystemDiskQueueAdapter* logAdapter = nullptr; Reference logSystem; IKeyValueStore* txnStateStore = nullptr; - int localTLogs = -1; + int localTLogCount = -1; std::map> storageCache; KeyRangeMap keyInfo; // keyrange -> all storage servers in all DCs for the keyrange @@ -319,7 +319,7 @@ ACTOR Future resolveBatch(Reference self, ResolveTransactionBatc if (SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS) { auto lockedKey = self->txnStateStore->readValue(databaseLockedKey).get(); isLocked = lockedKey.present() && lockedKey.get().size(); - toCommit.reset(new LogPushData(self->logSystem, self->localTLogs)); + toCommit.reset(new LogPushData(self->logSystem, self->localTLogCount)); resolverData.reset(new ResolverData(self->dbgid, self->logSystem, self->txnStateStore, @@ -648,7 +648,7 @@ ACTOR Future resolverCore(ResolverInterface resolver, // Initialize txnStateStore self->logSystem = ILogSystem::fromServerDBInfo(resolver.id(), db->get(), false, addActor); - self->localTLogs = db->get().logSystemConfig.numLogs(); + self->localTLogCount = db->get().logSystemConfig.numLogs(); state PromiseStream> addActor; state Future onError = transformError(actorCollection(addActor.getFuture()), broken_promise(), resolver_failed()); From 14e5fc5f24438a752f6a7e8a0ff0336f63b816db Mon Sep 17 00:00:00 2001 From: Renxuan Wang Date: Fri, 6 May 2022 14:53:56 -0700 Subject: [PATCH 145/299] Fix compatibility issue. (#7095) New added field should be the last in serializer. 
--- fdbclient/CoordinationInterface.h | 4 ++-- fdbserver/CoordinationInterface.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbclient/CoordinationInterface.h b/fdbclient/CoordinationInterface.h index 65a18cfa7c..c076ad8fbd 100644 --- a/fdbclient/CoordinationInterface.h +++ b/fdbclient/CoordinationInterface.h @@ -251,9 +251,9 @@ struct OpenDatabaseCoordRequest { traceLogGroup, knownClientInfoID, clusterKey, - hostnames, coordinators, - reply); + reply, + hostnames); } }; diff --git a/fdbserver/CoordinationInterface.h b/fdbserver/CoordinationInterface.h index 2c8c037afa..0589f5cde2 100644 --- a/fdbserver/CoordinationInterface.h +++ b/fdbserver/CoordinationInterface.h @@ -167,7 +167,7 @@ struct ElectionResultRequest { template void serialize(Ar& ar) { - serializer(ar, key, hostnames, coordinators, knownLeader, reply); + serializer(ar, key, coordinators, knownLeader, reply, hostnames); } }; From b0a00effa051a0c43cf47dd0f862d4156f63f235 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 6 May 2022 16:37:12 -0700 Subject: [PATCH 146/299] set max shard bandwidth --- fdbserver/DataDistribution.actor.h | 6 +++++- fdbserver/DataDistributionQueue.actor.cpp | 21 ++++++++++----------- fdbserver/DataDistributionTracker.actor.cpp | 6 ++++-- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index 6e394a38e4..6c3f1b4212 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -171,9 +171,13 @@ struct GetTopKMetricsRequest { MetricsComparator comparator; // Return true if a.score > b.score, return the largest topK in keys std::vector keys; Promise> reply; // topK storage metrics + double maxBytesReadPerKSecond = 0; // all returned shards won't exceed this read load GetTopKMetricsRequest() {} - GetTopKMetricsRequest(std::vector const& keys, int topK = 1) : topK(topK), keys(keys) {} + GetTopKMetricsRequest(std::vector const& keys, + int topK = 1, + double maxBytesReadPerKSecond = std::numeric_limits::max()) + : topK(topK), keys(keys), maxBytesReadPerKSecond(maxBytesReadPerKSecond) {} }; struct GetMetricsListRequest { diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index a00647115e..806cfc134d 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1536,12 +1536,20 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, traceEvent->detail("SkipReason", "SourceTeamThrottle"); return false; } + // check team difference + auto srcLoad = sourceTeam->getLoadReadBandwidth(false), destLoad = destTeam->getLoadReadBandwidth(); + traceEvent->detail("SrcReadBandwidth", srcLoad).detail("DestReadBandwidth", destLoad); + // read bandwidth difference is less than 30% of src load + if (0.7 * srcLoad <= destLoad) { + traceEvent->detail("SkipReason", "TeamTooSimilar"); + return false; + } // TODO: set 10 as a knob // randomly choose topK shards int topK = std::min(int(0.1 * shards.size()), 10); state Future healthMetrics = self->cx->getHealthMetrics(true); - state GetTopKMetricsRequest req(shards, topK); + state GetTopKMetricsRequest req(shards, topK, (srcLoad - destLoad) / 3.0); req.comparator = [](const StorageMetrics& a, const StorageMetrics& b) { return a.bytesReadPerKSecond / std::max(a.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES) > b.bytesReadPerKSecond / std::max(b.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES); @@ -1567,16 +1575,7 @@ ACTOR Future 
rebalanceReadLoad(DDQueueData* self, } auto& metrics = metricsList[chosenIdx]; - auto srcLoad = sourceTeam->getLoadReadBandwidth(false), destLoad = destTeam->getLoadReadBandwidth(); - traceEvent->detail("ShardReadBandwidth", metrics.bytesReadPerKSecond) - .detail("SrcReadBandwidth", srcLoad) - .detail("DestReadBandwidth", destLoad); - - if (srcLoad - destLoad <= - 5 * std::max(metrics.bytesReadPerKSecond, SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS)) { - traceEvent->detail("SkipReason", "TeamTooSimilar"); - return false; - } + traceEvent->detail("ShardReadBandwidth", metrics.bytesReadPerKSecond); // Verify the shard is still in ShardsAffectedByTeamFailure shards = self->shardsAffectedByTeamFailure->getShardsFor( ShardsAffectedByTeamFailure::Team(sourceTeam->getServerIDs(), primary)); diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index 32d24b0f8e..055b477111 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -857,8 +857,10 @@ ACTOR Future fetchTopKShardMetrics_impl(DataDistributionTracker* self, Get break; } - metrics.keys = range; - returnMetrics.push_back(metrics); + if (metrics.bytesReadPerKSecond <= req.maxBytesReadPerKSecond) { + metrics.keys = range; + returnMetrics.push_back(metrics); + } } if (!onChange.isValid()) { From c01f680d6280e795e9ac04370debd296e222a139 Mon Sep 17 00:00:00 2001 From: hao fu Date: Thu, 5 May 2022 13:44:10 -0700 Subject: [PATCH 147/299] add appendRaw and subTupleRawString for Tuple --- fdbclient/Tuple.cpp | 16 ++++++++++++++++ fdbclient/Tuple.h | 3 ++- fdbserver/storageserver.actor.cpp | 10 +++++++--- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/fdbclient/Tuple.cpp b/fdbclient/Tuple.cpp index ec74abaeac..77716b94ee 100644 --- a/fdbclient/Tuple.cpp +++ b/fdbclient/Tuple.cpp @@ -121,6 +121,13 @@ Tuple& Tuple::append(StringRef const& str, bool utf8) { return *this; } +Tuple& Tuple::appendRaw(StringRef const& str) { + offsets.push_back(data.size()); + + data.append(data.arena(), str.begin(), str.size()); + return *this; +} + Tuple& Tuple::append(int64_t value) { uint64_t swap = value; bool neg = false; @@ -383,3 +390,12 @@ Tuple Tuple::subTuple(size_t start, size_t end) const { size_t endPos = end < offsets.size() ? offsets[end] : data.size(); return Tuple(StringRef(data.begin() + offsets[start], endPos - offsets[start])); } + +StringRef Tuple::subTupleRawString(size_t index) const { + if (index >= offsets.size()) { + return StringRef(); + } + size_t end = index + 1; + size_t endPos = end < offsets.size() ? 
offsets[end] : data.size(); + return StringRef(data.begin() + offsets[index], endPos - offsets[index]); +} diff --git a/fdbclient/Tuple.h b/fdbclient/Tuple.h index 2ff31bf990..6201bfdfcf 100644 --- a/fdbclient/Tuple.h +++ b/fdbclient/Tuple.h @@ -36,6 +36,7 @@ struct Tuple { static Tuple unpack(StringRef const& str, bool exclude_incomplete = false); Tuple& append(Tuple const& tuple); + Tuple& appendRaw(StringRef const& str); Tuple& append(StringRef const& str, bool utf8 = false); Tuple& append(int64_t); // There are some ambiguous append calls in fdbclient, so to make it easier @@ -61,7 +62,7 @@ struct Tuple { data.clear(); offsets.clear(); } - + StringRef subTupleRawString(size_t index) const; ElementType getType(size_t index) const; Standalone getString(size_t index) const; int64_t getInt(size_t index, bool allow_incomplete = false) const; diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 523555a771..cbc57e3673 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -3551,10 +3551,14 @@ void preprocessMappedKey(Tuple& mappedKeyFormatTuple, std::vector()); isRangeQuery = true; } else { - vt.push_back(Optional(mappedKeyFormatTuple.subTuple(i, i + 1))); + Tuple t; + t.appendRaw(mappedKeyFormatTuple.subTupleRawString(i)); + vt.push_back(Optional(t)); } } else { - vt.push_back(Optional(mappedKeyFormatTuple.subTuple(i, i + 1))); + Tuple t; + t.appendRaw(mappedKeyFormatTuple.subTupleRawString(i)); + vt.push_back(Optional(t)); } } } @@ -3598,7 +3602,7 @@ Key constructMappedKey(KeyValueRef* keyValue, if (idx < 0 || idx >= referenceTuple->size()) { throw mapper_bad_index(); } - mappedKeyTuple.append(referenceTuple->subTuple(idx, idx + 1)); + mappedKeyTuple.appendRaw(referenceTuple->subTupleRawString(idx)); } } From 4804bb21cf088bc8e118bff65163459fff543c3c Mon Sep 17 00:00:00 2001 From: hao fu Date: Fri, 6 May 2022 13:12:00 -0700 Subject: [PATCH 148/299] Fix comments --- fdbclient/Tuple.h | 3 +++ fdbserver/storageserver.actor.cpp | 10 +++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fdbclient/Tuple.h b/fdbclient/Tuple.h index 6201bfdfcf..9db4d7ba4c 100644 --- a/fdbclient/Tuple.h +++ b/fdbclient/Tuple.h @@ -36,6 +36,8 @@ struct Tuple { static Tuple unpack(StringRef const& str, bool exclude_incomplete = false); Tuple& append(Tuple const& tuple); + + // the str needs to be a Tuple encoded string. Tuple& appendRaw(StringRef const& str); Tuple& append(StringRef const& str, bool utf8 = false); Tuple& append(int64_t); @@ -62,6 +64,7 @@ struct Tuple { data.clear(); offsets.clear(); } + // Return a Tuple encoded raw string. 
StringRef subTupleRawString(size_t index) const; ElementType getType(size_t index) const; Standalone getString(size_t index) const; diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index cbc57e3673..55e9405716 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -3538,27 +3538,27 @@ void preprocessMappedKey(Tuple& mappedKeyFormatTuple, std::vector(escapedTuple)); + vt.emplace_back(escapedTuple); } else if (singleKeyOrValue(s, sz)) { // when it is SingleKeyOrValue, insert an empty Tuple to vector as placeholder - vt.push_back(Optional(Tuple())); + vt.emplace_back(Tuple()); } else if (rangeQuery(s)) { if (i != mappedKeyFormatTuple.size() - 1) { // It must be the last element of the mapper tuple throw mapper_bad_range_decriptor(); } // when it is rangeQuery, insert Optional.empty as placeholder - vt.push_back(Optional()); + vt.emplace_back(Optional()); isRangeQuery = true; } else { Tuple t; t.appendRaw(mappedKeyFormatTuple.subTupleRawString(i)); - vt.push_back(Optional(t)); + vt.emplace_back(t); } } else { Tuple t; t.appendRaw(mappedKeyFormatTuple.subTupleRawString(i)); - vt.push_back(Optional(t)); + vt.emplace_back(t); } } } From b53c9fe8515f924c32534608e05b7519e257fdd1 Mon Sep 17 00:00:00 2001 From: "Johannes M. Scheuermann" Date: Sat, 7 May 2022 09:26:00 +0100 Subject: [PATCH 149/299] Make use of relpath instead of basepath --- packaging/docker/sidecar.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packaging/docker/sidecar.py b/packaging/docker/sidecar.py index 39c4685bc7..a9c618c129 100755 --- a/packaging/docker/sidecar.py +++ b/packaging/docker/sidecar.py @@ -519,13 +519,13 @@ class Server(BaseHTTPRequestHandler): if self.path.startswith("/check_hash/"): try: self.send_text( - check_hash(os.path.basename(self.path)), add_newline=False + check_hash(os.path.relpath(self.path, "/check_hash")), add_newline=False ) except FileNotFoundError: self.send_error(404, "Path not found") self.end_headers() if self.path.startswith("/is_present/"): - if is_present(os.path.basename(self.path)): + if is_present(os.path.relpath(self.path, "/is_present")): self.send_text("OK") else: self.send_error(404, "Path not found") From 33ae398268bbc2ec497e4f22b3c61204a182129c Mon Sep 17 00:00:00 2001 From: Ata E Husain Bohra Date: Sat, 7 May 2022 13:18:35 -0700 Subject: [PATCH 150/299] REST KmsConnector implementation (#6994) * REST KmsConnector implementation Description diff-1: Address review comments. Add utility interface to Platform namespace to create and operate on tmpfile diff-2: Address review comments Link Boost::filesystem to CMake build process Major changes includes: 1. Implement REST based KmsConnector implementation. 2. Salient features of the connector: 2.1. Two required configuration are: a. Discovery KMS URLs - enable KMS discovery on bootstrap b. Endpoint path configuration to construct URI to fetch/refresh encryption keys c. Configuration to provide "validationTokens" to connect with external KMS. Patch implements file-based token validation scheme. 2.2. On startup, RESTKmsConnector discovers KMS Urls and caches them in-memory. Extracts "validationTokens" based on input config. 2.3. Expose endpoints to allow fetch/refresh of encryption keys. 2.4. Defines JSON format to interact with external KMS - request & response payload format. 3. Extend Platform namespace with an interface to create and operate on tmp files. 4. 
Update Platform 'readFileBytes' and 'writeFileBytes' to leverage fstream supported implementation. NOTE: KMS URLs fetched after initial discovery will be persisted using DynamicKnobs. It is TODO at the moment and shall be completed once DynamicKnobs is feature complete Testing Unit test to validation following: 1. Parsing on "validation tokens" logic. 2. Construction and parsing of REST JSON request and response strings. --- cmake/CompileBoost.cmake | 25 +- fdbclient/ServerKnobs.cpp | 23 +- fdbclient/ServerKnobs.h | 11 + fdbrpc/RESTUtils.actor.cpp | 69 +- fdbserver/CMakeLists.txt | 2 + fdbserver/EncryptKeyProxy.actor.cpp | 5 +- fdbserver/RESTKmsConnector.actor.cpp | 1297 +++++++++++++++++++++++ fdbserver/RESTKmsConnector.actor.h | 37 + fdbserver/fdbserver.actor.cpp | 17 +- fdbserver/workloads/UnitTests.actor.cpp | 2 + flow/Platform.actor.cpp | 138 ++- flow/Platform.h | 25 + flow/error_definitions.h | 2 + tests/CMakeLists.txt | 2 + tests/fast/RESTKmsConnectorUnit.toml | 8 + tests/fast/RESTUtilsUnit.toml | 8 + 16 files changed, 1578 insertions(+), 93 deletions(-) create mode 100644 fdbserver/RESTKmsConnector.actor.cpp create mode 100644 fdbserver/RESTKmsConnector.actor.h create mode 100644 tests/fast/RESTKmsConnectorUnit.toml create mode 100644 tests/fast/RESTUtilsUnit.toml diff --git a/cmake/CompileBoost.cmake b/cmake/CompileBoost.cmake index 52788e68ce..d73d0ab024 100644 --- a/cmake/CompileBoost.cmake +++ b/cmake/CompileBoost.cmake @@ -9,7 +9,7 @@ function(compile_boost) # Configure bootstrap command set(BOOTSTRAP_COMMAND "./bootstrap.sh") - set(BOOTSTRAP_LIBRARIES "context") + set(BOOTSTRAP_LIBRARIES "context,filesystem") set(BOOST_CXX_COMPILER "${CMAKE_CXX_COMPILER}") if(CLANG) @@ -57,15 +57,20 @@ function(compile_boost) INSTALL_COMMAND "" UPDATE_COMMAND "" BUILD_BYPRODUCTS "${BOOST_INSTALL_DIR}/boost/config.hpp" - "${BOOST_INSTALL_DIR}/lib/libboost_context.a") + "${BOOST_INSTALL_DIR}/lib/libboost_context.a" + "${BOOST_INSTALL_DIR}/lib/libboost_filesystem.a") add_library(${COMPILE_BOOST_TARGET}_context STATIC IMPORTED) add_dependencies(${COMPILE_BOOST_TARGET}_context ${COMPILE_BOOST_TARGET}Project) set_target_properties(${COMPILE_BOOST_TARGET}_context PROPERTIES IMPORTED_LOCATION "${BOOST_INSTALL_DIR}/lib/libboost_context.a") + add_library(${COMPILE_BOOST_TARGET}_filesystem STATIC IMPORTED) + add_dependencies(${COMPILE_BOOST_TARGET}_filesystem ${COMPILE_BOOST_TARGET}Project) + set_target_properties(${COMPILE_BOOST_TARGET}_filesystem PROPERTIES IMPORTED_LOCATION "${BOOST_INSTALL_DIR}/lib/libboost_filesystem.a") + add_library(${COMPILE_BOOST_TARGET} INTERFACE) target_include_directories(${COMPILE_BOOST_TARGET} SYSTEM INTERFACE ${BOOST_INSTALL_DIR}/include) - target_link_libraries(${COMPILE_BOOST_TARGET} INTERFACE ${COMPILE_BOOST_TARGET}_context) + target_link_libraries(${COMPILE_BOOST_TARGET} INTERFACE ${COMPILE_BOOST_TARGET}_context ${COMPILE_BOOST_TARGET}_filesystem) endfunction(compile_boost) @@ -91,11 +96,11 @@ set(Boost_USE_STATIC_LIBS ON) if (UNIX AND CMAKE_CXX_COMPILER_ID MATCHES "Clang$") list(APPEND CMAKE_PREFIX_PATH /opt/boost_1_78_0_clang) set(BOOST_HINT_PATHS /opt/boost_1_78_0_clang) - message(STATUS "Using Clang version of boost::context") + message(STATUS "Using Clang version of boost::context and boost::filesystem") else () list(APPEND CMAKE_PREFIX_PATH /opt/boost_1_78_0) set(BOOST_HINT_PATHS /opt/boost_1_78_0) - message(STATUS "Using g++ version of boost::context") + message(STATUS "Using g++ version of boost::context and boost::filesystem") endif () if(BOOST_ROOT) @@ 
-107,18 +112,18 @@ if(WIN32) # properly for config mode. So we use the old way on Windows # find_package(Boost 1.72.0 EXACT QUIET REQUIRED CONFIG PATHS ${BOOST_HINT_PATHS}) # I think depending on the cmake version this will cause weird warnings - find_package(Boost 1.72) + find_package(Boost 1.72 COMPONENTS filesystem) add_library(boost_target INTERFACE) - target_link_libraries(boost_target INTERFACE Boost::boost) + target_link_libraries(boost_target INTERFACE Boost::boost Boost::filesystem) return() endif() -find_package(Boost 1.78.0 EXACT QUIET COMPONENTS context CONFIG PATHS ${BOOST_HINT_PATHS}) +find_package(Boost 1.78.0 EXACT QUIET COMPONENTS context filesystem CONFIG PATHS ${BOOST_HINT_PATHS}) set(FORCE_BOOST_BUILD OFF CACHE BOOL "Forces cmake to build boost and ignores any installed boost") -if(Boost_FOUND AND NOT FORCE_BOOST_BUILD) +if(Boost_FOUND AND Boost_filesystem_FOUND AND Boost_context_FOUND AND NOT FORCE_BOOST_BUILD) add_library(boost_target INTERFACE) - target_link_libraries(boost_target INTERFACE Boost::boost Boost::context) + target_link_libraries(boost_target INTERFACE Boost::boost Boost::context_FOUND Boost::filesystem) elseif(WIN32) message(FATAL_ERROR "Could not find Boost") else() diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index e5d64bf32f..42dddd5a3b 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -858,9 +858,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( ENCRYPTION_MODE, "AES-256-CTR"); init( SIM_KMS_MAX_KEYS, 4096); - // Support KmsConnector types are: - // KMS_CONNECTOR_TYPE_HTTP -> 1 - init( KMS_CONNECTOR_TYPE, "HttpKmsConnector"); + // KMS connector type + init( KMS_CONNECTOR_TYPE, "RESTKmsConnector"); // Blob granlues init( BG_URL, isSimulated ? "file://fdbblob/" : "" ); // TODO: store in system key space or something, eventually @@ -887,6 +886,24 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( BGCC_TIMEOUT, isSimulated ? 10.0 : 120.0 ); init( BGCC_MIN_INTERVAL, isSimulated ? 1.0 : 10.0 ); + // HTTP KMS Connector + init( REST_KMS_CONNECTOR_KMS_DISCOVERY_URL_MODE, "file"); + init( REST_KMS_CONNECTOR_VALIDATION_TOKEN_MODE, "file"); + init( REST_KMS_CONNECTOR_VALIDATION_TOKEN_MAX_SIZE, 1024); + init( REST_KMS_CONNECTOR_VALIDATION_TOKENS_MAX_PAYLOAD_SIZE, 10 * 1024); + init( REST_KMS_CONNECTOR_REFRESH_KMS_URLS, true); + init( REST_KMS_CONNECTOR_REFRESH_KMS_URLS_INTERVAL_SEC, 600); + // Below KMS configurations are responsible for: + // Discovering KMS URLs, fetch encryption keys endpoint and validation token details. + // Configurations are expected to be passed as command-line arguments. + // NOTE: Care must be taken when attempting to update below configurations for a up/running FDB cluster. + init( REST_KMS_CONNECTOR_DISCOVER_KMS_URL_FILE, ""); + init( REST_KMS_CONNECTOR_GET_ENCRYPTION_KEYS_ENDPOINT, ""); + // Details to fetch validation token from a localhost file + // acceptable format: "#,#,.." 
+ // NOTE: 'token-name" can NOT contain '#' character + init( REST_KMS_CONNECTOR_VALIDATION_TOKEN_DETAILS, ""); + // clang-format on if (clientKnobs) { diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index 9ef13d43fd..0176011fec 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -852,6 +852,17 @@ public: double BGCC_TIMEOUT; double BGCC_MIN_INTERVAL; + // HTTP KMS Connector + std::string REST_KMS_CONNECTOR_KMS_DISCOVERY_URL_MODE; + std::string REST_KMS_CONNECTOR_DISCOVER_KMS_URL_FILE; + std::string REST_KMS_CONNECTOR_VALIDATION_TOKEN_MODE; + std::string REST_KMS_CONNECTOR_VALIDATION_TOKEN_DETAILS; + int REST_KMS_CONNECTOR_VALIDATION_TOKEN_MAX_SIZE; + int REST_KMS_CONNECTOR_VALIDATION_TOKENS_MAX_PAYLOAD_SIZE; + bool REST_KMS_CONNECTOR_REFRESH_KMS_URLS; + double REST_KMS_CONNECTOR_REFRESH_KMS_URLS_INTERVAL_SEC; + std::string REST_KMS_CONNECTOR_GET_ENCRYPTION_KEYS_ENDPOINT; + ServerKnobs(Randomize, ClientKnobs*, IsSimulated); void initialize(Randomize, ClientKnobs*, IsSimulated); }; diff --git a/fdbrpc/RESTUtils.actor.cpp b/fdbrpc/RESTUtils.actor.cpp index 43f661224d..c07b29f8d7 100644 --- a/fdbrpc/RESTUtils.actor.cpp +++ b/fdbrpc/RESTUtils.actor.cpp @@ -202,8 +202,7 @@ void RESTUrl::parseUrl(const std::string& fullUrl, const bool isSecure) { // Only used to link unit tests void forceLinkRESTUtilsTests() {} -TEST_CASE("fdbrpc/RESTUtils") { - // invalid protocol +TEST_CASE("/RESTUtils/InvalidProtocol") { try { std::string uri("httpx://foo/bar"); RESTUrl r(uri, false); @@ -213,7 +212,10 @@ TEST_CASE("fdbrpc/RESTUtils") { throw e; } } + return Void(); +} +TEST_CASE("/RESTUtils/MismatchKnobVal") { // mismatch protocol and knob values try { std::string uri("http://foo/bar"); @@ -224,8 +226,10 @@ TEST_CASE("fdbrpc/RESTUtils") { throw e; } } + return Void(); +} - // missing host +TEST_CASE("/RESTUtils/MissingHost") { try { std::string uri("https://:/bar"); RESTUrl r(uri, true); @@ -235,42 +239,33 @@ TEST_CASE("fdbrpc/RESTUtils") { throw e; } } + return Void(); +} - // valid URI with service - try { - std::string uri("https://host:80/foo/bar"); - RESTUrl r(uri, true); - ASSERT_EQ(r.host.compare("host"), 0); - ASSERT_EQ(r.service.compare("80"), 0); - ASSERT_EQ(r.resource.compare("foo/bar"), 0); - } catch (Error& e) { - throw e; - } +TEST_CASE("/RESTUtils/ValidURIWithService") { + std::string uri("https://host:80/foo/bar"); + RESTUrl r(uri, true); + ASSERT_EQ(r.host.compare("host"), 0); + ASSERT_EQ(r.service.compare("80"), 0); + ASSERT_EQ(r.resource.compare("foo/bar"), 0); + return Void(); +} - // valid URI with-out service - try { - std::string uri("https://host/foo/bar"); - RESTUrl r(uri, true); - ASSERT_EQ(r.host.compare("host"), 0); - ASSERT(r.service.empty()); - ASSERT_EQ(r.resource.compare("foo/bar"), 0); - } catch (Error& e) { - throw e; - } - - // valid URI with parameters - try { - std::string uri("https://host/foo/bar?param1,param2"); - RESTUrl r(uri, true); - ASSERT_EQ(r.host.compare("host"), 0); - ASSERT(r.service.empty()); - ASSERT_EQ(r.resource.compare("foo/bar"), 0); - ASSERT_EQ(r.reqParameters.compare("param1,param2"), 0); - } catch (Error& e) { - throw e; - } - - // ensure RESTClient::Knob default values and updates +TEST_CASE("/RESTUtils/ValidURIWithoutService") { + std::string uri("https://host/foo/bar"); + RESTUrl r(uri, true); + ASSERT_EQ(r.host.compare("host"), 0); + ASSERT(r.service.empty()); + ASSERT_EQ(r.resource.compare("foo/bar"), 0); + return Void(); +} +TEST_CASE("/RESTUtils/ValidURIWithParams") { + std::string 
uri("https://host/foo/bar?param1,param2"); + RESTUrl r(uri, true); + ASSERT_EQ(r.host.compare("host"), 0); + ASSERT(r.service.empty()); + ASSERT_EQ(r.resource.compare("foo/bar"), 0); + ASSERT_EQ(r.reqParameters.compare("param1,param2"), 0); return Void(); } \ No newline at end of file diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index f6d56ebe41..337d5345d6 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -105,6 +105,8 @@ set(FDBSERVER_SRCS RecoveryState.h RemoteIKeyValueStore.actor.h RemoteIKeyValueStore.actor.cpp + RESTKmsConnector.actor.h + RESTKmsConnector.actor.cpp ResolutionBalancer.actor.cpp ResolutionBalancer.actor.h Resolver.actor.cpp diff --git a/fdbserver/EncryptKeyProxy.actor.cpp b/fdbserver/EncryptKeyProxy.actor.cpp index 2026d8307c..03d0874f1a 100644 --- a/fdbserver/EncryptKeyProxy.actor.cpp +++ b/fdbserver/EncryptKeyProxy.actor.cpp @@ -24,6 +24,7 @@ #include "fdbserver/KmsConnector.h" #include "fdbserver/KmsConnectorInterface.h" #include "fdbserver/Knobs.h" +#include "fdbserver/RESTKmsConnector.actor.h" #include "fdbserver/ServerDBInfo.actor.h" #include "fdbserver/SimKmsConnector.actor.h" #include "fdbserver/WorkerInterface.actor.h" @@ -335,8 +336,8 @@ void refreshEncryptionKeys(Reference ekpProxyData, KmsConne void activateKmsConnector(Reference ekpProxyData, KmsConnectorInterface kmsConnectorInf) { if (g_network->isSimulated()) { ekpProxyData->kmsConnector = std::make_unique(); - } else if (SERVER_KNOBS->KMS_CONNECTOR_TYPE.compare("HttpKmsConnector")) { - throw not_implemented(); + } else if (SERVER_KNOBS->KMS_CONNECTOR_TYPE.compare("RESTKmsConnector")) { + ekpProxyData->kmsConnector = std::make_unique(); } else { throw not_implemented(); } diff --git a/fdbserver/RESTKmsConnector.actor.cpp b/fdbserver/RESTKmsConnector.actor.cpp new file mode 100644 index 0000000000..be4023ca51 --- /dev/null +++ b/fdbserver/RESTKmsConnector.actor.cpp @@ -0,0 +1,1297 @@ +/* + * RESTKmsConnector.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "fdbserver/RESTKmsConnector.actor.h" + +#include "fdbclient/FDBTypes.h" +#include "fdbclient/rapidjson/document.h" +#include "fdbclient/rapidjson/rapidjson.h" +#include "fdbclient/rapidjson/stringbuffer.h" +#include "fdbclient/rapidjson/writer.h" +#include "fdbrpc/HTTP.h" +#include "fdbrpc/IAsyncFile.h" +#include "fdbserver/KmsConnectorInterface.h" +#include "fdbserver/Knobs.h" +#include "fdbrpc/RESTClient.h" +#include "flow/Arena.h" +#include "flow/EncryptUtils.h" +#include "flow/Error.h" +#include "flow/FastRef.h" +#include "flow/IRandom.h" +#include "flow/Platform.h" +#include "flow/Trace.h" +#include "flow/UnitTest.h" + +#include +#include +#include +#include +#include +#include + +#include "flow/actorcompiler.h" // This must be the last #include + +namespace { +const char* BASE_CIPHER_ID_TAG = "base_cipher_id"; +const char* BASE_CIPHER_TAG = "baseCipher"; +const char* CIPHER_KEY_DETAILS_TAG = "cipher_key_details"; +const char* ENCRYPT_DOMAIN_ID_TAG = "encrypt_domain_id"; +const char* ERROR_TAG = "error"; +const char* ERROR_DETAIL_TAG = "details"; +const char* KMS_URLS_TAG = "kms_urls"; +const char* QUERY_MODE_TAG = "query_mode"; +const char* REFRESH_KMS_URLS_TAG = "refresh_kms_urls"; +const char* VALIDATION_TOKENS_TAG = "validation_tokens"; +const char* VALIDATION_TOKEN_NAME_TAG = "token_name"; +const char* VALIDATION_TOKEN_VALUE_TAG = "token_value"; + +const char* TOKEN_NAME_FILE_SEP = "#"; +const char* TOKEN_TUPLE_SEP = ","; +const char DISCOVER_URL_FILE_URL_SEP = '\n'; + +const char* QUERY_MODE_LOOKUP_BY_DOMAIN_ID = "lookupByDomainId"; +const char* QUERY_MODE_LOOKUP_BY_KEY_ID = "lookupByKeyId"; + +bool canReplyWith(Error e) { + switch (e.code()) { + case error_code_encrypt_invalid_kms_config: + case error_code_encrypt_keys_fetch_failed: + case error_code_file_not_found: + case error_code_file_too_large: + case error_code_http_request_failed: + case error_code_io_error: + case error_code_operation_failed: + case error_code_value_too_large: + return true; + default: + return false; + } +} +} // namespace + +struct KmsUrlCtx { + std::string url; + uint64_t nRequests; + uint64_t nFailedResponses; + uint64_t nResponseParseFailures; + + KmsUrlCtx() : url(""), nRequests(0), nFailedResponses(0), nResponseParseFailures(0) {} + explicit KmsUrlCtx(const std::string& u) : url(u), nRequests(0), nFailedResponses(0), nResponseParseFailures(0) {} + + bool operator<(const KmsUrlCtx& toCompare) const { + if (nFailedResponses != toCompare.nFailedResponses) { + return nFailedResponses > toCompare.nFailedResponses; + } + return nResponseParseFailures > toCompare.nResponseParseFailures; + } +}; + +enum class ValidationTokenSource { + VALIDATION_TOKEN_SOURCE_FILE = 1, + VALIDATION_TOKEN_SOURCE_LAST // Always the last element +}; + +struct ValidationTokenCtx { + std::string name; + std::string value; + ValidationTokenSource source; + Optional filePath; + + explicit ValidationTokenCtx(const std::string& n, ValidationTokenSource s) + : name(n), value(""), source(s), filePath(Optional()), readTS(now()) {} + double getReadTS() const { return readTS; } + +private: + double readTS; // Approach assists refreshing token based on time of creation +}; + +using KmsUrlMinHeap = std::priority_queue, + std::vector>, + std::less>::value_type>>; + +struct RESTKmsConnectorCtx : public ReferenceCounted { + UID uid; + KmsUrlMinHeap kmsUrlHeap; + double lastKmsUrlsRefreshTs; + RESTClient restClient; + std::unordered_map validationTokens; + + RESTKmsConnectorCtx() : 
uid(deterministicRandom()->randomUniqueID()), lastKmsUrlsRefreshTs(0) {} + explicit RESTKmsConnectorCtx(const UID& id) : uid(id), lastKmsUrlsRefreshTs(0) {} +}; + +std::string getEncryptionKeysFullUrl(Reference ctx, const std::string& url) { + if (SERVER_KNOBS->REST_KMS_CONNECTOR_GET_ENCRYPTION_KEYS_ENDPOINT.empty()) { + TraceEvent("GetEncryptionKeysFullUrl_EmptyEndpoint", ctx->uid).log(); + throw encrypt_invalid_kms_config(); + } + + std::string fullUrl(url); + return fullUrl.append("/").append(SERVER_KNOBS->REST_KMS_CONNECTOR_GET_ENCRYPTION_KEYS_ENDPOINT); +} + +void dropCachedKmsUrls(Reference ctx) { + while (!ctx->kmsUrlHeap.empty()) { + std::shared_ptr curUrl = ctx->kmsUrlHeap.top(); + + TraceEvent("DropCachedKmsUrls", ctx->uid) + .detail("Url", curUrl->url) + .detail("NumRequests", curUrl->nRequests) + .detail("NumFailedResponses", curUrl->nFailedResponses) + .detail("NumRespParseFailures", curUrl->nResponseParseFailures); + + ctx->kmsUrlHeap.pop(); + } +} + +bool shouldRefreshKmsUrls(Reference ctx) { + if (!SERVER_KNOBS->REST_KMS_CONNECTOR_REFRESH_KMS_URLS) { + return false; + } + + return (now() - ctx->lastKmsUrlsRefreshTs) > SERVER_KNOBS->REST_KMS_CONNECTOR_REFRESH_KMS_URLS_INTERVAL_SEC; +} + +void extractKmsUrls(Reference ctx, rapidjson::Document& doc, Reference httpResp) { + // Refresh KmsUrls cache + dropCachedKmsUrls(ctx); + ASSERT(ctx->kmsUrlHeap.empty()); + + for (const auto& url : doc[KMS_URLS_TAG].GetArray()) { + if (!url.IsString()) { + TraceEvent("DiscoverKmsUrls_MalformedResp", ctx->uid).detail("ResponseContent", httpResp->content); + throw operation_failed(); + } + + std::string urlStr; + urlStr.resize(url.GetStringLength()); + memcpy(urlStr.data(), url.GetString(), url.GetStringLength()); + + TraceEvent("DiscoverKmsUrls_AddUrl", ctx->uid).detail("Url", urlStr); + + ctx->kmsUrlHeap.emplace(std::make_shared(urlStr)); + } + + // Update Kms URLs refresh timestamp + ctx->lastKmsUrlsRefreshTs = now(); +} + +ACTOR Future parseDiscoverKmsUrlFile(Reference ctx, std::string filename) { + if (filename.empty() || !fileExists(filename)) { + TraceEvent("DiscoverKmsUrls_FileNotFound", ctx->uid).log(); + throw encrypt_invalid_kms_config(); + } + + state Reference dFile = wait(IAsyncFileSystem::filesystem()->open( + filename, IAsyncFile::OPEN_NO_AIO | IAsyncFile::OPEN_READONLY | IAsyncFile::OPEN_UNCACHED, 0644)); + state int64_t fSize = wait(dFile->size()); + state Standalone buff = makeString(fSize); + int bytesRead = wait(dFile->read(mutateString(buff), fSize, 0)); + if (bytesRead != fSize) { + TraceEvent("DiscoveryKmsUrl_FileReadShort", ctx->uid) + .detail("Filename", filename) + .detail("Expected", fSize) + .detail("Actual", bytesRead); + throw io_error(); + } + + // Acceptable file format (new line character separated URLs): + // \n + // \n + + std::stringstream ss(buff.toString()); + std::string url; + while (std::getline(ss, url, DISCOVER_URL_FILE_URL_SEP)) { + std::string trimedUrl = boost::trim_copy(url); + if (trimedUrl.empty()) { + // Empty URL, ignore and continue + continue; + } + TraceEvent(SevDebug, "DiscoverKmsUrls_AddUrl", ctx->uid).detail("Url", url); + ctx->kmsUrlHeap.emplace(std::make_shared(url)); + } + + return Void(); +} + +ACTOR Future discoverKmsUrls(Reference ctx, bool refreshPersistedUrls) { + // KMS discovery needs to be done in two scenarios: + // 1) Initial cluster bootstrap - first boot. + // 2) Requests to all cached KMS URLs is failing for some reason. 
+ // + // Following steps are followed as part of KMS discovery: + // 1) Based on the configured KMS URL discovery mode, the KMS URLs are extracted and persited in a DynamicKnob + // enabled configuration knob. Approach allows relying on the parsing configuration supplied discovery URL mode only + // during afte the initial boot, from then on, the URLs can periodically refreshed along with encryption key fetch + // requests (SERVER_KNOBS->REST_KMS_CONNECTOR_REFRESH_KMS_URLS needs to be enabled). + // 2) Cluster will continue using cached KMS URLs (and refreshing them if needed); however, if for some reason, all + // cached URLs aren't working, then code re-discovers the URL following step#1 and refresh persisted state as well. + + if (!refreshPersistedUrls) { + // TODO: request must be satisfied accessing KMS URLs persited using DynamicKnobs. Will be implemented once + // feature is available + } + + std::string_view mode{ SERVER_KNOBS->REST_KMS_CONNECTOR_VALIDATION_TOKEN_MODE }; + + if (mode.compare("file") == 0) { + wait(parseDiscoverKmsUrlFile(ctx, SERVER_KNOBS->REST_KMS_CONNECTOR_DISCOVER_KMS_URL_FILE)); + } else { + throw not_implemented(); + } + + return Void(); +} + +void parseKmsResponse(Reference ctx, + Reference resp, + Arena* arena, + std::vector* outCipherKeyDetails) { + // Acceptable response payload json format: + // + // response_json_payload { + // "cipher_key_details" : [ + // { + // "base_cipher_id" : , + // "encrypt_domain_id" : , + // "base_cipher" : + // }, + // { + // .... + // } + // ], + // "kms_urls" : [ + // "url1", "url2", ... + // ], + // "error" : { + // "details":
+ // } // Optional, populated by the KMS, if present, rest of payload is ignored. + // } + + if (resp->code != HTTP::HTTP_STATUS_CODE_OK) { + // STATUS_OK is gating factor for REST request success + throw http_request_failed(); + } + + rapidjson::Document doc; + doc.Parse(resp->content.c_str()); + + // Check if response has error + if (doc.HasMember(ERROR_TAG)) { + if (doc[ERROR_TAG].HasMember(ERROR_DETAIL_TAG) && doc[ERROR_TAG][ERROR_DETAIL_TAG].IsString()) { + Standalone errRef = makeString(doc[ERROR_TAG][ERROR_DETAIL_TAG].GetStringLength()); + memcpy(mutateString(errRef), + doc[ERROR_TAG][ERROR_DETAIL_TAG].GetString(), + doc[ERROR_TAG][ERROR_DETAIL_TAG].GetStringLength()); + TraceEvent("KMSErrorResponse", ctx->uid).detail("ErrorDetails", errRef.toString()); + } else { + TraceEvent("KMSErrorResponse_EmptyDetails", ctx->uid).log(); + } + + throw encrypt_keys_fetch_failed(); + } + + // Extract CipherKeyDetails + if (!doc.HasMember(CIPHER_KEY_DETAILS_TAG) || !doc[CIPHER_KEY_DETAILS_TAG].IsArray()) { + TraceEvent("ParseKmsResponse_FailureMissingCipherKeyDetails", ctx->uid).log(); + throw operation_failed(); + } + + for (const auto& cipherDetail : doc[CIPHER_KEY_DETAILS_TAG].GetArray()) { + if (!cipherDetail.IsObject()) { + TraceEvent("ParseKmsResponse_FailureEncryptKeyDetailsNotObject", ctx->uid) + .detail("Type", cipherDetail.GetType()); + throw operation_failed(); + } + + const bool isBaseCipherIdPresent = cipherDetail.HasMember(BASE_CIPHER_ID_TAG); + const bool isBaseCipherPresent = cipherDetail.HasMember(BASE_CIPHER_TAG); + const bool isEncryptDomainIdPresent = cipherDetail.HasMember(ENCRYPT_DOMAIN_ID_TAG); + if (!isBaseCipherIdPresent || !isBaseCipherPresent || !isEncryptDomainIdPresent) { + TraceEvent("ParseKmsResponse_MalformedKeyDetail", ctx->uid) + .detail("BaseCipherIdPresent", isBaseCipherIdPresent) + .detail("BaseCipherPresent", isBaseCipherPresent) + .detail("EncryptDomainIdPresent", isEncryptDomainIdPresent); + throw operation_failed(); + } + + const int cipherKeyLen = cipherDetail[BASE_CIPHER_TAG].GetStringLength(); + std::unique_ptr cipherKey = std::make_unique(cipherKeyLen); + memcpy(cipherKey.get(), cipherDetail[BASE_CIPHER_TAG].GetString(), cipherKeyLen); + outCipherKeyDetails->emplace_back(cipherDetail[ENCRYPT_DOMAIN_ID_TAG].GetInt64(), + cipherDetail[BASE_CIPHER_ID_TAG].GetUint64(), + StringRef(cipherKey.get(), cipherKeyLen), + *arena); + } + + if (doc.HasMember(KMS_URLS_TAG)) { + try { + extractKmsUrls(ctx, doc, resp); + } catch (Error& e) { + TraceEvent("RefreshKmsUrls_Failed", ctx->uid).error(e); + // Given cipherKeyDetails extraction was done successfully, ignore KmsUrls parsing error + } + } +} + +void addQueryModeSection(Reference ctx, rapidjson::Document& doc, const char* mode) { + rapidjson::Value key(QUERY_MODE_TAG, doc.GetAllocator()); + rapidjson::Value queryMode; + queryMode.SetString(mode, doc.GetAllocator()); + + // Append 'query_mode' object to the parent document + doc.AddMember(key, queryMode, doc.GetAllocator()); +} + +void addValidationTokensSectionToJsonDoc(Reference ctx, rapidjson::Document& doc) { + // Append "validationTokens" as json array + rapidjson::Value validationTokens(rapidjson::kArrayType); + + for (const auto& token : ctx->validationTokens) { + rapidjson::Value validationToken(rapidjson::kObjectType); + + // Add "name" - token name + rapidjson::Value key(VALIDATION_TOKEN_NAME_TAG, doc.GetAllocator()); + rapidjson::Value tokenName(token.second.name.c_str(), doc.GetAllocator()); + validationToken.AddMember(key, tokenName, 
doc.GetAllocator());
+
+ // Add "value" - token value
+ key.SetString(VALIDATION_TOKEN_VALUE_TAG, doc.GetAllocator());
+ rapidjson::Value tokenValue;
+ tokenValue.SetString(token.second.value.c_str(), token.second.value.size(), doc.GetAllocator());
+ validationToken.AddMember(key, tokenValue, doc.GetAllocator());
+
+ validationTokens.PushBack(validationToken, doc.GetAllocator());
+ }
+
+ // Append 'validation_tokens[]' to the parent document
+ rapidjson::Value memberKey(VALIDATION_TOKENS_TAG, doc.GetAllocator());
+ doc.AddMember(memberKey, validationTokens, doc.GetAllocator());
+}
+
+void addRefreshKmsUrlsSectionToJsonDoc(Reference ctx,
+ rapidjson::Document& doc,
+ const bool refreshKmsUrls) {
+ rapidjson::Value key(REFRESH_KMS_URLS_TAG, doc.GetAllocator());
+ rapidjson::Value refreshUrls;
+ refreshUrls.SetBool(refreshKmsUrls);
+
+ // Append 'refresh_kms_urls' object to the parent document
+ doc.AddMember(key, refreshUrls, doc.GetAllocator());
+}
+
+StringRef getEncryptKeysByKeyIdsRequestBody(Reference ctx,
+ const KmsConnLookupEKsByKeyIdsReq& req,
+ const bool refreshKmsUrls,
+ Arena& arena) {
+ // Acceptable request payload json format:
+ //
+ // request_json_payload {
+ // "query_mode": "lookupByKeyId" / "lookupByDomainId"
+ // "cipher_key_details" = [
+ // {
+ // "base_cipher_id" :
+ // "encrypt_domain_id" :
+ // },
+ // {
+ // ....
+ // }
+ // ],
+ // "validation_tokens" = [
+ // {
+ // "token_name" : ,
+ // "token_value":
+ // },
+ // {
+ // ....
+ // }
+ // ]
+ // "refresh_kms_urls" = 1/0
+ // }
+
+ rapidjson::Document doc;
+ doc.SetObject();
+
+ // Append 'query_mode' object
+ addQueryModeSection(ctx, doc, QUERY_MODE_LOOKUP_BY_KEY_ID);
+
+ // Append 'cipher_key_details' as json array
+ rapidjson::Value keyIdDetails(rapidjson::kArrayType);
+ for (const auto& detail : req.encryptKeyIds) {
+ rapidjson::Value keyIdDetail(rapidjson::kObjectType);
+
+ // Add 'base_cipher_id'
+ rapidjson::Value key(BASE_CIPHER_ID_TAG, doc.GetAllocator());
+ rapidjson::Value baseKeyId;
+ baseKeyId.SetUint64(detail.first);
+ keyIdDetail.AddMember(key, baseKeyId, doc.GetAllocator());
+
+ // Add 'encrypt_domain_id'
+ key.SetString(ENCRYPT_DOMAIN_ID_TAG, doc.GetAllocator());
+ rapidjson::Value domainId;
+ domainId.SetInt64(detail.second);
+ keyIdDetail.AddMember(key, domainId, doc.GetAllocator());
+
+ // push above object to the array
+ keyIdDetails.PushBack(keyIdDetail, doc.GetAllocator());
+ }
+ rapidjson::Value memberKey(CIPHER_KEY_DETAILS_TAG, doc.GetAllocator());
+ doc.AddMember(memberKey, keyIdDetails, doc.GetAllocator());
+
+ // Append 'validation_tokens' as json array
+ addValidationTokensSectionToJsonDoc(ctx, doc);
+
+ // Append 'refresh_kms_urls'
+ addRefreshKmsUrlsSectionToJsonDoc(ctx, doc, refreshKmsUrls);
+
+ // Serialize json to string
+ rapidjson::StringBuffer sb;
+ rapidjson::Writer writer(sb);
+ doc.Accept(writer);
+
+ StringRef ref = makeString(sb.GetSize(), arena);
+ memcpy(mutateString(ref), sb.GetString(), sb.GetSize());
+ return ref;
+}
+
+ACTOR
+Future fetchEncryptionKeys_impl(Reference ctx,
+ StringRef requestBodyRef,
+ Arena* arena,
+ std::vector* outCipherKeyDetails) {
+ state Reference resp;
+
+ // Follow 2-phase scheme:
+ // Phase-1: Attempt to fetch encryption keys by reaching out to the cached KmsUrls in order of past
+ // request success counts.
+ // Phase-2: If none of the cached KmsUrls work, re-discover the KmsUrls and
+ // repeat phase-1.
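+ //
+ // A minimal worked example of the scheme (hypothetical cache state; URLs and counts are illustrative only):
+ // cached heap ordered by past request success counts: { kmsA(nRequests=5), kmsB(2), kmsC(0) }
+ // pass-1: POST kmsA -> timeout; POST kmsB -> unparsable payload; POST kmsC -> timeout
+ // (each failure bumps that url's nFailedResponses or nResponseParseFailures counter)
+ // re-discover: discoverKmsUrls(ctx, true) re-reads the discovery source and rebuilds the heap
+ // pass-2: retry against the refreshed URL set; if that also fails, throw encrypt_keys_fetch_failed()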
+
+ state int pass = 1;
+ for (; pass <= 2; pass++) {
+ state std::stack> tempStack;
+
+ // Iterate over Kms URLs
+ while (!ctx->kmsUrlHeap.empty()) {
+ state std::shared_ptr curUrl = ctx->kmsUrlHeap.top();
+ ctx->kmsUrlHeap.pop();
+ tempStack.push(curUrl);
+
+ try {
+ std::string kmsEncryptionFullUrl = getEncryptionKeysFullUrl(ctx, curUrl->url);
+ TraceEvent("FetchEncryptionKeys_Start", ctx->uid).detail("KmsEncryptionFullUrl", kmsEncryptionFullUrl);
+ Reference _resp =
+ wait(ctx->restClient.doPost(kmsEncryptionFullUrl, requestBodyRef.toString()));
+ resp = _resp;
+ curUrl->nRequests++;
+
+ try {
+ parseKmsResponse(ctx, resp, arena, outCipherKeyDetails);
+
+ // Push urlCtx back on the ctx->urlHeap
+ while (!tempStack.empty()) {
+ ctx->kmsUrlHeap.emplace(tempStack.top());
+ tempStack.pop();
+ }
+
+ TraceEvent("FetchEncryptionKeys_Success", ctx->uid).detail("KmsUrl", curUrl->url);
+ return Void();
+ } catch (Error& e) {
+ TraceEvent("FetchEncryptionKeys_RespParseFailure", ctx->uid).error(e);
+ curUrl->nResponseParseFailures++;
+ // attempt to fetch encryption details from next KmsUrl
+ }
+ } catch (Error& e) {
+ TraceEvent("FetchEncryptionKeys_Failed", ctx->uid).error(e);
+ curUrl->nFailedResponses++;
+ // attempt to fetch encryption details from next KmsUrl
+ }
+ }
+
+ if (pass == 1) {
+ // Re-discover KMS URLs and re-attempt to fetch the encryption key details using the newer KMS URLs
+ wait(discoverKmsUrls(ctx, true));
+ }
+ }
+
+ // Failed to fetch encryption keys from the remote KMS
+ throw encrypt_keys_fetch_failed();
+}
+
+ACTOR Future fetchEncryptionKeysByKeyIds(Reference ctx,
+ KmsConnLookupEKsByKeyIdsReq req) {
+ state KmsConnLookupEKsByKeyIdsRep reply;
+ bool refreshKmsUrls = shouldRefreshKmsUrls(ctx);
+
+ StringRef requestBodyRef = getEncryptKeysByKeyIdsRequestBody(ctx, req, refreshKmsUrls, reply.arena);
+
+ wait(fetchEncryptionKeys_impl(ctx, requestBodyRef, &reply.arena, &reply.cipherKeyDetails));
+
+ return reply;
+}
+
+StringRef getEncryptKeysByDomainIdsRequestBody(Reference ctx,
+ const KmsConnLookupEKsByDomainIdsReq& req,
+ const bool refreshKmsUrls,
+ Arena& arena) {
+ // Acceptable request payload json format:
+ //
+ // request_json_payload {
+ // "query_mode": "lookupByKeyId" / "lookupByDomainId"
+ // "cipher_key_details" = [
+ // {
+ // "encrypt_domain_id" :
+ // },
+ // {
+ // ....
+ // } + // ] + // "refresh_kms_urls" = 1/0 + // } + + rapidjson::Document doc; + doc.SetObject(); + + // Append 'query_mode' object + addQueryModeSection(ctx, doc, QUERY_MODE_LOOKUP_BY_DOMAIN_ID); + + // Append 'cipher_key_details' as json array + rapidjson::Value keyIdDetails(rapidjson::kArrayType); + for (const auto& detail : req.encryptDomainIds) { + rapidjson::Value keyIdDetail(rapidjson::kObjectType); + + rapidjson::Value key(ENCRYPT_DOMAIN_ID_TAG, doc.GetAllocator()); + rapidjson::Value domainId; + domainId.SetInt64(detail); + keyIdDetail.AddMember(key, domainId, doc.GetAllocator()); + + // push above object to the array + keyIdDetails.PushBack(keyIdDetail, doc.GetAllocator()); + } + rapidjson::Value memberKey(CIPHER_KEY_DETAILS_TAG, doc.GetAllocator()); + doc.AddMember(memberKey, keyIdDetails, doc.GetAllocator()); + + // Append 'validation_tokens' as json array + addValidationTokensSectionToJsonDoc(ctx, doc); + + // Append 'refresh_kms_urls' + addRefreshKmsUrlsSectionToJsonDoc(ctx, doc, refreshKmsUrls); + + // Serialize json to string + rapidjson::StringBuffer sb; + rapidjson::Writer writer(sb); + doc.Accept(writer); + + StringRef ref = makeString(sb.GetSize(), arena); + memcpy(mutateString(ref), sb.GetString(), sb.GetSize()); + return ref; +} + +ACTOR Future fetchEncryptionKeysByDomainIds(Reference ctx, + KmsConnLookupEKsByDomainIdsReq req) { + state KmsConnLookupEKsByDomainIdsRep reply; + bool refreshKmsUrls = shouldRefreshKmsUrls(ctx); + StringRef requestBodyRef = getEncryptKeysByDomainIdsRequestBody(ctx, req, refreshKmsUrls, reply.arena); + + wait(fetchEncryptionKeys_impl(ctx, requestBodyRef, &reply.arena, &reply.cipherKeyDetails)); + + return reply; +} + +ACTOR Future procureValidationTokensFromFiles(Reference ctx, std::string details) { + Standalone detailsRef(details); + if (details.empty()) { + TraceEvent("ValidationToken_EmptyFileDetails", ctx->uid).log(); + throw encrypt_invalid_kms_config(); + } + + TraceEvent("ValidationToken", ctx->uid).detail("DetailsStr", details); + + state std::unordered_map tokenFilePathMap; + while (!details.empty()) { + StringRef name = detailsRef.eat(TOKEN_NAME_FILE_SEP); + if (name.empty()) { + break; + } + StringRef path = detailsRef.eat(TOKEN_TUPLE_SEP); + if (path.empty()) { + TraceEvent("ValidationToken_FileDetailsMalformed", ctx->uid).detail("FileDetails", details); + throw operation_failed(); + } + + std::string tokenName = boost::trim_copy(name.toString()); + std::string tokenFile = boost::trim_copy(path.toString()); + if (!fileExists(tokenFile)) { + TraceEvent("ValidationToken_FileNotFound", ctx->uid) + .detail("TokenName", tokenName) + .detail("Filename", tokenFile); + throw encrypt_invalid_kms_config(); + } + + tokenFilePathMap.emplace(tokenName, tokenFile); + TraceEvent("ValidationToken", ctx->uid).detail("FName", tokenName).detail("Filename", tokenFile); + } + + // Clear existing cached validation tokens + ctx->validationTokens.clear(); + + // Enumerate all token files and extract details + state uint64_t tokensPayloadSize = 0; + for (const auto& item : tokenFilePathMap) { + state std::string tokenName = item.first; + state std::string tokenFile = item.second; + state Reference tFile = wait(IAsyncFileSystem::filesystem()->open( + tokenFile, IAsyncFile::OPEN_NO_AIO | IAsyncFile::OPEN_READONLY | IAsyncFile::OPEN_UNCACHED, 0644)); + + state int64_t fSize = wait(tFile->size()); + if (fSize > SERVER_KNOBS->REST_KMS_CONNECTOR_VALIDATION_TOKEN_MAX_SIZE) { + TraceEvent("ValidationToken_FileTooLarge", ctx->uid) + .detail("FileName", 
tokenFile) + .detail("Size", fSize) + .detail("MaxAllowedSize", SERVER_KNOBS->REST_KMS_CONNECTOR_VALIDATION_TOKEN_MAX_SIZE); + throw file_too_large(); + } + + tokensPayloadSize += fSize; + if (tokensPayloadSize > SERVER_KNOBS->REST_KMS_CONNECTOR_VALIDATION_TOKENS_MAX_PAYLOAD_SIZE) { + TraceEvent("ValidationToken_PayloadTooLarge", ctx->uid) + .detail("MaxAllowedSize", SERVER_KNOBS->REST_KMS_CONNECTOR_VALIDATION_TOKENS_MAX_PAYLOAD_SIZE); + throw value_too_large(); + } + + state Standalone buff = makeString(fSize); + int bytesRead = wait(tFile->read(mutateString(buff), fSize, 0)); + if (bytesRead != fSize) { + TraceEvent("DiscoveryKmsUrl_FileReadShort", ctx->uid) + .detail("Filename", tokenFile) + .detail("Expected", fSize) + .detail("Actual", bytesRead); + throw io_error(); + } + + // Populate validation token details + ValidationTokenCtx tokenCtx = + ValidationTokenCtx(tokenName, ValidationTokenSource::VALIDATION_TOKEN_SOURCE_FILE); + tokenCtx.value.resize(fSize); + memcpy(tokenCtx.value.data(), buff.begin(), fSize); + tokenCtx.filePath = tokenFile; + + // NOTE: avoid logging token-value to prevent token leaks in log files.. + TraceEvent("ValidationToken_ReadFile", ctx->uid) + .detail("TokenName", tokenCtx.name) + .detail("TokenSize", tokenCtx.value.size()) + .detail("TokenFilePath", tokenCtx.filePath.get()) + .detail("TotalPayloadSize", tokensPayloadSize); + + ctx->validationTokens.emplace(tokenName, std::move(tokenCtx)); + } + + return Void(); +} + +ACTOR Future procureValidationTokens(Reference ctx) { + std::string_view mode{ SERVER_KNOBS->REST_KMS_CONNECTOR_VALIDATION_TOKEN_MODE }; + + if (mode.compare("file") == 0) { + wait(procureValidationTokensFromFiles(ctx, SERVER_KNOBS->REST_KMS_CONNECTOR_VALIDATION_TOKEN_DETAILS)); + } else { + throw not_implemented(); + } + + return Void(); +} + +ACTOR Future connectorCore_impl(KmsConnectorInterface interf) { + state Reference self = makeReference(interf.id()); + + TraceEvent("RESTKmsConnector_Init", self->uid).log(); + + wait(discoverKmsUrls(self, false /* refreshPersistedUrls */)); + wait(procureValidationTokens(self)); + + loop { + choose { + when(KmsConnLookupEKsByKeyIdsReq req = waitNext(interf.ekLookupByIds.getFuture())) { + state KmsConnLookupEKsByKeyIdsReq byKeyIdReq = req; + state KmsConnLookupEKsByKeyIdsRep byKeyIdResp; + try { + KmsConnLookupEKsByKeyIdsRep _rByKeyId = wait(fetchEncryptionKeysByKeyIds(self, byKeyIdReq)); + byKeyIdResp = _rByKeyId; + byKeyIdReq.reply.send(byKeyIdResp); + } catch (Error& e) { + TraceEvent("LookupEKsByKeyIds_Failed", self->uid).error(e); + if (!canReplyWith(e)) { + throw e; + } + byKeyIdReq.reply.sendError(e); + } + } + when(KmsConnLookupEKsByDomainIdsReq req = waitNext(interf.ekLookupByDomainIds.getFuture())) { + state KmsConnLookupEKsByDomainIdsReq byDomainIdReq = req; + state KmsConnLookupEKsByDomainIdsRep byDomainIdResp; + try { + KmsConnLookupEKsByDomainIdsRep _rByDomainId = + wait(fetchEncryptionKeysByDomainIds(self, byDomainIdReq)); + byDomainIdResp = _rByDomainId; + byDomainIdReq.reply.send(byDomainIdResp); + } catch (Error& e) { + TraceEvent("LookupEKsByDomainIds_Failed", self->uid).error(e); + if (!canReplyWith(e)) { + throw e; + } + byDomainIdReq.reply.sendError(e); + } + } + } + } +} + +Future RESTKmsConnector::connectorCore(KmsConnectorInterface interf) { + return connectorCore_impl(interf); +} + +// Only used to link unit tests +void forceLinkRESTKmsConnectorTest() {} + +namespace { +std::string_view KMS_URL_NAME_TEST = "http://foo/bar"; +uint8_t BASE_CIPHER_KEY_TEST[32]; + +std::shared_ptr 
prepareTokenFile(const uint8_t* buff, const int len) { + std::shared_ptr tmpFile = std::make_shared("/tmp"); + ASSERT(fileExists(tmpFile->getFileName())); + tmpFile->write(buff, len); + return tmpFile; +} + +std::shared_ptr prepareTokenFile(const int tokenLen) { + Standalone buff = makeString(tokenLen); + generateRandomData(mutateString(buff), tokenLen); + + return prepareTokenFile(buff.begin(), tokenLen); +} + +ACTOR Future testEmptyValidationFileDetails(Reference ctx) { + try { + wait(procureValidationTokensFromFiles(ctx, "")); + ASSERT(false); + } catch (Error& e) { + ASSERT_EQ(e.code(), error_code_encrypt_invalid_kms_config); + } + return Void(); +} + +ACTOR Future testMalformedFileValidationTokenDetails(Reference ctx) { + try { + wait(procureValidationTokensFromFiles(ctx, "abdc/tmp/foo")); + ASSERT(false); + } catch (Error& e) { + ASSERT_EQ(e.code(), error_code_operation_failed); + } + + return Void(); +} + +ACTOR Future testValidationTokenFileNotFound(Reference ctx) { + try { + wait(procureValidationTokensFromFiles(ctx, "foo#/imaginary-dir/dream/phantom-file")); + ASSERT(false); + } catch (Error& e) { + ASSERT_EQ(e.code(), error_code_encrypt_invalid_kms_config); + } + return Void(); +} + +ACTOR Future testTooLargeValidationTokenFile(Reference ctx) { + std::string name("foo"); + const int tokenLen = SERVER_KNOBS->REST_KMS_CONNECTOR_VALIDATION_TOKEN_MAX_SIZE + 1; + + state std::shared_ptr tmpFile = prepareTokenFile(tokenLen); + + std::string details; + details.append(name).append(TOKEN_NAME_FILE_SEP).append(tmpFile->getFileName()); + + try { + wait(procureValidationTokensFromFiles(ctx, details)); + ASSERT(false); + } catch (Error& e) { + ASSERT_EQ(e.code(), error_code_file_too_large); + } + + return Void(); +} + +ACTOR Future testValidationFileTokenPayloadTooLarge(Reference ctx) { + const int tokenLen = SERVER_KNOBS->REST_KMS_CONNECTOR_VALIDATION_TOKEN_MAX_SIZE; + const int nTokens = SERVER_KNOBS->REST_KMS_CONNECTOR_VALIDATION_TOKENS_MAX_PAYLOAD_SIZE / + SERVER_KNOBS->REST_KMS_CONNECTOR_VALIDATION_TOKEN_MAX_SIZE + + 2; + Standalone buff = makeString(tokenLen); + generateRandomData(mutateString(buff), tokenLen); + + std::string details; + state std::vector> tokenfiles; + for (int i = 0; i < nTokens; i++) { + std::shared_ptr tokenfile = prepareTokenFile(buff.begin(), tokenLen); + + details.append(std::to_string(i)).append(TOKEN_NAME_FILE_SEP).append(tokenfile->getFileName()); + if (i < nTokens) + details.append(TOKEN_TUPLE_SEP); + tokenfiles.emplace_back(tokenfile); + } + + try { + wait(procureValidationTokensFromFiles(ctx, details)); + ASSERT(false); + } catch (Error& e) { + ASSERT_EQ(e.code(), error_code_value_too_large); + } + + return Void(); +} + +ACTOR Future testMultiValidationFileTokenFiles(Reference ctx) { + state int numFiles = deterministicRandom()->randomInt(2, 5); + state int tokenLen = deterministicRandom()->randomInt(26, 75); + state Standalone buff = makeString(tokenLen); + state std::unordered_map> tokenFiles; + state std::unordered_map tokenNameValueMap; + state std::string tokenDetailsStr; + + generateRandomData(mutateString(buff), tokenLen); + + for (int i = 1; i <= numFiles; i++) { + std::string tokenName = std::to_string(i); + std::shared_ptr tokenfile = prepareTokenFile(buff.begin(), tokenLen); + + std::string token((char*)buff.begin(), tokenLen); + tokenFiles.emplace(tokenName, tokenfile); + tokenNameValueMap.emplace(std::to_string(i), token); + tokenDetailsStr.append(tokenName).append(TOKEN_NAME_FILE_SEP).append(tokenfile->getFileName()); + if (i < numFiles) + 
tokenDetailsStr.append(TOKEN_TUPLE_SEP); + } + + wait(procureValidationTokensFromFiles(ctx, tokenDetailsStr)); + + ASSERT_EQ(ctx->validationTokens.size(), tokenNameValueMap.size()); + + for (const auto& token : ctx->validationTokens) { + const auto& itr = tokenNameValueMap.find(token.first); + const ValidationTokenCtx& tokenCtx = token.second; + + ASSERT(itr != tokenNameValueMap.end()); + ASSERT_EQ(token.first.compare(itr->first), 0); + ASSERT_EQ(tokenCtx.value.compare(itr->second), 0); + ASSERT_EQ(tokenCtx.source, ValidationTokenSource::VALIDATION_TOKEN_SOURCE_FILE); + ASSERT(tokenCtx.filePath.present()); + ASSERT_EQ(tokenCtx.filePath.compare(tokenFiles[tokenCtx.name]->getFileName()), 0); + ASSERT_NE(tokenCtx.getReadTS(), 0); + } + + return Void(); +} + +EncryptCipherDomainId getRandomDomainId() { + const int lottery = deterministicRandom()->randomInt(0, 100); + if (lottery < 10) { + return SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID; + } else if (lottery >= 10 && lottery < 25) { + return ENCRYPT_HEADER_DOMAIN_ID; + } else { + return lottery; + } +} + +void getFakeKmsResponse(StringRef jsonReqRef, const bool baseCipherIdPresent, Reference httpResponse) { + rapidjson::Document reqDoc; + reqDoc.Parse(jsonReqRef.toString().c_str()); + + rapidjson::Document resDoc; + resDoc.SetObject(); + + ASSERT(reqDoc.HasMember(CIPHER_KEY_DETAILS_TAG) && reqDoc[CIPHER_KEY_DETAILS_TAG].IsArray()); + + rapidjson::Value cipherKeyDetails(rapidjson::kArrayType); + for (const auto& detail : reqDoc[CIPHER_KEY_DETAILS_TAG].GetArray()) { + rapidjson::Value keyDetail(rapidjson::kObjectType); + + ASSERT(detail.HasMember(ENCRYPT_DOMAIN_ID_TAG)); + + rapidjson::Value key(ENCRYPT_DOMAIN_ID_TAG, resDoc.GetAllocator()); + rapidjson::Value domainId; + domainId.SetInt64(detail[ENCRYPT_DOMAIN_ID_TAG].GetInt64()); + keyDetail.AddMember(key, domainId, resDoc.GetAllocator()); + + key.SetString(BASE_CIPHER_ID_TAG, resDoc.GetAllocator()); + rapidjson::Value baseCipherId; + if (detail.HasMember(BASE_CIPHER_ID_TAG)) { + domainId.SetUint64(detail[BASE_CIPHER_ID_TAG].GetUint64()); + } else { + ASSERT(!baseCipherIdPresent); + domainId.SetUint(1234); + } + keyDetail.AddMember(key, domainId, resDoc.GetAllocator()); + + key.SetString(BASE_CIPHER_TAG, resDoc.GetAllocator()); + rapidjson::Value baseCipher; + baseCipher.SetString((char*)&BASE_CIPHER_KEY_TEST[0], sizeof(BASE_CIPHER_KEY_TEST), resDoc.GetAllocator()); + keyDetail.AddMember(key, baseCipher, resDoc.GetAllocator()); + + cipherKeyDetails.PushBack(keyDetail, resDoc.GetAllocator()); + } + rapidjson::Value memberKey(CIPHER_KEY_DETAILS_TAG, resDoc.GetAllocator()); + resDoc.AddMember(memberKey, cipherKeyDetails, resDoc.GetAllocator()); + + ASSERT(reqDoc.HasMember(REFRESH_KMS_URLS_TAG)); + if (reqDoc[REFRESH_KMS_URLS_TAG].GetBool()) { + rapidjson::Value kmsUrls(rapidjson::kArrayType); + for (int i = 0; i < 3; i++) { + rapidjson::Value url; + url.SetString(KMS_URL_NAME_TEST.data(), resDoc.GetAllocator()); + kmsUrls.PushBack(url, resDoc.GetAllocator()); + } + memberKey.SetString(KMS_URLS_TAG, resDoc.GetAllocator()); + resDoc.AddMember(memberKey, kmsUrls, resDoc.GetAllocator()); + } + + // Serialize json to string + rapidjson::StringBuffer sb; + rapidjson::Writer writer(sb); + resDoc.Accept(writer); + httpResponse->content.resize(sb.GetSize(), '\0'); + memcpy(httpResponse->content.data(), sb.GetString(), sb.GetSize()); +} + +void validateKmsUrls(Reference ctx) { + ASSERT_EQ(ctx->kmsUrlHeap.size(), 3); + std::shared_ptr urlCtx = ctx->kmsUrlHeap.top(); + 
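+ // getFakeKmsResponse() advertises KMS_URL_NAME_TEST three times when a refresh is requested, so each
+ // re-discovered heap entry is expected to carry that URL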
ASSERT_EQ(urlCtx->url.compare(KMS_URL_NAME_TEST), 0); +} + +void testGetEncryptKeysByKeyIdsRequestBody(Reference ctx, Arena arena) { + KmsConnLookupEKsByKeyIdsReq req; + std::unordered_map keyMap; + const int nKeys = deterministicRandom()->randomInt(7, 8); + for (int i = 1; i < nKeys; i++) { + EncryptCipherDomainId domainId = getRandomDomainId(); + req.encryptKeyIds.push_back(std::make_pair(i, domainId)); + keyMap[i] = domainId; + } + + bool refreshKmsUrls = deterministicRandom()->randomInt(0, 100) < 50; + + StringRef requestBodyRef = getEncryptKeysByKeyIdsRequestBody(ctx, req, refreshKmsUrls, arena); + TraceEvent("FetchKeysByKeyIds", ctx->uid).setMaxFieldLength(10000).detail("JsonReqStr", requestBodyRef.toString()); + Reference httpResp = makeReference(); + httpResp->code = HTTP::HTTP_STATUS_CODE_OK; + getFakeKmsResponse(requestBodyRef, true, httpResp); + TraceEvent("FetchKeysByKeyIds", ctx->uid).setMaxFieldLength(10000).detail("HttpRespStr", httpResp->content); + + std::vector cipherDetails; + parseKmsResponse(ctx, httpResp, &arena, &cipherDetails); + ASSERT_EQ(cipherDetails.size(), keyMap.size()); + for (const auto& detail : cipherDetails) { + ASSERT(keyMap.find(detail.encryptKeyId) != keyMap.end()); + ASSERT_EQ(keyMap[detail.encryptKeyId], detail.encryptDomainId); + ASSERT_EQ(detail.encryptKey.size(), sizeof(BASE_CIPHER_KEY_TEST)); + ASSERT_EQ(memcmp(detail.encryptKey.begin(), &BASE_CIPHER_KEY_TEST[0], sizeof(BASE_CIPHER_KEY_TEST)), 0); + } + if (refreshKmsUrls) { + validateKmsUrls(ctx); + } +} + +void testGetEncryptKeysByDomainIdsRequestBody(Reference ctx, Arena arena) { + KmsConnLookupEKsByDomainIdsReq req; + std::unordered_set domainIdsSet; + const int nKeys = deterministicRandom()->randomInt(7, 25); + for (int i = 1; i < nKeys; i++) { + domainIdsSet.emplace(getRandomDomainId()); + } + req.encryptDomainIds.insert(req.encryptDomainIds.begin(), domainIdsSet.begin(), domainIdsSet.end()); + + bool refreshKmsUrls = deterministicRandom()->randomInt(0, 100) < 50; + + StringRef jsonReqRef = getEncryptKeysByDomainIdsRequestBody(ctx, req, refreshKmsUrls, arena); + TraceEvent("FetchKeysByDomainIds", ctx->uid).detail("JsonReqStr", jsonReqRef.toString()); + Reference httpResp = makeReference(); + httpResp->code = HTTP::HTTP_STATUS_CODE_OK; + getFakeKmsResponse(jsonReqRef, false, httpResp); + TraceEvent("FetchKeysByDomainIds", ctx->uid).detail("HttpRespStr", httpResp->content); + + std::vector cipherDetails; + parseKmsResponse(ctx, httpResp, &arena, &cipherDetails); + ASSERT_EQ(domainIdsSet.size(), cipherDetails.size()); + for (const auto& detail : cipherDetails) { + ASSERT(domainIdsSet.find(detail.encryptDomainId) != domainIdsSet.end()); + ASSERT_EQ(detail.encryptKey.size(), sizeof(BASE_CIPHER_KEY_TEST)); + ASSERT_EQ(memcmp(detail.encryptKey.begin(), &BASE_CIPHER_KEY_TEST[0], sizeof(BASE_CIPHER_KEY_TEST)), 0); + } + if (refreshKmsUrls) { + validateKmsUrls(ctx); + } +} + +void testMissingCipherDetailsTag(Reference ctx) { + Arena arena; + std::vector cipherDetails; + + rapidjson::Document doc; + doc.SetObject(); + + rapidjson::Value key(KMS_URLS_TAG, doc.GetAllocator()); + rapidjson::Value refreshUrl; + refreshUrl.SetBool(true); + doc.AddMember(key, refreshUrl, doc.GetAllocator()); + + Reference httpResp = makeReference(); + httpResp->code = HTTP::HTTP_STATUS_CODE_OK; + rapidjson::StringBuffer sb; + rapidjson::Writer writer(sb); + doc.Accept(writer); + httpResp->content.resize(sb.GetSize(), '\0'); + memcpy(httpResp->content.data(), sb.GetString(), sb.GetSize()); + + try { + parseKmsResponse(ctx, 
httpResp, &arena, &cipherDetails); + } catch (Error& e) { + ASSERT_EQ(e.code(), error_code_operation_failed); + } +} + +void testMalformedCipherDetails(Reference ctx) { + Arena arena; + std::vector cipherDetails; + + rapidjson::Document doc; + doc.SetObject(); + + rapidjson::Value key(CIPHER_KEY_DETAILS_TAG, doc.GetAllocator()); + rapidjson::Value details; + details.SetBool(true); + doc.AddMember(key, details, doc.GetAllocator()); + + Reference httpResp = makeReference(); + httpResp->code = HTTP::HTTP_STATUS_CODE_OK; + rapidjson::StringBuffer sb; + rapidjson::Writer writer(sb); + doc.Accept(writer); + httpResp->content.resize(sb.GetSize(), '\0'); + memcpy(httpResp->content.data(), sb.GetString(), sb.GetSize()); + + try { + parseKmsResponse(ctx, httpResp, &arena, &cipherDetails); + } catch (Error& e) { + ASSERT_EQ(e.code(), error_code_operation_failed); + } +} + +void testMalfromedCipherDetailObj(Reference ctx) { + Arena arena; + std::vector cipherDetails; + + rapidjson::Document doc; + doc.SetObject(); + + rapidjson::Value cDetails(rapidjson::kArrayType); + rapidjson::Value detail(rapidjson::kObjectType); + rapidjson::Value key(BASE_CIPHER_ID_TAG, doc.GetAllocator()); + rapidjson::Value id; + id.SetUint(12345); + detail.AddMember(key, id, doc.GetAllocator()); + cDetails.PushBack(detail, doc.GetAllocator()); + key.SetString(CIPHER_KEY_DETAILS_TAG, doc.GetAllocator()); + doc.AddMember(key, cDetails, doc.GetAllocator()); + + Reference httpResp = makeReference(); + httpResp->code = HTTP::HTTP_STATUS_CODE_OK; + rapidjson::StringBuffer sb; + rapidjson::Writer writer(sb); + doc.Accept(writer); + httpResp->content.resize(sb.GetSize(), '\0'); + memcpy(httpResp->content.data(), sb.GetString(), sb.GetSize()); + + try { + parseKmsResponse(ctx, httpResp, &arena, &cipherDetails); + } catch (Error& e) { + ASSERT_EQ(e.code(), error_code_operation_failed); + } +} + +void testKMSErrorResponse(Reference ctx) { + Arena arena; + std::vector cipherDetails; + + rapidjson::Document doc; + doc.SetObject(); + + // Construct fake response, it should get ignored anyways + rapidjson::Value cDetails(rapidjson::kArrayType); + rapidjson::Value detail(rapidjson::kObjectType); + rapidjson::Value key(BASE_CIPHER_ID_TAG, doc.GetAllocator()); + rapidjson::Value id; + id.SetUint(12345); + detail.AddMember(key, id, doc.GetAllocator()); + cDetails.PushBack(detail, doc.GetAllocator()); + key.SetString(CIPHER_KEY_DETAILS_TAG, doc.GetAllocator()); + doc.AddMember(key, cDetails, doc.GetAllocator()); + + // Add error tag + rapidjson::Value errorTag(rapidjson::kObjectType); + + // Add 'error_detail' + rapidjson::Value eKey(ERROR_DETAIL_TAG, doc.GetAllocator()); + rapidjson::Value detailInfo; + detailInfo.SetString("Foo is always bad", doc.GetAllocator()); + errorTag.AddMember(eKey, detailInfo, doc.GetAllocator()); + + key.SetString(ERROR_TAG, doc.GetAllocator()); + doc.AddMember(key, errorTag, doc.GetAllocator()); + + Reference httpResp = makeReference(); + httpResp->code = HTTP::HTTP_STATUS_CODE_OK; + rapidjson::StringBuffer sb; + rapidjson::Writer writer(sb); + doc.Accept(writer); + httpResp->content.resize(sb.GetSize(), '\0'); + memcpy(httpResp->content.data(), sb.GetString(), sb.GetSize()); + + try { + parseKmsResponse(ctx, httpResp, &arena, &cipherDetails); + } catch (Error& e) { + ASSERT_EQ(e.code(), error_code_encrypt_keys_fetch_failed); + } +} + +ACTOR Future testParseDiscoverKmsUrlFileNotFound(Reference ctx) { + try { + wait(parseDiscoverKmsUrlFile(ctx, "/imaginary-dir/dream/phantom-file")); + } catch (Error& e) { + 
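+ // a missing discovery file is reported as an invalid KMS configuration (encrypt_invalid_kms_config),
+ // mirroring the validation-token file-not-found behavior exercised above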
ASSERT_EQ(e.code(), error_code_encrypt_invalid_kms_config); + } + return Void(); +} + +ACTOR Future testParseDiscoverKmsUrlFile(Reference ctx) { + state std::shared_ptr tmpFile = std::make_shared("/tmp"); + ASSERT(fileExists(tmpFile->getFileName())); + + state std::unordered_set urls; + urls.emplace("https://127.0.0.1/foo"); + urls.emplace("https://127.0.0.1/foo1"); + urls.emplace("https://127.0.0.1/foo2"); + + std::string content; + for (auto& url : urls) { + content.append(url); + content.push_back(DISCOVER_URL_FILE_URL_SEP); + } + tmpFile->write((const uint8_t*)content.c_str(), content.size()); + wait(parseDiscoverKmsUrlFile(ctx, tmpFile->getFileName())); + + ASSERT_EQ(ctx->kmsUrlHeap.size(), urls.size()); + while (!ctx->kmsUrlHeap.empty()) { + std::shared_ptr urlCtx = ctx->kmsUrlHeap.top(); + ctx->kmsUrlHeap.pop(); + + ASSERT(urls.find(urlCtx->url) != urls.end()); + ASSERT_EQ(urlCtx->nFailedResponses, 0); + ASSERT_EQ(urlCtx->nRequests, 0); + ASSERT_EQ(urlCtx->nResponseParseFailures, 0); + } + + return Void(); +} + +} // namespace + +TEST_CASE("/KmsConnector/REST/ParseKmsDiscoveryUrls") { + state Reference ctx = makeReference(); + state Arena arena; + + // initialize cipher key used for testing + generateRandomData(&BASE_CIPHER_KEY_TEST[0], 32); + + wait(testParseDiscoverKmsUrlFileNotFound(ctx)); + wait(testParseDiscoverKmsUrlFile(ctx)); + + return Void(); +} + +TEST_CASE("/KmsConnector/REST/ParseValidationTokenFile") { + state Reference ctx = makeReference(); + state Arena arena; + + // initialize cipher key used for testing + generateRandomData(&BASE_CIPHER_KEY_TEST[0], 32); + + wait(testEmptyValidationFileDetails(ctx)); + wait(testMalformedFileValidationTokenDetails(ctx)); + wait(testValidationTokenFileNotFound(ctx)); + wait(testTooLargeValidationTokenFile(ctx)); + wait(testValidationFileTokenPayloadTooLarge(ctx)); + wait(testMultiValidationFileTokenFiles(ctx)); + + return Void(); +} + +TEST_CASE("/KmsConnector/REST/ParseKmsResponse") { + state Reference ctx = makeReference(); + state Arena arena; + + // initialize cipher key used for testing + generateRandomData(&BASE_CIPHER_KEY_TEST[0], 32); + + testMissingCipherDetailsTag(ctx); + testMalformedCipherDetails(ctx); + testMalfromedCipherDetailObj(ctx); + testKMSErrorResponse(ctx); + return Void(); +} + +TEST_CASE("/KmsConnector/REST/GetEncryptionKeyOps") { + state Reference ctx = makeReference(); + state Arena arena; + + // initialize cipher key used for testing + generateRandomData(&BASE_CIPHER_KEY_TEST[0], 32); + + // Prepare KmsConnector context details + wait(testParseDiscoverKmsUrlFile(ctx)); + wait(testMultiValidationFileTokenFiles(ctx)); + + const int numIterations = deterministicRandom()->randomInt(512, 786); + for (int i = 0; i < numIterations; i++) { + testGetEncryptKeysByKeyIdsRequestBody(ctx, arena); + testGetEncryptKeysByDomainIdsRequestBody(ctx, arena); + } + return Void(); +} \ No newline at end of file diff --git a/fdbserver/RESTKmsConnector.actor.h b/fdbserver/RESTKmsConnector.actor.h new file mode 100644 index 0000000000..dbd65e01bd --- /dev/null +++ b/fdbserver/RESTKmsConnector.actor.h @@ -0,0 +1,37 @@ +/* + * RESTKmsConnector.actor.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_RESTKMSCONNECTOR_ACTOR_G_H) +#define FDBSERVER_RESTKMSCONNECTOR_ACTOR_G_H +#include "fdbserver/RESTKmsConnector.actor.g.h" +#elif !defined(FDBSERVER_RESTKMSCONNECTOR_ACTOR_H) +#define FDBSERVER_RESTKMSCONNECTOR_ACTOR_H + +#include "fdbserver/KmsConnector.h" + +class RESTKmsConnector : public KmsConnector { +public: + RESTKmsConnector() = default; + Future connectorCore(KmsConnectorInterface interf); +}; + +#endif diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index 9c9635d20e..c4cf5f1903 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -111,7 +111,7 @@ enum { OPT_TRACECLOCK, OPT_NUMTESTERS, OPT_DEVHELP, OPT_ROLLSIZE, OPT_MAXLOGS, OPT_MAXLOGSSIZE, OPT_KNOB, OPT_UNITTESTPARAM, OPT_TESTSERVERS, OPT_TEST_ON_SERVERS, OPT_METRICSCONNFILE, OPT_METRICSPREFIX, OPT_LOGGROUP, OPT_LOCALITY, OPT_IO_TRUST_SECONDS, OPT_IO_TRUST_WARN_ONLY, OPT_FILESYSTEM, OPT_PROFILER_RSS_SIZE, OPT_KVFILE, OPT_TRACE_FORMAT, OPT_WHITELIST_BINPATH, OPT_BLOB_CREDENTIAL_FILE, OPT_CONFIG_PATH, OPT_USE_TEST_CONFIG_DB, OPT_FAULT_INJECTION, OPT_PROFILER, OPT_PRINT_SIMTIME, - OPT_FLOW_PROCESS_NAME, OPT_FLOW_PROCESS_ENDPOINT, OPT_IP_TRUSTED_MASK + OPT_FLOW_PROCESS_NAME, OPT_FLOW_PROCESS_ENDPOINT, OPT_IP_TRUSTED_MASK, OPT_KMS_CONN_DISCOVERY_URL_FILE, OPT_KMS_CONN_VALIDATION_TOKEN_DETAILS, OPT_KMS_CONN_GET_ENCRYPTION_KEYS_ENDPOINT }; CSimpleOpt::SOption g_rgOptions[] = { @@ -204,6 +204,9 @@ CSimpleOpt::SOption g_rgOptions[] = { { OPT_FLOW_PROCESS_NAME, "--process-name", SO_REQ_SEP }, { OPT_FLOW_PROCESS_ENDPOINT, "--process-endpoint", SO_REQ_SEP }, { OPT_IP_TRUSTED_MASK, "--trusted-subnet-", SO_REQ_SEP }, + { OPT_KMS_CONN_DISCOVERY_URL_FILE, "--discover-kms-conn-url-file", SO_REQ_SEP}, + { OPT_KMS_CONN_VALIDATION_TOKEN_DETAILS, "--kms-conn-validation-token-details", SO_REQ_SEP}, + { OPT_KMS_CONN_GET_ENCRYPTION_KEYS_ENDPOINT, "--kms-conn-get-encryption-keys-endpoint", SO_REQ_SEP}, TLS_OPTION_FLAGS, SO_END_OF_OPTIONS }; @@ -1633,6 +1636,18 @@ private: case TLSConfig::OPT_TLS_VERIFY_PEERS: tlsConfig.addVerifyPeers(args.OptionArg()); break; + case OPT_KMS_CONN_DISCOVERY_URL_FILE: { + knobs.emplace_back("rest_kms_connector_kms_discovery_url_file", args.OptionArg()); + break; + } + case OPT_KMS_CONN_VALIDATION_TOKEN_DETAILS: { + knobs.emplace_back("rest_kms_connector_validation_token_details", args.OptionArg()); + break; + } + case OPT_KMS_CONN_GET_ENCRYPTION_KEYS_ENDPOINT: { + knobs.emplace_back("rest_kms_connector_get_encryption_keys_endpoint", args.OptionArg()); + break; + } } } diff --git a/fdbserver/workloads/UnitTests.actor.cpp b/fdbserver/workloads/UnitTests.actor.cpp index 4ad6feab78..2255d310d8 100644 --- a/fdbserver/workloads/UnitTests.actor.cpp +++ b/fdbserver/workloads/UnitTests.actor.cpp @@ -39,6 +39,7 @@ void forceLinkTokenSignTests(); void forceLinkVersionVectorTests(); void forceLinkRESTClientTests(); void forceLinkRESTUtilsTests(); +void forceLinkRESTKmsConnectorTest(); struct UnitTestWorkload : TestWorkload { bool enabled; @@ -88,6 +89,7 @@ struct UnitTestWorkload : 
TestWorkload {
 forceLinkVersionVectorTests();
 forceLinkRESTClientTests();
 forceLinkRESTUtilsTests();
+ forceLinkRESTKmsConnectorTest();
 }

 std::string description() const override { return "UnitTests"; }
diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp
index 5003487591..2fb0c8e62f 100644
--- a/flow/Platform.actor.cpp
+++ b/flow/Platform.actor.cpp
@@ -43,6 +43,9 @@
 #include
 #include
 #include
+#include
+#include
+#include
 #include
 #include
@@ -2947,54 +2950,54 @@ int64_t fileSize(std::string const& filename) {
 #endif
 }

-std::string readFileBytes(std::string const& filename, int maxSize) {
- std::string s;
- FILE* f = fopen(filename.c_str(), "rb" FOPEN_CLOEXEC_MODE);
- if (!f) {
- TraceEvent(SevWarn, "FileOpenError")
+size_t readFileBytes(std::string const& filename, uint8_t* buff, size_t len) {
+ std::fstream ifs(filename, std::fstream::in | std::fstream::binary);
+ if (!ifs.good()) {
+ TraceEvent("ReadFileBytes_FileOpenError").detail("Filename", filename).GetLastError();
+ throw io_error();
+ }
+
+ size_t bytesRead = len;
+ ifs.seekg(0, std::fstream::beg);
+ ifs.read((char*)buff, len);
+ if (!ifs) {
+ bytesRead = ifs.gcount();
+ TraceEvent("ReadFileBytes_ShortRead")
 .detail("Filename", filename)
- .detail("Errno", errno)
- .detail("ErrorDescription", strerror(errno));
- throw file_not_readable();
+ .detail("Requested", len)
+ .detail("Actual", bytesRead);
 }
- try {
- fseek(f, 0, SEEK_END);
- size_t size = ftell(f);
- if (size > maxSize)
- throw file_too_large();
- s.resize(size);
- fseek(f, 0, SEEK_SET);
- if (!fread(&s[0], size, 1, f))
- throw file_not_readable();
- } catch (...) {
- fclose(f);
- throw;
+
+ return bytesRead;
+}
+
+std::string readFileBytes(std::string const& filename, int maxSize) {
+ if (!fileExists(filename)) {
+ TraceEvent("ReadFileBytes_FileNotFound").detail("Filename", filename);
+ throw file_not_found();
 }
- fclose(f);
- return s;
+
+ size_t size = fileSize(filename);
+ if (size > maxSize) {
+ TraceEvent("ReadFileBytes_FileTooLarge").detail("Filename", filename);
+ throw file_too_large();
+ }
+
+ std::string ret;
+ ret.resize(size);
+ readFileBytes(filename, (uint8_t*)ret.data(), size);
+
+ return ret;
 }

 void writeFileBytes(std::string const& filename, const uint8_t* data, size_t count) {
- FILE* f = fopen(filename.c_str(), "wb" FOPEN_CLOEXEC_MODE);
- if (!f) {
- TraceEvent(SevError, "WriteFileBytes").detail("Filename", filename).GetLastError();
- throw file_not_writable();
+ std::ofstream ofs(filename, std::fstream::out | std::fstream::binary);
+ if (!ofs.good()) {
+ TraceEvent("WriteFileBytes_FileOpenError").detail("Filename", filename).GetLastError();
+ throw io_error();
 }
- try {
- size_t length = fwrite(data, sizeof(uint8_t), count, f);
- if (length != count) {
- TraceEvent(SevError, "WriteFileBytes")
- .detail("Filename", filename)
- .detail("WrittenLength", length)
- .GetLastError();
- throw file_not_writable();
- }
- } catch (...)
{
- fclose(f);
- throw;
- }
- fclose(f);
+ ofs.write((const char*)data, count);
 }

 void writeFile(std::string const& filename, std::string const& content) {
@@ -3258,6 +3261,61 @@ bool isHwCrcSupported() {
 #endif
 }

+TmpFile::TmpFile() : filename("") {
+ createTmpFile(boost::filesystem::temp_directory_path().string(), TmpFile::defaultPrefix);
+}
+
+TmpFile::TmpFile(const std::string& tmpDir) : filename("") {
+ std::string dir = removeWhitespace(tmpDir);
+ createTmpFile(dir, TmpFile::defaultPrefix);
+}
+
+TmpFile::TmpFile(const std::string& tmpDir, const std::string& prefix) : filename("") {
+ std::string dir = removeWhitespace(tmpDir);
+ createTmpFile(dir, prefix);
+}
+
+TmpFile::~TmpFile() {
+ if (!filename.empty()) {
+ destroyFile();
+ }
+}
+
+void TmpFile::createTmpFile(const std::string_view dir, const std::string_view prefix) {
+ std::string modelPattern = "%%%%-%%%%-%%%%-%%%%";
+ boost::format fmter("%s/%s-%s");
+ std::string modelPath = boost::str(boost::format(fmter % dir % prefix % modelPattern));
+ boost::filesystem::path filePath = boost::filesystem::unique_path(modelPath);
+
+ filename = filePath.string();
+
+ // Create empty tmp file
+ std::fstream tmpFile(filename, std::fstream::out);
+ if (!tmpFile.good()) {
+ TraceEvent("TmpFile_CreateFileError").detail("Filename", filename);
+ throw io_error();
+ }
+ TraceEvent("TmpFile_CreateSuccess").detail("Filename", filename);
+}
+
+size_t TmpFile::read(uint8_t* buff, size_t len) {
+ return readFileBytes(filename, buff, len);
+}
+
+void TmpFile::write(const uint8_t* buff, size_t len) {
+ writeFileBytes(filename, buff, len);
+}
+
+bool TmpFile::destroyFile() {
+ bool deleted = deleteFile(filename);
+ if (deleted) {
+ TraceEvent("TmpFileDestroy_Success").detail("Filename", filename);
+ } else {
+ TraceEvent("TmpFileDestroy_Failed").detail("Filename", filename);
+ }
+ return deleted;
+}
+
 } // namespace platform

 extern "C" void criticalError(int exitCode, const char* type, const char* message) {
diff --git a/flow/Platform.h b/flow/Platform.h
index 272c8e3588..51ed8c2704 100644
--- a/flow/Platform.h
+++ b/flow/Platform.h
@@ -314,6 +314,10 @@ void atomicReplace(std::string const& path, std::string const& content, bool tex
 // Read a file into memory
 std::string readFileBytes(std::string const& filename, int maxSize);

+// Read a file into memory supplied by the caller; returns the number of bytes actually read.
+// If 'len' is greater than the file size, only filesize bytes are read.
+size_t readFileBytes(std::string const& filename, uint8_t* buff, size_t len);
+
 // Write data buffer into file
 void writeFileBytes(std::string const& filename, const char* data, size_t count);

@@ -418,6 +422,27 @@ int eraseDirectoryRecursive(std::string const& directory);

 bool isHwCrcSupported();

+// Creates a temporary file; file gets destroyed/deleted along with object destruction.
+// If 'tmpDir' is empty, code defaults to 'boost::filesystem::temp_directory_path()' +// If 'pattern' is empty, code defaults to 'fdbtmp' +struct TmpFile { +public: + TmpFile(); + TmpFile(const std::string& tempDir); + TmpFile(const std::string& tempDir, std::string const& prefix); + ~TmpFile(); + size_t read(uint8_t* buff, size_t len); + void write(const uint8_t* buff, size_t len); + bool destroyFile(); + std::string getFileName() const { return filename; } + +private: + std::string filename; + constexpr static std::string_view defaultPrefix = "fdbtmp"; + + void createTmpFile(const std::string_view dir, const std::string_view prefix); +}; + } // namespace platform #ifdef __linux__ diff --git a/flow/error_definitions.h b/flow/error_definitions.h index 4dc43e11af..a0c34a24b4 100755 --- a/flow/error_definitions.h +++ b/flow/error_definitions.h @@ -311,6 +311,8 @@ ERROR( encrypt_key_ttl_expired, 2703, "Expected encryption key TTL has expired") ERROR( encrypt_header_authtoken_mismatch, 2704, "Encryption header authentication token mismatch") ERROR( encrypt_update_cipher, 2705, "Attempt to update encryption cipher key") ERROR( encrypt_invalid_id, 2706, "Invalid encryption cipher details") +ERROR( encrypt_keys_fetch_failed, 2707, "Encryption keys fetch from external KMS failed") +ERROR( encrypt_invalid_kms_config, 2708, "Invalid encryption/kms configuration: discovery-url, validation-token, endpoint etc.") // 4xxx Internal errors (those that should be generated only by bugs) are decimal 4xxx ERROR( unknown_error, 4000, "An unknown error occurred" ) // C++ exception not of type Error diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 5f4e0619b3..648cef86bc 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -167,6 +167,8 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/RandomUnitTests.toml) add_fdb_test(TEST_FILES fast/ReadHotDetectionCorrectness.toml IGNORE) # TODO re-enable once read hot detection is enabled. add_fdb_test(TEST_FILES fast/ReportConflictingKeys.toml) + add_fdb_test(TEST_FILES fast/RESTKmsConnectorUnit.toml) + add_fdb_test(TEST_FILES fast/RESTUtilsUnit.toml) add_fdb_test(TEST_FILES fast/SelectorCorrectness.toml) add_fdb_test(TEST_FILES fast/Sideband.toml) add_fdb_test(TEST_FILES fast/SidebandSingle.toml) diff --git a/tests/fast/RESTKmsConnectorUnit.toml b/tests/fast/RESTKmsConnectorUnit.toml new file mode 100644 index 0000000000..72d892c659 --- /dev/null +++ b/tests/fast/RESTKmsConnectorUnit.toml @@ -0,0 +1,8 @@ +[[test]] +testTitle = 'RESTKmsConnectorTest' +useDB = false +startDelay = 0 + + [[test.workload]] + testName = 'UnitTests' + testsMatching = '/KmsConnector/REST/' diff --git a/tests/fast/RESTUtilsUnit.toml b/tests/fast/RESTUtilsUnit.toml new file mode 100644 index 0000000000..b55ed5e04e --- /dev/null +++ b/tests/fast/RESTUtilsUnit.toml @@ -0,0 +1,8 @@ +[[test]] +testTitle = 'RESTUtilsTest' +startDelay = 0 +useDB = false + + [[test.workload]] + testName = 'UnitTests' + testsMatching = '/RESTUtils' \ No newline at end of file From d6e55492267b078d6586a48c0f40195ac2bdc378 Mon Sep 17 00:00:00 2001 From: Ata E Husain Bohra Date: Sat, 7 May 2022 20:41:00 -0700 Subject: [PATCH 151/299] FDB native MockKMS REST server implementation - Golang Description Major changes include: 1. FDB native MockKMS REST server implementation - Golang based. 2. Implements "getEncryptionKeys" endpoint utilized by FDB RESTKmsConnector module. 3. 
Ability to inject faults to induce errors at various points during query execution NextSteps: Need to integrate MockKMS to FDB build system. Testing 1. Implements mockkms_test.go module providing extensive test coverage for newly added code. 2. Postman based local testing. --- bindings/go/src/mockkms/fault_injection.go | 179 ++++++++++ .../go/src/mockkms/get_encryption_keys.go | 321 ++++++++++++++++++ bindings/go/src/mockkms/mock_kms.go | 66 ++++ bindings/go/src/mockkms/mockkms_test.go | 302 ++++++++++++++++ bindings/go/src/mockkms/utils.go | 51 +++ 5 files changed, 919 insertions(+) create mode 100644 bindings/go/src/mockkms/fault_injection.go create mode 100644 bindings/go/src/mockkms/get_encryption_keys.go create mode 100644 bindings/go/src/mockkms/mock_kms.go create mode 100644 bindings/go/src/mockkms/mockkms_test.go create mode 100644 bindings/go/src/mockkms/utils.go diff --git a/bindings/go/src/mockkms/fault_injection.go b/bindings/go/src/mockkms/fault_injection.go new file mode 100644 index 0000000000..c6ce553938 --- /dev/null +++ b/bindings/go/src/mockkms/fault_injection.go @@ -0,0 +1,179 @@ +/* + * fault_injection.go + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Interface supports client to inject fault(s) +// Module enables a client to update { FaultLocation -> FaultStatus } mapping in a +// thread-safe manner, however, client is responsible to synchronize fault status +// updates across 'getEncryptionKeys' REST requests to obtain predictable results. + +package main + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "log" + "net/http" + "os" + "sync" +) + +type Fault struct { + Location int `json:"fault_location"` + Enable bool `json:"enable_fault"` +} + +type FaultInjectionRequest struct { + Faults []Fault `json:"faults"` +} + +type FaultInjectionResponse struct { + Faults []Fault `json:"faults"` +} + +type faultLocMap struct { + locMap map[int]bool + rwLock sync.RWMutex +} + +var ( + faultLocMapInstance *faultLocMap // Singleton mapping of { FaultLocation -> FaultStatus } +) + +// Caller is responsible for thread synchronization. 
Recommended to be invoked during package::init()
+func NewFaultLocMap() *faultLocMap {
+ if faultLocMapInstance == nil {
+ faultLocMapInstance = &faultLocMap{}
+
+ faultLocMapInstance.rwLock = sync.RWMutex{}
+ faultLocMapInstance.locMap = map[int]bool {
+ READ_HTTP_REQUEST_BODY : false,
+ UNMARSHAL_REQUEST_BODY_JSON : false,
+ UNSUPPORTED_QUERY_MODE : false,
+ PARSE_HTTP_REQUEST : false,
+ MARSHAL_RESPONSE : false,
+ }
+ }
+ return faultLocMapInstance
+}
+
+func getLocFaultStatus(loc int) (val bool, found bool) {
+ if faultLocMapInstance == nil {
+ panic("FaultLocMap not initialized")
+ }
+
+ faultLocMapInstance.rwLock.RLock()
+ defer faultLocMapInstance.rwLock.RUnlock()
+ val, found = faultLocMapInstance.locMap[loc]
+ if !found {
+ return
+ }
+ return
+}
+
+func updateLocFaultStatuses(faults []Fault) (updated []Fault, err error) {
+ if faultLocMapInstance == nil {
+ panic("FaultLocMap not initialized")
+ }
+
+ updated = []Fault{}
+ err = nil
+
+ faultLocMapInstance.rwLock.Lock()
+ defer faultLocMapInstance.rwLock.Unlock()
+ for i := 0; i < len(faults); i++ {
+ fault := faults[i]
+
+ oldVal, found := faultLocMapInstance.locMap[fault.Location]
+ if !found {
+ err = fmt.Errorf("Unknown fault_location '%d'", fault.Location)
+ return
+ }
+ faultLocMapInstance.locMap[fault.Location] = fault.Enable
+ log.Printf("Update Location '%d' oldVal '%t' newVal '%t'", fault.Location, oldVal, fault.Enable)
+ }
+
+ // return the updated faultLocMap
+ for loc, enable := range faultLocMapInstance.locMap {
+ var f Fault
+ f.Location = loc
+ f.Enable = enable
+ updated = append(updated, f)
+ }
+ return
+}
+
+func jsonifyFaultArr(w http.ResponseWriter, faults []Fault) (jResp string) {
+ resp := FaultInjectionResponse{
+ Faults: faults,
+ }
+
+ mResp, err := json.Marshal(resp)
+ if err != nil {
+ log.Printf("Error marshaling response '%s'", err.Error())
+ sendErrorResponse(w, err)
+ return
+ }
+ jResp = string(mResp)
+ return
+}
+
+func updateFaultLocMap(w http.ResponseWriter, faults []Fault) {
+ updated, err := updateLocFaultStatuses(faults)
+ if err != nil {
+ sendErrorResponse(w, err)
+ return
+ }
+
+ fmt.Fprint(w, jsonifyFaultArr(w, updated))
+}
+
+func shouldInjectFault(loc int) bool {
+ status, found := getLocFaultStatus(loc)
+ if !found {
+ log.Printf("Unknown fault_location '%d'", loc)
+ return false
+ }
+ return status
+}
+
+func handleUpdateFaultInjection(w http.ResponseWriter, r *http.Request) {
+ byteArr, err := ioutil.ReadAll(r.Body)
+ if err != nil {
+ log.Printf("Http request body read error '%s'", err.Error())
+ sendErrorResponse(w, err)
+ return
+ }
+
+ req := FaultInjectionRequest{}
+ err = json.Unmarshal(byteArr, &req)
+ if err != nil {
+ log.Printf("Error parsing FaultInjectionRequest '%s'", string(byteArr))
+ sendErrorResponse(w, err)
+ return
+ }
+ updateFaultLocMap(w, req.Faults)
+}
+
+func initFaultLocMap() {
+ faultLocMapInstance = NewFaultLocMap()
+ log.Printf("FaultLocMap init done")
+}
\ No newline at end of file
diff --git a/bindings/go/src/mockkms/get_encryption_keys.go b/bindings/go/src/mockkms/get_encryption_keys.go
new file mode 100644
index 0000000000..3189955e64
--- /dev/null
+++ b/bindings/go/src/mockkms/get_encryption_keys.go
@@ -0,0 +1,321 @@
+/*
+ * get_encryption_keys.go
+ *
+ * This source file is part of the FoundationDB open source project
+ *
+ * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// GetEncryptionKeys handler
+// Handler is responsible for the following:
+// 1. Parse the incoming HttpRequest and validate JSON request structural sanity
+// 2. Ability to handle getEncryptionKeys by 'KeyId' or 'DomainId' as requested
+// 3. Ability to inject faults if requested
+
+package main
+
+import (
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "log"
+ "math/rand"
+ "net/http"
+)
+
+type CipherDetailRes struct {
+ BaseCipherId uint64 `json:"base_cipher_id"`
+ EncryptDomainId int64 `json:"encrypt_domain_id"`
+ BaseCipher string `json:"base_cipher"`
+}
+
+type ValidationToken struct {
+ TokenName string `json:"token_name"`
+ TokenValue string `json:"token_value"`
+}
+
+type CipherDetailReq struct {
+ BaseCipherId uint64 `json:"base_cipher_id"`
+ EncryptDomainId int64 `json:"encrypt_domain_id"`
+}
+
+type GetEncryptKeysResponse struct {
+ CipherDetails []CipherDetailRes `json:"cipher_key_details"`
+ KmsUrls []string `json:"kms_urls"`
+}
+
+type GetEncryptKeysRequest struct {
+ QueryMode string `json:"query_mode"`
+ CipherDetails []CipherDetailReq `json:"cipher_key_details"`
+ ValidationTokens []ValidationToken `json:"validation_tokens"`
+ RefreshKmsUrls bool `json:"refresh_kms_urls"`
+}
+
+type cipherMapInstanceSingleton map[uint64][]byte
+
+const (
+ READ_HTTP_REQUEST_BODY = iota
+ UNMARSHAL_REQUEST_BODY_JSON
+ UNSUPPORTED_QUERY_MODE
+ PARSE_HTTP_REQUEST
+ MARSHAL_RESPONSE
+)
+
+const (
+ maxCipherKeys = uint64(1024*1024) // Max cipher keys
+ maxCipherSize = 16 // Max cipher buffer size
+)
+
+var (
+ cipherMapInstance cipherMapInstanceSingleton // Singleton mapping of { baseCipherId -> baseCipher }
+)
+
+// const mapping of { Location -> errorString }
+func errStrMap() func(int) string {
+ _errStrMap := map[int]string {
+ READ_HTTP_REQUEST_BODY : "Http request body read error",
+ UNMARSHAL_REQUEST_BODY_JSON : "Http request body unmarshal error",
+ UNSUPPORTED_QUERY_MODE : "Unsupported query_mode",
+ PARSE_HTTP_REQUEST : "Error parsing GetEncryptionKeys request",
+ MARSHAL_RESPONSE : "Error marshaling response",
+ }
+
+ return func(key int) string {
+ return _errStrMap[key]
+ }
+}
+
+// Caller is responsible for thread synchronization.
Recommended to be invoked during package::init() +func NewCipherMap(maxKeys uint64, cipherSize int) cipherMapInstanceSingleton { + if cipherMapInstance == nil { + cipherMapInstance = make(map[uint64][]byte) + + for i := uint64(1); i<= maxKeys; i++ { + cipher := make([]byte, cipherSize) + rand.Read(cipher) + cipherMapInstance[i] = cipher + } + log.Printf("KMS cipher map populate done, maxCiphers '%d'", maxCipherKeys) + } + return cipherMapInstance +} + +func getKmsUrls() (urls []string) { + urlCount := rand.Intn(5) + 1 + for i := 1; i <= urlCount; i++ { + url := fmt.Sprintf("https://KMS/%d:%d:%d:%d", i, i, i, i) + urls = append(urls, url) + } + return +} + +func isEncryptDomainIdValid(id int64) bool { + if id > 0 || id == -1 || id == -2 { + return true + } + return false +} + +func abs(x int64) int64 { + if x < 0 { + return -x + } + return x +} + +func getBaseCipherIdFromDomainId(domainId int64) (baseCipherId uint64) { + baseCipherId = uint64(1) + uint64(abs(domainId)) % maxCipherKeys + return +} + +func getEncryptionKeysByKeyIds(w http.ResponseWriter, byteArr []byte) { + req := GetEncryptKeysRequest{} + err := json.Unmarshal(byteArr, &req) + if err != nil || shouldInjectFault(PARSE_HTTP_REQUEST) { + var e error + if shouldInjectFault(PARSE_HTTP_REQUEST) { + e = fmt.Errorf("[FAULT] %s %s'", errStrMap()(PARSE_HTTP_REQUEST), string(byteArr)) + } else { + e = fmt.Errorf("%s %s' err '%v'", errStrMap()(PARSE_HTTP_REQUEST), string(byteArr), err) + } + log.Println(e.Error()) + sendErrorResponse(w, e) + return + } + + var details []CipherDetailRes + for i := 0; i < len(req.CipherDetails); i++ { + var baseCipherId = uint64(req.CipherDetails[i].BaseCipherId) + + var encryptDomainId = int64(req.CipherDetails[i].EncryptDomainId) + if !isEncryptDomainIdValid(encryptDomainId) { + e := fmt.Errorf("EncryptDomainId not valid '%d'", encryptDomainId) + sendErrorResponse(w, e) + return + } + + cipher, found := cipherMapInstance[baseCipherId] + if !found { + e := fmt.Errorf("BaseCipherId not found '%d'", baseCipherId) + sendErrorResponse(w, e) + return + } + + var detail = CipherDetailRes { + BaseCipherId: baseCipherId, + EncryptDomainId: encryptDomainId, + BaseCipher: string(cipher), + } + details = append(details, detail) + } + + var urls []string + if req.RefreshKmsUrls { + urls = getKmsUrls() + } + + resp := GetEncryptKeysResponse{ + CipherDetails: details, + KmsUrls: urls, + } + + mResp, err := json.Marshal(resp) + if err != nil || shouldInjectFault(MARSHAL_RESPONSE) { + var e error + if shouldInjectFault(MARSHAL_RESPONSE) { + e = fmt.Errorf("[FAULT] %s", errStrMap()(MARSHAL_RESPONSE)) + } else { + e = fmt.Errorf("%s err '%v'", errStrMap()(MARSHAL_RESPONSE), err) + } + log.Println(e.Error()) + sendErrorResponse(w, e) + return + } + + fmt.Fprintf(w, string(mResp)) +} + +func getEncryptionKeysByDomainIds(w http.ResponseWriter, byteArr []byte) { + req := GetEncryptKeysRequest{} + err := json.Unmarshal(byteArr, &req) + if err != nil || shouldInjectFault(PARSE_HTTP_REQUEST) { + var e error + if shouldInjectFault(PARSE_HTTP_REQUEST) { + e = fmt.Errorf("[FAULT] %s '%s'", errStrMap()(PARSE_HTTP_REQUEST), string(byteArr)) + } else { + e = fmt.Errorf("%s '%s' err '%v'", errStrMap()(PARSE_HTTP_REQUEST), string(byteArr), err) + } + log.Println(e.Error()) + sendErrorResponse(w, e) + return + } + + var details []CipherDetailRes + for i := 0; i < len(req.CipherDetails); i++ { + var encryptDomainId = int64(req.CipherDetails[i].EncryptDomainId) + if !isEncryptDomainIdValid(encryptDomainId) { + e := 
fmt.Errorf("EncryptDomainId not valid '%d'", encryptDomainId) + sendErrorResponse(w, e) + return + } + + var baseCipherId = getBaseCipherIdFromDomainId(encryptDomainId) + cipher, found := cipherMapInstance[baseCipherId] + if !found { + e := fmt.Errorf("BaseCipherId not found '%d'", baseCipherId) + sendErrorResponse(w, e) + return + } + + var detail = CipherDetailRes { + BaseCipherId: baseCipherId, + EncryptDomainId: encryptDomainId, + BaseCipher: string(cipher), + } + details = append(details, detail) + } + + var urls []string + if req.RefreshKmsUrls { + urls = getKmsUrls() + } + + resp := GetEncryptKeysResponse{ + CipherDetails: details, + KmsUrls: urls, + } + + mResp, err := json.Marshal(resp) + if err != nil || shouldInjectFault(MARSHAL_RESPONSE) { + var e error + if shouldInjectFault(MARSHAL_RESPONSE) { + e = fmt.Errorf("[FAULT] %s", errStrMap()(MARSHAL_RESPONSE)) + } else { + e = fmt.Errorf("%s err '%v'", errStrMap()(MARSHAL_RESPONSE), err) + } + log.Println(e.Error()) + sendErrorResponse(w, e) + return + } + + fmt.Fprintf(w, string(mResp)) +} + +func handleGetEncryptionKeys(w http.ResponseWriter, r *http.Request) { + byteArr, err := ioutil.ReadAll(r.Body) + if err != nil || shouldInjectFault(READ_HTTP_REQUEST_BODY) { + var e error + if shouldInjectFault(READ_HTTP_REQUEST_BODY) { + e = fmt.Errorf("[FAULT] %s", errStrMap()(READ_HTTP_REQUEST_BODY)) + } else { + e = fmt.Errorf("%s err '%v'", errStrMap()(READ_HTTP_REQUEST_BODY), err) + } + log.Println(e.Error()) + sendErrorResponse(w, e) + return + } + + var arbitrary_json map[string]interface{} + err = json.Unmarshal(byteArr, &arbitrary_json) + if err != nil || shouldInjectFault(UNMARSHAL_REQUEST_BODY_JSON) { + var e error + if shouldInjectFault(UNMARSHAL_REQUEST_BODY_JSON) { + e = fmt.Errorf("[FAULT] %s", errStrMap()(UNMARSHAL_REQUEST_BODY_JSON)) + } else { + e = fmt.Errorf("%s err '%v'", errStrMap()(UNMARSHAL_REQUEST_BODY_JSON), err) + } + log.Println(e.Error()) + sendErrorResponse(w, e) + return + } + + if shouldInjectFault(UNSUPPORTED_QUERY_MODE) { + err = fmt.Errorf("[FAULT] %s '%s'", errStrMap()(UNSUPPORTED_QUERY_MODE), arbitrary_json["query_mode"]) + sendErrorResponse(w, err) + return + } else if arbitrary_json["query_mode"] == "lookupByKeyId" { + getEncryptionKeysByKeyIds(w, byteArr) + } else if arbitrary_json["query_mode"] == "lookupByDomainId" { + getEncryptionKeysByDomainIds(w, byteArr) + } else { + err = fmt.Errorf("%s '%s'", errStrMap()(UNSUPPORTED_QUERY_MODE), arbitrary_json["query_mode"]) + sendErrorResponse(w, err) + return + } +} + +func initEncryptCipherMap() { + cipherMapInstance = NewCipherMap(maxCipherKeys, maxCipherSize) +} \ No newline at end of file diff --git a/bindings/go/src/mockkms/mock_kms.go b/bindings/go/src/mockkms/mock_kms.go new file mode 100644 index 0000000000..e271aa6cb5 --- /dev/null +++ b/bindings/go/src/mockkms/mock_kms.go @@ -0,0 +1,66 @@ +/* + * mock_kms.go + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// FoundationDB Mock KMS (Key Management Solution/Service) interface.
+// Runs an HTTP server handling REST calls that simulate FDB's communication
+// with an external KMS.
+
+package main
+
+import (
+    "log"
+    "math/rand"
+    "net/http"
+    "sync"
+    "time"
+)
+
+// KMS supported endpoints
+const (
+    getEncryptionKeysEndpoint    = "/getEncryptionKeys"
+    updateFaultInjectionEndpoint = "/updateFaultInjection"
+)
+
+// init instantiates the data structures necessary for MockKMS to function.
+func init() {
+    var wg sync.WaitGroup
+
+    wg.Add(2)
+    go func() {
+        initEncryptCipherMap()
+        wg.Done()
+    }()
+    go func() {
+        initFaultLocMap()
+        wg.Done()
+    }()
+
+    wg.Wait()
+
+    rand.Seed(time.Now().UTC().UnixNano())
+}
+
+func main() {
+    // Handlers are registered on the default ServeMux, which
+    // http.ListenAndServe uses when given a nil handler.
+    http.HandleFunc(getEncryptionKeysEndpoint, handleGetEncryptionKeys)
+    http.HandleFunc(updateFaultInjectionEndpoint, handleUpdateFaultInjection)
+
+    log.Fatal(http.ListenAndServe(":5001", nil))
+}
\ No newline at end of file
diff --git a/bindings/go/src/mockkms/mockkms_test.go b/bindings/go/src/mockkms/mockkms_test.go
new file mode 100644
index 0000000000..2613e444ea
--- /dev/null
+++ b/bindings/go/src/mockkms/mockkms_test.go
@@ -0,0 +1,302 @@
+/*
+ * mockkms_test.go
+ *
+ * This source file is part of the FoundationDB open source project
+ *
+ * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// MockKMS unit tests. The coverage includes:
+// 1. Mock HttpRequest creation and HttpResponse writer.
+// 2. Construction of fake requests to validate the following scenarios:
+// 2.1. Request with "unsupported query mode"
+// 2.2. Get encryption keys by KeyIds; with and without 'RefreshKmsUrls' flag.
+// 2.3. Get encryption keys by DomainIds; with and without 'RefreshKmsUrls' flag.
+// 2.4. 
Random fault injection and response validation + +package main + +import ( + "encoding/json" + "io/ioutil" + "math/rand" + "net/http" + "net/http/httptest" + "strings" + "testing" +) + +const ( + ByKeyIdReqWithRefreshUrls = `{ + "query_mode": "lookupByKeyId", + "cipher_key_details": [ + { + "base_cipher_id": 77, + "encrypt_domain_id": 76 + }, + { + "base_cipher_id": 2, + "encrypt_domain_id": -1 + } + ], + "validation_tokens": [ + { + "token_name": "1", + "token_value":"12344" + }, + { + "token_name": "2", + "token_value":"12334" + } + ], + "refresh_kms_urls": true + }` + ByKeyIdReqWithoutRefreshUrls = `{ + "query_mode": "lookupByKeyId", + "cipher_key_details": [ + { + "base_cipher_id": 77, + "encrypt_domain_id": 76 + }, + { + "base_cipher_id": 2, + "encrypt_domain_id": -1 + } + ], + "validation_tokens": [ + { + "token_name": "1", + "token_value":"12344" + }, + { + "token_name": "2", + "token_value":"12334" + } + ], + "refresh_kms_urls": false + }` + ByDomainIdReqWithRefreshUrls = `{ + "query_mode": "lookupByDomainId", + "cipher_key_details": [ + { + "encrypt_domain_id": 76 + }, + { + "encrypt_domain_id": -1 + } + ], + "validation_tokens": [ + { + "token_name": "1", + "token_value":"12344" + }, + { + "token_name": "2", + "token_value":"12334" + } + ], + "refresh_kms_urls": true + }` + ByDomainIdReqWithoutRefreshUrls = `{ + "query_mode": "lookupByDomainId", + "cipher_key_details": [ + { + "encrypt_domain_id": 76 + }, + { + "encrypt_domain_id": -1 + } + ], + "validation_tokens": [ + { + "token_name": "1", + "token_value":"12344" + }, + { + "token_name": "2", + "token_value":"12334" + } + ], + "refresh_kms_urls": false + }` + UnsupportedQueryMode= `{ + "query_mode": "foo_mode", + "cipher_key_details": [ + { + "encrypt_domain_id": 76 + }, + { + "encrypt_domain_id": -1 + } + ], + "validation_tokens": [ + { + "token_name": "1", + "token_value":"12344" + }, + { + "token_name": "2", + "token_value":"12334" + } + ], + "refresh_kms_urls": false + }` +) + +func unmarshalValidResponse(data []byte, t *testing.T) (resp GetEncryptKeysResponse) { + resp = GetEncryptKeysResponse{} + err := json.Unmarshal(data, &resp) + if err != nil { + t.Errorf("Error unmarshaling valid response '%s' error '%v'", string(data), err) + t.Fail() + } + return +} + +func unmarshalErrorResponse(data []byte, t *testing.T) (resp ErrorResponse) { + resp = ErrorResponse{} + err := json.Unmarshal(data, &resp) + if err != nil { + t.Errorf("Error unmarshaling error response resp '%s' error '%v'", string(data), err) + t.Fail() + } + return +} + +func checkGetEncyptKeysResponseValidity(resp GetEncryptKeysResponse, t *testing.T) { + if len(resp.CipherDetails) != 2 { + t.Errorf("Unexpected CipherDetails count, expected '%d' actual '%d'", 2, len(resp.CipherDetails)) + t.Fail() + } + + baseCipherIds := [...]uint64 {uint64(77), uint64(2)} + encryptDomainIds := [...]int64 {int64(76), int64(-1)} + + for i := 0; i < len(resp.CipherDetails); i++ { + if resp.CipherDetails[i].BaseCipherId != baseCipherIds[i] { + t.Errorf("Mismatch BaseCipherId, expected '%d' actual '%d'", baseCipherIds[i], resp.CipherDetails[i].BaseCipherId) + t.Fail() + } + if resp.CipherDetails[i].EncryptDomainId != encryptDomainIds[i] { + t.Errorf("Mismatch EncryptDomainId, expected '%d' actual '%d'", encryptDomainIds[i], resp.CipherDetails[i].EncryptDomainId) + t.Fail() + } + if len(resp.CipherDetails[i].BaseCipher) == 0 { + t.Error("Empty BaseCipher") + t.Fail() + } + } +} + +func runQueryExpectingErrorResponse(payload string, url string, errSubStr string, t *testing.T) { + 
body := strings.NewReader(payload) + req := httptest.NewRequest(http.MethodPost, url, body) + w := httptest.NewRecorder() + handleGetEncryptionKeys(w, req) + res := w.Result() + defer res.Body.Close() + data, err := ioutil.ReadAll(res.Body) + if err != nil { + t.Errorf("Error %v", err) + } + + resp := unmarshalErrorResponse(data, t) + if !strings.Contains(resp.Err.Detail, errSubStr) { + t.Errorf("Unexpected error response '%s'", resp.Err.Detail) + t.Fail() + } +} + +func runQueryExpectingValidResponse(payload string, url string, t *testing.T) { + body := strings.NewReader(payload) + req := httptest.NewRequest(http.MethodPost, url, body) + w := httptest.NewRecorder() + handleGetEncryptionKeys(w, req) + res := w.Result() + defer res.Body.Close() + data, err := ioutil.ReadAll(res.Body) + if err != nil { + t.Errorf("Error %v", err) + } + + resp := unmarshalValidResponse(data, t) + checkGetEncyptKeysResponseValidity(resp, t) +} + +func TestUnsupportedQueryMode(t *testing.T) { + runQueryExpectingErrorResponse(UnsupportedQueryMode, getEncryptionKeysEndpoint, errStrMap()(UNSUPPORTED_QUERY_MODE), t) +} + +func TestGetEncryptionKeysByKeyIdsWithRefreshUrls(t *testing.T) { + runQueryExpectingValidResponse(ByKeyIdReqWithRefreshUrls, getEncryptionKeysEndpoint, t) +} + +func TestGetEncryptionKeysByKeyIdsWithoutRefreshUrls(t *testing.T) { + runQueryExpectingValidResponse(ByKeyIdReqWithoutRefreshUrls, getEncryptionKeysEndpoint, t) +} + +func TestGetEncryptionKeysByDomainIdsWithRefreshUrls(t *testing.T) { + runQueryExpectingValidResponse(ByDomainIdReqWithRefreshUrls, getEncryptionKeysEndpoint, t) +} + +func TestGetEncryptionKeysByDomainIdsWithoutRefreshUrls(t *testing.T) { + runQueryExpectingValidResponse(ByDomainIdReqWithoutRefreshUrls, getEncryptionKeysEndpoint, t) +} + +func TestFaultInjection(t *testing.T) { + numIterations := rand.Intn(701) + 86 + + for i := 0; i < numIterations; i++ { + loc := rand.Intn(MARSHAL_RESPONSE + 1) + f := Fault{} + f.Location = loc + f.Enable = true + + var faults []Fault + faults = append(faults, f) + fW := httptest.NewRecorder() + body := strings.NewReader(jsonifyFaultArr(fW, faults)) + fReq := httptest.NewRequest(http.MethodPost, updateFaultInjectionEndpoint, body) + handleUpdateFaultInjection(fW, fReq) + if !shouldInjectFault(loc) { + t.Errorf("Expected fault enabled for loc '%d'", loc) + t.Fail() + } + + var payload string + lottery := rand.Intn(100) + if lottery < 25 { + payload = ByKeyIdReqWithRefreshUrls + } else if lottery >= 25 && lottery < 50 { + payload = ByKeyIdReqWithoutRefreshUrls + } else if lottery >= 50 && lottery < 75 { + payload = ByDomainIdReqWithRefreshUrls + } else { + payload = ByDomainIdReqWithoutRefreshUrls + } + runQueryExpectingErrorResponse(payload, getEncryptionKeysEndpoint, errStrMap()(loc), t) + + // reset Fault + faults[0].Enable = false + fW = httptest.NewRecorder() + body = strings.NewReader(jsonifyFaultArr(fW, faults)) + fReq = httptest.NewRequest(http.MethodPost, updateFaultInjectionEndpoint, body) + handleUpdateFaultInjection(fW, fReq) + if shouldInjectFault(loc) { + t.Errorf("Expected fault disabled for loc '%d'", loc) + t.Fail() + } + } +} \ No newline at end of file diff --git a/bindings/go/src/mockkms/utils.go b/bindings/go/src/mockkms/utils.go new file mode 100644 index 0000000000..e32a8f1a93 --- /dev/null +++ b/bindings/go/src/mockkms/utils.go @@ -0,0 +1,51 @@ +/* + * utils.go + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "encoding/json" + "fmt" + "log" + "net/http" +) + +type ErrorDetail struct { + Detail string `json:"details"` +} + +type ErrorResponse struct { + Err ErrorDetail `json:"error"` +} + +func sendErrorResponse(w http.ResponseWriter, err error) { + e := ErrorDetail{} + e.Detail = fmt.Sprintf("Error: %s", err.Error()) + resp := ErrorResponse{ + Err: e, + } + + mResp,err := json.Marshal(resp) + if err != nil { + log.Printf("Error marshalling error response %s", err.Error()) + panic(err) + } + fmt.Fprintf(w, string(mResp)) +} \ No newline at end of file From 637044fd5497f0497342c555864521bb12b7e9d6 Mon Sep 17 00:00:00 2001 From: Junhyun Shim Date: Mon, 9 May 2022 13:39:29 +0200 Subject: [PATCH 152/299] Add testcase for when client chain is invalid Optionally allow the leaf certificates to have already expired --- flow/MkCert.cpp | 19 +++++++++----- flow/MkCert.h | 4 +++ flow/TLSTest.cpp | 65 +++++++++++++++++++++++++++++++++--------------- 3 files changed, 62 insertions(+), 26 deletions(-) diff --git a/flow/MkCert.cpp b/flow/MkCert.cpp index 96d00676a5..6adebd8e0f 100644 --- a/flow/MkCert.cpp +++ b/flow/MkCert.cpp @@ -351,12 +351,10 @@ CertChainRef makeCertChain(Arena& arena, VectorRef specs, CertAndKe } } -CertChainRef makeCertChain(Arena& arena, unsigned length, ESide side) { +VectorRef makeCertChainSpec(Arena& arena, unsigned length, ESide side) { if (!length) return {}; - // temporary arena for writing up specs - auto tmpArena = Arena(); - auto specs = new (tmpArena) CertSpecRef[length]; + auto specs = new (arena) CertSpecRef[length]; auto const isServerSide = side == ESide::Server; for (auto i = 0u; i < length; i++) { auto kind = CertKind{}; @@ -366,9 +364,18 @@ CertChainRef makeCertChain(Arena& arena, unsigned length, ESide side) { kind = isServerSide ? CertKind(ServerRootCA{}) : CertKind(ClientRootCA{}); else kind = isServerSide ? 
CertKind(ServerIntermediateCA{ i }) : CertKind(ClientIntermediateCA{ i }); - specs[i] = CertSpecRef::make(tmpArena, kind); + specs[i] = CertSpecRef::make(arena, kind); } - return makeCertChain(arena, VectorRef(specs, length), {} /*root*/); + return VectorRef(specs, length); +} + +CertChainRef makeCertChain(Arena& arena, unsigned length, ESide side) { + if (!length) + return {}; + // temporary arena for writing up specs + auto tmpArena = Arena(); + auto specs = makeCertChainSpec(tmpArena, length, side); + return makeCertChain(arena, specs, {} /*root*/); } } // namespace mkcert diff --git a/flow/MkCert.h b/flow/MkCert.h index c5f23cde76..28b286491d 100644 --- a/flow/MkCert.h +++ b/flow/MkCert.h @@ -158,6 +158,10 @@ StringRef concatCertChain(Arena& arena, CertChainRef chain); enum class ESide : int { Server, Client }; +// Generate a chain of valid cert specs that are always the same given the same parameters +// The side parameter makes a difference in the commonName ("CN") field of the produced specs +VectorRef makeCertChainSpec(Arena& arena, unsigned length, ESide side); + // For empty (default) rootAuthority, the last item in specs is used to generate rootAuthority // Otherwise, rootAuthority is deep-copied to first element of returned chain CertChainRef makeCertChain(Arena& arena, VectorRef specs, CertAndKeyRef rootAuthority); diff --git a/flow/TLSTest.cpp b/flow/TLSTest.cpp index 670e01efb5..da8ff8a2c3 100644 --- a/flow/TLSTest.cpp +++ b/flow/TLSTest.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -92,10 +93,11 @@ void init_client_ssl_context() { auto& ctx = client_ssl; ctx.set_options(ssl::context::default_workarounds); ctx.set_verify_mode(ssl::context::verify_peer | ssl::verify_fail_if_no_peer_cert); + /* ctx.set_verify_callback([](bool preverify, ssl::verify_context&) { - logc("context preverify: {}", preverify); - return preverify; - }); + logc("context preverify: {}", preverify); + return preverify; + });*/ init_certs(ctx, client_chain, server_chain.empty() ? StringRef() : server_chain.back().certPem); } @@ -103,10 +105,11 @@ void init_server_ssl_context() { auto& ctx = server_ssl; ctx.set_options(ssl::context::default_workarounds); ctx.set_verify_mode(ssl::context::verify_peer | (client_chain.empty() ? 0 : ssl::verify_fail_if_no_peer_cert)); + /* ctx.set_verify_callback([](bool preverify, ssl::verify_context&) { - logs("context preverify: {}", preverify); - return preverify; - }); + logs("context preverify: {}", preverify); + return preverify; + });*/ init_certs(ctx, server_chain, client_chain.empty() ? StringRef() : client_chain.back().certPem); } @@ -121,8 +124,9 @@ struct fmt::formatter { }; int main(int argc, char** argv) { - auto const server_chain_len = (argc > 1 ? std::strtoul(argv[1], nullptr, 10) : 3ul); - auto const client_chain_len = (argc > 2 ? std::strtoul(argv[2], nullptr, 10) : 3ul); + auto const server_chain_len = (argc > 1 ? std::strtol(argv[1], nullptr, 10) : 3l); + auto const client_chain_len = (argc > 2 ? 
std::strtol(argv[2], nullptr, 10) : 3l); + auto const expect_handshake_ok = client_chain_len >= 0 && server_chain_len > 0; auto const expect_trusted = client_chain_len != 0; log("cert chain length: server {}, client {}", server_chain_len, client_chain_len); [[maybe_unused]] auto print_chain = [](mkcert::CertChainRef chain) -> void { @@ -138,10 +142,24 @@ int main(int argc, char** argv) { } }; auto arena = Arena(); - if (server_chain_len > 0) - server_chain = mkcert::makeCertChain(arena, server_chain_len, mkcert::ESide::Server); - if (client_chain_len > 0) - client_chain = mkcert::makeCertChain(arena, client_chain_len, mkcert::ESide::Client); + if (server_chain_len) { + auto tmpArena = Arena(); + auto specs = mkcert::makeCertChainSpec(tmpArena, std::labs(server_chain_len), mkcert::ESide::Server); + if (server_chain_len < 0) { + specs[0].offsetNotBefore = -60l * 60 * 24 * 365; + specs[0].offsetNotAfter = -10l; // cert that expired 10 seconds ago + } + server_chain = mkcert::makeCertChain(arena, specs, {} /* create root CA cert from spec*/); + } + if (client_chain_len) { + auto tmpArena = Arena(); + auto specs = mkcert::makeCertChainSpec(tmpArena, std::labs(client_chain_len), mkcert::ESide::Client); + if (client_chain_len < 0) { + specs[0].offsetNotBefore = -60l * 60 * 24 * 365; + specs[0].offsetNotAfter = -10l; // cert that expired 10 seconds ago + } + client_chain = mkcert::makeCertChain(arena, specs, {} /* create root CA cert from spec*/); + } /* log("=========== SERVER CHAIN"); print_chain(server_chain); @@ -170,17 +188,18 @@ int main(int argc, char** argv) { enum class ESockState { AssumedUntrusted, Trusted }; auto server_sock_state = ESockState::AssumedUntrusted; auto client_sock_state = ESockState::AssumedUntrusted; - server_ssl_sock.set_verify_callback([&server_sock_state](bool preverify, ssl::verify_context&) { + auto handshake_ok = true; + server_ssl_sock.set_verify_callback([&server_sock_state, &handshake_ok](bool preverify, ssl::verify_context&) { logs("client preverify: {}", preverify); switch (server_sock_state) { case ESockState::AssumedUntrusted: if (!preverify) - return false; + return handshake_ok = false; server_sock_state = ESockState::Trusted; break; case ESockState::Trusted: if (!preverify) - return false; + return handshake_ok = false; break; default: break; @@ -206,17 +225,17 @@ int main(int argc, char** argv) { }); auto client_sock = tcp::socket(io); auto client_ssl_sock = socket_type(client_sock, client_ssl); - client_ssl_sock.set_verify_callback([&client_sock_state](bool preverify, ssl::verify_context&) { + client_ssl_sock.set_verify_callback([&client_sock_state, &handshake_ok](bool preverify, ssl::verify_context&) { logc("server preverify: {}", preverify); switch (client_sock_state) { case ESockState::AssumedUntrusted: if (!preverify) - return false; + return handshake_ok = false; client_sock_state = ESockState::Trusted; break; case ESockState::Trusted: if (!preverify) - return false; + return handshake_ok = false; break; default: break; @@ -241,7 +260,13 @@ int main(int argc, char** argv) { } }); io.run(); - ASSERT_EQ(expect_trusted, (server_sock_state == ESockState::Trusted)); - log("Test OK: Connection considered {}", server_sock_state == ESockState::Trusted ? "trusted" : "untrusted"); + ASSERT_EQ(expect_handshake_ok, handshake_ok); + if (expect_handshake_ok) { + ASSERT_EQ(expect_trusted, (server_sock_state == ESockState::Trusted)); + log("Test OK: Handshake passed and connection {} as expected", + server_sock_state == ESockState::Trusted ? 
"trusted" : "untrusted"); + } else { + log("Test OK: Handshake failed as expected"); + } return 0; } From 033980201462340ef667d5c59a56dbd722664db4 Mon Sep 17 00:00:00 2001 From: Junhyun Shim Date: Mon, 9 May 2022 14:38:57 +0200 Subject: [PATCH 153/299] Remove debug messages --- flow/MkCert.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/flow/MkCert.cpp b/flow/MkCert.cpp index 6adebd8e0f..40e1141fa3 100644 --- a/flow/MkCert.cpp +++ b/flow/MkCert.cpp @@ -45,7 +45,6 @@ public: }; [[noreturn]] void traceAndThrow(const char* condition, const char* file, int line) { - fprintf(stderr, "Failed condition check %s at %s:%d\n", condition, file, line); auto te = TraceEvent(SevWarnAlways, "ErrorTLSKeyOrCertGen"); te.suppressFor(60).detail("File", file).detail("Line", line).detail("Condition", condition); if (auto err = ::ERR_get_error()) { @@ -54,7 +53,6 @@ public: }; ::ERR_error_string_n(err, buf, sizeof(buf)); te.detail("OpenSSLError", buf); - fprintf(stderr, "OpenSSL error: %s\n", buf); } throw tls_error(); } From 968c2cad4334b0d8bf5d8aadaf2524c23929607c Mon Sep 17 00:00:00 2001 From: Hao Fu <77984096+hfu94@users.noreply.github.com> Date: Mon, 9 May 2022 11:48:45 -0700 Subject: [PATCH 154/299] Reduce workload in CycleMultiClientIntegrationTest (#7100) --- .../apple/foundationdb/CycleMultiClientIntegrationTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/java/src/integration/com/apple/foundationdb/CycleMultiClientIntegrationTest.java b/bindings/java/src/integration/com/apple/foundationdb/CycleMultiClientIntegrationTest.java index 6c277c72b0..5087361c43 100644 --- a/bindings/java/src/integration/com/apple/foundationdb/CycleMultiClientIntegrationTest.java +++ b/bindings/java/src/integration/com/apple/foundationdb/CycleMultiClientIntegrationTest.java @@ -43,8 +43,8 @@ public class CycleMultiClientIntegrationTest { public static final MultiClientHelper clientHelper = new MultiClientHelper(); // more write txn than validate txn, as parent thread waits only for validate txn. - private static final int writeTxnCnt = 2000; - private static final int validateTxnCnt = 1000; + private static final int writeTxnCnt = 200; + private static final int validateTxnCnt = 100; private static final int threadPerDB = 5; private static final int cycleLength = 4; From 13b2b455b1f69f6f29c7628e777c7c6cbdd7ab2d Mon Sep 17 00:00:00 2001 From: Renxuan Wang Date: Mon, 2 May 2022 17:34:22 -0700 Subject: [PATCH 155/299] HTTP proxy for backup. Only support non-TLS blobs now. 
--- fdbbackup/backup.actor.cpp | 5 ++++ fdbclient/S3BlobStore.actor.cpp | 41 +++++++++++++++++++++++++++------ 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 9ce6b03b18..fee0595496 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -620,6 +620,7 @@ CSimpleOpt::SOption g_rgBackupListOptions[] = { #endif { OPT_BASEURL, "-b", SO_REQ_SEP }, { OPT_BASEURL, "--base-url", SO_REQ_SEP }, + { OPT_PROXY, "--proxy", SO_REQ_SEP }, { OPT_TRACE, "--log", SO_NONE }, { OPT_TRACE_DIR, "--logdir", SO_REQ_SEP }, { OPT_TRACE_FORMAT, "--trace-format", SO_REQ_SEP }, @@ -3336,6 +3337,10 @@ int main(int argc, char* argv[]) { } Optional proxy; + std::string p; + if (platform::getEnvironmentVar("HTTP_PROXY", p) || platform::getEnvironmentVar("HTTPS_PROXY", p)) { + proxy = p; + } std::string destinationContainer; bool describeDeep = false; bool describeTimestamps = false; diff --git a/fdbclient/S3BlobStore.actor.cpp b/fdbclient/S3BlobStore.actor.cpp index ad859b47a4..b80c83d3f1 100644 --- a/fdbclient/S3BlobStore.actor.cpp +++ b/fdbclient/S3BlobStore.actor.cpp @@ -185,11 +185,20 @@ Reference S3BlobStoreEndpoint::fromString(const std::string Optional proxyHost, proxyPort; if (proxy.present()) { - if (!Hostname::isHostname(proxy.get()) && !NetworkAddress::parseOptional(proxy.get()).present()) { - throw format("'%s' is not a valid value for proxy. Format should be either IP:port or host:port.", - proxy.get().c_str()); + StringRef proxyRef(proxy.get()); + if (proxy.get().find("://") != std::string::npos) { + StringRef proxyPrefix = proxyRef.eat("://"); + if (proxyPrefix != LiteralStringRef("http")) { + throw format("Invalid proxy URL prefix '%s'. Either don't use a prefix, or use http://", + proxyPrefix.toString().c_str()); + } } - StringRef p(proxy.get()); + std::string proxyBody = proxyRef.eat().toString(); + if (!Hostname::isHostname(proxyBody) && !NetworkAddress::parseOptional(proxyBody).present()) { + throw format("'%s' is not a valid value for proxy. Format should be either IP:port or host:port.", + proxyBody.c_str()); + } + StringRef p(proxyBody); proxyHost = p.eat(":").toString(); proxyPort = p.eat().toString(); } @@ -645,10 +654,24 @@ ACTOR Future connect_impl(Referencehost, service = b->service; - if (service.empty()) + if (service.empty()) { + if (b->useProxy) { + fprintf(stderr, "ERROR: Port can't be empty when using HTTP proxy.\n"); + throw connection_failed(); + } service = b->knobs.secure_connection ? "https" : "http"; - state Reference conn = - wait(INetworkConnections::net()->connect(host, service, b->knobs.secure_connection ? true : false)); + } + bool isTLS = b->knobs.secure_connection == 1; + if (b->useProxy) { + // TODO(renxuan): Support http proxy + TLS + if (isTLS || b->service == "443") { + fprintf(stderr, "ERROR: TLS is not supported yet when using HTTP proxy.\n"); + throw connection_failed(); + } + host = b->proxyHost.get(); + service = b->proxyPort.get(); + } + state Reference conn = wait(INetworkConnections::net()->connect(host, service, isTLS)); wait(conn->connectHandshake()); TraceEvent("S3BlobStoreEndpointNewConnection") @@ -752,6 +775,10 @@ ACTOR Future> doRequest_impl(ReferencesetAuthHeaders(verb, resource, headers); } + if (bstore->useProxy) { + // Has to be in absolute-form. 
+			resource = "http://" + bstore->host + ":" + bstore->service + resource;
+		}
 		remoteAddress = rconn.conn->getPeerAddress();
 		wait(bstore->requestRate->getAllowance(1));
 		Reference<HTTP::Response> _r = wait(timeoutError(HTTP::doRequest(rconn.conn,

From 9190fffbb6e28dc8bce39af9e86344187f7578b9 Mon Sep 17 00:00:00 2001
From: Renxuan Wang
Date: Mon, 2 May 2022 21:34:33 -0700
Subject: [PATCH 156/299] Commit suggestion.

Co-authored-by: Jingyu Zhou
---
 fdbclient/S3BlobStore.actor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdbclient/S3BlobStore.actor.cpp b/fdbclient/S3BlobStore.actor.cpp
index b80c83d3f1..8a3ff649b5 100644
--- a/fdbclient/S3BlobStore.actor.cpp
+++ b/fdbclient/S3BlobStore.actor.cpp
@@ -188,7 +188,7 @@ Reference<S3BlobStoreEndpoint> S3BlobStoreEndpoint::fromString(const std::string
 		StringRef proxyRef(proxy.get());
 		if (proxy.get().find("://") != std::string::npos) {
 			StringRef proxyPrefix = proxyRef.eat("://");
-			if (proxyPrefix != LiteralStringRef("http")) {
+			if (proxyPrefix != "http"_sr) {
 				throw format("Invalid proxy URL prefix '%s'. Either don't use a prefix, or use http://",
 				             proxyPrefix.toString().c_str());
 			}

From b1ce3fc15a82f4d2ee60aa2d0c2efcbd3f201cdd Mon Sep 17 00:00:00 2001
From: Sam Gwydir
Date: Mon, 2 May 2022 16:20:54 -0700
Subject: [PATCH 157/299] WolfSSL fix for TokenSign

---
 fdbrpc/TokenSign.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fdbrpc/TokenSign.cpp b/fdbrpc/TokenSign.cpp
index 4f872b7115..1d57fa0033 100644
--- a/fdbrpc/TokenSign.cpp
+++ b/fdbrpc/TokenSign.cpp
@@ -28,6 +28,9 @@
 #include "flow/Trace.h"
 #include "flow/UnitTest.h"
 #include
+#if defined(HAVE_WOLFSSL)
+#include
+#endif
 #include
 #include
 #include

From 7578d5ebc755ca737923345c1c7f1b978e3b3ca9 Mon Sep 17 00:00:00 2001
From: Vishesh Yadav
Date: Tue, 3 May 2022 17:14:38 -0700
Subject: [PATCH 158/299] Create GlobalConfig object for each database instance

Currently, GlobalConfig is a singleton, which means there is only one
GlobalConfig object per process. This is a bug from the client's
perspective, as a client can keep connections open to several databases.
This patch instead tracks one GlobalConfig per database, using an
unordered_map stored in flowGlobals.

We discovered this bug while testing the multi-version client, where the
client got stuck. That was lucky; normally the client would simply have
written the configuration to the wrong database.
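The shape of the fix, as a hedged Go sketch (our own names, not the C++ API):
configs live in a map keyed by database ID and are created on first use, so
two databases can no longer share one object.

    package main

    import "fmt"

    // GlobalConfig stands in for the per-database configuration cache.
    type GlobalConfig struct{ dbID string }

    // configByDB plays the role of the ConfigMap kept in flowGlobals.
    var configByDB = map[string]*GlobalConfig{}

    // configFor returns the config for a database, creating it on first access.
    func configFor(dbID string) *GlobalConfig {
        if c, ok := configByDB[dbID]; ok {
            return c
        }
        c := &GlobalConfig{dbID: dbID}
        configByDB[dbID] = c
        return c
    }

    func main() {
        a := configFor("clusterA")
        b := configFor("clusterB")
        // Each database gets its own object; the old singleton shared one and
        // could write configuration to the wrong cluster.
        fmt.Println(a != b) // true
    }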
--- fdbcli/ProfileCommand.actor.cpp | 11 ++++++---- fdbcli/fdbcli.actor.cpp | 4 ++-- fdbcli/fdbcli.actor.h | 5 ++++- fdbclient/GlobalConfig.actor.cpp | 14 ++++++++---- fdbclient/GlobalConfig.actor.h | 22 +++++++++++++++---- fdbclient/NativeAPI.actor.cpp | 34 +++++++++++++++-------------- fdbclient/NativeAPI.actor.h | 1 + fdbclient/SpecialKeySpace.actor.cpp | 4 ++-- fdbclient/SpecialKeySpace.actor.h | 6 ++++- fdbserver/worker.actor.cpp | 12 +++++----- 10 files changed, 73 insertions(+), 40 deletions(-) diff --git a/fdbcli/ProfileCommand.actor.cpp b/fdbcli/ProfileCommand.actor.cpp index 43d3d7a966..3c5710e53f 100644 --- a/fdbcli/ProfileCommand.actor.cpp +++ b/fdbcli/ProfileCommand.actor.cpp @@ -35,7 +35,10 @@ namespace fdb_cli { -ACTOR Future profileCommandActor(Reference tr, std::vector tokens, bool intrans) { +ACTOR Future profileCommandActor(Database db, + Reference tr, + std::vector tokens, + bool intrans) { state bool result = true; if (tokens.size() == 1) { printUsage(tokens[0]); @@ -45,7 +48,7 @@ ACTOR Future profileCommandActor(Reference tr, std::vector\n"); return false; } - wait(GlobalConfig::globalConfig().onInitialized()); + wait(GlobalConfig::globalConfig(db).onInitialized()); if (tokencmp(tokens[2], "get")) { if (tokens.size() != 3) { fprintf(stderr, "ERROR: Addtional arguments to `get` are not supported.\n"); @@ -53,12 +56,12 @@ ACTOR Future profileCommandActor(Reference tr, std::vector( + const double sampleRateDbl = GlobalConfig::globalConfig(db).get( fdbClientInfoTxnSampleRate, std::numeric_limits::infinity()); if (!std::isinf(sampleRateDbl)) { sampleRateStr = std::to_string(sampleRateDbl); } - const int64_t sizeLimit = GlobalConfig::globalConfig().get(fdbClientInfoTxnSizeLimit, -1); + const int64_t sizeLimit = GlobalConfig::globalConfig(db).get(fdbClientInfoTxnSizeLimit, -1); if (sizeLimit != -1) { sizeLimitStr = boost::lexical_cast(sizeLimit); } diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index aecd2479be..e936a98420 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -782,7 +782,7 @@ void fdbcliCompCmd(std::string const& text, std::vector& lc) { int count = tokens.size(); // for(int i = 0; i < count; i++) { - // printf("Token (%d): `%s'\n", i, tokens[i].toString().c_str()); + // printf("Token (%d): `%s'\n", i, tokens[i].toString().c_str()); // } std::string ntext = ""; @@ -1552,7 +1552,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { if (tokencmp(tokens[0], "profile")) { getTransaction(db, managementTenant, tr, options, intrans); - bool _result = wait(makeInterruptable(profileCommandActor(tr, tokens, intrans))); + bool _result = wait(makeInterruptable(profileCommandActor(localDb, tr, tokens, intrans))); if (!_result) is_error = true; continue; diff --git a/fdbcli/fdbcli.actor.h b/fdbcli/fdbcli.actor.h index ea05695e0b..227f47b7d6 100644 --- a/fdbcli/fdbcli.actor.h +++ b/fdbcli/fdbcli.actor.h @@ -217,7 +217,10 @@ ACTOR Future clearHealthyZone(Reference db, bool clearSSFailureZoneString = false); ACTOR Future maintenanceCommandActor(Reference db, std::vector tokens); // profile command -ACTOR Future profileCommandActor(Reference tr, std::vector tokens, bool intrans); +ACTOR Future profileCommandActor(Database db, + Reference tr, + std::vector tokens, + bool intrans); // setclass command ACTOR Future setClassCommandActor(Reference db, std::vector tokens); // snapshot command diff --git a/fdbclient/GlobalConfig.actor.cpp b/fdbclient/GlobalConfig.actor.cpp index 4a2e4aec56..de5963d31a 100644 --- 
a/fdbclient/GlobalConfig.actor.cpp +++ b/fdbclient/GlobalConfig.actor.cpp @@ -39,10 +39,16 @@ const KeyRef samplingWindow = LiteralStringRef("visibility/sampling/window"); GlobalConfig::GlobalConfig(Database& cx) : cx(cx), lastUpdate(0) {} -GlobalConfig& GlobalConfig::globalConfig() { - void* res = g_network->global(INetwork::enGlobalConfig); - ASSERT(res); - return *reinterpret_cast(res); +GlobalConfig& GlobalConfig::globalConfig(const Database& cx) { + return GlobalConfig::globalConfig(cx->dbId); +} + +GlobalConfig& GlobalConfig::globalConfig(UID dbid) { + ConfigMap* config_map = reinterpret_cast(g_network->global(INetwork::enGlobalConfig)); + auto res = config_map->find(dbid); + ASSERT(res != config_map->end()); + ASSERT(res->second); + return *reinterpret_cast(res->second); } Key GlobalConfig::prefixedKey(KeyRef key) { diff --git a/fdbclient/GlobalConfig.actor.h b/fdbclient/GlobalConfig.actor.h index 91a78921cd..833a68f708 100644 --- a/fdbclient/GlobalConfig.actor.h +++ b/fdbclient/GlobalConfig.actor.h @@ -33,6 +33,7 @@ #include #include "fdbclient/CommitProxyInterface.h" +#include "fdbclient/DatabaseContext.h" #include "fdbclient/GlobalConfig.h" #include "fdbclient/ReadYourWrites.h" @@ -65,6 +66,8 @@ struct ConfigValue : ReferenceCounted { }; class GlobalConfig : NonCopyable { + typedef std::unordered_map ConfigMap; + public: // Creates a GlobalConfig singleton, accessed by calling // GlobalConfig::globalConfig(). This function requires a database object @@ -77,23 +80,34 @@ public: // database. template static void create(Database& cx, Reference const> db, const ClientDBInfo* dbInfo) { - if (g_network->global(INetwork::enGlobalConfig) == nullptr) { + auto config_map = + reinterpret_cast(g_network->global(INetwork::enGlobalConfig)); + if (config_map == nullptr) { + auto m = new ConfigMap(); + g_network->setGlobal(INetwork::enGlobalConfig, m); + config_map = m; + } + + auto it = config_map->find(cx.dbId()); + if (it == config_map->end()) { auto config = new GlobalConfig{ cx }; - g_network->setGlobal(INetwork::enGlobalConfig, config); + config_map->emplace(cx.dbId(), config); config->_updater = updater(config, dbInfo); // Bind changes in `db` to the `dbInfoChanged` AsyncTrigger. // TODO: Change AsyncTrigger to a Reference forward(db, std::addressof(config->dbInfoChanged)); } else { - GlobalConfig* config = reinterpret_cast(g_network->global(INetwork::enGlobalConfig)); + GlobalConfig* config = it->second; config->cx = cx; + config->_updater = updater(config, dbInfo); } } // Returns a reference to the global GlobalConfig object. Clients should // call this function whenever they need to read a value out of the global // configuration. - static GlobalConfig& globalConfig(); + static GlobalConfig& globalConfig(const Database& cx); + static GlobalConfig& globalConfig(UID dbid); // Use this function to turn a global configuration key defined above into // the full path needed to set the value in the database. 
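The forward() call in create() above binds dbInfo changes to the config's
dbInfoChanged trigger. A hedged Go sketch of that idea (channels stand in for
AsyncVar/AsyncTrigger, which differ in detail):

    package main

    import "fmt"

    // forward propagates change notifications from a watched value to a
    // trigger, coalescing bursts into one pending wake-up.
    func forward(changes <-chan struct{}, trigger chan struct{}) {
        for range changes {
            select {
            case trigger <- struct{}{}:
            default: // a wake-up is already pending; coalesce
            }
        }
    }

    func main() {
        changes := make(chan struct{}, 1)
        trigger := make(chan struct{}, 1)
        go forward(changes, trigger)
        changes <- struct{}{} // dbInfo changed
        <-trigger             // updater wakes and re-reads the config history
        fmt.Println("updater notified")
        close(changes)
    }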
diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 3c06c0f506..42e1c0a95c 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -809,12 +809,12 @@ ACTOR static Future clientStatusUpdateActor(DatabaseContext* cx) { } } cx->clientStatusUpdater.outStatusQ.clear(); - wait(GlobalConfig::globalConfig().onInitialized()); - double sampleRate = GlobalConfig::globalConfig().get(fdbClientInfoTxnSampleRate, - std::numeric_limits::infinity()); + wait(GlobalConfig::globalConfig(cx->dbId).onInitialized()); + double sampleRate = GlobalConfig::globalConfig(cx->dbId).get( + fdbClientInfoTxnSampleRate, std::numeric_limits::infinity()); double clientSamplingProbability = std::isinf(sampleRate) ? CLIENT_KNOBS->CSI_SAMPLING_PROBABILITY : sampleRate; - int64_t sizeLimit = GlobalConfig::globalConfig().get(fdbClientInfoTxnSizeLimit, -1); + int64_t sizeLimit = GlobalConfig::globalConfig(cx->dbId).get(fdbClientInfoTxnSizeLimit, -1); int64_t clientTxnInfoSizeLimit = sizeLimit == -1 ? CLIENT_KNOBS->CSI_SIZE_LIMIT : sizeLimit; if (!trChunksQ.empty() && deterministicRandom()->random01() < clientSamplingProbability) wait(delExcessClntTxnEntriesActor(&tr, clientTxnInfoSizeLimit)); @@ -1554,8 +1554,8 @@ DatabaseContext::DatabaseContext(Reference(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::GLOBALCONFIG))); - registerSpecialKeysImpl( + std::make_unique(dbId, SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::GLOBALCONFIG))); + registerSpecialKeySpaceModule( SpecialKeySpace::MODULE::TRACING, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::TRACING))); @@ -1937,13 +1937,13 @@ Future DatabaseContext::onProxiesChanged() const { } bool DatabaseContext::sampleReadTags() const { - double sampleRate = GlobalConfig::globalConfig().get(transactionTagSampleRate, CLIENT_KNOBS->READ_TAG_SAMPLE_RATE); + double sampleRate = GlobalConfig::globalConfig(dbId).get(transactionTagSampleRate, CLIENT_KNOBS->READ_TAG_SAMPLE_RATE); return sampleRate > 0 && deterministicRandom()->random01() <= sampleRate; } bool DatabaseContext::sampleOnCost(uint64_t cost) const { double sampleCost = - GlobalConfig::globalConfig().get(transactionTagSampleCost, CLIENT_KNOBS->COMMIT_SAMPLE_COST); + GlobalConfig::globalConfig(dbId).get(transactionTagSampleCost, CLIENT_KNOBS->COMMIT_SAMPLE_COST); if (sampleCost <= 0) return false; return deterministicRandom()->random01() <= (double)cost / sampleCost; @@ -2221,8 +2221,8 @@ Database Database::createDatabase(Reference connRecord auto database = Database(db); GlobalConfig::create( database, Reference const>(clientInfo), std::addressof(clientInfo->get())); - GlobalConfig::globalConfig().trigger(samplingFrequency, samplingProfilerUpdateFrequency); - GlobalConfig::globalConfig().trigger(samplingWindow, samplingProfilerUpdateWindow); + GlobalConfig::globalConfig(database).trigger(samplingFrequency, samplingProfilerUpdateFrequency); + GlobalConfig::globalConfig(database).trigger(samplingWindow, samplingProfilerUpdateWindow); TraceEvent("ConnectToDatabase", database->dbId) .detail("Version", FDB_VT_VERSION) @@ -2245,6 +2245,8 @@ Database Database::createDatabase(std::string connFileName, return Database::createDatabase(rccr, apiVersion, internal, clientLocality); } +UID Database::dbId() const { return db->dbId; }; + Reference DatabaseContext::getWatchMetadata(int64_t tenantId, KeyRef key) const { const auto it = watchMap.find(std::make_pair(tenantId, key)); if (it == watchMap.end()) @@ 
-7231,10 +7233,10 @@ ACTOR Future>> getReadHotRanges(Da // condition. Should we abort and wait for the newly split shards to be hot again? state int nLocs = locations.size(); // if (nLocs > 1) { - // TraceEvent("RHDDebug") - // .detail("NumSSIs", nLocs) - // .detail("KeysBegin", keys.begin.printable().c_str()) - // .detail("KeysEnd", keys.end.printable().c_str()); + // TraceEvent("RHDDebug") + // .detail("NumSSIs", nLocs) + // .detail("KeysBegin", keys.begin.printable().c_str()) + // .detail("KeysEnd", keys.end.printable().c_str()); // } state std::vector> fReplies(nLocs); KeyRef partBegin, partEnd; @@ -7946,7 +7948,7 @@ void Transaction::checkDeferredError() const { Reference Transaction::createTrLogInfoProbabilistically(const Database& cx) { if (!cx->isError()) { - double clientSamplingProbability = GlobalConfig::globalConfig().get( + double clientSamplingProbability = GlobalConfig::globalConfig(cx).get( fdbClientInfoTxnSampleRate, CLIENT_KNOBS->CSI_SAMPLING_PROBABILITY); if (((networkOptions.logClientInfo.present() && networkOptions.logClientInfo.get()) || BUGGIFY) && deterministicRandom()->random01() < clientSamplingProbability && @@ -9419,4 +9421,4 @@ int64_t getMaxWriteKeySize(KeyRef const& key, bool hasRawAccess) { int64_t getMaxClearKeySize(KeyRef const& key) { return getMaxKeySize(key); -} \ No newline at end of file +} diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index fe4d578e77..b27ca2638a 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -111,6 +111,7 @@ public: inline DatabaseContext* extractPtr() { return db.extractPtr(); } DatabaseContext* operator->() const { return db.getPtr(); } Reference getReference() const { return db; } + UID dbId() const; const UniqueOrderedOptionList& getTransactionDefaults() const; diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index e24c1829ec..c99471bcc3 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -1454,7 +1454,7 @@ Future> ConsistencyCheckImpl::commit(ReadYourWritesTransac return Optional(); } -GlobalConfigImpl::GlobalConfigImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {} +GlobalConfigImpl::GlobalConfigImpl(UID dbId, KeyRangeRef kr) : dbId(dbId), SpecialKeyRangeRWImpl(kr) {} // Returns key-value pairs for each value stored in the global configuration // framework within the range specified. The special-key-space getrange @@ -1465,7 +1465,7 @@ Future GlobalConfigImpl::getRange(ReadYourWritesTransaction* ryw, GetRangeLimits limitsHint) const { RangeResult result; - auto& globalConfig = GlobalConfig::globalConfig(); + auto& globalConfig = GlobalConfig::globalConfig(dbId); KeyRangeRef modified = KeyRangeRef(kr.begin.removePrefix(getKeyRange().begin), kr.end.removePrefix(getKeyRange().begin)); std::map> values = globalConfig.get(modified); diff --git a/fdbclient/SpecialKeySpace.actor.h b/fdbclient/SpecialKeySpace.actor.h index 44f1646b8e..19c8550013 100644 --- a/fdbclient/SpecialKeySpace.actor.h +++ b/fdbclient/SpecialKeySpace.actor.h @@ -29,6 +29,7 @@ #include "flow/flow.h" #include "flow/Arena.h" #include "fdbclient/FDBTypes.h" +#include "fdbclient/GlobalConfig.actor.h" #include "fdbclient/KeyRangeMap.h" #include "fdbclient/ReadYourWrites.h" #include "flow/actorcompiler.h" // This must be the last #include. 
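The GlobalConfigImpl::getRange change above strips the special-key module
prefix from the query bounds before the lookup and re-adds it to each returned
key. A hedged Go sketch of that prefix handling (the store contents and the
\xff\xff/global_config/ prefix are illustrative assumptions):

    package main

    import (
        "fmt"
        "strings"
    )

    // getRangeUnderPrefix strips the module prefix from the query bounds,
    // scans the raw store, and re-prefixes matching keys on the way out.
    func getRangeUnderPrefix(store map[string]string, prefix, begin, end string) map[string]string {
        b := strings.TrimPrefix(begin, prefix)
        e := strings.TrimPrefix(end, prefix)
        out := map[string]string{}
        for k, v := range store {
            if k >= b && k < e {
                out[prefix+k] = v
            }
        }
        return out
    }

    func main() {
        store := map[string]string{"config/transaction_tag_sample_rate": "0.01"}
        prefix := "\xff\xff/global_config/" // assumed module prefix
        res := getRangeUnderPrefix(store, prefix, prefix+"config/", prefix+"config0")
        fmt.Println(len(res)) // 1
    }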
@@ -426,7 +427,7 @@ public: class GlobalConfigImpl : public SpecialKeyRangeRWImpl { public: - explicit GlobalConfigImpl(KeyRangeRef kr); + explicit GlobalConfigImpl(UID dbId, KeyRangeRef kr); Future getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) const override; @@ -434,6 +435,9 @@ public: Future> commit(ReadYourWritesTransaction* ryw) override; void clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& range) override; void clear(ReadYourWritesTransaction* ryw, const KeyRef& key) override; + +private: + UID dbId; }; class TracingOptionsImpl : public SpecialKeyRangeRWImpl { diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index bde9f68511..881b82af76 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -168,8 +168,8 @@ Database openDBOnServer(Reference const> const& db, taskID, lockAware); GlobalConfig::create(cx, db, std::addressof(db->get().client)); - GlobalConfig::globalConfig().trigger(samplingFrequency, samplingProfilerUpdateFrequency); - GlobalConfig::globalConfig().trigger(samplingWindow, samplingProfilerUpdateWindow); + GlobalConfig::globalConfig(cx).trigger(samplingFrequency, samplingProfilerUpdateFrequency); + GlobalConfig::globalConfig(cx).trigger(samplingWindow, samplingProfilerUpdateWindow); return cx; } @@ -1606,16 +1606,16 @@ ACTOR Future workerServer(Reference connRecord, state Database db = Database::createDatabase(metricsConnFile, Database::API_VERSION_LATEST, IsInternal::True, locality); metricsLogger = runMetrics(db, KeyRef(metricsPrefix)); + GlobalConfig::globalConfig(db).trigger(samplingFrequency, samplingProfilerUpdateFrequency); } catch (Error& e) { TraceEvent(SevWarnAlways, "TDMetricsBadClusterFile").error(e).detail("ConnFile", metricsConnFile); } } else { auto lockAware = metricsPrefix.size() && metricsPrefix[0] == '\xff' ? 
LockAware::True : LockAware::False; - metricsLogger = - runMetrics(openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, lockAware), KeyRef(metricsPrefix)); + auto database = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, lockAware); + metricsLogger = runMetrics(database, KeyRef(metricsPrefix)); + GlobalConfig::globalConfig(database).trigger(samplingFrequency, samplingProfilerUpdateFrequency); } - - GlobalConfig::globalConfig().trigger(samplingFrequency, samplingProfilerUpdateFrequency); } errorForwarders.add(resetAfter(degraded, From 9173e2e19be07dc33421b27e5000705eacdc104e Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Wed, 4 May 2022 16:36:04 -0700 Subject: [PATCH 159/299] Move GlobalConfig to DatabaseContext --- fdbcli/ProfileCommand.actor.cpp | 6 ++-- fdbclient/DatabaseContext.h | 3 ++ fdbclient/GlobalConfig.actor.cpp | 14 +-------- fdbclient/GlobalConfig.actor.h | 46 ++++++++--------------------- fdbclient/NativeAPI.actor.cpp | 25 ++++++++-------- fdbclient/SpecialKeySpace.actor.cpp | 7 ++--- fdbclient/SpecialKeySpace.actor.h | 5 ++-- fdbserver/worker.actor.cpp | 10 +++---- flow/genericactors.actor.h | 2 +- 9 files changed, 44 insertions(+), 74 deletions(-) diff --git a/fdbcli/ProfileCommand.actor.cpp b/fdbcli/ProfileCommand.actor.cpp index 3c5710e53f..0f48ce877b 100644 --- a/fdbcli/ProfileCommand.actor.cpp +++ b/fdbcli/ProfileCommand.actor.cpp @@ -48,7 +48,7 @@ ACTOR Future profileCommandActor(Database db, fprintf(stderr, "ERROR: Usage: profile client \n"); return false; } - wait(GlobalConfig::globalConfig(db).onInitialized()); + wait(db->globalConfig->onInitialized()); if (tokencmp(tokens[2], "get")) { if (tokens.size() != 3) { fprintf(stderr, "ERROR: Addtional arguments to `get` are not supported.\n"); @@ -56,12 +56,12 @@ ACTOR Future profileCommandActor(Database db, } std::string sampleRateStr = "default"; std::string sizeLimitStr = "default"; - const double sampleRateDbl = GlobalConfig::globalConfig(db).get( + const double sampleRateDbl = db->globalConfig->get( fdbClientInfoTxnSampleRate, std::numeric_limits::infinity()); if (!std::isinf(sampleRateDbl)) { sampleRateStr = std::to_string(sampleRateDbl); } - const int64_t sizeLimit = GlobalConfig::globalConfig(db).get(fdbClientInfoTxnSizeLimit, -1); + const int64_t sizeLimit = db->globalConfig->get(fdbClientInfoTxnSizeLimit, -1); if (sizeLimit != -1) { sizeLimitStr = boost::lexical_cast(sizeLimit); } diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index 7965b9833d..5ec819c01d 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -222,6 +222,8 @@ struct KeyRangeLocationInfo { : tenantEntry(tenantEntry), range(range), locations(locations) {} }; +class GlobalConfig; + class DatabaseContext : public ReferenceCounted, public FastAllocated, NonCopyable { public: static DatabaseContext* allocateOnForeignThread() { @@ -627,6 +629,7 @@ public: using TransactionT = ReadYourWritesTransaction; Reference createTransaction(); + std::unique_ptr globalConfig; EventCacheHolder connectToDatabaseEventCacheHolder; private: diff --git a/fdbclient/GlobalConfig.actor.cpp b/fdbclient/GlobalConfig.actor.cpp index de5963d31a..93b43de0d4 100644 --- a/fdbclient/GlobalConfig.actor.cpp +++ b/fdbclient/GlobalConfig.actor.cpp @@ -37,19 +37,7 @@ const KeyRef transactionTagSampleCost = LiteralStringRef("config/transaction_tag const KeyRef samplingFrequency = LiteralStringRef("visibility/sampling/frequency"); const KeyRef samplingWindow = LiteralStringRef("visibility/sampling/window"); 
-GlobalConfig::GlobalConfig(Database& cx) : cx(cx), lastUpdate(0) {} - -GlobalConfig& GlobalConfig::globalConfig(const Database& cx) { - return GlobalConfig::globalConfig(cx->dbId); -} - -GlobalConfig& GlobalConfig::globalConfig(UID dbid) { - ConfigMap* config_map = reinterpret_cast(g_network->global(INetwork::enGlobalConfig)); - auto res = config_map->find(dbid); - ASSERT(res != config_map->end()); - ASSERT(res->second); - return *reinterpret_cast(res->second); -} +GlobalConfig::GlobalConfig(const Database& cx) : cx(cx), lastUpdate(0) {} Key GlobalConfig::prefixedKey(KeyRef key) { return key.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::GLOBALCONFIG).begin); diff --git a/fdbclient/GlobalConfig.actor.h b/fdbclient/GlobalConfig.actor.h index 833a68f708..a1617b9d26 100644 --- a/fdbclient/GlobalConfig.actor.h +++ b/fdbclient/GlobalConfig.actor.h @@ -20,6 +20,7 @@ #pragma once +#include #if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_GLOBALCONFIG_ACTOR_G_H) #define FDBCLIENT_GLOBALCONFIG_ACTOR_G_H #include "fdbclient/GlobalConfig.actor.g.h" @@ -69,46 +70,24 @@ class GlobalConfig : NonCopyable { typedef std::unordered_map ConfigMap; public: - // Creates a GlobalConfig singleton, accessed by calling - // GlobalConfig::globalConfig(). This function requires a database object - // to allow global configuration to run transactions on the database, and - // an AsyncVar object to watch for changes on. The ClientDBInfo pointer + // Requires a database object to allow global configuration to run + // transactions on the database. + explicit GlobalConfig(const Database& cx); + + // Requires an AsyncVar object to watch for changes on. The ClientDBInfo pointer // should point to a ClientDBInfo object which will contain the updated // global configuration history when the given AsyncVar changes. This // function should be called whenever the database object changes, in order // to allow global configuration to run transactions on the latest // database. template - static void create(Database& cx, Reference const> db, const ClientDBInfo* dbInfo) { - auto config_map = - reinterpret_cast(g_network->global(INetwork::enGlobalConfig)); - if (config_map == nullptr) { - auto m = new ConfigMap(); - g_network->setGlobal(INetwork::enGlobalConfig, m); - config_map = m; - } - - auto it = config_map->find(cx.dbId()); - if (it == config_map->end()) { - auto config = new GlobalConfig{ cx }; - config_map->emplace(cx.dbId(), config); - config->_updater = updater(config, dbInfo); - // Bind changes in `db` to the `dbInfoChanged` AsyncTrigger. - // TODO: Change AsyncTrigger to a Reference - forward(db, std::addressof(config->dbInfoChanged)); - } else { - GlobalConfig* config = it->second; - config->cx = cx; - config->_updater = updater(config, dbInfo); - } + void init(Reference const> db, const ClientDBInfo* dbInfo) { + _updater = updater(this, dbInfo); + // Bind changes in `db` to the `dbInfoChanged` AsyncTrigger. + // TODO: Change AsyncTrigger to a Reference + _forward = forward(db, std::addressof(dbInfoChanged)); } - // Returns a reference to the global GlobalConfig object. Clients should - // call this function whenever they need to read a value out of the global - // configuration. - static GlobalConfig& globalConfig(const Database& cx); - static GlobalConfig& globalConfig(UID dbid); - // Use this function to turn a global configuration key defined above into // the full path needed to set the value in the database. 
// @@ -164,8 +143,6 @@ public: void trigger(KeyRef key, std::function)> fn); private: - GlobalConfig(Database& cx); - // The functions below only affect the local copy of the global // configuration keyspace! To insert or remove values across all nodes you // must use a transaction (see the note above). @@ -187,6 +164,7 @@ private: Database cx; AsyncTrigger dbInfoChanged; + Future _forward; Future _updater; Promise initialized; AsyncTrigger configChanged; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 42e1c0a95c..215cbbb7ff 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -809,12 +810,12 @@ ACTOR static Future clientStatusUpdateActor(DatabaseContext* cx) { } } cx->clientStatusUpdater.outStatusQ.clear(); - wait(GlobalConfig::globalConfig(cx->dbId).onInitialized()); - double sampleRate = GlobalConfig::globalConfig(cx->dbId).get( + wait(cx->globalConfig->onInitialized()); + double sampleRate = cx->globalConfig->get( fdbClientInfoTxnSampleRate, std::numeric_limits::infinity()); double clientSamplingProbability = std::isinf(sampleRate) ? CLIENT_KNOBS->CSI_SAMPLING_PROBABILITY : sampleRate; - int64_t sizeLimit = GlobalConfig::globalConfig(cx->dbId).get(fdbClientInfoTxnSizeLimit, -1); + int64_t sizeLimit = cx->globalConfig->get(fdbClientInfoTxnSizeLimit, -1); int64_t clientTxnInfoSizeLimit = sizeLimit == -1 ? CLIENT_KNOBS->CSI_SIZE_LIMIT : sizeLimit; if (!trChunksQ.empty() && deterministicRandom()->random01() < clientSamplingProbability) wait(delExcessClntTxnEntriesActor(&tr, clientTxnInfoSizeLimit)); @@ -1481,6 +1482,7 @@ DatabaseContext::DatabaseContext(ReferenceINIT_MID_SHARD_BYTES); + globalConfig = std::make_unique(Database(this)); if (apiVersionAtLeast(710)) { registerSpecialKeysImpl( @@ -1554,7 +1556,8 @@ DatabaseContext::DatabaseContext(Reference(dbId, SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::GLOBALCONFIG))); + std::make_unique(globalConfig.get(), + SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::GLOBALCONFIG))); registerSpecialKeySpaceModule( SpecialKeySpace::MODULE::TRACING, SpecialKeySpace::IMPLTYPE::READWRITE, @@ -1937,13 +1940,12 @@ Future DatabaseContext::onProxiesChanged() const { } bool DatabaseContext::sampleReadTags() const { - double sampleRate = GlobalConfig::globalConfig(dbId).get(transactionTagSampleRate, CLIENT_KNOBS->READ_TAG_SAMPLE_RATE); + double sampleRate = globalConfig->get(transactionTagSampleRate, CLIENT_KNOBS->READ_TAG_SAMPLE_RATE); return sampleRate > 0 && deterministicRandom()->random01() <= sampleRate; } bool DatabaseContext::sampleOnCost(uint64_t cost) const { - double sampleCost = - GlobalConfig::globalConfig(dbId).get(transactionTagSampleCost, CLIENT_KNOBS->COMMIT_SAMPLE_COST); + double sampleCost = globalConfig->get(transactionTagSampleCost, CLIENT_KNOBS->COMMIT_SAMPLE_COST); if (sampleCost <= 0) return false; return deterministicRandom()->random01() <= (double)cost / sampleCost; @@ -2219,10 +2221,9 @@ Database Database::createDatabase(Reference connRecord } auto database = Database(db); - GlobalConfig::create( - database, Reference const>(clientInfo), std::addressof(clientInfo->get())); - GlobalConfig::globalConfig(database).trigger(samplingFrequency, samplingProfilerUpdateFrequency); - GlobalConfig::globalConfig(database).trigger(samplingWindow, samplingProfilerUpdateWindow); + database->globalConfig->init(Reference const>(clientInfo), std::addressof(clientInfo->get())); + 
database->globalConfig->trigger(samplingFrequency, samplingProfilerUpdateFrequency); + database->globalConfig->trigger(samplingWindow, samplingProfilerUpdateWindow); TraceEvent("ConnectToDatabase", database->dbId) .detail("Version", FDB_VT_VERSION) @@ -7948,7 +7949,7 @@ void Transaction::checkDeferredError() const { Reference Transaction::createTrLogInfoProbabilistically(const Database& cx) { if (!cx->isError()) { - double clientSamplingProbability = GlobalConfig::globalConfig(cx).get( + double clientSamplingProbability = cx->globalConfig->get( fdbClientInfoTxnSampleRate, CLIENT_KNOBS->CSI_SAMPLING_PROBABILITY); if (((networkOptions.logClientInfo.present() && networkOptions.logClientInfo.get()) || BUGGIFY) && deterministicRandom()->random01() < clientSamplingProbability && diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index c99471bcc3..f7090401fd 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -1454,7 +1454,8 @@ Future> ConsistencyCheckImpl::commit(ReadYourWritesTransac return Optional(); } -GlobalConfigImpl::GlobalConfigImpl(UID dbId, KeyRangeRef kr) : dbId(dbId), SpecialKeyRangeRWImpl(kr) {} +GlobalConfigImpl::GlobalConfigImpl(GlobalConfig* config, KeyRangeRef kr) + : globalConfig(config), SpecialKeyRangeRWImpl(kr) {} // Returns key-value pairs for each value stored in the global configuration // framework within the range specified. The special-key-space getrange @@ -1464,11 +1465,9 @@ Future GlobalConfigImpl::getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) const { RangeResult result; - - auto& globalConfig = GlobalConfig::globalConfig(dbId); KeyRangeRef modified = KeyRangeRef(kr.begin.removePrefix(getKeyRange().begin), kr.end.removePrefix(getKeyRange().begin)); - std::map> values = globalConfig.get(modified); + std::map> values = globalConfig->get(modified); for (const auto& [key, config] : values) { Key prefixedKey = key.withPrefix(getKeyRange().begin); if (config.isValid() && config->value.has_value()) { diff --git a/fdbclient/SpecialKeySpace.actor.h b/fdbclient/SpecialKeySpace.actor.h index 19c8550013..745a2ef32c 100644 --- a/fdbclient/SpecialKeySpace.actor.h +++ b/fdbclient/SpecialKeySpace.actor.h @@ -425,9 +425,10 @@ public: Future> commit(ReadYourWritesTransaction* ryw) override; }; +class GlobalConfig; class GlobalConfigImpl : public SpecialKeyRangeRWImpl { public: - explicit GlobalConfigImpl(UID dbId, KeyRangeRef kr); + explicit GlobalConfigImpl(GlobalConfig* config, KeyRangeRef kr); Future getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) const override; @@ -437,7 +438,7 @@ public: void clear(ReadYourWritesTransaction* ryw, const KeyRef& key) override; private: - UID dbId; + GlobalConfig* globalConfig; }; class TracingOptionsImpl : public SpecialKeyRangeRWImpl { diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 881b82af76..46039e7755 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -167,9 +167,9 @@ Database openDBOnServer(Reference const> const& db, enableLocalityLoadBalance, taskID, lockAware); - GlobalConfig::create(cx, db, std::addressof(db->get().client)); - GlobalConfig::globalConfig(cx).trigger(samplingFrequency, samplingProfilerUpdateFrequency); - GlobalConfig::globalConfig(cx).trigger(samplingWindow, samplingProfilerUpdateWindow); + cx->globalConfig->init(db, std::addressof(db->get().client)); + cx->globalConfig->trigger(samplingFrequency, 
samplingProfilerUpdateFrequency); + cx->globalConfig->trigger(samplingWindow, samplingProfilerUpdateWindow); return cx; } @@ -1606,7 +1606,7 @@ ACTOR Future workerServer(Reference connRecord, state Database db = Database::createDatabase(metricsConnFile, Database::API_VERSION_LATEST, IsInternal::True, locality); metricsLogger = runMetrics(db, KeyRef(metricsPrefix)); - GlobalConfig::globalConfig(db).trigger(samplingFrequency, samplingProfilerUpdateFrequency); + db->globalConfig->trigger(samplingFrequency, samplingProfilerUpdateFrequency); } catch (Error& e) { TraceEvent(SevWarnAlways, "TDMetricsBadClusterFile").error(e).detail("ConnFile", metricsConnFile); } @@ -1614,7 +1614,7 @@ ACTOR Future workerServer(Reference connRecord, auto lockAware = metricsPrefix.size() && metricsPrefix[0] == '\xff' ? LockAware::True : LockAware::False; auto database = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, lockAware); metricsLogger = runMetrics(database, KeyRef(metricsPrefix)); - GlobalConfig::globalConfig(database).trigger(samplingFrequency, samplingProfilerUpdateFrequency); + database->globalConfig->trigger(samplingFrequency, samplingProfilerUpdateFrequency); } } diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index d0abeedd6d..dbbf98f07a 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -707,7 +707,7 @@ private: // Binds an AsyncTrigger object to an AsyncVar, so when the AsyncVar changes // the AsyncTrigger is triggered. ACTOR template -void forward(Reference const> from, AsyncTrigger* to) { +Future forward(Reference const> from, AsyncTrigger* to) { loop { wait(from->onChange()); to->trigger(); From f14baf2af8411906e2b174058bfe7af51000a879 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Wed, 4 May 2022 17:32:04 -0700 Subject: [PATCH 160/299] clang-format changes --- fdbcli/ProfileCommand.actor.cpp | 4 ++-- fdbclient/GlobalConfig.actor.h | 1 - fdbclient/NativeAPI.actor.cpp | 15 +++++++++------ 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/fdbcli/ProfileCommand.actor.cpp b/fdbcli/ProfileCommand.actor.cpp index 0f48ce877b..d7f66ce080 100644 --- a/fdbcli/ProfileCommand.actor.cpp +++ b/fdbcli/ProfileCommand.actor.cpp @@ -56,8 +56,8 @@ ACTOR Future profileCommandActor(Database db, } std::string sampleRateStr = "default"; std::string sizeLimitStr = "default"; - const double sampleRateDbl = db->globalConfig->get( - fdbClientInfoTxnSampleRate, std::numeric_limits::infinity()); + const double sampleRateDbl = + db->globalConfig->get(fdbClientInfoTxnSampleRate, std::numeric_limits::infinity()); if (!std::isinf(sampleRateDbl)) { sampleRateStr = std::to_string(sampleRateDbl); } diff --git a/fdbclient/GlobalConfig.actor.h b/fdbclient/GlobalConfig.actor.h index a1617b9d26..1558635f52 100644 --- a/fdbclient/GlobalConfig.actor.h +++ b/fdbclient/GlobalConfig.actor.h @@ -20,7 +20,6 @@ #pragma once -#include #if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_GLOBALCONFIG_ACTOR_G_H) #define FDBCLIENT_GLOBALCONFIG_ACTOR_G_H #include "fdbclient/GlobalConfig.actor.g.h" diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 215cbbb7ff..85ef1d7ee3 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -811,8 +811,8 @@ ACTOR static Future clientStatusUpdateActor(DatabaseContext* cx) { } cx->clientStatusUpdater.outStatusQ.clear(); wait(cx->globalConfig->onInitialized()); - double sampleRate = cx->globalConfig->get( - fdbClientInfoTxnSampleRate, std::numeric_limits::infinity()); + double sampleRate = + 
cx->globalConfig->get(fdbClientInfoTxnSampleRate, std::numeric_limits::infinity()); double clientSamplingProbability = std::isinf(sampleRate) ? CLIENT_KNOBS->CSI_SAMPLING_PROBABILITY : sampleRate; int64_t sizeLimit = cx->globalConfig->get(fdbClientInfoTxnSizeLimit, -1); @@ -2221,7 +2221,8 @@ Database Database::createDatabase(Reference connRecord } auto database = Database(db); - database->globalConfig->init(Reference const>(clientInfo), std::addressof(clientInfo->get())); + database->globalConfig->init(Reference const>(clientInfo), + std::addressof(clientInfo->get())); database->globalConfig->trigger(samplingFrequency, samplingProfilerUpdateFrequency); database->globalConfig->trigger(samplingWindow, samplingProfilerUpdateWindow); @@ -2246,7 +2247,9 @@ Database Database::createDatabase(std::string connFileName, return Database::createDatabase(rccr, apiVersion, internal, clientLocality); } -UID Database::dbId() const { return db->dbId; }; +UID Database::dbId() const { + return db->dbId; +}; Reference DatabaseContext::getWatchMetadata(int64_t tenantId, KeyRef key) const { const auto it = watchMap.find(std::make_pair(tenantId, key)); @@ -7949,8 +7952,8 @@ void Transaction::checkDeferredError() const { Reference Transaction::createTrLogInfoProbabilistically(const Database& cx) { if (!cx->isError()) { - double clientSamplingProbability = cx->globalConfig->get( - fdbClientInfoTxnSampleRate, CLIENT_KNOBS->CSI_SAMPLING_PROBABILITY); + double clientSamplingProbability = + cx->globalConfig->get(fdbClientInfoTxnSampleRate, CLIENT_KNOBS->CSI_SAMPLING_PROBABILITY); if (((networkOptions.logClientInfo.present() && networkOptions.logClientInfo.get()) || BUGGIFY) && deterministicRandom()->random01() < clientSamplingProbability && (!g_network->isSimulated() || !g_simulator.speedUpSimulation)) { From f88cbf9309d8fb3bf142cf3fe6d7ba3e05a58f50 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Fri, 6 May 2022 11:29:17 -0700 Subject: [PATCH 161/299] Address review comments --- fdbclient/GlobalConfig.actor.h | 2 -- fdbclient/NativeAPI.actor.cpp | 7 +------ fdbclient/NativeAPI.actor.h | 1 - fdbclient/SpecialKeySpace.actor.cpp | 5 ++--- fdbclient/SpecialKeySpace.actor.h | 5 +---- 5 files changed, 4 insertions(+), 16 deletions(-) diff --git a/fdbclient/GlobalConfig.actor.h b/fdbclient/GlobalConfig.actor.h index 1558635f52..5e430a3037 100644 --- a/fdbclient/GlobalConfig.actor.h +++ b/fdbclient/GlobalConfig.actor.h @@ -66,8 +66,6 @@ struct ConfigValue : ReferenceCounted { }; class GlobalConfig : NonCopyable { - typedef std::unordered_map ConfigMap; - public: // Requires a database object to allow global configuration to run // transactions on the database. 
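This patch and the preceding ones finish converting GlobalConfig from a process-wide registry looked up by database id (`GlobalConfig::globalConfig(cx)`) into a member owned by each database context (`cx->globalConfig`). A minimal sketch of the ownership pattern being adopted, with illustrative names rather than the real FDB types:

```cpp
#include <memory>
#include <string>

// Sketch only: the context owns its configuration object, so the config's
// lifetime matches the database's and a lookup is a pointer dereference
// instead of a global registry search. Names here are illustrative.
struct GlobalConfig {
    double get(const std::string& key, double defaultValue) const {
        return defaultValue; // stand-in for the real cached lookup
    }
};

struct DatabaseContext {
    std::unique_ptr<GlobalConfig> globalConfig = std::make_unique<GlobalConfig>();
};

double clientSampleRate(const DatabaseContext& cx) {
    // cx.globalConfig->get(...) replaces GlobalConfig::globalConfig(cx).get(...)
    return cx.globalConfig->get("fdb_client_info/client_txn_sample_rate", 0.0);
}
```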
diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 85ef1d7ee3..03d1c2e1ed 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1556,8 +1556,7 @@ DatabaseContext::DatabaseContext(Reference(globalConfig.get(), - SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::GLOBALCONFIG))); + std::make_unique(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::GLOBALCONFIG))); registerSpecialKeySpaceModule( SpecialKeySpace::MODULE::TRACING, SpecialKeySpace::IMPLTYPE::READWRITE, @@ -2247,10 +2246,6 @@ Database Database::createDatabase(std::string connFileName, return Database::createDatabase(rccr, apiVersion, internal, clientLocality); } -UID Database::dbId() const { - return db->dbId; -}; - Reference DatabaseContext::getWatchMetadata(int64_t tenantId, KeyRef key) const { const auto it = watchMap.find(std::make_pair(tenantId, key)); if (it == watchMap.end()) diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index b27ca2638a..fe4d578e77 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -111,7 +111,6 @@ public: inline DatabaseContext* extractPtr() { return db.extractPtr(); } DatabaseContext* operator->() const { return db.getPtr(); } Reference getReference() const { return db; } - UID dbId() const; const UniqueOrderedOptionList& getTransactionDefaults() const; diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index f7090401fd..9913123acd 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -1454,8 +1454,7 @@ Future> ConsistencyCheckImpl::commit(ReadYourWritesTransac return Optional(); } -GlobalConfigImpl::GlobalConfigImpl(GlobalConfig* config, KeyRangeRef kr) - : globalConfig(config), SpecialKeyRangeRWImpl(kr) {} +GlobalConfigImpl::GlobalConfigImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {} // Returns key-value pairs for each value stored in the global configuration // framework within the range specified. 
The special-key-space getrange @@ -1467,7 +1466,7 @@ Future GlobalConfigImpl::getRange(ReadYourWritesTransaction* ryw, RangeResult result; KeyRangeRef modified = KeyRangeRef(kr.begin.removePrefix(getKeyRange().begin), kr.end.removePrefix(getKeyRange().begin)); - std::map> values = globalConfig->get(modified); + std::map> values = ryw->getDatabase()->globalConfig->get(modified); for (const auto& [key, config] : values) { Key prefixedKey = key.withPrefix(getKeyRange().begin); if (config.isValid() && config->value.has_value()) { diff --git a/fdbclient/SpecialKeySpace.actor.h b/fdbclient/SpecialKeySpace.actor.h index 745a2ef32c..a53463bcb1 100644 --- a/fdbclient/SpecialKeySpace.actor.h +++ b/fdbclient/SpecialKeySpace.actor.h @@ -428,7 +428,7 @@ public: class GlobalConfig; class GlobalConfigImpl : public SpecialKeyRangeRWImpl { public: - explicit GlobalConfigImpl(GlobalConfig* config, KeyRangeRef kr); + explicit GlobalConfigImpl(KeyRangeRef kr); Future getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) const override; @@ -436,9 +436,6 @@ public: Future> commit(ReadYourWritesTransaction* ryw) override; void clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& range) override; void clear(ReadYourWritesTransaction* ryw, const KeyRef& key) override; - -private: - GlobalConfig* globalConfig; }; class TracingOptionsImpl : public SpecialKeyRangeRWImpl { From 051bd3102dad315366acffd46278a075007b2ec5 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Fri, 6 May 2022 14:15:15 -0700 Subject: [PATCH 162/299] Revert bad merge change --- fdbclient/NativeAPI.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 03d1c2e1ed..74d2971a60 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1557,7 +1557,7 @@ DatabaseContext::DatabaseContext(Reference(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::GLOBALCONFIG))); - registerSpecialKeySpaceModule( + registerSpecialKeysImpl( SpecialKeySpace::MODULE::TRACING, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::TRACING))); From 153187542522730f270f342c858ae2326ae770b0 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Mon, 9 May 2022 11:19:31 -0700 Subject: [PATCH 163/299] Remove unnecessary decl and includes --- fdbclient/SpecialKeySpace.actor.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fdbclient/SpecialKeySpace.actor.h b/fdbclient/SpecialKeySpace.actor.h index a53463bcb1..44f1646b8e 100644 --- a/fdbclient/SpecialKeySpace.actor.h +++ b/fdbclient/SpecialKeySpace.actor.h @@ -29,7 +29,6 @@ #include "flow/flow.h" #include "flow/Arena.h" #include "fdbclient/FDBTypes.h" -#include "fdbclient/GlobalConfig.actor.h" #include "fdbclient/KeyRangeMap.h" #include "fdbclient/ReadYourWrites.h" #include "flow/actorcompiler.h" // This must be the last #include. 
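The `GlobalConfigImpl::getRange` change above keeps the same prefix discipline as before: the module prefix is stripped from the requested range before querying global configuration, and prepended again to every returned key. A toy version of that translation, assuming a plain sorted map in place of the real special-key-space machinery:

```cpp
#include <map>
#include <string>

// Sketch: serve a prefixed "module" view over an unprefixed backing store.
// Assumes begin/end already carry the prefix, as the special key space
// guarantees for requests routed to a module.
std::map<std::string, std::string> moduleGetRange(const std::map<std::string, std::string>& store,
                                                  const std::string& prefix,
                                                  std::string begin,
                                                  std::string end) {
    begin.erase(0, prefix.size()); // removePrefix(getKeyRange().begin)
    end.erase(0, prefix.size());
    std::map<std::string, std::string> result;
    for (auto it = store.lower_bound(begin); it != store.end() && it->first < end; ++it)
        result[prefix + it->first] = it->second; // withPrefix(getKeyRange().begin)
    return result;
}
```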
@@ -425,7 +424,6 @@ public: Future> commit(ReadYourWritesTransaction* ryw) override; }; -class GlobalConfig; class GlobalConfigImpl : public SpecialKeyRangeRWImpl { public: explicit GlobalConfigImpl(KeyRangeRef kr); From 73e655a0677b8df793dbcc95dede043e7fb55a6d Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 9 May 2022 16:11:38 -0700 Subject: [PATCH 164/299] adjust (srcLoad - destLoad) / 10.0 --- fdbserver/DataDistributionQueue.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 806cfc134d..9cc1780bf2 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1549,7 +1549,7 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, // randomly choose topK shards int topK = std::min(int(0.1 * shards.size()), 10); state Future healthMetrics = self->cx->getHealthMetrics(true); - state GetTopKMetricsRequest req(shards, topK, (srcLoad - destLoad) / 3.0); + state GetTopKMetricsRequest req(shards, topK, (srcLoad - destLoad) / 10.0); // 1/(5 * 2) req.comparator = [](const StorageMetrics& a, const StorageMetrics& b) { return a.bytesReadPerKSecond / std::max(a.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES) > b.bytesReadPerKSecond / std::max(b.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES); From 57f5ca2522c3a9b94bbfecdd97cad2799948779f Mon Sep 17 00:00:00 2001 From: Bala Namasivayam Date: Mon, 9 May 2022 19:12:12 -0700 Subject: [PATCH 165/299] Fix deserialization of tenant field in 7.1 --- .../transaction_profiling_analyzer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py b/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py index e304e7ddaa..e704cacb72 100644 --- a/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py +++ b/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py @@ -194,7 +194,8 @@ class BaseInfo(object): if protocol_version >= PROTOCOL_VERSION_6_3: self.dc_id = bb.get_bytes_with_length() if protocol_version >= PROTOCOL_VERSION_7_1: - self.tenant = bb.get_bytes_with_length() + if bb.get_bytes(1): + self.tenant = bb.get_bytes_with_length() class GetVersionInfo(BaseInfo): def __init__(self, bb, protocol_version): From e4d204daf0fe7589997415786fb7cf4fe8ce75c7 Mon Sep 17 00:00:00 2001 From: Junhyun Shim Date: Tue, 10 May 2022 17:00:57 +0200 Subject: [PATCH 166/299] Rephrase function description --- flow/MkCert.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/flow/MkCert.h b/flow/MkCert.h index 28b286491d..4220202824 100644 --- a/flow/MkCert.h +++ b/flow/MkCert.h @@ -158,8 +158,9 @@ StringRef concatCertChain(Arena& arena, CertChainRef chain); enum class ESide : int { Server, Client }; -// Generate a chain of valid cert specs that are always the same given the same parameters -// The side parameter makes a difference in the commonName ("CN") field of the produced specs +// Generate a chain of valid cert specs that have consistent subject/issuer names and +// is valid for typical server/client TLS scenario +// The 'side' parameter makes a difference in the commonName ("CN") field of the produced specs VectorRef makeCertChainSpec(Arena& arena, unsigned length, ESide side); // For empty (default) rootAuthority, the last item in specs is used to generate rootAuthority From 8a8b4d0f0c0a3914caf54762dbd8699171f681d3 Mon Sep 17 00:00:00 2001 From: Jingyu 
Zhou Date: Wed, 30 Mar 2022 11:59:20 -0700 Subject: [PATCH 167/299] Add test for turning off version vector feature --- tests/CMakeLists.txt | 3 ++ .../VersionVectorDisableRestart-1.toml | 42 +++++++++++++++++++ .../VersionVectorDisableRestart-2.toml | 37 ++++++++++++++++ 3 files changed, 82 insertions(+) create mode 100644 tests/restarting/from_7.1.0/VersionVectorDisableRestart-1.toml create mode 100644 tests/restarting/from_7.1.0/VersionVectorDisableRestart-2.toml diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 648cef86bc..398ceff279 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -276,6 +276,9 @@ if(WITH_PYTHON) add_fdb_test( TEST_FILES restarting/from_7.1.0/ConfigureStorageMigrationTestRestart-1.toml restarting/from_7.1.0/ConfigureStorageMigrationTestRestart-2.toml) + add_fdb_test( + TEST_FILES restarting/from_7.1.0/VersionVectorDisableRestart-1.toml + restarting/from_7.1.0/VersionVectorDisableRestart-2.toml) add_fdb_test(TEST_FILES slow/ApiCorrectness.toml) diff --git a/tests/restarting/from_7.1.0/VersionVectorDisableRestart-1.toml b/tests/restarting/from_7.1.0/VersionVectorDisableRestart-1.toml new file mode 100644 index 0000000000..170b596af2 --- /dev/null +++ b/tests/restarting/from_7.1.0/VersionVectorDisableRestart-1.toml @@ -0,0 +1,42 @@ +[[knobs]] +enable_version_vector = true +enable_version_vector_tlog_unicast = true + +[[test]] +testTitle='VersionVectorDowngrade' +clearAfterTest=false + + [[test.workload]] + testName='Cycle' + transactionsPerSecond=500.0 + nodeCount=2500 + testDuration=10.0 + expectedRate=0 + + [[test.workload]] + testName='RandomClogging' + testDuration=10.0 + + [[test.workload]] + testName='Rollback' + meanDelay=10.0 + testDuration=10.0 + + [[test.workload]] + testName='Attrition' + machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 + + [[test.workload]] + testName='Attrition' + machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 + + [[test.workload]] + testName='SaveAndKill' + restartInfoLocation='simfdb/restartInfo.ini' + testDuration=10.0 diff --git a/tests/restarting/from_7.1.0/VersionVectorDisableRestart-2.toml b/tests/restarting/from_7.1.0/VersionVectorDisableRestart-2.toml new file mode 100644 index 0000000000..b294aa3978 --- /dev/null +++ b/tests/restarting/from_7.1.0/VersionVectorDisableRestart-2.toml @@ -0,0 +1,37 @@ +[[knobs]] +enable_version_vector = false +enable_version_vector_tlog_unicast = false + +[[test]] +testTitle='VersionVectorDowngrade' +runSetup=false + + [[test.workload]] + testName='Cycle' + transactionsPerSecond=2500.0 + nodeCount=2500 + testDuration=10.0 + expectedRate=0 + + [[test.workload]] + testName='RandomClogging' + testDuration=10.0 + + [[test.workload]] + testName='Rollback' + meanDelay=10.0 + testDuration=10.0 + + [[test.workload]] + testName='Attrition' + machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 + + [[test.workload]] + testName='Attrition' + machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 From fba044b8461a8cd5bdfe1b471278560323e00c5a Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Mon, 4 Apr 2022 20:47:43 -0700 Subject: [PATCH 168/299] Use 3 cycles in VV disable tests --- .../VersionVectorDisableRestart-1.toml | 25 ++++++++++++++++--- .../VersionVectorDisableRestart-2.toml | 23 ++++++++++++++--- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/tests/restarting/from_7.1.0/VersionVectorDisableRestart-1.toml b/tests/restarting/from_7.1.0/VersionVectorDisableRestart-1.toml index 
170b596af2..c20a7312fb 100644 --- a/tests/restarting/from_7.1.0/VersionVectorDisableRestart-1.toml +++ b/tests/restarting/from_7.1.0/VersionVectorDisableRestart-1.toml @@ -8,10 +8,27 @@ clearAfterTest=false [[test.workload]] testName='Cycle' - transactionsPerSecond=500.0 - nodeCount=2500 - testDuration=10.0 + transactionsPerSecond=2500.0 + nodeCount=1000 + testDuration=30.0 expectedRate=0 + keyPrefix = 'cycle' + + [[test.workload]] + testName = 'Cycle' + nodeCount = 1000 + transactionsPerSecond = 2500.0 + testDuration = 30.0 + expectedRate = 0 + keyPrefix = '!' + + [[test.workload]] + testName = 'Cycle' + nodeCount = 1000 + transactionsPerSecond = 2500.0 + testDuration = 30.0 + expectedRate = 0 + keyPrefix = 'ZZZ' [[test.workload]] testName='RandomClogging' @@ -39,4 +56,4 @@ clearAfterTest=false [[test.workload]] testName='SaveAndKill' restartInfoLocation='simfdb/restartInfo.ini' - testDuration=10.0 + testDuration=60.0 diff --git a/tests/restarting/from_7.1.0/VersionVectorDisableRestart-2.toml b/tests/restarting/from_7.1.0/VersionVectorDisableRestart-2.toml index b294aa3978..5c178993a3 100644 --- a/tests/restarting/from_7.1.0/VersionVectorDisableRestart-2.toml +++ b/tests/restarting/from_7.1.0/VersionVectorDisableRestart-2.toml @@ -9,9 +9,26 @@ runSetup=false [[test.workload]] testName='Cycle' transactionsPerSecond=2500.0 - nodeCount=2500 - testDuration=10.0 + nodeCount=1000 + testDuration=30.0 expectedRate=0 + keyPrefix = 'cycle' + + [[test.workload]] + testName = 'Cycle' + nodeCount = 1000 + transactionsPerSecond = 2500.0 + testDuration = 30.0 + expectedRate = 0 + keyPrefix = '!' + + [[test.workload]] + testName = 'Cycle' + nodeCount = 1000 + transactionsPerSecond = 2500.0 + testDuration = 30.0 + expectedRate = 0 + keyPrefix = 'ZZZ' [[test.workload]] testName='RandomClogging' @@ -34,4 +51,4 @@ runSetup=false machinesToKill=10 machinesToLeave=3 reboot=true - testDuration=10.0 + testDuration=60.0 From 92345b9f82982e960936a9503557c5333da270c6 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Mon, 4 Apr 2022 20:51:51 -0700 Subject: [PATCH 169/299] Add VV upgrade tests No VV to VV upgrade tests. 
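The pair tests upgrading a cluster from version vectors disabled to enabled: the `-1` file runs before the simulated restart and ends with SaveAndKill, and the `-2` file resumes the saved cluster with the feature turned on. Correctness across the restart rests on the Cycle workload: N keys form one linked cycle, transactions swap links atomically, and the checker verifies that a single cycle still covers all N keys. A toy version of that closing check, assuming the cycle is represented as a dense successor array:

```cpp
#include <vector>

// Sketch: the invariant the Cycle workload checks after a restart. Following
// successor links from node 0 must visit every node exactly once and come
// back to 0; anything else means an update was torn or lost.
bool isSingleCycle(const std::vector<int>& next) {
    const int n = static_cast<int>(next.size());
    int cur = 0, hops = 0;
    do {
        if (cur < 0 || cur >= n)
            return false; // broken link escaping the key space
        cur = next[cur];
        ++hops;
    } while (cur != 0 && hops < n);
    return cur == 0 && hops == n; // back at the start after exactly n hops
}
```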
--- tests/CMakeLists.txt | 3 + .../VersionVectorEnableRestart-1.toml | 59 +++++++++++++++++++ .../VersionVectorEnableRestart-2.toml | 54 +++++++++++++++++ 3 files changed, 116 insertions(+) create mode 100644 tests/restarting/from_7.1.0/VersionVectorEnableRestart-1.toml create mode 100644 tests/restarting/from_7.1.0/VersionVectorEnableRestart-2.toml diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 398ceff279..e358bbf1bc 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -279,6 +279,9 @@ if(WITH_PYTHON) add_fdb_test( TEST_FILES restarting/from_7.1.0/VersionVectorDisableRestart-1.toml restarting/from_7.1.0/VersionVectorDisableRestart-2.toml) + add_fdb_test( + TEST_FILES restarting/from_7.1.0/VersionVectorEnableRestart-1.toml + restarting/from_7.1.0/VersionVectorEnableRestart-2.toml) add_fdb_test(TEST_FILES slow/ApiCorrectness.toml) diff --git a/tests/restarting/from_7.1.0/VersionVectorEnableRestart-1.toml b/tests/restarting/from_7.1.0/VersionVectorEnableRestart-1.toml new file mode 100644 index 0000000000..05b9d7a710 --- /dev/null +++ b/tests/restarting/from_7.1.0/VersionVectorEnableRestart-1.toml @@ -0,0 +1,59 @@ +[[knobs]] +enable_version_vector = false +enable_version_vector_tlog_unicast = false + +[[test]] +testTitle='VersionVectorUpgrade' +clearAfterTest=false + + [[test.workload]] + testName='Cycle' + transactionsPerSecond=2500.0 + nodeCount=1000 + testDuration=30.0 + expectedRate=0 + keyPrefix = 'cycle' + + [[test.workload]] + testName = 'Cycle' + nodeCount = 1000 + transactionsPerSecond = 2500.0 + testDuration = 30.0 + expectedRate = 0 + keyPrefix = '!' + + [[test.workload]] + testName = 'Cycle' + nodeCount = 1000 + transactionsPerSecond = 2500.0 + testDuration = 30.0 + expectedRate = 0 + keyPrefix = 'ZZZ' + + [[test.workload]] + testName='RandomClogging' + testDuration=10.0 + + [[test.workload]] + testName='Rollback' + meanDelay=10.0 + testDuration=10.0 + + [[test.workload]] + testName='Attrition' + machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 + + [[test.workload]] + testName='Attrition' + machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 + + [[test.workload]] + testName='SaveAndKill' + restartInfoLocation='simfdb/restartInfo.ini' + testDuration=60.0 diff --git a/tests/restarting/from_7.1.0/VersionVectorEnableRestart-2.toml b/tests/restarting/from_7.1.0/VersionVectorEnableRestart-2.toml new file mode 100644 index 0000000000..1aa8537f0b --- /dev/null +++ b/tests/restarting/from_7.1.0/VersionVectorEnableRestart-2.toml @@ -0,0 +1,54 @@ +[[knobs]] +enable_version_vector = true +enable_version_vector_tlog_unicast = true + +[[test]] +testTitle='VersionVectorUpgrade' +runSetup=false + + [[test.workload]] + testName='Cycle' + transactionsPerSecond=2500.0 + nodeCount=1000 + testDuration=30.0 + expectedRate=0 + keyPrefix = 'cycle' + + [[test.workload]] + testName = 'Cycle' + nodeCount = 1000 + transactionsPerSecond = 2500.0 + testDuration = 30.0 + expectedRate = 0 + keyPrefix = '!' 
+
+    [[test.workload]]
+    testName = 'Cycle'
+    nodeCount = 1000
+    transactionsPerSecond = 2500.0
+    testDuration = 30.0
+    expectedRate = 0
+    keyPrefix = 'ZZZ'
+
+    [[test.workload]]
+    testName='RandomClogging'
+    testDuration=10.0
+
+    [[test.workload]]
+    testName='Rollback'
+    meanDelay=10.0
+    testDuration=10.0
+
+    [[test.workload]]
+    testName='Attrition'
+    machinesToKill=10
+    machinesToLeave=3
+    reboot=true
+    testDuration=10.0
+
+    [[test.workload]]
+    testName='Attrition'
+    machinesToKill=10
+    machinesToLeave=3
+    reboot=true
+    testDuration=60.0

From dc8cd5bce1854538e16930ab919816dc330cda25 Mon Sep 17 00:00:00 2001
From: Jingyu Zhou
Date: Mon, 4 Apr 2022 21:02:25 -0700
Subject: [PATCH 170/299] Enable proxy_use_resolver_private_mutations for VV restarting tests

---
 tests/restarting/from_7.1.0/VersionVectorDisableRestart-1.toml | 1 +
 tests/restarting/from_7.1.0/VersionVectorEnableRestart-2.toml | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tests/restarting/from_7.1.0/VersionVectorDisableRestart-1.toml b/tests/restarting/from_7.1.0/VersionVectorDisableRestart-1.toml
index c20a7312fb..0e143bfe22 100644
--- a/tests/restarting/from_7.1.0/VersionVectorDisableRestart-1.toml
+++ b/tests/restarting/from_7.1.0/VersionVectorDisableRestart-1.toml
@@ -1,6 +1,7 @@
 [[knobs]]
 enable_version_vector = true
 enable_version_vector_tlog_unicast = true
+proxy_use_resolver_private_mutations = true

 [[test]]
 testTitle='VersionVectorDowngrade'
diff --git a/tests/restarting/from_7.1.0/VersionVectorEnableRestart-2.toml b/tests/restarting/from_7.1.0/VersionVectorEnableRestart-2.toml
index 1aa8537f0b..c1b4bc1e3e 100644
--- a/tests/restarting/from_7.1.0/VersionVectorEnableRestart-2.toml
+++ b/tests/restarting/from_7.1.0/VersionVectorEnableRestart-2.toml
@@ -1,6 +1,7 @@
 [[knobs]]
 enable_version_vector = true
 enable_version_vector_tlog_unicast = true
+proxy_use_resolver_private_mutations = true

 [[test]]
 testTitle='VersionVectorUpgrade'

From 8f884be4f5a86056056dec8c92a17cadc51d3d39 Mon Sep 17 00:00:00 2001
From: Dan Lambright
Date: Mon, 9 May 2022 12:44:23 -0400
Subject: [PATCH 171/299] Do not block peeks during recovery in version vector.

---
 fdbserver/TLogServer.actor.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp
index 4f6c9f17e7..a0273b6f08 100644
--- a/fdbserver/TLogServer.actor.cpp
+++ b/fdbserver/TLogServer.actor.cpp
@@ -1784,9 +1784,11 @@ Future tLogPeekMessages(PromiseType replyPromise,

     state Version poppedVer = poppedVersion(logData, reqTag);

+    auto tagData = logData->getTagData(reqTag);
+    bool tagRecovered = tagData && !tagData->unpoppedRecovered;
     if (SERVER_KNOBS->ENABLE_VERSION_VECTOR && poppedVer <= reqBegin &&
         reqBegin > logData->persistentDataDurableVersion && !reqOnlySpilled && reqTag.locality >= 0 &&
-        !reqReturnIfBlocked) {
+        !reqReturnIfBlocked && tagRecovered) {
         state double startTime = now();
         // TODO (version vector) check if this should be included in "status details" json
         // TODO (version vector) all tags may be too many, instead, standard deviation?

From 7972ef48d680a010563b83f3423d13ee1cce5345 Mon Sep 17 00:00:00 2001
From: Lukas Joswiak
Date: Wed, 27 Apr 2022 15:45:24 -0400
Subject: [PATCH 172/299] Refactor profiling special keys to use GlobalConfig

The special keys `\xff\xff/management/profiling/client_txn_sample_rate` and
`\xff\xff/management/profiling/client_txn_size_limit` are deprecated in FDB 7.2.
However, GlobalConfig was introduced in 7.0, and reading and writing these keys
through the special key space was broken in 7.0+. This change modifies the
profiling special keys to use GlobalConfig behind the scenes, fixing the broken
special keys.

The following Python script was used to make sure both GlobalConfig and the
profiling special key can be used to read/write/clear profiling data:

```
import fdb
import time

fdb.api_version(710)

@fdb.transactional
def set_sample_rate(tr):
    tr.options.set_special_key_space_enable_writes()
    # Alternative way to write the key
    #tr[b'\xff\xff/global_config/config/fdb_client_info/client_txn_sample_rate'] = fdb.tuple.pack((5.0,))
    tr[b'\xff\xff/management/profiling/client_txn_sample_rate'] = '5.0'

@fdb.transactional
def clear_sample_rate(tr):
    tr.options.set_special_key_space_enable_writes()
    # Alternative way to clear the key
    #tr.clear(b'\xff\xff/global_config/config/fdb_client_info/client_txn_sample_rate')
    tr[b'\xff\xff/management/profiling/client_txn_sample_rate'] = 'default'

@fdb.transactional
def get_sample_rate(tr):
    print(tr.get(b'\xff\xff/global_config/config/fdb_client_info/client_txn_sample_rate'))
    # Alternative way to read the key
    #print(tr.get(b'\xff\xff/management/profiling/client_txn_sample_rate'))

fdb.options.set_trace_enable()
fdb.options.set_trace_format('json')

db = fdb.open()

get_sample_rate(db) # None (or 'default')
set_sample_rate(db)
time.sleep(1) # Allow time for global config changes to propagate
get_sample_rate(db) # 5.0
clear_sample_rate(db)
time.sleep(1)
get_sample_rate(db) # None (or 'default')
```

It can be run with `PYTHONPATH=./bindings/python/ python profiling.py`, and
reads the `fdb.cluster` file in the current directory.

```
$ PYTHONPATH=./bindings/python/ python sps.py
None
5.000000
None
```
---
 fdbclient/GlobalConfig.actor.cpp | 27 ++++++++++
 fdbclient/GlobalConfig.actor.h | 9 ++++
 fdbclient/SpecialKeySpace.actor.cpp | 77 +++++++++++----------------
 fdbserver/ClusterController.actor.cpp | 2 +-
 4 files changed, 67 insertions(+), 48 deletions(-)

diff --git a/fdbclient/GlobalConfig.actor.cpp b/fdbclient/GlobalConfig.actor.cpp
index 93b43de0d4..ee25a2d48e 100644
--- a/fdbclient/GlobalConfig.actor.cpp
+++ b/fdbclient/GlobalConfig.actor.cpp
@@ -39,6 +39,33 @@ const KeyRef samplingWindow = LiteralStringRef("visibility/sampling/window");

 GlobalConfig::GlobalConfig(const Database& cx) : cx(cx), lastUpdate(0) {}

+void GlobalConfig::applyChanges(Transaction& tr,
+                                const VectorRef& insertions,
+                                const VectorRef& clears) {
+    VersionHistory vh{ 0 };
+    for (const auto& kv : insertions) {
+        vh.mutations.emplace_back_deep(vh.mutations.arena(), MutationRef(MutationRef::SetValue, kv.key, kv.value));
+        tr.set(kv.key.withPrefix(globalConfigKeysPrefix), kv.value);
+    }
+    for (const auto& range : clears) {
+        vh.mutations.emplace_back_deep(vh.mutations.arena(),
+                                       MutationRef(MutationRef::ClearRange, range.begin, range.end));
+        tr.clear(
+            KeyRangeRef(range.begin.withPrefix(globalConfigKeysPrefix), range.end.withPrefix(globalConfigKeysPrefix)));
+    }
+
+    // Record the mutations in this commit into the global configuration history.
+    Key historyKey = addVersionStampAtEnd(globalConfigHistoryPrefix);
+    ObjectWriter historyWriter(IncludeVersion());
+    historyWriter.serialize(vh);
+    tr.atomicOp(historyKey, historyWriter.toStringRef(), MutationRef::SetVersionstampedKey);
+
+    // Write version key to trigger update in cluster controller.
+ tr.atomicOp(globalConfigVersionKey, + LiteralStringRef("0123456789\x00\x00\x00\x00"), // versionstamp + MutationRef::SetVersionstampedValue); +} + Key GlobalConfig::prefixedKey(KeyRef key) { return key.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::GLOBALCONFIG).begin); } diff --git a/fdbclient/GlobalConfig.actor.h b/fdbclient/GlobalConfig.actor.h index 5e430a3037..ce6e154dab 100644 --- a/fdbclient/GlobalConfig.actor.h +++ b/fdbclient/GlobalConfig.actor.h @@ -85,6 +85,15 @@ public: _forward = forward(db, std::addressof(dbInfoChanged)); } + // Given a list of insertions and clears, applies the necessary changes to + // the given transaction to update the global configuration database. Keys + // in the list of mutations should not include the global configuration + // prefix (`\xff\xff/global_config/`). The caller must still commit the + // given transaction in order to persist the changes. + static void applyChanges(Transaction& tr, + const VectorRef& insertions, + const VectorRef& clears); + // Use this function to turn a global configuration key defined above into // the full path needed to set the value in the database. // diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index 9913123acd..eb25bb4cec 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -1517,7 +1517,8 @@ ACTOR Future> globalConfigCommitActor(GlobalConfigImpl* gl } } - VersionHistory vh{ 0 }; + Standalone> insertions; + Standalone> clears; // Transform writes from the special-key-space (\xff\xff/global_config/) to // the system key space (\xff/globalConfig/), and writes mutations to @@ -1530,36 +1531,17 @@ ACTOR Future> globalConfigCommitActor(GlobalConfigImpl* gl if (entry.first) { if (entry.second.present() && iter->begin().startsWith(globalConfig->getKeyRange().begin)) { Key bareKey = iter->begin().removePrefix(globalConfig->getKeyRange().begin); - vh.mutations.emplace_back_deep(vh.mutations.arena(), - MutationRef(MutationRef::SetValue, bareKey, entry.second.get())); - - Key systemKey = bareKey.withPrefix(globalConfigKeysPrefix); - tr.set(systemKey, entry.second.get()); + insertions.push_back_deep(insertions.arena(), KeyValueRef(bareKey, entry.second.get())); } else if (!entry.second.present() && iter->range().begin.startsWith(globalConfig->getKeyRange().begin) && iter->range().end.startsWith(globalConfig->getKeyRange().begin)) { KeyRef bareRangeBegin = iter->range().begin.removePrefix(globalConfig->getKeyRange().begin); KeyRef bareRangeEnd = iter->range().end.removePrefix(globalConfig->getKeyRange().begin); - vh.mutations.emplace_back_deep(vh.mutations.arena(), - MutationRef(MutationRef::ClearRange, bareRangeBegin, bareRangeEnd)); - - Key systemRangeBegin = bareRangeBegin.withPrefix(globalConfigKeysPrefix); - Key systemRangeEnd = bareRangeEnd.withPrefix(globalConfigKeysPrefix); - tr.clear(KeyRangeRef(systemRangeBegin, systemRangeEnd)); + clears.push_back_deep(clears.arena(), KeyRangeRef(bareRangeBegin, bareRangeEnd)); } } ++iter; } - - // Record the mutations in this commit into the global configuration history. - Key historyKey = addVersionStampAtEnd(globalConfigHistoryPrefix); - ObjectWriter historyWriter(IncludeVersion()); - historyWriter.serialize(vh); - tr.atomicOp(historyKey, historyWriter.toStringRef(), MutationRef::SetVersionstampedKey); - - // Write version key to trigger update in cluster controller. 
- tr.atomicOp(globalConfigVersionKey, - LiteralStringRef("0123456789\x00\x00\x00\x00"), // versionstamp - MutationRef::SetVersionstampedValue); + GlobalConfig::applyChanges(tr, insertions, clears); return Optional(); } @@ -1968,13 +1950,11 @@ ACTOR static Future ClientProfilingGetRangeActor(ReadYourWritesTran ASSERT(entry.second.present()); result.push_back_deep(result.arena(), KeyValueRef(sampleRateKey, entry.second.get())); } else { - Optional f = wait(ryw->getTransaction().get(fdbClientInfoTxnSampleRate)); std::string sampleRateStr = "default"; - if (f.present()) { - const double sampleRateDbl = BinaryReader::fromStringRef(f.get(), Unversioned()); - if (!std::isinf(sampleRateDbl)) { - sampleRateStr = boost::lexical_cast(sampleRateDbl); - } + const double sampleRateDbl = ryw->getDatabase()->globalConfig->get( + fdbClientInfoTxnSampleRate, std::numeric_limits::infinity()); + if (!std::isinf(sampleRateDbl)) { + sampleRateStr = std::to_string(sampleRateDbl); } result.push_back_deep(result.arena(), KeyValueRef(sampleRateKey, Value(sampleRateStr))); } @@ -1988,13 +1968,10 @@ ACTOR static Future ClientProfilingGetRangeActor(ReadYourWritesTran ASSERT(entry.second.present()); result.push_back_deep(result.arena(), KeyValueRef(txnSizeLimitKey, entry.second.get())); } else { - Optional f = wait(ryw->getTransaction().get(fdbClientInfoTxnSizeLimit)); std::string sizeLimitStr = "default"; - if (f.present()) { - const int64_t sizeLimit = BinaryReader::fromStringRef(f.get(), Unversioned()); - if (sizeLimit != -1) { - sizeLimitStr = boost::lexical_cast(sizeLimit); - } + const int64_t sizeLimit = ryw->getDatabase()->globalConfig->get(fdbClientInfoTxnSizeLimit, -1); + if (sizeLimit != -1) { + sizeLimitStr = boost::lexical_cast(sizeLimit); } result.push_back_deep(result.arena(), KeyValueRef(txnSizeLimitKey, Value(sizeLimitStr))); } @@ -2011,43 +1988,49 @@ Future ClientProfilingImpl::getRange(ReadYourWritesTransaction* ryw Future> ClientProfilingImpl::commit(ReadYourWritesTransaction* ryw) { ryw->getTransaction().setOption(FDBTransactionOptions::RAW_ACCESS); + Standalone> insertions; + Standalone> clears; + // client_txn_sample_rate Key sampleRateKey = LiteralStringRef("client_txn_sample_rate").withPrefix(getKeyRange().begin); auto rateEntry = ryw->getSpecialKeySpaceWriteMap()[sampleRateKey]; if (rateEntry.first && rateEntry.second.present()) { std::string sampleRateStr = rateEntry.second.get().toString(); - double sampleRate; - if (sampleRateStr == "default") - sampleRate = std::numeric_limits::infinity(); - else { + if (sampleRateStr == "default") { + clears.push_back_deep(clears.arena(), + KeyRangeRef(fdbClientInfoTxnSampleRate, keyAfter(fdbClientInfoTxnSampleRate))); + } else { try { - sampleRate = boost::lexical_cast(sampleRateStr); + double sampleRate = boost::lexical_cast(sampleRateStr); + Tuple rate = Tuple().appendDouble(sampleRate); + insertions.push_back_deep(insertions.arena(), KeyValueRef(fdbClientInfoTxnSampleRate, rate.pack())); } catch (boost::bad_lexical_cast& e) { return Optional(ManagementAPIError::toJsonString( false, "profile", "Invalid transaction sample rate(double): " + sampleRateStr)); } } - ryw->getTransaction().set(fdbClientInfoTxnSampleRate, BinaryWriter::toValue(sampleRate, Unversioned())); } // client_txn_size_limit Key txnSizeLimitKey = LiteralStringRef("client_txn_size_limit").withPrefix(getKeyRange().begin); auto sizeLimitEntry = ryw->getSpecialKeySpaceWriteMap()[txnSizeLimitKey]; if (sizeLimitEntry.first && sizeLimitEntry.second.present()) { std::string sizeLimitStr = 
sizeLimitEntry.second.get().toString(); - int64_t sizeLimit; - if (sizeLimitStr == "default") - sizeLimit = -1; - else { + if (sizeLimitStr == "default") { + clears.push_back_deep(clears.arena(), + KeyRangeRef(fdbClientInfoTxnSizeLimit, keyAfter(fdbClientInfoTxnSizeLimit))); + } else { try { - sizeLimit = boost::lexical_cast(sizeLimitStr); + int64_t sizeLimit = boost::lexical_cast(sizeLimitStr); + Tuple size = Tuple().append(sizeLimit); + insertions.push_back_deep(insertions.arena(), KeyValueRef(fdbClientInfoTxnSizeLimit, size.pack())); } catch (boost::bad_lexical_cast& e) { return Optional(ManagementAPIError::toJsonString( false, "profile", "Invalid transaction size limit(int64_t): " + sizeLimitStr)); } } - ryw->getTransaction().set(fdbClientInfoTxnSizeLimit, BinaryWriter::toValue(sizeLimit, Unversioned())); } + GlobalConfig::applyChanges(ryw->getTransaction(), insertions, clears); return Optional(); } diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index 83b7250dc5..dfe0b380a8 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -3170,4 +3170,4 @@ TEST_CASE("/fdbserver/clustercontroller/shouldTriggerFailoverDueToDegradedServer return Void(); } -} // namespace \ No newline at end of file +} // namespace From ee9a047cbd44e017ecacb628c946003a3266403d Mon Sep 17 00:00:00 2001 From: Junhyun Shim Date: Tue, 10 May 2022 19:54:12 +0200 Subject: [PATCH 173/299] Trace OpenSSL error as const char* Remove extent from char buffer for fetching OpenSSL errors. TraceEvent::detail() interprets passed char[] as string literal and prints trailing \0s. --- flow/MkCert.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/MkCert.cpp b/flow/MkCert.cpp index 40e1141fa3..9298162d18 100644 --- a/flow/MkCert.cpp +++ b/flow/MkCert.cpp @@ -52,7 +52,7 @@ public: 0, }; ::ERR_error_string_n(err, buf, sizeof(buf)); - te.detail("OpenSSLError", buf); + te.detail("OpenSSLError", static_cast(buf)); } throw tls_error(); } From 1e17f324698743e0f1ee3cb6b8d34b04e81e9b79 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Tue, 10 May 2022 13:21:23 -0700 Subject: [PATCH 174/299] Move StorageMetrics unit tests into cpp file --- fdbserver/CMakeLists.txt | 2 +- fdbserver/RestoreLoader.actor.cpp | 2 +- fdbserver/StorageMetrics.actor.cpp | 229 ++++++++ fdbserver/StorageMetrics.actor.h | 817 ----------------------------- fdbserver/StorageMetrics.h | 592 ++++++++++++++++++++- 5 files changed, 818 insertions(+), 824 deletions(-) create mode 100644 fdbserver/StorageMetrics.actor.cpp delete mode 100644 fdbserver/StorageMetrics.actor.h diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index 337d5345d6..590ad6d287 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -149,7 +149,7 @@ set(FDBSERVER_SRCS Status.actor.cpp Status.h StorageCache.actor.cpp - StorageMetrics.actor.h + StorageMetrics.actor.cpp StorageMetrics.h storageserver.actor.cpp TagPartitionedLogSystem.actor.cpp diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index 1afabdcb95..1954a26b3e 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -27,7 +27,7 @@ #include "fdbserver/RestoreLoader.actor.h" #include "fdbserver/RestoreRoleCommon.actor.h" #include "fdbserver/MutationTracking.h" -#include "fdbserver/StorageMetrics.actor.h" +#include "fdbserver/StorageMetrics.h" #include "flow/actorcompiler.h" // This must be the last #include. 
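The new test file that follows exercises the storage byte sample: each sampled key carries a weight, a range estimate is the sum of the weights inside the range, and `getSplitPoints()` returns the keys at which shards of roughly the requested chunk size would begin. A toy model of the split behavior those tests assert, with a sorted key-to-weight map standing in for the real sample and the queried range assumed to be pre-restricted:

```cpp
#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Sketch: the k-th split point is the first sampled key whose cumulative
// weight strictly exceeds k * chunkSize. With the weights in the tests
// below, a chunk of 600 units over [A, C) yields {Absolute, Apple, Bah},
// and a chunk of 2000 units yields {Bah}.
std::vector<std::string> splitPoints(const std::map<std::string, int64_t>& sample, int64_t chunkSize) {
    std::vector<std::pair<std::string, int64_t>> cum; // running prefix sums
    int64_t total = 0;
    for (const auto& [key, weight] : sample)
        cum.emplace_back(key, total += weight);

    std::vector<std::string> splits;
    size_t i = 0;
    for (int64_t boundary = chunkSize; boundary < total; boundary += chunkSize) {
        while (i < cum.size() && cum[i].second <= boundary)
            ++i; // advance to the first key strictly past this boundary
        if (i == cum.size())
            break;
        splits.push_back(cum[i].first);
        ++i; // a key can begin at most one new shard
    }
    return splits;
}
```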
diff --git a/fdbserver/StorageMetrics.actor.cpp b/fdbserver/StorageMetrics.actor.cpp new file mode 100644 index 0000000000..93cadb0542 --- /dev/null +++ b/fdbserver/StorageMetrics.actor.cpp @@ -0,0 +1,229 @@ +/* + * StorageMetrics.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/FDBTypes.h" +#include "fdbrpc/simulator.h" +#include "flow/UnitTest.h" +#include "fdbclient/StorageServerInterface.h" +#include "fdbclient/KeyRangeMap.h" +#include "fdbserver/Knobs.h" +#include "fdbserver/StorageMetrics.h" +#include "flow/actorcompiler.h" // This must be the last #include. + +TEST_CASE("/fdbserver/StorageMetricSample/simple") { + StorageMetricSample s(1000); + s.sample.insert(LiteralStringRef("Apple"), 1000); + s.sample.insert(LiteralStringRef("Banana"), 2000); + s.sample.insert(LiteralStringRef("Cat"), 1000); + s.sample.insert(LiteralStringRef("Cathode"), 1000); + s.sample.insert(LiteralStringRef("Dog"), 1000); + + ASSERT(s.getEstimate(KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("D"))) == 5000); + ASSERT(s.getEstimate(KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("E"))) == 6000); + ASSERT(s.getEstimate(KeyRangeRef(LiteralStringRef("B"), LiteralStringRef("C"))) == 2000); + + // ASSERT(s.splitEstimate(KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("D")), 3500) == + // LiteralStringRef("Cat")); + + return Void(); +} + +TEST_CASE("/fdbserver/StorageMetricSample/rangeSplitPoints/simple") { + + int64_t sampleUnit = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE; + StorageServerMetrics ssm; + + ssm.byteSample.sample.insert(LiteralStringRef("A"), 200 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Absolute"), 800 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Bah"), 20 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Banana"), 80 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Bob"), 200 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("But"), 100 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Cat"), 300 * sampleUnit); + + std::vector t = ssm.getSplitPoints( + KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("C")), 2000 * sampleUnit, Optional()); + + ASSERT(t.size() == 1 && t[0] == LiteralStringRef("Bah")); + + return Void(); +} + +TEST_CASE("/fdbserver/StorageMetricSample/rangeSplitPoints/multipleReturnedPoints") { + + int64_t sampleUnit = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE; + StorageServerMetrics ssm; + + ssm.byteSample.sample.insert(LiteralStringRef("A"), 200 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Absolute"), 800 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Bah"), 20 * sampleUnit); + 
ssm.byteSample.sample.insert(LiteralStringRef("Banana"), 80 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Bob"), 200 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("But"), 100 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Cat"), 300 * sampleUnit); + + std::vector t = ssm.getSplitPoints( + KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("C")), 600 * sampleUnit, Optional()); + + ASSERT(t.size() == 3 && t[0] == LiteralStringRef("Absolute") && t[1] == LiteralStringRef("Apple") && + t[2] == LiteralStringRef("Bah")); + + return Void(); +} + +TEST_CASE("/fdbserver/StorageMetricSample/rangeSplitPoints/noneSplitable") { + + int64_t sampleUnit = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE; + StorageServerMetrics ssm; + + ssm.byteSample.sample.insert(LiteralStringRef("A"), 200 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Absolute"), 800 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Bah"), 20 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Banana"), 80 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Bob"), 200 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("But"), 100 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Cat"), 300 * sampleUnit); + + std::vector t = ssm.getSplitPoints( + KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("C")), 10000 * sampleUnit, Optional()); + + ASSERT(t.size() == 0); + + return Void(); +} + +TEST_CASE("/fdbserver/StorageMetricSample/rangeSplitPoints/chunkTooLarge") { + + int64_t sampleUnit = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE; + StorageServerMetrics ssm; + + ssm.byteSample.sample.insert(LiteralStringRef("A"), 20 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Absolute"), 80 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Apple"), 10 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Bah"), 20 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Banana"), 80 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Bob"), 20 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("But"), 10 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Cat"), 30 * sampleUnit); + + std::vector t = ssm.getSplitPoints( + KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("C")), 1000 * sampleUnit, Optional()); + + ASSERT(t.size() == 0); + + return Void(); +} + +TEST_CASE("/fdbserver/StorageMetricSample/readHotDetect/simple") { + + int64_t sampleUnit = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE; + StorageServerMetrics ssm; + + ssm.bytesReadSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit); + ssm.bytesReadSample.sample.insert(LiteralStringRef("Banana"), 2000 * sampleUnit); + ssm.bytesReadSample.sample.insert(LiteralStringRef("Cat"), 1000 * sampleUnit); + ssm.bytesReadSample.sample.insert(LiteralStringRef("Cathode"), 1000 * sampleUnit); + ssm.bytesReadSample.sample.insert(LiteralStringRef("Dog"), 1000 * sampleUnit); + + ssm.byteSample.sample.insert(LiteralStringRef("A"), 20 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Absolute"), 80 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Bah"), 20 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Banana"), 80 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Bob"), 200 * 
sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("But"), 100 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Cat"), 300 * sampleUnit); + + std::vector t = + ssm.getReadHotRanges(KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("C")), 2.0, 200 * sampleUnit, 0); + + ASSERT(t.size() == 1 && (*t.begin()).keys.begin == LiteralStringRef("Bah") && + (*t.begin()).keys.end == LiteralStringRef("Bob")); + + return Void(); +} + +TEST_CASE("/fdbserver/StorageMetricSample/readHotDetect/moreThanOneRange") { + + int64_t sampleUnit = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE; + StorageServerMetrics ssm; + + ssm.bytesReadSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit); + ssm.bytesReadSample.sample.insert(LiteralStringRef("Banana"), 2000 * sampleUnit); + ssm.bytesReadSample.sample.insert(LiteralStringRef("Cat"), 1000 * sampleUnit); + ssm.bytesReadSample.sample.insert(LiteralStringRef("Cathode"), 1000 * sampleUnit); + ssm.bytesReadSample.sample.insert(LiteralStringRef("Dog"), 1000 * sampleUnit); + ssm.bytesReadSample.sample.insert(LiteralStringRef("Final"), 2000 * sampleUnit); + + ssm.byteSample.sample.insert(LiteralStringRef("A"), 20 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Absolute"), 80 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Bah"), 20 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Banana"), 80 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Bob"), 200 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("But"), 100 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Cat"), 300 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Dah"), 300 * sampleUnit); + + std::vector t = + ssm.getReadHotRanges(KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("D")), 2.0, 200 * sampleUnit, 0); + + ASSERT(t.size() == 2 && (*t.begin()).keys.begin == LiteralStringRef("Bah") && + (*t.begin()).keys.end == LiteralStringRef("Bob")); + ASSERT(t.at(1).keys.begin == LiteralStringRef("Cat") && t.at(1).keys.end == LiteralStringRef("Dah")); + + return Void(); +} + +TEST_CASE("/fdbserver/StorageMetricSample/readHotDetect/consecutiveRanges") { + + int64_t sampleUnit = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE; + StorageServerMetrics ssm; + + ssm.bytesReadSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit); + ssm.bytesReadSample.sample.insert(LiteralStringRef("Banana"), 2000 * sampleUnit); + ssm.bytesReadSample.sample.insert(LiteralStringRef("Bucket"), 2000 * sampleUnit); + ssm.bytesReadSample.sample.insert(LiteralStringRef("Cat"), 1000 * sampleUnit); + ssm.bytesReadSample.sample.insert(LiteralStringRef("Cathode"), 1000 * sampleUnit); + ssm.bytesReadSample.sample.insert(LiteralStringRef("Dog"), 5000 * sampleUnit); + ssm.bytesReadSample.sample.insert(LiteralStringRef("Final"), 2000 * sampleUnit); + + ssm.byteSample.sample.insert(LiteralStringRef("A"), 20 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Absolute"), 80 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Bah"), 20 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Banana"), 80 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Bob"), 200 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("But"), 100 * sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Cat"), 300 * 
sampleUnit); + ssm.byteSample.sample.insert(LiteralStringRef("Dah"), 300 * sampleUnit); + + std::vector t = + ssm.getReadHotRanges(KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("D")), 2.0, 200 * sampleUnit, 0); + + ASSERT(t.size() == 2 && (*t.begin()).keys.begin == LiteralStringRef("Bah") && + (*t.begin()).keys.end == LiteralStringRef("But")); + ASSERT(t.at(1).keys.begin == LiteralStringRef("Cat") && t.at(1).keys.end == LiteralStringRef("Dah")); + + return Void(); +} diff --git a/fdbserver/StorageMetrics.actor.h b/fdbserver/StorageMetrics.actor.h deleted file mode 100644 index fa35fffdca..0000000000 --- a/fdbserver/StorageMetrics.actor.h +++ /dev/null @@ -1,817 +0,0 @@ -/* - * StorageMetrics.actor.h - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Included via StorageMetrics.h -#include "fdbclient/FDBTypes.h" -#include "fdbrpc/simulator.h" -#include "flow/UnitTest.h" -#include "fdbclient/StorageServerInterface.h" -#include "fdbclient/KeyRangeMap.h" -#include "fdbserver/Knobs.h" -#include "flow/actorcompiler.h" // This must be the last #include. - -const StringRef STORAGESERVER_HISTOGRAM_GROUP = LiteralStringRef("StorageServer"); -const StringRef FETCH_KEYS_LATENCY_HISTOGRAM = LiteralStringRef("FetchKeysLatency"); -const StringRef FETCH_KEYS_BYTES_HISTOGRAM = LiteralStringRef("FetchKeysSize"); -const StringRef FETCH_KEYS_BYTES_PER_SECOND_HISTOGRAM = LiteralStringRef("FetchKeysBandwidth"); -const StringRef TLOG_CURSOR_READS_LATENCY_HISTOGRAM = LiteralStringRef("TLogCursorReadsLatency"); -const StringRef SS_VERSION_LOCK_LATENCY_HISTOGRAM = LiteralStringRef("SSVersionLockLatency"); -const StringRef EAGER_READS_LATENCY_HISTOGRAM = LiteralStringRef("EagerReadsLatency"); -const StringRef FETCH_KEYS_PTREE_UPDATES_LATENCY_HISTOGRAM = LiteralStringRef("FetchKeysPTreeUpdatesLatency"); -const StringRef TLOG_MSGS_PTREE_UPDATES_LATENCY_HISTOGRAM = LiteralStringRef("TLogMsgsPTreeUpdatesLatency"); -const StringRef STORAGE_UPDATES_DURABLE_LATENCY_HISTOGRAM = LiteralStringRef("StorageUpdatesDurableLatency"); -const StringRef STORAGE_COMMIT_LATENCY_HISTOGRAM = LiteralStringRef("StorageCommitLatency"); -const StringRef SS_DURABLE_VERSION_UPDATE_LATENCY_HISTOGRAM = LiteralStringRef("SSDurableVersionUpdateLatency"); - -struct StorageMetricSample { - IndexedSet sample; - int64_t metricUnitsPerSample; - - StorageMetricSample(int64_t metricUnitsPerSample) : metricUnitsPerSample(metricUnitsPerSample) {} - - int64_t getEstimate(KeyRangeRef keys) const { return sample.sumRange(keys.begin, keys.end); } - KeyRef splitEstimate(KeyRangeRef range, int64_t offset, bool front = true) const { - auto fwd_split = sample.index(front ? 
sample.sumTo(sample.lower_bound(range.begin)) + offset - : sample.sumTo(sample.lower_bound(range.end)) - offset); - - if (fwd_split == sample.end() || *fwd_split >= range.end) - return range.end; - - if (!front && *fwd_split <= range.begin) - return range.begin; - - auto bck_split = fwd_split; - - // Butterfly search - start at midpoint then go in both directions. - while ((fwd_split != sample.end() && *fwd_split < range.end) || - (bck_split != sample.begin() && *bck_split > range.begin)) { - if (bck_split != sample.begin() && *bck_split > range.begin) { - auto it = bck_split; - bck_split.decrementNonEnd(); - - KeyRef split = keyBetween(KeyRangeRef( - bck_split != sample.begin() ? std::max(*bck_split, range.begin) : range.begin, *it)); - if (!front || (getEstimate(KeyRangeRef(range.begin, split)) > 0 && - split.size() <= CLIENT_KNOBS->SPLIT_KEY_SIZE_LIMIT)) - return split; - } - - if (fwd_split != sample.end() && *fwd_split < range.end) { - auto it = fwd_split; - ++it; - - KeyRef split = keyBetween( - KeyRangeRef(*fwd_split, it != sample.end() ? std::min(*it, range.end) : range.end)); - if (front || (getEstimate(KeyRangeRef(split, range.end)) > 0 && - split.size() <= CLIENT_KNOBS->SPLIT_KEY_SIZE_LIMIT)) - return split; - - fwd_split = it; - } - } - - // If we didn't return above, we didn't find anything. - TraceEvent(SevWarn, "CannotSplitLastSampleKey").detail("Range", range).detail("Offset", offset); - return front ? range.end : range.begin; - } -}; - -TEST_CASE("/fdbserver/StorageMetricSample/simple") { - StorageMetricSample s(1000); - s.sample.insert(LiteralStringRef("Apple"), 1000); - s.sample.insert(LiteralStringRef("Banana"), 2000); - s.sample.insert(LiteralStringRef("Cat"), 1000); - s.sample.insert(LiteralStringRef("Cathode"), 1000); - s.sample.insert(LiteralStringRef("Dog"), 1000); - - ASSERT(s.getEstimate(KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("D"))) == 5000); - ASSERT(s.getEstimate(KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("E"))) == 6000); - ASSERT(s.getEstimate(KeyRangeRef(LiteralStringRef("B"), LiteralStringRef("C"))) == 2000); - - // ASSERT(s.splitEstimate(KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("D")), 3500) == - // LiteralStringRef("Cat")); - - return Void(); -} - -struct TransientStorageMetricSample : StorageMetricSample { - Deque>> queue; - - TransientStorageMetricSample(int64_t metricUnitsPerSample) : StorageMetricSample(metricUnitsPerSample) {} - - // Returns the sampled metric value (possibly 0, possibly increased by the sampling factor) - int64_t addAndExpire(KeyRef key, int64_t metric, double expiration) { - int64_t x = add(key, metric); - if (x) - queue.emplace_back(expiration, std::make_pair(*sample.find(key), -x)); - return x; - } - - // FIXME: both versions of erase are broken, because they do not remove items in the queue with will subtract a - // metric from the value sometime in the future - int64_t erase(KeyRef key) { - auto it = sample.find(key); - if (it == sample.end()) - return 0; - int64_t x = sample.getMetric(it); - sample.erase(it); - return x; - } - void erase(KeyRangeRef keys) { sample.erase(keys.begin, keys.end); } - - void poll(KeyRangeMap>>& waitMap, StorageMetrics m) { - double now = ::now(); - while (queue.size() && queue.front().first <= now) { - KeyRef key = queue.front().second.first; - int64_t delta = queue.front().second.second; - ASSERT(delta != 0); - - if (sample.addMetric(key, delta) == 0) - sample.erase(key); - - StorageMetrics deltaM = m * delta; - auto v = waitMap[key]; - for (int i = 0; i < v.size(); 
i++) { - TEST(true); // TransientStorageMetricSample poll update - v[i].send(deltaM); - } - - queue.pop_front(); - } - } - - void poll() { - double now = ::now(); - while (queue.size() && queue.front().first <= now) { - KeyRef key = queue.front().second.first; - int64_t delta = queue.front().second.second; - ASSERT(delta != 0); - - if (sample.addMetric(key, delta) == 0) - sample.erase(key); - - queue.pop_front(); - } - } - -private: - bool roll(KeyRef key, int64_t metric) const { - return deterministicRandom()->random01() < - (double)metric / metricUnitsPerSample; //< SOMEDAY: Better randomInt64? - } - - int64_t add(KeyRef key, int64_t metric) { - if (!metric) - return 0; - int64_t mag = metric < 0 ? -metric : metric; - - if (mag < metricUnitsPerSample) { - if (!roll(key, mag)) - return 0; - metric = metric < 0 ? -metricUnitsPerSample : metricUnitsPerSample; - } - - if (sample.addMetric(key, metric) == 0) - sample.erase(key); - - return metric; - } -}; - -struct StorageServerMetrics { - KeyRangeMap>> waitMetricsMap; - StorageMetricSample byteSample; - TransientStorageMetricSample iopsSample, - bandwidthSample; // FIXME: iops and bandwidth calculations are not effectively tested, since they aren't - // currently used by data distribution - TransientStorageMetricSample bytesReadSample; - - StorageServerMetrics() - : byteSample(0), iopsSample(SERVER_KNOBS->IOPS_UNITS_PER_SAMPLE), - bandwidthSample(SERVER_KNOBS->BANDWIDTH_UNITS_PER_SAMPLE), - bytesReadSample(SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE) {} - - // Get the current estimated metrics for the given keys - StorageMetrics getMetrics(KeyRangeRef const& keys) const { - StorageMetrics result; - result.bytes = byteSample.getEstimate(keys); - result.bytesPerKSecond = - bandwidthSample.getEstimate(keys) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; - result.iosPerKSecond = - iopsSample.getEstimate(keys) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; - result.bytesReadPerKSecond = - bytesReadSample.getEstimate(keys) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; - return result; - } - - // Called when metrics should change (IO for a given key) - // Notifies waiting WaitMetricsRequests through waitMetricsMap, and updates metricsAverageQueue and metricsSampleMap - void notify(KeyRef key, StorageMetrics& metrics) { - ASSERT(metrics.bytes == 0); // ShardNotifyMetrics - if (g_network->isSimulated()) { - TEST(metrics.bytesPerKSecond != 0); // ShardNotifyMetrics bytes - TEST(metrics.iosPerKSecond != 0); // ShardNotifyMetrics ios - TEST(metrics.bytesReadPerKSecond != 0); // ShardNotifyMetrics bytesRead - } - - double expire = now() + SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL; - - StorageMetrics notifyMetrics; - - if (metrics.bytesPerKSecond) - notifyMetrics.bytesPerKSecond = bandwidthSample.addAndExpire(key, metrics.bytesPerKSecond, expire) * - SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; - if (metrics.iosPerKSecond) - notifyMetrics.iosPerKSecond = iopsSample.addAndExpire(key, metrics.iosPerKSecond, expire) * - SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; - if (metrics.bytesReadPerKSecond) - notifyMetrics.bytesReadPerKSecond = bytesReadSample.addAndExpire(key, metrics.bytesReadPerKSecond, expire) * - SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; - if (!notifyMetrics.allZero()) { - auto& v = waitMetricsMap[key]; - for (int i = 0; i < v.size(); i++) { - if (g_network->isSimulated()) { - TEST(true); // shard notify metrics - } - // ShardNotifyMetrics - 
v[i].send(notifyMetrics); - } - } - } - - // Due to the fact that read sampling will be called on all reads, use this specialized function to avoid overhead - // around branch misses and unnecessary stack allocation which eventually addes up under heavy load. - void notifyBytesReadPerKSecond(KeyRef key, int64_t in) { - double expire = now() + SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL; - int64_t bytesReadPerKSecond = - bytesReadSample.addAndExpire(key, in, expire) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; - if (bytesReadPerKSecond > 0) { - StorageMetrics notifyMetrics; - notifyMetrics.bytesReadPerKSecond = bytesReadPerKSecond; - auto& v = waitMetricsMap[key]; - for (int i = 0; i < v.size(); i++) { - TEST(true); // ShardNotifyMetrics - v[i].send(notifyMetrics); - } - } - } - - // Called by StorageServerDisk when the size of a key in byteSample changes, to notify WaitMetricsRequest - // Should not be called for keys past allKeys.end - void notifyBytes(RangeMap>, KeyRangeRef>::iterator shard, - int64_t bytes) { - ASSERT(shard.end() <= allKeys.end); - - StorageMetrics notifyMetrics; - notifyMetrics.bytes = bytes; - for (int i = 0; i < shard.value().size(); i++) { - TEST(true); // notifyBytes - shard.value()[i].send(notifyMetrics); - } - } - - // Called by StorageServerDisk when the size of a key in byteSample changes, to notify WaitMetricsRequest - void notifyBytes(KeyRef key, int64_t bytes) { - if (key >= allKeys.end) // Do not notify on changes to internal storage server state - return; - - notifyBytes(waitMetricsMap.rangeContaining(key), bytes); - } - - // Called when a range of keys becomes unassigned (and therefore not readable), to notify waiting - // WaitMetricsRequests (also other types of wait - // requests in the future?) - void notifyNotReadable(KeyRangeRef keys) { - auto rs = waitMetricsMap.intersectingRanges(keys); - for (auto r = rs.begin(); r != rs.end(); ++r) { - auto& v = r->value(); - TEST(v.size()); // notifyNotReadable() sending errors to intersecting ranges - for (int n = 0; n < v.size(); n++) - v[n].sendError(wrong_shard_server()); - } - } - - // Called periodically (~1 sec intervals) to remove older IOs from the averages - // Removes old entries from metricsAverageQueue, updates metricsSampleMap accordingly, and notifies - // WaitMetricsRequests through waitMetricsMap. - void poll() { - { - StorageMetrics m; - m.bytesPerKSecond = SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; - bandwidthSample.poll(waitMetricsMap, m); - } - { - StorageMetrics m; - m.iosPerKSecond = SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; - iopsSample.poll(waitMetricsMap, m); - } - { - StorageMetrics m; - m.bytesReadPerKSecond = SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; - bytesReadSample.poll(waitMetricsMap, m); - } - // bytesSample doesn't need polling because we never call addExpire() on it - } - - // static void waitMetrics( StorageServerMetrics* const& self, WaitMetricsRequest const& req ); - - // This function can run on untrusted user data. We must validate all divisions carefully. 
- KeyRef getSplitKey(int64_t remaining, - int64_t estimated, - int64_t limits, - int64_t used, - int64_t infinity, - bool isLastShard, - const StorageMetricSample& sample, - double divisor, - KeyRef const& lastKey, - KeyRef const& key, - bool hasUsed) const { - ASSERT(remaining >= 0); - ASSERT(limits > 0); - ASSERT(divisor > 0); - - if (limits < infinity / 2) { - int64_t expectedSize; - if (isLastShard || remaining > estimated) { - double remaining_divisor = (double(remaining) / limits) + 0.5; - expectedSize = remaining / remaining_divisor; - } else { - // If we are here, then estimated >= remaining >= 0 - double estimated_divisor = (double(estimated) / limits) + 0.5; - expectedSize = remaining / estimated_divisor; - } - - if (remaining > expectedSize) { - // This does the conversion from native units to bytes using the divisor. - double offset = (expectedSize - used) / divisor; - if (offset <= 0) - return hasUsed ? lastKey : key; - return sample.splitEstimate( - KeyRangeRef(lastKey, key), - offset * ((1.0 - SERVER_KNOBS->SPLIT_JITTER_AMOUNT) + - 2 * deterministicRandom()->random01() * SERVER_KNOBS->SPLIT_JITTER_AMOUNT)); - } - } - - return key; - } - - void splitMetrics(SplitMetricsRequest req) const { - try { - SplitMetricsReply reply; - KeyRef lastKey = req.keys.begin; - StorageMetrics used = req.used; - StorageMetrics estimated = req.estimated; - StorageMetrics remaining = getMetrics(req.keys) + used; - - //TraceEvent("SplitMetrics").detail("Begin", req.keys.begin).detail("End", req.keys.end).detail("Remaining", remaining.bytes).detail("Used", used.bytes); - - while (true) { - if (remaining.bytes < 2 * SERVER_KNOBS->MIN_SHARD_BYTES) - break; - KeyRef key = req.keys.end; - bool hasUsed = used.bytes != 0 || used.bytesPerKSecond != 0 || used.iosPerKSecond != 0; - key = getSplitKey(remaining.bytes, - estimated.bytes, - req.limits.bytes, - used.bytes, - req.limits.infinity, - req.isLastShard, - byteSample, - 1, - lastKey, - key, - hasUsed); - if (used.bytes < SERVER_KNOBS->MIN_SHARD_BYTES) - key = std::max(key, - byteSample.splitEstimate(KeyRangeRef(lastKey, req.keys.end), - SERVER_KNOBS->MIN_SHARD_BYTES - used.bytes)); - key = getSplitKey(remaining.iosPerKSecond, - estimated.iosPerKSecond, - req.limits.iosPerKSecond, - used.iosPerKSecond, - req.limits.infinity, - req.isLastShard, - iopsSample, - SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS, - lastKey, - key, - hasUsed); - key = getSplitKey(remaining.bytesPerKSecond, - estimated.bytesPerKSecond, - req.limits.bytesPerKSecond, - used.bytesPerKSecond, - req.limits.infinity, - req.isLastShard, - bandwidthSample, - SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS, - lastKey, - key, - hasUsed); - ASSERT(key != lastKey || hasUsed); - if (key == req.keys.end) - break; - reply.splits.push_back_deep(reply.splits.arena(), key); - - StorageMetrics diff = (getMetrics(KeyRangeRef(lastKey, key)) + used); - remaining -= diff; - estimated -= diff; - - used = StorageMetrics(); - lastKey = key; - } - - reply.used = getMetrics(KeyRangeRef(lastKey, req.keys.end)) + used; - req.reply.send(reply); - } catch (Error& e) { - req.reply.sendError(e); - } - } - - void getStorageMetrics(GetStorageMetricsRequest req, - StorageBytes sb, - double bytesInputRate, - int64_t versionLag, - double lastUpdate) const { - GetStorageMetricsReply rep; - - // SOMEDAY: make bytes dynamic with hard disk space - rep.load = getMetrics(allKeys); - - if (sb.free < 1e9) { - TraceEvent(SevWarn, "PhysicalDiskMetrics") - .suppressFor(60.0) - .detail("Free", sb.free) - 
.detail("Total", sb.total) - .detail("Available", sb.available) - .detail("Load", rep.load.bytes); - } - - rep.available.bytes = sb.available; - rep.available.iosPerKSecond = 10e6; - rep.available.bytesPerKSecond = 100e9; - rep.available.bytesReadPerKSecond = 100e9; - - rep.capacity.bytes = sb.total; - rep.capacity.iosPerKSecond = 10e6; - rep.capacity.bytesPerKSecond = 100e9; - rep.capacity.bytesReadPerKSecond = 100e9; - - rep.bytesInputRate = bytesInputRate; - - rep.versionLag = versionLag; - rep.lastUpdate = lastUpdate; - - req.reply.send(rep); - } - - Future waitMetrics(WaitMetricsRequest req, Future delay); - - // Given a read hot shard, this function will divide the shard into chunks and find those chunks whose - // readBytes/sizeBytes exceeds the `readDensityRatio`. Please make sure to run unit tests - // `StorageMetricsSampleTests.txt` after change made. - std::vector getReadHotRanges(KeyRangeRef shard, - double readDensityRatio, - int64_t baseChunkSize, - int64_t minShardReadBandwidthPerKSeconds) const { - std::vector toReturn; - - double shardSize = (double)byteSample.getEstimate(shard); - int64_t shardReadBandwidth = bytesReadSample.getEstimate(shard); - if (shardReadBandwidth * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS <= - minShardReadBandwidthPerKSeconds) { - return toReturn; - } - if (shardSize <= baseChunkSize) { - // Shard is small, use it as is - if (bytesReadSample.getEstimate(shard) > (readDensityRatio * shardSize)) { - toReturn.emplace_back(shard, - bytesReadSample.getEstimate(shard) / shardSize, - bytesReadSample.getEstimate(shard) / - SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL); - } - return toReturn; - } - KeyRef beginKey = shard.begin; - auto endKey = - byteSample.sample.index(byteSample.sample.sumTo(byteSample.sample.lower_bound(beginKey)) + baseChunkSize); - while (endKey != byteSample.sample.end()) { - if (*endKey > shard.end) { - endKey = byteSample.sample.lower_bound(shard.end); - if (*endKey == beginKey) { - // No need to increment endKey since otherwise it would stuck here forever. - break; - } - } - if (*endKey == beginKey) { - ++endKey; - continue; - } - if (bytesReadSample.getEstimate(KeyRangeRef(beginKey, *endKey)) > - (readDensityRatio * std::max(baseChunkSize, byteSample.getEstimate(KeyRangeRef(beginKey, *endKey))))) { - auto range = KeyRangeRef(beginKey, *endKey); - if (!toReturn.empty() && toReturn.back().keys.end == range.begin) { - // in case two consecutive chunks both are over the ratio, merge them. 
- range = KeyRangeRef(toReturn.back().keys.begin, *endKey); - toReturn.pop_back(); - } - toReturn.emplace_back( - range, - (double)bytesReadSample.getEstimate(range) / std::max(baseChunkSize, byteSample.getEstimate(range)), - bytesReadSample.getEstimate(range) / SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL); - } - beginKey = *endKey; - endKey = byteSample.sample.index(byteSample.sample.sumTo(byteSample.sample.lower_bound(beginKey)) + - baseChunkSize); - } - return toReturn; - } - - void getReadHotRanges(ReadHotSubRangeRequest req) const { - ReadHotSubRangeReply reply; - auto _ranges = getReadHotRanges(req.keys, - SERVER_KNOBS->SHARD_MAX_READ_DENSITY_RATIO, - SERVER_KNOBS->READ_HOT_SUB_RANGE_CHUNK_SIZE, - SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS); - reply.readHotRanges = VectorRef(_ranges.data(), _ranges.size()); - req.reply.send(reply); - } - - std::vector getSplitPoints(KeyRangeRef range, int64_t chunkSize, Optional prefixToRemove) { - std::vector toReturn; - KeyRef beginKey = range.begin; - IndexedSet::iterator endKey = - byteSample.sample.index(byteSample.sample.sumTo(byteSample.sample.lower_bound(beginKey)) + chunkSize); - while (endKey != byteSample.sample.end()) { - if (*endKey > range.end) { - break; - } - if (*endKey == beginKey) { - ++endKey; - continue; - } - KeyRef splitPoint = *endKey; - if (prefixToRemove.present()) { - splitPoint = splitPoint.removePrefix(prefixToRemove.get()); - } - toReturn.push_back(splitPoint); - beginKey = *endKey; - endKey = - byteSample.sample.index(byteSample.sample.sumTo(byteSample.sample.lower_bound(beginKey)) + chunkSize); - } - return toReturn; - } - - void getSplitPoints(SplitRangeRequest req, Optional prefix) { - SplitRangeReply reply; - KeyRangeRef range = req.keys; - if (prefix.present()) { - range = range.withPrefix(prefix.get(), req.arena); - } - std::vector points = getSplitPoints(range, req.chunkSize, prefix); - - reply.splitPoints.append_deep(reply.splitPoints.arena(), points.data(), points.size()); - req.reply.send(reply); - } - -private: - static void collapse(KeyRangeMap& map, KeyRef const& key) { - auto range = map.rangeContaining(key); - if (range == map.ranges().begin() || range == map.ranges().end()) - return; - int value = range->value(); - auto prev = range; - --prev; - if (prev->value() != value) - return; - KeyRange keys = KeyRangeRef(prev->begin(), range->end()); - map.insert(keys, value); - } - - static void add(KeyRangeMap& map, KeyRangeRef const& keys, int delta) { - auto rs = map.modify(keys); - for (auto r = rs.begin(); r != rs.end(); ++r) - r->value() += delta; - collapse(map, keys.begin); - collapse(map, keys.end); - } -}; - -TEST_CASE("/fdbserver/StorageMetricSample/rangeSplitPoints/simple") { - - int64_t sampleUnit = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE; - StorageServerMetrics ssm; - - ssm.byteSample.sample.insert(LiteralStringRef("A"), 200 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Absolute"), 800 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Bah"), 20 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Banana"), 80 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Bob"), 200 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("But"), 100 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Cat"), 300 * sampleUnit); - - std::vector t = ssm.getSplitPoints( - KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("C")), 2000 * sampleUnit, 
Optional()); - - ASSERT(t.size() == 1 && t[0] == LiteralStringRef("Bah")); - - return Void(); -} - -TEST_CASE("/fdbserver/StorageMetricSample/rangeSplitPoints/multipleReturnedPoints") { - - int64_t sampleUnit = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE; - StorageServerMetrics ssm; - - ssm.byteSample.sample.insert(LiteralStringRef("A"), 200 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Absolute"), 800 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Bah"), 20 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Banana"), 80 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Bob"), 200 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("But"), 100 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Cat"), 300 * sampleUnit); - - std::vector t = ssm.getSplitPoints( - KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("C")), 600 * sampleUnit, Optional()); - - ASSERT(t.size() == 3 && t[0] == LiteralStringRef("Absolute") && t[1] == LiteralStringRef("Apple") && - t[2] == LiteralStringRef("Bah")); - - return Void(); -} - -TEST_CASE("/fdbserver/StorageMetricSample/rangeSplitPoints/noneSplitable") { - - int64_t sampleUnit = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE; - StorageServerMetrics ssm; - - ssm.byteSample.sample.insert(LiteralStringRef("A"), 200 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Absolute"), 800 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Bah"), 20 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Banana"), 80 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Bob"), 200 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("But"), 100 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Cat"), 300 * sampleUnit); - - std::vector t = ssm.getSplitPoints( - KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("C")), 10000 * sampleUnit, Optional()); - - ASSERT(t.size() == 0); - - return Void(); -} - -TEST_CASE("/fdbserver/StorageMetricSample/rangeSplitPoints/chunkTooLarge") { - - int64_t sampleUnit = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE; - StorageServerMetrics ssm; - - ssm.byteSample.sample.insert(LiteralStringRef("A"), 20 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Absolute"), 80 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Apple"), 10 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Bah"), 20 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Banana"), 80 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Bob"), 20 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("But"), 10 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Cat"), 30 * sampleUnit); - - std::vector t = ssm.getSplitPoints( - KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("C")), 1000 * sampleUnit, Optional()); - - ASSERT(t.size() == 0); - - return Void(); -} - -TEST_CASE("/fdbserver/StorageMetricSample/readHotDetect/simple") { - - int64_t sampleUnit = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE; - StorageServerMetrics ssm; - - ssm.bytesReadSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit); - ssm.bytesReadSample.sample.insert(LiteralStringRef("Banana"), 2000 * sampleUnit); - ssm.bytesReadSample.sample.insert(LiteralStringRef("Cat"), 1000 * sampleUnit); - 
ssm.bytesReadSample.sample.insert(LiteralStringRef("Cathode"), 1000 * sampleUnit); - ssm.bytesReadSample.sample.insert(LiteralStringRef("Dog"), 1000 * sampleUnit); - - ssm.byteSample.sample.insert(LiteralStringRef("A"), 20 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Absolute"), 80 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Bah"), 20 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Banana"), 80 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Bob"), 200 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("But"), 100 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Cat"), 300 * sampleUnit); - - std::vector t = - ssm.getReadHotRanges(KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("C")), 2.0, 200 * sampleUnit, 0); - - ASSERT(t.size() == 1 && (*t.begin()).keys.begin == LiteralStringRef("Bah") && - (*t.begin()).keys.end == LiteralStringRef("Bob")); - - return Void(); -} - -TEST_CASE("/fdbserver/StorageMetricSample/readHotDetect/moreThanOneRange") { - - int64_t sampleUnit = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE; - StorageServerMetrics ssm; - - ssm.bytesReadSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit); - ssm.bytesReadSample.sample.insert(LiteralStringRef("Banana"), 2000 * sampleUnit); - ssm.bytesReadSample.sample.insert(LiteralStringRef("Cat"), 1000 * sampleUnit); - ssm.bytesReadSample.sample.insert(LiteralStringRef("Cathode"), 1000 * sampleUnit); - ssm.bytesReadSample.sample.insert(LiteralStringRef("Dog"), 1000 * sampleUnit); - ssm.bytesReadSample.sample.insert(LiteralStringRef("Final"), 2000 * sampleUnit); - - ssm.byteSample.sample.insert(LiteralStringRef("A"), 20 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Absolute"), 80 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Bah"), 20 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Banana"), 80 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Bob"), 200 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("But"), 100 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Cat"), 300 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Dah"), 300 * sampleUnit); - - std::vector t = - ssm.getReadHotRanges(KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("D")), 2.0, 200 * sampleUnit, 0); - - ASSERT(t.size() == 2 && (*t.begin()).keys.begin == LiteralStringRef("Bah") && - (*t.begin()).keys.end == LiteralStringRef("Bob")); - ASSERT(t.at(1).keys.begin == LiteralStringRef("Cat") && t.at(1).keys.end == LiteralStringRef("Dah")); - - return Void(); -} - -TEST_CASE("/fdbserver/StorageMetricSample/readHotDetect/consecutiveRanges") { - - int64_t sampleUnit = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE; - StorageServerMetrics ssm; - - ssm.bytesReadSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit); - ssm.bytesReadSample.sample.insert(LiteralStringRef("Banana"), 2000 * sampleUnit); - ssm.bytesReadSample.sample.insert(LiteralStringRef("Bucket"), 2000 * sampleUnit); - ssm.bytesReadSample.sample.insert(LiteralStringRef("Cat"), 1000 * sampleUnit); - ssm.bytesReadSample.sample.insert(LiteralStringRef("Cathode"), 1000 * sampleUnit); - ssm.bytesReadSample.sample.insert(LiteralStringRef("Dog"), 5000 * sampleUnit); - ssm.bytesReadSample.sample.insert(LiteralStringRef("Final"), 2000 * 
sampleUnit); - - ssm.byteSample.sample.insert(LiteralStringRef("A"), 20 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Absolute"), 80 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Apple"), 1000 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Bah"), 20 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Banana"), 80 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Bob"), 200 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("But"), 100 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Cat"), 300 * sampleUnit); - ssm.byteSample.sample.insert(LiteralStringRef("Dah"), 300 * sampleUnit); - - std::vector t = - ssm.getReadHotRanges(KeyRangeRef(LiteralStringRef("A"), LiteralStringRef("D")), 2.0, 200 * sampleUnit, 0); - - ASSERT(t.size() == 2 && (*t.begin()).keys.begin == LiteralStringRef("Bah") && - (*t.begin()).keys.end == LiteralStringRef("But")); - ASSERT(t.at(1).keys.begin == LiteralStringRef("Cat") && t.at(1).keys.end == LiteralStringRef("Dah")); - - return Void(); -} - -// Contains information about whether or not a key-value pair should be included in a byte sample -// Also contains size information about the byte sample -struct ByteSampleInfo { - bool inSample; - - // Actual size of the key value pair - int64_t size; - - // The recorded size of the sample (max of bytesPerSample, size) - int64_t sampledSize; -}; - -// Determines whether a key-value pair should be included in a byte sample -// Also returns size information about the sample -ByteSampleInfo isKeyValueInSample(KeyValueRef keyValue); - -#include "flow/unactorcompiler.h" diff --git a/fdbserver/StorageMetrics.h b/fdbserver/StorageMetrics.h index 9791c0b113..17ad106e1d 100644 --- a/fdbserver/StorageMetrics.h +++ b/fdbserver/StorageMetrics.h @@ -18,8 +18,590 @@ * limitations under the License. 
*/
-#if defined(NO_INTELLISENSE)
-#include "fdbserver/StorageMetrics.actor.g.h"
-#else
-#include "fdbserver/StorageMetrics.actor.h"
-#endif
\ No newline at end of file
+#pragma once
+
+const StringRef STORAGESERVER_HISTOGRAM_GROUP = LiteralStringRef("StorageServer");
+const StringRef FETCH_KEYS_LATENCY_HISTOGRAM = LiteralStringRef("FetchKeysLatency");
+const StringRef FETCH_KEYS_BYTES_HISTOGRAM = LiteralStringRef("FetchKeysSize");
+const StringRef FETCH_KEYS_BYTES_PER_SECOND_HISTOGRAM = LiteralStringRef("FetchKeysBandwidth");
+const StringRef TLOG_CURSOR_READS_LATENCY_HISTOGRAM = LiteralStringRef("TLogCursorReadsLatency");
+const StringRef SS_VERSION_LOCK_LATENCY_HISTOGRAM = LiteralStringRef("SSVersionLockLatency");
+const StringRef EAGER_READS_LATENCY_HISTOGRAM = LiteralStringRef("EagerReadsLatency");
+const StringRef FETCH_KEYS_PTREE_UPDATES_LATENCY_HISTOGRAM = LiteralStringRef("FetchKeysPTreeUpdatesLatency");
+const StringRef TLOG_MSGS_PTREE_UPDATES_LATENCY_HISTOGRAM = LiteralStringRef("TLogMsgsPTreeUpdatesLatency");
+const StringRef STORAGE_UPDATES_DURABLE_LATENCY_HISTOGRAM = LiteralStringRef("StorageUpdatesDurableLatency");
+const StringRef STORAGE_COMMIT_LATENCY_HISTOGRAM = LiteralStringRef("StorageCommitLatency");
+const StringRef SS_DURABLE_VERSION_UPDATE_LATENCY_HISTOGRAM = LiteralStringRef("SSDurableVersionUpdateLatency");
+
+struct StorageMetricSample {
+ IndexedSet<Key, int64_t> sample;
+ int64_t metricUnitsPerSample;
+
+ StorageMetricSample(int64_t metricUnitsPerSample) : metricUnitsPerSample(metricUnitsPerSample) {}
+
+ int64_t getEstimate(KeyRangeRef keys) const { return sample.sumRange(keys.begin, keys.end); }
+ KeyRef splitEstimate(KeyRangeRef range, int64_t offset, bool front = true) const {
+ auto fwd_split = sample.index(front ? sample.sumTo(sample.lower_bound(range.begin)) + offset
+ : sample.sumTo(sample.lower_bound(range.end)) - offset);
+
+ if (fwd_split == sample.end() || *fwd_split >= range.end)
+ return range.end;
+
+ if (!front && *fwd_split <= range.begin)
+ return range.begin;
+
+ auto bck_split = fwd_split;
+
+ // Butterfly search - start at midpoint then go in both directions.
+ while ((fwd_split != sample.end() && *fwd_split < range.end) ||
+ (bck_split != sample.begin() && *bck_split > range.begin)) {
+ if (bck_split != sample.begin() && *bck_split > range.begin) {
+ auto it = bck_split;
+ bck_split.decrementNonEnd();
+
+ KeyRef split = keyBetween(KeyRangeRef(
+ bck_split != sample.begin() ? std::max(*bck_split, range.begin) : range.begin, *it));
+ if (!front || (getEstimate(KeyRangeRef(range.begin, split)) > 0 &&
+ split.size() <= CLIENT_KNOBS->SPLIT_KEY_SIZE_LIMIT))
+ return split;
+ }
+
+ if (fwd_split != sample.end() && *fwd_split < range.end) {
+ auto it = fwd_split;
+ ++it;
+
+ KeyRef split = keyBetween(
+ KeyRangeRef(*fwd_split, it != sample.end() ? std::min(*it, range.end) : range.end));
+ if (front || (getEstimate(KeyRangeRef(split, range.end)) > 0 &&
+ split.size() <= CLIENT_KNOBS->SPLIT_KEY_SIZE_LIMIT))
+ return split;
+
+ fwd_split = it;
+ }
+ }
+
+ // If we didn't return above, we didn't find anything.
+ TraceEvent(SevWarn, "CannotSplitLastSampleKey").detail("Range", range).detail("Offset", offset);
+ return front ?
range.end : range.begin;
+ }
+};
+
+struct TransientStorageMetricSample : StorageMetricSample {
+ Deque<std::pair<double, std::pair<Key, int64_t>>> queue;
+
+ TransientStorageMetricSample(int64_t metricUnitsPerSample) : StorageMetricSample(metricUnitsPerSample) {}
+
+ // Returns the sampled metric value (possibly 0, possibly increased by the sampling factor)
+ int64_t addAndExpire(KeyRef key, int64_t metric, double expiration) {
+ int64_t x = add(key, metric);
+ if (x)
+ queue.emplace_back(expiration, std::make_pair(*sample.find(key), -x));
+ return x;
+ }
+
+ // FIXME: both versions of erase are broken, because they do not remove items in the queue which will subtract a
+ // metric from the value sometime in the future
+ int64_t erase(KeyRef key) {
+ auto it = sample.find(key);
+ if (it == sample.end())
+ return 0;
+ int64_t x = sample.getMetric(it);
+ sample.erase(it);
+ return x;
+ }
+ void erase(KeyRangeRef keys) { sample.erase(keys.begin, keys.end); }
+
+ void poll(KeyRangeMap<std::vector<PromiseStream<StorageMetrics>>>& waitMap, StorageMetrics m) {
+ double now = ::now();
+ while (queue.size() && queue.front().first <= now) {
+ KeyRef key = queue.front().second.first;
+ int64_t delta = queue.front().second.second;
+ ASSERT(delta != 0);
+
+ if (sample.addMetric(key, delta) == 0)
+ sample.erase(key);
+
+ StorageMetrics deltaM = m * delta;
+ auto v = waitMap[key];
+ for (int i = 0; i < v.size(); i++) {
+ TEST(true); // TransientStorageMetricSample poll update
+ v[i].send(deltaM);
+ }
+
+ queue.pop_front();
+ }
+ }
+
+ void poll() {
+ double now = ::now();
+ while (queue.size() && queue.front().first <= now) {
+ KeyRef key = queue.front().second.first;
+ int64_t delta = queue.front().second.second;
+ ASSERT(delta != 0);
+
+ if (sample.addMetric(key, delta) == 0)
+ sample.erase(key);
+
+ queue.pop_front();
+ }
+ }
+
+private:
+ bool roll(KeyRef key, int64_t metric) const {
+ return deterministicRandom()->random01() <
+ (double)metric / metricUnitsPerSample; //< SOMEDAY: Better randomInt64?
+ }
+
+ int64_t add(KeyRef key, int64_t metric) {
+ if (!metric)
+ return 0;
+ int64_t mag = metric < 0 ? -metric : metric;
+
+ if (mag < metricUnitsPerSample) {
+ if (!roll(key, mag))
+ return 0;
+ metric = metric < 0 ?
-metricUnitsPerSample : metricUnitsPerSample;
+ }
+
+ if (sample.addMetric(key, metric) == 0)
+ sample.erase(key);
+
+ return metric;
+ }
+};
+
+struct StorageServerMetrics {
+ KeyRangeMap<std::vector<PromiseStream<StorageMetrics>>> waitMetricsMap;
+ StorageMetricSample byteSample;
+ TransientStorageMetricSample iopsSample,
+ bandwidthSample; // FIXME: iops and bandwidth calculations are not effectively tested, since they aren't
+ // currently used by data distribution
+ TransientStorageMetricSample bytesReadSample;
+
+ StorageServerMetrics()
+ : byteSample(0), iopsSample(SERVER_KNOBS->IOPS_UNITS_PER_SAMPLE),
+ bandwidthSample(SERVER_KNOBS->BANDWIDTH_UNITS_PER_SAMPLE),
+ bytesReadSample(SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE) {}
+
+ // Get the current estimated metrics for the given keys
+ StorageMetrics getMetrics(KeyRangeRef const& keys) const {
+ StorageMetrics result;
+ result.bytes = byteSample.getEstimate(keys);
+ result.bytesPerKSecond =
+ bandwidthSample.getEstimate(keys) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS;
+ result.iosPerKSecond =
+ iopsSample.getEstimate(keys) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS;
+ result.bytesReadPerKSecond =
+ bytesReadSample.getEstimate(keys) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS;
+ return result;
+ }
+
+ // Called when metrics should change (IO for a given key)
+ // Notifies waiting WaitMetricsRequests through waitMetricsMap, and updates metricsAverageQueue and metricsSampleMap
+ void notify(KeyRef key, StorageMetrics& metrics) {
+ ASSERT(metrics.bytes == 0); // ShardNotifyMetrics
+ if (g_network->isSimulated()) {
+ TEST(metrics.bytesPerKSecond != 0); // ShardNotifyMetrics bytes
+ TEST(metrics.iosPerKSecond != 0); // ShardNotifyMetrics ios
+ TEST(metrics.bytesReadPerKSecond != 0); // ShardNotifyMetrics bytesRead
+ }
+
+ double expire = now() + SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL;
+
+ StorageMetrics notifyMetrics;
+
+ if (metrics.bytesPerKSecond)
+ notifyMetrics.bytesPerKSecond = bandwidthSample.addAndExpire(key, metrics.bytesPerKSecond, expire) *
+ SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS;
+ if (metrics.iosPerKSecond)
+ notifyMetrics.iosPerKSecond = iopsSample.addAndExpire(key, metrics.iosPerKSecond, expire) *
+ SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS;
+ if (metrics.bytesReadPerKSecond)
+ notifyMetrics.bytesReadPerKSecond = bytesReadSample.addAndExpire(key, metrics.bytesReadPerKSecond, expire) *
+ SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS;
+ if (!notifyMetrics.allZero()) {
+ auto& v = waitMetricsMap[key];
+ for (int i = 0; i < v.size(); i++) {
+ if (g_network->isSimulated()) {
+ TEST(true); // shard notify metrics
+ }
+ // ShardNotifyMetrics
+ v[i].send(notifyMetrics);
+ }
+ }
+ }
+
+ // Because read sampling is called on all reads, use this specialized function to avoid overhead
+ // around branch misses and unnecessary stack allocation, which eventually adds up under heavy load.
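+ // Only reads admitted by the probabilistic sampler in addAndExpire() (roughly one per
+ // BYTES_READ_UNITS_PER_SAMPLE bytes read) yield a nonzero sampled value and notify waiters below;
+ // all other calls fall through without touching waitMetricsMap.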
+ void notifyBytesReadPerKSecond(KeyRef key, int64_t in) {
+ double expire = now() + SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL;
+ int64_t bytesReadPerKSecond =
+ bytesReadSample.addAndExpire(key, in, expire) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS;
+ if (bytesReadPerKSecond > 0) {
+ StorageMetrics notifyMetrics;
+ notifyMetrics.bytesReadPerKSecond = bytesReadPerKSecond;
+ auto& v = waitMetricsMap[key];
+ for (int i = 0; i < v.size(); i++) {
+ TEST(true); // ShardNotifyMetrics
+ v[i].send(notifyMetrics);
+ }
+ }
+ }
+
+ // Called by StorageServerDisk when the size of a key in byteSample changes, to notify WaitMetricsRequest
+ // Should not be called for keys past allKeys.end
+ void notifyBytes(RangeMap<Key, std::vector<PromiseStream<StorageMetrics>>, KeyRangeRef>::iterator shard,
+ int64_t bytes) {
+ ASSERT(shard.end() <= allKeys.end);
+
+ StorageMetrics notifyMetrics;
+ notifyMetrics.bytes = bytes;
+ for (int i = 0; i < shard.value().size(); i++) {
+ TEST(true); // notifyBytes
+ shard.value()[i].send(notifyMetrics);
+ }
+ }
+
+ // Called by StorageServerDisk when the size of a key in byteSample changes, to notify WaitMetricsRequest
+ void notifyBytes(KeyRef key, int64_t bytes) {
+ if (key >= allKeys.end) // Do not notify on changes to internal storage server state
+ return;
+
+ notifyBytes(waitMetricsMap.rangeContaining(key), bytes);
+ }
+
+ // Called when a range of keys becomes unassigned (and therefore not readable), to notify waiting
+ // WaitMetricsRequests (also other types of wait
+ // requests in the future?)
+ void notifyNotReadable(KeyRangeRef keys) {
+ auto rs = waitMetricsMap.intersectingRanges(keys);
+ for (auto r = rs.begin(); r != rs.end(); ++r) {
+ auto& v = r->value();
+ TEST(v.size()); // notifyNotReadable() sending errors to intersecting ranges
+ for (int n = 0; n < v.size(); n++)
+ v[n].sendError(wrong_shard_server());
+ }
+ }
+
+ // Called periodically (~1 sec intervals) to remove older IOs from the averages
+ // Removes old entries from metricsAverageQueue, updates metricsSampleMap accordingly, and notifies
+ // WaitMetricsRequests through waitMetricsMap.
+ void poll() {
+ {
+ StorageMetrics m;
+ m.bytesPerKSecond = SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS;
+ bandwidthSample.poll(waitMetricsMap, m);
+ }
+ {
+ StorageMetrics m;
+ m.iosPerKSecond = SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS;
+ iopsSample.poll(waitMetricsMap, m);
+ }
+ {
+ StorageMetrics m;
+ m.bytesReadPerKSecond = SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS;
+ bytesReadSample.poll(waitMetricsMap, m);
+ }
+ // byteSample doesn't need polling because we never call addAndExpire() on it
+ }
+
+ // static void waitMetrics( StorageServerMetrics* const& self, WaitMetricsRequest const& req );
+
+ // This function can run on untrusted user data. We must validate all divisions carefully.
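+ // getSplitKey() proposes, for a single metric dimension, a split point in (lastKey, key]:
+ // 'remaining' is the metric left in the range being split, 'estimated' is the caller's prior
+ // estimate of it, 'limits' is the per-shard target, and 'divisor' converts the metric's units
+ // back into the sample's native units. It returns 'key' unchanged when no earlier split is needed.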
+ KeyRef getSplitKey(int64_t remaining, + int64_t estimated, + int64_t limits, + int64_t used, + int64_t infinity, + bool isLastShard, + const StorageMetricSample& sample, + double divisor, + KeyRef const& lastKey, + KeyRef const& key, + bool hasUsed) const { + ASSERT(remaining >= 0); + ASSERT(limits > 0); + ASSERT(divisor > 0); + + if (limits < infinity / 2) { + int64_t expectedSize; + if (isLastShard || remaining > estimated) { + double remaining_divisor = (double(remaining) / limits) + 0.5; + expectedSize = remaining / remaining_divisor; + } else { + // If we are here, then estimated >= remaining >= 0 + double estimated_divisor = (double(estimated) / limits) + 0.5; + expectedSize = remaining / estimated_divisor; + } + + if (remaining > expectedSize) { + // This does the conversion from native units to bytes using the divisor. + double offset = (expectedSize - used) / divisor; + if (offset <= 0) + return hasUsed ? lastKey : key; + return sample.splitEstimate( + KeyRangeRef(lastKey, key), + offset * ((1.0 - SERVER_KNOBS->SPLIT_JITTER_AMOUNT) + + 2 * deterministicRandom()->random01() * SERVER_KNOBS->SPLIT_JITTER_AMOUNT)); + } + } + + return key; + } + + void splitMetrics(SplitMetricsRequest req) const { + try { + SplitMetricsReply reply; + KeyRef lastKey = req.keys.begin; + StorageMetrics used = req.used; + StorageMetrics estimated = req.estimated; + StorageMetrics remaining = getMetrics(req.keys) + used; + + //TraceEvent("SplitMetrics").detail("Begin", req.keys.begin).detail("End", req.keys.end).detail("Remaining", remaining.bytes).detail("Used", used.bytes); + + while (true) { + if (remaining.bytes < 2 * SERVER_KNOBS->MIN_SHARD_BYTES) + break; + KeyRef key = req.keys.end; + bool hasUsed = used.bytes != 0 || used.bytesPerKSecond != 0 || used.iosPerKSecond != 0; + key = getSplitKey(remaining.bytes, + estimated.bytes, + req.limits.bytes, + used.bytes, + req.limits.infinity, + req.isLastShard, + byteSample, + 1, + lastKey, + key, + hasUsed); + if (used.bytes < SERVER_KNOBS->MIN_SHARD_BYTES) + key = std::max(key, + byteSample.splitEstimate(KeyRangeRef(lastKey, req.keys.end), + SERVER_KNOBS->MIN_SHARD_BYTES - used.bytes)); + key = getSplitKey(remaining.iosPerKSecond, + estimated.iosPerKSecond, + req.limits.iosPerKSecond, + used.iosPerKSecond, + req.limits.infinity, + req.isLastShard, + iopsSample, + SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS, + lastKey, + key, + hasUsed); + key = getSplitKey(remaining.bytesPerKSecond, + estimated.bytesPerKSecond, + req.limits.bytesPerKSecond, + used.bytesPerKSecond, + req.limits.infinity, + req.isLastShard, + bandwidthSample, + SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS, + lastKey, + key, + hasUsed); + ASSERT(key != lastKey || hasUsed); + if (key == req.keys.end) + break; + reply.splits.push_back_deep(reply.splits.arena(), key); + + StorageMetrics diff = (getMetrics(KeyRangeRef(lastKey, key)) + used); + remaining -= diff; + estimated -= diff; + + used = StorageMetrics(); + lastKey = key; + } + + reply.used = getMetrics(KeyRangeRef(lastKey, req.keys.end)) + used; + req.reply.send(reply); + } catch (Error& e) { + req.reply.sendError(e); + } + } + + void getStorageMetrics(GetStorageMetricsRequest req, + StorageBytes sb, + double bytesInputRate, + int64_t versionLag, + double lastUpdate) const { + GetStorageMetricsReply rep; + + // SOMEDAY: make bytes dynamic with hard disk space + rep.load = getMetrics(allKeys); + + if (sb.free < 1e9) { + TraceEvent(SevWarn, "PhysicalDiskMetrics") + .suppressFor(60.0) + .detail("Free", sb.free) + 
.detail("Total", sb.total)
+ .detail("Available", sb.available)
+ .detail("Load", rep.load.bytes);
+ }
+
+ rep.available.bytes = sb.available;
+ rep.available.iosPerKSecond = 10e6;
+ rep.available.bytesPerKSecond = 100e9;
+ rep.available.bytesReadPerKSecond = 100e9;
+
+ rep.capacity.bytes = sb.total;
+ rep.capacity.iosPerKSecond = 10e6;
+ rep.capacity.bytesPerKSecond = 100e9;
+ rep.capacity.bytesReadPerKSecond = 100e9;
+
+ rep.bytesInputRate = bytesInputRate;
+
+ rep.versionLag = versionLag;
+ rep.lastUpdate = lastUpdate;
+
+ req.reply.send(rep);
+ }
+
+ Future<Void> waitMetrics(WaitMetricsRequest req, Future<Void> delay);
+
+ // Given a read hot shard, this function will divide the shard into chunks and find those chunks whose
+ // readBytes/sizeBytes exceeds the `readDensityRatio`. Please make sure to run the unit tests in
+ // `StorageMetricsSampleTests.txt` after changes are made.
+ std::vector<ReadHotRangeWithMetrics> getReadHotRanges(KeyRangeRef shard,
+ double readDensityRatio,
+ int64_t baseChunkSize,
+ int64_t minShardReadBandwidthPerKSeconds) const {
+ std::vector<ReadHotRangeWithMetrics> toReturn;
+
+ double shardSize = (double)byteSample.getEstimate(shard);
+ int64_t shardReadBandwidth = bytesReadSample.getEstimate(shard);
+ if (shardReadBandwidth * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS <=
+ minShardReadBandwidthPerKSeconds) {
+ return toReturn;
+ }
+ if (shardSize <= baseChunkSize) {
+ // Shard is small, use it as is
+ if (bytesReadSample.getEstimate(shard) > (readDensityRatio * shardSize)) {
+ toReturn.emplace_back(shard,
+ bytesReadSample.getEstimate(shard) / shardSize,
+ bytesReadSample.getEstimate(shard) /
+ SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL);
+ }
+ return toReturn;
+ }
+ KeyRef beginKey = shard.begin;
+ auto endKey =
+ byteSample.sample.index(byteSample.sample.sumTo(byteSample.sample.lower_bound(beginKey)) + baseChunkSize);
+ while (endKey != byteSample.sample.end()) {
+ if (*endKey > shard.end) {
+ endKey = byteSample.sample.lower_bound(shard.end);
+ if (*endKey == beginKey) {
+ // No need to increment endKey since otherwise it would get stuck here forever.
+ break;
+ }
+ }
+ if (*endKey == beginKey) {
+ ++endKey;
+ continue;
+ }
+ if (bytesReadSample.getEstimate(KeyRangeRef(beginKey, *endKey)) >
+ (readDensityRatio * std::max(baseChunkSize, byteSample.getEstimate(KeyRangeRef(beginKey, *endKey))))) {
+ auto range = KeyRangeRef(beginKey, *endKey);
+ if (!toReturn.empty() && toReturn.back().keys.end == range.begin) {
+ // In case two consecutive chunks are both over the ratio, merge them.
+ range = KeyRangeRef(toReturn.back().keys.begin, *endKey);
+ toReturn.pop_back();
+ }
+ toReturn.emplace_back(
+ range,
+ (double)bytesReadSample.getEstimate(range) / std::max(baseChunkSize, byteSample.getEstimate(range)),
+ bytesReadSample.getEstimate(range) / SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL);
+ }
+ beginKey = *endKey;
+ endKey = byteSample.sample.index(byteSample.sample.sumTo(byteSample.sample.lower_bound(beginKey)) +
+ baseChunkSize);
+ }
+ return toReturn;
+ }
+
+ void getReadHotRanges(ReadHotSubRangeRequest req) const {
+ ReadHotSubRangeReply reply;
+ auto _ranges = getReadHotRanges(req.keys,
+ SERVER_KNOBS->SHARD_MAX_READ_DENSITY_RATIO,
+ SERVER_KNOBS->READ_HOT_SUB_RANGE_CHUNK_SIZE,
+ SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS);
+ reply.readHotRanges = VectorRef(_ranges.data(), _ranges.size());
+ req.reply.send(reply);
+ }
+
+ std::vector<KeyRef> getSplitPoints(KeyRangeRef range, int64_t chunkSize, Optional<Key> prefixToRemove) {
+ std::vector<KeyRef> toReturn;
+ KeyRef beginKey = range.begin;
+ IndexedSet<Key, int64_t>::iterator endKey =
+ byteSample.sample.index(byteSample.sample.sumTo(byteSample.sample.lower_bound(beginKey)) + chunkSize);
+ while (endKey != byteSample.sample.end()) {
+ if (*endKey > range.end) {
+ break;
+ }
+ if (*endKey == beginKey) {
+ ++endKey;
+ continue;
+ }
+ KeyRef splitPoint = *endKey;
+ if (prefixToRemove.present()) {
+ splitPoint = splitPoint.removePrefix(prefixToRemove.get());
+ }
+ toReturn.push_back(splitPoint);
+ beginKey = *endKey;
+ endKey =
+ byteSample.sample.index(byteSample.sample.sumTo(byteSample.sample.lower_bound(beginKey)) + chunkSize);
+ }
+ return toReturn;
+ }
+
+ void getSplitPoints(SplitRangeRequest req, Optional<Key> prefix) {
+ SplitRangeReply reply;
+ KeyRangeRef range = req.keys;
+ if (prefix.present()) {
+ range = range.withPrefix(prefix.get(), req.arena);
+ }
+ std::vector<KeyRef> points = getSplitPoints(range, req.chunkSize, prefix);
+
+ reply.splitPoints.append_deep(reply.splitPoints.arena(), points.data(), points.size());
+ req.reply.send(reply);
+ }
+
+private:
+ static void collapse(KeyRangeMap<int>& map, KeyRef const& key) {
+ auto range = map.rangeContaining(key);
+ if (range == map.ranges().begin() || range == map.ranges().end())
+ return;
+ int value = range->value();
+ auto prev = range;
+ --prev;
+ if (prev->value() != value)
+ return;
+ KeyRange keys = KeyRangeRef(prev->begin(), range->end());
+ map.insert(keys, value);
+ }
+
+ static void add(KeyRangeMap<int>& map, KeyRangeRef const& keys, int delta) {
+ auto rs = map.modify(keys);
+ for (auto r = rs.begin(); r != rs.end(); ++r)
+ r->value() += delta;
+ collapse(map, keys.begin);
+ collapse(map, keys.end);
+ }
+};
+
+// Contains information about whether or not a key-value pair should be included in a byte sample
+// Also contains size information about the byte sample
+struct ByteSampleInfo {
+ bool inSample;
+
+ // Actual size of the key value pair
+ int64_t size;
+
+ // The recorded size of the sample (max of bytesPerSample, size)
+ int64_t sampledSize;
+};
+
+// Determines whether a key-value pair should be included in a byte sample
+// Also returns size information about the sample
+ByteSampleInfo isKeyValueInSample(KeyValueRef keyValue);

From ad16a9344ef4e523675a27f12a3c9e383ddab740 Mon Sep 17 00:00:00 2001
From: Xiaoxi Wang
Date: Tue, 10 May 2022 13:52:46 -0700
Subject: [PATCH 175/299] catch special_keys_api_failure()

---
 fdbserver/workloads/FuzzApiCorrectness.actor.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git
a/fdbserver/workloads/FuzzApiCorrectness.actor.cpp b/fdbserver/workloads/FuzzApiCorrectness.actor.cpp index bd2abff7c3..bec0240257 100644 --- a/fdbserver/workloads/FuzzApiCorrectness.actor.cpp +++ b/fdbserver/workloads/FuzzApiCorrectness.actor.cpp @@ -901,6 +901,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { Key autoCoordinatorSpecialKey = LiteralStringRef("auto_coordinators") .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin); + KeyRangeRef actorLineageRange = SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ACTORLINEAGE); // Read this particular special key may throw timed_out Key statusJsonSpecialKey = LiteralStringRef("\xff\xff/status/json"); @@ -920,8 +921,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { error_code_timed_out, ExceptionContract::possibleIf(key1 <= statusJsonSpecialKey && statusJsonSpecialKey < key2)), std::make_pair(error_code_special_keys_api_failure, - ExceptionContract::possibleIf(key1 <= autoCoordinatorSpecialKey && - autoCoordinatorSpecialKey < key2)), + ExceptionContract::possibleIf( + (key1 <= autoCoordinatorSpecialKey && autoCoordinatorSpecialKey < key2) || + actorLineageRange.intersects(KeyRangeRef(key1, key2)))), std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), std::make_pair(error_code_tenant_not_found, ExceptionContract::possibleIf(!workload->canUseTenant(tr->getTenant()))), @@ -956,6 +958,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { Key autoCoordinatorSpecialKey = LiteralStringRef("auto_coordinators") .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin); + KeyRangeRef actorLineageRange = SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ACTORLINEAGE); Key statusJsonSpecialKey = LiteralStringRef("\xff\xff/status/json"); contract = { @@ -975,8 +978,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { error_code_timed_out, ExceptionContract::possibleIf(key1 <= statusJsonSpecialKey && statusJsonSpecialKey < key2)), std::make_pair(error_code_special_keys_api_failure, - ExceptionContract::possibleIf((key1 <= autoCoordinatorSpecialKey) && - (autoCoordinatorSpecialKey < key2))), + ExceptionContract::possibleIf( + (key1 <= autoCoordinatorSpecialKey && autoCoordinatorSpecialKey < key2) || + actorLineageRange.intersects(KeyRangeRef(key1, key2)))), std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), std::make_pair(error_code_tenant_not_found, ExceptionContract::possibleIf(!workload->canUseTenant(tr->getTenant()))), From 9f628df2787e771dd4994f9adc6ddb36d481eaff Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Tue, 10 May 2022 14:45:00 -0700 Subject: [PATCH 176/299] outputBuffer may not be null-terminated, so don't convert to std::string This fixes a heap buffer overflow caught by ASAN, and importantly not valgrind, since valgrind currently doesn't run on the first binary in a restarting test. The next commit will change that so we always run valgrind on the binary under test. 
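For illustration only (this sketch is not part of the patch, and its names are invented), the bug class in isolation: std::string's const char* constructor scans for a NUL terminator, so a buffer that a read filled exactly, with no trailing '\0', is read past its end, while std::string_view with an explicit length stays in bounds:

    #include <cstddef>
    #include <iostream>
    #include <string_view>

    int main() {
        // Simulates a read() that fills the buffer completely, leaving no NUL terminator.
        char outputBuffer[4] = { 'a', 'b', 'c', 'd' };
        std::size_t bytesRead = sizeof(outputBuffer);

        // Overflow: std::string(outputBuffer) would scan past the array looking for '\0'.
        // Bounded: the length is passed explicitly, so no terminator is required.
        std::cout << std::string_view(outputBuffer, bytesRead) << "\n";
    }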
--- fdbserver/FDBExecHelper.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/FDBExecHelper.actor.cpp b/fdbserver/FDBExecHelper.actor.cpp index fa93d91b00..ff37fc85da 100644 --- a/fdbserver/FDBExecHelper.actor.cpp +++ b/fdbserver/FDBExecHelper.actor.cpp @@ -268,7 +268,7 @@ static auto fork_child(const std::string& path, std::vector& paramList) { static void setupTraceWithOutput(TraceEvent& event, size_t bytesRead, char* outputBuffer) { // get some errors printed for spawned process std::cout << "Output bytesRead: " << bytesRead << std::endl; - std::cout << "output buffer: " << std::string(outputBuffer) << std::endl; + std::cout << "output buffer: " << std::string_view(outputBuffer, bytesRead) << std::endl; if (bytesRead == 0) return; ASSERT(bytesRead <= SERVER_KNOBS->MAX_FORKED_PROCESS_OUTPUT); From 17140a264587f4fd466c1fc17ae7912fd9782342 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Tue, 10 May 2022 14:46:22 -0700 Subject: [PATCH 177/299] Always run valgrind on the binary under test (if valgrind is enabled) --- contrib/TestHarness/Program.cs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/contrib/TestHarness/Program.cs b/contrib/TestHarness/Program.cs index f4fe7476d4..0b2bbc1127 100644 --- a/contrib/TestHarness/Program.cs +++ b/contrib/TestHarness/Program.cs @@ -376,11 +376,13 @@ namespace SummarizeTest bool useNewPlugin = (oldServerName == fdbserverName) || versionGreaterThanOrEqual(oldServerName.Split('-').Last(), "5.2.0"); bool useToml = File.Exists(testFile + "-1.toml"); string testFile1 = useToml ? testFile + "-1.toml" : testFile + "-1.txt"; - result = RunTest(firstServerName, useNewPlugin ? tlsPluginFile : tlsPluginFile_5_1, summaryFileName, errorFileName, seed, buggify, testFile1, runDir, uid, expectedUnseed, out unseed, out retryableError, logOnRetryableError, useValgrind, false, true, oldServerName, traceToStdout, noSim, faultInjectionEnabled); + bool useValgrindRunOne = useValgrind && firstServerName == fdbserverName; + bool useValgrindRunTwo = useValgrind && secondServerName == fdbserverName; + result = RunTest(firstServerName, useNewPlugin ? tlsPluginFile : tlsPluginFile_5_1, summaryFileName, errorFileName, seed, buggify, testFile1, runDir, uid, expectedUnseed, out unseed, out retryableError, logOnRetryableError, useValgrindRunOne, false, true, oldServerName, traceToStdout, noSim, faultInjectionEnabled); if (result == 0) { string testFile2 = useToml ? testFile + "-2.toml" : testFile + "-2.txt"; - result = RunTest(secondServerName, tlsPluginFile, summaryFileName, errorFileName, seed+1, buggify, testFile2, runDir, uid, expectedUnseed, out unseed, out retryableError, logOnRetryableError, useValgrind, true, false, oldServerName, traceToStdout, noSim, faultInjectionEnabled); + result = RunTest(secondServerName, tlsPluginFile, summaryFileName, errorFileName, seed+1, buggify, testFile2, runDir, uid, expectedUnseed, out unseed, out retryableError, logOnRetryableError, useValgrindRunTwo, true, false, oldServerName, traceToStdout, noSim, faultInjectionEnabled); } } else @@ -458,7 +460,7 @@ namespace SummarizeTest role, IsRunningOnMono() ? "" : "-q", seed, testFile, buggify ? "on" : "off", faultInjectionArg, tlsPluginArg); } if (restarting) args = args + " --restarting"; - if (useValgrind && !willRestart) + if (useValgrind) { valgrindOutputFile = string.Format("valgrind-{0}.xml", seed); process.StartInfo.FileName = "valgrind"; From 0597ae751392c439872fbe3de076ae84b6194e24 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Tue, 10 May 2022 15:29:36 -0700 Subject: [PATCH 178/299] When an incompatible connection is closed, clear the state that prevents us from sending messages to it --- fdbrpc/FlowTransport.actor.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index ad737d3be4..f62b73be9b 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -816,6 +816,9 @@ ACTOR Future connectionKeeper(Reference self, .errorUnsuppressed(e) .suppressFor(1.0) .detail("PeerAddr", self->destination); + + // Since the connection has closed, we need to check the protocol version the next time we connect + self->incompatibleProtocolVersionNewer = false; } if (self->destination.isPublic() && From b9b8b2d052698609999ad65913f59dccfd7eae7b Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Tue, 10 May 2022 14:18:28 -0700 Subject: [PATCH 179/299] Minor cleanup in StorageMetrics.h --- fdbserver/StorageMetrics.actor.cpp | 6 ----- fdbserver/StorageMetrics.h | 41 +++++++++++++++++------------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/fdbserver/StorageMetrics.actor.cpp b/fdbserver/StorageMetrics.actor.cpp index 93cadb0542..0d854434d5 100644 --- a/fdbserver/StorageMetrics.actor.cpp +++ b/fdbserver/StorageMetrics.actor.cpp @@ -18,12 +18,6 @@ * limitations under the License. */ -#include "fdbclient/FDBTypes.h" -#include "fdbrpc/simulator.h" -#include "flow/UnitTest.h" -#include "fdbclient/StorageServerInterface.h" -#include "fdbclient/KeyRangeMap.h" -#include "fdbserver/Knobs.h" #include "fdbserver/StorageMetrics.h" #include "flow/actorcompiler.h" // This must be the last #include. diff --git a/fdbserver/StorageMetrics.h b/fdbserver/StorageMetrics.h index 17ad106e1d..35da5e14f5 100644 --- a/fdbserver/StorageMetrics.h +++ b/fdbserver/StorageMetrics.h @@ -20,24 +20,31 @@ #pragma once -const StringRef STORAGESERVER_HISTOGRAM_GROUP = LiteralStringRef("StorageServer"); -const StringRef FETCH_KEYS_LATENCY_HISTOGRAM = LiteralStringRef("FetchKeysLatency"); -const StringRef FETCH_KEYS_BYTES_HISTOGRAM = LiteralStringRef("FetchKeysSize"); -const StringRef FETCH_KEYS_BYTES_PER_SECOND_HISTOGRAM = LiteralStringRef("FetchKeysBandwidth"); -const StringRef TLOG_CURSOR_READS_LATENCY_HISTOGRAM = LiteralStringRef("TLogCursorReadsLatency"); -const StringRef SS_VERSION_LOCK_LATENCY_HISTOGRAM = LiteralStringRef("SSVersionLockLatency"); -const StringRef EAGER_READS_LATENCY_HISTOGRAM = LiteralStringRef("EagerReadsLatency"); -const StringRef FETCH_KEYS_PTREE_UPDATES_LATENCY_HISTOGRAM = LiteralStringRef("FetchKeysPTreeUpdatesLatency"); -const StringRef TLOG_MSGS_PTREE_UPDATES_LATENCY_HISTOGRAM = LiteralStringRef("TLogMsgsPTreeUpdatesLatency"); -const StringRef STORAGE_UPDATES_DURABLE_LATENCY_HISTOGRAM = LiteralStringRef("StorageUpdatesDurableLatency"); -const StringRef STORAGE_COMMIT_LATENCY_HISTOGRAM = LiteralStringRef("StorageCommitLatency"); -const StringRef SS_DURABLE_VERSION_UPDATE_LATENCY_HISTOGRAM = LiteralStringRef("SSDurableVersionUpdateLatency"); +#include "fdbclient/FDBTypes.h" +#include "fdbrpc/simulator.h" +#include "flow/UnitTest.h" +#include "fdbclient/StorageServerInterface.h" +#include "fdbclient/KeyRangeMap.h" +#include "fdbserver/Knobs.h" + +const StringRef STORAGESERVER_HISTOGRAM_GROUP = "StorageServer"_sr; +const StringRef FETCH_KEYS_LATENCY_HISTOGRAM = "FetchKeysLatency"_sr; +const StringRef FETCH_KEYS_BYTES_HISTOGRAM = "FetchKeysSize"_sr; +const StringRef FETCH_KEYS_BYTES_PER_SECOND_HISTOGRAM = 
"FetchKeysBandwidth"_sr; +const StringRef TLOG_CURSOR_READS_LATENCY_HISTOGRAM = "TLogCursorReadsLatency"_sr; +const StringRef SS_VERSION_LOCK_LATENCY_HISTOGRAM = "SSVersionLockLatency"_sr; +const StringRef EAGER_READS_LATENCY_HISTOGRAM = "EagerReadsLatency"_sr; +const StringRef FETCH_KEYS_PTREE_UPDATES_LATENCY_HISTOGRAM = "FetchKeysPTreeUpdatesLatency"_sr; +const StringRef TLOG_MSGS_PTREE_UPDATES_LATENCY_HISTOGRAM = "TLogMsgsPTreeUpdatesLatency"_sr; +const StringRef STORAGE_UPDATES_DURABLE_LATENCY_HISTOGRAM = "StorageUpdatesDurableLatency"_sr; +const StringRef STORAGE_COMMIT_LATENCY_HISTOGRAM = "StorageCommitLatency"_sr; +const StringRef SS_DURABLE_VERSION_UPDATE_LATENCY_HISTOGRAM = "SSDurableVersionUpdateLatency"_sr; struct StorageMetricSample { IndexedSet sample; int64_t metricUnitsPerSample; - StorageMetricSample(int64_t metricUnitsPerSample) : metricUnitsPerSample(metricUnitsPerSample) {} + explicit StorageMetricSample(int64_t metricUnitsPerSample) : metricUnitsPerSample(metricUnitsPerSample) {} int64_t getEstimate(KeyRangeRef keys) const { return sample.sumRange(keys.begin, keys.end); } KeyRef splitEstimate(KeyRangeRef range, int64_t offset, bool front = true) const { @@ -89,7 +96,7 @@ struct StorageMetricSample { struct TransientStorageMetricSample : StorageMetricSample { Deque>> queue; - TransientStorageMetricSample(int64_t metricUnitsPerSample) : StorageMetricSample(metricUnitsPerSample) {} + explicit TransientStorageMetricSample(int64_t metricUnitsPerSample) : StorageMetricSample(metricUnitsPerSample) {} // Returns the sampled metric value (possibly 0, possibly increased by the sampling factor) int64_t addAndExpire(KeyRef key, int64_t metric, double expiration) { @@ -530,10 +537,10 @@ struct StorageServerMetrics { req.reply.send(reply); } - std::vector getSplitPoints(KeyRangeRef range, int64_t chunkSize, Optional prefixToRemove) { + std::vector getSplitPoints(KeyRangeRef range, int64_t chunkSize, Optional prefixToRemove) const { std::vector toReturn; KeyRef beginKey = range.begin; - IndexedSet::iterator endKey = + IndexedSet::const_iterator endKey = byteSample.sample.index(byteSample.sample.sumTo(byteSample.sample.lower_bound(beginKey)) + chunkSize); while (endKey != byteSample.sample.end()) { if (*endKey > range.end) { @@ -555,7 +562,7 @@ struct StorageServerMetrics { return toReturn; } - void getSplitPoints(SplitRangeRequest req, Optional prefix) { + void getSplitPoints(SplitRangeRequest req, Optional prefix) const { SplitRangeReply reply; KeyRangeRef range = req.keys; if (prefix.present()) { From b7fd093ed0e0de238e5212ef14b4db66e95b247b Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Tue, 10 May 2022 19:43:57 -0700 Subject: [PATCH 180/299] Move UnitTests.h include --- fdbserver/StorageMetrics.actor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/fdbserver/StorageMetrics.actor.cpp b/fdbserver/StorageMetrics.actor.cpp index 0d854434d5..d559380f16 100644 --- a/fdbserver/StorageMetrics.actor.cpp +++ b/fdbserver/StorageMetrics.actor.cpp @@ -18,6 +18,7 @@ * limitations under the License. */ +#include "flow/UnitTest.h" #include "fdbserver/StorageMetrics.h" #include "flow/actorcompiler.h" // This must be the last #include. 
From e5de8ba2605c649ed62ce768b30abf433c4f8ed6 Mon Sep 17 00:00:00 2001 From: Junhyun Shim Date: Wed, 11 May 2022 11:14:08 +0200 Subject: [PATCH 181/299] Add MkCert command line tool --- flow/CMakeLists.txt | 3 + flow/MkCertCli.cpp | 286 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 289 insertions(+) create mode 100644 flow/MkCertCli.cpp diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index 906f05f085..f9314aa2c8 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -211,3 +211,6 @@ if(USE_SANITIZER) else() target_link_libraries(tls_poc PUBLIC fmt::fmt flow boost_target) endif() + +add_executable(mkcert MkCertCli.cpp) +target_link_libraries(mkcert PUBLIC fmt::fmt flow) diff --git a/flow/MkCertCli.cpp b/flow/MkCertCli.cpp new file mode 100644 index 0000000000..9f7ed4a1d2 --- /dev/null +++ b/flow/MkCertCli.cpp @@ -0,0 +1,286 @@ +/* + * MkCertCli.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include "flow/Arena.h" +#include "flow/Error.h" +#include "flow/MkCert.h" +#include "flow/SimpleOpt.h" +#include "flow/Platform.h" +#include "flow/network.h" +#include "flow/TLSConfig.actor.h" +#include "flow/Trace.h" + +enum EMkCertOpt : int { + OPT_HELP, + OPT_SERVER_CHAIN_LEN, + OPT_CLIENT_CHAIN_LEN, + OPT_SERVER_CERT_FILE, + OPT_SERVER_KEY_FILE, + OPT_SERVER_CA_FILE, + OPT_CLIENT_CERT_FILE, + OPT_CLIENT_KEY_FILE, + OPT_CLIENT_CA_FILE, + OPT_EXPIRE_SERVER_CERT, + OPT_EXPIRE_CLIENT_CERT, +}; + +CSimpleOpt::SOption gOptions[] = { { OPT_HELP, "--help", SO_NONE }, + { OPT_HELP, "-h", SO_NONE }, + { OPT_SERVER_CHAIN_LEN, "--server-chain-length", SO_REQ_SEP }, + { OPT_SERVER_CHAIN_LEN, "-S", SO_REQ_SEP }, + { OPT_CLIENT_CHAIN_LEN, "--client-chain-length", SO_REQ_SEP }, + { OPT_CLIENT_CHAIN_LEN, "-C", SO_REQ_SEP }, + { OPT_SERVER_CERT_FILE, "--server-cert-file", SO_REQ_SEP }, + { OPT_SERVER_KEY_FILE, "--server-key-file", SO_REQ_SEP }, + { OPT_SERVER_CA_FILE, "--server-ca-file", SO_REQ_SEP }, + { OPT_CLIENT_CERT_FILE, "--client-cert-file", SO_REQ_SEP }, + { OPT_CLIENT_KEY_FILE, "--client-key-file", SO_REQ_SEP }, + { OPT_CLIENT_CA_FILE, "--client-ca-file", SO_REQ_SEP }, + { OPT_EXPIRE_SERVER_CERT, "--expire-server-cert", SO_NONE }, + { OPT_EXPIRE_CLIENT_CERT, "--expire-client-cert", SO_NONE }, + SO_END_OF_OPTIONS }; + +template +void printOptionUsage(std::string_view option, std::string_view(&&optionDescLines)[Len]) { + constexpr std::string_view optionIndent{ " " }; + constexpr std::string_view descIndent{ " " }; + fmt::print(stdout, "{}{}\n", optionIndent, option); + for (auto descLine : optionDescLines) + fmt::print(stdout, "{}{}\n", descIndent, descLine); + fmt::print("\n"); +} + +void printUsage(std::string_view binary) { + fmt::print(stdout, + "mkcert: FDB test certificate chain generator\n" + "Usage: {} [OPTIONS...]\n\n", + binary); + 
printOptionUsage("--server-chain-length LENGTH, -S LENGTH (default: 3)", + { "Length of server certificate chain including root CA certificate." }); + printOptionUsage("--client-chain-length LENGTH, -C LENGTH (default: 2)", + { "Length of client certificate chain including root CA certificate.", + "Use zero-length to test to setup untrusted clients." }); + printOptionUsage("--server-cert-file PATH (default: 'server_cert.pem')", + { "Output filename for server certificate chain excluding root CA.", + "Intended for SERVERS to use as 'tls_certificate_file'.", + "Certificates are concatenated in leaf-to-CA order." }); + printOptionUsage("--server-key-file PATH (default: 'server_key.pem')", + { "Output filename for server private key matching its leaf certificate.", + "Intended for SERVERS to use as 'tls_key_file'" }); + printOptionUsage("--server-ca-file PATH (default: 'server_ca.pem')", + { "Output filename for server's root CA certificate.", + "Content same as '--server-cert-file' for '--server-chain-length' == 1.", + "Intended for CLIENTS to use as 'tls_ca_file': i.e. cert issuer to trust" }); + printOptionUsage("--client-cert-file PATH (default: 'client_cert.pem')", + { "Output filename for client certificate chain excluding root CA.", + "Intended for CLIENTS to use as 'tls_certificate_file'.", + "Certificates are concatenated in leaf-to-CA order." }); + printOptionUsage("--client-key-file PATH (default: 'client_key.pem')", + { "Output filename for client private key matching its leaf certificate.", + "Intended for CLIENTS to use as 'tls_key_file'" }); + printOptionUsage("--client-ca-file PATH (default: 'client_ca.pem')", + { "Output filename for client's root CA certificate.", + "Content same as '--client-cert-file' for '--client-chain-length' == 1.", + "Intended for SERVERS to use as 'tls_ca_file': i.e. cert issuer to trust" }); + printOptionUsage("--expire-server-cert", { "Deliberately expire server's leaf certificate for testing." }); + printOptionUsage("--expire-client-cert", { "Deliberately expire client's leaf certificate for testing." 
}); +} + +int main(int argc, char** argv) { + auto serverChainLen = 3; + auto clientChainLen = 2; + auto serverCertFile = std::string("server_cert.pem"); + auto serverKeyFile = std::string("server_key.pem"); + auto serverCaFile = std::string("server_ca.pem"); + auto clientCertFile = std::string("client_cert.pem"); + auto clientKeyFile = std::string("client_key.pem"); + auto clientCaFile = std::string("client_ca.pem"); + auto expireServerCert = false; + auto expireClientCert = false; + auto args = CSimpleOpt(argc, argv, gOptions, SO_O_EXACT | SO_O_HYPHEN_TO_UNDERSCORE); + while (args.Next()) { + if (auto err = args.LastError()) { + switch (err) { + case SO_ARG_INVALID_DATA: + fmt::print(stderr, "ERROR: invalid argument to option '{}'\n", args.OptionText()); + return FDB_EXIT_ERROR; + case SO_ARG_INVALID: + fmt::print(stderr, "ERROR: argument given to no-argument option '{}'\n", args.OptionText()); + return FDB_EXIT_ERROR; + case SO_ARG_MISSING: + fmt::print(stderr, "ERROR: argument missing for option '{}'\n", args.OptionText()); + return FDB_EXIT_ERROR; + case SO_OPT_INVALID: + fmt::print(stderr, "ERROR: unknown option '{}'\n", args.OptionText()); + return FDB_EXIT_ERROR; + default: + fmt::print(stderr, "ERROR: unknown error {} with option '{}'\n", err, args.OptionText()); + return FDB_EXIT_ERROR; + } + } else { + auto const optId = args.OptionId(); + switch (optId) { + case OPT_HELP: + printUsage(argv[0]); + return FDB_EXIT_SUCCESS; + case OPT_SERVER_CHAIN_LEN: + try { + serverChainLen = std::stoi(args.OptionArg()); + } catch (std::exception const& ex) { + fmt::print(stderr, "ERROR: Invalid chain length ({})\n", ex.what()); + return FDB_EXIT_ERROR; + } + break; + case OPT_CLIENT_CHAIN_LEN: + try { + clientChainLen = std::stoi(args.OptionArg()); + } catch (std::exception const& ex) { + fmt::print(stderr, "ERROR: Invalid chain length ({})\n", ex.what()); + return FDB_EXIT_ERROR; + } + break; + case OPT_SERVER_CERT_FILE: + serverCertFile.assign(args.OptionArg()); + break; + case OPT_SERVER_KEY_FILE: + serverKeyFile.assign(args.OptionArg()); + break; + case OPT_SERVER_CA_FILE: + serverCaFile.assign(args.OptionArg()); + break; + case OPT_CLIENT_CERT_FILE: + clientCertFile.assign(args.OptionArg()); + break; + case OPT_CLIENT_KEY_FILE: + clientKeyFile.assign(args.OptionArg()); + break; + case OPT_CLIENT_CA_FILE: + clientCaFile.assign(args.OptionArg()); + break; + case OPT_EXPIRE_SERVER_CERT: + expireServerCert = true; + break; + case OPT_EXPIRE_CLIENT_CERT: + expireClientCert = true; + break; + default: + fmt::print(stderr, "ERROR: Unknown option {}\n", args.OptionText()); + return FDB_EXIT_ERROR; + } + } + } + // Need to involve flow for the TraceEvent. 
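+	// The calls below bring up just enough of flow for tracing to work:
+	// platformInit() and Error::init() prepare the runtime, newNet2() creates
+	// g_network, and openTraceFile() gives TraceEvent somewhere to write.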
+ try { + platformInit(); + Error::init(); + g_network = newNet2(TLSConfig()); + TraceEvent::setNetworkThread(); + openTraceFile(NetworkAddress(), 10 << 20, 10 << 20, ".", "mkcert"); + + serverCertFile = abspath(serverCertFile); + serverKeyFile = abspath(serverKeyFile); + serverCaFile = abspath(serverCaFile); + clientCertFile = abspath(clientCertFile); + clientKeyFile = abspath(clientKeyFile); + clientCaFile = abspath(clientCaFile); + fmt::print("Server certificate chain length: {}\n" + "Client certificate chain length: {}\n" + "Server certificate file: {}\n" + "Server private key file: {}\n" + "Server CA file: {}\n" + "Client certificate file: {}\n" + "Client private key file: {}\n" + "Client CA file: {}\n", + serverChainLen, + clientChainLen, + serverCertFile, + serverKeyFile, + serverCaFile, + clientCertFile, + clientKeyFile, + clientCaFile); + + using FileStream = std::ofstream; + auto ofsServerCert = FileStream(serverCertFile, std::ofstream::out | std::ofstream::trunc); + auto ofsServerKey = FileStream(serverKeyFile, std::ofstream::out | std::ofstream::trunc); + auto ofsServerCa = FileStream(serverCaFile, std::ofstream::out | std::ofstream::trunc); + auto ofsClientCert = FileStream(clientCertFile, std::ofstream::out | std::ofstream::trunc); + auto ofsClientKey = FileStream(clientKeyFile, std::ofstream::out | std::ofstream::trunc); + auto ofsClientCa = FileStream(clientCaFile, std::ofstream::out | std::ofstream::trunc); + if (serverChainLen) { + auto arena = Arena(); + auto specs = mkcert::makeCertChainSpec(arena, std::abs(serverChainLen), mkcert::ESide::Server); + if (expireServerCert) { + specs[0].offsetNotBefore = -60l * 60 * 24 * 365; + specs[0].offsetNotAfter = -10l; + } + auto serverChain = mkcert::makeCertChain(arena, specs, {} /*generate root CA*/); + auto serverCa = serverChain.back().certPem; + ofsServerCa.write(reinterpret_cast(serverCa.begin()), serverCa.size()); + if (serverChain.size() > 1) + serverChain.pop_back(); + auto serverCert = mkcert::concatCertChain(arena, serverChain); + ofsServerCert.write(reinterpret_cast(serverCert.begin()), serverCert.size()); + auto serverKey = serverChain[0].privateKeyPem; + ofsServerKey.write(reinterpret_cast(serverKey.begin()), serverKey.size()); + } + ofsServerCert.close(); + ofsServerKey.close(); + ofsServerCa.close(); + if (clientChainLen) { + auto arena = Arena(); + auto specs = mkcert::makeCertChainSpec(arena, std::abs(serverChainLen), mkcert::ESide::Server); + if (expireClientCert) { + specs[0].offsetNotBefore = -60l * 60 * 24 * 365; + specs[0].offsetNotAfter = -10l; + } + auto serverChain = mkcert::makeCertChain(arena, specs, {} /*generate root CA*/); + auto serverCa = serverChain.back().certPem; + ofsServerCa.write(reinterpret_cast(serverCa.begin()), serverCa.size()); + if (serverChain.size() > 1) + serverChain.pop_back(); + auto serverCert = mkcert::concatCertChain(arena, serverChain); + ofsServerCert.write(reinterpret_cast(serverCert.begin()), serverCert.size()); + auto serverKey = serverChain[0].privateKeyPem; + ofsServerKey.write(reinterpret_cast(serverKey.begin()), serverKey.size()); + } + ofsClientCert.close(); + ofsClientKey.close(); + ofsClientCa.close(); + auto thread = std::thread([]() { g_network->run(); }); + flushTraceFileVoid(); + g_network->stop(); + thread.join(); + return FDB_EXIT_SUCCESS; + } catch (const Error& e) { + fmt::print(stderr, "ERROR: {}\n", e.name()); + TraceEvent(SevError, "MainError").error(e); + return FDB_EXIT_MAIN_ERROR; + } catch (const std::exception& e) { + fmt::print(stderr, 
"std::exception: {}\n", e.what()); + TraceEvent(SevError, "MainError").error(unknown_error()).detail("RootException", e.what()); + return FDB_EXIT_MAIN_EXCEPTION; + } +} From 8789232df44f6bc1e0e8de13e722d7b33b674315 Mon Sep 17 00:00:00 2001 From: Junhyun Shim Date: Wed, 11 May 2022 11:46:08 +0200 Subject: [PATCH 182/299] Add ScopeExit to flow and remove scattered impls --- fdbrpc/TokenSign.cpp | 87 +++++++------------------------------------- flow/CMakeLists.txt | 1 + flow/MkCert.cpp | 31 ++++++---------- flow/MkCertCli.cpp | 31 ++++++++++++---- flow/ScopeExit.h | 36 ++++++++++++++++++ 5 files changed, 86 insertions(+), 100 deletions(-) create mode 100644 flow/ScopeExit.h diff --git a/fdbrpc/TokenSign.cpp b/fdbrpc/TokenSign.cpp index fc9396befb..1b2629f5be 100644 --- a/fdbrpc/TokenSign.cpp +++ b/fdbrpc/TokenSign.cpp @@ -24,7 +24,9 @@ #include "flow/Arena.h" #include "flow/Error.h" #include "flow/IRandom.h" +#include "flow/MkCert.h" #include "flow/Platform.h" +#include "flow/ScopeExit.h" #include "flow/Trace.h" #include "flow/UnitTest.h" #include @@ -35,16 +37,6 @@ namespace { -template -class ExitGuard { - std::decay_t fn; - -public: - ExitGuard(Func&& fn) : fn(std::forward(fn)) {} - - ~ExitGuard() { fn(); } -}; - [[noreturn]] void traceAndThrow(const char* type) { auto te = TraceEvent(SevWarnAlways, type); te.suppressFor(60); @@ -53,68 +45,16 @@ public: 0, }; ::ERR_error_string_n(err, buf, sizeof(buf)); - te.detail("OpenSSLError", buf); + te.detail("OpenSSLError", static_cast(buf)); } throw digital_signature_ops_error(); } -struct KeyPairRef { - StringRef privateKey; - StringRef publicKey; -}; - -Standalone generateEcdsaKeyPair() { - auto params = std::add_pointer_t(); - { - auto pctx = ::EVP_PKEY_CTX_new_id(EVP_PKEY_EC, nullptr); - ASSERT(pctx); - auto ctxGuard = ExitGuard([pctx]() { ::EVP_PKEY_CTX_free(pctx); }); - ASSERT_LT(0, ::EVP_PKEY_paramgen_init(pctx)); - ASSERT_LT(0, ::EVP_PKEY_CTX_set_ec_paramgen_curve_nid(pctx, NID_X9_62_prime256v1)); - ASSERT_LT(0, ::EVP_PKEY_paramgen(pctx, ¶ms)); - ASSERT(params); - } - auto paramsGuard = ExitGuard([params]() { ::EVP_PKEY_free(params); }); - // keygen - auto kctx = ::EVP_PKEY_CTX_new(params, nullptr); - ASSERT(kctx); - auto kctxGuard = ExitGuard([kctx]() { ::EVP_PKEY_CTX_free(kctx); }); - auto key = std::add_pointer_t(); - { - ASSERT_LT(0, ::EVP_PKEY_keygen_init(kctx)); - ASSERT_LT(0, ::EVP_PKEY_keygen(kctx, &key)); - } - ASSERT(key); - auto keyGuard = ExitGuard([key]() { ::EVP_PKEY_free(key); }); - - auto ret = Standalone{}; - auto& arena = ret.arena(); - { - auto len = 0; - len = ::i2d_PrivateKey(key, nullptr); - ASSERT_LT(0, len); - auto buf = new (arena) uint8_t[len]; - auto out = std::add_pointer_t(buf); - len = ::i2d_PrivateKey(key, &out); - ret.privateKey = StringRef(buf, len); - } - { - auto len = 0; - len = ::i2d_PUBKEY(key, nullptr); - ASSERT_LT(0, len); - auto buf = new (arena) uint8_t[len]; - auto out = std::add_pointer_t(buf); - len = ::i2d_PUBKEY(key, &out); - ret.publicKey = StringRef(buf, len); - } - return ret; -} - } // namespace Standalone signToken(AuthTokenRef token, StringRef keyName, StringRef privateKeyDer) { auto ret = Standalone{}; - auto arena = ret.arena(); + auto& arena = ret.arena(); auto writer = ObjectWriter([&arena](size_t len) { return new (arena) uint8_t[len]; }, IncludeVersion()); writer.serialize(token); auto tokenStr = writer.toStringRef(); @@ -124,11 +64,11 @@ Standalone signToken(AuthTokenRef token, StringRef keyName, if (!key) { traceAndThrow("SignTokenBadKey"); } - auto keyGuard = ExitGuard([key]() { 
::EVP_PKEY_free(key); }); + auto keyGuard = ScopeExit([key]() { ::EVP_PKEY_free(key); }); auto mdctx = ::EVP_MD_CTX_create(); if (!mdctx) traceAndThrow("SignTokenInitFail"); - auto mdctxGuard = ExitGuard([mdctx]() { ::EVP_MD_CTX_free(mdctx); }); + auto mdctxGuard = ScopeExit([mdctx]() { ::EVP_MD_CTX_free(mdctx); }); if (1 != ::EVP_DigestSignInit(mdctx, nullptr, ::EVP_sha256() /*Parameterize?*/, nullptr, key)) traceAndThrow("SignTokenInitFail"); if (1 != ::EVP_DigestSignUpdate(mdctx, tokenStr.begin(), tokenStr.size())) @@ -150,11 +90,11 @@ bool verifyToken(SignedAuthTokenRef signedToken, StringRef publicKeyDer) { auto key = ::d2i_PUBKEY(nullptr, &rawPubKeyDer, publicKeyDer.size()); if (!key) traceAndThrow("VerifyTokenBadKey"); - auto keyGuard = ExitGuard([key]() { ::EVP_PKEY_free(key); }); + auto keyGuard = ScopeExit([key]() { ::EVP_PKEY_free(key); }); auto mdctx = ::EVP_MD_CTX_create(); if (!mdctx) traceAndThrow("VerifyTokenInitFail"); - auto mdctxGuard = ExitGuard([mdctx]() { ::EVP_MD_CTX_free(mdctx); }); + auto mdctxGuard = ScopeExit([mdctx]() { ::EVP_MD_CTX_free(mdctx); }); if (1 != ::EVP_DigestVerifyInit(mdctx, nullptr, ::EVP_sha256(), nullptr, key)) traceAndThrow("VerifyTokenInitFail"); if (1 != ::EVP_DigestVerifyUpdate(mdctx, signedToken.token.begin(), signedToken.token.size())) @@ -179,9 +119,10 @@ void forceLinkTokenSignTests() {} TEST_CASE("/fdbrpc/TokenSign") { const auto numIters = 100; for (auto i = 0; i < numIters; i++) { - auto keyPair = generateEcdsaKeyPair(); + auto kpArena = Arena(); + auto keyPair = mkcert::KeyPairRef::make(kpArena); auto token = Standalone{}; - auto arena = token.arena(); + auto& arena = token.arena(); auto& rng = *deterministicRandom(); token.expiresAt = timer_monotonic() * (0.5 + rng.random01()); if (auto setIp = rng.randomInt(0, 3)) { @@ -206,15 +147,15 @@ TEST_CASE("/fdbrpc/TokenSign") { token.tenants.push_back(arena, genRandomStringRef()); } auto keyName = genRandomStringRef(); - auto signedToken = signToken(token, keyName, keyPair.privateKey); - const auto verifyExpectOk = verifyToken(signedToken, keyPair.publicKey); + auto signedToken = signToken(token, keyName, keyPair.privateKeyDer); + const auto verifyExpectOk = verifyToken(signedToken, keyPair.publicKeyDer); ASSERT(verifyExpectOk); // try tampering with signed token by adding one more tenant token.tenants.push_back(arena, genRandomStringRef()); auto writer = ObjectWriter([&arena](size_t len) { return new (arena) uint8_t[len]; }, IncludeVersion()); writer.serialize(token); signedToken.token = writer.toStringRef(); - const auto verifyExpectFail = verifyToken(signedToken, keyPair.publicKey); + const auto verifyExpectFail = verifyToken(signedToken, keyPair.publicKeyDer); ASSERT(!verifyExpectFail); } printf("%d runs OK\n", numIters); diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index f9314aa2c8..c32bb2bb8c 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -56,6 +56,7 @@ set(FLOW_SRCS Platform.h Profiler.actor.cpp Profiler.h + ScopeExit.h SendBufferIterator.h SignalSafeUnwind.cpp SignalSafeUnwind.h diff --git a/flow/MkCert.cpp b/flow/MkCert.cpp index 9298162d18..30a5f9f9aa 100644 --- a/flow/MkCert.cpp +++ b/flow/MkCert.cpp @@ -21,6 +21,7 @@ #include "flow/Arena.h" #include "flow/IRandom.h" #include "flow/MkCert.h" +#include "flow/ScopeExit.h" #include #include @@ -34,16 +35,6 @@ namespace { -template -class ExitGuard { - std::decay_t fn; - -public: - ExitGuard(Func&& fn) : fn(std::forward(fn)) {} - - ~ExitGuard() { fn(); } -}; - [[noreturn]] void traceAndThrow(const char* 
condition, const char* file, int line) { auto te = TraceEvent(SevWarnAlways, "ErrorTLSKeyOrCertGen"); te.suppressFor(60).detail("File", file).detail("Line", line).detail("Condition", condition); @@ -118,7 +109,7 @@ void printPrivateKey(FILE* out, StringRef privateKeyPem) { auto key = readPrivateKeyPem(privateKeyPem); auto bio = ::BIO_new_fp(out, BIO_NOCLOSE); OSSL_ASSERT(bio); - auto bioGuard = ExitGuard([bio]() { ::BIO_free(bio); }); + auto bioGuard = ScopeExit([bio]() { ::BIO_free(bio); }); OSSL_ASSERT(0 < ::EVP_PKEY_print_private(bio, key.get(), 0, nullptr)); } @@ -127,17 +118,17 @@ std::shared_ptr makeEllipticCurveKeyPairNative() { { auto pctx = ::EVP_PKEY_CTX_new_id(EVP_PKEY_EC, nullptr); OSSL_ASSERT(pctx); - auto ctxGuard = ExitGuard([pctx]() { ::EVP_PKEY_CTX_free(pctx); }); + auto ctxGuard = ScopeExit([pctx]() { ::EVP_PKEY_CTX_free(pctx); }); OSSL_ASSERT(0 < ::EVP_PKEY_paramgen_init(pctx)); OSSL_ASSERT(0 < ::EVP_PKEY_CTX_set_ec_paramgen_curve_nid(pctx, NID_X9_62_prime256v1)); OSSL_ASSERT(0 < ::EVP_PKEY_paramgen(pctx, ¶ms)); OSSL_ASSERT(params); } - auto paramsGuard = ExitGuard([params]() { ::EVP_PKEY_free(params); }); + auto paramsGuard = ScopeExit([params]() { ::EVP_PKEY_free(params); }); // keygen auto kctx = ::EVP_PKEY_CTX_new(params, nullptr); OSSL_ASSERT(kctx); - auto kctxGuard = ExitGuard([kctx]() { ::EVP_PKEY_CTX_free(kctx); }); + auto kctxGuard = ScopeExit([kctx]() { ::EVP_PKEY_CTX_free(kctx); }); auto key = std::add_pointer_t(); OSSL_ASSERT(0 < ::EVP_PKEY_keygen_init(kctx)); OSSL_ASSERT(0 < ::EVP_PKEY_keygen(kctx, &key)); @@ -149,7 +140,7 @@ std::shared_ptr readX509CertPem(StringRef x509CertPem) { ASSERT(!x509CertPem.empty()); auto bio_mem = ::BIO_new_mem_buf(x509CertPem.begin(), x509CertPem.size()); OSSL_ASSERT(bio_mem); - auto bioGuard = ExitGuard([bio_mem]() { ::BIO_free(bio_mem); }); + auto bioGuard = ScopeExit([bio_mem]() { ::BIO_free(bio_mem); }); auto ret = ::PEM_read_bio_X509(bio_mem, nullptr, nullptr, nullptr); OSSL_ASSERT(ret); return std::shared_ptr(ret, &::X509_free); @@ -159,7 +150,7 @@ std::shared_ptr readPrivateKeyPem(StringRef privateKeyPem) { ASSERT(!privateKeyPem.empty()); auto bio_mem = ::BIO_new_mem_buf(privateKeyPem.begin(), privateKeyPem.size()); OSSL_ASSERT(bio_mem); - auto bioGuard = ExitGuard([bio_mem]() { ::BIO_free(bio_mem); }); + auto bioGuard = ScopeExit([bio_mem]() { ::BIO_free(bio_mem); }); auto ret = ::PEM_read_bio_PrivateKey(bio_mem, nullptr, nullptr, nullptr); OSSL_ASSERT(ret); return std::shared_ptr(ret, &::EVP_PKEY_free); @@ -168,7 +159,7 @@ std::shared_ptr readPrivateKeyPem(StringRef privateKeyPem) { StringRef writeX509CertPem(Arena& arena, const std::shared_ptr& nativeCert) { auto mem = ::BIO_new(::BIO_s_secmem()); OSSL_ASSERT(mem); - auto memGuard = ExitGuard([mem]() { ::BIO_free(mem); }); + auto memGuard = ScopeExit([mem]() { ::BIO_free(mem); }); OSSL_ASSERT(::PEM_write_bio_X509(mem, nativeCert.get())); auto bioBuf = std::add_pointer_t{}; auto const len = ::BIO_get_mem_data(mem, &bioBuf); @@ -181,7 +172,7 @@ StringRef writeX509CertPem(Arena& arena, const std::shared_ptr& nativeCert StringRef writePrivateKeyPem(Arena& arena, const std::shared_ptr& nativePrivateKey) { auto mem = ::BIO_new(::BIO_s_secmem()); OSSL_ASSERT(mem); - auto memGuard = ExitGuard([mem]() { ::BIO_free(mem); }); + auto memGuard = ScopeExit([mem]() { ::BIO_free(mem); }); OSSL_ASSERT(::PEM_write_bio_PrivateKey(mem, nativePrivateKey.get(), nullptr, nullptr, 0, 0, nullptr)); auto bioBuf = std::add_pointer_t{}; auto const len = ::BIO_get_mem_data(mem, &bioBuf); @@ -223,7 
+214,7 @@ CertAndKeyNative makeCertNative(CertSpecRef spec, CertAndKeyNative issuer) { auto nativeKeyPair = makeEllipticCurveKeyPairNative(); auto newX = ::X509_new(); OSSL_ASSERT(newX); - auto x509Guard = ExitGuard([&newX]() { + auto x509Guard = ScopeExit([&newX]() { if (newX) ::X509_free(newX); }); @@ -262,7 +253,7 @@ CertAndKeyNative makeCertNative(CertSpecRef spec, CertAndKeyNative issuer) { auto extValue = entry.bytes.toString(); auto ext = ::X509V3_EXT_conf(nullptr, &ctx, extName.c_str(), extValue.c_str()); OSSL_ASSERT(ext); - auto extGuard = ExitGuard([ext]() { ::X509_EXTENSION_free(ext); }); + auto extGuard = ScopeExit([ext]() { ::X509_EXTENSION_free(ext); }); OSSL_ASSERT(::X509_add_ext(x, ext, -1)); } OSSL_ASSERT(::X509_sign(x, (isSelfSigned ? nativeKeyPair.get() : issuer.privateKey.get()), ::EVP_sha256())); diff --git a/flow/MkCertCli.cpp b/flow/MkCertCli.cpp index 9f7ed4a1d2..3e39a5eae3 100644 --- a/flow/MkCertCli.cpp +++ b/flow/MkCertCli.cpp @@ -26,9 +26,10 @@ #include "flow/Arena.h" #include "flow/Error.h" #include "flow/MkCert.h" -#include "flow/SimpleOpt.h" -#include "flow/Platform.h" #include "flow/network.h" +#include "flow/Platform.h" +#include "flow/ScopeExit.h" +#include "flow/SimpleOpt.h" #include "flow/TLSConfig.actor.h" #include "flow/Trace.h" @@ -196,8 +197,16 @@ int main(int argc, char** argv) { platformInit(); Error::init(); g_network = newNet2(TLSConfig()); - TraceEvent::setNetworkThread(); openTraceFile(NetworkAddress(), 10 << 20, 10 << 20, ".", "mkcert"); + auto thread = std::thread([]() { + TraceEvent::setNetworkThread(); + g_network->run(); + }); + auto cleanUpGuard = ScopeExit([&thread]() { + flushTraceFileVoid(); + g_network->stop(); + thread.join(); + }); serverCertFile = abspath(serverCertFile); serverKeyFile = abspath(serverKeyFile); @@ -223,12 +232,23 @@ int main(int argc, char** argv) { clientCaFile); using FileStream = std::ofstream; + auto checkStream = [](FileStream& fs, std::string_view filename) { + if (!fs) { + throw std::runtime_error(fmt::format("Cannot open '{}' for writing", filename)); + } + }; auto ofsServerCert = FileStream(serverCertFile, std::ofstream::out | std::ofstream::trunc); + checkStream(ofsServerCert, serverCertFile); auto ofsServerKey = FileStream(serverKeyFile, std::ofstream::out | std::ofstream::trunc); + checkStream(ofsServerKey, serverKeyFile); auto ofsServerCa = FileStream(serverCaFile, std::ofstream::out | std::ofstream::trunc); + checkStream(ofsServerCa, serverCaFile); auto ofsClientCert = FileStream(clientCertFile, std::ofstream::out | std::ofstream::trunc); + checkStream(ofsClientCert, clientCertFile); auto ofsClientKey = FileStream(clientKeyFile, std::ofstream::out | std::ofstream::trunc); + checkStream(ofsClientKey, clientKeyFile); auto ofsClientCa = FileStream(clientCaFile, std::ofstream::out | std::ofstream::trunc); + checkStream(ofsClientCa, clientCaFile); if (serverChainLen) { auto arena = Arena(); auto specs = mkcert::makeCertChainSpec(arena, std::abs(serverChainLen), mkcert::ESide::Server); @@ -269,10 +289,7 @@ int main(int argc, char** argv) { ofsClientCert.close(); ofsClientKey.close(); ofsClientCa.close(); - auto thread = std::thread([]() { g_network->run(); }); - flushTraceFileVoid(); - g_network->stop(); - thread.join(); + fmt::print("OK\n"); return FDB_EXIT_SUCCESS; } catch (const Error& e) { fmt::print(stderr, "ERROR: {}\n", e.name()); diff --git a/flow/ScopeExit.h b/flow/ScopeExit.h new file mode 100644 index 0000000000..e6d4ac936d --- /dev/null +++ b/flow/ScopeExit.h @@ -0,0 +1,36 @@ +/* + * ScopeExit.h 
+ *
+ * This source file is part of the FoundationDB open source project
+ *
+ * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FLOW_SCOPE_EXIT_H
+#define FLOW_SCOPE_EXIT_H
+#pragma once
+
+// Execute lambda as this object goes out of scope
+template <class Func>
+class ScopeExit {
+	std::decay_t<Func> fn;
+
+public:
+	ScopeExit(Func&& fn) : fn(std::forward<Func>(fn)) {}
+
+	~ScopeExit() { fn(); }
+};
+
+#endif /*FLOW_SCOPE_EXIT_H*/

From d3b966fcfdff38dac738ea5b9e21496abfd8ff08 Mon Sep 17 00:00:00 2001
From: Junhyun Shim
Date: Wed, 11 May 2022 13:51:24 +0200
Subject: [PATCH 183/299] Refine trace and console message

--- flow/MkCertCli.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/flow/MkCertCli.cpp b/flow/MkCertCli.cpp index 3e39a5eae3..558ceeda62 100644 --- a/flow/MkCertCli.cpp +++ b/flow/MkCertCli.cpp @@ -75,7 +75,7 @@ void printOptionUsage(std::string_view option, std::string_view(&&optionDescLine void printUsage(std::string_view binary) { fmt::print(stdout, - "mkcert: FDB test certificate chain generator\n" + "mkcert: FDB test certificate chain generator\n\n" "Usage: {} [OPTIONS...]\n\n", binary); printOptionUsage("--server-chain-length LENGTH, -S LENGTH (default: 3)", @@ -93,7 +93,7 @@ void printUsage(std::string_view binary) { printOptionUsage("--server-ca-file PATH (default: 'server_ca.pem')", { "Output filename for server's root CA certificate.", "Content same as '--server-cert-file' for '--server-chain-length' == 1.", - "Intended for CLIENTS to use as 'tls_ca_file': i.e. cert issuer to trust" }); + "Intended for CLIENTS to use as 'tls_ca_file': i.e. cert issuer to trust." }); printOptionUsage("--client-cert-file PATH (default: 'client_cert.pem')", { "Output filename for client certificate chain excluding root CA.", "Intended for CLIENTS to use as 'tls_certificate_file'.", @@ -104,7 +104,7 @@ void printUsage(std::string_view binary) { printOptionUsage("--client-ca-file PATH (default: 'client_ca.pem')", { "Output filename for client's root CA certificate.", "Content same as '--client-cert-file' for '--client-chain-length' == 1.", - "Intended for SERVERS to use as 'tls_ca_file': i.e. cert issuer to trust" }); + "Intended for SERVERS to use as 'tls_ca_file': i.e. cert issuer to trust." }); printOptionUsage("--expire-server-cert", { "Deliberately expire server's leaf certificate for testing." }); printOptionUsage("--expire-client-cert", { "Deliberately expire client's leaf certificate for testing."
}); } @@ -234,7 +234,7 @@ int main(int argc, char** argv) { using FileStream = std::ofstream; auto checkStream = [](FileStream& fs, std::string_view filename) { if (!fs) { - throw std::runtime_error(fmt::format("Cannot open '{}' for writing", filename)); + throw std::runtime_error(fmt::format("cannot open '{}' for writing", filename)); } }; auto ofsServerCert = FileStream(serverCertFile, std::ofstream::out | std::ofstream::trunc); @@ -292,12 +292,10 @@ int main(int argc, char** argv) { fmt::print("OK\n"); return FDB_EXIT_SUCCESS; } catch (const Error& e) { - fmt::print(stderr, "ERROR: {}\n", e.name()); - TraceEvent(SevError, "MainError").error(e); + fmt::print(stderr, "error: {}\n", e.name()); return FDB_EXIT_MAIN_ERROR; } catch (const std::exception& e) { - fmt::print(stderr, "std::exception: {}\n", e.what()); - TraceEvent(SevError, "MainError").error(unknown_error()).detail("RootException", e.what()); + fmt::print(stderr, "exception: {}\n", e.what()); return FDB_EXIT_MAIN_EXCEPTION; } } From 8c180e3e46f41a1af83e3d452f35446d17d37b3b Mon Sep 17 00:00:00 2001 From: Junhyun Shim Date: Wed, 11 May 2022 15:32:46 +0200 Subject: [PATCH 184/299] Add option to print cert chain or arguments Also do code cleanup --- flow/MkCert.cpp | 7 +- flow/MkCertCli.cpp | 224 +++++++++++++++++++++++++-------------------- 2 files changed, 129 insertions(+), 102 deletions(-) diff --git a/flow/MkCert.cpp b/flow/MkCert.cpp index 30a5f9f9aa..a0a5f78b60 100644 --- a/flow/MkCert.cpp +++ b/flow/MkCert.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -36,8 +37,10 @@ namespace { [[noreturn]] void traceAndThrow(const char* condition, const char* file, int line) { - auto te = TraceEvent(SevWarnAlways, "ErrorTLSKeyOrCertGen"); - te.suppressFor(60).detail("File", file).detail("Line", line).detail("Condition", condition); + auto te = TraceEvent(SevWarnAlways, "MkCertOrKeyError"); + auto pfile= ::strrchr(file, '/'); + pfile = pfile ? pfile + 1 : file; + te.suppressFor(5).detail("File", pfile).detail("Line", line).detail("Condition", condition); if (auto err = ::ERR_get_error()) { char buf[256]{ 0, diff --git a/flow/MkCertCli.cpp b/flow/MkCertCli.cpp index 558ceeda62..33514a034d 100644 --- a/flow/MkCertCli.cpp +++ b/flow/MkCertCli.cpp @@ -45,6 +45,9 @@ enum EMkCertOpt : int { OPT_CLIENT_CA_FILE, OPT_EXPIRE_SERVER_CERT, OPT_EXPIRE_CLIENT_CERT, + OPT_PRINT_SERVER_CERT, + OPT_PRINT_CLIENT_CERT, + OPT_PRINT_ARGUMENTS, }; CSimpleOpt::SOption gOptions[] = { { OPT_HELP, "--help", SO_NONE }, @@ -61,6 +64,9 @@ CSimpleOpt::SOption gOptions[] = { { OPT_HELP, "--help", SO_NONE }, { OPT_CLIENT_CA_FILE, "--client-ca-file", SO_REQ_SEP }, { OPT_EXPIRE_SERVER_CERT, "--expire-server-cert", SO_NONE }, { OPT_EXPIRE_CLIENT_CERT, "--expire-client-cert", SO_NONE }, + { OPT_PRINT_SERVER_CERT, "--print-server-cert", SO_NONE }, + { OPT_PRINT_CLIENT_CERT, "--print-client-cert", SO_NONE }, + { OPT_PRINT_ARGUMENTS, "--print-args", SO_NONE }, SO_END_OF_OPTIONS }; template @@ -105,21 +111,88 @@ void printUsage(std::string_view binary) { { "Output filename for client's root CA certificate.", "Content same as '--client-cert-file' for '--client-chain-length' == 1.", "Intended for SERVERS to use as 'tls_ca_file': i.e. cert issuer to trust." }); - printOptionUsage("--expire-server-cert", { "Deliberately expire server's leaf certificate for testing." }); - printOptionUsage("--expire-client-cert", { "Deliberately expire client's leaf certificate for testing." 
}); + printOptionUsage("--expire-server-cert (default: no)", + { "Deliberately expire server's leaf certificate for testing." }); + printOptionUsage("--expire-client-cert (default: no)", + { "Deliberately expire client's leaf certificate for testing." }); + printOptionUsage("--print-server-cert (default: no)", + { "Print generated server certificate chain including root in human readable form.", + "Printed certificates are in leaf-to-CA order.", + "If --print-client-cert is also used, server chain precedes client's." }); + printOptionUsage("--print-client-cert (default: no)", + { "Print generated client certificate chain including root in human readable form.", + "Printed certificates are in leaf-to-CA order.", + "If --print-server-cert is also used, server chain precedes client's." }); + printOptionUsage("--print-args (default: no)", { "Print chain generation arguments." }); +} + +struct ChainSpec { + unsigned length; + std::string certFile; + std::string keyFile; + std::string caFile; + mkcert::ESide side; + bool expireLeaf; + void transformPathToAbs() { + certFile = abspath(certFile); + keyFile = abspath(keyFile); + caFile = abspath(caFile); + } + void print() { + fmt::print(stdout, "{}-side:\n", side == mkcert::ESide::Server ? "Server" : "Client"); + fmt::print(stdout, " Chain length: {}\n", length); + fmt::print(stdout, " Certificate file: {}\n", certFile); + fmt::print(stdout, " Key file: {}\n", keyFile); + fmt::print(stdout, " CA file: {}\n", caFile); + fmt::print(stdout, " Expire cert: {}\n", expireLeaf); + } + mkcert::CertChainRef makeChain(Arena& arena); +}; + +mkcert::CertChainRef ChainSpec::makeChain(Arena& arena) { + auto checkStream = [](std::ofstream& fs, std::string_view filename) { + if (!fs) { + throw std::runtime_error(fmt::format("cannot open '{}' for writing", filename)); + } + }; + auto ofsCert = std::ofstream(certFile, std::ofstream::out | std::ofstream::trunc); + checkStream(ofsCert, certFile); + auto ofsKey = std::ofstream(keyFile, std::ofstream::out | std::ofstream::trunc); + checkStream(ofsKey, keyFile); + auto ofsCa = std::ofstream(caFile, std::ofstream::out | std::ofstream::trunc); + checkStream(ofsCa, caFile); + if (!length) + return {}; + auto specs = mkcert::makeCertChainSpec(arena, length, side); + if (expireLeaf) { + specs[0].offsetNotBefore = -60l * 60 * 24 * 365; + specs[0].offsetNotAfter = -10l; + } + auto chain = mkcert::makeCertChain(arena, specs, {} /*generate root CA*/); + auto ca = chain.back().certPem; + ofsCa.write(reinterpret_cast(ca.begin()), ca.size()); + auto chainMinusRoot = chain; + if (chainMinusRoot.size() > 1) + chainMinusRoot.pop_back(); + auto cert = mkcert::concatCertChain(arena, chainMinusRoot); + ofsCert.write(reinterpret_cast(cert.begin()), cert.size()); + auto key = chain[0].privateKeyPem; + ofsKey.write(reinterpret_cast(key.begin()), key.size()); + ofsCert.close(); + ofsKey.close(); + ofsCa.close(); + return chain; } int main(int argc, char** argv) { - auto serverChainLen = 3; - auto clientChainLen = 2; - auto serverCertFile = std::string("server_cert.pem"); - auto serverKeyFile = std::string("server_key.pem"); - auto serverCaFile = std::string("server_ca.pem"); - auto clientCertFile = std::string("client_cert.pem"); - auto clientKeyFile = std::string("client_key.pem"); - auto clientCaFile = std::string("client_ca.pem"); - auto expireServerCert = false; - auto expireClientCert = false; + // default chain specs + auto serverArgs = ChainSpec{ 3u /*length*/, "server_cert.pem", "server_key.pem", + "server_ca.pem", 
mkcert::ESide::Server, false /* expireLeaf */ }; + auto clientArgs = ChainSpec{ 2u /*length*/, "client_cert.pem", "client_key.pem", + "client_ca.pem", mkcert::ESide::Client, false /* expireLeaf */ }; + auto printServerCert = false; + auto printClientCert = false; + auto printArguments = false; auto args = CSimpleOpt(argc, argv, gOptions, SO_O_EXACT | SO_O_HYPHEN_TO_UNDERSCORE); while (args.Next()) { if (auto err = args.LastError()) { @@ -148,7 +221,7 @@ int main(int argc, char** argv) { return FDB_EXIT_SUCCESS; case OPT_SERVER_CHAIN_LEN: try { - serverChainLen = std::stoi(args.OptionArg()); + serverArgs.length = std::stoul(args.OptionArg()); } catch (std::exception const& ex) { fmt::print(stderr, "ERROR: Invalid chain length ({})\n", ex.what()); return FDB_EXIT_ERROR; @@ -156,35 +229,44 @@ int main(int argc, char** argv) { break; case OPT_CLIENT_CHAIN_LEN: try { - clientChainLen = std::stoi(args.OptionArg()); + clientArgs.length = std::stoul(args.OptionArg()); } catch (std::exception const& ex) { fmt::print(stderr, "ERROR: Invalid chain length ({})\n", ex.what()); return FDB_EXIT_ERROR; } break; case OPT_SERVER_CERT_FILE: - serverCertFile.assign(args.OptionArg()); + serverArgs.certFile.assign(args.OptionArg()); break; case OPT_SERVER_KEY_FILE: - serverKeyFile.assign(args.OptionArg()); + serverArgs.keyFile.assign(args.OptionArg()); break; case OPT_SERVER_CA_FILE: - serverCaFile.assign(args.OptionArg()); + serverArgs.caFile.assign(args.OptionArg()); break; case OPT_CLIENT_CERT_FILE: - clientCertFile.assign(args.OptionArg()); + clientArgs.certFile.assign(args.OptionArg()); break; case OPT_CLIENT_KEY_FILE: - clientKeyFile.assign(args.OptionArg()); + clientArgs.keyFile.assign(args.OptionArg()); break; case OPT_CLIENT_CA_FILE: - clientCaFile.assign(args.OptionArg()); + clientArgs.caFile.assign(args.OptionArg()); break; case OPT_EXPIRE_SERVER_CERT: - expireServerCert = true; + serverArgs.expireLeaf = true; break; case OPT_EXPIRE_CLIENT_CERT: - expireClientCert = true; + clientArgs.expireLeaf = true; + break; + case OPT_PRINT_SERVER_CERT: + printServerCert = true; + break; + case OPT_PRINT_CLIENT_CERT: + printClientCert = true; + break; + case OPT_PRINT_ARGUMENTS: + printArguments = true; break; default: fmt::print(stderr, "ERROR: Unknown option {}\n", args.OptionText()); @@ -208,88 +290,30 @@ int main(int argc, char** argv) { thread.join(); }); - serverCertFile = abspath(serverCertFile); - serverKeyFile = abspath(serverKeyFile); - serverCaFile = abspath(serverCaFile); - clientCertFile = abspath(clientCertFile); - clientKeyFile = abspath(clientKeyFile); - clientCaFile = abspath(clientCaFile); - fmt::print("Server certificate chain length: {}\n" - "Client certificate chain length: {}\n" - "Server certificate file: {}\n" - "Server private key file: {}\n" - "Server CA file: {}\n" - "Client certificate file: {}\n" - "Client private key file: {}\n" - "Client CA file: {}\n", - serverChainLen, - clientChainLen, - serverCertFile, - serverKeyFile, - serverCaFile, - clientCertFile, - clientKeyFile, - clientCaFile); + serverArgs.transformPathToAbs(); + clientArgs.transformPathToAbs(); + if (printArguments) { + serverArgs.print(); + clientArgs.print(); + } + auto arena = Arena(); + auto serverChain = serverArgs.makeChain(arena); + auto clientChain = clientArgs.makeChain(arena); - using FileStream = std::ofstream; - auto checkStream = [](FileStream& fs, std::string_view filename) { - if (!fs) { - throw std::runtime_error(fmt::format("cannot open '{}' for writing", filename)); + if (printServerCert || 
printClientCert) { + if (printServerCert) { + for (auto i = 0; i < serverChain.size(); i++) { + mkcert::printCert(stdout, serverChain[i].certPem); + } } - }; - auto ofsServerCert = FileStream(serverCertFile, std::ofstream::out | std::ofstream::trunc); - checkStream(ofsServerCert, serverCertFile); - auto ofsServerKey = FileStream(serverKeyFile, std::ofstream::out | std::ofstream::trunc); - checkStream(ofsServerKey, serverKeyFile); - auto ofsServerCa = FileStream(serverCaFile, std::ofstream::out | std::ofstream::trunc); - checkStream(ofsServerCa, serverCaFile); - auto ofsClientCert = FileStream(clientCertFile, std::ofstream::out | std::ofstream::trunc); - checkStream(ofsClientCert, clientCertFile); - auto ofsClientKey = FileStream(clientKeyFile, std::ofstream::out | std::ofstream::trunc); - checkStream(ofsClientKey, clientKeyFile); - auto ofsClientCa = FileStream(clientCaFile, std::ofstream::out | std::ofstream::trunc); - checkStream(ofsClientCa, clientCaFile); - if (serverChainLen) { - auto arena = Arena(); - auto specs = mkcert::makeCertChainSpec(arena, std::abs(serverChainLen), mkcert::ESide::Server); - if (expireServerCert) { - specs[0].offsetNotBefore = -60l * 60 * 24 * 365; - specs[0].offsetNotAfter = -10l; + if (printClientCert) { + for (auto i = 0; i < clientChain.size(); i++) { + mkcert::printCert(stdout, clientChain[i].certPem); + } } - auto serverChain = mkcert::makeCertChain(arena, specs, {} /*generate root CA*/); - auto serverCa = serverChain.back().certPem; - ofsServerCa.write(reinterpret_cast(serverCa.begin()), serverCa.size()); - if (serverChain.size() > 1) - serverChain.pop_back(); - auto serverCert = mkcert::concatCertChain(arena, serverChain); - ofsServerCert.write(reinterpret_cast(serverCert.begin()), serverCert.size()); - auto serverKey = serverChain[0].privateKeyPem; - ofsServerKey.write(reinterpret_cast(serverKey.begin()), serverKey.size()); + } else { + fmt::print("OK\n"); } - ofsServerCert.close(); - ofsServerKey.close(); - ofsServerCa.close(); - if (clientChainLen) { - auto arena = Arena(); - auto specs = mkcert::makeCertChainSpec(arena, std::abs(serverChainLen), mkcert::ESide::Server); - if (expireClientCert) { - specs[0].offsetNotBefore = -60l * 60 * 24 * 365; - specs[0].offsetNotAfter = -10l; - } - auto serverChain = mkcert::makeCertChain(arena, specs, {} /*generate root CA*/); - auto serverCa = serverChain.back().certPem; - ofsServerCa.write(reinterpret_cast(serverCa.begin()), serverCa.size()); - if (serverChain.size() > 1) - serverChain.pop_back(); - auto serverCert = mkcert::concatCertChain(arena, serverChain); - ofsServerCert.write(reinterpret_cast(serverCert.begin()), serverCert.size()); - auto serverKey = serverChain[0].privateKeyPem; - ofsServerKey.write(reinterpret_cast(serverKey.begin()), serverKey.size()); - } - ofsClientCert.close(); - ofsClientKey.close(); - ofsClientCa.close(); - fmt::print("OK\n"); return FDB_EXIT_SUCCESS; } catch (const Error& e) { fmt::print(stderr, "error: {}\n", e.name()); From 8818aedfbb1b04e59c3d6df6e40018175c357a7f Mon Sep 17 00:00:00 2001 From: Junhyun Shim Date: Wed, 11 May 2022 16:01:22 +0200 Subject: [PATCH 185/299] Remove TLS Test --- flow/CMakeLists.txt | 8 -- flow/TLSTest.cpp | 272 -------------------------------------------- 2 files changed, 280 deletions(-) delete mode 100644 flow/TLSTest.cpp diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index a6afcac782..705a98f282 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -195,13 +195,5 @@ if(APPLE) target_link_libraries(flow_sampling PRIVATE ${IO_KIT} 
${CORE_FOUNDATION}) endif() -add_executable(tls_poc TLSTest.cpp) - -if(USE_SANITIZER) - target_link_libraries(tls_poc PUBLIC fmt::fmt flow boost_asan) -else() - target_link_libraries(tls_poc PUBLIC fmt::fmt flow boost_target) -endif() - add_executable(mkcert MkCertCli.cpp) target_link_libraries(mkcert PUBLIC fmt::fmt flow) diff --git a/flow/TLSTest.cpp b/flow/TLSTest.cpp deleted file mode 100644 index da8ff8a2c3..0000000000 --- a/flow/TLSTest.cpp +++ /dev/null @@ -1,272 +0,0 @@ -/* - * TLSTest.cpp - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "flow/Arena.h" -#include "flow/MkCert.h" - -std::FILE* outp = stderr; - -template -void log(Args&&... args) { - auto buf = fmt::memory_buffer{}; - fmt::format_to(std::back_inserter(buf), std::forward(args)...); - fmt::print(outp, "{}\n", std::string_view(buf.data(), buf.size())); -} - -template -void logc(Args&&... args) { - auto buf = fmt::memory_buffer{}; - fmt::format_to(std::back_inserter(buf), "[CLIENT] "); - fmt::format_to(std::back_inserter(buf), std::forward(args)...); - fmt::print(outp, "{}\n", std::string_view(buf.data(), buf.size())); -} - -template -void logs(Args&&... 
args) { - auto buf = fmt::memory_buffer{}; - fmt::format_to(std::back_inserter(buf), "[SERVER] "); - fmt::format_to(std::back_inserter(buf), std::forward(args)...); - fmt::print(outp, "{}\n", std::string_view(buf.data(), buf.size())); -} - -using namespace boost::asio; -using ip::tcp; - -using ec_type = boost::system::error_code; - -using socket_type = ssl::stream; -using work_guard_type = executor_work_guard; - -auto client_ssl = ssl::context(ssl::context::tls); -auto server_ssl = ssl::context(ssl::context::tls); - -mkcert::CertChainRef server_chain; -mkcert::CertChainRef client_chain; - -void trust_root_cacert(ssl::context& ctx, StringRef certPem) { - ctx.add_certificate_authority(const_buffer(certPem.begin(), certPem.size())); -} - -void use_chain(ssl::context& ctx, mkcert::CertChainRef chain) { - auto arena = Arena(); - auto chain_str = concatCertChain(arena, chain); - ctx.use_certificate_chain(const_buffer(chain_str.begin(), chain_str.size())); - auto keyPem = chain.front().privateKeyPem; - ctx.use_private_key(const_buffer(keyPem.begin(), keyPem.size()), ssl::context::pem); -} - -void init_certs(ssl::context& ctx, mkcert::CertChainRef my_chain, StringRef peerRootPem) { - if (!peerRootPem.empty()) - trust_root_cacert(ctx, peerRootPem); - if (my_chain.size() > 1) - my_chain.pop_back(); - if (my_chain.size() > 0) - use_chain(ctx, my_chain); -} - -void init_client_ssl_context() { - auto& ctx = client_ssl; - ctx.set_options(ssl::context::default_workarounds); - ctx.set_verify_mode(ssl::context::verify_peer | ssl::verify_fail_if_no_peer_cert); - /* - ctx.set_verify_callback([](bool preverify, ssl::verify_context&) { - logc("context preverify: {}", preverify); - return preverify; - });*/ - init_certs(ctx, client_chain, server_chain.empty() ? StringRef() : server_chain.back().certPem); -} - -void init_server_ssl_context() { - auto& ctx = server_ssl; - ctx.set_options(ssl::context::default_workarounds); - ctx.set_verify_mode(ssl::context::verify_peer | (client_chain.empty() ? 0 : ssl::verify_fail_if_no_peer_cert)); - /* - ctx.set_verify_callback([](bool preverify, ssl::verify_context&) { - logs("context preverify: {}", preverify); - return preverify; - });*/ - init_certs(ctx, server_chain, client_chain.empty() ? StringRef() : client_chain.back().certPem); -} - -template <> -struct fmt::formatter { - constexpr auto parse(format_parse_context& ctx) -> decltype(ctx.begin()) { return ctx.begin(); } - - template - auto format(const tcp::endpoint& ep, FormatContext& ctx) -> decltype(ctx.out()) { - return fmt::format_to(ctx.out(), "{}:{}", ep.address().to_string(), ep.port()); - } -}; - -int main(int argc, char** argv) { - auto const server_chain_len = (argc > 1 ? std::strtol(argv[1], nullptr, 10) : 3l); - auto const client_chain_len = (argc > 2 ? 
std::strtol(argv[2], nullptr, 10) : 3l); - auto const expect_handshake_ok = client_chain_len >= 0 && server_chain_len > 0; - auto const expect_trusted = client_chain_len != 0; - log("cert chain length: server {}, client {}", server_chain_len, client_chain_len); - [[maybe_unused]] auto print_chain = [](mkcert::CertChainRef chain) -> void { - if (chain.empty()) { - log("EMPTY"); - return; - } - for (auto certAndKey : chain) { - certAndKey.printCert(outp); - log("==========="); - certAndKey.printPrivateKey(outp); - log("==========="); - } - }; - auto arena = Arena(); - if (server_chain_len) { - auto tmpArena = Arena(); - auto specs = mkcert::makeCertChainSpec(tmpArena, std::labs(server_chain_len), mkcert::ESide::Server); - if (server_chain_len < 0) { - specs[0].offsetNotBefore = -60l * 60 * 24 * 365; - specs[0].offsetNotAfter = -10l; // cert that expired 10 seconds ago - } - server_chain = mkcert::makeCertChain(arena, specs, {} /* create root CA cert from spec*/); - } - if (client_chain_len) { - auto tmpArena = Arena(); - auto specs = mkcert::makeCertChainSpec(tmpArena, std::labs(client_chain_len), mkcert::ESide::Client); - if (client_chain_len < 0) { - specs[0].offsetNotBefore = -60l * 60 * 24 * 365; - specs[0].offsetNotAfter = -10l; // cert that expired 10 seconds ago - } - client_chain = mkcert::makeCertChain(arena, specs, {} /* create root CA cert from spec*/); - } - /* - log("=========== SERVER CHAIN"); - print_chain(server_chain); - auto concat = concatCertChain(arena, server_chain); - if (!concat.empty()) - log(concat.toString()); - log("=========== CLIENT CHAIN"); - print_chain(client_chain); - concat = concatCertChain(arena, client_chain); - if (!concat.empty()) - log(concat.toString()); - */ - init_client_ssl_context(); - log("client SSL contexts initialized"); - init_server_ssl_context(); - log("server SSL contexts initialized"); - auto io = io_context(); - auto wg_server = work_guard_type(io.get_executor()); - auto wg_client = work_guard_type(io.get_executor()); - auto const ip = ip::address::from_string("127.0.0.1"); - auto acceptor = tcp::acceptor(io, tcp::endpoint(ip, 0)); - auto const server_addr = acceptor.local_endpoint(); - logs("server listening at {}", server_addr); - auto server_sock = tcp::socket(io); - auto server_ssl_sock = socket_type(server_sock, server_ssl); - enum class ESockState { AssumedUntrusted, Trusted }; - auto server_sock_state = ESockState::AssumedUntrusted; - auto client_sock_state = ESockState::AssumedUntrusted; - auto handshake_ok = true; - server_ssl_sock.set_verify_callback([&server_sock_state, &handshake_ok](bool preverify, ssl::verify_context&) { - logs("client preverify: {}", preverify); - switch (server_sock_state) { - case ESockState::AssumedUntrusted: - if (!preverify) - return handshake_ok = false; - server_sock_state = ESockState::Trusted; - break; - case ESockState::Trusted: - if (!preverify) - return handshake_ok = false; - break; - default: - break; - } - // if untrusted connection passes preverify, they are considered trusted - return true; - }); - acceptor.async_accept(server_sock, [&server_ssl_sock, &wg_server](const ec_type& ec) { - if (ec) { - logs("accept error: {}", ec.message()); - wg_server.reset(); - } else { - logs("accepted connection from {}", server_ssl_sock.next_layer().remote_endpoint()); - server_ssl_sock.async_handshake(ssl::stream_base::handshake_type::server, [&wg_server](const ec_type& ec) { - if (ec) { - logs("server handshake returned {}", ec.message()); - } else { - logs("handshake OK"); - } - wg_server.reset(); - 
}); - } - }); - auto client_sock = tcp::socket(io); - auto client_ssl_sock = socket_type(client_sock, client_ssl); - client_ssl_sock.set_verify_callback([&client_sock_state, &handshake_ok](bool preverify, ssl::verify_context&) { - logc("server preverify: {}", preverify); - switch (client_sock_state) { - case ESockState::AssumedUntrusted: - if (!preverify) - return handshake_ok = false; - client_sock_state = ESockState::Trusted; - break; - case ESockState::Trusted: - if (!preverify) - return handshake_ok = false; - break; - default: - break; - } - // if untrusted connection passes preverify, they are considered trusted - return true; - }); - client_sock.async_connect(server_addr, [&wg_client, &client_sock, &client_ssl_sock](const ec_type& ec) { - if (ec) { - logc("connect error: {}", ec.message()); - wg_client.reset(); - } else { - logc("connected to {}", client_sock.remote_endpoint()); - client_ssl_sock.async_handshake(ssl::stream_base::handshake_type::client, [&wg_client](const ec_type& ec) { - if (ec) { - logc("client handshake returned {}", ec.message()); - } else { - logc("handshake OK"); - } - wg_client.reset(); - }); - } - }); - io.run(); - ASSERT_EQ(expect_handshake_ok, handshake_ok); - if (expect_handshake_ok) { - ASSERT_EQ(expect_trusted, (server_sock_state == ESockState::Trusted)); - log("Test OK: Handshake passed and connection {} as expected", - server_sock_state == ESockState::Trusted ? "trusted" : "untrusted"); - } else { - log("Test OK: Handshake failed as expected"); - } - return 0; -} From 9155fbd1b8aed3249517234484489b36a203359a Mon Sep 17 00:00:00 2001 From: Junhyun Shim Date: Wed, 11 May 2022 16:12:45 +0200 Subject: [PATCH 186/299] Fix formatting and remove redundant trace field --- flow/MkCert.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/flow/MkCert.cpp b/flow/MkCert.cpp index a0a5f78b60..35c5931dd3 100644 --- a/flow/MkCert.cpp +++ b/flow/MkCert.cpp @@ -36,11 +36,9 @@ namespace { -[[noreturn]] void traceAndThrow(const char* condition, const char* file, int line) { +[[noreturn]] void traceAndThrow(const char* condition, int line) { auto te = TraceEvent(SevWarnAlways, "MkCertOrKeyError"); - auto pfile= ::strrchr(file, '/'); - pfile = pfile ? 
pfile + 1 : file; - te.suppressFor(5).detail("File", pfile).detail("Line", line).detail("Condition", condition); + te.suppressFor(10).detail("Line", line).detail("Condition", condition); if (auto err = ::ERR_get_error()) { char buf[256]{ 0, From 2e8654e0488e94dec7db406a84aa2f75ec8bda97 Mon Sep 17 00:00:00 2001 From: Junhyun Shim Date: Wed, 11 May 2022 16:30:20 +0200 Subject: [PATCH 187/299] Fix syntax error --- flow/MkCert.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/MkCert.cpp b/flow/MkCert.cpp index 35c5931dd3..f2903066a0 100644 --- a/flow/MkCert.cpp +++ b/flow/MkCert.cpp @@ -54,7 +54,7 @@ namespace { #define OSSL_ASSERT(condition) \ do { \ if (!(condition)) \ - traceAndThrow(#condition, __FILE__, __LINE__); \ + traceAndThrow(#condition, __LINE__); \ } while (false) namespace mkcert { From fe55605a25eb57150d095477fc70ed9fa992cdf8 Mon Sep 17 00:00:00 2001 From: Junhyun Shim Date: Wed, 11 May 2022 16:52:00 +0200 Subject: [PATCH 188/299] Fix GCC-specific error --- flow/MkCertCli.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flow/MkCertCli.cpp b/flow/MkCertCli.cpp index 33514a034d..a4c1a4eb24 100644 --- a/flow/MkCertCli.cpp +++ b/flow/MkCertCli.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include "flow/Arena.h" @@ -70,7 +71,7 @@ CSimpleOpt::SOption gOptions[] = { { OPT_HELP, "--help", SO_NONE }, SO_END_OF_OPTIONS }; template -void printOptionUsage(std::string_view option, std::string_view(&&optionDescLines)[Len]) { +void printOptionUsage(std::string_view option, const char*(&&optionDescLines)[Len]) { constexpr std::string_view optionIndent{ " " }; constexpr std::string_view descIndent{ " " }; fmt::print(stdout, "{}{}\n", optionIndent, option); From 88d1692de7d95a6a2bc928fbb0d58ae450421773 Mon Sep 17 00:00:00 2001 From: Junhyun Shim Date: Wed, 11 May 2022 18:36:58 +0200 Subject: [PATCH 189/299] Fix BoringSSL-specific issues with Mac Build --- flow/MkCert.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/flow/MkCert.cpp b/flow/MkCert.cpp index f2903066a0..4249f398de 100644 --- a/flow/MkCert.cpp +++ b/flow/MkCert.cpp @@ -27,9 +27,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -158,7 +160,7 @@ std::shared_ptr readPrivateKeyPem(StringRef privateKeyPem) { } StringRef writeX509CertPem(Arena& arena, const std::shared_ptr& nativeCert) { - auto mem = ::BIO_new(::BIO_s_secmem()); + auto mem = ::BIO_new(::BIO_s_mem()); OSSL_ASSERT(mem); auto memGuard = ScopeExit([mem]() { ::BIO_free(mem); }); OSSL_ASSERT(::PEM_write_bio_X509(mem, nativeCert.get())); @@ -171,7 +173,7 @@ StringRef writeX509CertPem(Arena& arena, const std::shared_ptr& nativeCert } StringRef writePrivateKeyPem(Arena& arena, const std::shared_ptr& nativePrivateKey) { - auto mem = ::BIO_new(::BIO_s_secmem()); + auto mem = ::BIO_new(::BIO_s_mem()); OSSL_ASSERT(mem); auto memGuard = ScopeExit([mem]() { ::BIO_free(mem); }); OSSL_ASSERT(::PEM_write_bio_PrivateKey(mem, nativePrivateKey.get(), nullptr, nullptr, 0, 0, nullptr)); @@ -252,7 +254,12 @@ CertAndKeyNative makeCertNative(CertSpecRef spec, CertAndKeyNative issuer) { // extension field names and values are expected to null-terminate auto extName = entry.field.toString(); auto extValue = entry.bytes.toString(); - auto ext = ::X509V3_EXT_conf(nullptr, &ctx, extName.c_str(), extValue.c_str()); + auto extNid = ::OBJ_txt2nid(extName.c_str()); + if (extNid == NID_undef) { + TraceEvent(SevWarnAlways, 
"MkCertInvalidExtName").suppressFor(10).detail("Name", extName); + throw tls_error(); + } + auto ext = ::X509V3_EXT_conf_nid(nullptr, &ctx, extNid, extValue.c_str()); OSSL_ASSERT(ext); auto extGuard = ScopeExit([ext]() { ::X509_EXTENSION_free(ext); }); OSSL_ASSERT(::X509_add_ext(x, ext, -1)); From 2f67637701a91b28dd6543fc7793ab9a329b41c5 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Wed, 11 May 2022 19:12:38 +0200 Subject: [PATCH 190/299] Fixing the problem with client getting stuck on server downgrades: - Restoring the original check for strict protocol compatibility before sending packets - Resetting the compatibility flag when connection is closed, so that the protocol compatibility is checked again for a new connection --- fdbrpc/FlowTransport.actor.cpp | 16 +++++++--------- fdbrpc/FlowTransport.h | 1 - 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index ad737d3be4..ef126ef5fd 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -816,6 +816,9 @@ ACTOR Future connectionKeeper(Reference self, .errorUnsuppressed(e) .suppressFor(1.0) .detail("PeerAddr", self->destination); + + // Since the connection has closed, we need to check the protocol version the next time we connect + self->compatible = true; } if (self->destination.isPublic() && @@ -885,9 +888,9 @@ ACTOR Future connectionKeeper(Reference self, Peer::Peer(TransportData* transport, NetworkAddress const& destination) : transport(transport), destination(destination), compatible(true), outgoingConnectionIdle(true), lastConnectTime(0.0), reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME), peerReferences(-1), - incompatibleProtocolVersionNewer(false), bytesReceived(0), bytesSent(0), lastDataPacketSentTime(now()), - outstandingReplies(0), pingLatencies(destination.isPublic() ? FLOW_KNOBS->PING_SAMPLE_AMOUNT : 1), - lastLoggedTime(0.0), lastLoggedBytesReceived(0), lastLoggedBytesSent(0), timeoutCount(0), + bytesReceived(0), bytesSent(0), lastDataPacketSentTime(now()), outstandingReplies(0), + pingLatencies(destination.isPublic() ? FLOW_KNOBS->PING_SAMPLE_AMOUNT : 1), lastLoggedTime(0.0), + lastLoggedBytesReceived(0), lastLoggedBytesSent(0), timeoutCount(0), protocolVersion(Reference>>(new AsyncVar>())), connectOutgoingCount(0), connectIncomingCount(0), connectFailedCount(0), connectLatencies(destination.isPublic() ? FLOW_KNOBS->NETWORK_CONNECT_SAMPLE_AMOUNT : 1) { @@ -1257,7 +1260,6 @@ ACTOR static Future connectionReader(TransportData* transport, state bool expectConnectPacket = true; state bool compatible = false; state bool incompatiblePeerCounted = false; - state bool incompatibleProtocolVersionNewer = false; state NetworkAddress peerAddress; state ProtocolVersion peerProtocolVersion; state Reference authorizedTenants = makeReference(); @@ -1323,7 +1325,6 @@ ACTOR static Future connectionReader(TransportData* transport, uint64_t connectionId = pkt.connectionId; if (!pkt.protocolVersion.hasObjectSerializerFlag() || !pkt.protocolVersion.isCompatible(g_network->protocolVersion())) { - incompatibleProtocolVersionNewer = pkt.protocolVersion > g_network->protocolVersion(); NetworkAddress addr = pkt.canonicalRemotePort ? 
NetworkAddress(pkt.canonicalRemoteIp(), pkt.canonicalRemotePort) : conn->getPeerAddress();
@@ -1383,7 +1384,6 @@ ACTOR static Future<Void> connectionReader(TransportData* transport,
.suppressFor(1.0)
.detail("PeerAddr", NetworkAddress(pkt.canonicalRemoteIp(), pkt.canonicalRemotePort));
peer->compatible = compatible;
- peer->incompatibleProtocolVersionNewer = incompatibleProtocolVersionNewer;
if (!compatible) {
peer->transport->numIncompatibleConnections++;
incompatiblePeerCounted = true;
@@ -1401,7 +1401,6 @@ ACTOR static Future<Void> connectionReader(TransportData* transport,
}
peer = transport->getOrOpenPeer(peerAddress, false);
peer->compatible = compatible;
- peer->incompatibleProtocolVersionNewer = incompatibleProtocolVersionNewer;
if (!compatible) {
peer->transport->numIncompatibleConnections++;
incompatiblePeerCounted = true;
@@ -1741,8 +1740,7 @@ static ReliablePacket* sendPacket(TransportData* self,
// If there isn't an open connection, a public address, or the peer isn't compatible, we can't send
if (!peer || (peer->outgoingConnectionIdle && !destination.getPrimaryAddress().isPublic()) ||
- (peer->incompatibleProtocolVersionNewer &&
- destination.token != Endpoint::wellKnownToken(WLTOKEN_PING_PACKET))) {
+ (!peer->compatible && destination.token != Endpoint::wellKnownToken(WLTOKEN_PING_PACKET))) {
TEST(true); // Can't send to private address without a compatible open connection
return nullptr;
}
diff --git a/fdbrpc/FlowTransport.h b/fdbrpc/FlowTransport.h
index ceaf3e6f35..b19f3adb88 100644
--- a/fdbrpc/FlowTransport.h
+++ b/fdbrpc/FlowTransport.h
@@ -159,7 +159,6 @@ struct Peer : public ReferenceCounted<Peer> {
double lastConnectTime;
double reconnectionDelay;
int peerReferences;
- bool incompatibleProtocolVersionNewer;
int64_t bytesReceived;
int64_t bytesSent;
double lastDataPacketSentTime;

From a7cd61c5cf7bb2d0aad46715cb343bbe1bb654d4 Mon Sep 17 00:00:00 2001
From: Ata E Husain Bohra
Date: Wed, 11 May 2022 13:23:27 -0700
Subject: [PATCH 191/299] Enable debugId tracing for encryption requests (#7111)

* Enable debugId tracing for encryption requests

Description
diff-1: Minor fixes, address review comment

Proposed changes include:
1. Update EncryptKeyProxy API to embed Optional<UID> for debugging request execution.
2. Encryption participant FDB processes can set 'debugId' enabling tracing requests within FDB cluster processes and beyond.
3. The 'debugId' if available is embedded as part of 'request_json_payload' by RESTKmsConnector, enabling tracing of requests between FDB <--> KMS.
4. Fix EncryptKeyProxyTest which got broken due to recent changes.

Testing

Updated following test:
1. EncryptKeyProxy simulation test.
2. RESTKmsConnector simulation test.
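For readers tracing a request end-to-end, intended usage of the new field can be sketched as follows (illustrative only: the caller-side workload code and the "ClientEncryptKeyFetch" event name are hypothetical and not part of this patch; the request/reply types are the ones this patch modifies):

// Attach an optional debugId so the same UID shows up in EncryptKeyProxy,
// KmsConnector, and KMS-side logs for this one request.
EKPGetLatestBaseCipherKeysRequest req;
req.encryptDomainIds = { 1, 2, 3 };
req.debugId = deterministicRandom()->randomUniqueID(); // omit to disable tracing
TraceEvent("ClientEncryptKeyFetch").detail("DbgId", req.debugId.get()); // hypothetical caller-side event
EKPGetLatestBaseCipherKeysReply rep = wait(ekpInf.getLatestBaseCipherKeys.getReply(req));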
Description Testing --- fdbclient/ServerKnobs.cpp | 1 + fdbclient/ServerKnobs.h | 1 + fdbserver/CMakeLists.txt | 4 +- fdbserver/EncryptKeyProxy.actor.cpp | 92 ++++++++++++++++--- fdbserver/EncryptKeyProxyInterface.h | 12 +-- fdbserver/KmsConnectorInterface.h | 14 ++- fdbserver/RESTKmsConnector.actor.cpp | 38 ++++++-- ...msConnector.actor.h => RESTKmsConnector.h} | 8 +- fdbserver/SimKmsConnector.actor.cpp | 36 +++++++- ...KmsConnector.actor.h => SimKmsConnector.h} | 10 +- .../workloads/EncryptKeyProxyTest.actor.cpp | 36 ++++++-- fdbserver/workloads/EncryptionOps.actor.cpp | 2 +- flow/BlobCipher.cpp | 2 +- flow/CMakeLists.txt | 2 + flow/EncryptUtils.cpp | 38 ++++++++ flow/EncryptUtils.h | 15 ++- flow/IRandom.h | 1 + flow/flow.cpp | 13 +++ 18 files changed, 265 insertions(+), 60 deletions(-) rename fdbserver/{RESTKmsConnector.actor.h => RESTKmsConnector.h} (77%) rename fdbserver/{SimKmsConnector.actor.h => SimKmsConnector.h} (74%) create mode 100644 flow/EncryptUtils.cpp diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 42dddd5a3b..f1618eebad 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -857,6 +857,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( ENABLE_ENCRYPTION, false ); init( ENCRYPTION_MODE, "AES-256-CTR"); init( SIM_KMS_MAX_KEYS, 4096); + init( ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH, 100000); // KMS connector type init( KMS_CONNECTOR_TYPE, "RESTKmsConnector"); diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index 0176011fec..19273218fa 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -824,6 +824,7 @@ public: bool ENABLE_ENCRYPTION; std::string ENCRYPTION_MODE; int SIM_KMS_MAX_KEYS; + int ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH; // Key Management Service (KMS) Connector std::string KMS_CONNECTOR_TYPE; diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index 590ad6d287..f47e003a67 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -105,7 +105,7 @@ set(FDBSERVER_SRCS RecoveryState.h RemoteIKeyValueStore.actor.h RemoteIKeyValueStore.actor.cpp - RESTKmsConnector.actor.h + RESTKmsConnector.h RESTKmsConnector.actor.cpp ResolutionBalancer.actor.cpp ResolutionBalancer.actor.h @@ -138,7 +138,7 @@ set(FDBSERVER_SRCS ServerDBInfo.actor.h ServerDBInfo.h SigStack.cpp - SimKmsConnector.actor.h + SimKmsConnector.h SimKmsConnector.actor.cpp SimpleConfigConsumer.actor.cpp SimpleConfigConsumer.h diff --git a/fdbserver/EncryptKeyProxy.actor.cpp b/fdbserver/EncryptKeyProxy.actor.cpp index 03d0874f1a..c1979f8964 100644 --- a/fdbserver/EncryptKeyProxy.actor.cpp +++ b/fdbserver/EncryptKeyProxy.actor.cpp @@ -24,9 +24,9 @@ #include "fdbserver/KmsConnector.h" #include "fdbserver/KmsConnectorInterface.h" #include "fdbserver/Knobs.h" -#include "fdbserver/RESTKmsConnector.actor.h" +#include "fdbserver/RESTKmsConnector.h" #include "fdbserver/ServerDBInfo.actor.h" -#include "fdbserver/SimKmsConnector.actor.h" +#include "fdbserver/SimKmsConnector.h" #include "fdbserver/WorkerInterface.actor.h" #include "fdbserver/ServerDBInfo.h" #include "flow/Arena.h" @@ -42,6 +42,7 @@ #include "flow/network.h" #include +#include #include #include @@ -162,10 +163,17 @@ ACTOR Future getCipherKeysByBaseCipherKeyIds(Reference> lookupCipherIds; - state std::vector cachedCipherDetails; + state std::vector cachedCipherDetails; state EKPGetBaseCipherKeysByIdsRequest keysByIds = req; state EKPGetBaseCipherKeysByIdsReply keyIdsReply; + state Optional dbgTrace = + 
keysByIds.debugId.present() ? TraceEvent("GetByKeyIds", ekpProxyData->myId) : Optional(); + + if (dbgTrace.present()) { + dbgTrace.get().setMaxEventLength(SERVER_KNOBS->ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH); + dbgTrace.get().detail("DbgId", keysByIds.debugId.get()); + } // Dedup the requested pair // TODO: endpoint serialization of std::unordered_set isn't working at the moment @@ -176,12 +184,28 @@ ACTOR Future getCipherKeysByBaseCipherKeyIds(ReferencebaseCipherKeyIdCache.find(item.first); if (itr != ekpProxyData->baseCipherKeyIdCache.end()) { ASSERT(itr->second.isValid()); cachedCipherDetails.emplace_back( itr->second.domainId, itr->second.baseCipherId, itr->second.baseCipherKey, keyIdsReply.arena); + + if (dbgTrace.present()) { + // {encryptId, baseCipherId} forms a unique tuple across encryption domains + dbgTrace.get().detail(getEncryptDbgTraceKey(ENCRYPT_DBG_TRACE_CACHED_PREFIX, + itr->second.domainId, + itr->second.baseCipherId), + ""); + } } else { lookupCipherIds.emplace_back(std::make_pair(item.first, item.second)); } @@ -192,7 +216,7 @@ ACTOR Future getCipherKeysByBaseCipherKeyIds(Reference getCipherKeysByBaseCipherKeyIds(ReferenceinsertIntoBaseCipherIdCache(item.encryptDomainId, item.encryptKeyId, item.encryptKey); + + if (dbgTrace.present()) { + // {encryptId, baseCipherId} forms a unique tuple across encryption domains + dbgTrace.get().detail( + getEncryptDbgTraceKey(ENCRYPT_DBG_TRACE_INSERT_PREFIX, item.encryptDomainId, item.encryptKeyId), + ""); + } } } catch (Error& e) { if (!canReplyWith(e)) { - TraceEvent("GetCipherKeysByIds", ekpProxyData->myId).error(e); + TraceEvent("GetCipherKeysByKeyIds", ekpProxyData->myId).error(e); throw; } - TraceEvent("GetCipherKeysByIds", ekpProxyData->myId).detail("ErrorCode", e.code()); + TraceEvent("GetCipherKeysByKeyIds", ekpProxyData->myId).detail("ErrorCode", e.code()); ekpProxyData->sendErrorResponse(keysByIds.reply, e); return Void(); } @@ -237,6 +268,13 @@ ACTOR Future getLatestCipherKeys(Reference ekpProxyDa state EKPGetLatestBaseCipherKeysRequest latestKeysReq = req; state EKPGetLatestBaseCipherKeysReply latestCipherReply; state Arena& arena = latestCipherReply.arena; + state Optional dbgTrace = + latestKeysReq.debugId.present() ? TraceEvent("GetByDomIds", ekpProxyData->myId) : Optional(); + + if (dbgTrace.present()) { + dbgTrace.get().setMaxEventLength(SERVER_KNOBS->ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH); + dbgTrace.get().detail("DbgId", latestKeysReq.debugId.get()); + } // Dedup the requested domainIds. // TODO: endpoint serialization of std::unordered_set isn't working at the moment @@ -245,6 +283,14 @@ ACTOR Future getLatestCipherKeys(Reference ekpProxyDa dedupedDomainIds.emplace(id); } + if (dbgTrace.present()) { + dbgTrace.get().detail("NKeys", dedupedDomainIds.size()); + for (EncryptCipherDomainId id : dedupedDomainIds) { + // log encryptDomainIds queried + dbgTrace.get().detail(getEncryptDbgTraceKey(ENCRYPT_DBG_TRACE_QUERY_PREFIX, id), ""); + } + } + // First, check if the requested information is already cached by the server. // Ensure the cached information is within FLOW_KNOBS->ENCRYPT_CIPHER_KEY_CACHE_TTL time window. 
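Both the by-key-id path above and the by-domain-id path below follow the same cache-first shape; condensed for orientation (a sketch only: 'requested' stands for the deduped id set, and details plus error handling are omitted):

// Serve cache hits immediately; batch all misses into one KmsConnector lookup.
for (const auto& [baseCipherId, domainId] : requested) {
    auto itr = ekpProxyData->baseCipherKeyIdCache.find(baseCipherId);
    if (itr != ekpProxyData->baseCipherKeyIdCache.end() && itr->second.isValid()) {
        cachedCipherDetails.emplace_back(/*...*/); // hit: traced under "Chd.<domain>.<cipher>"
    } else {
        lookupCipherIds.emplace_back(baseCipherId, domainId); // miss: fetched from KMS, then cached and traced under "Ins.<domain>.<cipher>"
    }
}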
@@ -253,6 +299,12 @@ ACTOR Future getLatestCipherKeys(Reference ekpProxyDa const auto itr = ekpProxyData->baseCipherDomainIdCache.find(id); if (itr != ekpProxyData->baseCipherDomainIdCache.end() && itr->second.isValid()) { cachedCipherDetails.emplace_back(id, itr->second.baseCipherId, itr->second.baseCipherKey, arena); + + if (dbgTrace.present()) { + // {encryptDomainId, baseCipherId} forms a unique tuple across encryption domains + dbgTrace.get().detail( + getEncryptDbgTraceKey(ENCRYPT_DBG_TRACE_CACHED_PREFIX, id, itr->second.baseCipherId), ""); + } } else { lookupCipherDomains.emplace_back(id); } @@ -263,7 +315,7 @@ ACTOR Future getLatestCipherKeys(Reference ekpProxyDa if (!lookupCipherDomains.empty()) { try { - KmsConnLookupEKsByDomainIdsReq keysByDomainIdReq(lookupCipherDomains); + KmsConnLookupEKsByDomainIdsReq keysByDomainIdReq(lookupCipherDomains, latestKeysReq.debugId); KmsConnLookupEKsByDomainIdsRep keysByDomainIdRep = wait(kmsConnectorInf.ekLookupByDomainIds.getReply(keysByDomainIdReq)); @@ -273,6 +325,13 @@ ACTOR Future getLatestCipherKeys(Reference ekpProxyDa // Record the fetched cipher details to the local cache for the future references ekpProxyData->insertIntoBaseDomainIdCache(item.encryptDomainId, item.encryptKeyId, item.encryptKey); + + if (dbgTrace.present()) { + // {encryptDomainId, baseCipherId} forms a unique tuple across encryption domains + dbgTrace.get().detail( + getEncryptDbgTraceKey(ENCRYPT_DBG_TRACE_INSERT_PREFIX, item.encryptDomainId, item.encryptKeyId), + ""); + } } } catch (Error& e) { if (!canReplyWith(e)) { @@ -298,13 +357,16 @@ ACTOR Future getLatestCipherKeys(Reference ekpProxyDa ACTOR Future refreshEncryptionKeysCore(Reference ekpProxyData, KmsConnectorInterface kmsConnectorInf) { + state UID debugId = deterministicRandom()->randomUniqueID(); - ASSERT(g_network->isSimulated()); - - TraceEvent("RefreshEKs_Start", ekpProxyData->myId).detail("KmsConnInf", kmsConnectorInf.id()); + state TraceEvent t("RefreshEKs_Start", ekpProxyData->myId); + t.setMaxEventLength(SERVER_KNOBS->ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH); + t.detail("KmsConnInf", kmsConnectorInf.id()); + t.detail("DebugId", debugId); try { KmsConnLookupEKsByDomainIdsReq req; + req.debugId = debugId; req.encryptDomainIds.reserve(ekpProxyData->baseCipherDomainIdCache.size()); for (auto& item : ekpProxyData->baseCipherDomainIdCache) { @@ -313,16 +375,20 @@ ACTOR Future refreshEncryptionKeysCore(Reference ekpP KmsConnLookupEKsByDomainIdsRep rep = wait(kmsConnectorInf.ekLookupByDomainIds.getReply(req)); for (auto& item : rep.cipherKeyDetails) { ekpProxyData->insertIntoBaseDomainIdCache(item.encryptDomainId, item.encryptKeyId, item.encryptKey); + // {encryptDomainId, baseCipherId} forms a unique tuple across encryption domains + t.detail(getEncryptDbgTraceKey(ENCRYPT_DBG_TRACE_INSERT_PREFIX, item.encryptDomainId, item.encryptKeyId), + ""); } ekpProxyData->baseCipherKeysRefreshed += rep.cipherKeyDetails.size(); - TraceEvent("RefreshEKs_Done", ekpProxyData->myId).detail("KeyCount", rep.cipherKeyDetails.size()); + + t.detail("nKeys", rep.cipherKeyDetails.size()); } catch (Error& e) { if (!canReplyWith(e)) { - TraceEvent("RefreshEncryptionKeys_Error").error(e); + TraceEvent("RefreshEKs_Error").error(e); throw e; } - TraceEvent("RefreshEncryptionKeys").detail("ErrorCode", e.code()); + TraceEvent("RefreshEKs").detail("ErrorCode", e.code()); ++ekpProxyData->numEncryptionKeyRefreshErrors; } diff --git a/fdbserver/EncryptKeyProxyInterface.h b/fdbserver/EncryptKeyProxyInterface.h index 61b237bb49..52bb9e1245 100644 
--- a/fdbserver/EncryptKeyProxyInterface.h +++ b/fdbserver/EncryptKeyProxyInterface.h @@ -125,6 +125,7 @@ struct EKPGetBaseCipherKeysByIdsRequest { constexpr static FileIdentifier file_identifier = 4930263; UID requesterID; std::vector> baseCipherIds; + Optional debugId; ReplyPromise reply; EKPGetBaseCipherKeysByIdsRequest() : requesterID(deterministicRandom()->randomUniqueID()) {} @@ -133,7 +134,7 @@ struct EKPGetBaseCipherKeysByIdsRequest { template void serialize(Ar& ar) { - serializer(ar, requesterID, baseCipherIds, reply); + serializer(ar, requesterID, baseCipherIds, debugId, reply); } }; @@ -156,17 +157,16 @@ struct EKPGetLatestBaseCipherKeysReply { struct EKPGetLatestBaseCipherKeysRequest { constexpr static FileIdentifier file_identifier = 1910123; - UID requesterID; std::vector encryptDomainIds; + Optional debugId; ReplyPromise reply; - EKPGetLatestBaseCipherKeysRequest() : requesterID(deterministicRandom()->randomUniqueID()) {} - explicit EKPGetLatestBaseCipherKeysRequest(UID uid, const std::vector& ids) - : requesterID(uid), encryptDomainIds(ids) {} + EKPGetLatestBaseCipherKeysRequest() {} + explicit EKPGetLatestBaseCipherKeysRequest(const std::vector& ids) : encryptDomainIds(ids) {} template void serialize(Ar& ar) { - serializer(ar, requesterID, encryptDomainIds, reply); + serializer(ar, encryptDomainIds, debugId, reply); } }; diff --git a/fdbserver/KmsConnectorInterface.h b/fdbserver/KmsConnectorInterface.h index 4c4c91aef5..6f0a408e05 100644 --- a/fdbserver/KmsConnectorInterface.h +++ b/fdbserver/KmsConnectorInterface.h @@ -101,16 +101,18 @@ struct KmsConnLookupEKsByKeyIdsRep { struct KmsConnLookupEKsByKeyIdsReq { constexpr static FileIdentifier file_identifier = 6913396; std::vector> encryptKeyIds; + Optional debugId; ReplyPromise reply; KmsConnLookupEKsByKeyIdsReq() {} explicit KmsConnLookupEKsByKeyIdsReq( - const std::vector>& keyIds) - : encryptKeyIds(keyIds) {} + const std::vector>& keyIds, + Optional dbgId) + : encryptKeyIds(keyIds), debugId(dbgId) {} template void serialize(Ar& ar) { - serializer(ar, encryptKeyIds, reply); + serializer(ar, encryptKeyIds, debugId, reply); } }; @@ -130,14 +132,16 @@ struct KmsConnLookupEKsByDomainIdsRep { struct KmsConnLookupEKsByDomainIdsReq { constexpr static FileIdentifier file_identifier = 9918682; std::vector encryptDomainIds; + Optional debugId; ReplyPromise reply; KmsConnLookupEKsByDomainIdsReq() {} - explicit KmsConnLookupEKsByDomainIdsReq(const std::vector& ids) : encryptDomainIds(ids) {} + explicit KmsConnLookupEKsByDomainIdsReq(const std::vector& ids, Optional dbgId) + : encryptDomainIds(ids), debugId(dbgId) {} template void serialize(Ar& ar) { - serializer(ar, encryptDomainIds, reply); + serializer(ar, encryptDomainIds, debugId, reply); } }; diff --git a/fdbserver/RESTKmsConnector.actor.cpp b/fdbserver/RESTKmsConnector.actor.cpp index be4023ca51..3e720b9e33 100644 --- a/fdbserver/RESTKmsConnector.actor.cpp +++ b/fdbserver/RESTKmsConnector.actor.cpp @@ -18,7 +18,7 @@ * limitations under the License. 
*/ -#include "fdbserver/RESTKmsConnector.actor.h" +#include "fdbserver/RESTKmsConnector.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/rapidjson/document.h" @@ -61,6 +61,7 @@ const char* REFRESH_KMS_URLS_TAG = "refresh_kms_urls"; const char* VALIDATION_TOKENS_TAG = "validation_tokens"; const char* VALIDATION_TOKEN_NAME_TAG = "token_name"; const char* VALIDATION_TOKEN_VALUE_TAG = "token_value"; +const char* DEBUG_UID_TAG = "debug_uid"; const char* TOKEN_NAME_FILE_SEP = "#"; const char* TOKEN_TUPLE_SEP = ","; @@ -280,9 +281,9 @@ void parseKmsResponse(Reference ctx, // "kms_urls" : [ // "url1", "url2", ... // ], - // "error" : { + // "error" : { // Optional, populated by the KMS, if present, rest of payload is ignored. // "details":
- // } // Optional, populated by the KMS, if present, rest of payload is ignored. + // } // } if (resp->code != HTTP::HTTP_STATUS_CODE_OK) { @@ -397,6 +398,20 @@ void addRefreshKmsUrlsSectionToJsonDoc(Reference ctx, doc.AddMember(key, refreshUrls, doc.GetAllocator()); } +void addDebugUidSectionToJsonDoc(Reference ctx, rapidjson::Document& doc, Optional dbgId) { + if (!dbgId.present()) { + // Debug id not present; do nothing + return; + } + rapidjson::Value key(DEBUG_UID_TAG, doc.GetAllocator()); + rapidjson::Value debugIdVal; + const std::string dbgIdStr = dbgId.get().toString(); + debugIdVal.SetString(dbgIdStr.c_str(), dbgIdStr.size(), doc.GetAllocator()); + + // Append 'debug_uid' object to the parent document + doc.AddMember(key, debugIdVal, doc.GetAllocator()); +} + StringRef getEncryptKeysByKeyIdsRequestBody(Reference ctx, const KmsConnLookupEKsByKeyIdsReq& req, const bool refreshKmsUrls, @@ -424,6 +439,7 @@ StringRef getEncryptKeysByKeyIdsRequestBody(Reference ctx, // } // ] // "refresh_kms_urls" = 1/0 + // "debug_uid" = // Optional debug info to trace requests across FDB <--> KMS // } rapidjson::Document doc; @@ -458,9 +474,12 @@ StringRef getEncryptKeysByKeyIdsRequestBody(Reference ctx, // Append 'validation_tokens' as json array addValidationTokensSectionToJsonDoc(ctx, doc); - // Append "refresh_kms_urls' + // Append 'refresh_kms_urls' addRefreshKmsUrlsSectionToJsonDoc(ctx, doc, refreshKmsUrls); + // Append 'debug_uid' section if needed + addDebugUidSectionToJsonDoc(ctx, doc, req.debugId); + // Serialize json to string rapidjson::StringBuffer sb; rapidjson::Writer writer(sb); @@ -574,6 +593,7 @@ StringRef getEncryptKeysByDomainIdsRequestBody(Reference ct // } // ] // "refresh_kms_urls" = 1/0 + // "debug_uid" = // Optional debug info to trace requests across FDB <--> KMS // } rapidjson::Document doc; @@ -604,6 +624,9 @@ StringRef getEncryptKeysByDomainIdsRequestBody(Reference ct // Append 'refresh_kms_urls' addRefreshKmsUrlsSectionToJsonDoc(ctx, doc, refreshKmsUrls); + // Append 'debug_uid' section if needed + addDebugUidSectionToJsonDoc(ctx, doc, req.debugId); + // Serialize json to string rapidjson::StringBuffer sb; rapidjson::Writer writer(sb); @@ -1007,13 +1030,16 @@ void testGetEncryptKeysByKeyIdsRequestBody(Reference ctx, A } bool refreshKmsUrls = deterministicRandom()->randomInt(0, 100) < 50; + if (deterministicRandom()->randomInt(0, 100) < 40) { + req.debugId = deterministicRandom()->randomUniqueID(); + } StringRef requestBodyRef = getEncryptKeysByKeyIdsRequestBody(ctx, req, refreshKmsUrls, arena); - TraceEvent("FetchKeysByKeyIds", ctx->uid).setMaxFieldLength(10000).detail("JsonReqStr", requestBodyRef.toString()); + TraceEvent("FetchKeysByKeyIds", ctx->uid).setMaxFieldLength(100000).detail("JsonReqStr", requestBodyRef.toString()); Reference httpResp = makeReference(); httpResp->code = HTTP::HTTP_STATUS_CODE_OK; getFakeKmsResponse(requestBodyRef, true, httpResp); - TraceEvent("FetchKeysByKeyIds", ctx->uid).setMaxFieldLength(10000).detail("HttpRespStr", httpResp->content); + TraceEvent("FetchKeysByKeyIds", ctx->uid).setMaxFieldLength(100000).detail("HttpRespStr", httpResp->content); std::vector cipherDetails; parseKmsResponse(ctx, httpResp, &arena, &cipherDetails); diff --git a/fdbserver/RESTKmsConnector.actor.h b/fdbserver/RESTKmsConnector.h similarity index 77% rename from fdbserver/RESTKmsConnector.actor.h rename to fdbserver/RESTKmsConnector.h index dbd65e01bd..41127440b5 100644 --- a/fdbserver/RESTKmsConnector.actor.h +++ b/fdbserver/RESTKmsConnector.h @@ -18,14 +18,10 
@@ * limitations under the License. */ +#ifndef REST_KMS_CONNECTOR_H +#define REST_KMS_CONNECTOR_H #pragma once -#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_RESTKMSCONNECTOR_ACTOR_G_H) -#define FDBSERVER_RESTKMSCONNECTOR_ACTOR_G_H -#include "fdbserver/RESTKmsConnector.actor.g.h" -#elif !defined(FDBSERVER_RESTKMSCONNECTOR_ACTOR_H) -#define FDBSERVER_RESTKMSCONNECTOR_ACTOR_H - #include "fdbserver/KmsConnector.h" class RESTKmsConnector : public KmsConnector { diff --git a/fdbserver/SimKmsConnector.actor.cpp b/fdbserver/SimKmsConnector.actor.cpp index 958003f7cd..8282c11906 100644 --- a/fdbserver/SimKmsConnector.actor.cpp +++ b/fdbserver/SimKmsConnector.actor.cpp @@ -18,7 +18,7 @@ * limitations under the License. */ -#include "fdbserver/SimKmsConnector.actor.h" +#include "fdbserver/SimKmsConnector.h" #include "fdbrpc/sim_validation.h" #include "fdbserver/Knobs.h" @@ -29,6 +29,7 @@ #include "flow/FastRef.h" #include "flow/IRandom.h" #include "flow/ITrace.h" +#include "flow/Trace.h" #include "flow/network.h" #include "flow/UnitTest.h" @@ -79,6 +80,14 @@ ACTOR Future simKmsConnectorCore_impl(KmsConnectorInterface interf) { when(KmsConnLookupEKsByKeyIdsReq req = waitNext(interf.ekLookupByIds.getFuture())) { state KmsConnLookupEKsByKeyIdsReq keysByIdsReq = req; state KmsConnLookupEKsByKeyIdsRep keysByIdsRep; + state Optional dbgKIdTrace = keysByIdsReq.debugId.present() + ? TraceEvent("SimKmsGetByKeyIds", interf.id()) + : Optional(); + + if (dbgKIdTrace.present()) { + dbgKIdTrace.get().setMaxEventLength(100000); + dbgKIdTrace.get().detail("DbgId", keysByIdsReq.debugId.get()); + } // Lookup corresponding EncryptKeyCtx for input keyId for (const auto& item : req.encryptKeyIds) { @@ -89,6 +98,12 @@ ACTOR Future simKmsConnectorCore_impl(KmsConnectorInterface interf) { itr->first, StringRef(keysByIdsRep.arena, itr->second.get()->key), keysByIdsRep.arena); + + if (dbgKIdTrace.present()) { + // {encryptDomainId, baseCipherId} forms a unique tuple across encryption domains + dbgKIdTrace.get().detail( + getEncryptDbgTraceKey(ENCRYPT_DBG_TRACE_RESULT_PREFIX, item.second, itr->first), ""); + } } else { success = false; break; @@ -102,16 +117,29 @@ ACTOR Future simKmsConnectorCore_impl(KmsConnectorInterface interf) { when(KmsConnLookupEKsByDomainIdsReq req = waitNext(interf.ekLookupByDomainIds.getFuture())) { state KmsConnLookupEKsByDomainIdsReq keysByDomainIdReq = req; state KmsConnLookupEKsByDomainIdsRep keysByDomainIdRep; + state Optional dbgDIdTrace = keysByDomainIdReq.debugId.present() + ? TraceEvent("SimKmsGetsByDomIds", interf.id()) + : Optional(); - // Map encryptionDomainId to corresponding EncryptKeyCtx element using a modulo operation. This would - // mean multiple domains gets mapped to the same encryption key which is fine, the EncryptKeyStore - // guarantees that keyId -> plaintext encryptKey mapping is idempotent. + if (dbgDIdTrace.present()) { + dbgDIdTrace.get().detail("DbgId", keysByDomainIdReq.debugId.get()); + } + + // Map encryptionDomainId to corresponding EncryptKeyCtx element using a modulo operation. This + // would mean multiple domains gets mapped to the same encryption key which is fine, the + // EncryptKeyStore guarantees that keyId -> plaintext encryptKey mapping is idempotent. 
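// (Worked example, not part of the patch: with SERVER_KNOBS->SIM_KMS_MAX_KEYS = 4096,
// domainId 5 maps to keyId 1 + (5 % 4096) = 6, domainId -5 also maps to 6 because
// abs() is applied first, and domainId 4100 maps to 1 + (4100 % 4096) = 5. Such
// collisions across domains are harmless precisely because the keyId -> encryptKey
// mapping is idempotent.)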
for (EncryptCipherDomainId domainId : req.encryptDomainIds) { EncryptCipherBaseKeyId keyId = 1 + abs(domainId) % SERVER_KNOBS->SIM_KMS_MAX_KEYS; const auto& itr = ctx->simEncryptKeyStore.find(keyId); if (itr != ctx->simEncryptKeyStore.end()) { keysByDomainIdRep.cipherKeyDetails.emplace_back( domainId, keyId, StringRef(itr->second.get()->key), keysByDomainIdRep.arena); + + if (dbgDIdTrace.present()) { + // {encryptId, baseCipherId} forms a unique tuple across encryption domains + dbgDIdTrace.get().detail( + getEncryptDbgTraceKey(ENCRYPT_DBG_TRACE_RESULT_PREFIX, domainId, keyId), ""); + } } else { success = false; break; diff --git a/fdbserver/SimKmsConnector.actor.h b/fdbserver/SimKmsConnector.h similarity index 74% rename from fdbserver/SimKmsConnector.actor.h rename to fdbserver/SimKmsConnector.h index 6a03dcda26..c26ba808b4 100644 --- a/fdbserver/SimKmsConnector.actor.h +++ b/fdbserver/SimKmsConnector.h @@ -18,19 +18,13 @@ * limitations under the License. */ +#ifndef SIM_KMS_CONNECTOR_H +#define SIM_KMS_CONNECTOR_H #pragma once -#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_SIMKMSCONNECTOR_ACTOR_G_H) -#define FDBSERVER_SIMKMSCONNECTOR_ACTOR_G_H -#include "fdbserver/SimKmsConnector.actor.g.h" -#elif !defined(FDBSERVER_SIMKMSCONNECTOR_ACTOR_H) -#define FDBSERVER_SIMKMSCONNECTOR_ACTOR_H - #include "fdbserver/KmsConnector.h" #include "flow/BlobCipher.h" -#include "flow/actorcompiler.h" // This must be the last #include. - class SimKmsConnector : public KmsConnector { public: SimKmsConnector() = default; diff --git a/fdbserver/workloads/EncryptKeyProxyTest.actor.cpp b/fdbserver/workloads/EncryptKeyProxyTest.actor.cpp index 27c00f406e..e07f42e0e2 100644 --- a/fdbserver/workloads/EncryptKeyProxyTest.actor.cpp +++ b/fdbserver/workloads/EncryptKeyProxyTest.actor.cpp @@ -72,7 +72,11 @@ struct EncryptKeyProxyTestWorkload : TestWorkload { state int nAttempts = 0; loop { - EKPGetLatestBaseCipherKeysRequest req(deterministicRandom()->randomUniqueID(), self->domainIds); + EKPGetLatestBaseCipherKeysRequest req; + req.encryptDomainIds = self->domainIds; + if (deterministicRandom()->randomInt(0, 100) < 50) { + req.debugId = deterministicRandom()->randomUniqueID(); + } ErrorOr rep = wait(self->ekpInf.getLatestBaseCipherKeys.tryGetReply(req)); if (rep.present()) { @@ -82,7 +86,7 @@ struct EncryptKeyProxyTestWorkload : TestWorkload { for (const uint64_t id : self->domainIds) { bool found = false; for (const auto& item : rep.get().baseCipherDetails) { - if (item.baseCipherId == id) { + if (item.encryptDomainId == id) { found = true; break; } @@ -131,7 +135,11 @@ struct EncryptKeyProxyTestWorkload : TestWorkload { // assertions. However, in simulation runs, RPCs can be force failed to inject retries, hence, code leverage // tryGetReply to ensure at-most once delivery of message, further, assertions are relaxed to account of // cache warm-up due to retries. 
- EKPGetLatestBaseCipherKeysRequest req(deterministicRandom()->randomUniqueID(), self->domainIds); + EKPGetLatestBaseCipherKeysRequest req; + req.encryptDomainIds = self->domainIds; + if (deterministicRandom()->randomInt(0, 100) < 50) { + req.debugId = deterministicRandom()->randomUniqueID(); + } ErrorOr rep = wait(self->ekpInf.getLatestBaseCipherKeys.tryGetReply(req)); if (rep.present()) { ASSERT(!rep.get().error.present()); @@ -140,7 +148,7 @@ struct EncryptKeyProxyTestWorkload : TestWorkload { for (const uint64_t id : self->domainIds) { bool found = false; for (const auto& item : rep.get().baseCipherDetails) { - if (item.baseCipherId == id) { + if (item.encryptDomainId == id) { found = true; break; } @@ -176,7 +184,11 @@ struct EncryptKeyProxyTestWorkload : TestWorkload { self->domainIds.emplace_back(self->minDomainId + i); } - EKPGetLatestBaseCipherKeysRequest req(deterministicRandom()->randomUniqueID(), self->domainIds); + EKPGetLatestBaseCipherKeysRequest req; + req.encryptDomainIds = self->domainIds; + if (deterministicRandom()->randomInt(0, 100) < 50) { + req.debugId = deterministicRandom()->randomUniqueID(); + } EKPGetLatestBaseCipherKeysReply rep = wait(self->ekpInf.getLatestBaseCipherKeys.getReply(req)); ASSERT(!rep.error.present()); @@ -184,7 +196,7 @@ struct EncryptKeyProxyTestWorkload : TestWorkload { for (const uint64_t id : self->domainIds) { bool found = false; for (const auto& item : rep.baseCipherDetails) { - if (item.baseCipherId == id) { + if (item.encryptDomainId == id) { found = true; break; } @@ -200,14 +212,24 @@ struct EncryptKeyProxyTestWorkload : TestWorkload { } state int numIterations = deterministicRandom()->randomInt(512, 786); - for (; numIterations > 0; numIterations--) { + for (; numIterations > 0;) { int idx = deterministicRandom()->randomInt(1, self->cipherIds.size()); int nIds = deterministicRandom()->randomInt(1, self->cipherIds.size()); EKPGetBaseCipherKeysByIdsRequest req; + if (deterministicRandom()->randomInt(0, 100) < 50) { + req.debugId = deterministicRandom()->randomUniqueID(); + } for (int i = idx; i < nIds && i < self->cipherIds.size(); i++) { req.baseCipherIds.emplace_back(std::make_pair(self->cipherIds[i], 1)); } + if (req.baseCipherIds.empty()) { + // No keys to query; continue + continue; + } else { + numIterations--; + } + expectedHits = req.baseCipherIds.size(); EKPGetBaseCipherKeysByIdsReply rep = wait(self->ekpInf.getBaseCipherKeysByIds.getReply(req)); diff --git a/fdbserver/workloads/EncryptionOps.actor.cpp b/fdbserver/workloads/EncryptionOps.actor.cpp index 4062823c98..6c47611f76 100644 --- a/fdbserver/workloads/EncryptionOps.actor.cpp +++ b/fdbserver/workloads/EncryptionOps.actor.cpp @@ -226,7 +226,7 @@ struct EncryptionOpsWorkload : TestWorkload { Reference cipherKey = cipherKeyCache->getCipherKey(domainId, baseCipherId, salt); if (simCacheMiss) { - TraceEvent("SimKeyCacheMiss").detail("EncyrptDomainId", domainId).detail("BaseCipherId", baseCipherId); + TraceEvent("SimKeyCacheMiss").detail("EncryptDomainId", domainId).detail("BaseCipherId", baseCipherId); // simulate KeyCache miss that may happen during decryption; insert a CipherKey with known 'salt' cipherKeyCache->insertCipherKey(domainId, baseCipherId, diff --git a/flow/BlobCipher.cpp b/flow/BlobCipher.cpp index c93f292ae0..1b7e8d19e7 100644 --- a/flow/BlobCipher.cpp +++ b/flow/BlobCipher.cpp @@ -1175,7 +1175,7 @@ TEST_CASE("flow/BlobCipher") { TraceEvent("MultiAuthMode_Done").log(); } - // Validate dropping encyrptDomainId cached keys + // Validate dropping encryptDomainId 
cached keys const EncryptCipherDomainId candidate = deterministicRandom()->randomInt(minDomainId, maxDomainId); cipherKeyCache->resetEncryptDomainId(candidate); std::vector> cachedKeys = cipherKeyCache->getAllCiphers(candidate); diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index 7493dfee94..e30648b8ae 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -19,6 +19,8 @@ set(FLOW_SRCS Error.cpp Error.h EventTypes.actor.h + EncryptUtils.h + EncryptUtils.cpp FastAlloc.cpp FastAlloc.h FastRef.h diff --git a/flow/EncryptUtils.cpp b/flow/EncryptUtils.cpp new file mode 100644 index 0000000000..82895937e4 --- /dev/null +++ b/flow/EncryptUtils.cpp @@ -0,0 +1,38 @@ +/* + * EncryptUtils.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flow/EncryptUtils.h" +#include "flow/Trace.h" + +#include + +std::string getEncryptDbgTraceKey(std::string_view prefix, + EncryptCipherDomainId domainId, + Optional baseCipherId) { + // Construct the TraceEvent field key ensuring its uniqueness and compliance to TraceEvent field validator and log + // parsing tools + if (baseCipherId.present()) { + boost::format fmter("%s.%lld.%llu"); + return boost::str(boost::format(fmter % prefix % domainId % baseCipherId.get())); + } else { + boost::format fmter("%s.%lld"); + return boost::str(boost::format(fmter % prefix % domainId)); + } +} \ No newline at end of file diff --git a/flow/EncryptUtils.h b/flow/EncryptUtils.h index 2728f2410f..5191304a17 100644 --- a/flow/EncryptUtils.h +++ b/flow/EncryptUtils.h @@ -22,8 +22,12 @@ #define ENCRYPT_UTILS_H #pragma once +#include "flow/Arena.h" + #include #include +#include +#include #define ENCRYPT_INVALID_DOMAIN_ID 0 #define ENCRYPT_INVALID_CIPHER_KEY_ID 0 @@ -50,7 +54,7 @@ static_assert(EncryptCipherMode::ENCRYPT_CIPHER_MODE_LAST <= std::numeric_limits // EncryptionHeader authentication modes // 1. NONE - No 'authentication token' generation needed for EncryptionHeader i.e. no protection against header OR // cipherText 'tampering' and/or bit rot/flip corruptions. -// 2. Single/Multi - Encyrption header would generate one or more 'authentication tokens' to protect the header against +// 2. Single/Multi - Encryption header would generate one or more 'authentication tokens' to protect the header against // 'tempering' and/or bit rot/flip corruptions. Refer to BlobCipher.h for detailed usage recommendations. // 3. LAST - Invalid mode, used for static asserts. 
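For reference, the helper added in EncryptUtils.cpp above yields dotted, grep-friendly TraceEvent field names; for example (values illustrative):

// getEncryptDbgTraceKey(ENCRYPT_DBG_TRACE_INSERT_PREFIX, 5, 42) -> "Ins.5.42"
// getEncryptDbgTraceKey(ENCRYPT_DBG_TRACE_QUERY_PREFIX, 5)      -> "Qry.5"

so one grep over the "Qry"/"Chd"/"Ins"/"Res" prefixes correlates the query, cache-hit, insert, and KMS-result events for a given {domainId, baseCipherId} pair.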
@@ -64,4 +68,13 @@ typedef enum { static_assert(EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_LAST <= std::numeric_limits::max(), "EncryptHeaderAuthToken value overflow"); +constexpr std::string_view ENCRYPT_DBG_TRACE_CACHED_PREFIX = "Chd"; +constexpr std::string_view ENCRYPT_DBG_TRACE_QUERY_PREFIX = "Qry"; +constexpr std::string_view ENCRYPT_DBG_TRACE_INSERT_PREFIX = "Ins"; +constexpr std::string_view ENCRYPT_DBG_TRACE_RESULT_PREFIX = "Res"; + +// Utility interface to construct TraceEvent key for debugging +std::string getEncryptDbgTraceKey(std::string_view prefix, + EncryptCipherDomainId domainId, + Optional baseCipherId = Optional()); #endif diff --git a/flow/IRandom.h b/flow/IRandom.h index 87f7f42424..bd46d108fe 100644 --- a/flow/IRandom.h +++ b/flow/IRandom.h @@ -90,6 +90,7 @@ public: uint64_t second() const { return part[1]; } static UID fromString(std::string const&); + static UID fromStringThrowsOnFailure(std::string const&); template void serialize_unversioned( diff --git a/flow/flow.cpp b/flow/flow.cpp index 285de7e904..fc09c92ab4 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -130,6 +130,19 @@ UID UID::fromString(std::string const& s) { return UID(a, b); } +UID UID::fromStringThrowsOnFailure(std::string const& s) { + if (s.size() != 32) { + // invalid string size + throw operation_failed(); + } + uint64_t a = 0, b = 0; + int r = sscanf(s.c_str(), "%16" SCNx64 "%16" SCNx64, &a, &b); + if (r != 2) { + throw operation_failed(); + } + return UID(a, b); +} + std::string UID::shortString() const { return format("%016llx", part[0]); } From 65cab4f12af30af438ddb428b875261a5b9a0077 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Wed, 11 May 2022 15:51:31 -0700 Subject: [PATCH 192/299] Don't pass nullptr to memcpy (#7136) --- fdbclient/Tenant.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fdbclient/Tenant.h b/fdbclient/Tenant.h index f7e69e6d44..adfa06470a 100644 --- a/fdbclient/Tenant.h +++ b/fdbclient/Tenant.h @@ -55,7 +55,9 @@ private: ASSERT(id >= 0); prefix = makeString(8 + subspace.size()); uint8_t* data = mutateString(prefix); - memcpy(data, subspace.begin(), subspace.size()); + if (subspace.size() > 0) { + memcpy(data, subspace.begin(), subspace.size()); + } int64_t swapped = bigEndian64(id); memcpy(data + subspace.size(), &swapped, 8); } From a3102c0db4bd792e79635e52a9250fc0df923352 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Thu, 12 May 2022 15:28:53 +0200 Subject: [PATCH 193/299] Fix updating database shared state on upgrades --- fdbclient/MultiVersionTransaction.actor.cpp | 49 +++++++++++++++++---- fdbclient/MultiVersionTransaction.h | 12 +++-- 2 files changed, 50 insertions(+), 11 deletions(-) diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index e281887e11..f90303d5da 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -1609,7 +1609,7 @@ void MultiVersionDatabase::DatabaseState::protocolVersionChanged(ProtocolVersion // When the protocol version changes, clear the corresponding entry in the shared state map // so it can be re-initialized. Only do so if there was a valid previous protocol version. 
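// (Illustration, not part of this patch's code: on a 7.1 -> 7.2 client upgrade the
// entry keyed by this cluster file path is cleared here under the old protocol
// version, and the next updateClusterSharedStateMap() call recreates it under 7.2;
// the protocol-version check added below keeps one database instance from clearing
// an entry that another instance has already replaced.)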
if (dbProtocolVersion.present() && MultiVersionApi::apiVersionAtLeast(710)) { - MultiVersionApi::api->clearClusterSharedStateMapEntry(clusterFilePath); + MultiVersionApi::api->clearClusterSharedStateMapEntry(clusterFilePath, dbProtocolVersion.get()); } dbProtocolVersion = protocolVersion; @@ -1722,8 +1722,10 @@ void MultiVersionDatabase::DatabaseState::updateDatabase(Reference ne } } if (db.isValid() && dbProtocolVersion.present() && MultiVersionApi::apiVersionAtLeast(710)) { - auto updateResult = MultiVersionApi::api->updateClusterSharedStateMap(clusterFilePath, db); + auto updateResult = + MultiVersionApi::api->updateClusterSharedStateMap(clusterFilePath, dbProtocolVersion.get(), db); auto handler = mapThreadFuture(updateResult, [this](ErrorOr result) { + TraceEvent("ClusterSharedStateUpdated").detail("ClusterFilePath", clusterFilePath); dbVar->set(db); return ErrorOr(Void()); }); @@ -2389,12 +2391,30 @@ void MultiVersionApi::updateSupportedVersions() { } } -ThreadFuture MultiVersionApi::updateClusterSharedStateMap(std::string clusterFilePath, Reference db) { +ThreadFuture MultiVersionApi::updateClusterSharedStateMap(std::string clusterFilePath, + ProtocolVersion dbProtocolVersion, + Reference db) { MutexHolder holder(lock); if (clusterSharedStateMap.find(clusterFilePath) == clusterSharedStateMap.end()) { - clusterSharedStateMap[clusterFilePath] = db->createSharedState(); + TraceEvent("CreatingClusterSharedState") + .detail("ClusterFilePath", clusterFilePath) + .detail("ProtocolVersion", dbProtocolVersion); + clusterSharedStateMap[clusterFilePath] = { db->createSharedState(), dbProtocolVersion }; } else { - ThreadFuture entry = clusterSharedStateMap[clusterFilePath]; + auto& sharedStateInfo = clusterSharedStateMap[clusterFilePath]; + if (sharedStateInfo.protocolVersion != dbProtocolVersion) { + // This situation should never happen, because we are connecting to the same cluster, + // so the protocol version must be the same + TraceEvent(SevError, "ClusterStateProtocolVersionMismatch") + .detail("ClusterFilePath", clusterFilePath) + .detail("ProtocolVersionExpected", dbProtocolVersion) + .detail("ProtocolVersionFound", sharedStateInfo.protocolVersion); + return Void(); + } + TraceEvent("SettingClusterSharedState") + .detail("ClusterFilePath", clusterFilePath) + .detail("ProtocolVersion", dbProtocolVersion); + ThreadFuture entry = sharedStateInfo.sharedStateFuture; return mapThreadFuture(entry, [db](ErrorOr result) { if (result.isError()) { return ErrorOr(result.getError()); @@ -2407,16 +2427,29 @@ ThreadFuture MultiVersionApi::updateClusterSharedStateMap(std::string clus return Void(); } -void MultiVersionApi::clearClusterSharedStateMapEntry(std::string clusterFilePath) { +void MultiVersionApi::clearClusterSharedStateMapEntry(std::string clusterFilePath, ProtocolVersion dbProtocolVersion) { MutexHolder holder(lock); auto mapEntry = clusterSharedStateMap.find(clusterFilePath); + // It can be that other database instances on the same cluster path are already upgraded and thus + // have cleared or even created a new shared object entry if (mapEntry == clusterSharedStateMap.end()) { - TraceEvent(SevError, "ClusterSharedStateMapEntryNotFound").detail("ClusterFilePath", clusterFilePath); + TraceEvent("ClusterSharedStateMapEntryNotFound").detail("ClusterFilePath", clusterFilePath); return; } - auto ssPtr = mapEntry->second.get(); + auto sharedStateInfo = mapEntry->second; + if (sharedStateInfo.protocolVersion != dbProtocolVersion) { + TraceEvent("ClusterSharedStateClearSkipped") + 
.detail("ClusterFilePath", clusterFilePath) + .detail("ProtocolVersionExpected", dbProtocolVersion) + .detail("ProtocolVersionFound", sharedStateInfo.protocolVersion); + return; + } + auto ssPtr = sharedStateInfo.sharedStateFuture.get(); ssPtr->delRef(ssPtr); clusterSharedStateMap.erase(mapEntry); + TraceEvent("ClusterSharedStateCleared") + .detail("ClusterFilePath", clusterFilePath) + .detail("ProtocolVersion", dbProtocolVersion); } std::vector parseOptionValues(std::string valueStr) { diff --git a/fdbclient/MultiVersionTransaction.h b/fdbclient/MultiVersionTransaction.h index 1fb5c604ff..77bc429dc4 100644 --- a/fdbclient/MultiVersionTransaction.h +++ b/fdbclient/MultiVersionTransaction.h @@ -861,8 +861,10 @@ public: bool callbackOnMainThread; bool localClientDisabled; - ThreadFuture updateClusterSharedStateMap(std::string clusterFilePath, Reference db); - void clearClusterSharedStateMapEntry(std::string clusterFilePath); + ThreadFuture updateClusterSharedStateMap(std::string clusterFilePath, + ProtocolVersion dbProtocolVersion, + Reference db); + void clearClusterSharedStateMapEntry(std::string clusterFilePath, ProtocolVersion dbProtocolVersion); static bool apiVersionAtLeast(int minVersion); @@ -888,7 +890,11 @@ private: std::map>> externalClients; // Map of clusterFilePath -> DatabaseSharedState pointer Future // Upon cluster version upgrade, clear the map entry for that cluster - std::map> clusterSharedStateMap; + struct SharedStateInfo { + ThreadFuture sharedStateFuture; + ProtocolVersion protocolVersion; + }; + std::map clusterSharedStateMap; bool networkStartSetup; volatile bool networkSetup; From 1992898323ea1d69bb90fd4cdf69fc383be81835 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Thu, 12 May 2022 15:34:03 +0200 Subject: [PATCH 194/299] Add a regression test to upgrade from 7.1 to 7.2 and downgrade back --- bindings/c/CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index ecbf7dbff7..cdb4dc096b 100644 --- a/bindings/c/CMakeLists.txt +++ b/bindings/c/CMakeLists.txt @@ -334,6 +334,14 @@ endif() --process-number 3 ) + add_test(NAME fdb_c_upgrade_multi_threaded_710api + COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py + --build-dir ${CMAKE_BINARY_DIR} + --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml + --upgrade-path "7.1.3" "7.2.0" "7.1.3" + --process-number 3 + ) + add_test(NAME fdb_c_cluster_wiggle COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py --build-dir ${CMAKE_BINARY_DIR} From 39e085938c65ccb96f450870d8e1d4cfc639f329 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Thu, 12 May 2022 09:47:51 -0700 Subject: [PATCH 195/299] Use libcoro for valgrind --- cmake/FDBComponents.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/FDBComponents.cmake b/cmake/FDBComponents.cmake index f0081df9c9..c21f504cf7 100644 --- a/cmake/FDBComponents.cmake +++ b/cmake/FDBComponents.cmake @@ -217,7 +217,7 @@ set(DEFAULT_COROUTINE_IMPL boost) if(WIN32) # boost coroutine not available in windows build environment for now. 
set(DEFAULT_COROUTINE_IMPL libcoro) -elseif(NOT APPLE AND NOT USE_SANITIZER AND CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^x86") +elseif(NOT APPLE AND NOT USE_ASAN AND CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^x86") # revert to libcoro for x86 linux while we investigate a performance regression set(DEFAULT_COROUTINE_IMPL libcoro) endif() From 30e124c09bf477eb34a833bfa6f8019104dbd7d4 Mon Sep 17 00:00:00 2001 From: Renxuan Wang Date: Wed, 11 May 2022 18:41:11 -0700 Subject: [PATCH 196/299] Remove HostnameStatus and resolve trigger. They are no longer needed since we have coordinators DNS cache; and they are introducing complex crashes. --- fdbclient/MonitorLeader.actor.cpp | 8 +- fdbrpc/genericactors.actor.h | 4 - flow/Hostname.actor.cpp | 122 ++++++++++-------------------- flow/Hostname.h | 17 +---- 4 files changed, 43 insertions(+), 108 deletions(-) diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index 7f76fcfdb2..0182bc1534 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -270,6 +270,9 @@ TEST_CASE("/fdbclient/MonitorLeader/ConnectionString/hostname") { ACTOR Future> tryResolveHostnamesImpl(ClusterConnectionString* self) { state std::set allCoordinatorsSet; + for (const auto& coord : self->coords) { + allCoordinatorsSet.insert(coord); + } std::vector> fs; for (auto& hostname : self->hostnames) { fs.push_back(map(hostname.resolve(), [&](Optional const& addr) -> Void { @@ -280,9 +283,6 @@ ACTOR Future> tryResolveHostnamesImpl(ClusterConnect })); } wait(waitForAll(fs)); - for (const auto& coord : self->coords) { - allCoordinatorsSet.insert(coord); - } std::vector allCoordinators(allCoordinatorsSet.begin(), allCoordinatorsSet.end()); std::sort(allCoordinators.begin(), allCoordinators.end()); return allCoordinators; @@ -300,7 +300,7 @@ TEST_CASE("/fdbclient/MonitorLeader/PartialResolve") { INetworkConnections::net()->addMockTCPEndpoint(hn, port, { address }); - state ClusterConnectionString cs(connectionString); + ClusterConnectionString cs(connectionString); state std::vector allCoordinators = wait(cs.tryResolveHostnames()); ASSERT(allCoordinators.size() == 1 && std::find(allCoordinators.begin(), allCoordinators.end(), address) != allCoordinators.end()); diff --git a/fdbrpc/genericactors.actor.h b/fdbrpc/genericactors.actor.h index 05f7c150ee..a8eb35758b 100644 --- a/fdbrpc/genericactors.actor.h +++ b/fdbrpc/genericactors.actor.h @@ -101,7 +101,6 @@ Future> tryGetReplyFromHostname(Req request, Hostname h resetReply(request); if (reply.getError().code() == error_code_request_maybe_delivered) { // Connection failure. - hostname.resetToUnresolved(); INetworkConnections::net()->removeCachedDNS(hostname.host, hostname.service); } } @@ -126,7 +125,6 @@ Future> tryGetReplyFromHostname(Req request, resetReply(request); if (reply.getError().code() == error_code_request_maybe_delivered) { // Connection failure. - hostname.resetToUnresolved(); INetworkConnections::net()->removeCachedDNS(hostname.host, hostname.service); } } @@ -149,7 +147,6 @@ Future retryGetReplyFromHostname(Req request, Hostname hostname // Connection failure. wait(delay(reconnetInterval)); reconnetInterval = std::min(2 * reconnetInterval, FLOW_KNOBS->HOSTNAME_RECONNECT_MAX_INTERVAL); - hostname.resetToUnresolved(); INetworkConnections::net()->removeCachedDNS(hostname.host, hostname.service); } else { throw reply.getError(); @@ -179,7 +176,6 @@ Future retryGetReplyFromHostname(Req request, // Connection failure. 
wait(delay(reconnetInterval)); reconnetInterval = std::min(2 * reconnetInterval, FLOW_KNOBS->HOSTNAME_RECONNECT_MAX_INTERVAL); - hostname.resetToUnresolved(); INetworkConnections::net()->removeCachedDNS(hostname.host, hostname.service); } else { throw reply.getError(); diff --git a/flow/Hostname.actor.cpp b/flow/Hostname.actor.cpp index ab89280a44..110061e5c9 100644 --- a/flow/Hostname.actor.cpp +++ b/flow/Hostname.actor.cpp @@ -39,47 +39,19 @@ Hostname Hostname::parse(const std::string& s) { return Hostname(f.substr(0, colonPos), f.substr(colonPos + 1), isTLS); } -void Hostname::resetToUnresolved() { - if (status == Hostname::RESOLVED) { - status = UNRESOLVED; - resolvedAddress = Optional(); - } -} - ACTOR Future> resolveImpl(Hostname* self) { - loop { - if (self->status == Hostname::UNRESOLVED) { - self->status = Hostname::RESOLVING; - try { - std::vector addresses = - wait(INetworkConnections::net()->resolveTCPEndpointWithDNSCache(self->host, self->service)); - NetworkAddress address = addresses[deterministicRandom()->randomInt(0, addresses.size())]; - address.flags = 0; // Reset the parsed address to public - address.fromHostname = NetworkAddressFromHostname::True; - if (self->isTLS) { - address.flags |= NetworkAddress::FLAG_TLS; - } - self->resolvedAddress = address; - self->status = Hostname::RESOLVED; - self->resolveFinish.trigger(); - return self->resolvedAddress.get(); - } catch (...) { - self->status = Hostname::UNRESOLVED; - self->resolveFinish.trigger(); - self->resolvedAddress = Optional(); - return Optional(); - } - } else if (self->status == Hostname::RESOLVING) { - wait(self->resolveFinish.onTrigger()); - if (self->status == Hostname::RESOLVED) { - return self->resolvedAddress.get(); - } - // Otherwise, this means other threads failed on resolve, so here we go back to the loop and try to resolve - // again. - } else { - // status is RESOLVED, nothing to do. - return self->resolvedAddress.get(); + try { + std::vector addresses = + wait(INetworkConnections::net()->resolveTCPEndpointWithDNSCache(self->host, self->service)); + NetworkAddress address = addresses[deterministicRandom()->randomInt(0, addresses.size())]; + address.flags = 0; // Reset the parsed address to public + address.fromHostname = NetworkAddressFromHostname::True; + if (self->isTLS) { + address.flags |= NetworkAddress::FLAG_TLS; } + return address; + } catch (...) { + return Optional(); } } @@ -109,24 +81,19 @@ Future Hostname::resolveWithRetry() { } Optional Hostname::resolveBlocking() { - if (status != RESOLVED) { - try { - std::vector addresses = - INetworkConnections::net()->resolveTCPEndpointBlockingWithDNSCache(host, service); - NetworkAddress address = addresses[deterministicRandom()->randomInt(0, addresses.size())]; - address.flags = 0; // Reset the parsed address to public - address.fromHostname = NetworkAddressFromHostname::True; - if (isTLS) { - address.flags |= NetworkAddress::FLAG_TLS; - } - resolvedAddress = address; - status = RESOLVED; - } catch (...) { - status = UNRESOLVED; - resolvedAddress = Optional(); + try { + std::vector addresses = + INetworkConnections::net()->resolveTCPEndpointBlockingWithDNSCache(host, service); + NetworkAddress address = addresses[deterministicRandom()->randomInt(0, addresses.size())]; + address.flags = 0; // Reset the parsed address to public + address.fromHostname = NetworkAddressFromHostname::True; + if (isTLS) { + address.flags |= NetworkAddress::FLAG_TLS; } + return address; + } catch (...) 
{ + return Optional(); } - return resolvedAddress; } TEST_CASE("/flow/Hostname/hostname") { @@ -179,49 +146,36 @@ TEST_CASE("/flow/Hostname/hostname") { ASSERT(!Hostname::isHostname(hn12s)); ASSERT(!Hostname::isHostname(hn13s)); - ASSERT(hn1.status == Hostname::UNRESOLVED && !hn1.resolvedAddress.present()); - ASSERT(hn2.status == Hostname::UNRESOLVED && !hn2.resolvedAddress.present()); - ASSERT(hn3.status == Hostname::UNRESOLVED && !hn3.resolvedAddress.present()); - ASSERT(hn4.status == Hostname::UNRESOLVED && !hn4.resolvedAddress.present()); + state Optional optionalAddress = wait(hn2.resolve()); + ASSERT(!optionalAddress.present()); - state Optional emptyAddress = wait(hn2.resolve()); - ASSERT(hn2.status == Hostname::UNRESOLVED && !hn2.resolvedAddress.present() && !emptyAddress.present()); + optionalAddress = hn2.resolveBlocking(); + ASSERT(!optionalAddress.present()); + state NetworkAddress address; try { - NetworkAddress _ = wait(timeoutError(hn2.resolveWithRetry(), 1)); + wait(timeoutError(store(address, hn2.resolveWithRetry()), 1)); } catch (Error& e) { ASSERT(e.code() == error_code_timed_out); } - ASSERT(hn2.status == Hostname::UNRESOLVED && !hn2.resolvedAddress.present()); - - emptyAddress = hn2.resolveBlocking(); - ASSERT(hn2.status == Hostname::UNRESOLVED && !hn2.resolvedAddress.present() && !emptyAddress.present()); + ASSERT(address == NetworkAddress()); state NetworkAddress addressSource = NetworkAddress::parse("127.0.0.0:1234"); INetworkConnections::net()->addMockTCPEndpoint("host-name", "1234", { addressSource }); // Test resolve. - state Optional optionalAddress = wait(hn2.resolve()); - ASSERT(hn2.status == Hostname::RESOLVED); - ASSERT(hn2.resolvedAddress.get() == addressSource && optionalAddress.get() == addressSource); + wait(store(optionalAddress, hn2.resolve())); + ASSERT(optionalAddress.present() && optionalAddress.get() == addressSource); + optionalAddress = Optional(); + + // Test resolveBlocking. + optionalAddress = hn2.resolveBlocking(); + ASSERT(optionalAddress.present() && optionalAddress.get() == addressSource); optionalAddress = Optional(); // Test resolveWithRetry. - hn2.resetToUnresolved(); - ASSERT(hn2.status == Hostname::UNRESOLVED && !hn2.resolvedAddress.present()); - - state NetworkAddress address = wait(hn2.resolveWithRetry()); - ASSERT(hn2.status == Hostname::RESOLVED); - ASSERT(hn2.resolvedAddress.get() == addressSource && address == addressSource); - - // Test resolveBlocking. - hn2.resetToUnresolved(); - ASSERT(hn2.status == Hostname::UNRESOLVED && !hn2.resolvedAddress.present()); - - optionalAddress = hn2.resolveBlocking(); - ASSERT(hn2.status == Hostname::RESOLVED); - ASSERT(hn2.resolvedAddress.get() == addressSource && optionalAddress.get() == addressSource); - optionalAddress = Optional(); + wait(store(address, hn2.resolveWithRetry())); + ASSERT(address == addressSource); return Void(); } diff --git a/flow/Hostname.h b/flow/Hostname.h index 2492a17370..c7871e5028 100644 --- a/flow/Hostname.h +++ b/flow/Hostname.h @@ -33,16 +33,6 @@ struct Hostname { Hostname(const std::string& host, const std::string& service, bool isTLS) : host(host), service(service), isTLS(isTLS) {} Hostname() : host(""), service(""), isTLS(false) {} - Hostname(const Hostname& rhs) { operator=(rhs); } - Hostname& operator=(const Hostname& rhs) { - // Copy everything except AsyncTrigger resolveFinish. 
- host = rhs.host; - service = rhs.service; - isTLS = rhs.isTLS; - resolvedAddress = rhs.resolvedAddress; - status = rhs.status; - return *this; - } bool operator==(const Hostname& r) const { return host == r.host && service == r.service && isTLS == r.isTLS; } bool operator!=(const Hostname& r) const { return !(*this == r); } @@ -72,20 +62,15 @@ struct Hostname { std::string toString() const { return host + ":" + service + (isTLS ? ":tls" : ""); } - Optional resolvedAddress; - enum HostnameStatus { UNRESOLVED, RESOLVING, RESOLVED }; // The resolve functions below use DNS cache. Future> resolve(); Future resolveWithRetry(); Optional resolveBlocking(); // This one should only be used when resolving asynchronously is // impossible. For all other cases, resolve() should be preferred. - void resetToUnresolved(); - HostnameStatus status = UNRESOLVED; - AsyncTrigger resolveFinish; template void serialize(Ar& ar) { - serializer(ar, host, service, isTLS, resolvedAddress, status); + serializer(ar, host, service, isTLS); } }; From 8117aa9670a839d76ad92ef98d095f1915c60181 Mon Sep 17 00:00:00 2001 From: Junhyun Shim Date: Thu, 12 May 2022 21:28:30 +0200 Subject: [PATCH 197/299] Add Selective mTLS unit test using MkCert --- flow/CMakeLists.txt | 10 ++ flow/TLSTest.cpp | 254 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 264 insertions(+) create mode 100644 flow/TLSTest.cpp diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index 705a98f282..4266651aa1 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -197,3 +197,13 @@ endif() add_executable(mkcert MkCertCli.cpp) target_link_libraries(mkcert PUBLIC fmt::fmt flow) + +add_executable(mtls_unittest TLSTest.cpp) +target_link_libraries(mtls_unittest PUBLIC fmt::fmt flow) +if(USE_SANITIZER) + target_link_libraries(mtls_unittest PUBLIC boost_asan) +else() + target_link_libraries(mtls_unittest PUBLIC boost_target) +endif() +add_test(NAME mutual_tls_unittest + COMMAND $) diff --git a/flow/TLSTest.cpp b/flow/TLSTest.cpp new file mode 100644 index 0000000000..edf78128e5 --- /dev/null +++ b/flow/TLSTest.cpp @@ -0,0 +1,254 @@ +/* + * TLSTest.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "flow/Arena.h" +#include "flow/MkCert.h" + +std::FILE* outp = stdout; + +template +void log(Args&&... args) { + auto buf = fmt::memory_buffer{}; + fmt::format_to(std::back_inserter(buf), std::forward(args)...); + fmt::print(outp, "{}\n", std::string_view(buf.data(), buf.size())); +} + +template +void logc(Args&&... args) { + auto buf = fmt::memory_buffer{}; + fmt::format_to(std::back_inserter(buf), "[CLIENT] "); + fmt::format_to(std::back_inserter(buf), std::forward(args)...); + fmt::print(outp, "{}\n", std::string_view(buf.data(), buf.size())); +} + +template +void logs(Args&&... 
args) { + auto buf = fmt::memory_buffer{}; + fmt::format_to(std::back_inserter(buf), "[SERVER] "); + fmt::format_to(std::back_inserter(buf), std::forward(args)...); + fmt::print(outp, "{}\n", std::string_view(buf.data(), buf.size())); +} + +using namespace boost::asio; +using ip::tcp; + +using ec_type = boost::system::error_code; + +using socket_type = ssl::stream; +using work_guard_type = executor_work_guard; + +const_buffer toBuffer(StringRef s) { + ASSERT(!s.empty()); + return const_buffer(s.begin(), s.size()); +} + +void trustRootCaCert(ssl::context& ctx, StringRef certPem) { + if (!certPem.empty()) + ctx.add_certificate_authority(const_buffer(certPem.begin(), certPem.size())); +} + +void useChain(ssl::context& ctx, mkcert::CertChainRef chain) { + auto arena = Arena(); + auto chainStr = concatCertChain(arena, chain); + if (!chainStr.empty()) + ctx.use_certificate_chain(toBuffer(chainStr)); + auto keyPem = chain.front().privateKeyPem; + if (!keyPem.empty()) + ctx.use_private_key(toBuffer(keyPem), ssl::context::pem); +} + +void initCerts(ssl::context& ctx, mkcert::CertChainRef myChain, StringRef peerRootPem) { + trustRootCaCert(ctx, peerRootPem); + if (myChain.size() > 1) + myChain.pop_back(); + if (!myChain.empty()) + useChain(ctx, myChain); +} + +void initSslContext(ssl::context& ctx, + mkcert::CertChainRef myChain, + mkcert::CertChainRef peerChain, + mkcert::ESide side) { + ctx.set_options(ssl::context::default_workarounds); + ctx.set_verify_mode(ssl::context::verify_peer | + (side == mkcert::ESide::Server ? 0 : ssl::verify_fail_if_no_peer_cert)); + initCerts(ctx, myChain, peerChain.empty() ? StringRef() : peerChain.back().certPem); +} + +template <> +struct fmt::formatter { + constexpr auto parse(format_parse_context& ctx) -> decltype(ctx.begin()) { return ctx.begin(); } + + template + auto format(const tcp::endpoint& ep, FormatContext& ctx) -> decltype(ctx.out()) { + return fmt::format_to(ctx.out(), "{}:{}", ep.address().to_string(), ep.port()); + } +}; + +void runTlsTest(int serverChainLen, int clientChainLen) { + log("==== BEGIN TESTCASE ===="); + auto clientSsl = ssl::context(ssl::context::tls); + auto serverSsl = ssl::context(ssl::context::tls); + auto const expectHandshakeOk = clientChainLen >= 0 && serverChainLen > 0; + auto const expectTrusted = clientChainLen != 0; + log("cert chain length: server {}, client {}", serverChainLen, clientChainLen); + auto arena = Arena(); + auto serverChain = mkcert::CertChainRef{}; + auto clientChain = mkcert::CertChainRef{}; + if (serverChainLen) { + auto tmpArena = Arena(); + auto specs = mkcert::makeCertChainSpec(tmpArena, std::labs(serverChainLen), mkcert::ESide::Server); + if (serverChainLen < 0) { + specs[0].offsetNotBefore = -60l * 60 * 24 * 365; + specs[0].offsetNotAfter = -10l; // cert that expired 10 seconds ago + } + serverChain = mkcert::makeCertChain(arena, specs, {} /* create root CA cert from spec*/); + } + if (clientChainLen) { + auto tmpArena = Arena(); + auto specs = mkcert::makeCertChainSpec(tmpArena, std::labs(clientChainLen), mkcert::ESide::Client); + if (clientChainLen < 0) { + specs[0].offsetNotBefore = -60l * 60 * 24 * 365; + specs[0].offsetNotAfter = -10l; // cert that expired 10 seconds ago + } + clientChain = mkcert::makeCertChain(arena, specs, {} /* create root CA cert from spec*/); + } + initSslContext(clientSsl, clientChain, serverChain, mkcert::ESide::Client); + log("client SSL contexts initialized"); + initSslContext(serverSsl, serverChain, clientChain, mkcert::ESide::Server); + log("server SSL contexts 
initialized"); + auto io = io_context(); + auto serverWorkGuard = work_guard_type(io.get_executor()); + auto clientWorkGuard = work_guard_type(io.get_executor()); + auto const ip = ip::address::from_string("127.0.0.1"); + auto acceptor = tcp::acceptor(io, tcp::endpoint(ip, 0)); + auto const serverAddr = acceptor.local_endpoint(); + logs("server listening at {}", serverAddr); + auto serverSock = tcp::socket(io); + auto serverSslSock = socket_type(serverSock, serverSsl); + enum class ESockState { AssumedUntrusted, Trusted }; + auto serverSockState = ESockState::AssumedUntrusted; + auto clientSockState = ESockState::AssumedUntrusted; + auto handshakeOk = true; + serverSslSock.set_verify_callback([&serverSockState, &handshakeOk](bool preverify, ssl::verify_context&) { + logs("client preverify: {}", preverify); + switch (serverSockState) { + case ESockState::AssumedUntrusted: + if (!preverify) + return handshakeOk = false; + serverSockState = ESockState::Trusted; + break; + case ESockState::Trusted: + if (!preverify) + return handshakeOk = false; + break; + default: + break; + } + // if untrusted connection passes preverify, they are considered trusted + return true; + }); + acceptor.async_accept(serverSock, [&serverSslSock, &serverWorkGuard, &handshakeOk](const ec_type& ec) { + if (ec) { + logs("accept error: {}", ec.message()); + handshakeOk = false; + serverWorkGuard.reset(); + } else { + logs("accepted connection from {}", serverSslSock.next_layer().remote_endpoint()); + serverSslSock.async_handshake(ssl::stream_base::handshake_type::server, + [&serverWorkGuard, &handshakeOk](const ec_type& ec) { + if (ec) { + logs("server handshake returned {}", ec.message()); + handshakeOk = false; + } else { + logs("handshake OK"); + } + serverWorkGuard.reset(); + }); + } + }); + auto clientSock = tcp::socket(io); + auto clientSslSock = socket_type(clientSock, clientSsl); + clientSslSock.set_verify_callback([&clientSockState](bool preverify, ssl::verify_context&) { + logc("server preverify: {}", preverify); + switch (clientSockState) { + case ESockState::AssumedUntrusted: + if (!preverify) + return false; + clientSockState = ESockState::Trusted; + break; + case ESockState::Trusted: + if (!preverify) + return false; + break; + default: + break; + } + // if untrusted connection passes preverify, they are considered trusted + return true; + }); + clientSock.async_connect(serverAddr, + [&clientWorkGuard, &clientSock, &clientSslSock, &handshakeOk](const ec_type& ec) { + if (ec) { + logc("connect error: {}", ec.message()); + handshakeOk = false; + clientWorkGuard.reset(); + } else { + logc("connected to {}", clientSock.remote_endpoint()); + clientSslSock.async_handshake(ssl::stream_base::handshake_type::client, + [&clientWorkGuard, &handshakeOk](const ec_type& ec) { + if (ec) { + logc("handshake returned: {}", ec.message()); + handshakeOk = false; + } else { + logc("handshake OK"); + } + clientWorkGuard.reset(); + }); + } + }); + io.run(); + ASSERT_EQ(expectHandshakeOk, handshakeOk); + if (expectHandshakeOk) { + ASSERT_EQ(expectTrusted, (serverSockState == ESockState::Trusted)); + log("Test OK: Handshake passed and connection {} as expected", + serverSockState == ESockState::Trusted ? 
"trusted" : "untrusted"); + } else { + log("Test OK: Handshake failed as expected"); + } +} + +int main() { + std::pair inputs[] = { { 3, 2 }, { 4, 0 }, { -3, 1 }, { 3, -2 }, { -3, 0 }, + { 0, 0 }, { 0, 1 }, { 1, 3 }, { -1, -3 }, { 1, 0 } }; + for (auto input : inputs) { + auto [serverChainLen, clientChainLen] = input; + runTlsTest(serverChainLen, clientChainLen); + } + return 0; +} From c51ad847d6571b7453ef1fb5e89c3e8f9976334e Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Thu, 12 May 2022 13:12:23 -0700 Subject: [PATCH 198/299] revert "kill" command to previous 7.0 behavior because the current implementation is killing processes one at a time --- fdbcli/fdbcli.actor.cpp | 58 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index e936a98420..d40a85d77d 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -1514,9 +1514,61 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { if (tokencmp(tokens[0], "kill")) { getTransaction(db, managementTenant, tr, options, intrans); - bool _result = wait(makeInterruptable(killCommandActor(db, tr, tokens, &address_interface))); - if (!_result) - is_error = true; + if (tokens.size() == 1) { + RangeResult kvs = wait( + makeInterruptable(tr->getRange(KeyRangeRef(LiteralStringRef("\xff\xff/worker_interfaces/"), + LiteralStringRef("\xff\xff/worker_interfaces0")), + CLIENT_KNOBS->TOO_MANY))); + ASSERT(!kvs.more); + auto connectLock = makeReference(CLIENT_KNOBS->CLI_CONNECT_PARALLELISM); + std::vector> addInterfs; + for (auto it : kvs) { + addInterfs.push_back(addInterface(&address_interface, connectLock, it)); + } + wait(waitForAll(addInterfs)); + } + if (tokens.size() == 1 || tokencmp(tokens[1], "list")) { + if (address_interface.size() == 0) { + printf("\nNo addresses can be killed.\n"); + } else if (address_interface.size() == 1) { + printf("\nThe following address can be killed:\n"); + } else { + printf("\nThe following %zu addresses can be killed:\n", address_interface.size()); + } + for (auto it : address_interface) { + printf("%s\n", printable(it.first).c_str()); + } + printf("\n"); + } else if (tokencmp(tokens[1], "all")) { + for (auto it : address_interface) { + BinaryReader::fromStringRef(it.second.first, IncludeVersion()) + .reboot.send(RebootRequest()); + } + if (address_interface.size() == 0) { + fprintf(stderr, + "ERROR: no processes to kill. 
You must run the `kill’ command before " + "running `kill all’.\n"); + } else { + printf("Attempted to kill %zu processes\n", address_interface.size()); + } + } else { + for (int i = 1; i < tokens.size(); i++) { + if (!address_interface.count(tokens[i])) { + fprintf(stderr, "ERROR: process `%s' not recognized.\n", printable(tokens[i]).c_str()); + is_error = true; + break; + } + } + + if (!is_error) { + for (int i = 1; i < tokens.size(); i++) { + BinaryReader::fromStringRef(address_interface[tokens[i]].first, + IncludeVersion()) + .reboot.send(RebootRequest()); + } + printf("Attempted to kill %zu processes\n", tokens.size() - 1); + } + } continue; } From 809bc52bbc38b2e354a5ae10620bfeca8ef5b938 Mon Sep 17 00:00:00 2001 From: Junhyun Shim Date: Thu, 12 May 2022 23:12:51 +0200 Subject: [PATCH 199/299] Add boringssl workaround for Mac builds --- flow/MkCert.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/flow/MkCert.cpp b/flow/MkCert.cpp index 4249f398de..b46de8fa4a 100644 --- a/flow/MkCert.cpp +++ b/flow/MkCert.cpp @@ -259,7 +259,11 @@ CertAndKeyNative makeCertNative(CertSpecRef spec, CertAndKeyNative issuer) { TraceEvent(SevWarnAlways, "MkCertInvalidExtName").suppressFor(10).detail("Name", extName); throw tls_error(); } +#ifdef OPENSSL_IS_BORINGSSL + auto ext = ::X509V3_EXT_conf_nid(nullptr, &ctx, extNid, const_cast(extValue.c_str())); +#else auto ext = ::X509V3_EXT_conf_nid(nullptr, &ctx, extNid, extValue.c_str()); +#endif OSSL_ASSERT(ext); auto extGuard = ScopeExit([ext]() { ::X509_EXTENSION_free(ext); }); OSSL_ASSERT(::X509_add_ext(x, ext, -1)); From cd4a7038fc21357403648a942d9ec81d1a5e17f4 Mon Sep 17 00:00:00 2001 From: Junhyun Shim Date: Thu, 12 May 2022 23:14:52 +0200 Subject: [PATCH 200/299] Move CertKind::getCommonName() to source file --- flow/MkCert.cpp | 12 ++++++++++++ flow/MkCert.h | 12 +----------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/flow/MkCert.cpp b/flow/MkCert.cpp index b46de8fa4a..fa11aab603 100644 --- a/flow/MkCert.cpp +++ b/flow/MkCert.cpp @@ -379,4 +379,16 @@ CertChainRef makeCertChain(Arena& arena, unsigned length, ESide side) { return makeCertChain(arena, specs, {} /*root*/); } +StringRef CertKind::getCommonName(StringRef prefix, Arena& arena) const { + auto const side = std::string(isClientSide() ? " Client" : " Server"); + if (isIntermediateCA()) { + auto const level = isClientSide() ? get().level : get().level; + return prefix.withSuffix(fmt::format("{} Intermediate {}", side, level), arena); + } else if (isRootCA()) { + return prefix.withSuffix(fmt::format("{} Root", side), arena); + } else { + return prefix.withSuffix(side, arena); + } +} + } // namespace mkcert diff --git a/flow/MkCert.h b/flow/MkCert.h index 4220202824..2fdaa6b8a5 100644 --- a/flow/MkCert.h +++ b/flow/MkCert.h @@ -91,17 +91,7 @@ struct CertKind { bool isCA() const noexcept { return !isLeaf(); } - StringRef getCommonName(StringRef prefix, Arena& arena) const { - auto const side = std::string(isClientSide() ? " Client" : " Server"); - if (isIntermediateCA()) { - auto const level = isClientSide() ? 
get().level : get().level; - return prefix.withSuffix(fmt::format("{} Intermediate {}", side, level), arena); - } else if (isRootCA()) { - return prefix.withSuffix(fmt::format("{} Root", side), arena); - } else { - return prefix.withSuffix(side, arena); - } - } + StringRef getCommonName(StringRef prefix, Arena& arena) const; std::variant value; }; From 452315ee78d50f253d796101b8afe14df9041b6a Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 12 May 2022 15:07:02 -0700 Subject: [PATCH 201/299] Build mockkms and add mockkms test (#7153) --- cmake/CompileBoost.cmake | 2 +- contrib/CMakeLists.txt | 1 + contrib/mockkms/CMakeLists.txt | 18 ++++++++++++++++++ .../src => contrib}/mockkms/fault_injection.go | 0 .../mockkms/get_encryption_keys.go | 0 .../go/src => contrib}/mockkms/mock_kms.go | 0 .../go/src => contrib}/mockkms/mockkms_test.go | 0 {bindings/go/src => contrib}/mockkms/utils.go | 0 8 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 contrib/mockkms/CMakeLists.txt rename {bindings/go/src => contrib}/mockkms/fault_injection.go (100%) rename {bindings/go/src => contrib}/mockkms/get_encryption_keys.go (100%) rename {bindings/go/src => contrib}/mockkms/mock_kms.go (100%) rename {bindings/go/src => contrib}/mockkms/mockkms_test.go (100%) rename {bindings/go/src => contrib}/mockkms/utils.go (100%) diff --git a/cmake/CompileBoost.cmake b/cmake/CompileBoost.cmake index d73d0ab024..42bc5770dd 100644 --- a/cmake/CompileBoost.cmake +++ b/cmake/CompileBoost.cmake @@ -123,7 +123,7 @@ set(FORCE_BOOST_BUILD OFF CACHE BOOL "Forces cmake to build boost and ignores an if(Boost_FOUND AND Boost_filesystem_FOUND AND Boost_context_FOUND AND NOT FORCE_BOOST_BUILD) add_library(boost_target INTERFACE) - target_link_libraries(boost_target INTERFACE Boost::boost Boost::context_FOUND Boost::filesystem) + target_link_libraries(boost_target INTERFACE Boost::boost Boost::context Boost::filesystem) elseif(WIN32) message(FATAL_ERROR "Could not find Boost") else() diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index f2da5835fc..e34743d00a 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -5,3 +5,4 @@ if(NOT WIN32) add_subdirectory(TraceLogHelper) add_subdirectory(TestHarness) endif() +add_subdirectory(mockkms) diff --git a/contrib/mockkms/CMakeLists.txt b/contrib/mockkms/CMakeLists.txt new file mode 100644 index 0000000000..d5a79652d0 --- /dev/null +++ b/contrib/mockkms/CMakeLists.txt @@ -0,0 +1,18 @@ +if(WITH_GO_BINDING) + set(MOCK_KMS_SRC fault_injection.go get_encryption_keys.go mock_kms.go utils.go) + set(MOCK_KMS_TEST_SRC ${MOCK_KMS_SRC} mockkms_test.go) + add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/bin/mockkms + COMMAND go build -o ${CMAKE_BINARY_DIR}/bin/mockkms ${MOCK_KMS_SRC} + DEPENDS ${MOCK_KMS_SRC} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + add_custom_target(mockkms ALL DEPENDS ${CMAKE_BINARY_DIR}/bin/mockkms) + fdb_install(PROGRAMS ${CMAKE_BINARY_DIR}/bin/mockkms DESTINATION bin COMPONENT server) + + add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/bin/mockkms_test + COMMAND go test -c -o ${CMAKE_BINARY_DIR}/bin/mockkms_test ${MOCK_KMS_TEST_SRC} + DEPENDS ${MOCK_KMS_TEST_SRC} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + add_custom_target(mockkms_test ALL DEPENDS ${CMAKE_BINARY_DIR}/bin/mockkms_test) + add_test(NAME mockkms COMMAND ${CMAKE_BINARY_DIR}/bin/mockkms_test) + +endif() diff --git a/bindings/go/src/mockkms/fault_injection.go b/contrib/mockkms/fault_injection.go similarity index 100% rename from 
bindings/go/src/mockkms/fault_injection.go rename to contrib/mockkms/fault_injection.go diff --git a/bindings/go/src/mockkms/get_encryption_keys.go b/contrib/mockkms/get_encryption_keys.go similarity index 100% rename from bindings/go/src/mockkms/get_encryption_keys.go rename to contrib/mockkms/get_encryption_keys.go diff --git a/bindings/go/src/mockkms/mock_kms.go b/contrib/mockkms/mock_kms.go similarity index 100% rename from bindings/go/src/mockkms/mock_kms.go rename to contrib/mockkms/mock_kms.go diff --git a/bindings/go/src/mockkms/mockkms_test.go b/contrib/mockkms/mockkms_test.go similarity index 100% rename from bindings/go/src/mockkms/mockkms_test.go rename to contrib/mockkms/mockkms_test.go diff --git a/bindings/go/src/mockkms/utils.go b/contrib/mockkms/utils.go similarity index 100% rename from bindings/go/src/mockkms/utils.go rename to contrib/mockkms/utils.go From e9e11bf53b3bf5f3bf73c8bebb341c4d750cf9e2 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 12 May 2022 16:30:21 -0700 Subject: [PATCH 202/299] change all criteria to knobs --- fdbclient/ServerKnobs.cpp | 3 +++ fdbclient/ServerKnobs.h | 4 ++++ fdbserver/DataDistributionQueue.actor.cpp | 27 ++++++++++++++--------- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index c0cc5c79ca..05b3e5dc95 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -159,6 +159,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( PRIORITY_SPLIT_SHARD, 950 ); if( randomize && BUGGIFY ) PRIORITY_SPLIT_SHARD = 350; // Data distribution + init( READ_REBALANCE_CPU_THRESHOLD, 15.0 ); + init( READ_REBALANCE_SRC_PARALLELISM, 5 ); + init( READ_REBALANCE_SHARD_TOPK, 10 ); init( RETRY_RELOCATESHARD_DELAY, 0.1 ); init( DATA_DISTRIBUTION_FAILURE_REACTION_TIME, 60.0 ); if( randomize && BUGGIFY ) DATA_DISTRIBUTION_FAILURE_REACTION_TIME = 1.0; bool buggifySmallShards = randomize && BUGGIFY; diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index ec7f9ab620..a976b5041d 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -161,6 +161,10 @@ public: int PRIORITY_SPLIT_SHARD; // Data distribution + double READ_REBALANCE_CPU_THRESHOLD; // read rebalance only happens if the source servers' CPU > threshold + int READ_REBALANCE_SRC_PARALLELISM; // the max count a server become a source server within a certain interval + int READ_REBALANCE_SHARD_TOPK; // top k shards were return for random selection in read rebalance + double RETRY_RELOCATESHARD_DELAY; double DATA_DISTRIBUTION_FAILURE_REACTION_TIME; int MIN_SHARD_BYTES, SHARD_BYTES_RATIO, SHARD_BYTES_PER_SQRT_BYTES, MAX_SHARD_BYTES, KEY_SERVER_SHARD_BYTES; diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 9cc1780bf2..5caef85278 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1083,8 +1083,8 @@ struct DDQueueData { bool timeThrottle(const std::vector& ids) const { return std::any_of(ids.begin(), ids.end(), [this](const UID& id) { if (this->lastAsSource.count(id)) { - // TODO: set 5.0 as a knob - return (now() - this->lastAsSource.at(id)) * 5.0 < SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL; + return (now() - this->lastAsSource.at(id)) * SERVER_KNOBS->READ_REBALANCE_SRC_PARALLELISM < + SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL; } return false; }); @@ -1545,18 +1545,19 @@ ACTOR Future rebalanceReadLoad(DDQueueData* self, 
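// [Editorial note -- not part of PATCH 202. The timeThrottle() hunk above
// replaces the hard-coded 5.0 with READ_REBALANCE_SRC_PARALLELISM: a server
// may be picked as a rebalance source at most that many times per
// STORAGE_METRICS_AVERAGE_INTERVAL. A minimal standalone sketch of that
// predicate, with wouldThrottleAsSource as a hypothetical helper name:
//
//     bool wouldThrottleAsSource(double lastAsSourceTime) {
//         // The server has rested long enough only once
//         // interval / parallelism seconds have passed.
//         return (now() - lastAsSourceTime) * SERVER_KNOBS->READ_REBALANCE_SRC_PARALLELISM <
//                SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL;
//     }
//
// The hunk below applies the same knob conversion inside rebalanceReadLoad.]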
traceEvent->detail("SkipReason", "TeamTooSimilar"); return false; } - // TODO: set 10 as a knob // randomly choose topK shards - int topK = std::min(int(0.1 * shards.size()), 10); + int topK = std::min(int(0.1 * shards.size()), SERVER_KNOBS->READ_REBALANCE_SHARD_TOPK); state Future healthMetrics = self->cx->getHealthMetrics(true); - state GetTopKMetricsRequest req(shards, topK, (srcLoad - destLoad) / 10.0); // 1/(5 * 2) + state GetTopKMetricsRequest req( + shards, topK, (srcLoad - destLoad) / 2.0 / SERVER_KNOBS->READ_REBALANCE_SRC_PARALLELISM); req.comparator = [](const StorageMetrics& a, const StorageMetrics& b) { return a.bytesReadPerKSecond / std::max(a.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES) > b.bytesReadPerKSecond / std::max(b.bytes * 1.0, 1.0 * SERVER_KNOBS->MIN_SHARD_BYTES); }; state std::vector metricsList = wait(brokenPromiseToNever(self->getTopKMetrics.getReply(req))); wait(ready(healthMetrics)); - if (getWorstCpu(healthMetrics.get(), sourceTeam->getServerIDs()) < 25.0) { // 25% + if (getWorstCpu(healthMetrics.get(), sourceTeam->getServerIDs()) < + SERVER_KNOBS->READ_REBALANCE_CPU_THRESHOLD) { // 15.0 +- (0.3 * 15) < 20.0 traceEvent->detail("SkipReason", "LowReadLoad"); return false; } @@ -1710,7 +1711,8 @@ ACTOR Future BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, state GetTeamRequest srcReq; state GetTeamRequest destReq; state TraceEvent traceEvent(eventName, self->distributorId); - traceEvent.suppressFor(5.0) + // FIXME: uncomment + traceEvent // .suppressFor(5.0) .detail("PollingInterval", rebalancePollingInterval) .detail("Rebalance", readRebalance ? "Read" : "Disk"); @@ -1719,7 +1721,6 @@ ACTOR Future BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, } try { - // FIXME: change back to BG_REBALANCE_SWITCH_CHECK_INTERVAL after test delayF = delay(rebalancePollingInterval, TaskPriority::DataDistributionLaunch); if ((now() - lastRead) > SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL) { tr.setOption(FDBTransactionOptions::LOCK_AWARE); @@ -1838,7 +1839,10 @@ ACTOR Future BgDDMountainChopper(DDQueueData* self, int teamCollectionInde state std::pair>, bool> randomTeam; state bool moved = false; state TraceEvent traceEvent("BgDDMountainChopper_Old", self->distributorId); - traceEvent.suppressFor(5.0).detail("PollingInterval", rebalancePollingInterval).detail("Rebalance", "Disk"); + // FIXME: uncomment + traceEvent // .suppressFor(5.0) + .detail("PollingInterval", rebalancePollingInterval) + .detail("Rebalance", "Disk"); if (*self->lastLimited > 0) { traceEvent.detail("SecondsSinceLastLimited", now() - *self->lastLimited); @@ -1961,7 +1965,10 @@ ACTOR Future BgDDValleyFiller(DDQueueData* self, int teamCollectionIndex) state std::pair>, bool> randomTeam; state bool moved = false; state TraceEvent traceEvent("BgDDValleyFiller_Old", self->distributorId); - traceEvent.suppressFor(5.0).detail("PollingInterval", rebalancePollingInterval).detail("Rebalance", "Disk"); + // FIXME: uncomment + traceEvent //.suppressFor(5.0) + .detail("PollingInterval", rebalancePollingInterval) + .detail("Rebalance", "Disk"); if (*self->lastLimited > 0) { traceEvent.detail("SecondsSinceLastLimited", now() - *self->lastLimited); From 583120514f834f1ff45407234ef4e8520e0cf117 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Thu, 12 May 2022 17:19:25 -0700 Subject: [PATCH 203/299] fixed compilation errors --- fdbcli/fdbcli.actor.cpp | 39 +++++++++++++++++++++++++++++++++++---- fdbcli/fdbcli.actor.h | 5 +---- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git 
a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index d40a85d77d..523128b271 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -1012,6 +1012,36 @@ Future stopNetworkAfter(Future what) { } } +ACTOR Future addInterface(std::map>* address_interface, + Reference connectLock, + KeyValue kv) { + wait(connectLock->take()); + state FlowLock::Releaser releaser(*connectLock); + state ClientWorkerInterface workerInterf = + BinaryReader::fromStringRef(kv.value, IncludeVersion()); + state ClientLeaderRegInterface leaderInterf(workerInterf.address()); + choose { + when(Optional rep = + wait(brokenPromiseToNever(leaderInterf.getLeader.getReply(GetLeaderRequest())))) { + StringRef ip_port = + (kv.key.endsWith(LiteralStringRef(":tls")) ? kv.key.removeSuffix(LiteralStringRef(":tls")) : kv.key) + .removePrefix(LiteralStringRef("\xff\xff/worker_interfaces/")); + (*address_interface)[ip_port] = std::make_pair(kv.value, leaderInterf); + + if (workerInterf.reboot.getEndpoint().addresses.secondaryAddress.present()) { + Key full_ip_port2 = + StringRef(workerInterf.reboot.getEndpoint().addresses.secondaryAddress.get().toString()); + StringRef ip_port2 = full_ip_port2.endsWith(LiteralStringRef(":tls")) + ? full_ip_port2.removeSuffix(LiteralStringRef(":tls")) + : full_ip_port2; + (*address_interface)[ip_port2] = std::make_pair(kv.value, leaderInterf); + } + } + when(wait(delay(CLIENT_KNOBS->CLI_CONNECT_TIMEOUT))) {} + } + return Void(); +} + ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { state LineNoise& linenoise = *plinenoise; state bool intrans = false; @@ -1515,10 +1545,11 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { if (tokencmp(tokens[0], "kill")) { getTransaction(db, managementTenant, tr, options, intrans); if (tokens.size() == 1) { - RangeResult kvs = wait( - makeInterruptable(tr->getRange(KeyRangeRef(LiteralStringRef("\xff\xff/worker_interfaces/"), - LiteralStringRef("\xff\xff/worker_interfaces0")), - CLIENT_KNOBS->TOO_MANY))); + state ThreadFuture wInterfF = + tr->getRange(KeyRangeRef(LiteralStringRef("\xff\xff/worker_interfaces/"), + LiteralStringRef("\xff\xff/worker_interfaces0")), + CLIENT_KNOBS->TOO_MANY); + RangeResult kvs = wait(makeInterruptable(safeThreadFutureToFuture(wInterfF))); ASSERT(!kvs.more); auto connectLock = makeReference(CLIENT_KNOBS->CLI_CONNECT_PARALLELISM); std::vector> addInterfs; diff --git a/fdbcli/fdbcli.actor.h b/fdbcli/fdbcli.actor.h index 227f47b7d6..ec443e4a19 100644 --- a/fdbcli/fdbcli.actor.h +++ b/fdbcli/fdbcli.actor.h @@ -121,10 +121,7 @@ extern const KeyRangeRef processClassTypeSpecialKeyRange; // Other special keys inline const KeyRef errorMsgSpecialKey = LiteralStringRef("\xff\xff/error_message"); // help functions (Copied from fdbcli.actor.cpp) -// decode worker interfaces -ACTOR Future addInterface(std::map>* address_interface, - Reference connectLock, - KeyValue kv); + // get all workers' info ACTOR Future getWorkers(Reference db, std::vector* workers); From a92ef37d44aa383e8e400a855ab13fa0bd36411c Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Thu, 12 May 2022 11:05:28 -0700 Subject: [PATCH 204/299] Log a backtrace before throwing serialization_failed --- fdbrpc/FlowTransport.actor.cpp | 1 + flow/serialize.cpp | 2 ++ 2 files changed, 3 insertions(+) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index f62b73be9b..3e8c0f0337 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -479,6 +479,7 @@ struct ConnectPacket { serializer(ar, connectPacketLength); 
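// [Editorial note -- not part of PATCH 204: connectPacketLength is the first
// field read off the wire. If it claims more payload than the fixed-size
// remainder of ConnectPacket can hold, the packet is corrupt or comes from an
// incompatible peer; the TraceEvent added below records a backtrace so these
// rejections can be diagnosed from trace logs.]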
 		if (connectPacketLength > sizeof(ConnectPacket) - sizeof(connectPacketLength)) {
 			ASSERT(!g_network->isSimulated());
+			TraceEvent("SerializationFailed").backtrace();
 			throw serialization_failed();
 		}

diff --git a/flow/serialize.cpp b/flow/serialize.cpp
index d51416a308..e4d78379a5 100644
--- a/flow/serialize.cpp
+++ b/flow/serialize.cpp
@@ -25,6 +25,7 @@
 _AssumeVersion::_AssumeVersion(ProtocolVersion version) : v(version) {
 	if (!version.isValid()) {
 		ASSERT(!g_network->isSimulated());
+		TraceEvent("SerializationFailed").backtrace();
 		throw serialization_failed();
 	}
 }
@@ -34,6 +35,7 @@ const void* BinaryReader::readBytes(int bytes) {
 	const char* e = b + bytes;
 	if (e > end) {
 		ASSERT(!g_network->isSimulated());
+		TraceEvent("SerializationFailed").backtrace();
 		throw serialization_failed();
 	}
 	begin = e;

From e4ac7ab1d9b2f30d0910ad828404031971721f7b Mon Sep 17 00:00:00 2001
From: Jingyu Zhou
Date: Thu, 12 May 2022 22:14:45 -0700
Subject: [PATCH 205/299] Update release notes 7.1.0 through 7.1.5

---
 .../release-notes/release-notes-710.rst | 46 +++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/documentation/sphinx/source/release-notes/release-notes-710.rst b/documentation/sphinx/source/release-notes/release-notes-710.rst
index 84a3405b70..0c2ef672af 100644
--- a/documentation/sphinx/source/release-notes/release-notes-710.rst
+++ b/documentation/sphinx/source/release-notes/release-notes-710.rst
@@ -4,6 +4,52 @@
 Release Notes
 #############

+7.1.5
+=====
+* Fixed an fdbcli kill bug that was not killing in parallel. `(PR #7150) <https://github.com/apple/foundationdb/pull/7150>`_
+* Fixed a bug that prevents a peer from sending messages on a previously incompatible connection. `(PR #7124) <https://github.com/apple/foundationdb/pull/7124>`_
+* Added rocksdb throttling counters to trace event. `(PR #7096) <https://github.com/apple/foundationdb/pull/7096>`_
+* Added a backtrace before throwing serialization_failed. `(PR #7155) <https://github.com/apple/foundationdb/pull/7155>`_
+
+7.1.4
+=====
+* Fixed a bug that prevents a client from connecting to a cluster. `(PR #7060) <https://github.com/apple/foundationdb/pull/7060>`_
+* Fixed a performance bug that overloads Resolver CPU. `(PR #7068) <https://github.com/apple/foundationdb/pull/7068>`_
+* Optimized storage server performance for "get range and flat map" feature. `(PR #7078) <https://github.com/apple/foundationdb/pull/7078>`_
+* Optimized both Proxy performance and Resolver (when version vector is enabled) performance. `(PR #7076) <https://github.com/apple/foundationdb/pull/7076>`_
+* Fixed a key size limit bug when using tenants. `(PR #6986) <https://github.com/apple/foundationdb/pull/6986>`_
+* Fixed operation_failed thrown incorrectly from transactions. `(PR #6993) <https://github.com/apple/foundationdb/pull/6993>`_
+* Fixed a version vector bug when GRV cache is used. `(PR #7057) <https://github.com/apple/foundationdb/pull/7057>`_
+* Fixed orphaned storage server due to force recovery. `(PR #7028) <https://github.com/apple/foundationdb/pull/7028>`_
+* Fixed a bug where a storage server reads a stale cluster ID. `(PR #7026) <https://github.com/apple/foundationdb/pull/7026>`_
+* Fixed a storage server exclusion status bug that affects wiggling. `(PR #6984) <https://github.com/apple/foundationdb/pull/6984>`_
+* Fixed a bug where relocate shard tasks move data to a removed team. `(PR #7023) <https://github.com/apple/foundationdb/pull/7023>`_
+* Fixed recruitment thrashing when there are temporarily multiple cluster controllers. `(PR #7001) <https://github.com/apple/foundationdb/pull/7001>`_
+* Fixed change feed deletion due to multiple sources race. `(PR #6987) <https://github.com/apple/foundationdb/pull/6987>`_
+* Fixed TLog crash if more TLogs are absent than the replication factor. `(PR #6991) <https://github.com/apple/foundationdb/pull/6991>`_
+* Added hostname DNS resolution logic for cluster connection string. `(PR #6998) <https://github.com/apple/foundationdb/pull/6998>`_
+* Fixed a limit bug in indexPrefetch. `(PR #7005) <https://github.com/apple/foundationdb/pull/7005>`_
+
+7.1.3
+=====
+* Added logging measuring commit compute duration. `(PR #6906) <https://github.com/apple/foundationdb/pull/6906>`_
+* RocksDb used aggregated property metrics for pending compaction bytes. `(PR #6867) <https://github.com/apple/foundationdb/pull/6867>`_
+* Fixed a perpetual wiggle bug that would not react to a pause. `(PR #6933) <https://github.com/apple/foundationdb/pull/6933>`_
+* Fixed a crash of data distributor. `(PR #6938) <https://github.com/apple/foundationdb/pull/6938>`_
+* Added new c libs to client package. `(PR #6921) <https://github.com/apple/foundationdb/pull/6921>`_
+* Fixed a bug that prevents a cluster from reaching a fully recovered state after taking a snapshot. `(PR #6892) <https://github.com/apple/foundationdb/pull/6892>`_
+
+7.1.2
+=====
+* Fixed failing upgrades due to non-persisted initial cluster version. `(PR #6864) <https://github.com/apple/foundationdb/pull/6864>`_
+* Fixed a client load balancing bug because ClientDBInfo may be unintentionally not set. `(PR #6878) <https://github.com/apple/foundationdb/pull/6878>`_
+* Fixed stuck LogRouter due to races of multiple PeekStream requests. `(PR #6870) <https://github.com/apple/foundationdb/pull/6870>`_
+* Fixed a client-side infinite loop due to provisional GRV Proxy ID not set in GetReadVersionReply. `(PR #6849) <https://github.com/apple/foundationdb/pull/6849>`_
+
+7.1.1
+=====
+* Added new c libs to client package. `(PR #6828) <https://github.com/apple/foundationdb/pull/6828>`_
+
 7.1.0
 =====

From 6bb83adbdc414f98b2bbb730787642df3da29eb3 Mon Sep 17 00:00:00 2001
From: Vaidas Gasiunas
Date: Fri, 13 May 2022 11:08:23 +0200
Subject: [PATCH 206/299] Upgrade Tests: Restore progress checks after every
 step

---
 tests/TestRunner/upgrade_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/TestRunner/upgrade_test.py b/tests/TestRunner/upgrade_test.py
index d849183716..87cde2f725 100755
--- a/tests/TestRunner/upgrade_test.py
+++ b/tests/TestRunner/upgrade_test.py
@@ -468,8 +468,8 @@ class UpgradeTest:
                 else:
                     assert entry in self.used_versions, "Unexpected entry in the upgrade path: {}".format(entry)
                     self.upgrade_to(entry)
-                    self.health_check()
-                    self.progress_check()
+                self.health_check()
+                self.progress_check()
             os.write(self.ctrl_pipe, b"STOP\n")
         finally:
             os.close(self.ctrl_pipe)

From 7c9a213127996fa19847a3592d876be669caa416 Mon Sep 17 00:00:00 2001
From: Vaidas Gasiunas
Date: Fri, 13 May 2022 11:47:57 +0200
Subject: [PATCH 207/299] Upgrade Tests: removing failing downgrade test to 7.1

---
 bindings/c/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt
index cdb4dc096b..a22d372384 100644
--- a/bindings/c/CMakeLists.txt
+++ b/bindings/c/CMakeLists.txt
@@ -338,7 +338,7 @@ endif()
       COMMAND ${CMAKE_SOURCE_DIR}/tests/TestRunner/upgrade_test.py
               --build-dir ${CMAKE_BINARY_DIR}
               --test-file ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests/upgrade/MixedApiWorkloadMultiThr.toml
-              --upgrade-path "7.1.3" "7.2.0" "7.1.3"
+              --upgrade-path "7.1.3" "7.2.0"
               --process-number 3
   )

From 853e6a346bf20054b1d0212c9967a589c2147265 Mon Sep 17 00:00:00 2001
From: Hao Fu <77984096+hfu94@users.noreply.github.com>
Date: Fri, 13 May 2022 10:10:43 -0700
Subject: [PATCH 208/299] Optimization: support removing index conditionally
 (#7116)

---
 bindings/c/fdb_c.cpp | 2 +
 bindings/c/foundationdb/fdb_c.h | 1 +
 bindings/c/test/unit/fdb_api.cpp | 2 +
 bindings/c/test/unit/fdb_api.hpp | 1 +
 bindings/c/test/unit/unit_tests.cpp | 27 +++++++++--
 bindings/java/fdbJNI.cpp | 2 +
 .../MappedRangeQueryIntegrationTest.java | 12 ++---
 .../apple/foundationdb/FDBTransaction.java | 28 ++++++-----
 .../apple/foundationdb/ReadTransaction.java | 2 +-
 fdbclient/FDBTypes.h | 3 ++
 fdbclient/IClientApi.h | 1 +
 fdbclient/ISingleThreadTransaction.h | 1 +
 fdbclient/MultiVersionTransaction.actor.cpp | 5 +-
 fdbclient/MultiVersionTransaction.h | 3 ++
 fdbclient/NativeAPI.actor.cpp | 30 ++++++++++--
 fdbclient/NativeAPI.actor.h | 2 +
 fdbclient/PaxosConfigTransaction.h | 1 +
 fdbclient/ReadYourWrites.actor.cpp | 20 +++++---
 fdbclient/ReadYourWrites.h | 1 +
 fdbclient/SimpleConfigTransaction.h | 1 +
 fdbclient/StorageServerInterface.h | 4 +-
 fdbclient/ThreadSafeTransaction.cpp | 5 +-
 fdbclient/ThreadSafeTransaction.h | 1 +
 fdbserver/storageserver.actor.cpp | 18 +++++---
 fdbserver/workloads/ApiWorkload.h | 8 +++-
fdbserver/workloads/GetMappedRange.actor.cpp | 46 ++++++++++++++----- 26 files changed, 168 insertions(+), 59 deletions(-) diff --git a/bindings/c/fdb_c.cpp b/bindings/c/fdb_c.cpp index 5cbfdd6de9..e5f2a33ae3 100644 --- a/bindings/c/fdb_c.cpp +++ b/bindings/c/fdb_c.cpp @@ -655,6 +655,7 @@ extern "C" DLLEXPORT FDBFuture* fdb_transaction_get_mapped_range(FDBTransaction* int target_bytes, FDBStreamingMode mode, int iteration, + int matchIndex, fdb_bool_t snapshot, fdb_bool_t reverse) { FDBFuture* r = validate_and_update_parameters(limit, target_bytes, mode, iteration, reverse); @@ -667,6 +668,7 @@ extern "C" DLLEXPORT FDBFuture* fdb_transaction_get_mapped_range(FDBTransaction* KeySelectorRef(KeyRef(end_key_name, end_key_name_length), end_or_equal, end_offset), StringRef(mapper_name, mapper_name_length), GetRangeLimits(limit, target_bytes), + matchIndex, snapshot, reverse) .extractPtr()); diff --git a/bindings/c/foundationdb/fdb_c.h b/bindings/c/foundationdb/fdb_c.h index 2773be8e98..584c56c854 100644 --- a/bindings/c/foundationdb/fdb_c.h +++ b/bindings/c/foundationdb/fdb_c.h @@ -384,6 +384,7 @@ DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_mapped_range(FDBTran int target_bytes, FDBStreamingMode mode, int iteration, + int matchIndex, fdb_bool_t snapshot, fdb_bool_t reverse); diff --git a/bindings/c/test/unit/fdb_api.cpp b/bindings/c/test/unit/fdb_api.cpp index 4684dd66dc..b71b8b8003 100644 --- a/bindings/c/test/unit/fdb_api.cpp +++ b/bindings/c/test/unit/fdb_api.cpp @@ -271,6 +271,7 @@ MappedKeyValueArrayFuture Transaction::get_mapped_range(const uint8_t* begin_key int target_bytes, FDBStreamingMode mode, int iteration, + int matchIndex, fdb_bool_t snapshot, fdb_bool_t reverse) { return MappedKeyValueArrayFuture(fdb_transaction_get_mapped_range(tr_, @@ -288,6 +289,7 @@ MappedKeyValueArrayFuture Transaction::get_mapped_range(const uint8_t* begin_key target_bytes, mode, iteration, + matchIndex, snapshot, reverse)); } diff --git a/bindings/c/test/unit/fdb_api.hpp b/bindings/c/test/unit/fdb_api.hpp index 633965e1b5..91365fc0a2 100644 --- a/bindings/c/test/unit/fdb_api.hpp +++ b/bindings/c/test/unit/fdb_api.hpp @@ -304,6 +304,7 @@ public: int target_bytes, FDBStreamingMode mode, int iteration, + int matchIndex, fdb_bool_t snapshot, fdb_bool_t reverse); diff --git a/bindings/c/test/unit/unit_tests.cpp b/bindings/c/test/unit/unit_tests.cpp index 5ec1c6cec2..d902bfac8f 100644 --- a/bindings/c/test/unit/unit_tests.cpp +++ b/bindings/c/test/unit/unit_tests.cpp @@ -261,6 +261,7 @@ GetMappedRangeResult get_mapped_range(fdb::Transaction& tr, int target_bytes, FDBStreamingMode mode, int iteration, + int matchIndex, fdb_bool_t snapshot, fdb_bool_t reverse) { fdb::MappedKeyValueArrayFuture f1 = tr.get_mapped_range(begin_key_name, @@ -277,6 +278,7 @@ GetMappedRangeResult get_mapped_range(fdb::Transaction& tr, target_bytes, mode, iteration, + matchIndex, snapshot, reverse); @@ -951,7 +953,11 @@ std::map fillInRecords(int n) { return data; } -GetMappedRangeResult getMappedIndexEntries(int beginId, int endId, fdb::Transaction& tr, std::string mapper) { +GetMappedRangeResult getMappedIndexEntries(int beginId, + int endId, + fdb::Transaction& tr, + std::string mapper, + int matchIndex = MATCH_INDEX_ALL) { std::string indexEntryKeyBegin = indexEntryKey(beginId); std::string indexEntryKeyEnd = indexEntryKey(endId); @@ -965,13 +971,17 @@ GetMappedRangeResult getMappedIndexEntries(int beginId, int endId, fdb::Transact /* target_bytes */ 0, /* FDBStreamingMode */ FDB_STREAMING_MODE_WANT_ALL, /* iteration 
*/ 0, + /* matchIndex */ matchIndex, /* snapshot */ false, /* reverse */ 0); } -GetMappedRangeResult getMappedIndexEntries(int beginId, int endId, fdb::Transaction& tr) { +GetMappedRangeResult getMappedIndexEntries(int beginId, + int endId, + fdb::Transaction& tr, + int matchIndex = MATCH_INDEX_ALL) { std::string mapper = Tuple().append(prefix).append(RECORD).append("{K[3]}"_sr).append("{...}"_sr).pack().toString(); - return getMappedIndexEntries(beginId, endId, tr, mapper); + return getMappedIndexEntries(beginId, endId, tr, mapper, matchIndex); } TEST_CASE("fdb_transaction_get_mapped_range") { @@ -983,7 +993,8 @@ TEST_CASE("fdb_transaction_get_mapped_range") { while (1) { int beginId = 1; int endId = 19; - auto result = getMappedIndexEntries(beginId, endId, tr); + const int matchIndex = deterministicRandom()->random01() > 0.5 ? MATCH_INDEX_NONE : MATCH_INDEX_ALL; + auto result = getMappedIndexEntries(beginId, endId, tr, matchIndex); if (result.err) { fdb::EmptyFuture f1 = tr.on_error(result.err); @@ -998,7 +1009,11 @@ TEST_CASE("fdb_transaction_get_mapped_range") { int id = beginId; for (int i = 0; i < expectSize; i++, id++) { const auto& [key, value, begin, end, range_results] = result.mkvs[i]; - CHECK(indexEntryKey(id).compare(key) == 0); + if (matchIndex == MATCH_INDEX_ALL || i == 0 || i == expectSize - 1) { + CHECK(indexEntryKey(id).compare(key) == 0); + } else { + CHECK(EMPTY.compare(key) == 0); + } CHECK(EMPTY.compare(value) == 0); CHECK(range_results.size() == SPLIT_SIZE); for (int split = 0; split < SPLIT_SIZE; split++) { @@ -1024,6 +1039,7 @@ TEST_CASE("fdb_transaction_get_mapped_range_restricted_to_serializable") { /* target_bytes */ 0, /* FDBStreamingMode */ FDB_STREAMING_MODE_WANT_ALL, /* iteration */ 0, + /* matchIndex */ MATCH_INDEX_ALL, /* snapshot */ true, // Set snapshot to true /* reverse */ 0); ASSERT(result.err == error_code_unsupported_operation); @@ -1043,6 +1059,7 @@ TEST_CASE("fdb_transaction_get_mapped_range_restricted_to_ryw_enable") { /* target_bytes */ 0, /* FDBStreamingMode */ FDB_STREAMING_MODE_WANT_ALL, /* iteration */ 0, + /* matchIndex */ MATCH_INDEX_ALL, /* snapshot */ false, /* reverse */ 0); ASSERT(result.err == error_code_unsupported_operation); diff --git a/bindings/java/fdbJNI.cpp b/bindings/java/fdbJNI.cpp index 5692f020dc..0bead76952 100644 --- a/bindings/java/fdbJNI.cpp +++ b/bindings/java/fdbJNI.cpp @@ -960,6 +960,7 @@ JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBTransaction_Transaction_1 jint targetBytes, jint streamingMode, jint iteration, + jint matchIndex, jboolean snapshot, jboolean reverse) { if (!tPtr || !keyBeginBytes || !keyEndBytes || !mapperBytes) { @@ -1007,6 +1008,7 @@ JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBTransaction_Transaction_1 targetBytes, (FDBStreamingMode)streamingMode, iteration, + matchIndex, snapshot, reverse); jenv->ReleaseByteArrayElements(keyBeginBytes, (jbyte*)barrBegin, JNI_ABORT); diff --git a/bindings/java/src/integration/com/apple/foundationdb/MappedRangeQueryIntegrationTest.java b/bindings/java/src/integration/com/apple/foundationdb/MappedRangeQueryIntegrationTest.java index 3230815c0f..b323566f13 100644 --- a/bindings/java/src/integration/com/apple/foundationdb/MappedRangeQueryIntegrationTest.java +++ b/bindings/java/src/integration/com/apple/foundationdb/MappedRangeQueryIntegrationTest.java @@ -192,12 +192,12 @@ class MappedRangeQueryIntegrationTest { RangeQueryWithIndex mappedRangeQuery = (int begin, int end, Database db) -> db.run(tr -> { try { - List kvs = - 
tr.getMappedRange(KeySelector.firstGreaterOrEqual(indexEntryKey(begin)), - KeySelector.firstGreaterOrEqual(indexEntryKey(end)), MAPPER, - ReadTransaction.ROW_LIMIT_UNLIMITED, false, StreamingMode.WANT_ALL) - .asList() - .get(); + List kvs = tr.getMappedRange(KeySelector.firstGreaterOrEqual(indexEntryKey(begin)), + KeySelector.firstGreaterOrEqual(indexEntryKey(end)), MAPPER, + ReadTransaction.ROW_LIMIT_UNLIMITED, + FDBTransaction.MATCH_INDEX_ALL, false, StreamingMode.WANT_ALL) + .asList() + .get(); Assertions.assertEquals(end - begin, kvs.size()); if (validate) { diff --git a/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java b/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java index 65a1e9f254..7451959f22 100644 --- a/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java +++ b/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java @@ -32,6 +32,10 @@ import com.apple.foundationdb.async.AsyncUtil; import com.apple.foundationdb.tuple.ByteArrayUtil; class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionConsumer { + + static public final int MATCH_INDEX_ALL = 0; + static public final int MATCH_INDEX_NONE = 1; + private final Database database; private final Executor executor; private final TransactionOptions options; @@ -93,7 +97,8 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC @Override public AsyncIterable getMappedRange(KeySelector begin, KeySelector end, byte[] mapper, - int limit, boolean reverse, StreamingMode mode) { + int limit, int matchIndex, boolean reverse, + StreamingMode mode) { throw new UnsupportedOperationException("getMappedRange is only supported in serializable"); } @@ -346,8 +351,8 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC } @Override - public AsyncIterable getMappedRange(KeySelector begin, KeySelector end, byte[] mapper, - int limit, boolean reverse, StreamingMode mode) { + public AsyncIterable getMappedRange(KeySelector begin, KeySelector end, byte[] mapper, int limit, + int matchIndex, boolean reverse, StreamingMode mode) { if (mapper == null) { throw new IllegalArgumentException("Mapper must be non-null"); } @@ -467,9 +472,9 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC begin.toString(), end.toString(), rowLimit, targetBytes, streamingMode, iteration, Boolean.toString(isSnapshot), Boolean.toString(reverse)));*/ return new FutureMappedResults( - Transaction_getMappedRange(getPtr(), begin.getKey(), begin.orEqual(), begin.getOffset(), - end.getKey(), end.orEqual(), end.getOffset(), mapper, rowLimit, - targetBytes, streamingMode, iteration, isSnapshot, reverse), + Transaction_getMappedRange(getPtr(), begin.getKey(), begin.orEqual(), begin.getOffset(), end.getKey(), + end.orEqual(), end.getOffset(), mapper, rowLimit, targetBytes, streamingMode, + iteration, MATCH_INDEX_ALL, isSnapshot, reverse), FDB.instance().isDirectBufferQueriesEnabled(), executor, eventKeeper); } finally { pointerReadLock.unlock(); @@ -809,12 +814,11 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC byte[] keyEnd, boolean orEqualEnd, int offsetEnd, int rowLimit, int targetBytes, int streamingMode, int iteration, boolean isSnapshot, boolean reverse); - private native long Transaction_getMappedRange(long cPtr, byte[] keyBegin, boolean orEqualBegin, - int offsetBegin, byte[] keyEnd, boolean orEqualEnd, - int offsetEnd, - byte[] mapper, // Nonnull - int rowLimit, int targetBytes, int 
streamingMode, int iteration, - boolean isSnapshot, boolean reverse); + private native long Transaction_getMappedRange(long cPtr, byte[] keyBegin, boolean orEqualBegin, int offsetBegin, + byte[] keyEnd, boolean orEqualEnd, int offsetEnd, + byte[] mapper, // Nonnull + int rowLimit, int targetBytes, int streamingMode, int iteration, + int matchIndex, boolean isSnapshot, boolean reverse); private native void Transaction_addConflictRange(long cPtr, byte[] keyBegin, byte[] keyEnd, int conflictRangeType); private native void Transaction_set(long cPtr, byte[] key, byte[] value); diff --git a/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java b/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java index 66ad7a9e80..11ed7e900c 100644 --- a/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java +++ b/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java @@ -460,7 +460,7 @@ public interface ReadTransaction extends ReadTransactionContext { * @return a handle to access the results of the asynchronous call */ AsyncIterable getMappedRange(KeySelector begin, KeySelector end, byte[] mapper, int limit, - boolean reverse, StreamingMode mode); + int matchIndex, boolean reverse, StreamingMode mode); /** * Gets an estimate for the number of bytes stored in the given range. diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index 5d95ce36fb..86696e8987 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -148,6 +148,9 @@ static const Tag invalidTag{ tagLocalitySpecial, 0 }; static const Tag txsTag{ tagLocalitySpecial, 1 }; static const Tag cacheTag{ tagLocalitySpecial, 2 }; +const int MATCH_INDEX_ALL = 0; +const int MATCH_INDEX_NONE = 1; + enum { txsTagOld = -1, invalidTagOld = -100 }; struct TagsAndMessage { diff --git a/fdbclient/IClientApi.h b/fdbclient/IClientApi.h index e1861432a1..56049b6718 100644 --- a/fdbclient/IClientApi.h +++ b/fdbclient/IClientApi.h @@ -68,6 +68,7 @@ public: const KeySelectorRef& end, const StringRef& mapper, GetRangeLimits limits, + int matchIndex = MATCH_INDEX_ALL, bool snapshot = false, bool reverse = false) = 0; virtual ThreadFuture>> getAddressesForKey(const KeyRef& key) = 0; diff --git a/fdbclient/ISingleThreadTransaction.h b/fdbclient/ISingleThreadTransaction.h index 19beb4e5df..b44f58b464 100644 --- a/fdbclient/ISingleThreadTransaction.h +++ b/fdbclient/ISingleThreadTransaction.h @@ -74,6 +74,7 @@ public: KeySelector end, Key mapper, GetRangeLimits limits, + int matchIndex = MATCH_INDEX_ALL, Snapshot = Snapshot::False, Reverse = Reverse::False) = 0; virtual Future>> getAddressesForKey(Key const& key) = 0; diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index f90303d5da..e7030694b2 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -158,6 +158,7 @@ ThreadFuture DLTransaction::getMappedRange(const KeySelectorR const KeySelectorRef& end, const StringRef& mapper, GetRangeLimits limits, + int matchIndex, bool snapshot, bool reverse) { FdbCApi::FDBFuture* f = api->transactionGetMappedRange(tr, @@ -175,6 +176,7 @@ ThreadFuture DLTransaction::getMappedRange(const KeySelectorR limits.bytes, FDB_STREAMING_MODE_EXACT, 0, + matchIndex, snapshot, reverse); return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { @@ -971,10 +973,11 @@ ThreadFuture MultiVersionTransaction::getMappedRange(const Ke const KeySelectorRef& end, const StringRef& mapper, GetRangeLimits limits, + int matchIndex, bool 
snapshot, bool reverse) { auto tr = getTransaction(); - auto f = tr.transaction ? tr.transaction->getMappedRange(begin, end, mapper, limits, snapshot, reverse) + auto f = tr.transaction ? tr.transaction->getMappedRange(begin, end, mapper, limits, matchIndex, snapshot, reverse) : makeTimeout(); return abortableFuture(f, tr.onChange); } diff --git a/fdbclient/MultiVersionTransaction.h b/fdbclient/MultiVersionTransaction.h index 77bc429dc4..5468d0ab0a 100644 --- a/fdbclient/MultiVersionTransaction.h +++ b/fdbclient/MultiVersionTransaction.h @@ -218,6 +218,7 @@ struct FdbCApi : public ThreadSafeReferenceCounted { int targetBytes, FDBStreamingMode mode, int iteration, + int matchIndex, fdb_bool_t snapshot, fdb_bool_t reverse); FDBFuture* (*transactionGetVersionstamp)(FDBTransaction* tr); @@ -349,6 +350,7 @@ public: const KeySelectorRef& end, const StringRef& mapper, GetRangeLimits limits, + int matchIndex, bool snapshot, bool reverse) override; ThreadFuture>> getAddressesForKey(const KeyRef& key) override; @@ -537,6 +539,7 @@ public: const KeySelectorRef& end, const StringRef& mapper, GetRangeLimits limits, + int matchIndex, bool snapshot, bool reverse) override; ThreadFuture>> getAddressesForKey(const KeyRef& key) override; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 74d2971a60..502235b85c 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -3791,12 +3791,24 @@ PublicRequestStream StorageServerInterface::*getRange } } +template +void setMatchIndex(GetKeyValuesFamilyRequest& req, int matchIndex) { + if constexpr (std::is_same::value) { + // do nothing; + } else if (std::is_same::value) { + req.matchIndex = matchIndex; + } else { + UNREACHABLE(); + } +} + ACTOR template Future getExactRange(Reference trState, Version version, KeyRange keys, Key mapper, GetRangeLimits limits, + int matchIndex, Reverse reverse, UseTenant useTenant) { state RangeResultFamily output; @@ -3830,6 +3842,7 @@ Future getExactRange(Reference trState, req.version = version; req.begin = firstGreaterOrEqual(range.begin); req.end = firstGreaterOrEqual(range.end); + setMatchIndex(req, matchIndex); req.spanContext = span.context; trState->cx->getLatestCommitVersions( locations[shard].locations, req.version, trState, req.ssLatestCommitVersions); @@ -4004,6 +4017,7 @@ Future getRangeFallback(Reference trState, KeySelector end, Key mapper, GetRangeLimits limits, + int matchIndex, Reverse reverse, UseTenant useTenant) { if (version == latestVersion) { @@ -4029,7 +4043,7 @@ Future getRangeFallback(Reference trState, // or allKeys.begin exists in the database/tenant and will be part of the conflict range anyways RangeResultFamily _r = wait(getExactRange( - trState, version, KeyRangeRef(b, e), mapper, limits, reverse, useTenant)); + trState, version, KeyRangeRef(b, e), mapper, limits, matchIndex, reverse, useTenant)); RangeResultFamily r = _r; if (b == allKeys.begin && ((reverse && !r.more) || !reverse)) @@ -4153,6 +4167,7 @@ Future getRange(Reference trState, Key mapper, GetRangeLimits limits, Promise> conflictRange, + int matchIndex, Snapshot snapshot, Reverse reverse, UseTenant useTenant = UseTenant::True) { @@ -4205,7 +4220,7 @@ Future getRange(Reference trState, state GetKeyValuesFamilyRequest req; req.mapper = mapper; req.arena.dependsOn(mapper.arena()); - + setMatchIndex(req, matchIndex); req.tenantInfo = useTenant ? 
trState->getTenantInfo() : TenantInfo(); req.isFetchKeys = (trState->taskID == TaskPriority::FetchKeys); req.version = readVersion; @@ -4385,6 +4400,7 @@ Future getRange(Reference trState, originalEnd, mapper, originalLimits, + matchIndex, reverse, useTenant)); getRangeFinished( @@ -4425,6 +4441,7 @@ Future getRange(Reference trState, originalEnd, mapper, originalLimits, + matchIndex, reverse, useTenant)); getRangeFinished( @@ -5010,6 +5027,7 @@ Future getRange(Reference const& trState, ""_sr, limits, Promise>(), + MATCH_INDEX_ALL, Snapshot::True, reverse, useTenant); @@ -5364,6 +5382,7 @@ Future Transaction::getRangeInternal(const KeySelector& begin const KeySelector& end, const Key& mapper, GetRangeLimits limits, + int matchIndex, Snapshot snapshot, Reverse reverse) { ++trState->cx->transactionLogicalReads; @@ -5406,7 +5425,7 @@ Future Transaction::getRangeInternal(const KeySelector& begin } return ::getRange( - trState, getReadVersion(), b, e, mapper, limits, conflictRange, snapshot, reverse); + trState, getReadVersion(), b, e, mapper, limits, conflictRange, matchIndex, snapshot, reverse); } Future Transaction::getRange(const KeySelector& begin, @@ -5415,17 +5434,18 @@ Future Transaction::getRange(const KeySelector& begin, Snapshot snapshot, Reverse reverse) { return getRangeInternal( - begin, end, ""_sr, limits, snapshot, reverse); + begin, end, ""_sr, limits, MATCH_INDEX_ALL, snapshot, reverse); } Future Transaction::getMappedRange(const KeySelector& begin, const KeySelector& end, const Key& mapper, GetRangeLimits limits, + int matchIndex, Snapshot snapshot, Reverse reverse) { return getRangeInternal( - begin, end, mapper, limits, snapshot, reverse); + begin, end, mapper, limits, matchIndex, snapshot, reverse); } Future Transaction::getRange(const KeySelector& begin, diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index fe4d578e77..541bef367f 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -329,6 +329,7 @@ public: const KeySelector& end, const Key& mapper, GetRangeLimits limits, + int matchIndex = MATCH_INDEX_ALL, Snapshot = Snapshot::False, Reverse = Reverse::False); @@ -338,6 +339,7 @@ private: const KeySelector& end, const Key& mapper, GetRangeLimits limits, + int matchIndex, Snapshot snapshot, Reverse reverse); diff --git a/fdbclient/PaxosConfigTransaction.h b/fdbclient/PaxosConfigTransaction.h index 67487fff32..05c113e6d9 100644 --- a/fdbclient/PaxosConfigTransaction.h +++ b/fdbclient/PaxosConfigTransaction.h @@ -54,6 +54,7 @@ public: KeySelector end, Key mapper, GetRangeLimits limits, + int matchIndex = MATCH_INDEX_ALL, Snapshot = Snapshot::False, Reverse = Reverse::False) override { throw client_invalid_operation(); diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index 6c3fe880c1..cd7d59c041 100644 --- a/fdbclient/ReadYourWrites.actor.cpp +++ b/fdbclient/ReadYourWrites.actor.cpp @@ -77,11 +77,12 @@ public: template struct GetMappedRangeReq { - GetMappedRangeReq(KeySelector begin, KeySelector end, Key mapper, GetRangeLimits limits) - : begin(begin), end(end), mapper(mapper), limits(limits) {} + GetMappedRangeReq(KeySelector begin, KeySelector end, Key mapper, int matchIndex, GetRangeLimits limits) + : begin(begin), end(end), mapper(mapper), limits(limits), matchIndex(matchIndex) {} KeySelector begin, end; Key mapper; GetRangeLimits limits; + int matchIndex; using Result = MappedRangeResult; }; @@ -1140,9 +1141,13 @@ public: else read.end = KeySelector(firstGreaterOrEqual(key), key.arena()); 
} - - MappedRangeResult v = wait(ryw->tr.getMappedRange( - read.begin, read.end, read.mapper, read.limits, snapshot, backwards ? Reverse::True : Reverse::False)); + MappedRangeResult v = wait(ryw->tr.getMappedRange(read.begin, + read.end, + read.mapper, + read.limits, + read.matchIndex, + snapshot, + backwards ? Reverse::True : Reverse::False)); return v; } @@ -1677,6 +1682,7 @@ Future ReadYourWritesTransaction::getMappedRange(KeySelector KeySelector end, Key mapper, GetRangeLimits limits, + int matchIndex, Snapshot snapshot, Reverse reverse) { if (getDatabase()->apiVersionAtLeast(630)) { @@ -1724,9 +1730,9 @@ Future ReadYourWritesTransaction::getMappedRange(KeySelector Future result = reverse ? RYWImpl::readWithConflictRangeForGetMappedRange( - this, RYWImpl::GetMappedRangeReq(begin, end, mapper, limits), snapshot) + this, RYWImpl::GetMappedRangeReq(begin, end, mapper, matchIndex, limits), snapshot) : RYWImpl::readWithConflictRangeForGetMappedRange( - this, RYWImpl::GetMappedRangeReq(begin, end, mapper, limits), snapshot); + this, RYWImpl::GetMappedRangeReq(begin, end, mapper, matchIndex, limits), snapshot); return result; } diff --git a/fdbclient/ReadYourWrites.h b/fdbclient/ReadYourWrites.h index e67b5334f7..6ddf892774 100644 --- a/fdbclient/ReadYourWrites.h +++ b/fdbclient/ReadYourWrites.h @@ -112,6 +112,7 @@ public: KeySelector end, Key mapper, GetRangeLimits limits, + int matchIndex, Snapshot = Snapshot::False, Reverse = Reverse::False) override; diff --git a/fdbclient/SimpleConfigTransaction.h b/fdbclient/SimpleConfigTransaction.h index 83d8411518..903764aa94 100644 --- a/fdbclient/SimpleConfigTransaction.h +++ b/fdbclient/SimpleConfigTransaction.h @@ -63,6 +63,7 @@ public: KeySelector end, Key mapper, GetRangeLimits limits, + int matchIndex, Snapshot = Snapshot::False, Reverse = Reverse::False) override { throw client_invalid_operation(); diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h index cda6a32b66..58fe73cbbd 100644 --- a/fdbclient/StorageServerInterface.h +++ b/fdbclient/StorageServerInterface.h @@ -426,6 +426,7 @@ struct GetMappedKeyValuesRequest : TimedRequest { KeyRef mapper; Version version; // or latestVersion int limit, limitBytes; + int matchIndex; bool isFetchKeys; Optional tags; Optional debugID; @@ -451,7 +452,8 @@ struct GetMappedKeyValuesRequest : TimedRequest { spanContext, tenantInfo, arena, - ssLatestCommitVersions); + ssLatestCommitVersions, + matchIndex); } }; diff --git a/fdbclient/ThreadSafeTransaction.cpp b/fdbclient/ThreadSafeTransaction.cpp index c796f02536..4c639d5423 100644 --- a/fdbclient/ThreadSafeTransaction.cpp +++ b/fdbclient/ThreadSafeTransaction.cpp @@ -306,6 +306,7 @@ ThreadFuture ThreadSafeTransaction::getMappedRange(const KeyS const KeySelectorRef& end, const StringRef& mapper, GetRangeLimits limits, + int matchIndex, bool snapshot, bool reverse) { KeySelector b = begin; @@ -313,9 +314,9 @@ ThreadFuture ThreadSafeTransaction::getMappedRange(const KeyS Key h = mapper; ISingleThreadTransaction* tr = this->tr; - return onMainThread([tr, b, e, h, limits, snapshot, reverse]() -> Future { + return onMainThread([tr, b, e, h, limits, matchIndex, snapshot, reverse]() -> Future { tr->checkDeferredError(); - return tr->getMappedRange(b, e, h, limits, Snapshot{ snapshot }, Reverse{ reverse }); + return tr->getMappedRange(b, e, h, limits, matchIndex, Snapshot{ snapshot }, Reverse{ reverse }); }); } diff --git a/fdbclient/ThreadSafeTransaction.h b/fdbclient/ThreadSafeTransaction.h index a187bb2f45..938ddd97c1 100644 --- 
a/fdbclient/ThreadSafeTransaction.h +++ b/fdbclient/ThreadSafeTransaction.h @@ -136,6 +136,7 @@ public: const KeySelectorRef& end, const StringRef& mapper, GetRangeLimits limits, + int matchIndex, bool snapshot, bool reverse) override; ThreadFuture>> getAddressesForKey(const KeyRef& key) override; diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index a4109e1c1a..cc8bf1b758 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -3723,7 +3723,8 @@ ACTOR Future mapKeyValues(StorageServer* data, StringRef mapper, // To provide span context, tags, debug ID to underlying lookups. GetMappedKeyValuesRequest* pOriginalReq, - Optional tenantPrefix) { + Optional tenantPrefix, + int matchIndex) { state GetMappedKeyValuesReply result; result.version = input.version; result.more = input.more; @@ -3741,15 +3742,20 @@ ACTOR Future mapKeyValues(StorageServer* data, TraceEvent("MapperNotTuple").error(e).detail("Mapper", mapper.printable()); throw mapper_not_tuple(); } - state KeyValueRef* it = input.data.begin(); state std::vector> vt; state bool isRangeQuery = false; preprocessMappedKey(mappedKeyFormatTuple, vt, isRangeQuery); - for (; it != input.data.end(); it++) { + state int sz = input.data.size(); + state int i = 0; + for (; i < sz; i++) { + KeyValueRef* it = &input.data[i]; state MappedKeyValueRef kvm; - kvm.key = it->key; - kvm.value = it->value; + // need to keep the boundary, so that caller can use it as a continuation. + if ((i == 0 || i == sz - 1) || matchIndex == MATCH_INDEX_ALL) { + kvm.key = it->key; + kvm.value = it->value; + } state Key mappedKey = constructMappedKey(it, vt, mappedKeyTuple, mappedKeyFormatTuple); // Make sure the mappedKey is always available, so that it's good even we want to get key asynchronously. @@ -4026,7 +4032,7 @@ ACTOR Future getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe try { // Map the scanned range to another list of keys and look up. 
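// [Editorial note -- not part of PATCH 208: req.matchIndex is the option this
// patch threads through to mapKeyValues(). With MATCH_INDEX_ALL every mapped
// row keeps its index entry key; with MATCH_INDEX_NONE interior rows get an
// empty key and only the first and last rows keep theirs, because the client
// still needs those boundary keys to build a continuation.]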
GetMappedKeyValuesReply _r = - wait(mapKeyValues(data, getKeyValuesReply, req.mapper, &req, tenantPrefix)); + wait(mapKeyValues(data, getKeyValuesReply, req.mapper, &req, tenantPrefix, req.matchIndex)); r = _r; } catch (Error& e) { TraceEvent("MapError").error(e); diff --git a/fdbserver/workloads/ApiWorkload.h b/fdbserver/workloads/ApiWorkload.h index 8f46f7b148..f61f1b5c5d 100644 --- a/fdbserver/workloads/ApiWorkload.h +++ b/fdbserver/workloads/ApiWorkload.h @@ -56,6 +56,7 @@ struct TransactionWrapper : public ReferenceCounted { KeySelector& end, Key& mapper, GetRangeLimits limits, + int matchIndex, Snapshot snapshot, Reverse reverse) = 0; @@ -128,9 +129,10 @@ struct FlowTransactionWrapper : public TransactionWrapper { KeySelector& end, Key& mapper, GetRangeLimits limits, + int matchIndex, Snapshot snapshot, Reverse reverse) override { - return transaction.getMappedRange(begin, end, mapper, limits, snapshot, reverse); + return transaction.getMappedRange(begin, end, mapper, limits, matchIndex, snapshot, reverse); } // Gets the key from the database specified by a given key selector @@ -203,9 +205,11 @@ struct ThreadTransactionWrapper : public TransactionWrapper { KeySelector& end, Key& mapper, GetRangeLimits limits, + int matchIndex, Snapshot snapshot, Reverse reverse) override { - return unsafeThreadFutureToFuture(transaction->getMappedRange(begin, end, mapper, limits, snapshot, reverse)); + return unsafeThreadFutureToFuture( + transaction->getMappedRange(begin, end, mapper, limits, matchIndex, snapshot, reverse)); } // Gets the key from the database specified by a given key selector diff --git a/fdbserver/workloads/GetMappedRange.actor.cpp b/fdbserver/workloads/GetMappedRange.actor.cpp index cb454b6ef6..cb977d6815 100644 --- a/fdbserver/workloads/GetMappedRange.actor.cpp +++ b/fdbserver/workloads/GetMappedRange.actor.cpp @@ -145,10 +145,18 @@ struct GetMappedRangeWorkload : ApiWorkload { } // Return true if need to retry. 
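// [Editorial note -- not part of PATCH 208: a caller-side sketch of the rule
// that validateRecord() below checks; expectIndexKey is a hypothetical helper
// name, not part of the workload:
//
//     bool expectIndexKey(int matchIndex, int i, int resultSize) {
//         // Boundary rows always carry the index key, even under
//         // MATCH_INDEX_NONE; interior rows carry it only under MATCH_INDEX_ALL.
//         return matchIndex == MATCH_INDEX_ALL || i == 0 || i == resultSize - 1;
//     }
// ]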
- static bool validateRecord(int expectedId, const MappedKeyValueRef* it, GetMappedRangeWorkload* self) { + static bool validateRecord(int expectedId, + const MappedKeyValueRef* it, + GetMappedRangeWorkload* self, + int matchIndex, + bool isBoundary) { // std::cout << "validateRecord expectedId " << expectedId << " it->key " << printable(it->key) << " // indexEntryKey(expectedId) " << printable(indexEntryKey(expectedId)) << std::endl; - ASSERT(it->key == indexEntryKey(expectedId)); + if (matchIndex == MATCH_INDEX_ALL || isBoundary) { + ASSERT(it->key == indexEntryKey(expectedId)); + } else { + ASSERT(it->key == EMPTY); + } ASSERT(it->value == EMPTY); if (self->SPLIT_RECORDS) { @@ -189,7 +197,8 @@ struct GetMappedRangeWorkload : ApiWorkload { Key mapper, int limit, int expectedBeginId, - GetMappedRangeWorkload* self) { + GetMappedRangeWorkload* self, + int matchIndex) { std::cout << "start scanMappedRangeWithLimits beginSelector:" << beginSelector.toString() << " endSelector:" << endSelector.toString() << " expectedBeginId:" << expectedBeginId @@ -197,8 +206,13 @@ struct GetMappedRangeWorkload : ApiWorkload { loop { state Reference tr = self->createTransaction(); try { - MappedRangeResult result = wait(tr->getMappedRange( - beginSelector, endSelector, mapper, GetRangeLimits(limit), self->snapshot, Reverse::False)); + MappedRangeResult result = wait(tr->getMappedRange(beginSelector, + endSelector, + mapper, + GetRangeLimits(limit), + matchIndex, + self->snapshot, + Reverse::False)); // showResult(result); if (self->BAD_MAPPER) { TraceEvent("GetMappedRangeWorkloadShouldNotReachable").detail("ResultSize", result.size()); @@ -208,8 +222,10 @@ struct GetMappedRangeWorkload : ApiWorkload { ASSERT(result.size() <= limit); int expectedId = expectedBeginId; bool needRetry = false; - for (const MappedKeyValueRef* it = result.begin(); it != result.end(); it++) { - if (validateRecord(expectedId, it, self)) { + int cnt = 0; + const MappedKeyValueRef* it = result.begin(); + for (; cnt < result.size(); cnt++, it++) { + if (validateRecord(expectedId, it, self, matchIndex, cnt == 0 || cnt == result.size() - 1)) { needRetry = true; break; } @@ -236,7 +252,12 @@ struct GetMappedRangeWorkload : ApiWorkload { } } - ACTOR Future scanMappedRange(Database cx, int beginId, int endId, Key mapper, GetMappedRangeWorkload* self) { + ACTOR Future scanMappedRange(Database cx, + int beginId, + int endId, + Key mapper, + GetMappedRangeWorkload* self, + int matchIndex) { Key beginTuple = Tuple().append(prefix).append(INDEX).append(indexKey(beginId)).getDataAsStandalone(); state KeySelector beginSelector = KeySelector(firstGreaterOrEqual(beginTuple)); Key endTuple = Tuple().append(prefix).append(INDEX).append(indexKey(endId)).getDataAsStandalone(); @@ -244,14 +265,15 @@ struct GetMappedRangeWorkload : ApiWorkload { state int limit = 100; state int expectedBeginId = beginId; while (true) { - MappedRangeResult result = wait( - self->scanMappedRangeWithLimits(cx, beginSelector, endSelector, mapper, limit, expectedBeginId, self)); + MappedRangeResult result = wait(self->scanMappedRangeWithLimits( + cx, beginSelector, endSelector, mapper, limit, expectedBeginId, self, matchIndex)); expectedBeginId += result.size(); if (result.more) { if (result.empty()) { // This is usually not expected. 
std::cout << "not result but have more, try again" << std::endl; } else { + // auto& reqAndResult = std::get(result.back().reqAndResult); beginSelector = KeySelector(firstGreaterThan(result.back().key)); } } else { @@ -296,6 +318,7 @@ struct GetMappedRangeWorkload : ApiWorkload { endSelector, mapper, GetRangeLimits(GetRangeLimits::ROW_LIMIT_UNLIMITED), + MATCH_INDEX_ALL, self->snapshot, Reverse::False); } @@ -394,7 +417,8 @@ struct GetMappedRangeWorkload : ApiWorkload { Key mapper = getMapper(self); // The scanned range cannot be too large to hit get_mapped_key_values_has_more. We have a unit validating the // error is thrown when the range is large. - wait(self->scanMappedRange(cx, 10, 490, mapper, self)); + int matchIndex = deterministicRandom()->random01() > 0.5 ? MATCH_INDEX_NONE : MATCH_INDEX_ALL; + wait(self->scanMappedRange(cx, 10, 490, mapper, self, matchIndex)); return Void(); } From 78f819fb2ab530e2300800eb1235c7794bbbcc6d Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 13 May 2022 11:36:03 -0700 Subject: [PATCH 209/299] Update flow/serialize.h Co-authored-by: Trevor Clinkenbeard --- flow/serialize.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/serialize.h b/flow/serialize.h index d24d3bcaf3..4b331a73fc 100644 --- a/flow/serialize.h +++ b/flow/serialize.h @@ -635,7 +635,7 @@ public: } size_t size() const { return len; } - size_t remainedBytes() const { return end - begin; }; + size_t remainingBytes() const { return end - begin; }; protected: _Reader(const char* begin, const char* end) : begin(begin), end(end), len(end - begin) {} From 8014ac6baf7e7bc7cfda5947feddb662141844f0 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 13 May 2022 12:23:53 -0700 Subject: [PATCH 210/299] CMakeList.txt --- fdbserver/CMakeLists.txt | 1 + tests/CMakeLists.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index f47e003a67..03a51cb662 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -281,6 +281,7 @@ set(FDBSERVER_SRCS workloads/Sideband.actor.cpp workloads/SidebandSingle.actor.cpp workloads/SimpleAtomicAdd.actor.cpp + workloads/SkewedReadWrite.actor.cpp workloads/SlowTaskWorkload.actor.cpp workloads/SnapTest.actor.cpp workloads/SpecialKeySpaceCorrectness.actor.cpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1489ecb97e..b409f6ae35 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -212,6 +212,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES rare/LargeApiCorrectnessStatus.toml) add_fdb_test(TEST_FILES rare/RYWDisable.toml) add_fdb_test(TEST_FILES rare/RandomReadWriteTest.toml) + add_fdb_test(TEST_FILES rare/ReadSkewReadWrite.toml) add_fdb_test(TEST_FILES rare/SpecificUnitTests.toml) add_fdb_test(TEST_FILES rare/SwizzledLargeApiCorrectness.toml) add_fdb_test(TEST_FILES rare/RedwoodCorrectnessBTree.toml) From b0c26e93b2ce7763b816a0d9f987a6db8a9c18fd Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 13 May 2022 12:55:19 -0700 Subject: [PATCH 211/299] remove size() method --- fdbclient/SystemData.cpp | 4 ++-- flow/serialize.h | 7 ++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 73baffd127..2768a3e4c1 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -335,8 +335,8 @@ std::pair serverKeysDecodeServerBegin(const KeyRef& key) { BinaryReader rd(key.removePrefix(serverKeysPrefix), Unversioned()); rd >> server_id; rd.readBytes(1); // skip "/" - const auto 
remainedBytes = rd.remainedBytes();
-	KeyRef ref = KeyRef(rd.arenaRead(remainedBytes), remainedBytes);
+	const auto remainingBytes = rd.remainingBytes();
+	KeyRef ref = KeyRef(rd.arenaRead(remainingBytes), remainingBytes);
	// std::cout << ref.size() << " " << ref.toString() << std::endl;
	return std::make_pair(server_id, Key(ref));
}
diff --git a/flow/serialize.h b/flow/serialize.h
index 4b331a73fc..5c218b9bc6 100644
--- a/flow/serialize.h
+++ b/flow/serialize.h
@@ -634,19 +634,16 @@ public:
	check = nullptr;
}

-	size_t size() const { return len; }
	size_t remainingBytes() const { return end - begin; };

protected:
-	_Reader(const char* begin, const char* end) : begin(begin), end(end), len(end - begin) {}
-	_Reader(const char* begin, const char* end, const Arena& arena)
-	  : begin(begin), end(end), m_pool(arena), len(end - begin) {}
+	_Reader(const char* begin, const char* end) : begin(begin), end(end) {}
+	_Reader(const char* begin, const char* end, const Arena& arena) : begin(begin), end(end), m_pool(arena) {}

	const char *begin, *end;
	const char* check = nullptr;
	Arena m_pool;
	ProtocolVersion m_protocolVersion;
-	size_t len;
};

class ArenaReader : public _Reader {

From 30922268d716764e68ba4813d28f601323642236 Mon Sep 17 00:00:00 2001
From: "Bharadwaj V.R"
Date: Fri, 13 May 2022 13:03:00 -0700
Subject: [PATCH 212/299] Minor readability improvement to DD code: replace
 .first/.second access on pairs with descriptive names via structured
 bindings

---
 fdbserver/DDTeamCollection.actor.cpp      | 107 +++++++++++-----------
 fdbserver/DataDistributionQueue.actor.cpp |   8 +-
 2 files changed, 58 insertions(+), 57 deletions(-)

diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp
index 8de3cebcb9..a6b47db11e 100644
--- a/fdbserver/DDTeamCollection.actor.cpp
+++ b/fdbserver/DDTeamCollection.actor.cpp
@@ -468,20 +468,20 @@ public:
	self->healthyZone.set(initTeams->initHealthyZoneValue);
	// SOMEDAY: If some servers have teams and not others (or some servers have more data than others) and there is
	// an address/locality collision, should we preferentially mark the least used server as undesirable?
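The hunks that follow all apply one pattern: iteration over maps and pair containers via opaque `.first`/`.second` access is rewritten with C++17 structured bindings, so the loop header names both roles. In isolation, the transformation looks like this (a self-contained illustration, not code from this patch):

```cpp
#include <iostream>
#include <map>
#include <string>

int main() {
	std::map<int, std::string> serverInfo{ { 1, "ss-a" }, { 2, "ss-b" } };
	// Before: the reader must remember what .first and .second hold.
	for (const auto& server : serverInfo)
		std::cout << server.first << " -> " << server.second << "\n";
	// After: the binding names document the key/value roles directly.
	for (const auto& [serverID, name] : serverInfo)
		std::cout << serverID << " -> " << name << "\n";
	return 0;
}
```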
-	for (auto& server : initTeams->allServers) {
-		if (self->shouldHandleServer(server.first)) {
-			if (!self->isValidLocality(self->configuration.storagePolicy, server.first.locality)) {
+	for (auto& [server, procClass] : initTeams->allServers) {
+		if (self->shouldHandleServer(server)) {
+			if (!self->isValidLocality(self->configuration.storagePolicy, server.locality)) {
				TraceEvent(SevWarnAlways, "MissingLocality")
-				    .detail("Server", server.first.uniqueID)
-				    .detail("Locality", server.first.locality.toString());
-				auto addr = server.first.stableAddress();
+				    .detail("Server", server.uniqueID)
+				    .detail("Locality", server.locality.toString());
+				auto addr = server.stableAddress();
				self->invalidLocalityAddr.insert(AddressExclusion(addr.ip, addr.port));
				if (self->checkInvalidLocalities.isReady()) {
					self->checkInvalidLocalities = checkAndRemoveInvalidLocalityAddr(self);
					self->addActor.send(self->checkInvalidLocalities);
				}
			}
-			self->addServer(server.first, server.second, self->serverTrackerErrorOut, 0, *ddEnabledState);
+			self->addServer(server, procClass, self->serverTrackerErrorOut, 0, *ddEnabledState);
		}
	}

@@ -514,13 +514,14 @@ public:
		}
	}

-	for (auto i = self->server_info.begin(); i != self->server_info.end(); ++i) {
-		if (!self->server_status.get(i->first).isUnhealthy()) {
+	for (auto& [serverID, server] : self->server_info) {
+		if (!self->server_status.get(serverID).isUnhealthy()) {
			++serverCount;
-			LocalityData const& serverLocation = i->second->getLastKnownInterface().locality;
+			LocalityData const& serverLocation = server->getLastKnownInterface().locality;
			machines.insert(serverLocation.zoneId());
		}
	}
+
	uniqueMachines = machines.size();
	TraceEvent("BuildTeams", self->distributorId)
	    .detail("ServerCount", self->server_info.size())
@@ -594,8 +595,8 @@ public:
	int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount;
	int healthyMachineTeamCount = self->getHealthyMachineTeamCount();

-	std::pair<int, int> minMaxTeamsOnServer = self->calculateMinMaxServerTeamsOnServer();
-	std::pair<int, int> minMaxMachineTeamsOnMachine =
+	auto [minTeamsOnServer, maxTeamsOnServer] = self->calculateMinMaxServerTeamsOnServer();
+	auto [minMachineTeamsOnMachine, maxMachineTeamsOnMachine] =
	    self->calculateMinMaxMachineTeamsOnMachine();

	TraceEvent("TeamCollectionInfo", self->distributorId)
@@ -611,10 +612,10 @@ public:
	    .detail("DesiredMachineTeams", desiredMachineTeams)
	    .detail("MaxMachineTeams", maxMachineTeams)
	    .detail("TotalHealthyMachines", totalHealthyMachineCount)
-	    .detail("MinTeamsOnServer", minMaxTeamsOnServer.first)
-	    .detail("MaxTeamsOnServer", minMaxTeamsOnServer.second)
-	    .detail("MinMachineTeamsOnMachine", minMaxMachineTeamsOnMachine.first)
-	    .detail("MaxMachineTeamsOnMachine", minMaxMachineTeamsOnMachine.second)
+	    .detail("MinTeamsOnServer", minTeamsOnServer)
+	    .detail("MaxTeamsOnServer", maxTeamsOnServer)
+	    .detail("MinMachineTeamsOnMachine", minMachineTeamsOnMachine)
+	    .detail("MaxMachineTeamsOnMachine", maxMachineTeamsOnMachine)
	    .detail("DoBuildTeams", self->doBuildTeams)
	    .trackLatest(self->teamCollectionInfoEventHolder->trackingKey);
}
@@ -3257,24 +3258,24 @@ void DDTeamCollection::traceServerInfo() const {
	int i = 0;

	TraceEvent("ServerInfo", distributorId).detail("Size", server_info.size());
-	for (auto& server : server_info) {
+	for (auto& [serverID, server] : server_info) {
		TraceEvent("ServerInfo", distributorId)
		    .detail("ServerInfoIndex", i++)
-		    .detail("ServerID", server.first.toString())
-		    .detail("ServerTeamOwned", server.second->getTeams().size())
-		    .detail("MachineID",
server.second->machine->machineID.contents().toString()) - .detail("StoreType", server.second->getStoreType().toString()) - .detail("InDesiredDC", server.second->isInDesiredDC()); + .detail("ServerID", serverID.toString()) + .detail("ServerTeamOwned", server->getTeams().size()) + .detail("MachineID", server->machine->machineID.contents().toString()) + .detail("StoreType", server->getStoreType().toString()) + .detail("InDesiredDC", server->isInDesiredDC()); } - for (auto& server : server_info) { - const UID& uid = server.first; + for (auto& [serverID, server] : server_info) { TraceEvent("ServerStatus", distributorId) - .detail("ServerID", uid) - .detail("Healthy", !server_status.get(uid).isUnhealthy()) - .detail("MachineIsValid", get(server_info, uid)->machine.isValid()) + .detail("ServerID", serverID) + .detail("Healthy", !server_status.get(serverID).isUnhealthy()) + .detail("MachineIsValid", get(server_info, serverID)->machine.isValid()) .detail("MachineTeamSize", - get(server_info, uid)->machine.isValid() ? get(server_info, uid)->machine->machineTeams.size() - : -1); + get(server_info, serverID)->machine.isValid() + ? get(server_info, serverID)->machine->machineTeams.size() + : -1); } } @@ -3838,16 +3839,16 @@ void DDTeamCollection::addTeam(const std::vector>& newTe // For a good team, we add it to teams and create machine team for it when necessary teams.push_back(teamInfo); - for (int i = 0; i < newTeamServers.size(); ++i) { - newTeamServers[i]->addTeam(teamInfo); + for (auto& server : newTeamServers) { + server->addTeam(teamInfo); } // Find or create machine team for the server team // Add the reference of machineTeam (with machineIDs) into process team std::vector> machineIDs; - for (auto server = newTeamServers.begin(); server != newTeamServers.end(); ++server) { - ASSERT_WE_THINK((*server)->machine.isValid()); - machineIDs.push_back((*server)->machine->machineID); + for (auto& server : newTeamServers) { + ASSERT_WE_THINK(server->machine.isValid()); + machineIDs.push_back(server->machine->machineID); } sort(machineIDs.begin(), machineIDs.end()); Reference machineTeamInfo = findMachineTeam(machineIDs); @@ -3907,9 +3908,9 @@ Reference DDTeamCollection::addMachineTeam(std::vectorfirst).isUnhealthy()) { - checkAndCreateMachine(i->second); + for (auto& [serverID, server] : server_info) { + if (!server_status.get(serverID).isUnhealthy()) { + checkAndCreateMachine(server); totalServerIndex++; } } @@ -4021,26 +4022,26 @@ void DDTeamCollection::traceAllInfo(bool shouldPrint) const { void DDTeamCollection::rebuildMachineLocalityMap() { machineLocalityMap.clear(); int numHealthyMachine = 0; - for (auto machine = machine_info.begin(); machine != machine_info.end(); ++machine) { - if (machine->second->serversOnMachine.empty()) { + for (auto& [_, machine] : machine_info) { + if (machine->serversOnMachine.empty()) { TraceEvent(SevWarn, "RebuildMachineLocalityMapError") - .detail("Machine", machine->second->machineID.toString()) + .detail("Machine", machine->machineID.toString()) .detail("NumServersOnMachine", 0); continue; } - if (!isMachineHealthy(machine->second)) { + if (!isMachineHealthy(machine)) { continue; } - Reference representativeServer = machine->second->serversOnMachine[0]; + Reference representativeServer = machine->serversOnMachine[0]; auto& locality = representativeServer->getLastKnownInterface().locality; if (!isValidLocality(configuration.storagePolicy, locality)) { TraceEvent(SevWarn, "RebuildMachineLocalityMapError") - .detail("Machine", machine->second->machineID.toString()) + 
.detail("Machine", machine->machineID.toString()) .detail("InvalidLocality", locality.toString()); continue; } const LocalityEntry& localityEntry = machineLocalityMap.add(locality, &representativeServer->getId()); - machine->second->localityEntry = localityEntry; + machine->localityEntry = localityEntry; ++numHealthyMachine; } } @@ -4420,12 +4421,12 @@ bool DDTeamCollection::notEnoughMachineTeamsForAMachine() const { SERVER_KNOBS->TR_FLAG_REMOVE_MT_WITH_MOST_TEAMS ? (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (configuration.storageTeamSize + 1)) / 2 : SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER; - for (auto& m : machine_info) { + for (auto& [_, machine] : machine_info) { // If SERVER_KNOBS->TR_FLAG_REMOVE_MT_WITH_MOST_TEAMS is false, // The desired machine team number is not the same with the desired server team number // in notEnoughTeamsForAServer() below, because the machineTeamRemover() does not // remove a machine team with the most number of machine teams. - if (m.second->machineTeams.size() < targetMachineTeamNumPerMachine && isMachineHealthy(m.second)) { + if (machine->machineTeams.size() < targetMachineTeamNumPerMachine && isMachineHealthy(machine)) { return true; } } @@ -4443,8 +4444,8 @@ bool DDTeamCollection::notEnoughTeamsForAServer() const { // (#servers * DESIRED_TEAMS_PER_SERVER * storageTeamSize) / #servers. int targetTeamNumPerServer = (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (configuration.storageTeamSize + 1)) / 2; ASSERT_GT(targetTeamNumPerServer, 0); - for (auto& s : server_info) { - if (s.second->getTeams().size() < targetTeamNumPerServer && !server_status.get(s.first).isUnhealthy()) { + for (auto& [serverID, server] : server_info) { + if (server->getTeams().size() < targetTeamNumPerServer && !server_status.get(serverID).isUnhealthy()) { return true; } } @@ -4651,11 +4652,11 @@ void DDTeamCollection::traceTeamCollectionInfo() const { void DDTeamCollection::noHealthyTeams() const { std::set desiredServerSet; std::string desc; - for (auto i = server_info.begin(); i != server_info.end(); ++i) { - ASSERT(i->first == i->second->getId()); - if (!server_status.get(i->first).isFailed) { - desiredServerSet.insert(i->first); - desc += i->first.shortString() + " (" + i->second->getLastKnownInterface().toString() + "), "; + for (auto& [serverID, server] : server_info) { + ASSERT(serverID == server->getId()); + if (!server_status.get(serverID).isFailed) { + desiredServerSet.insert(serverID); + desc += serverID.shortString() + " (" + server->getLastKnownInterface().toString() + "), "; } } diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 58d4ac201d..310b90de9e 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -355,8 +355,8 @@ bool canLaunchDest(const std::vector, return true; } int workFactor = getDestWorkFactor(); - for (auto& team : candidateTeams) { - for (UID id : team.first->getServerIDs()) { + for (auto& [team, _] : candidateTeams) { + for (UID id : team->getServerIDs()) { if (!busymapDest[id].canLaunch(priority, workFactor)) { return false; } @@ -378,8 +378,8 @@ void launchDest(RelocateData& relocation, std::map& destBusymap) { ASSERT(relocation.completeDests.empty()); int destWorkFactor = getDestWorkFactor(); - for (auto& team : candidateTeams) { - for (UID id : team.first->getServerIDs()) { + for (auto& [team, _] : candidateTeams) { + for (UID id : team->getServerIDs()) { relocation.completeDests.push_back(id); destBusymap[id].addWork(relocation.priority, 
destWorkFactor); } From 4f3a7b7e7f41f80f04743c6da89e1df4565b4144 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Fri, 13 May 2022 15:26:39 -0700 Subject: [PATCH 213/299] refactor ReadWriteWorkload --- fdbserver/CMakeLists.txt | 1 + fdbserver/workloads/ReadWrite.actor.cpp | 572 +++++++----------- fdbserver/workloads/ReadWriteWorkload.actor.h | 171 ++++++ fdbserver/workloads/SkewedReadWrite.actor.cpp | 416 +------------ 4 files changed, 415 insertions(+), 745 deletions(-) create mode 100644 fdbserver/workloads/ReadWriteWorkload.actor.h diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index 03a51cb662..18a9e10b6d 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -267,6 +267,7 @@ set(FDBSERVER_SRCS workloads/ReadAfterWrite.actor.cpp workloads/ReadHotDetection.actor.cpp workloads/ReadWrite.actor.cpp + workloads/ReadWriteWorkload.actor.h workloads/RemoveServersSafely.actor.cpp workloads/ReportConflictingKeys.actor.cpp workloads/RestoreBackup.actor.cpp diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp index f149de94ea..475c3a023c 100644 --- a/fdbserver/workloads/ReadWrite.actor.cpp +++ b/fdbserver/workloads/ReadWrite.actor.cpp @@ -28,209 +28,13 @@ #include "fdbserver/WorkerInterface.actor.h" #include "fdbserver/workloads/workloads.actor.h" #include "fdbserver/workloads/BulkSetup.actor.h" +#include "fdbserver/workloads/ReadWriteWorkload.actor.h" #include "fdbclient/ReadYourWrites.h" #include "flow/TDMetric.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. -const int sampleSize = 10000; -static Future nextRV; -static Version lastRV = invalidVersion; - -ACTOR static Future getNextRV(Database db) { - state Transaction tr(db); - loop { - try { - Version v = wait(tr.getReadVersion()); - return v; - } catch (Error& e) { - wait(tr.onError(e)); - } - } -} -static Future getInconsistentReadVersion(Database const& db) { - if (!nextRV.isValid() || nextRV.isReady()) { // if no getNextRV() running - if (nextRV.isValid()) - lastRV = nextRV.get(); - nextRV = getNextRV(db); - } - if (lastRV == invalidVersion) - return nextRV; - else - return lastRV; -} - -DESCR struct TransactionSuccessMetric { - int64_t totalLatency; // ns - int64_t startLatency; // ns - int64_t commitLatency; // ns - int64_t retries; // count -}; - -DESCR struct TransactionFailureMetric { - int64_t startLatency; // ns - int64_t errorCode; // flow error code -}; - -DESCR struct ReadMetric { - int64_t readLatency; // ns -}; - -struct ReadWriteWorkload : KVWorkload { - // general test setting - Standalone descriptionString; - bool doSetup, cancelWorkersAtDuration; - double testDuration, transactionsPerSecond, warmingDelay, maxInsertRate, debugInterval, debugTime; - double metricsStart, metricsDuration; - std::vector insertionCountsToMeasure; // measure the speed of sequential insertion when bulkSetup - - // test log setting - bool enableReadLatencyLogging; - double periodicLoggingInterval; - - // use ReadWrite as a ramp up workload - bool rampUpLoad; // indicate this is a ramp up workload - int rampSweepCount; // how many times of ramp up - bool rampTransactionType; // choose transaction type based on client start time - bool rampUpConcurrency; // control client concurrency - - // transaction setting - bool useRYW; - bool batchPriority; - bool rangeReads; // read operations are all single key range read - bool dependentReads; // read operations are issued sequentially - bool inconsistentReads; // read with previous read version - bool 
adjacentReads; // keys are adjacent within a transaction - bool adjacentWrites; - double alpha; // probability for run TransactionA type - // two type of transaction - int readsPerTransactionA, writesPerTransactionA; - int readsPerTransactionB, writesPerTransactionB; - int extraReadConflictRangesPerTransaction, extraWriteConflictRangesPerTransaction; - std::string valueString; - // hot traffic pattern - double hotKeyFraction, forceHotProbability = 0; // key based hot traffic setting - - // states of metric - Int64MetricHandle totalReadsMetric; - Int64MetricHandle totalRetriesMetric; - EventMetricHandle transactionSuccessMetric; - EventMetricHandle transactionFailureMetric; - EventMetricHandle readMetric; - PerfIntCounter aTransactions, bTransactions, retries; - ContinuousSample latencies, readLatencies, commitLatencies, GRVLatencies, fullReadLatencies; - double readLatencyTotal; - int readLatencyCount; - std::vector periodicMetrics; - std::vector> ratesAtKeyCounts; // sequential insertion speed - - // other internal states - std::vector> clients; - double loadTime, clientBegin; - - ReadWriteWorkload(WorkloadContext const& wcx) - : KVWorkload(wcx), dependentReads(false), adjacentReads(false), adjacentWrites(false), - totalReadsMetric(LiteralStringRef("RWWorkload.TotalReads")), - totalRetriesMetric(LiteralStringRef("RWWorkload.TotalRetries")), aTransactions("A Transactions"), - bTransactions("B Transactions"), retries("Retries"), latencies(sampleSize), readLatencies(sampleSize), - commitLatencies(sampleSize), GRVLatencies(sampleSize), fullReadLatencies(sampleSize), readLatencyTotal(0), - readLatencyCount(0), loadTime(0.0), clientBegin(0) { - transactionSuccessMetric.init(LiteralStringRef("RWWorkload.SuccessfulTransaction")); - transactionFailureMetric.init(LiteralStringRef("RWWorkload.FailedTransaction")); - readMetric.init(LiteralStringRef("RWWorkload.Read")); - - testDuration = getOption(options, LiteralStringRef("testDuration"), 10.0); - transactionsPerSecond = getOption(options, LiteralStringRef("transactionsPerSecond"), 5000.0) / clientCount; - double allowedLatency = getOption(options, LiteralStringRef("allowedLatency"), 0.250); - actorCount = ceil(transactionsPerSecond * allowedLatency); - actorCount = getOption(options, LiteralStringRef("actorCountPerTester"), actorCount); - - readsPerTransactionA = getOption(options, LiteralStringRef("readsPerTransactionA"), 10); - writesPerTransactionA = getOption(options, LiteralStringRef("writesPerTransactionA"), 0); - readsPerTransactionB = getOption(options, LiteralStringRef("readsPerTransactionB"), 1); - writesPerTransactionB = getOption(options, LiteralStringRef("writesPerTransactionB"), 9); - alpha = getOption(options, LiteralStringRef("alpha"), 0.1); - - extraReadConflictRangesPerTransaction = - getOption(options, LiteralStringRef("extraReadConflictRangesPerTransaction"), 0); - extraWriteConflictRangesPerTransaction = - getOption(options, LiteralStringRef("extraWriteConflictRangesPerTransaction"), 0); - - valueString = std::string(maxValueBytes, '.'); - if (nodePrefix > 0) { - keyBytes += 16; - } - - metricsStart = getOption(options, LiteralStringRef("metricsStart"), 0.0); - metricsDuration = getOption(options, LiteralStringRef("metricsDuration"), testDuration); - if (getOption(options, LiteralStringRef("discardEdgeMeasurements"), true)) { - // discardEdgeMeasurements keeps the metrics from the middle 3/4 of the test - metricsStart += testDuration * 0.125; - metricsDuration *= 0.75; - } - - dependentReads = getOption(options, 
LiteralStringRef("dependentReads"), false); - warmingDelay = getOption(options, LiteralStringRef("warmingDelay"), 0.0); - maxInsertRate = getOption(options, LiteralStringRef("maxInsertRate"), 1e12); - debugInterval = getOption(options, LiteralStringRef("debugInterval"), 0.0); - debugTime = getOption(options, LiteralStringRef("debugTime"), 0.0); - enableReadLatencyLogging = getOption(options, LiteralStringRef("enableReadLatencyLogging"), false); - periodicLoggingInterval = getOption(options, LiteralStringRef("periodicLoggingInterval"), 5.0); - cancelWorkersAtDuration = getOption(options, LiteralStringRef("cancelWorkersAtDuration"), true); - inconsistentReads = getOption(options, LiteralStringRef("inconsistentReads"), false); - adjacentReads = getOption(options, LiteralStringRef("adjacentReads"), false); - adjacentWrites = getOption(options, LiteralStringRef("adjacentWrites"), false); - rampUpLoad = getOption(options, LiteralStringRef("rampUpLoad"), false); - useRYW = getOption(options, LiteralStringRef("useRYW"), false); - rampSweepCount = getOption(options, LiteralStringRef("rampSweepCount"), 1); - rangeReads = getOption(options, LiteralStringRef("rangeReads"), false); - rampTransactionType = getOption(options, LiteralStringRef("rampTransactionType"), false); - rampUpConcurrency = getOption(options, LiteralStringRef("rampUpConcurrency"), false); - doSetup = getOption(options, LiteralStringRef("setup"), true); - batchPriority = getOption(options, LiteralStringRef("batchPriority"), false); - descriptionString = getOption(options, LiteralStringRef("description"), LiteralStringRef("ReadWrite")); - - if (rampUpConcurrency) - ASSERT(rampSweepCount == 2); // Implementation is hard coded to ramp up and down - - // Validate that keyForIndex() is monotonic - for (int i = 0; i < 30; i++) { - int64_t a = deterministicRandom()->randomInt64(0, nodeCount); - int64_t b = deterministicRandom()->randomInt64(0, nodeCount); - if (a > b) { - std::swap(a, b); - } - ASSERT(a <= b); - ASSERT((keyForIndex(a, false) <= keyForIndex(b, false))); - } - - std::vector insertionCountsToMeasureString = - getOption(options, LiteralStringRef("insertionCountsToMeasure"), std::vector()); - for (int i = 0; i < insertionCountsToMeasureString.size(); i++) { - try { - uint64_t count = boost::lexical_cast(insertionCountsToMeasureString[i]); - insertionCountsToMeasure.push_back(count); - } catch (...) { - } - } - - { - // with P(hotTrafficFraction) an access is directed to one of a fraction - // of hot keys, else it is directed to a disjoint set of cold keys - hotKeyFraction = getOption(options, LiteralStringRef("hotKeyFraction"), 0.0); - double hotTrafficFraction = getOption(options, LiteralStringRef("hotTrafficFraction"), 0.0); - ASSERT(hotKeyFraction >= 0 && hotTrafficFraction <= 1); - ASSERT(hotKeyFraction <= hotTrafficFraction); // hot keys should be actually hot! 
- // p(Cold key) = (1-FHP) * (1-hkf) - // p(Cold key) = (1-htf) - // solving for FHP gives: - forceHotProbability = (hotTrafficFraction - hotKeyFraction) / (1 - hotKeyFraction); - } - } - - std::string description() const override { return descriptionString.toString(); } - Future setup(Database const& cx) override { return _setup(cx, this); } - Future start(Database const& cx) override { return _start(cx, this); } - +struct ReadWriteCommonImpl { + // trace methods ACTOR static Future traceDumpWorkers(Reference const> db) { try { loop { @@ -257,91 +61,7 @@ struct ReadWriteWorkload : KVWorkload { throw; } } - - Future check(Database const& cx) override { - clients.clear(); - - if (!cancelWorkersAtDuration && now() < metricsStart + metricsDuration) - metricsDuration = now() - metricsStart; - - g_traceBatch.dump(); - if (clientId == 0) - return traceDumpWorkers(dbInfo); - else - return true; - } - - void getMetrics(std::vector& m) override { - double duration = metricsDuration; - int reads = - (aTransactions.getValue() * readsPerTransactionA) + (bTransactions.getValue() * readsPerTransactionB); - int writes = - (aTransactions.getValue() * writesPerTransactionA) + (bTransactions.getValue() * writesPerTransactionB); - m.emplace_back("Measured Duration", duration, Averaged::True); - m.emplace_back( - "Transactions/sec", (aTransactions.getValue() + bTransactions.getValue()) / duration, Averaged::False); - m.emplace_back("Operations/sec", ((reads + writes) / duration), Averaged::False); - m.push_back(aTransactions.getMetric()); - m.push_back(bTransactions.getMetric()); - m.push_back(retries.getMetric()); - m.emplace_back("Mean load time (seconds)", loadTime, Averaged::True); - m.emplace_back("Read rows", reads, Averaged::False); - m.emplace_back("Write rows", writes, Averaged::False); - - if (!rampUpLoad) { - m.emplace_back("Mean Latency (ms)", 1000 * latencies.mean(), Averaged::True); - m.emplace_back("Median Latency (ms, averaged)", 1000 * latencies.median(), Averaged::True); - m.emplace_back("90% Latency (ms, averaged)", 1000 * latencies.percentile(0.90), Averaged::True); - m.emplace_back("98% Latency (ms, averaged)", 1000 * latencies.percentile(0.98), Averaged::True); - m.emplace_back("Max Latency (ms, averaged)", 1000 * latencies.max(), Averaged::True); - - m.emplace_back("Mean Row Read Latency (ms)", 1000 * readLatencies.mean(), Averaged::True); - m.emplace_back("Median Row Read Latency (ms, averaged)", 1000 * readLatencies.median(), Averaged::True); - m.emplace_back("Max Row Read Latency (ms, averaged)", 1000 * readLatencies.max(), Averaged::True); - - m.emplace_back("Mean Total Read Latency (ms)", 1000 * fullReadLatencies.mean(), Averaged::True); - m.emplace_back( - "Median Total Read Latency (ms, averaged)", 1000 * fullReadLatencies.median(), Averaged::True); - m.emplace_back("Max Total Latency (ms, averaged)", 1000 * fullReadLatencies.max(), Averaged::True); - - m.emplace_back("Mean GRV Latency (ms)", 1000 * GRVLatencies.mean(), Averaged::True); - m.emplace_back("Median GRV Latency (ms, averaged)", 1000 * GRVLatencies.median(), Averaged::True); - m.emplace_back("Max GRV Latency (ms, averaged)", 1000 * GRVLatencies.max(), Averaged::True); - - m.emplace_back("Mean Commit Latency (ms)", 1000 * commitLatencies.mean(), Averaged::True); - m.emplace_back("Median Commit Latency (ms, averaged)", 1000 * commitLatencies.median(), Averaged::True); - m.emplace_back("Max Commit Latency (ms, averaged)", 1000 * commitLatencies.max(), Averaged::True); - } - - m.emplace_back("Read rows/sec", reads / 
duration, Averaged::False); - m.emplace_back("Write rows/sec", writes / duration, Averaged::False); - m.emplace_back( - "Bytes read/sec", (reads * (keyBytes + (minValueBytes + maxValueBytes) * 0.5)) / duration, Averaged::False); - m.emplace_back("Bytes written/sec", - (writes * (keyBytes + (minValueBytes + maxValueBytes) * 0.5)) / duration, - Averaged::False); - m.insert(m.end(), periodicMetrics.begin(), periodicMetrics.end()); - - std::vector>::iterator ratesItr = ratesAtKeyCounts.begin(); - for (; ratesItr != ratesAtKeyCounts.end(); ratesItr++) - m.emplace_back(format("%lld keys imported bytes/sec", ratesItr->first), ratesItr->second, Averaged::False); - } - - Value randomValue() { - return StringRef((uint8_t*)valueString.c_str(), - deterministicRandom()->randomInt(minValueBytes, maxValueBytes + 1)); - } - - Standalone operator()(uint64_t n) { return KeyValueRef(keyForIndex(n, false), randomValue()); } - - template - void setupTransaction(Trans* tr) { - if (batchPriority) { - tr->setOption(FDBTransactionOptions::PRIORITY_BATCH); - } - } - - ACTOR static Future tracePeriodically(ReadWriteWorkload* self) { + ACTOR static Future tracePeriodically(ReadWriteCommon* self) { state double start = now(); state double elapsed = 0.0; state int64_t last_ops = 0; @@ -477,7 +197,6 @@ struct ReadWriteWorkload : KVWorkload { self->readLatencyCount = 0; } } - ACTOR static Future logLatency(Future> f, ContinuousSample* latencies, double* totalLatency, @@ -498,7 +217,6 @@ struct ReadWriteWorkload : KVWorkload { } return Void(); } - ACTOR static Future logLatency(Future f, ContinuousSample* latencies, double* totalLatency, @@ -520,52 +238,7 @@ struct ReadWriteWorkload : KVWorkload { return Void(); } - ACTOR template - Future readOp(Trans* tr, std::vector keys, ReadWriteWorkload* self, bool shouldRecord) { - if (!keys.size()) - return Void(); - if (!self->dependentReads) { - std::vector> readers; - if (self->rangeReads) { - for (int op = 0; op < keys.size(); op++) { - ++self->totalReadsMetric; - readers.push_back(logLatency( - tr->getRange(KeyRangeRef(self->keyForIndex(keys[op]), Key(strinc(self->keyForIndex(keys[op])))), - GetRangeLimits(-1, 80000)), - &self->readLatencies, - &self->readLatencyTotal, - &self->readLatencyCount, - self->readMetric, - shouldRecord)); - } - } else { - for (int op = 0; op < keys.size(); op++) { - ++self->totalReadsMetric; - readers.push_back(logLatency(tr->get(self->keyForIndex(keys[op])), - &self->readLatencies, - &self->readLatencyTotal, - &self->readLatencyCount, - self->readMetric, - shouldRecord)); - } - } - wait(waitForAll(readers)); - } else { - state int op; - for (op = 0; op < keys.size(); op++) { - ++self->totalReadsMetric; - wait(logLatency(tr->get(self->keyForIndex(keys[op])), - &self->readLatencies, - &self->readLatencyTotal, - &self->readLatencyCount, - self->readMetric, - shouldRecord)); - } - } - return Void(); - } - - ACTOR Future _setup(Database cx, ReadWriteWorkload* self) { + ACTOR static Future setup(Database cx, ReadWriteCommon* self) { if (!self->doSetup) return Void(); @@ -587,8 +260,232 @@ struct ReadWriteWorkload : KVWorkload { return Void(); } +}; - ACTOR Future _start(Database cx, ReadWriteWorkload* self) { +Future ReadWriteCommon::tracePeriodically() { + return ReadWriteCommonImpl::tracePeriodically(this); +} + +Future ReadWriteCommon::logLatency(Future> f, bool shouldRecord) { + return ReadWriteCommonImpl::logLatency( + f, &readLatencies, &readLatencyTotal, &readLatencyCount, readMetric, shouldRecord); +} + +Future ReadWriteCommon::logLatency(Future 
f, bool shouldRecord) { + return ReadWriteCommonImpl::logLatency( + f, &readLatencies, &readLatencyTotal, &readLatencyCount, readMetric, shouldRecord); +} + +Future ReadWriteCommon::setup(Database const& cx) { + return ReadWriteCommonImpl::setup(cx, this); +} + +Future ReadWriteCommon::check(Database const& cx) { + clients.clear(); + + if (!cancelWorkersAtDuration && now() < metricsStart + metricsDuration) + metricsDuration = now() - metricsStart; + + g_traceBatch.dump(); + if (clientId == 0) + return ReadWriteCommonImpl::traceDumpWorkers(dbInfo); + else + return true; +} + +void ReadWriteCommon::getMetrics(std::vector& m) { + double duration = metricsDuration; + int reads = (aTransactions.getValue() * readsPerTransactionA) + (bTransactions.getValue() * readsPerTransactionB); + int writes = + (aTransactions.getValue() * writesPerTransactionA) + (bTransactions.getValue() * writesPerTransactionB); + m.emplace_back("Measured Duration", duration, Averaged::True); + m.emplace_back( + "Transactions/sec", (aTransactions.getValue() + bTransactions.getValue()) / duration, Averaged::False); + m.emplace_back("Operations/sec", ((reads + writes) / duration), Averaged::False); + m.push_back(aTransactions.getMetric()); + m.push_back(bTransactions.getMetric()); + m.push_back(retries.getMetric()); + m.emplace_back("Mean load time (seconds)", loadTime, Averaged::True); + m.emplace_back("Read rows", reads, Averaged::False); + m.emplace_back("Write rows", writes, Averaged::False); + m.emplace_back("Read rows/sec", reads / duration, Averaged::False); + m.emplace_back("Write rows/sec", writes / duration, Averaged::False); + m.emplace_back( + "Bytes read/sec", (reads * (keyBytes + (minValueBytes + maxValueBytes) * 0.5)) / duration, Averaged::False); + m.emplace_back( + "Bytes written/sec", (writes * (keyBytes + (minValueBytes + maxValueBytes) * 0.5)) / duration, Averaged::False); + m.insert(m.end(), periodicMetrics.begin(), periodicMetrics.end()); + + std::vector>::iterator ratesItr = ratesAtKeyCounts.begin(); + for (; ratesItr != ratesAtKeyCounts.end(); ratesItr++) + m.emplace_back(format("%lld keys imported bytes/sec", ratesItr->first), ratesItr->second, Averaged::False); +} + +Value ReadWriteCommon::randomValue() { + return StringRef((uint8_t*)valueString.c_str(), deterministicRandom()->randomInt(minValueBytes, maxValueBytes + 1)); +} + +Standalone ReadWriteCommon::operator()(uint64_t n) { + return KeyValueRef(keyForIndex(n, false), randomValue()); +} + +bool ReadWriteCommon::shouldRecord(double checkTime) { + double timeSinceStart = checkTime - clientBegin; + return timeSinceStart >= metricsStart && timeSinceStart < (metricsStart + metricsDuration); +} + +static Future nextRV; +static Version lastRV = invalidVersion; + +ACTOR static Future getNextRV(Database db) { + state Transaction tr(db); + loop { + try { + Version v = wait(tr.getReadVersion()); + return v; + } catch (Error& e) { + wait(tr.onError(e)); + } + } +} + +static Future getInconsistentReadVersion(Database const& db) { + if (!nextRV.isValid() || nextRV.isReady()) { // if no getNextRV() running + if (nextRV.isValid()) + lastRV = nextRV.get(); + nextRV = getNextRV(db); + } + if (lastRV == invalidVersion) + return nextRV; + else + return lastRV; +} + +struct ReadWriteWorkload : ReadWriteCommon { + // use ReadWrite as a ramp up workload + bool rampUpLoad; // indicate this is a ramp up workload + int rampSweepCount; // how many times of ramp up + bool rampTransactionType; // choose transaction type based on client start time + bool rampUpConcurrency; 
// control client concurrency + + // transaction setting + bool batchPriority; + bool rangeReads; // read operations are all single key range read + bool dependentReads; // read operations are issued sequentially + bool inconsistentReads; // read with previous read version + bool adjacentReads; // keys are adjacent within a transaction + bool adjacentWrites; + int extraReadConflictRangesPerTransaction, extraWriteConflictRangesPerTransaction; + + // hot traffic pattern + double hotKeyFraction, forceHotProbability = 0; // key based hot traffic setting + + ReadWriteWorkload(WorkloadContext const& wcx) + : ReadWriteCommon(wcx), dependentReads(false), adjacentReads(false), adjacentWrites(false) { + extraReadConflictRangesPerTransaction = + getOption(options, LiteralStringRef("extraReadConflictRangesPerTransaction"), 0); + extraWriteConflictRangesPerTransaction = + getOption(options, LiteralStringRef("extraWriteConflictRangesPerTransaction"), 0); + dependentReads = getOption(options, LiteralStringRef("dependentReads"), false); + inconsistentReads = getOption(options, LiteralStringRef("inconsistentReads"), false); + adjacentReads = getOption(options, LiteralStringRef("adjacentReads"), false); + adjacentWrites = getOption(options, LiteralStringRef("adjacentWrites"), false); + rampUpLoad = getOption(options, LiteralStringRef("rampUpLoad"), false); + rampSweepCount = getOption(options, LiteralStringRef("rampSweepCount"), 1); + rangeReads = getOption(options, LiteralStringRef("rangeReads"), false); + rampTransactionType = getOption(options, LiteralStringRef("rampTransactionType"), false); + rampUpConcurrency = getOption(options, LiteralStringRef("rampUpConcurrency"), false); + batchPriority = getOption(options, LiteralStringRef("batchPriority"), false); + descriptionString = getOption(options, LiteralStringRef("description"), LiteralStringRef("ReadWrite")); + + if (rampUpConcurrency) + ASSERT(rampSweepCount == 2); // Implementation is hard coded to ramp up and down + + { + // with P(hotTrafficFraction) an access is directed to one of a fraction + // of hot keys, else it is directed to a disjoint set of cold keys + hotKeyFraction = getOption(options, LiteralStringRef("hotKeyFraction"), 0.0); + double hotTrafficFraction = getOption(options, LiteralStringRef("hotTrafficFraction"), 0.0); + ASSERT(hotKeyFraction >= 0 && hotTrafficFraction <= 1); + ASSERT(hotKeyFraction <= hotTrafficFraction); // hot keys should be actually hot! 
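The derivation restated in the comments just below is easy to take on faith, so spelled out: a cold key is reached only when the hot-redirect coin (probability FHP) fails and the uniform draw then lands outside the hot fraction, so (1 - FHP)(1 - hkf) must equal the target cold-traffic share (1 - htf); solving gives FHP = (htf - hkf) / (1 - hkf). A quick numeric check with illustrative values (not taken from any shipped test file):

```cpp
#include <cassert>
#include <cmath>

int main() {
	const double hkf = 0.1; // hotKeyFraction (illustrative)
	const double htf = 0.5; // hotTrafficFraction (illustrative)
	const double fhp = (htf - hkf) / (1 - hkf); // forceHotProbability = 4/9
	// P(cold key) = P(no redirect) * P(uniform draw is cold) must match 1 - htf.
	assert(std::fabs((1 - fhp) * (1 - hkf) - (1 - htf)) < 1e-12);
	return 0;
}
```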
+ // p(Cold key) = (1-FHP) * (1-hkf) + // p(Cold key) = (1-htf) + // solving for FHP gives: + forceHotProbability = (hotTrafficFraction - hotKeyFraction) / (1 - hotKeyFraction); + } + } + + std::string description() const override { return descriptionString.toString(); } + + template + void setupTransaction(Trans* tr) { + if (batchPriority) { + tr->setOption(FDBTransactionOptions::PRIORITY_BATCH); + } + } + + void getMetrics(std::vector& m) override { + ReadWriteCommon::getMetrics(m); + if (!rampUpLoad) { + m.emplace_back("Mean Latency (ms)", 1000 * latencies.mean(), Averaged::True); + m.emplace_back("Median Latency (ms, averaged)", 1000 * latencies.median(), Averaged::True); + m.emplace_back("90% Latency (ms, averaged)", 1000 * latencies.percentile(0.90), Averaged::True); + m.emplace_back("98% Latency (ms, averaged)", 1000 * latencies.percentile(0.98), Averaged::True); + m.emplace_back("Max Latency (ms, averaged)", 1000 * latencies.max(), Averaged::True); + + m.emplace_back("Mean Row Read Latency (ms)", 1000 * readLatencies.mean(), Averaged::True); + m.emplace_back("Median Row Read Latency (ms, averaged)", 1000 * readLatencies.median(), Averaged::True); + m.emplace_back("Max Row Read Latency (ms, averaged)", 1000 * readLatencies.max(), Averaged::True); + + m.emplace_back("Mean Total Read Latency (ms)", 1000 * fullReadLatencies.mean(), Averaged::True); + m.emplace_back( + "Median Total Read Latency (ms, averaged)", 1000 * fullReadLatencies.median(), Averaged::True); + m.emplace_back("Max Total Latency (ms, averaged)", 1000 * fullReadLatencies.max(), Averaged::True); + + m.emplace_back("Mean GRV Latency (ms)", 1000 * GRVLatencies.mean(), Averaged::True); + m.emplace_back("Median GRV Latency (ms, averaged)", 1000 * GRVLatencies.median(), Averaged::True); + m.emplace_back("Max GRV Latency (ms, averaged)", 1000 * GRVLatencies.max(), Averaged::True); + + m.emplace_back("Mean Commit Latency (ms)", 1000 * commitLatencies.mean(), Averaged::True); + m.emplace_back("Median Commit Latency (ms, averaged)", 1000 * commitLatencies.median(), Averaged::True); + m.emplace_back("Max Commit Latency (ms, averaged)", 1000 * commitLatencies.max(), Averaged::True); + } + } + + Future start(Database const& cx) override { return _start(cx, this); } + + ACTOR template + static Future readOp(Trans* tr, std::vector keys, ReadWriteWorkload* self, bool shouldRecord) { + if (!keys.size()) + return Void(); + if (!self->dependentReads) { + std::vector> readers; + if (self->rangeReads) { + for (int op = 0; op < keys.size(); op++) { + ++self->totalReadsMetric; + readers.push_back(self->logLatency( + tr->getRange(KeyRangeRef(self->keyForIndex(keys[op]), Key(strinc(self->keyForIndex(keys[op])))), + GetRangeLimits(-1, 80000)), + shouldRecord)); + } + } else { + for (int op = 0; op < keys.size(); op++) { + ++self->totalReadsMetric; + readers.push_back(self->logLatency(tr->get(self->keyForIndex(keys[op])), shouldRecord)); + } + } + wait(waitForAll(readers)); + } else { + state int op; + for (op = 0; op < keys.size(); op++) { + ++self->totalReadsMetric; + wait(self->logLatency(tr->get(self->keyForIndex(keys[op])), shouldRecord)); + } + } + return Void(); + } + + ACTOR static Future _start(Database cx, ReadWriteWorkload* self) { // Read one record from the database to warm the cache of keyServers state std::vector keys; keys.push_back(deterministicRandom()->randomInt64(0, self->nodeCount)); @@ -610,7 +507,7 @@ struct ReadWriteWorkload : KVWorkload { std::vector> clients; if (self->enableReadLatencyLogging) - 
clients.push_back(tracePeriodically(self)); + clients.push_back(self->tracePeriodically()); self->clientBegin = now(); for (int c = 0; c < self->actorCount; c++) { @@ -632,13 +529,6 @@ struct ReadWriteWorkload : KVWorkload { return Void(); } - bool shouldRecord() { return shouldRecord(now()); } - - bool shouldRecord(double checkTime) { - double timeSinceStart = checkTime - clientBegin; - return timeSinceStart >= metricsStart && timeSinceStart < (metricsStart + metricsDuration); - } - int64_t getRandomKey(uint64_t nodeCount) { if (forceHotProbability && deterministicRandom()->random01() < forceHotProbability) return deterministicRandom()->randomInt64(0, nodeCount * hotKeyFraction) / diff --git a/fdbserver/workloads/ReadWriteWorkload.actor.h b/fdbserver/workloads/ReadWriteWorkload.actor.h new file mode 100644 index 0000000000..fe33be0213 --- /dev/null +++ b/fdbserver/workloads/ReadWriteWorkload.actor.h @@ -0,0 +1,171 @@ +/* + * ReadWriteWorkload.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_READWRITEWORKLOAD_ACTOR_G_H) +#define FDBSERVER_READWRITEWORKLOAD_ACTOR_G_H +#include "fdbserver/workloads/ReadWriteWorkload.actor.g.h" +#elif !defined(FDBSERVER_READWRITEWORKLOAD_ACTOR_H) +#define FDBSERVER_READWRITEWORKLOAD_ACTOR_H + +#include "fdbserver/workloads/workloads.actor.h" +#include "flow/TDMetric.actor.h" +#include "flow/actorcompiler.h" // This must be the last #include. 
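A note on getRandomKey(), whose final line is truncated above: the hot branch draws an index from the first hotKeyFraction of the keyspace and then rescales it, so hot keys end up spread evenly across the keyspace rather than packed at the front. The standalone sketch below captures that behavior; the division by hotKeyFraction is inferred from the truncated line and the surrounding intent, and std::mt19937_64 stands in for FDB's deterministicRandom():

```cpp
#include <cstdint>
#include <iostream>
#include <random>

int64_t getRandomKey(uint64_t nodeCount, double hotKeyFraction, double forceHotProbability, std::mt19937_64& rng) {
	std::uniform_real_distribution<double> coin(0.0, 1.0);
	if (forceHotProbability > 0 && coin(rng) < forceHotProbability) {
		// Hot access: pick one of the nodeCount * hotKeyFraction hot indices...
		const int64_t hotCount = static_cast<int64_t>(nodeCount * hotKeyFraction);
		std::uniform_int_distribution<int64_t> hot(0, hotCount - 1);
		// ...then rescale so the hot keys are spaced out over the whole keyspace.
		return static_cast<int64_t>(hot(rng) / hotKeyFraction);
	}
	std::uniform_int_distribution<int64_t> uniform(0, static_cast<int64_t>(nodeCount) - 1);
	return uniform(rng);
}

int main() {
	std::mt19937_64 rng(42);
	for (int i = 0; i < 5; i++)
		std::cout << getRandomKey(100000, 0.1, 0.44, rng) << "\n";
	return 0;
}
```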
+DESCR struct TransactionSuccessMetric {
+	int64_t totalLatency; // ns
+	int64_t startLatency; // ns
+	int64_t commitLatency; // ns
+	int64_t retries; // count
+};
+
+DESCR struct TransactionFailureMetric {
+	int64_t startLatency; // ns
+	int64_t errorCode; // flow error code
+};
+
+DESCR struct ReadMetric {
+	int64_t readLatency; // ns
+};
+
+// Common ReadWrite test settings
+struct ReadWriteCommon : KVWorkload {
+	static constexpr int sampleSize = 10000;
+	friend struct ReadWriteCommonImpl;
+
+	// general test setting
+	Standalone<StringRef> descriptionString;
+	bool doSetup, cancelWorkersAtDuration;
+	double testDuration, transactionsPerSecond, warmingDelay, maxInsertRate, debugInterval, debugTime;
+	double metricsStart, metricsDuration;
+	std::vector<uint64_t> insertionCountsToMeasure; // measure the speed of sequential insertion when bulkSetup
+
+	// test log setting
+	bool enableReadLatencyLogging;
+	double periodicLoggingInterval;
+
+	// two types of transactions
+	int readsPerTransactionA, writesPerTransactionA;
+	int readsPerTransactionB, writesPerTransactionB;
+	std::string valueString;
+	double alpha; // probability for run TransactionA type
+	// transaction setting
+	bool useRYW;
+
+	// states of metric
+	Int64MetricHandle totalReadsMetric;
+	Int64MetricHandle totalRetriesMetric;
+	EventMetricHandle<TransactionSuccessMetric> transactionSuccessMetric;
+	EventMetricHandle<TransactionFailureMetric> transactionFailureMetric;
+	EventMetricHandle<ReadMetric> readMetric;
+	PerfIntCounter aTransactions, bTransactions, retries;
+	ContinuousSample<double> latencies, readLatencies, commitLatencies, GRVLatencies, fullReadLatencies;
+	double readLatencyTotal;
+	int readLatencyCount;
+	std::vector<PerfMetric> periodicMetrics;
+	std::vector<std::pair<uint64_t, double>> ratesAtKeyCounts; // sequential insertion speed
+
+	// other internal states
+	std::vector<Future<Void>> clients;
+	double loadTime, clientBegin;
+
+	explicit ReadWriteCommon(WorkloadContext const& wcx)
+	  : KVWorkload(wcx), totalReadsMetric(LiteralStringRef("ReadWrite.TotalReads")),
+	    totalRetriesMetric(LiteralStringRef("ReadWrite.TotalRetries")), aTransactions("A Transactions"),
+	    bTransactions("B Transactions"), retries("Retries"), latencies(sampleSize), readLatencies(sampleSize),
+	    commitLatencies(sampleSize), GRVLatencies(sampleSize), fullReadLatencies(sampleSize), readLatencyTotal(0),
+	    readLatencyCount(0), loadTime(0.0), clientBegin(0) {
+
+		transactionSuccessMetric.init(LiteralStringRef("ReadWrite.SuccessfulTransaction"));
+		transactionFailureMetric.init(LiteralStringRef("ReadWrite.FailedTransaction"));
+		readMetric.init(LiteralStringRef("ReadWrite.Read"));
+
+		testDuration = getOption(options, LiteralStringRef("testDuration"), 10.0);
+		transactionsPerSecond = getOption(options, LiteralStringRef("transactionsPerSecond"), 5000.0) / clientCount;
+		double allowedLatency = getOption(options, LiteralStringRef("allowedLatency"), 0.250);
+		actorCount = ceil(transactionsPerSecond * allowedLatency);
+		actorCount = getOption(options, LiteralStringRef("actorCountPerTester"), actorCount);
+
+		readsPerTransactionA = getOption(options, LiteralStringRef("readsPerTransactionA"), 10);
+		writesPerTransactionA = getOption(options, LiteralStringRef("writesPerTransactionA"), 0);
+		readsPerTransactionB = getOption(options, LiteralStringRef("readsPerTransactionB"), 1);
+		writesPerTransactionB = getOption(options, LiteralStringRef("writesPerTransactionB"), 9);
+		alpha = getOption(options, LiteralStringRef("alpha"), 0.1);
+
+		valueString = std::string(maxValueBytes, '.');
+		if (nodePrefix > 0) {
+			keyBytes += 16;
+		}
+
+		metricsStart = getOption(options, LiteralStringRef("metricsStart"),
0.0); + metricsDuration = getOption(options, LiteralStringRef("metricsDuration"), testDuration); + if (getOption(options, LiteralStringRef("discardEdgeMeasurements"), true)) { + // discardEdgeMeasurements keeps the metrics from the middle 3/4 of the test + metricsStart += testDuration * 0.125; + metricsDuration *= 0.75; + } + + warmingDelay = getOption(options, LiteralStringRef("warmingDelay"), 0.0); + maxInsertRate = getOption(options, LiteralStringRef("maxInsertRate"), 1e12); + debugInterval = getOption(options, LiteralStringRef("debugInterval"), 0.0); + debugTime = getOption(options, LiteralStringRef("debugTime"), 0.0); + enableReadLatencyLogging = getOption(options, LiteralStringRef("enableReadLatencyLogging"), false); + periodicLoggingInterval = getOption(options, LiteralStringRef("periodicLoggingInterval"), 5.0); + cancelWorkersAtDuration = getOption(options, LiteralStringRef("cancelWorkersAtDuration"), true); + + useRYW = getOption(options, LiteralStringRef("useRYW"), false); + doSetup = getOption(options, LiteralStringRef("setup"), true); + + // Validate that keyForIndex() is monotonic + for (int i = 0; i < 30; i++) { + int64_t a = deterministicRandom()->randomInt64(0, nodeCount); + int64_t b = deterministicRandom()->randomInt64(0, nodeCount); + if (a > b) { + std::swap(a, b); + } + ASSERT(a <= b); + ASSERT((keyForIndex(a, false) <= keyForIndex(b, false))); + } + + std::vector insertionCountsToMeasureString = + getOption(options, LiteralStringRef("insertionCountsToMeasure"), std::vector()); + for (int i = 0; i < insertionCountsToMeasureString.size(); i++) { + try { + uint64_t count = boost::lexical_cast(insertionCountsToMeasureString[i]); + insertionCountsToMeasure.push_back(count); + } catch (...) { + } + } + } + + Future tracePeriodically(); + Future logLatency(Future> f, bool shouldRecord); + Future logLatency(Future f, bool shouldRecord); + + Future setup(Database const& cx) override; + Future check(Database const& cx) override; + void getMetrics(std::vector& m) override; + + Standalone operator()(uint64_t n); + bool shouldRecord(double checkTime = now()); + Value randomValue(); +}; + +#include "flow/unactorcompiler.h" +#endif // FDBSERVER_READWRITEWORKLOAD_ACTOR_H diff --git a/fdbserver/workloads/SkewedReadWrite.actor.cpp b/fdbserver/workloads/SkewedReadWrite.actor.cpp index 6ea9d9b8aa..78576f957f 100644 --- a/fdbserver/workloads/SkewedReadWrite.actor.cpp +++ b/fdbserver/workloads/SkewedReadWrite.actor.cpp @@ -28,48 +28,13 @@ #include "fdbserver/WorkerInterface.actor.h" #include "fdbserver/workloads/workloads.actor.h" #include "fdbserver/workloads/BulkSetup.actor.h" +#include "fdbserver/workloads/ReadWriteWorkload.actor.h" #include "fdbclient/ReadYourWrites.h" #include "flow/TDMetric.actor.h" #include "fdbclient/RunTransaction.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. 
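One detail of the shared constructor above is easy to misread: with discardEdgeMeasurements left at its default of true, the workload reports metrics only for the middle 3/4 of the run, trimming the warm-up and wind-down edges. Worked through with an illustrative duration (not a value from any test file):

```cpp
#include <iostream>

int main() {
	double testDuration = 100.0;
	double metricsStart = 0.0;
	double metricsDuration = testDuration;
	// discardEdgeMeasurements keeps the metrics from the middle 3/4 of the test
	metricsStart += testDuration * 0.125; // 12.5
	metricsDuration *= 0.75;              // 75.0
	// Measured window: [12.5, 87.5) of a 100-second run.
	std::cout << "[" << metricsStart << ", " << metricsStart + metricsDuration << ")\n";
	return 0;
}
```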
-const int sampleSize = 10000; -DESCR struct TransactionSuccessMetric { - int64_t totalLatency; // ns - int64_t startLatency; // ns - int64_t commitLatency; // ns - int64_t retries; // count -}; - -DESCR struct TransactionFailureMetric { - int64_t startLatency; // ns - int64_t errorCode; // flow error code -}; - -DESCR struct ReadMetric { - int64_t readLatency; // ns -}; - -struct SkewedReadWriteWorkload : KVWorkload { - // general test setting - Standalone descriptionString; - bool doSetup, cancelWorkersAtDuration; - double testDuration, transactionsPerSecond, warmingDelay, maxInsertRate, debugInterval, debugTime; - double metricsStart, metricsDuration; - std::vector insertionCountsToMeasure; // measure the speed of sequential insertion when bulkSetup - - // test log setting - bool enableReadLatencyLogging; - double periodicLoggingInterval; - - // transaction setting - bool useRYW; - double alpha; // probability for run TransactionA type - // two type of transaction - int readsPerTransactionA, writesPerTransactionA; - int readsPerTransactionB, writesPerTransactionB; - std::string valueString; - +struct SkewedReadWriteWorkload : ReadWriteCommon { // server based hot traffic setting int skewRound = 0; // skewDuration = ceil(testDuration / skewRound) double hotServerFraction = 0, hotServerShardFraction = 1.0; // set > 0 to issue hot key based on shard map @@ -83,184 +48,20 @@ struct SkewedReadWriteWorkload : KVWorkload { std::map serverInterfaces; int hotServerCount = 0, currentHotRound = -1; - // states of metric - Int64MetricHandle totalReadsMetric; - Int64MetricHandle totalRetriesMetric; - EventMetricHandle transactionSuccessMetric; - EventMetricHandle transactionFailureMetric; - EventMetricHandle readMetric; - PerfIntCounter aTransactions, bTransactions, retries; - ContinuousSample latencies, readLatencies, commitLatencies, GRVLatencies, fullReadLatencies; - double readLatencyTotal; - int readLatencyCount; - std::vector periodicMetrics; - std::vector> ratesAtKeyCounts; // sequential insertion speed - - // other internal states - std::vector> clients; - double loadTime, clientBegin; - - SkewedReadWriteWorkload(WorkloadContext const& wcx) - : KVWorkload(wcx), totalReadsMetric(LiteralStringRef("RWWorkload.TotalReads")), - totalRetriesMetric(LiteralStringRef("RWWorkload.TotalRetries")), aTransactions("A Transactions"), - bTransactions("B Transactions"), retries("Retries"), latencies(sampleSize), readLatencies(sampleSize), - commitLatencies(sampleSize), GRVLatencies(sampleSize), fullReadLatencies(sampleSize), readLatencyTotal(0), - readLatencyCount(0), loadTime(0.0), clientBegin(0) { - - transactionSuccessMetric.init(LiteralStringRef("RWWorkload.SuccessfulTransaction")); - transactionFailureMetric.init(LiteralStringRef("RWWorkload.FailedTransaction")); - readMetric.init(LiteralStringRef("RWWorkload.Read")); - - testDuration = getOption(options, LiteralStringRef("testDuration"), 10.0); - transactionsPerSecond = getOption(options, LiteralStringRef("transactionsPerSecond"), 5000.0) / clientCount; - double allowedLatency = getOption(options, LiteralStringRef("allowedLatency"), 0.250); - actorCount = ceil(transactionsPerSecond * allowedLatency); - actorCount = getOption(options, LiteralStringRef("actorCountPerTester"), actorCount); - - readsPerTransactionA = getOption(options, LiteralStringRef("readsPerTransactionA"), 10); - writesPerTransactionA = getOption(options, LiteralStringRef("writesPerTransactionA"), 0); - readsPerTransactionB = getOption(options, LiteralStringRef("readsPerTransactionB"), 
1); - writesPerTransactionB = getOption(options, LiteralStringRef("writesPerTransactionB"), 9); - alpha = getOption(options, LiteralStringRef("alpha"), 0.1); - - valueString = std::string(maxValueBytes, '.'); - if (nodePrefix > 0) { - keyBytes += 16; - } - - metricsStart = getOption(options, LiteralStringRef("metricsStart"), 0.0); - metricsDuration = getOption(options, LiteralStringRef("metricsDuration"), testDuration); - if (getOption(options, LiteralStringRef("discardEdgeMeasurements"), true)) { - // discardEdgeMeasurements keeps the metrics from the middle 3/4 of the test - metricsStart += testDuration * 0.125; - metricsDuration *= 0.75; - } - - warmingDelay = getOption(options, LiteralStringRef("warmingDelay"), 0.0); - maxInsertRate = getOption(options, LiteralStringRef("maxInsertRate"), 1e12); - debugInterval = getOption(options, LiteralStringRef("debugInterval"), 0.0); - debugTime = getOption(options, LiteralStringRef("debugTime"), 0.0); - enableReadLatencyLogging = getOption(options, LiteralStringRef("enableReadLatencyLogging"), false); - periodicLoggingInterval = getOption(options, LiteralStringRef("periodicLoggingInterval"), 5.0); - cancelWorkersAtDuration = getOption(options, LiteralStringRef("cancelWorkersAtDuration"), true); - useRYW = getOption(options, LiteralStringRef("useRYW"), false); - doSetup = getOption(options, LiteralStringRef("setup"), true); + SkewedReadWriteWorkload(WorkloadContext const& wcx) : ReadWriteCommon(wcx) { descriptionString = getOption(options, LiteralStringRef("description"), LiteralStringRef("SkewedReadWrite")); - - // Validate that keyForIndex() is monotonic - for (int i = 0; i < 30; i++) { - int64_t a = deterministicRandom()->randomInt64(0, nodeCount); - int64_t b = deterministicRandom()->randomInt64(0, nodeCount); - if (a > b) { - std::swap(a, b); - } - ASSERT(a <= b); - ASSERT((keyForIndex(a, false) <= keyForIndex(b, false))); - } - - std::vector insertionCountsToMeasureString = - getOption(options, LiteralStringRef("insertionCountsToMeasure"), std::vector()); - for (int i = 0; i < insertionCountsToMeasureString.size(); i++) { - try { - uint64_t count = boost::lexical_cast(insertionCountsToMeasureString[i]); - insertionCountsToMeasure.push_back(count); - } catch (...) 
{ - } - } - - { - hotServerFraction = getOption(options, "hotServerFraction"_sr, 0.2); - hotServerShardFraction = getOption(options, "hotServerShardFraction"_sr, 1.0); - hotReadWriteServerOverlap = getOption(options, "hotReadWriteServerOverlap"_sr, 0.0); - skewRound = getOption(options, "skewRound"_sr, 1); - hotServerReadFrac = getOption(options, "hotServerReadFrac"_sr, 0.8); - hotServerWriteFrac = getOption(options, "hotServerWriteFrac"_sr, 0.0); - ASSERT((hotServerReadFrac >= hotServerFraction || hotServerWriteFrac >= hotServerFraction) && - skewRound > 0); - } + hotServerFraction = getOption(options, "hotServerFraction"_sr, 0.2); + hotServerShardFraction = getOption(options, "hotServerShardFraction"_sr, 1.0); + hotReadWriteServerOverlap = getOption(options, "hotReadWriteServerOverlap"_sr, 0.0); + skewRound = getOption(options, "skewRound"_sr, 1); + hotServerReadFrac = getOption(options, "hotServerReadFrac"_sr, 0.8); + hotServerWriteFrac = getOption(options, "hotServerWriteFrac"_sr, 0.0); + ASSERT((hotServerReadFrac >= hotServerFraction || hotServerWriteFrac >= hotServerFraction) && skewRound > 0); } std::string description() const override { return descriptionString.toString(); } - Future setup(Database const& cx) override { return _setup(cx, this); } Future start(Database const& cx) override { return _start(cx, this); } - ACTOR static Future traceDumpWorkers(Reference const> db) { - try { - loop { - choose { - when(wait(db->onChange())) {} - - when(ErrorOr> workerList = - wait(db->get().clusterInterface.getWorkers.tryGetReply(GetWorkersRequest()))) { - if (workerList.present()) { - std::vector>> dumpRequests; - dumpRequests.reserve(workerList.get().size()); - for (int i = 0; i < workerList.get().size(); i++) - dumpRequests.push_back(workerList.get()[i].interf.traceBatchDumpRequest.tryGetReply( - TraceBatchDumpRequest())); - wait(waitForAll(dumpRequests)); - return true; - } - wait(delay(1.0)); - } - } - } - } catch (Error& e) { - TraceEvent(SevError, "FailedToDumpWorkers").error(e); - throw; - } - } - - Future check(Database const& cx) override { - clients.clear(); - - if (!cancelWorkersAtDuration && now() < metricsStart + metricsDuration) - metricsDuration = now() - metricsStart; - - g_traceBatch.dump(); - if (clientId == 0) - return traceDumpWorkers(dbInfo); - else - return true; - } - - void getMetrics(std::vector& m) override { - double duration = metricsDuration; - int reads = - (aTransactions.getValue() * readsPerTransactionA) + (bTransactions.getValue() * readsPerTransactionB); - int writes = - (aTransactions.getValue() * writesPerTransactionA) + (bTransactions.getValue() * writesPerTransactionB); - m.emplace_back("Measured Duration", duration, Averaged::True); - m.emplace_back( - "Transactions/sec", (aTransactions.getValue() + bTransactions.getValue()) / duration, Averaged::False); - m.emplace_back("Operations/sec", ((reads + writes) / duration), Averaged::False); - m.push_back(aTransactions.getMetric()); - m.push_back(bTransactions.getMetric()); - m.push_back(retries.getMetric()); - m.emplace_back("Mean load time (seconds)", loadTime, Averaged::True); - m.emplace_back("Read rows", reads, Averaged::False); - m.emplace_back("Write rows", writes, Averaged::False); - m.emplace_back("Read rows/sec", reads / duration, Averaged::False); - m.emplace_back("Write rows/sec", writes / duration, Averaged::False); - m.emplace_back( - "Bytes read/sec", (reads * (keyBytes + (minValueBytes + maxValueBytes) * 0.5)) / duration, Averaged::False); - m.emplace_back("Bytes written/sec", - (writes * 
(keyBytes + (minValueBytes + maxValueBytes) * 0.5)) / duration, - Averaged::False); - m.insert(m.end(), periodicMetrics.begin(), periodicMetrics.end()); - - std::vector>::iterator ratesItr = ratesAtKeyCounts.begin(); - for (; ratesItr != ratesAtKeyCounts.end(); ratesItr++) - m.emplace_back(format("%lld keys imported bytes/sec", ratesItr->first), ratesItr->second, Averaged::False); - } - - Value randomValue() { - return StringRef((uint8_t*)valueString.c_str(), - deterministicRandom()->randomInt(minValueBytes, maxValueBytes + 1)); - } - - Standalone operator()(uint64_t n) { return KeyValueRef(keyForIndex(n, false), randomValue()); } - void debugPrintServerShards() const { std::cout << std::hex; for (auto it : this->serverShards) { @@ -375,164 +176,6 @@ struct SkewedReadWriteWorkload : KVWorkload { return Void(); } - ACTOR static Future tracePeriodically(SkewedReadWriteWorkload* self) { - state double start = now(); - state double elapsed = 0.0; - state int64_t last_ops = 0; - - loop { - elapsed += self->periodicLoggingInterval; - wait(delayUntil(start + elapsed)); - - TraceEvent((self->description() + "_RowReadLatency").c_str()) - .detail("Mean", self->readLatencies.mean()) - .detail("Median", self->readLatencies.median()) - .detail("Percentile5", self->readLatencies.percentile(.05)) - .detail("Percentile95", self->readLatencies.percentile(.95)) - .detail("Percentile99", self->readLatencies.percentile(.99)) - .detail("Percentile99_9", self->readLatencies.percentile(.999)) - .detail("Max", self->readLatencies.max()) - .detail("Count", self->readLatencyCount) - .detail("Elapsed", elapsed); - - TraceEvent((self->description() + "_GRVLatency").c_str()) - .detail("Mean", self->GRVLatencies.mean()) - .detail("Median", self->GRVLatencies.median()) - .detail("Percentile5", self->GRVLatencies.percentile(.05)) - .detail("Percentile95", self->GRVLatencies.percentile(.95)) - .detail("Percentile99", self->GRVLatencies.percentile(.99)) - .detail("Percentile99_9", self->GRVLatencies.percentile(.999)) - .detail("Max", self->GRVLatencies.max()); - - TraceEvent((self->description() + "_CommitLatency").c_str()) - .detail("Mean", self->commitLatencies.mean()) - .detail("Median", self->commitLatencies.median()) - .detail("Percentile5", self->commitLatencies.percentile(.05)) - .detail("Percentile95", self->commitLatencies.percentile(.95)) - .detail("Percentile99", self->commitLatencies.percentile(.99)) - .detail("Percentile99_9", self->commitLatencies.percentile(.999)) - .detail("Max", self->commitLatencies.max()); - - TraceEvent((self->description() + "_TotalLatency").c_str()) - .detail("Mean", self->latencies.mean()) - .detail("Median", self->latencies.median()) - .detail("Percentile5", self->latencies.percentile(.05)) - .detail("Percentile95", self->latencies.percentile(.95)) - .detail("Percentile99", self->latencies.percentile(.99)) - .detail("Percentile99_9", self->latencies.percentile(.999)) - .detail("Max", self->latencies.max()); - - int64_t ops = - (self->aTransactions.getValue() * (self->readsPerTransactionA + self->writesPerTransactionA)) + - (self->bTransactions.getValue() * (self->readsPerTransactionB + self->writesPerTransactionB)); - bool recordBegin = self->shouldRecord(std::max(now() - self->periodicLoggingInterval, self->clientBegin)); - bool recordEnd = self->shouldRecord(now()); - if (recordBegin && recordEnd) { - std::string ts = format("T=%04.0fs:", elapsed); - self->periodicMetrics.emplace_back( - ts + "Operations/sec", (ops - last_ops) / self->periodicLoggingInterval, Averaged::False); - - 
// if(self->rampUpLoad) { - self->periodicMetrics.emplace_back( - ts + "Mean Latency (ms)", 1000 * self->latencies.mean(), Averaged::True); - self->periodicMetrics.emplace_back( - ts + "Median Latency (ms, averaged)", 1000 * self->latencies.median(), Averaged::True); - self->periodicMetrics.emplace_back( - ts + "5% Latency (ms, averaged)", 1000 * self->latencies.percentile(.05), Averaged::True); - self->periodicMetrics.emplace_back( - ts + "95% Latency (ms, averaged)", 1000 * self->latencies.percentile(.95), Averaged::True); - - self->periodicMetrics.emplace_back( - ts + "Mean Row Read Latency (ms)", 1000 * self->readLatencies.mean(), Averaged::True); - self->periodicMetrics.emplace_back( - ts + "Median Row Read Latency (ms, averaged)", 1000 * self->readLatencies.median(), Averaged::True); - self->periodicMetrics.emplace_back(ts + "5% Row Read Latency (ms, averaged)", - 1000 * self->readLatencies.percentile(.05), - Averaged::True); - self->periodicMetrics.emplace_back(ts + "95% Row Read Latency (ms, averaged)", - 1000 * self->readLatencies.percentile(.95), - Averaged::True); - - self->periodicMetrics.emplace_back( - ts + "Mean Total Read Latency (ms)", 1000 * self->fullReadLatencies.mean(), Averaged::True); - self->periodicMetrics.emplace_back(ts + "Median Total Read Latency (ms, averaged)", - 1000 * self->fullReadLatencies.median(), - Averaged::True); - self->periodicMetrics.emplace_back(ts + "5% Total Read Latency (ms, averaged)", - 1000 * self->fullReadLatencies.percentile(.05), - Averaged::True); - self->periodicMetrics.emplace_back(ts + "95% Total Read Latency (ms, averaged)", - 1000 * self->fullReadLatencies.percentile(.95), - Averaged::True); - - self->periodicMetrics.emplace_back( - ts + "Mean GRV Latency (ms)", 1000 * self->GRVLatencies.mean(), Averaged::True); - self->periodicMetrics.emplace_back( - ts + "Median GRV Latency (ms, averaged)", 1000 * self->GRVLatencies.median(), Averaged::True); - self->periodicMetrics.emplace_back( - ts + "5% GRV Latency (ms, averaged)", 1000 * self->GRVLatencies.percentile(.05), Averaged::True); - self->periodicMetrics.emplace_back( - ts + "95% GRV Latency (ms, averaged)", 1000 * self->GRVLatencies.percentile(.95), Averaged::True); - - self->periodicMetrics.emplace_back( - ts + "Mean Commit Latency (ms)", 1000 * self->commitLatencies.mean(), Averaged::True); - self->periodicMetrics.emplace_back( - ts + "Median Commit Latency (ms, averaged)", 1000 * self->commitLatencies.median(), Averaged::True); - self->periodicMetrics.emplace_back(ts + "5% Commit Latency (ms, averaged)", - 1000 * self->commitLatencies.percentile(.05), - Averaged::True); - self->periodicMetrics.emplace_back(ts + "95% Commit Latency (ms, averaged)", - 1000 * self->commitLatencies.percentile(.95), - Averaged::True); - //} - - self->periodicMetrics.emplace_back( - ts + "Max Latency (ms, averaged)", 1000 * self->latencies.max(), Averaged::True); - self->periodicMetrics.emplace_back( - ts + "Max Row Read Latency (ms, averaged)", 1000 * self->readLatencies.max(), Averaged::True); - self->periodicMetrics.emplace_back( - ts + "Max Total Read Latency (ms, averaged)", 1000 * self->fullReadLatencies.max(), Averaged::True); - self->periodicMetrics.emplace_back( - ts + "Max GRV Latency (ms, averaged)", 1000 * self->GRVLatencies.max(), Averaged::True); - self->periodicMetrics.emplace_back( - ts + "Max Commit Latency (ms, averaged)", 1000 * self->commitLatencies.max(), Averaged::True); - } - last_ops = ops; - - // if(self->rampUpLoad) { - self->latencies.clear(); - self->readLatencies.clear(); 
- self->fullReadLatencies.clear(); - self->GRVLatencies.clear(); - self->commitLatencies.clear(); - //} - - self->readLatencyTotal = 0.0; - self->readLatencyCount = 0; - } - } - - ACTOR static Future logLatency(Future> f, - ContinuousSample* latencies, - double* totalLatency, - int* latencyCount, - EventMetricHandle readMetric, - bool shouldRecord) { - state double readBegin = now(); - Optional value = wait(f); - - double latency = now() - readBegin; - readMetric->readLatency = latency * 1e9; - readMetric->log(); - - if (shouldRecord) { - *totalLatency += latency; - ++*latencyCount; - latencies->addSample(latency); - } - return Void(); - } - ACTOR template Future readOp(Trans* tr, std::vector keys, SkewedReadWriteWorkload* self, bool shouldRecord) { if (!keys.size()) @@ -541,41 +184,13 @@ struct SkewedReadWriteWorkload : KVWorkload { std::vector> readers; for (int op = 0; op < keys.size(); op++) { ++self->totalReadsMetric; - readers.push_back(logLatency(tr->get(self->keyForIndex(keys[op])), - &self->readLatencies, - &self->readLatencyTotal, - &self->readLatencyCount, - self->readMetric, - shouldRecord)); + readers.push_back(self->logLatency(tr->get(self->keyForIndex(keys[op])), shouldRecord)); } wait(waitForAll(readers)); return Void(); } - ACTOR static Future _setup(Database cx, SkewedReadWriteWorkload* self) { - if (!self->doSetup) - return Void(); - - state Promise loadTime; - state Promise>> ratesAtKeyCounts; - - wait(bulkSetup(cx, - self, - self->nodeCount, - loadTime, - self->insertionCountsToMeasure.empty(), - self->warmingDelay, - self->maxInsertRate, - self->insertionCountsToMeasure, - ratesAtKeyCounts)); - - self->loadTime = loadTime.getFuture().get(); - self->ratesAtKeyCounts = ratesAtKeyCounts.getFuture().get(); - - return Void(); - } - void startReadWriteClients(Database cx, std::vector>& clients) { clientBegin = now(); for (int c = 0; c < actorCount; c++) { @@ -592,7 +207,7 @@ struct SkewedReadWriteWorkload : KVWorkload { ACTOR static Future _start(Database cx, SkewedReadWriteWorkload* self) { state std::vector> clients; if (self->enableReadLatencyLogging) - clients.push_back(tracePeriodically(self)); + clients.push_back(self->tracePeriodically()); wait(updateServerShards(cx, self)); for (self->currentHotRound = 0; self->currentHotRound < self->skewRound; ++self->currentHotRound) { @@ -606,13 +221,6 @@ struct SkewedReadWriteWorkload : KVWorkload { return Void(); } - bool shouldRecord() { return shouldRecord(now()); } - - bool shouldRecord(double checkTime) { - double timeSinceStart = checkTime - clientBegin; - return timeSinceStart >= metricsStart && timeSinceStart < (metricsStart + metricsDuration); - } - // calculate hot server count void setHotServers() { hotServerCount = ceil(hotServerFraction * serverShards.size()); From 5308a2727f956a2eb11c0e6690422466a89ba18a Mon Sep 17 00:00:00 2001 From: Junhyun Shim Date: Mon, 16 May 2022 11:58:42 +0200 Subject: [PATCH 214/299] Fix failing Mac build from boringssl X509V3_EXT_conf_nid is part of 'libdecrepit' in BoringSSL. Use X509V3_EXT_nconf_nid instead. 
--- flow/MkCert.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flow/MkCert.cpp b/flow/MkCert.cpp index fa11aab603..df06227722 100644 --- a/flow/MkCert.cpp +++ b/flow/MkCert.cpp @@ -260,9 +260,9 @@ CertAndKeyNative makeCertNative(CertSpecRef spec, CertAndKeyNative issuer) { throw tls_error(); } #ifdef OPENSSL_IS_BORINGSSL - auto ext = ::X509V3_EXT_conf_nid(nullptr, &ctx, extNid, const_cast(extValue.c_str())); + auto ext = ::X509V3_EXT_nconf_nid(nullptr, &ctx, extNid, const_cast(extValue.c_str())); #else - auto ext = ::X509V3_EXT_conf_nid(nullptr, &ctx, extNid, extValue.c_str()); + auto ext = ::X509V3_EXT_nconf_nid(nullptr, &ctx, extNid, extValue.c_str()); #endif OSSL_ASSERT(ext); auto extGuard = ScopeExit([ext]() { ::X509_EXTENSION_free(ext); }); From 89c59012282e4c2650cf13b5ec4275ad45ca3a6c Mon Sep 17 00:00:00 2001 From: Clement Pang Date: Tue, 10 May 2022 16:12:49 -0700 Subject: [PATCH 215/299] make MappedKeyValue constructor public --- .../java/src/main/com/apple/foundationdb/MappedKeyValue.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/java/src/main/com/apple/foundationdb/MappedKeyValue.java b/bindings/java/src/main/com/apple/foundationdb/MappedKeyValue.java index 3e66a91b84..3aa55013e3 100644 --- a/bindings/java/src/main/com/apple/foundationdb/MappedKeyValue.java +++ b/bindings/java/src/main/com/apple/foundationdb/MappedKeyValue.java @@ -32,7 +32,7 @@ public class MappedKeyValue extends KeyValue { private final byte[] rangeEnd; private final List rangeResult; - MappedKeyValue(byte[] key, byte[] value, byte[] rangeBegin, byte[] rangeEnd, List rangeResult) { + public MappedKeyValue(byte[] key, byte[] value, byte[] rangeBegin, byte[] rangeEnd, List rangeResult) { super(key, value); this.rangeBegin = rangeBegin; this.rangeEnd = rangeEnd; From 5e01c171be7ea104d52df5898ab8ccd8345cc60a Mon Sep 17 00:00:00 2001 From: imperatorx Date: Mon, 16 May 2022 20:02:36 +0200 Subject: [PATCH 216/299] Update KeyArrayResult.java Add missing getter --- .../java/src/main/com/apple/foundationdb/KeyArrayResult.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bindings/java/src/main/com/apple/foundationdb/KeyArrayResult.java b/bindings/java/src/main/com/apple/foundationdb/KeyArrayResult.java index 174bc89b19..26ceae306e 100644 --- a/bindings/java/src/main/com/apple/foundationdb/KeyArrayResult.java +++ b/bindings/java/src/main/com/apple/foundationdb/KeyArrayResult.java @@ -41,4 +41,8 @@ public class KeyArrayResult { keys.add(key); } } + + public List getKeys() { + return keys; + } } From 7d785fe6ba285aeb16895edb5c91744b3f2c426e Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Thu, 5 May 2022 15:56:16 -0700 Subject: [PATCH 217/299] Add number of tenants to status --- fdbserver/ProxyCommitData.actor.h | 18 ++++++++++-------- fdbserver/Status.actor.cpp | 22 ++++++++++++++++++++-- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/fdbserver/ProxyCommitData.actor.h b/fdbserver/ProxyCommitData.actor.h index c5e523dd3e..01d129e419 100644 --- a/fdbserver/ProxyCommitData.actor.h +++ b/fdbserver/ProxyCommitData.actor.h @@ -104,7 +104,8 @@ struct ProxyStats { explicit ProxyStats(UID id, NotifiedVersion* pVersion, NotifiedVersion* pCommittedVersion, - int64_t* commitBatchesMemBytesCountPtr) + int64_t* commitBatchesMemBytesCountPtr, + std::map* tenantMapPtr) : cc("ProxyStats", id.toString()), txnCommitIn("TxnCommitIn", cc), txnCommitVersionAssigned("TxnCommitVersionAssigned", cc), txnCommitResolving("TxnCommitResolving", cc), 
txnCommitResolved("TxnCommitResolved", cc), txnCommitOut("TxnCommitOut", cc), @@ -160,6 +161,7 @@ struct ProxyStats { specialCounter(cc, "CommitBatchesMemBytesCount", [commitBatchesMemBytesCountPtr]() { return *commitBatchesMemBytesCountPtr; }); + specialCounter(cc, "NumTenants", [tenantMapPtr]() { return tenantMapPtr->size(); }); specialCounter(cc, "MaxCompute", [this]() { return this->getAndResetMaxCompute(); }); specialCounter(cc, "MinCompute", [this]() { return this->getAndResetMinCompute(); }); logger = traceCounters("ProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ProxyMetrics"); @@ -169,6 +171,7 @@ struct ProxyStats { struct ProxyCommitData { UID dbgid; int64_t commitBatchesMemBytesCount; + std::map tenantMap; ProxyStats stats; MasterInterface master; std::vector resolvers; @@ -226,8 +229,6 @@ struct ProxyCommitData { UIDTransactionTagMap ssTrTagCommitCost; double lastMasterReset; double lastResolverReset; - - std::map tenantMap; int localTLogCount = -1; // The tag related to a storage server rarely change, so we keep a vector of tags for each key range to be slightly @@ -289,11 +290,12 @@ struct ProxyCommitData { Reference const> db, bool firstProxy) : dbgid(dbgid), commitBatchesMemBytesCount(0), - stats(dbgid, &version, &committedVersion, &commitBatchesMemBytesCount), master(master), logAdapter(nullptr), - txnStateStore(nullptr), committedVersion(recoveryTransactionVersion), minKnownCommittedVersion(0), version(0), - lastVersionTime(0), commitVersionRequestNumber(1), mostRecentProcessedRequestNumber(0), firstProxy(firstProxy), - lastCoalesceTime(0), locked(false), commitBatchInterval(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_MIN), - localCommitBatchesStarted(0), getConsistentReadVersion(getConsistentReadVersion), commit(commit), + stats(dbgid, &version, &committedVersion, &commitBatchesMemBytesCount, &tenantMap), master(master), + logAdapter(nullptr), txnStateStore(nullptr), committedVersion(recoveryTransactionVersion), + minKnownCommittedVersion(0), version(0), lastVersionTime(0), commitVersionRequestNumber(1), + mostRecentProcessedRequestNumber(0), firstProxy(firstProxy), lastCoalesceTime(0), locked(false), + commitBatchInterval(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_MIN), localCommitBatchesStarted(0), + getConsistentReadVersion(getConsistentReadVersion), commit(commit), cx(openDBOnServer(db, TaskPriority::DefaultEndpoint, LockAware::True)), db(db), singleKeyMutationEvent(LiteralStringRef("SingleKeyMutation")), lastTxsPop(0), popRemoteTxs(false), lastStartCommit(0), lastCommitLatency(SERVER_KNOBS->REQUIRED_MIN_RECOVERY_DURATION), lastCommitTime(0), diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 4340ff53bf..c7617ec492 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -2101,6 +2101,7 @@ ACTOR static Future workloadStatusFetcher( WorkerDetails rkWorker, JsonBuilderObject* qos, JsonBuilderObject* data_overlay, + JsonBuilderObject* tenants, std::set* incomplete_reasons, Future>> storageServerFuture) { state JsonBuilderObject statusObj; @@ -2184,6 +2185,12 @@ ACTOR static Future workloadStatusFetcher( transactions["committed"] = txnCommitOutSuccess.getStatus(); statusObj["transactions"] = transactions; + + uint64_t numTenants = 0; + if (commitProxyStats.size() > 0) { + numTenants = commitProxyStats[0].getUint64("NumTenants"); + } + (*tenants)["num_tenants"] = numTenants; } catch (Error& e) { if (e.code() == error_code_actor_cancelled) throw; @@ -2953,6 +2960,7 @@ ACTOR Future clusterGetStatus( state 
std::vector blobWorkers; state JsonBuilderObject qos; state JsonBuilderObject dataOverlay; + state JsonBuilderObject tenants; state JsonBuilderObject storageWiggler; state std::unordered_set wiggleServers; @@ -3040,8 +3048,15 @@ ACTOR Future clusterGetStatus( state Future> primaryDCFO = getActivePrimaryDC(cx, &fullyReplicatedRegions, &messages); state std::vector> futures2; futures2.push_back(dataStatusFetcher(ddWorker, configuration.get(), &minStorageReplicasRemaining)); - futures2.push_back(workloadStatusFetcher( - db, workers, mWorker, rkWorker, &qos, &dataOverlay, &status_incomplete_reasons, storageServerFuture)); + futures2.push_back(workloadStatusFetcher(db, + workers, + mWorker, + rkWorker, + &qos, + &dataOverlay, + &tenants, + &status_incomplete_reasons, + storageServerFuture)); futures2.push_back(layerStatusFetcher(cx, &messages, &status_incomplete_reasons)); futures2.push_back(lockedStatusFetcher(db, &messages, &status_incomplete_reasons)); futures2.push_back( @@ -3113,6 +3128,9 @@ ACTOR Future clusterGetStatus( if (!qos.empty()) statusObj["qos"] = qos; + if (!tenants.empty()) + statusObj["tenants"] = tenants; + // Merge dataOverlay into data JsonBuilderObject& clusterDataSection = workerStatuses[0]; From ebf11d5d485bdedb9266753c3c5f9867bc312e12 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Thu, 12 May 2022 12:09:34 -0700 Subject: [PATCH 218/299] Update schemas with the tenants information --- documentation/sphinx/source/mr-status-json-schemas.rst.inc | 3 +++ fdbclient/Schemas.cpp | 3 +++ 2 files changed, 6 insertions(+) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index dad23ddcaa..af96e3e1f2 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -877,6 +877,9 @@ "logical_core_utilization":0.4 // computed as cpu_seconds / elapsed_seconds; value may be capped at 0.5 due to hyper-threading } } + }, + "tenants":{ + "num_tenants":0 } }, "client":{ diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 8efaf244c3..8293379079 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -927,6 +927,9 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "logical_core_utilization":0.4 } } + }, + "tenants":{ + "num_tenants":0 } }, "client":{ From 7a22a6ea82255cd6c53e16ecac154de4484b4417 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Fri, 13 May 2022 10:12:10 -0700 Subject: [PATCH 219/299] Add a null pointer check for extra safety --- fdbserver/ProxyCommitData.actor.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/ProxyCommitData.actor.h b/fdbserver/ProxyCommitData.actor.h index 01d129e419..b6a75f7ea7 100644 --- a/fdbserver/ProxyCommitData.actor.h +++ b/fdbserver/ProxyCommitData.actor.h @@ -105,7 +105,7 @@ struct ProxyStats { NotifiedVersion* pVersion, NotifiedVersion* pCommittedVersion, int64_t* commitBatchesMemBytesCountPtr, - std::map* tenantMapPtr) + std::map* pTenantMap) : cc("ProxyStats", id.toString()), txnCommitIn("TxnCommitIn", cc), txnCommitVersionAssigned("TxnCommitVersionAssigned", cc), txnCommitResolving("TxnCommitResolving", cc), txnCommitResolved("TxnCommitResolved", cc), txnCommitOut("TxnCommitOut", cc), @@ -161,7 +161,7 @@ struct ProxyStats { specialCounter(cc, "CommitBatchesMemBytesCount", [commitBatchesMemBytesCountPtr]() { return *commitBatchesMemBytesCountPtr; }); - specialCounter(cc, "NumTenants", [tenantMapPtr]() { 
return tenantMapPtr->size(); }); + specialCounter(cc, "NumTenants", [pTenantMap]() { return pTenantMap ? pTenantMap->size() : 0; }); specialCounter(cc, "MaxCompute", [this]() { return this->getAndResetMaxCompute(); }); specialCounter(cc, "MinCompute", [this]() { return this->getAndResetMinCompute(); }); logger = traceCounters("ProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "ProxyMetrics"); From ef0d49eb938ee479aa6a2d6b9b112129bd9d0fdd Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 16 May 2022 11:13:18 -0700 Subject: [PATCH 220/299] Update tests/CMakeLists.txt --- tests/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b409f6ae35..f9695703e6 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -197,7 +197,6 @@ if(WITH_PYTHON) endif() add_fdb_test(TEST_FILES rare/CheckRelocation.toml) add_fdb_test(TEST_FILES rare/ClogUnclog.toml) - add_fdb_test(TEST_FILES rare/ReadSkewReadWrite.toml) add_fdb_test(TEST_FILES rare/CloggedCycleWithKills.toml) add_fdb_test(TEST_FILES rare/ConfigIncrement.toml) add_fdb_test(TEST_FILES rare/ConfigIncrementWithKills.toml) From 290b903825274126b628473005b8e30ec1950d4c Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Mon, 16 May 2022 10:43:59 -0700 Subject: [PATCH 221/299] Do not add tenant-info to json if it cannot be correctly determined --- fdbserver/Status.actor.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index c7617ec492..b5e00bc043 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -2186,11 +2186,9 @@ ACTOR static Future workloadStatusFetcher( statusObj["transactions"] = transactions; - uint64_t numTenants = 0; if (commitProxyStats.size() > 0) { - numTenants = commitProxyStats[0].getUint64("NumTenants"); + (*tenants)["num_tenants"] = commitProxyStats[0].getUint64("NumTenants"); } - (*tenants)["num_tenants"] = numTenants; } catch (Error& e) { if (e.code() == error_code_actor_cancelled) throw; From dc5d2bff069e30aed546ce23cc2601d5d4c4b0e8 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 16 May 2022 11:38:38 -0700 Subject: [PATCH 222/299] make ratekeeper don't limit read rebalance --- fdbserver/DataDistributionQueue.actor.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 5caef85278..3376c550f0 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -1802,12 +1802,15 @@ ACTOR Future BgDDLoadRebalance(DDQueueData* self, int teamCollectionIndex, moved ? 
resetCount = 0 : resetCount++; } - if (now() - (*self->lastLimited) < SERVER_KNOBS->BG_DD_SATURATION_DELAY) { - rebalancePollingInterval = std::min(SERVER_KNOBS->BG_DD_MAX_WAIT, - rebalancePollingInterval * SERVER_KNOBS->BG_DD_INCREASE_RATE); - } else { - rebalancePollingInterval = std::max(SERVER_KNOBS->BG_DD_MIN_WAIT, - rebalancePollingInterval / SERVER_KNOBS->BG_DD_DECREASE_RATE); + // NOTE: We don’t want read rebalancing to be slowed down when Ratekeeper kicks in + if(isDiskRebalancePriority(ddPriority)) { + if (now() - (*self->lastLimited) < SERVER_KNOBS->BG_DD_SATURATION_DELAY) { + rebalancePollingInterval = std::min(SERVER_KNOBS->BG_DD_MAX_WAIT, + rebalancePollingInterval * SERVER_KNOBS->BG_DD_INCREASE_RATE); + } else { + rebalancePollingInterval = std::max(SERVER_KNOBS->BG_DD_MIN_WAIT, + rebalancePollingInterval / SERVER_KNOBS->BG_DD_DECREASE_RATE); + } } if (resetCount >= SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT && From b0ea1a2ce89690797540e076956153e8c017be47 Mon Sep 17 00:00:00 2001 From: Ray Jenkins Date: Mon, 16 May 2022 13:41:30 -0500 Subject: [PATCH 223/299] client_tmp_dir needs a unique code in bindings. Duplicate code caused distributed_client_tracer to break. --- fdbclient/vexillographer/fdb.options | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/vexillographer/fdb.options b/fdbclient/vexillographer/fdb.options index cafa0cc402..57ae180a07 100644 --- a/fdbclient/vexillographer/fdb.options +++ b/fdbclient/vexillographer/fdb.options @@ -134,7 +134,7 @@ description is not currently required but encouraged.