From a023d4994792aeace3380c67dafce9351abf794a Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Thu, 10 Feb 2022 01:36:24 -0800 Subject: [PATCH 001/138] Bug fix in DeltaTree - Insert() of record outside of lower/upper boundaries used for key decoding would calculate incorrect record delta. --- fdbserver/DeltaTree.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index 9ea758aee5..c50004d738 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -1545,7 +1545,17 @@ public: T leftBase = leftBaseIndex == -1 ? cache->lowerBound : get(cache->get(leftBaseIndex)); T rightBase = rightBaseIndex == -1 ? cache->upperBound : get(cache->get(rightBaseIndex)); - int common = leftBase.getCommonPrefixLen(rightBase, skipLen); + // If seek has reached a non-edge node then whatever bytes the left and right bases + // have in common are definitely in common with k. However, for an edge node there + // is no guarantee, as one of the bases will be the lower or upper decode boundary + // and it is possible to add elements to the DeltaTree beyond those boundaries. + int common; + if (leftBaseIndex == -1 || rightBaseIndex == -1) { + common = 0; + } else { + common = leftBase.getCommonPrefixLen(rightBase, skipLen); + } + int commonWithLeftParent = k.getCommonPrefixLen(leftBase, common); int commonWithRightParent = k.getCommonPrefixLen(rightBase, common); bool borrowFromLeft = commonWithLeftParent >= commonWithRightParent; From 42136bf80889a19935986686a3b741620da6cf08 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 14 Feb 2022 22:47:30 -0800 Subject: [PATCH 002/138] Fixed memory leak (and minor performance bug) where RedwoodRecordRef::updateCache() would create the key copy to be placed in the cache but not actually place it into the real cache entry because the cache parameter was accepted by value. 
--- fdbserver/VersionedBTree.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index dde5b8087e..315e311365 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -4091,7 +4091,7 @@ struct RedwoodRecordRef { typedef KeyRef Partial; - void updateCache(Optional cache, Arena& arena) const { cache = KeyRef(arena, key); } + void updateCache(Optional& cache, Arena& arena) const { cache = KeyRef(arena, key); } KeyValueRef toKeyValueRef() const { return KeyValueRef(key, value.get()); } From 31ed478488fc330c7bb34619de220b5e20e2f0ac Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Thu, 17 Feb 2022 16:03:33 -0800 Subject: [PATCH 003/138] Invert sort order of periodic memory usage dumps to make looking at live output easier. --- contrib/alloc_instrumentation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/alloc_instrumentation.py b/contrib/alloc_instrumentation.py index ce54cdc6b9..d6e588d04f 100755 --- a/contrib/alloc_instrumentation.py +++ b/contrib/alloc_instrumentation.py @@ -41,10 +41,10 @@ def print_stacks(stack_count, sort_by_count): sort_dict = counts if sort_by_count else sizes ordered_list = [(val, backtrace) for (backtrace, val) in sort_dict.items()] - ordered_list.sort(reverse=True) + ordered_list.sort() if stack_count: - ordered_list = ordered_list[:stack_count] + ordered_list = ordered_list[-stack_count:] for size, backtrace in ordered_list: print(str.format('bytes={0:<10} count={1:<8} {2}', sizes[backtrace], counts[backtrace], backtrace)) From 072bc86bb1e38a38727419099664c82bb506d03c Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Thu, 17 Feb 2022 16:15:01 -0800 Subject: [PATCH 004/138] Added Redwood knob to reserve part of configured page cache size for Redwood page decode cache state. 
--- fdbclient/ServerKnobs.cpp | 1 + fdbclient/ServerKnobs.h | 3 +++ fdbserver/VersionedBTree.actor.cpp | 7 ++++--- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 8759f493f4..b2c6f30790 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -786,6 +786,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( REDWOOD_METRICS_INTERVAL, 5.0 ); init( REDWOOD_HISTOGRAM_INTERVAL, 30.0 ); init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; } + init( REDWOOD_DECODE_CACHE_RESERVATION, 0.30 ); // Server request latency measurement init( LATENCY_SAMPLE_SIZE, 100000 ); diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index 13c6d93d54..c1fb386796 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -737,6 +737,9 @@ public: double REDWOOD_METRICS_INTERVAL; double REDWOOD_HISTOGRAM_INTERVAL; bool REDWOOD_EVICT_UPDATED_PAGES; // Whether to prioritize eviction of updated pages from cache. + double REDWOOD_DECODE_CACHE_RESERVATION; // Fraction of the configured page cache memory that should be reserved for + // use by DecodeCache instances. This is a soft reservation and may need + // to be tuned for a particular workload. 
// Server request latency measurement int LATENCY_SAMPLE_SIZE; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 315e311365..dd5108ccf0 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -2144,9 +2144,10 @@ public: bool memoryOnly = false, Promise errorPromise = {}) : ioLock(FLOW_KNOBS->MAX_OUTSTANDING, ioMaxPriority, FLOW_KNOBS->MAX_OUTSTANDING / 2), - pageCacheBytes(pageCacheSizeBytes), pHeader(nullptr), desiredPageSize(desiredPageSize), - desiredExtentSize(desiredExtentSize), filename(filename), memoryOnly(memoryOnly), errorPromise(errorPromise), - remapCleanupWindow(remapCleanupWindow), concurrentExtentReads(new FlowLock(concurrentExtentReads)) { + pageCacheBytes(pageCacheSizeBytes * (1.0 - SERVER_KNOBS->REDWOOD_DECODE_CACHE_RESERVATION)), pHeader(nullptr), + desiredPageSize(desiredPageSize), desiredExtentSize(desiredExtentSize), filename(filename), + memoryOnly(memoryOnly), errorPromise(errorPromise), remapCleanupWindow(remapCleanupWindow), + concurrentExtentReads(new FlowLock(concurrentExtentReads)) { if (!g_redwoodMetricsActor.isValid()) { g_redwoodMetricsActor = redwoodMetricsLogger(); From 7fc8844ca4ad37689222865d57208c63c42ba37b Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Fri, 18 Feb 2022 11:31:16 +0100 Subject: [PATCH 005/138] C API Tests: new test executable --- bindings/c/CMakeLists.txt | 10 +- bindings/c/test/system/TesterOptions.h | 69 ++++++++ .../c/test/system/fdb_c_system_tester.cpp | 153 ++++++++++++++++++ 3 files changed, 231 insertions(+), 1 deletion(-) create mode 100644 bindings/c/test/system/TesterOptions.h create mode 100644 bindings/c/test/system/fdb_c_system_tester.cpp diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index d08bb48344..7c5cc4a887 100644 --- a/bindings/c/CMakeLists.txt +++ b/bindings/c/CMakeLists.txt @@ -92,11 +92,16 @@ if(NOT WIN32 AND NOT IS_ARM_MAC) set(UNIT_TEST_VERSION_510_SRCS 
test/unit/unit_tests_version_510.cpp) set(TRACE_PARTIAL_FILE_SUFFIX_TEST_SRCS test/unit/trace_partial_file_suffix_test.cpp) - set(DISCONNECTED_TIMEOUT_UNIT_TEST_SRCS + set(DISCONNECTED_TIMEOUT_UNIT_TEST_SRCS test/unit/disconnected_timeout_tests.cpp test/unit/fdb_api.cpp test/unit/fdb_api.hpp) + set(SYSTEM_TESTER_SRCS + test/system/fdb_c_system_tester.cpp + test/system/TesterOptions.h + ) + if(OPEN_FOR_IDE) add_library(fdb_c_performance_test OBJECT test/performance_test.c test/test.h) add_library(fdb_c_ryw_benchmark OBJECT test/ryw_benchmark.c test/test.h) @@ -107,6 +112,7 @@ if(NOT WIN32 AND NOT IS_ARM_MAC) add_library(fdb_c_unit_tests_version_510 OBJECT ${UNIT_TEST_VERSION_510_SRCS}) add_library(trace_partial_file_suffix_test OBJECT ${TRACE_PARTIAL_FILE_SUFFIX_TEST_SRCS}) add_library(disconnected_timeout_unit_tests OBJECT ${DISCONNECTED_TIMEOUT_UNIT_TEST_SRCS}) + add_library(fdb_c_system_tester OBJECT ${SYSTEM_TESTER_SRCS}) else() add_executable(fdb_c_performance_test test/performance_test.c test/test.h) add_executable(fdb_c_ryw_benchmark test/ryw_benchmark.c test/test.h) @@ -117,6 +123,7 @@ if(NOT WIN32 AND NOT IS_ARM_MAC) add_executable(fdb_c_unit_tests_version_510 ${UNIT_TEST_VERSION_510_SRCS}) add_executable(trace_partial_file_suffix_test ${TRACE_PARTIAL_FILE_SUFFIX_TEST_SRCS}) add_executable(disconnected_timeout_unit_tests ${DISCONNECTED_TIMEOUT_UNIT_TEST_SRCS}) + add_executable(fdb_c_system_tester ${SYSTEM_TESTER_SRCS}) strip_debug_symbols(fdb_c_performance_test) strip_debug_symbols(fdb_c_ryw_benchmark) strip_debug_symbols(fdb_c_txn_size_test) @@ -138,6 +145,7 @@ if(NOT WIN32 AND NOT IS_ARM_MAC) target_link_libraries(fdb_c_unit_tests_version_510 PRIVATE fdb_c Threads::Threads) target_link_libraries(trace_partial_file_suffix_test PRIVATE fdb_c Threads::Threads) target_link_libraries(disconnected_timeout_unit_tests PRIVATE fdb_c Threads::Threads) + target_link_libraries(fdb_c_system_tester PRIVATE fdb_c flow) # do not set RPATH for mako set_property(TARGET 
mako PROPERTY SKIP_BUILD_RPATH TRUE) diff --git a/bindings/c/test/system/TesterOptions.h b/bindings/c/test/system/TesterOptions.h new file mode 100644 index 0000000000..6f492305c3 --- /dev/null +++ b/bindings/c/test/system/TesterOptions.h @@ -0,0 +1,69 @@ +/* + * TesterOptions.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifndef SYSTEM_TESTER_TESTER_OPTIONS_H +#define SYSTEM_TESTER_TESTER_OPTIONS_H + +#include "flow/SimpleOpt.h" +#include +#include + +#define FDB_API_VERSION 710 +#include "bindings/c/foundationdb/fdb_c.h" + +namespace FDBSystemTester { + +class TesterOptions { +public: + enum { + OPT_CONNFILE, + OPT_HELP, + OPT_TRACE, + OPT_TRACE_DIR, + OPT_LOGGROUP, + OPT_TRACE_FORMAT, + OPT_KNOB, + OPT_API_VERSION, + }; + static const CSimpleOpt::SOption optionDefs[]; + + std::string clusterFile; + bool trace = false; + std::string traceDir; + std::string traceFormat; + std::string logGroup; + bool initialStatusCheck = true; + bool cliHints = true; + std::vector> knobs; + // api version, using the latest version by default + int api_version = FDB_API_VERSION; + + bool parseArgs(int argc, char** argv); + +private: + bool processArg(const CSimpleOpt& args); + static void printProgramUsage(const char* execName); +}; + +} // namespace FDBSystemTester + +#endif \ No newline at end of 
file diff --git a/bindings/c/test/system/fdb_c_system_tester.cpp b/bindings/c/test/system/fdb_c_system_tester.cpp new file mode 100644 index 0000000000..179d9f5232 --- /dev/null +++ b/bindings/c/test/system/fdb_c_system_tester.cpp @@ -0,0 +1,153 @@ +/* + * fdb_c_system_tester.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TesterOptions.h" +#include "flow/Platform.h" +#include "flow/Trace.h" +#include "flow/ArgParseUtil.h" + +#define FDB_API_VERSION 710 +#include "bindings/c/foundationdb/fdb_c.h" + +namespace FDBSystemTester { + +const CSimpleOpt::SOption TesterOptions::optionDefs[] = // + { { OPT_CONNFILE, "-C", SO_REQ_SEP }, + { OPT_CONNFILE, "--cluster-file", SO_REQ_SEP }, + { OPT_TRACE, "--log", SO_NONE }, + { OPT_TRACE_DIR, "--log-dir", SO_REQ_SEP }, + { OPT_LOGGROUP, "--log-group", SO_REQ_SEP }, + { OPT_HELP, "-h", SO_NONE }, + { OPT_HELP, "--help", SO_NONE }, + { OPT_TRACE_FORMAT, "--trace-format", SO_REQ_SEP }, + { OPT_KNOB, "--knob-", SO_REQ_SEP }, + { OPT_API_VERSION, "--api-version", SO_REQ_SEP } }; + +void TesterOptions::printProgramUsage(const char* execName) { + printf("usage: %s [OPTIONS]\n" + "\n", + execName); + printf(" -C CONNFILE The path of a file containing the connection string for the\n" + " FoundationDB cluster. 
The default is first the value of the\n" + " FDB_CLUSTER_FILE environment variable, then `./fdb.cluster',\n" + " then `%s'.\n", + platform::getDefaultClusterFilePath().c_str()); + printf(" --log Enables trace file logging for the CLI session.\n" + " --log-dir PATH Specifes the output directory for trace files. If\n" + " unspecified, defaults to the current directory. Has\n" + " no effect unless --log is specified.\n" + " --log-group LOG_GROUP\n" + " Sets the LogGroup field with the specified value for all\n" + " events in the trace output (defaults to `default').\n" + " --trace-format FORMAT\n" + " Select the format of the log files. xml (the default) and json\n" + " are supported. Has no effect unless --log is specified.\n" + " --api-version APIVERSION\n" + " Specifies the version of the API for the CLI to use.\n" + " --knob-KNOBNAME KNOBVALUE\n" + " Changes a knob option. KNOBNAME should be lowercase.\n" + " -h, --help Display this help and exit.\n"); +} + +bool TesterOptions::parseArgs(int argc, char** argv) { + // declare our options parser, pass in the arguments from main + // as well as our array of valid options. 
+ CSimpleOpt args(argc, argv, optionDefs); + + // while there are arguments left to process + while (args.Next()) { + if (args.LastError() == SO_SUCCESS) { + if (args.OptionId() == OPT_HELP) { + printProgramUsage(argv[0]); + return false; + } + if (!processArg(args)) { + return false; + } + } else { + printf("Invalid argument: %s\n", args.OptionText()); + printProgramUsage(argv[0]); + return false; + } + } + return true; +} + +bool TesterOptions::processArg(const CSimpleOpt& args) { + switch (args.OptionId()) { + case OPT_CONNFILE: + clusterFile = args.OptionArg(); + break; + case OPT_API_VERSION: { + char* endptr; + api_version = strtoul((char*)args.OptionArg(), &endptr, 10); + if (*endptr != '\0') { + fprintf(stderr, "ERROR: invalid client version %s\n", args.OptionArg()); + return 1; + } else if (api_version < 700 || api_version > FDB_API_VERSION) { + // multi-version fdbcli only available after 7.0 + fprintf(stderr, + "ERROR: api version %s is not supported. (Min: 700, Max: %d)\n", + args.OptionArg(), + FDB_API_VERSION); + return 1; + } + break; + } + case OPT_TRACE: + trace = true; + break; + case OPT_TRACE_DIR: + traceDir = args.OptionArg(); + break; + case OPT_LOGGROUP: + logGroup = args.OptionArg(); + break; + case OPT_TRACE_FORMAT: + if (!validateTraceFormat(args.OptionArg())) { + fprintf(stderr, "WARNING: Unrecognized trace format `%s'\n", args.OptionArg()); + } + traceFormat = args.OptionArg(); + break; + case OPT_KNOB: { + Optional knobName = extractPrefixedArgument("--knob", args.OptionSyntax()); + if (!knobName.present()) { + fprintf(stderr, "ERROR: unable to parse knob option '%s'\n", args.OptionSyntax()); + return FDB_EXIT_ERROR; + } + knobs.emplace_back(knobName.get(), args.OptionArg()); + break; + } + } + return true; +} + +} // namespace FDBSystemTester + +using namespace FDBSystemTester; + +int main(int argc, char** argv) { + TesterOptions options; + if (!options.parseArgs(argc, argv)) { + return 1; + } + + return 0; +} From 
735c5697d0016e703e35e24353de0fe6939fef13 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Fri, 18 Feb 2022 03:25:28 -0800 Subject: [PATCH 006/138] Added fast size estimates to Arena which are O(1) and are usually accurate. When inaccurate, a full size scan will update all estimates in the tree. --- flow/Arena.cpp | 93 +++++++++++++++++++++++++++++++++++++++++++++----- flow/Arena.h | 12 +++++-- 2 files changed, 95 insertions(+), 10 deletions(-) diff --git a/flow/Arena.cpp b/flow/Arena.cpp index ae4a0fb2ca..f100074935 100644 --- a/flow/Arena.cpp +++ b/flow/Arena.cpp @@ -106,15 +106,22 @@ void* Arena::allocate4kAlignedBuffer(uint32_t size) { return ArenaBlock::dependOn4kAlignedBuffer(impl, size); } -size_t Arena::getSize() const { +size_t Arena::getSize(bool fastInaccurateEstimate) const { if (impl) { allowAccess(impl.getPtr()); - auto result = impl->totalSize(); + size_t result; + if (fastInaccurateEstimate) { + result = impl->estimatedTotalSize(); + } else { + result = impl->totalSize(); + } + disallowAccess(impl.getPtr()); return result; } return 0; } + bool Arena::hasFree(size_t size, const void* address) { if (impl) { allowAccess(impl.getPtr()); @@ -167,28 +174,38 @@ const void* ArenaBlock::getData() const { const void* ArenaBlock::getNextData() const { return (const uint8_t*)getData() + used(); } -size_t ArenaBlock::totalSize() { + +size_t ArenaBlock::totalSize() const { if (isTiny()) { return size(); } - size_t s = size(); + // Walk the entire tree to get an accurate size and store it in the estimate for + // each block, recursively. 
+ totalSizeEstimate = size(); int o = nextBlockOffset; while (o) { ArenaBlockRef* r = (ArenaBlockRef*)((char*)getData() + o); makeDefined(r, sizeof(ArenaBlockRef)); if (r->aligned4kBufferSize != 0) { - s += r->aligned4kBufferSize; + totalSizeEstimate += r->aligned4kBufferSize; } else { allowAccess(r->next); - s += r->next->totalSize(); + totalSizeEstimate += r->next->totalSize(); disallowAccess(r->next); } o = r->nextBlockOffset; makeNoAccess(r, sizeof(ArenaBlockRef)); } - return s; + return totalSizeEstimate; } +size_t ArenaBlock::estimatedTotalSize() const { + if (isTiny()) { + return size(); + } + return totalSizeEstimate; +} + // just for debugging: void ArenaBlock::getUniqueBlocks(std::set& a) { a.insert(this); @@ -232,6 +249,7 @@ void ArenaBlock::makeReference(ArenaBlock* next) { makeNoAccess(r, sizeof(ArenaBlockRef)); nextBlockOffset = bigUsed; bigUsed += sizeof(ArenaBlockRef); + totalSizeEstimate += next->estimatedTotalSize(); } void* ArenaBlock::make4kAlignedBuffer(uint32_t size) { @@ -245,6 +263,7 @@ void* ArenaBlock::make4kAlignedBuffer(uint32_t size) { makeNoAccess(r, sizeof(ArenaBlockRef)); nextBlockOffset = bigUsed; bigUsed += sizeof(ArenaBlockRef); + totalSizeEstimate += size; return result; } @@ -341,6 +360,7 @@ ArenaBlock* ArenaBlock::create(int dataSize, Reference& next) { b->bigSize = 8192; INSTRUMENT_ALLOCATE("Arena8192"); } + b->totalSizeEstimate = b->bigSize; b->tinySize = b->tinyUsed = NOT_TINY; b->bigUsed = sizeof(ArenaBlock); } else { @@ -350,6 +370,7 @@ ArenaBlock* ArenaBlock::create(int dataSize, Reference& next) { b = (ArenaBlock*)new uint8_t[reqSize]; b->tinySize = b->tinyUsed = NOT_TINY; b->bigSize = reqSize; + b->totalSizeEstimate = b->bigSize; b->bigUsed = sizeof(ArenaBlock); if (FLOW_KNOBS && g_allocation_tracing_disabled == 0 && @@ -649,4 +670,60 @@ TEST_CASE("/flow/Arena/DefaultBoostHash") { ASSERT(hashFunc(d) == hashFunc(d)); return Void(); -} \ No newline at end of file +} + +TEST_CASE("/flow/Arena/Size") { + Arena a; + + // 
Size estimates are accurate unless dependencies are added to an Arena via another Arena + // handle which points to a non-root node. + // + // Note that the ASSERT argument order matters, the estimate must be calculated first as + // the full accurate calculation will update the estimate + makeString(40, a); + ASSERT_EQ(a.getSize(true), a.getSize()); + + makeString(700, a); + ASSERT_EQ(a.getSize(true), a.getSize()); + + // Copy a at a point where it points to a large block with room for block references + Arena b = a; + + // copy a at a point where there isn't room for more block references + makeString(1000, a); + Arena c = a; + + makeString(1000, a); + makeString(1000, a); + ASSERT_EQ(a.getSize(true), a.getSize()); + + Standalone s = makeString(500); + a.dependsOn(s.arena()); + ASSERT_EQ(a.getSize(true), a.getSize()); + + Standalone s2 = makeString(500); + a.dependsOn(s2.arena()); + ASSERT_EQ(a.getSize(true), a.getSize()); + + // Add a dependency to b, which will fit in b's root and update b's size estimate + Standalone s3 = makeString(100); + b.dependsOn(s3.arena()); + ASSERT_EQ(b.getSize(true), b.getSize()); + + // But now a's size estimate is out of date because the new reference in b's root is still + // in a's tree + ASSERT_LT(a.getSize(true), a.getSize()); + + // Now that a full size calc has been done on a, the estimate is up to date. + ASSERT_EQ(a.getSize(true), a.getSize()); + + // Add a dependency to c, which will NOT fit in c's root, so it will be added to a new + // root for c and that root will not be in a's tree so a's size and estimate remain + // unchanged and the same. The size and estimate of c will also match. 
+ Standalone s4 = makeString(100); + c.dependsOn(s4.arena()); + ASSERT_EQ(c.getSize(true), c.getSize()); + ASSERT_EQ(a.getSize(true), a.getSize()); + + return Void(); +} diff --git a/flow/Arena.h b/flow/Arena.h index efae1a508b..2039c086f2 100644 --- a/flow/Arena.h +++ b/flow/Arena.h @@ -104,7 +104,13 @@ public: void dependsOn(const Arena& p); void* allocate4kAlignedBuffer(uint32_t size); - size_t getSize() const; + + // If fastInaccurateEstimate is true this operation is O(1) but it is inaccurate in that it + // will omit memory added to this Arena's block tree using Arena handles which reference + // non-root nodes in this Arena's block tree. + // When fastInaccurateEstimate is false, all estimates in the block tree will be updated to + // be accurate. + size_t getSize(bool fastInaccurateEstimate = false) const; bool hasFree(size_t size, const void* address); @@ -156,6 +162,7 @@ struct ArenaBlock : NonCopyable, ThreadSafeReferenceCounted { // if tinySize != NOT_TINY, following variables aren't used uint32_t bigSize, bigUsed; // include block header uint32_t nextBlockOffset; + mutable size_t totalSizeEstimate; // Estimate of the minimum total size of arena blocks this one reaches void addref(); void delref(); @@ -165,7 +172,8 @@ struct ArenaBlock : NonCopyable, ThreadSafeReferenceCounted { int unused() const; const void* getData() const; const void* getNextData() const; - size_t totalSize(); + size_t totalSize() const; + size_t estimatedTotalSize() const; // just for debugging: void getUniqueBlocks(std::set& a); int addUsed(int bytes); From 17fe322bc4567c2b87b7564ee309e97fa2ce8ab6 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Fri, 18 Feb 2022 03:26:05 -0800 Subject: [PATCH 007/138] Added near real-time tracking of Redwood page DecodeCache memory. 
--- fdbserver/DeltaTree.h | 38 ++++++++++++++++++++++++++++-- fdbserver/VersionedBTree.actor.cpp | 29 +++++++++++++---------- 2 files changed, 53 insertions(+), 14 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index c50004d738..8f69c0d66e 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -1095,21 +1095,48 @@ public: // DecodedNodes are stored in a contiguous vector, which sometimes must be expanded, so care // must be taken to resolve DecodedNode pointers again after the DecodeCache has new entries added. struct DecodeCache : FastAllocated, ReferenceCounted { - DecodeCache(const T& lowerBound = T(), const T& upperBound = T()) - : lowerBound(arena, lowerBound), upperBound(arena, upperBound) { + DecodeCache(const T& lowerBound = T(), const T& upperBound = T(), size_t* pMemoryTracker = nullptr) + : lowerBound(arena, lowerBound), upperBound(arena, upperBound), lastKnownUsedMemory(0), + pMemoryTracker(pMemoryTracker) { decodedNodes.reserve(10); deltatree_printf("DecodedNode size: %d\n", sizeof(DecodedNode)); } + ~DecodeCache() { + if (pMemoryTracker != nullptr) { + // Do not update, only subtract the last known amount which would have been + // published to the counter + *pMemoryTracker -= lastKnownUsedMemory; + } + } + Arena arena; T lowerBound; T upperBound; + // Track the amount of memory used by the vector and arena and publish updates to some counter. + // Note that no update is pushed on construction because a Cursor will surely soon follow. + // Updates are pushed to the counter on + // DecodeCache clear + // DecodeCache destruction + // Cursor destruction + // as those are the most efficient times to publish an update. 
+ size_t lastKnownUsedMemory; + size_t* pMemoryTracker; + // Index 0 is always the root std::vector decodedNodes; DecodedNode& get(int index) { return decodedNodes[index]; } + void updateUsedMemory() { + size_t usedNow = arena.getSize(true) + (decodedNodes.capacity() * sizeof(DecodedNode)); + if (pMemoryTracker != nullptr) { + *pMemoryTracker += (usedNow - lastKnownUsedMemory); + } + lastKnownUsedMemory = usedNow; + } + template int emplace_new(Args&&... args) { int index = decodedNodes.size(); @@ -1125,6 +1152,7 @@ public: lowerBound = T(a, lowerBound); upperBound = T(a, upperBound); arena = a; + updateUsedMemory(); } }; @@ -1142,6 +1170,12 @@ public: // Copy constructor does not copy item because normally a copied cursor will be immediately moved. Cursor(const Cursor& c) : tree(c.tree), cache(c.cache), nodeIndex(c.nodeIndex) {} + ~Cursor() { + if (cache != nullptr) { + cache->updateUsedMemory(); + } + } + Cursor next() const { Cursor c = *this; c.moveNext(); diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index dd5108ccf0..12bf9d7473 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -1673,6 +1673,9 @@ struct RedwoodMetrics { Reference kvSizeReadByGetRange; double startTime; + // Absolute counters, not reset per time interval + size_t decodeCacheMemory = 0; + // Return number of pages read or written, from cache or disk unsigned int pageOps() const { // All page reads are either a cache hit, probe hit, or a disk read @@ -1889,6 +1892,9 @@ class ObjectCache : NonCopyable { public: ObjectCache(int sizeLimit = 1) : sizeLimit(sizeLimit), currentSize(0) {} + int64_t getSizeUsed() const { return currentSize; } + int64_t getSizeLimit() const { return sizeLimit; } + void setSizeLimit(int n) { ASSERT(n > 0); sizeLimit = n; @@ -2023,8 +2029,6 @@ public: return clear_impl(this); } - int count() const { return currentSize; } - // Move the prioritized evictions queued to the front of the eviction 
order void flushPrioritizedEvictions() { evictionOrder.splice(evictionOrder.begin(), prioritizedEvictions); } @@ -3665,7 +3669,7 @@ public: int64_t total; if (memoryOnly) { total = pageCacheBytes; - free = pageCacheBytes - ((int64_t)pageCache.count() * physicalPageSize); + free = pageCacheBytes - (pageCache.getSizeUsed() * physicalPageSize); } else { g_network->getDiskBytes(parentDirectory(filename), free, total); } @@ -3688,9 +3692,9 @@ public: return StorageBytes(free, total, pagerSize - reusable, free + reusable, temp); } - int64_t getPageCacheCount() override { return (int64_t)pageCache.count(); } + int64_t getPageCacheCount() override { return pageCache.getSizeUsed(); } int64_t getPageCount() override { return pHeader->pageCount; } - int64_t getExtentCacheCount() override { return (int64_t)extentCache.count(); } + int64_t getExtentCacheCount() override { return extentCache.getSizeUsed(); } ACTOR static Future getUserPageCount_cleanup(DWALPager* self) { // Wait for the remap eraser to finish all of its work (not triggering stop) @@ -4860,7 +4864,7 @@ public: entries.emplace_back(q.get(), self->readPage(PagerEventReasons::LazyClear, q.get().height, - snapshot, + snapshot.getPtr(), q.get().pageID, ioLeafPriority, true, @@ -4883,7 +4887,7 @@ public: // Iterate over page entries, skipping key decoding using BTreePage::ValueTree which uses // RedwoodRecordRef::DeltaValueOnly as the delta type type to skip key decoding - BTreePage::ValueTree::DecodeCache cache(dbBegin, dbEnd); + BTreePage::ValueTree::DecodeCache cache(dbBegin, dbEnd, &g_redwoodMetrics.decodeCacheMemory); BTreePage::ValueTree::Cursor c(&cache, btPage.valueTree()); ASSERT(c.moveFirst()); Version v = entry.version; @@ -5697,7 +5701,7 @@ private: ACTOR static Future> readPage(PagerEventReasons reason, unsigned int level, - Reference snapshot, + IPagerSnapshot* snapshot, BTreePageIDRef id, int priority, bool forLazyClear, @@ -5738,7 +5742,8 @@ private: lowerBound.toString().c_str(), 
upperBound.toString().c_str()); - BTreePage::BinaryTree::DecodeCache* cache = new BTreePage::BinaryTree::DecodeCache(lowerBound, upperBound); + BTreePage::BinaryTree::DecodeCache* cache = + new BTreePage::BinaryTree::DecodeCache(lowerBound, upperBound, &g_redwoodMetrics.decodeCacheMemory); page->userData = cache; page->userDataDestructor = [](void* cache) { ((BTreePage::BinaryTree::DecodeCache*)cache)->delref(); }; } @@ -6170,7 +6175,7 @@ private: } state Reference page = - wait(readPage(PagerEventReasons::Commit, height, batch->snapshot, rootID, height, false, true)); + wait(readPage(PagerEventReasons::Commit, height, batch->snapshot.getPtr(), rootID, height, false, true)); // If the page exists in the cache, it must be copied before modification. // That copy will be referenced by pageCopy, as page must stay in scope in case anything references its @@ -7047,7 +7052,7 @@ public: debug_printf("pushPage(link=%s)\n", link.get().toString(false).c_str()); return map(readPage(reason, path.back().btPage()->height - 1, - pager, + pager.getPtr(), link.get().getChildPage(), ioMaxPriority, false, @@ -7064,7 +7069,7 @@ public: Future pushPage(BTreePageIDRef id) { debug_printf("pushPage(root=%s)\n", ::toString(id).c_str()); - return map(readPage(reason, btree->m_pHeader->height, pager, id, ioMaxPriority, false, true), + return map(readPage(reason, btree->m_pHeader->height, pager.getPtr(), id, ioMaxPriority, false, true), [=](Reference p) { #if REDWOOD_DEBUG path.push_back({ p, getCursor(p, dbBegin, dbEnd), id }); From a45c9ccce1a76e60fbea03a90435e75ef88edc01 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Fri, 18 Feb 2022 11:14:40 -0800 Subject: [PATCH 008/138] Revert "Added Redwood knob to reserve part of configured page cache size for Redwood page decode cache state." This reverts commit 072bc86bb1e38a38727419099664c82bb506d03c. 
--- fdbclient/ServerKnobs.cpp | 1 - fdbclient/ServerKnobs.h | 3 --- fdbserver/VersionedBTree.actor.cpp | 7 +++---- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index b2c6f30790..8759f493f4 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -786,7 +786,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( REDWOOD_METRICS_INTERVAL, 5.0 ); init( REDWOOD_HISTOGRAM_INTERVAL, 30.0 ); init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; } - init( REDWOOD_DECODE_CACHE_RESERVATION, 0.30 ); // Server request latency measurement init( LATENCY_SAMPLE_SIZE, 100000 ); diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index c1fb386796..13c6d93d54 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -737,9 +737,6 @@ public: double REDWOOD_METRICS_INTERVAL; double REDWOOD_HISTOGRAM_INTERVAL; bool REDWOOD_EVICT_UPDATED_PAGES; // Whether to prioritize eviction of updated pages from cache. - double REDWOOD_DECODE_CACHE_RESERVATION; // Fraction of the configured page cache memory that should be reserved for - // use by DecodeCache instances. This is a soft reservation and may need - // to be tuned for a particular workload. 
// Server request latency measurement int LATENCY_SAMPLE_SIZE; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 12bf9d7473..66876ff63a 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -2148,10 +2148,9 @@ public: bool memoryOnly = false, Promise errorPromise = {}) : ioLock(FLOW_KNOBS->MAX_OUTSTANDING, ioMaxPriority, FLOW_KNOBS->MAX_OUTSTANDING / 2), - pageCacheBytes(pageCacheSizeBytes * (1.0 - SERVER_KNOBS->REDWOOD_DECODE_CACHE_RESERVATION)), pHeader(nullptr), - desiredPageSize(desiredPageSize), desiredExtentSize(desiredExtentSize), filename(filename), - memoryOnly(memoryOnly), errorPromise(errorPromise), remapCleanupWindow(remapCleanupWindow), - concurrentExtentReads(new FlowLock(concurrentExtentReads)) { + pageCacheBytes(pageCacheSizeBytes), pHeader(nullptr), desiredPageSize(desiredPageSize), + desiredExtentSize(desiredExtentSize), filename(filename), memoryOnly(memoryOnly), errorPromise(errorPromise), + remapCleanupWindow(remapCleanupWindow), concurrentExtentReads(new FlowLock(concurrentExtentReads)) { if (!g_redwoodMetricsActor.isValid()) { g_redwoodMetricsActor = redwoodMetricsLogger(); From 11527a18d32ece98af96c71fc9a5617d365440a5 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Mon, 21 Feb 2022 18:01:10 +0100 Subject: [PATCH 009/138] SysTester: Execution framework --- bindings/c/CMakeLists.txt | 11 +- bindings/c/test/system/SysTestApiWrapper.cpp | 85 +++++++++ bindings/c/test/system/SysTestApiWrapper.h | 87 +++++++++ .../system/SysTestCorrectnessWorkload.cpp | 80 +++++++++ .../{TesterOptions.h => SysTestOptions.h} | 20 +-- bindings/c/test/system/SysTestScheduler.cpp | 146 +++++++++++++++ bindings/c/test/system/SysTestScheduler.h | 45 +++++ .../system/SysTestTransactionExecutor.cpp | 170 ++++++++++++++++++ .../test/system/SysTestTransactionExecutor.h | 82 +++++++++ bindings/c/test/system/SysTestWorkload.cpp | 35 ++++ bindings/c/test/system/SysTestWorkload.h | 57 ++++++ 
.../c/test/system/fdb_c_system_tester.cpp | 91 +++++++++- 12 files changed, 886 insertions(+), 23 deletions(-) create mode 100644 bindings/c/test/system/SysTestApiWrapper.cpp create mode 100644 bindings/c/test/system/SysTestApiWrapper.h create mode 100644 bindings/c/test/system/SysTestCorrectnessWorkload.cpp rename bindings/c/test/system/{TesterOptions.h => SysTestOptions.h} (85%) create mode 100644 bindings/c/test/system/SysTestScheduler.cpp create mode 100644 bindings/c/test/system/SysTestScheduler.h create mode 100644 bindings/c/test/system/SysTestTransactionExecutor.cpp create mode 100644 bindings/c/test/system/SysTestTransactionExecutor.h create mode 100644 bindings/c/test/system/SysTestWorkload.cpp create mode 100644 bindings/c/test/system/SysTestWorkload.h diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index 7c5cc4a887..df24f3170b 100644 --- a/bindings/c/CMakeLists.txt +++ b/bindings/c/CMakeLists.txt @@ -99,7 +99,16 @@ if(NOT WIN32 AND NOT IS_ARM_MAC) set(SYSTEM_TESTER_SRCS test/system/fdb_c_system_tester.cpp - test/system/TesterOptions.h + test/system/SysTestApiWrapper.cpp + test/system/SysTestApiWrapper.h + test/system/SysTestCorrectnessWorkload.cpp + test/system/SysTestOptions.h + test/system/SysTestScheduler.cpp + test/system/SysTestScheduler.h + test/system/SysTestTransactionExecutor.cpp + test/system/SysTestTransactionExecutor.h + test/system/SysTestWorkload.cpp + test/system/SysTestWorkload.h ) if(OPEN_FOR_IDE) diff --git a/bindings/c/test/system/SysTestApiWrapper.cpp b/bindings/c/test/system/SysTestApiWrapper.cpp new file mode 100644 index 0000000000..c73d0a09ad --- /dev/null +++ b/bindings/c/test/system/SysTestApiWrapper.cpp @@ -0,0 +1,85 @@ +/* + * SysTestApiWrapper.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "SysTestApiWrapper.h" +#include +#include + +namespace FDBSystemTester { + +namespace { + +void fdb_check(fdb_error_t e) { + if (e) { + std::cerr << fdb_get_error(e) << std::endl; + std::abort(); + } +} + +} // namespace + +Future::~Future() { + if (future_) { + fdb_future_destroy(future_); + } +} + +void Future::reset() { + if (future_) { + fdb_future_destroy(future_); + future_ = nullptr; + } +} + +fdb_error_t Future::getError() { + return fdb_future_get_error(future_); +} + +std::optional ValueFuture::getValue() { + int out_present; + const std::uint8_t* val; + int vallen; + fdb_check(fdb_future_get_value(future_, &out_present, &val, &vallen)); + return out_present ? std::make_optional(std::string((const char*)val, vallen)) : std::nullopt; +} + +// Given an FDBDatabase, initializes a new transaction. 
+Transaction::Transaction(FDBTransaction* tx) : tx_(tx) {} + +ValueFuture Transaction::get(std::string_view key, fdb_bool_t snapshot) { + return ValueFuture(fdb_transaction_get(tx_, (const uint8_t*)key.data(), key.size(), snapshot)); +} + +void Transaction::set(std::string_view key, std::string_view value) { + fdb_transaction_set(tx_, (const uint8_t*)key.data(), key.size(), (const uint8_t*)value.data(), value.size()); +} + +EmptyFuture Transaction::commit() { + return EmptyFuture(fdb_transaction_commit(tx_)); +} + +EmptyFuture Transaction::onError(fdb_error_t err) { + return EmptyFuture(fdb_transaction_on_error(tx_, err)); +} + +Transaction::~Transaction() { + fdb_transaction_destroy(tx_); +} + +} // namespace FDBSystemTester \ No newline at end of file diff --git a/bindings/c/test/system/SysTestApiWrapper.h b/bindings/c/test/system/SysTestApiWrapper.h new file mode 100644 index 0000000000..690ccd3fba --- /dev/null +++ b/bindings/c/test/system/SysTestApiWrapper.h @@ -0,0 +1,87 @@ +/* + * SysTestApiWrapper.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#ifndef SYS_TEST_API_WRAPPER_H +#define SYS_TEST_API_WRAPPER_H + +#include +#include + +#define FDB_API_VERSION 710 +#include "bindings/c/foundationdb/fdb_c.h" + +namespace FDBSystemTester { + +// Wrapper parent class to manage memory of an FDBFuture pointer. Cleans up +// FDBFuture when this instance goes out of scope. +class Future { +public: + Future() : future_(nullptr) {} + Future(FDBFuture* f) : future_(f) {} + virtual ~Future(); + + Future& operator=(Future&& other) { + future_ = other.future_; + other.future_ = nullptr; + return *this; + } + + FDBFuture* fdbFuture() { return future_; }; + + fdb_error_t getError(); + void reset(); + +protected: + FDBFuture* future_; +}; + +class ValueFuture : public Future { +public: + ValueFuture() = default; + ValueFuture(FDBFuture* f) : Future(f) {} + std::optional getValue(); +}; + +class EmptyFuture : public Future { +public: + EmptyFuture() = default; + EmptyFuture(FDBFuture* f) : Future(f) {} +}; + +class Transaction { +public: + // Given an FDBDatabase, initializes a new transaction. + Transaction(FDBTransaction* tx); + ~Transaction(); + + ValueFuture get(std::string_view key, fdb_bool_t snapshot); + void set(std::string_view key, std::string_view value); + EmptyFuture commit(); + EmptyFuture onError(fdb_error_t err); + +private: + FDBTransaction* tx_; +}; + +} // namespace FDBSystemTester + +#endif \ No newline at end of file diff --git a/bindings/c/test/system/SysTestCorrectnessWorkload.cpp b/bindings/c/test/system/SysTestCorrectnessWorkload.cpp new file mode 100644 index 0000000000..ef229658cc --- /dev/null +++ b/bindings/c/test/system/SysTestCorrectnessWorkload.cpp @@ -0,0 +1,80 @@ +/* + * SysTestCorrectnessWorkload.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "SysTestWorkload.h" +#include +#include + +namespace FDBSystemTester { + +namespace { + +class UpdateTxActor : public TransactionActorBase { +public: + ValueFuture fGet; + + void start() override { + fGet = tx()->get(dbKey("foo"), false); + ctx()->continueAfter(fGet, [this]() { this->step1(); }); + } + + void step1() { + std::optional optStr = fGet.getValue(); + tx()->set(dbKey("foo"), optStr.value_or("bar")); + commit(); + } + + void reset() override { fGet.reset(); } +}; + +} // namespace + +class ApiCorrectnessWorkload : public WorkloadBase { +public: + ApiCorrectnessWorkload() : numTxLeft(10) {} + + void start() override { + schedule([this]() { nextTransaction(); }); + } + +private: + void nextTransaction() { + if (numTxLeft > 0) { + numTxLeft--; + UpdateTxActor* tx = new UpdateTxActor(); + execTransaction(tx, [this, tx]() { transactionDone(tx); }); + std::cout << numTxLeft << " transactions left" << std::endl; + } else { + std::cout << "Last transaction completed" << std::endl; + } + } + + void transactionDone(UpdateTxActor* tx) { + delete tx; + nextTransaction(); + } + + int numTxLeft; +}; + +IWorkload* createApiCorrectnessWorkload() { + return new ApiCorrectnessWorkload(); +} + +} // namespace FDBSystemTester \ No newline at end of file diff --git a/bindings/c/test/system/TesterOptions.h b/bindings/c/test/system/SysTestOptions.h similarity index 85% 
rename from bindings/c/test/system/TesterOptions.h rename to bindings/c/test/system/SysTestOptions.h index 6f492305c3..36ad529f67 100644 --- a/bindings/c/test/system/TesterOptions.h +++ b/bindings/c/test/system/SysTestOptions.h @@ -1,5 +1,5 @@ /* - * TesterOptions.h + * SysTestOptions.h * * This source file is part of the FoundationDB open source project * @@ -20,7 +20,7 @@ #pragma once -#ifndef SYSTEM_TESTER_TESTER_OPTIONS_H +#ifndef SYS_TEST_OPTIONS_TESTER_OPTIONS_H #define SYSTEM_TESTER_TESTER_OPTIONS_H #include "flow/SimpleOpt.h" @@ -34,18 +34,6 @@ namespace FDBSystemTester { class TesterOptions { public: - enum { - OPT_CONNFILE, - OPT_HELP, - OPT_TRACE, - OPT_TRACE_DIR, - OPT_LOGGROUP, - OPT_TRACE_FORMAT, - OPT_KNOB, - OPT_API_VERSION, - }; - static const CSimpleOpt::SOption optionDefs[]; - std::string clusterFile; bool trace = false; std::string traceDir; @@ -56,6 +44,8 @@ public: std::vector> knobs; // api version, using the latest version by default int api_version = FDB_API_VERSION; + bool blockOnFutures = false; + int numClientThreads = 1; bool parseArgs(int argc, char** argv); @@ -66,4 +56,4 @@ private: } // namespace FDBSystemTester -#endif \ No newline at end of file +#endif diff --git a/bindings/c/test/system/SysTestScheduler.cpp b/bindings/c/test/system/SysTestScheduler.cpp new file mode 100644 index 0000000000..e5abfe2c54 --- /dev/null +++ b/bindings/c/test/system/SysTestScheduler.cpp @@ -0,0 +1,146 @@ +/* + * SysTestScheduler.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "SysTestScheduler.h" + +#include "flow/Arena.h" +#include "flow/ThreadPrimitives.h" +#include "flow/ThreadSafeQueue.h" +#include "flow/IRandom.h" +#include +#include +#include + +namespace FDBSystemTester { + +class SingleThreadedScheduler : public IScheduler { +public: + SingleThreadedScheduler() : stopRequested(false), sleeping(false), thr(nullptr) {} + + ~SingleThreadedScheduler() override { + if (thr) { + delete thr; + } + } + + void start() override { + assert(thr == nullptr); + assert(!stop_); + thr = new std::thread([this]() { this->threadMain(); }); + } + + void schedule(TTaskFct task) override { + taskQueue.push(task); + wake(); + } + + void stop() override { + if (stopRequested.exchange(true) == false) { + if (thr) { + wake(); + } + } + } + + void join() override { + assert(thr); + thr->join(); + } + +private: + void threadMain() { + while (!stopRequested) { + Optional t = taskQueue.pop(); + if (t.present()) { + t.get()(); + continue; + } + sleeping = true; + wakeEvent.block(); + sleeping = false; + continue; + } + } + + void wake() { + while (sleeping) { + wakeEvent.set(); + } + } + + ThreadSafeQueue taskQueue; + std::atomic stopRequested; + std::atomic sleeping; + Event wakeEvent; + std::thread* thr; +}; + +class MultiThreadedScheduler : public IScheduler { +public: + MultiThreadedScheduler(int numThreads) : numThreads(numThreads) { + for (int i = 0; i < numThreads; i++) { + schedulers.push_back(new SingleThreadedScheduler()); + } + } + + ~MultiThreadedScheduler() override { + for (auto sch : schedulers) { + delete sch; 
+ } + } + + void start() override { + for (auto sch : schedulers) { + sch->start(); + } + } + + void schedule(TTaskFct task) override { + int idx = deterministicRandom()->randomInt(0, numThreads); + schedulers[idx]->schedule(task); + } + + void stop() override { + for (auto sch : schedulers) { + sch->stop(); + } + } + + void join() override { + for (auto sch : schedulers) { + sch->join(); + } + } + +private: + std::vector schedulers; + int numThreads; +}; + +IScheduler* createScheduler(int numThreads) { + assert(numThreads > 0 && numThreads <= 1000); + if (numThreads == 1) { + return new SingleThreadedScheduler(); + } else { + return new MultiThreadedScheduler(numThreads); + } +} + +} // namespace FDBSystemTester \ No newline at end of file diff --git a/bindings/c/test/system/SysTestScheduler.h b/bindings/c/test/system/SysTestScheduler.h new file mode 100644 index 0000000000..e92bb8c5c4 --- /dev/null +++ b/bindings/c/test/system/SysTestScheduler.h @@ -0,0 +1,45 @@ +/* + * SysTestScheduler.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#ifndef SYS_TEST_SCHEDULER_H +#define SYS_TEST_SCHEDULER_H + +#include + +namespace FDBSystemTester { + +using TTaskFct = std::function; + +class IScheduler { +public: + virtual ~IScheduler() {} + virtual void start() = 0; + virtual void schedule(TTaskFct task) = 0; + virtual void stop() = 0; + virtual void join() = 0; +}; + +IScheduler* createScheduler(int numThreads); + +} // namespace FDBSystemTester + +#endif \ No newline at end of file diff --git a/bindings/c/test/system/SysTestTransactionExecutor.cpp b/bindings/c/test/system/SysTestTransactionExecutor.cpp new file mode 100644 index 0000000000..04326f24f0 --- /dev/null +++ b/bindings/c/test/system/SysTestTransactionExecutor.cpp @@ -0,0 +1,170 @@ +/* + * SysTestTransactionExecutor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "SysTestTransactionExecutor.h" +#include +#include + +namespace FDBSystemTester { + +namespace { + +void fdb_check(fdb_error_t e) { + if (e) { + std::cerr << fdb_get_error(e) << std::endl; + std::abort(); + } +} + +} // namespace + +class TransactionContext : public ITransactionContext { +public: + TransactionContext(FDBTransaction* tx, + ITransactionActor* txActor, + TTaskFct cont, + const TransactionExecutorOptions& options, + IScheduler* scheduler) + : options(options), fdbTx(tx), txActor(txActor), contAfterDone(cont), scheduler(scheduler), finalError(0) {} + + Transaction* tx() override { return &fdbTx; } + void continueAfter(Future& f, TTaskFct cont) override { doContinueAfter(f, cont); } + void commit() override { + currFuture = fdbTx.commit(); + doContinueAfter(currFuture, [this]() { done(); }); + } + void done() override { + TTaskFct cont = contAfterDone; + delete this; + cont(); + } + std::string_view dbKey(std::string_view key) override { + std::string keyWithPrefix(options.prefix); + keyWithPrefix.append(key); + return key; + } + +private: + void doContinueAfter(Future& f, TTaskFct cont) { + if (options.blockOnFutures) { + blockingContinueAfter(f, cont); + } else { + asyncContinueAfter(f, cont); + } + } + + void blockingContinueAfter(Future& f, TTaskFct cont) { + Future* fptr = &f; + scheduler->schedule([this, fptr, cont]() { + fdb_check(fdb_future_block_until_ready(fptr->fdbFuture())); + fdb_error_t err = fptr->getError(); + if (err) { + currFuture = fdbTx.onError(err); + fdb_check(fdb_future_block_until_ready(currFuture.fdbFuture())); + handleOnErrorResult(); + } else { + cont(); + } + }); + } + + void asyncContinueAfter(Future& f, TTaskFct cont) { + currCont = cont; + fdb_check(fdb_future_set_callback(f.fdbFuture(), futureReadyCallback, this)); + } + + static void futureReadyCallback(FDBFuture* f, void* param) { + TransactionContext* txCtx = (TransactionContext*)param; + txCtx->onFutureReady(f); + } + + void onFutureReady(FDBFuture* 
f) { + fdb_error_t err = fdb_future_get_error(f); + if (err) { + currFuture = tx()->onError(err); + fdb_check(fdb_future_set_callback(currFuture.fdbFuture(), onErrorReadyCallback, this)); + } else { + scheduler->schedule(currCont); + } + } + + static void onErrorReadyCallback(FDBFuture* f, void* param) { + TransactionContext* txCtx = (TransactionContext*)param; + txCtx->onErrorReady(f); + } + + void onErrorReady(FDBFuture* f) { + scheduler->schedule([this]() { handleOnErrorResult(); }); + } + + void handleOnErrorResult() { + fdb_error_t err = currFuture.getError(); + if (err) { + finalError = err; + done(); + } else { + txActor->reset(); + txActor->start(); + } + } + + const TransactionExecutorOptions& options; + Transaction fdbTx; + ITransactionActor* txActor; + TTaskFct currCont; + TTaskFct contAfterDone; + IScheduler* scheduler; + fdb_error_t finalError; + EmptyFuture currFuture; +}; + +class TransactionExecutor : public ITransactionExecutor { +public: + TransactionExecutor() : db(nullptr), scheduler(nullptr) {} + + ~TransactionExecutor() { release(); } + + void init(IScheduler* scheduler, const char* clusterFile, const TransactionExecutorOptions& options) override { + this->scheduler = scheduler; + this->options = options; + fdb_check(fdb_create_database(clusterFile, &db)); + } + + void execute(ITransactionActor* txActor, TTaskFct cont) override { + FDBTransaction* tx; + fdb_check(fdb_database_create_transaction(db, &tx)); + TransactionContext* ctx = new TransactionContext(tx, txActor, cont, options, scheduler); + txActor->init(ctx); + txActor->start(); + } + + void release() override { fdb_database_destroy(db); } + +private: + FDBDatabase* db; + TransactionExecutorOptions options; + IScheduler* scheduler; +}; + +ITransactionExecutor* createTransactionExecutor() { + return new TransactionExecutor(); +} + +} // namespace FDBSystemTester \ No newline at end of file diff --git a/bindings/c/test/system/SysTestTransactionExecutor.h 
b/bindings/c/test/system/SysTestTransactionExecutor.h new file mode 100644 index 0000000000..acede196cc --- /dev/null +++ b/bindings/c/test/system/SysTestTransactionExecutor.h @@ -0,0 +1,82 @@ +/* + * SysTestTransactionExecutor.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifndef SYS_TEST_TRANSACTION_EXECUTOR_H +#define SYS_TEST_TRANSACTION_EXECUTOR_H + +#include "SysTestOptions.h" +#include "SysTestApiWrapper.h" +#include "SysTestScheduler.h" +#include + +namespace FDBSystemTester { + +class ITransactionContext { +public: + virtual ~ITransactionContext() {} + virtual Transaction* tx() = 0; + virtual void continueAfter(Future& f, TTaskFct cont) = 0; + virtual void commit() = 0; + virtual void done() = 0; + virtual std::string_view dbKey(std::string_view key) = 0; +}; + +class ITransactionActor { +public: + virtual ~ITransactionActor() {} + virtual void init(ITransactionContext* ctx) = 0; + virtual void start() = 0; + virtual void reset() = 0; +}; + +class TransactionActorBase : public ITransactionActor { +public: + void init(ITransactionContext* ctx) override { context = ctx; } + +protected: + ITransactionContext* ctx() { return context; } + Transaction* tx() { return ctx()->tx(); } + std::string_view dbKey(std::string_view key) { return ctx()->dbKey(key); } + void commit() { 
ctx()->commit(); } + +private: + ITransactionContext* context = nullptr; +}; + +struct TransactionExecutorOptions { + std::string prefix = ""; + bool blockOnFutures = false; +}; + +class ITransactionExecutor { +public: + virtual ~ITransactionExecutor() {} + virtual void init(IScheduler* sched, const char* clusterFile, const TransactionExecutorOptions& options) = 0; + virtual void execute(ITransactionActor* tx, TTaskFct cont) = 0; + virtual void release() = 0; +}; + +ITransactionExecutor* createTransactionExecutor(); + +} // namespace FDBSystemTester + +#endif \ No newline at end of file diff --git a/bindings/c/test/system/SysTestWorkload.cpp b/bindings/c/test/system/SysTestWorkload.cpp new file mode 100644 index 0000000000..f58f5e589a --- /dev/null +++ b/bindings/c/test/system/SysTestWorkload.cpp @@ -0,0 +1,35 @@ +#include "SysTestWorkload.h" + +namespace FDBSystemTester { + +void WorkloadBase::init(ITransactionExecutor* txExecutor, IScheduler* sched, TTaskFct cont) { + this->txExecutor = txExecutor; + this->scheduler = sched; + this->doneCont = cont; +} + +void WorkloadBase::schedule(TTaskFct task) { + tasksScheduled++; + scheduler->schedule([this, task]() { + tasksScheduled--; + task(); + contIfDone(); + }); +} + +void WorkloadBase::execTransaction(ITransactionActor* tx, TTaskFct cont) { + txRunning++; + txExecutor->execute(tx, [this, cont]() { + txRunning--; + cont(); + contIfDone(); + }); +} + +void WorkloadBase::contIfDone() { + if (txRunning == 0 && tasksScheduled == 0) { + doneCont(); + } +} + +} // namespace FDBSystemTester \ No newline at end of file diff --git a/bindings/c/test/system/SysTestWorkload.h b/bindings/c/test/system/SysTestWorkload.h new file mode 100644 index 0000000000..5fbc338c7c --- /dev/null +++ b/bindings/c/test/system/SysTestWorkload.h @@ -0,0 +1,57 @@ +/* + * SysTestWorkload.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifndef SYS_TEST_WORKLOAD_H +#define SYS_TEST_WORKLOAD_H + +#include "SysTestTransactionExecutor.h" + +namespace FDBSystemTester { + +class IWorkload { +public: + virtual ~IWorkload() {} + virtual void init(ITransactionExecutor* txExecutor, IScheduler* sched, TTaskFct cont) = 0; + virtual void start() = 0; +}; + +class WorkloadBase : public IWorkload { +public: + WorkloadBase() : txExecutor(nullptr), scheduler(nullptr), tasksScheduled(0), txRunning(0) {} + void init(ITransactionExecutor* txExecutor, IScheduler* sched, TTaskFct cont) override; + +protected: + void schedule(TTaskFct task); + void execTransaction(ITransactionActor* tx, TTaskFct cont); + void contIfDone(); + +private: + ITransactionExecutor* txExecutor; + IScheduler* scheduler; + TTaskFct doneCont; + std::atomic tasksScheduled; + std::atomic txRunning; +}; + +} // namespace FDBSystemTester + +#endif \ No newline at end of file diff --git a/bindings/c/test/system/fdb_c_system_tester.cpp b/bindings/c/test/system/fdb_c_system_tester.cpp index 179d9f5232..ef8bff5cf7 100644 --- a/bindings/c/test/system/fdb_c_system_tester.cpp +++ b/bindings/c/test/system/fdb_c_system_tester.cpp @@ -18,17 +18,37 @@ * limitations under the License. 
*/ -#include "TesterOptions.h" +#include "SysTestOptions.h" +#include "SysTestWorkload.h" #include "flow/Platform.h" #include "flow/Trace.h" #include "flow/ArgParseUtil.h" +#include "test/system/SysTestScheduler.h" +#include "test/system/SysTestTransactionExecutor.h" +#include +#include #define FDB_API_VERSION 710 #include "bindings/c/foundationdb/fdb_c.h" namespace FDBSystemTester { -const CSimpleOpt::SOption TesterOptions::optionDefs[] = // +namespace { + +enum TesterOptionId { + OPT_CONNFILE, + OPT_HELP, + OPT_TRACE, + OPT_TRACE_DIR, + OPT_LOGGROUP, + OPT_TRACE_FORMAT, + OPT_KNOB, + OPT_API_VERSION, + OPT_BLOCK_ON_FUTURES, + OPT_NUM_CLIENT_THREADS +}; + +CSimpleOpt::SOption TesterOptionDefs[] = // { { OPT_CONNFILE, "-C", SO_REQ_SEP }, { OPT_CONNFILE, "--cluster-file", SO_REQ_SEP }, { OPT_TRACE, "--log", SO_NONE }, @@ -38,7 +58,11 @@ const CSimpleOpt::SOption TesterOptions::optionDefs[] = // { OPT_HELP, "--help", SO_NONE }, { OPT_TRACE_FORMAT, "--trace-format", SO_REQ_SEP }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, - { OPT_API_VERSION, "--api-version", SO_REQ_SEP } }; + { OPT_API_VERSION, "--api-version", SO_REQ_SEP }, + { OPT_BLOCK_ON_FUTURES, "--block-on-futures", SO_NONE }, + { OPT_NUM_CLIENT_THREADS, "--num-client-threads", SO_REQ_SEP } }; + +} // namespace void TesterOptions::printProgramUsage(const char* execName) { printf("usage: %s [OPTIONS]\n" @@ -63,13 +87,17 @@ void TesterOptions::printProgramUsage(const char* execName) { " Specifies the version of the API for the CLI to use.\n" " --knob-KNOBNAME KNOBVALUE\n" " Changes a knob option. 
KNOBNAME should be lowercase.\n" + " --block-on-futures\n" + " Use blocking waits on futures instead of scheduling callbacks.\n" + " --num-client-threads NUM_THREADS\n" + " Number of threads to be used for execution of client workloads.\n" " -h, --help Display this help and exit.\n"); } bool TesterOptions::parseArgs(int argc, char** argv) { // declare our options parser, pass in the arguments from main // as well as our array of valid options. - CSimpleOpt args(argc, argv, optionDefs); + CSimpleOpt args(argc, argv, TesterOptionDefs); // while there are arguments left to process while (args.Next()) { @@ -100,14 +128,14 @@ bool TesterOptions::processArg(const CSimpleOpt& args) { api_version = strtoul((char*)args.OptionArg(), &endptr, 10); if (*endptr != '\0') { fprintf(stderr, "ERROR: invalid client version %s\n", args.OptionArg()); - return 1; + return false; } else if (api_version < 700 || api_version > FDB_API_VERSION) { // multi-version fdbcli only available after 7.0 fprintf(stderr, "ERROR: api version %s is not supported. 
(Min: 700, Max: %d)\n", args.OptionArg(), FDB_API_VERSION); - return 1; + return false; } break; } @@ -130,24 +158,73 @@ bool TesterOptions::processArg(const CSimpleOpt& args) { Optional knobName = extractPrefixedArgument("--knob", args.OptionSyntax()); if (!knobName.present()) { fprintf(stderr, "ERROR: unable to parse knob option '%s'\n", args.OptionSyntax()); - return FDB_EXIT_ERROR; + return false; } knobs.emplace_back(knobName.get(), args.OptionArg()); break; } + case OPT_BLOCK_ON_FUTURES: + blockOnFutures = true; + break; + + case OPT_NUM_CLIENT_THREADS: + char* endptr; + numClientThreads = strtoul((char*)args.OptionArg(), &endptr, 10); + if (*endptr != '\0' || numClientThreads <= 0 || numClientThreads > 1000) { + fprintf(stderr, "ERROR: number of threads %s\n", args.OptionArg()); + return false; + } + break; } return true; } +namespace { +void fdb_check(fdb_error_t e) { + if (e) { + std::cerr << fdb_get_error(e) << std::endl; + std::abort(); + } +} +} // namespace + +IWorkload* createApiCorrectnessWorkload(); + } // namespace FDBSystemTester using namespace FDBSystemTester; +void runApiCorrectness(TesterOptions& options) { + TransactionExecutorOptions txExecOptions; + txExecOptions.blockOnFutures = options.blockOnFutures; + + IScheduler* scheduler = createScheduler(options.numClientThreads); + ITransactionExecutor* txExecutor = createTransactionExecutor(); + scheduler->start(); + txExecutor->init(scheduler, options.clusterFile.c_str(), txExecOptions); + IWorkload* workload = createApiCorrectnessWorkload(); + workload->init(txExecutor, scheduler, [scheduler]() { scheduler->stop(); }); + workload->start(); + scheduler->join(); + delete workload; + delete txExecutor; + delete scheduler; +} + int main(int argc, char** argv) { TesterOptions options; if (!options.parseArgs(argc, argv)) { return 1; } + fdb_check(fdb_select_api_version(options.api_version)); + fdb_check(fdb_setup_network()); + + std::thread network_thread{ &fdb_run_network }; + + 
runApiCorrectness(options); + + fdb_check(fdb_stop_network()); + network_thread.join(); return 0; } From 45d0815218798687c7d7ed9394fd0da836ebba60 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Mon, 21 Feb 2022 18:54:51 +0100 Subject: [PATCH 010/138] SysTester: Load balance over multiple databases --- bindings/c/test/system/SysTestOptions.h | 1 + .../system/SysTestTransactionExecutor.cpp | 20 +++++-- .../test/system/SysTestTransactionExecutor.h | 1 + .../c/test/system/fdb_c_system_tester.cpp | 54 +++++++++++-------- 4 files changed, 50 insertions(+), 26 deletions(-) diff --git a/bindings/c/test/system/SysTestOptions.h b/bindings/c/test/system/SysTestOptions.h index 36ad529f67..de839befce 100644 --- a/bindings/c/test/system/SysTestOptions.h +++ b/bindings/c/test/system/SysTestOptions.h @@ -46,6 +46,7 @@ public: int api_version = FDB_API_VERSION; bool blockOnFutures = false; int numClientThreads = 1; + int numDatabases = 1; bool parseArgs(int argc, char** argv); diff --git a/bindings/c/test/system/SysTestTransactionExecutor.cpp b/bindings/c/test/system/SysTestTransactionExecutor.cpp index 04326f24f0..2122415241 100644 --- a/bindings/c/test/system/SysTestTransactionExecutor.cpp +++ b/bindings/c/test/system/SysTestTransactionExecutor.cpp @@ -19,6 +19,7 @@ */ #include "SysTestTransactionExecutor.h" +#include "flow/IRandom.h" #include #include @@ -137,28 +138,37 @@ private: class TransactionExecutor : public ITransactionExecutor { public: - TransactionExecutor() : db(nullptr), scheduler(nullptr) {} + TransactionExecutor() : scheduler(nullptr) {} ~TransactionExecutor() { release(); } void init(IScheduler* scheduler, const char* clusterFile, const TransactionExecutorOptions& options) override { this->scheduler = scheduler; this->options = options; - fdb_check(fdb_create_database(clusterFile, &db)); + for (int i = 0; i < options.numDatabases; i++) { + FDBDatabase* db; + fdb_check(fdb_create_database(clusterFile, &db)); + databases.push_back(db); + } } void 
execute(ITransactionActor* txActor, TTaskFct cont) override { + int idx = deterministicRandom()->randomInt(0, options.numDatabases); FDBTransaction* tx; - fdb_check(fdb_database_create_transaction(db, &tx)); + fdb_check(fdb_database_create_transaction(databases[idx], &tx)); TransactionContext* ctx = new TransactionContext(tx, txActor, cont, options, scheduler); txActor->init(ctx); txActor->start(); } - void release() override { fdb_database_destroy(db); } + void release() override { + for (FDBDatabase* db : databases) { + fdb_database_destroy(db); + } + } private: - FDBDatabase* db; + std::vector databases; TransactionExecutorOptions options; IScheduler* scheduler; }; diff --git a/bindings/c/test/system/SysTestTransactionExecutor.h b/bindings/c/test/system/SysTestTransactionExecutor.h index acede196cc..e88219ed38 100644 --- a/bindings/c/test/system/SysTestTransactionExecutor.h +++ b/bindings/c/test/system/SysTestTransactionExecutor.h @@ -65,6 +65,7 @@ private: struct TransactionExecutorOptions { std::string prefix = ""; bool blockOnFutures = false; + int numDatabases = 1; }; class ITransactionExecutor { diff --git a/bindings/c/test/system/fdb_c_system_tester.cpp b/bindings/c/test/system/fdb_c_system_tester.cpp index ef8bff5cf7..56cbdadae3 100644 --- a/bindings/c/test/system/fdb_c_system_tester.cpp +++ b/bindings/c/test/system/fdb_c_system_tester.cpp @@ -45,7 +45,8 @@ enum TesterOptionId { OPT_KNOB, OPT_API_VERSION, OPT_BLOCK_ON_FUTURES, - OPT_NUM_CLIENT_THREADS + OPT_NUM_CLIENT_THREADS, + OPT_NUM_DATABASES }; CSimpleOpt::SOption TesterOptionDefs[] = // @@ -60,7 +61,8 @@ CSimpleOpt::SOption TesterOptionDefs[] = // { OPT_KNOB, "--knob-", SO_REQ_SEP }, { OPT_API_VERSION, "--api-version", SO_REQ_SEP }, { OPT_BLOCK_ON_FUTURES, "--block-on-futures", SO_NONE }, - { OPT_NUM_CLIENT_THREADS, "--num-client-threads", SO_REQ_SEP } }; + { OPT_NUM_CLIENT_THREADS, "--num-client-threads", SO_REQ_SEP }, + { OPT_NUM_DATABASES, "--num-databases", SO_REQ_SEP } }; } // namespace @@ 
-91,6 +93,8 @@ void TesterOptions::printProgramUsage(const char* execName) { " Use blocking waits on futures instead of scheduling callbacks.\n" " --num-client-threads NUM_THREADS\n" " Number of threads to be used for execution of client workloads.\n" + " --num-databases NUM_DB\n" + " Number of database connections to be used concurrently.\n" " -h, --help Display this help and exit.\n"); } @@ -118,25 +122,32 @@ bool TesterOptions::parseArgs(int argc, char** argv) { return true; } +namespace { + +bool processIntArg(const CSimpleOpt& args, int& res, int minVal, int maxVal) { + char* endptr; + res = strtol(args.OptionArg(), &endptr, 10); + if (*endptr != '\0') { + fprintf(stderr, "ERROR: invalid value %s for %s\n", args.OptionArg(), args.OptionText()); + return false; + } + if (res < minVal || res > maxVal) { + fprintf(stderr, "ERROR: value for %s must be between %d and %d\n", args.OptionText(), minVal, maxVal); + return false; + } + return true; +} + +} // namespace + bool TesterOptions::processArg(const CSimpleOpt& args) { switch (args.OptionId()) { case OPT_CONNFILE: clusterFile = args.OptionArg(); break; case OPT_API_VERSION: { - char* endptr; - api_version = strtoul((char*)args.OptionArg(), &endptr, 10); - if (*endptr != '\0') { - fprintf(stderr, "ERROR: invalid client version %s\n", args.OptionArg()); - return false; - } else if (api_version < 700 || api_version > FDB_API_VERSION) { - // multi-version fdbcli only available after 7.0 - fprintf(stderr, - "ERROR: api version %s is not supported. 
(Min: 700, Max: %d)\n", - args.OptionArg(), - FDB_API_VERSION); - return false; - } + // multi-version fdbcli only available after 7.0 + processIntArg(args, api_version, 700, FDB_API_VERSION); break; } case OPT_TRACE: @@ -168,12 +179,11 @@ bool TesterOptions::processArg(const CSimpleOpt& args) { break; case OPT_NUM_CLIENT_THREADS: - char* endptr; - numClientThreads = strtoul((char*)args.OptionArg(), &endptr, 10); - if (*endptr != '\0' || numClientThreads <= 0 || numClientThreads > 1000) { - fprintf(stderr, "ERROR: number of threads %s\n", args.OptionArg()); - return false; - } + processIntArg(args, numClientThreads, 1, 1000); + break; + + case OPT_NUM_DATABASES: + processIntArg(args, numDatabases, 1, 1000); break; } return true; @@ -197,6 +207,7 @@ using namespace FDBSystemTester; void runApiCorrectness(TesterOptions& options) { TransactionExecutorOptions txExecOptions; txExecOptions.blockOnFutures = options.blockOnFutures; + txExecOptions.numDatabases = options.numDatabases; IScheduler* scheduler = createScheduler(options.numClientThreads); ITransactionExecutor* txExecutor = createTransactionExecutor(); @@ -206,6 +217,7 @@ void runApiCorrectness(TesterOptions& options) { workload->init(txExecutor, scheduler, [scheduler]() { scheduler->stop(); }); workload->start(); scheduler->join(); + delete workload; delete txExecutor; delete scheduler; From e50c4320f043b45f044181bbf1d68c1d0d3381e9 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Tue, 22 Feb 2022 17:25:57 +0100 Subject: [PATCH 011/138] SysTester: Adding options for testing with external library & multithreaded client --- bindings/c/test/system/SysTestApiWrapper.cpp | 16 ++++++++ bindings/c/test/system/SysTestApiWrapper.h | 11 ++++++ bindings/c/test/system/SysTestOptions.h | 2 + .../c/test/system/fdb_c_system_tester.cpp | 38 +++++++++++++++++-- 4 files changed, 63 insertions(+), 4 deletions(-) diff --git a/bindings/c/test/system/SysTestApiWrapper.cpp b/bindings/c/test/system/SysTestApiWrapper.cpp index 
c73d0a09ad..3b39d5bf87 100644 --- a/bindings/c/test/system/SysTestApiWrapper.cpp +++ b/bindings/c/test/system/SysTestApiWrapper.cpp @@ -78,8 +78,24 @@ EmptyFuture Transaction::onError(fdb_error_t err) { return EmptyFuture(fdb_transaction_on_error(tx_, err)); } +void Transaction::reset() { + fdb_transaction_reset(tx_); +} + Transaction::~Transaction() { fdb_transaction_destroy(tx_); } +fdb_error_t FdbApi::setOption(FDBNetworkOption option, std::string_view value) { + return fdb_network_set_option(option, reinterpret_cast(value.data()), value.size()); +} + +fdb_error_t FdbApi::setOption(FDBNetworkOption option, int64_t value) { + return fdb_network_set_option(option, reinterpret_cast(&value), sizeof(value)); +} + +fdb_error_t FdbApi::setOption(FDBNetworkOption option) { + return fdb_network_set_option(option, reinterpret_cast(""), 0); +} + } // namespace FDBSystemTester \ No newline at end of file diff --git a/bindings/c/test/system/SysTestApiWrapper.h b/bindings/c/test/system/SysTestApiWrapper.h index 690ccd3fba..575aa5e373 100644 --- a/bindings/c/test/system/SysTestApiWrapper.h +++ b/bindings/c/test/system/SysTestApiWrapper.h @@ -40,6 +40,9 @@ public: virtual ~Future(); Future& operator=(Future&& other) { + if (future_) { + reset(); + } future_ = other.future_; other.future_ = nullptr; return *this; @@ -77,11 +80,19 @@ public: void set(std::string_view key, std::string_view value); EmptyFuture commit(); EmptyFuture onError(fdb_error_t err); + void reset(); private: FDBTransaction* tx_; }; +class FdbApi { +public: + static fdb_error_t setOption(FDBNetworkOption option, std::string_view value); + static fdb_error_t setOption(FDBNetworkOption option, int64_t value); + static fdb_error_t setOption(FDBNetworkOption option); +}; + } // namespace FDBSystemTester #endif \ No newline at end of file diff --git a/bindings/c/test/system/SysTestOptions.h b/bindings/c/test/system/SysTestOptions.h index de839befce..7b5ad21e05 100644 --- a/bindings/c/test/system/SysTestOptions.h 
+++ b/bindings/c/test/system/SysTestOptions.h @@ -47,6 +47,8 @@ public: bool blockOnFutures = false; int numClientThreads = 1; int numDatabases = 1; + std::string externalClientLibrary; + int numFdbThreads = 1; bool parseArgs(int argc, char** argv); diff --git a/bindings/c/test/system/fdb_c_system_tester.cpp b/bindings/c/test/system/fdb_c_system_tester.cpp index 56cbdadae3..f3f6dbe06b 100644 --- a/bindings/c/test/system/fdb_c_system_tester.cpp +++ b/bindings/c/test/system/fdb_c_system_tester.cpp @@ -46,7 +46,9 @@ enum TesterOptionId { OPT_API_VERSION, OPT_BLOCK_ON_FUTURES, OPT_NUM_CLIENT_THREADS, - OPT_NUM_DATABASES + OPT_NUM_DATABASES, + OPT_EXTERNAL_CLIENT_LIBRARY, + OPT_NUM_FDB_THREADS }; CSimpleOpt::SOption TesterOptionDefs[] = // @@ -62,7 +64,9 @@ CSimpleOpt::SOption TesterOptionDefs[] = // { OPT_API_VERSION, "--api-version", SO_REQ_SEP }, { OPT_BLOCK_ON_FUTURES, "--block-on-futures", SO_NONE }, { OPT_NUM_CLIENT_THREADS, "--num-client-threads", SO_REQ_SEP }, - { OPT_NUM_DATABASES, "--num-databases", SO_REQ_SEP } }; + { OPT_NUM_DATABASES, "--num-databases", SO_REQ_SEP }, + { OPT_EXTERNAL_CLIENT_LIBRARY, "--external-client-library", SO_REQ_SEP }, + { OPT_NUM_FDB_THREADS, "--num-fdb-threads", SO_REQ_SEP } }; } // namespace @@ -91,10 +95,14 @@ void TesterOptions::printProgramUsage(const char* execName) { " Changes a knob option. 
KNOBNAME should be lowercase.\n" " --block-on-futures\n" " Use blocking waits on futures instead of scheduling callbacks.\n" - " --num-client-threads NUM_THREADS\n" + " --num-client-threads NUMBER\n" " Number of threads to be used for execution of client workloads.\n" - " --num-databases NUM_DB\n" + " --num-databases NUMBER\n" " Number of database connections to be used concurrently.\n" + " --external-client-library FILE_PATH\n" + " Path to the external client library.\n" + " --num-fdb-threads NUMBER\n" + " Number of FDB client threads to be created.\n" " -h, --help Display this help and exit.\n"); } @@ -185,6 +193,14 @@ bool TesterOptions::processArg(const CSimpleOpt& args) { case OPT_NUM_DATABASES: processIntArg(args, numDatabases, 1, 1000); break; + + case OPT_EXTERNAL_CLIENT_LIBRARY: + externalClientLibrary = args.OptionArg(); + break; + + case OPT_NUM_FDB_THREADS: + processIntArg(args, numFdbThreads, 1, 1000); + break; } return true; } @@ -204,6 +220,19 @@ IWorkload* createApiCorrectnessWorkload(); using namespace FDBSystemTester; +void applyNetworkOptions(TesterOptions& options) { + if (!options.externalClientLibrary.empty()) { + fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_DISABLE_LOCAL_CLIENT)); + fdb_check( + FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_EXTERNAL_CLIENT_LIBRARY, options.externalClientLibrary)); + } + + if (options.numFdbThreads > 1) { + fdb_check( + FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_THREADS_PER_VERSION, options.numFdbThreads)); + } +} + void runApiCorrectness(TesterOptions& options) { TransactionExecutorOptions txExecOptions; txExecOptions.blockOnFutures = options.blockOnFutures; @@ -230,6 +259,7 @@ int main(int argc, char** argv) { } fdb_check(fdb_select_api_version(options.api_version)); + applyNetworkOptions(options); fdb_check(fdb_setup_network()); std::thread network_thread{ &fdb_run_network }; From a8828db58e99ab27f3a5f3fa056b9a2508330ed0 Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: 
Tue, 22 Feb 2022 10:40:36 -0800 Subject: [PATCH 012/138] Load balance dynamic knob requests This commit also removes an attempt to read the latest configuration snapshot when a rollforward timeout occurs. The normal retry loop will eventually fetch an up to date snapshot and the rollforward will be retried. --- fdbclient/PaxosConfigTransaction.actor.cpp | 26 ++++++---- fdbclient/ServerKnobs.cpp | 1 + fdbclient/ServerKnobs.h | 1 + fdbserver/ConfigBroadcaster.actor.cpp | 4 +- fdbserver/PaxosConfigConsumer.actor.cpp | 57 +++++++++++----------- 5 files changed, 49 insertions(+), 40 deletions(-) diff --git a/fdbclient/PaxosConfigTransaction.actor.cpp b/fdbclient/PaxosConfigTransaction.actor.cpp index c3dce23efb..b5a0c9dd53 100644 --- a/fdbclient/PaxosConfigTransaction.actor.cpp +++ b/fdbclient/PaxosConfigTransaction.actor.cpp @@ -22,6 +22,8 @@ #include "fdbclient/PaxosConfigTransaction.h" #include "flow/actorcompiler.h" // must be last include +using ConfigTransactionInfo = ModelInterface; + class CommitQuorum { ActorCollection actors{ false }; std::vector ctis; @@ -224,10 +226,12 @@ class PaxosConfigTransactionImpl { loop { try { ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration()); - // TODO: Load balance + state Reference configNodes( + new ConfigTransactionInfo(self->getGenerationQuorum.getReadReplicas(), false)); ConfigTransactionGetReply reply = - wait(timeoutError(self->getGenerationQuorum.getReadReplicas()[0].get.getReply( - ConfigTransactionGetRequest{ generation, configKey }), + wait(timeoutError(basicLoadBalance(configNodes, + &ConfigTransactionInterface::get, + ConfigTransactionGetRequest{ generation, configKey }), CLIENT_KNOBS->GET_KNOB_TIMEOUT)); if (reply.value.present()) { return reply.value.get().toValue(); @@ -245,10 +249,12 @@ class PaxosConfigTransactionImpl { ACTOR static Future getConfigClasses(PaxosConfigTransactionImpl* self) { ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration()); - // TODO: Load 
balance + state Reference configNodes( + new ConfigTransactionInfo(self->getGenerationQuorum.getReadReplicas(), false)); ConfigTransactionGetConfigClassesReply reply = - wait(retryBrokenPromise(self->getGenerationQuorum.getReadReplicas()[0].getClasses, - ConfigTransactionGetConfigClassesRequest{ generation })); + wait(basicLoadBalance(configNodes, + &ConfigTransactionInterface::getClasses, + ConfigTransactionGetConfigClassesRequest{ generation })); RangeResult result; result.reserve(result.arena(), reply.configClasses.size()); for (const auto& configClass : reply.configClasses) { @@ -259,10 +265,12 @@ class PaxosConfigTransactionImpl { ACTOR static Future getKnobs(PaxosConfigTransactionImpl* self, Optional configClass) { ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration()); - // TODO: Load balance + state Reference configNodes( + new ConfigTransactionInfo(self->getGenerationQuorum.getReadReplicas(), false)); ConfigTransactionGetKnobsReply reply = - wait(retryBrokenPromise(self->getGenerationQuorum.getReadReplicas()[0].getKnobs, - ConfigTransactionGetKnobsRequest{ generation, configClass })); + wait(basicLoadBalance(configNodes, + &ConfigTransactionInterface::getKnobs, + ConfigTransactionGetKnobsRequest{ generation, configClass })); RangeResult result; result.reserve(result.arena(), reply.knobNames.size()); for (const auto& knobName : reply.knobNames) { diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 752839ccc6..c2942476a6 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -698,6 +698,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( COORDINATOR_LEADER_CONNECTION_TIMEOUT, 20.0 ); // Dynamic Knobs (implementation) + init( COMPACTION_INTERVAL, isSimulated ? 
5.0 : 300.0 ); init( UPDATE_NODE_TIMEOUT, 3.0 ); init( GET_COMMITTED_VERSION_TIMEOUT, 3.0 ); init( GET_SNAPSHOT_AND_CHANGES_TIMEOUT, 3.0 ); diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index a813bad097..5a5c928340 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -639,6 +639,7 @@ public: double COORDINATOR_LEADER_CONNECTION_TIMEOUT; // Dynamic Knobs (implementation) + double COMPACTION_INTERVAL; double UPDATE_NODE_TIMEOUT; double GET_COMMITTED_VERSION_TIMEOUT; double GET_SNAPSHOT_AND_CHANGES_TIMEOUT; diff --git a/fdbserver/ConfigBroadcaster.actor.cpp b/fdbserver/ConfigBroadcaster.actor.cpp index f7467a41a0..c7b46a1f4d 100644 --- a/fdbserver/ConfigBroadcaster.actor.cpp +++ b/fdbserver/ConfigBroadcaster.actor.cpp @@ -390,9 +390,9 @@ public: this->coordinators = coordinators.configServers.size(); if (configDBType != ConfigDBType::DISABLED) { if (configDBType == ConfigDBType::SIMPLE) { - consumer = IConfigConsumer::createSimple(coordinators, 0.5, Optional{}); + consumer = IConfigConsumer::createSimple(coordinators, 0.5, SERVER_KNOBS->COMPACTION_INTERVAL); } else { - consumer = IConfigConsumer::createPaxos(coordinators, 0.5, Optional{}); + consumer = IConfigConsumer::createPaxos(coordinators, 0.5, SERVER_KNOBS->COMPACTION_INTERVAL); } TraceEvent(SevDebug, "ConfigBroadcasterStartingConsumer", id) .detail("Consumer", consumer->getID()) diff --git a/fdbserver/PaxosConfigConsumer.actor.cpp b/fdbserver/PaxosConfigConsumer.actor.cpp index ddae968d48..cc39b91df8 100644 --- a/fdbserver/PaxosConfigConsumer.actor.cpp +++ b/fdbserver/PaxosConfigConsumer.actor.cpp @@ -27,6 +27,8 @@ #include "fdbserver/Knobs.h" #include "flow/actorcompiler.h" // This must be the last #include. +using ConfigFollowerInfo = ModelInterface; + struct CommittedVersions { Version secondToLastCommitted; Version lastCommitted; @@ -81,35 +83,31 @@ class GetCommittedVersionQuorum { // Now roll node forward to match the largest committed version of // the replies. 
- // TODO: Load balance over quorum. Also need to catch - // error_code_process_behind and retry with the next ConfigNode in - // the quorum. - state ConfigFollowerInterface quorumCfi = self->replies[target][0]; + state Reference quorumCfi(new ConfigFollowerInfo(self->replies[target], false)); try { state Version lastSeenVersion = rollback.present() ? rollback.get() : nodeVersion.lastCommitted; - ConfigFollowerGetChangesReply reply = wait(timeoutError( - quorumCfi.getChanges.getReply(ConfigFollowerGetChangesRequest{ lastSeenVersion, target }), - SERVER_KNOBS->GET_COMMITTED_VERSION_TIMEOUT)); + ConfigFollowerGetChangesReply reply = + wait(timeoutError(basicLoadBalance(quorumCfi, + &ConfigFollowerInterface::getChanges, + ConfigFollowerGetChangesRequest{ lastSeenVersion, target }), + SERVER_KNOBS->GET_COMMITTED_VERSION_TIMEOUT)); wait(timeoutError(cfi.rollforward.getReply(ConfigFollowerRollforwardRequest{ rollback, nodeVersion.lastCommitted, target, reply.changes, reply.annotations }), SERVER_KNOBS->GET_COMMITTED_VERSION_TIMEOUT)); } catch (Error& e) { - if (e.code() == error_code_version_already_compacted) { - TEST(true); // PaxosConfigConsumer rollforward compacted ConfigNode - ConfigFollowerGetSnapshotAndChangesReply reply = wait(retryBrokenPromise( - quorumCfi.getSnapshotAndChanges, ConfigFollowerGetSnapshotAndChangesRequest{ target })); - wait(retryBrokenPromise( - cfi.rollforward, - ConfigFollowerRollforwardRequest{ - rollback, nodeVersion.lastCommitted, target, reply.changes, reply.annotations })); - } else if (e.code() == error_code_transaction_too_old) { + if (e.code() == error_code_transaction_too_old) { // Seeing this trace is not necessarily a problem. There // are legitimate scenarios where a ConfigNode could return // transaction_too_old in response to a rollforward // request. 
TraceEvent(SevInfo, "ConfigNodeRollforwardError").error(e); } else { - throw e; + // In the case of an already_compacted error, the retry + // loop will fetch the latest snapshot and a rollforward + // request will eventually be resent. + TEST(e.code() == + error_code_version_already_compacted); // PaxosConfigConsumer rollforward compacted ConfigNode + throw; } } } @@ -263,12 +261,14 @@ class PaxosConfigConsumerImpl { loop { self->resetCommittedVersionQuorum(); // TODO: This seems to fix a segfault, investigate more try { - // TODO: Load balance state Version committedVersion = wait(getCommittedVersion(self)); - ConfigFollowerGetSnapshotAndChangesReply reply = wait( - timeoutError(self->getCommittedVersionQuorum.getReadReplicas()[0].getSnapshotAndChanges.getReply( - ConfigFollowerGetSnapshotAndChangesRequest{ committedVersion }), - SERVER_KNOBS->GET_SNAPSHOT_AND_CHANGES_TIMEOUT)); + state Reference configNodes( + new ConfigFollowerInfo(self->getCommittedVersionQuorum.getReadReplicas(), false)); + ConfigFollowerGetSnapshotAndChangesReply reply = + wait(timeoutError(basicLoadBalance(configNodes, + &ConfigFollowerInterface::getSnapshotAndChanges, + ConfigFollowerGetSnapshotAndChangesRequest{ committedVersion }), + SERVER_KNOBS->GET_SNAPSHOT_AND_CHANGES_TIMEOUT)); TraceEvent(SevDebug, "ConfigConsumerGotSnapshotAndChanges", self->id) .detail("SnapshotVersion", reply.snapshotVersion) .detail("SnapshotSize", reply.snapshot.size()) @@ -313,13 +313,14 @@ class PaxosConfigConsumerImpl { // ConfigNodes changes to 1, 1, 2, the committed version // returned would be 1. 
if (committedVersion > self->lastSeenVersion) { - // TODO: Load balance to avoid always hitting the - // node at index 0 first ASSERT(self->getCommittedVersionQuorum.getReadReplicas().size() >= self->cfis.size() / 2 + 1); - ConfigFollowerGetChangesReply reply = wait( - timeoutError(self->getCommittedVersionQuorum.getReadReplicas()[0].getChanges.getReply( + state Reference configNodes( + new ConfigFollowerInfo(self->getCommittedVersionQuorum.getReadReplicas(), false)); + ConfigFollowerGetChangesReply reply = wait(timeoutError( + basicLoadBalance(configNodes, + &ConfigFollowerInterface::getChanges, ConfigFollowerGetChangesRequest{ self->lastSeenVersion, committedVersion }), - SERVER_KNOBS->FETCH_CHANGES_TIMEOUT)); + SERVER_KNOBS->FETCH_CHANGES_TIMEOUT)); for (const auto& versionedMutation : reply.changes) { TraceEvent te(SevDebug, "ConsumerFetchedMutation", self->id); te.detail("Version", versionedMutation.version) @@ -337,8 +338,6 @@ class PaxosConfigConsumerImpl { committedVersion, reply.annotations, self->getCommittedVersionQuorum.getReadReplicas()); - // TODO: Catch error_code_process_behind and retry with - // the next ConfigNode in the quorum. 
} else if (committedVersion == self->lastSeenVersion) { broadcaster->applyChanges({}, -1, {}, self->getCommittedVersionQuorum.getReadReplicas()); } From 803fc86e615aad2991db050aef46b15fecb3f40d Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Tue, 22 Feb 2022 16:21:33 -0800 Subject: [PATCH 013/138] Fix compaction issue when rolling nodes back --- fdbserver/ConfigNode.actor.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fdbserver/ConfigNode.actor.cpp b/fdbserver/ConfigNode.actor.cpp index be7d9a867c..a99fda55dc 100644 --- a/fdbserver/ConfigNode.actor.cpp +++ b/fdbserver/ConfigNode.actor.cpp @@ -495,7 +495,7 @@ class ConfigNodeImpl { } ACTOR static Future rollforward(ConfigNodeImpl* self, ConfigFollowerRollforwardRequest req) { - Version lastCompactedVersion = wait(getLastCompactedVersion(self)); + state Version lastCompactedVersion = wait(getLastCompactedVersion(self)); if (req.lastKnownCommitted < lastCompactedVersion) { req.reply.sendError(version_already_compacted()); return Void(); @@ -529,6 +529,10 @@ class ConfigNodeImpl { versionedAnnotationKey(currentGeneration.committedVersion + 1))); currentGeneration.committedVersion = req.rollback.get(); + if (req.rollback.get() < lastCompactedVersion) { + self->kvStore->set( + KeyValueRef(lastCompactedVersionKey, BinaryWriter::toValue(req.rollback.get(), IncludeVersion()))); + } // The mutation commit loop below should persist the new generation // to disk, so we don't need to do it here. 
} From f5d722b65b6b8789589c5ffe488a527c2f693e84 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Wed, 23 Feb 2022 15:43:01 +0100 Subject: [PATCH 014/138] SysTester: use boost io_context for scheduling --- bindings/c/test/system/SysTestScheduler.cpp | 112 +++----------------- 1 file changed, 15 insertions(+), 97 deletions(-) diff --git a/bindings/c/test/system/SysTestScheduler.cpp b/bindings/c/test/system/SysTestScheduler.cpp index e5abfe2c54..145e3d2cf0 100644 --- a/bindings/c/test/system/SysTestScheduler.cpp +++ b/bindings/c/test/system/SysTestScheduler.cpp @@ -20,127 +20,45 @@ #include "SysTestScheduler.h" -#include "flow/Arena.h" -#include "flow/ThreadPrimitives.h" -#include "flow/ThreadSafeQueue.h" -#include "flow/IRandom.h" -#include #include #include +#include + +using namespace boost::asio; namespace FDBSystemTester { -class SingleThreadedScheduler : public IScheduler { +class AsioScheduler : public IScheduler { public: - SingleThreadedScheduler() : stopRequested(false), sleeping(false), thr(nullptr) {} - - ~SingleThreadedScheduler() override { - if (thr) { - delete thr; - } - } + AsioScheduler(int numThreads) : numThreads(numThreads) {} void start() override { - assert(thr == nullptr); - assert(!stop_); - thr = new std::thread([this]() { this->threadMain(); }); - } - - void schedule(TTaskFct task) override { - taskQueue.push(task); - wake(); - } - - void stop() override { - if (stopRequested.exchange(true) == false) { - if (thr) { - wake(); - } - } - } - - void join() override { - assert(thr); - thr->join(); - } - -private: - void threadMain() { - while (!stopRequested) { - Optional t = taskQueue.pop(); - if (t.present()) { - t.get()(); - continue; - } - sleeping = true; - wakeEvent.block(); - sleeping = false; - continue; - } - } - - void wake() { - while (sleeping) { - wakeEvent.set(); - } - } - - ThreadSafeQueue taskQueue; - std::atomic stopRequested; - std::atomic sleeping; - Event wakeEvent; - std::thread* thr; -}; - -class MultiThreadedScheduler : 
public IScheduler { -public: - MultiThreadedScheduler(int numThreads) : numThreads(numThreads) { + work = require(io_ctx.get_executor(), execution::outstanding_work.tracked); for (int i = 0; i < numThreads; i++) { - schedulers.push_back(new SingleThreadedScheduler()); + threads.emplace_back([this]() { io_ctx.run(); }); } } - ~MultiThreadedScheduler() override { - for (auto sch : schedulers) { - delete sch; - } - } + void schedule(TTaskFct task) override { post(io_ctx, task); } - void start() override { - for (auto sch : schedulers) { - sch->start(); - } - } - - void schedule(TTaskFct task) override { - int idx = deterministicRandom()->randomInt(0, numThreads); - schedulers[idx]->schedule(task); - } - - void stop() override { - for (auto sch : schedulers) { - sch->stop(); - } - } + void stop() override { work = any_io_executor(); } void join() override { - for (auto sch : schedulers) { - sch->join(); + for (auto& th : threads) { + th.join(); } } private: - std::vector schedulers; int numThreads; + std::vector threads; + io_context io_ctx; + any_io_executor work; }; IScheduler* createScheduler(int numThreads) { assert(numThreads > 0 && numThreads <= 1000); - if (numThreads == 1) { - return new SingleThreadedScheduler(); - } else { - return new MultiThreadedScheduler(numThreads); - } + return new AsioScheduler(numThreads); } } // namespace FDBSystemTester \ No newline at end of file From e4311ae6611ea5f5f7d52ac5e74cee6145f54679 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Wed, 23 Feb 2022 15:43:38 +0100 Subject: [PATCH 015/138] SysTester: remove direct dependencies on flow --- bindings/c/CMakeLists.txt | 3 +- bindings/c/test/system/SysTestOptions.h | 8 -- .../system/SysTestTransactionExecutor.cpp | 7 +- .../c/test/system/fdb_c_system_tester.cpp | 119 +++++++++--------- 4 files changed, 69 insertions(+), 68 deletions(-) diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index df24f3170b..880480e5ee 100644 --- a/bindings/c/CMakeLists.txt +++ 
b/bindings/c/CMakeLists.txt @@ -109,6 +109,7 @@ if(NOT WIN32 AND NOT IS_ARM_MAC) test/system/SysTestTransactionExecutor.h test/system/SysTestWorkload.cpp test/system/SysTestWorkload.h + ../../flow/SimpleOpt.h ) if(OPEN_FOR_IDE) @@ -154,7 +155,7 @@ if(NOT WIN32 AND NOT IS_ARM_MAC) target_link_libraries(fdb_c_unit_tests_version_510 PRIVATE fdb_c Threads::Threads) target_link_libraries(trace_partial_file_suffix_test PRIVATE fdb_c Threads::Threads) target_link_libraries(disconnected_timeout_unit_tests PRIVATE fdb_c Threads::Threads) - target_link_libraries(fdb_c_system_tester PRIVATE fdb_c flow) + target_link_libraries(fdb_c_system_tester PRIVATE fdb_c Threads::Threads) # do not set RPATH for mako set_property(TARGET mako PROPERTY SKIP_BUILD_RPATH TRUE) diff --git a/bindings/c/test/system/SysTestOptions.h b/bindings/c/test/system/SysTestOptions.h index 7b5ad21e05..85be667843 100644 --- a/bindings/c/test/system/SysTestOptions.h +++ b/bindings/c/test/system/SysTestOptions.h @@ -23,12 +23,10 @@ #ifndef SYS_TEST_OPTIONS_TESTER_OPTIONS_H #define SYSTEM_TESTER_TESTER_OPTIONS_H -#include "flow/SimpleOpt.h" #include #include #define FDB_API_VERSION 710 -#include "bindings/c/foundationdb/fdb_c.h" namespace FDBSystemTester { @@ -49,12 +47,6 @@ public: int numDatabases = 1; std::string externalClientLibrary; int numFdbThreads = 1; - - bool parseArgs(int argc, char** argv); - -private: - bool processArg(const CSimpleOpt& args); - static void printProgramUsage(const char* execName); }; } // namespace FDBSystemTester diff --git a/bindings/c/test/system/SysTestTransactionExecutor.cpp b/bindings/c/test/system/SysTestTransactionExecutor.cpp index 2122415241..a299007006 100644 --- a/bindings/c/test/system/SysTestTransactionExecutor.cpp +++ b/bindings/c/test/system/SysTestTransactionExecutor.cpp @@ -19,9 +19,9 @@ */ #include "SysTestTransactionExecutor.h" -#include "flow/IRandom.h" #include #include +#include namespace FDBSystemTester { @@ -150,10 +150,12 @@ public: 
fdb_check(fdb_create_database(clusterFile, &db)); databases.push_back(db); } + std::random_device dev; + random.seed(dev()); } void execute(ITransactionActor* txActor, TTaskFct cont) override { - int idx = deterministicRandom()->randomInt(0, options.numDatabases); + int idx = std::uniform_int_distribution<>(0, options.numDatabases - 1)(random); FDBTransaction* tx; fdb_check(fdb_database_create_transaction(databases[idx], &tx)); TransactionContext* ctx = new TransactionContext(tx, txActor, cont, options, scheduler); @@ -171,6 +173,7 @@ private: std::vector databases; TransactionExecutorOptions options; IScheduler* scheduler; + std::mt19937 random; }; ITransactionExecutor* createTransactionExecutor() { diff --git a/bindings/c/test/system/fdb_c_system_tester.cpp b/bindings/c/test/system/fdb_c_system_tester.cpp index f3f6dbe06b..0de9cf1232 100644 --- a/bindings/c/test/system/fdb_c_system_tester.cpp +++ b/bindings/c/test/system/fdb_c_system_tester.cpp @@ -20,15 +20,11 @@ #include "SysTestOptions.h" #include "SysTestWorkload.h" -#include "flow/Platform.h" -#include "flow/Trace.h" -#include "flow/ArgParseUtil.h" -#include "test/system/SysTestScheduler.h" -#include "test/system/SysTestTransactionExecutor.h" +#include "SysTestScheduler.h" +#include "SysTestTransactionExecutor.h" #include #include - -#define FDB_API_VERSION 710 +#include "flow/SimpleOpt.h" #include "bindings/c/foundationdb/fdb_c.h" namespace FDBSystemTester { @@ -68,17 +64,14 @@ CSimpleOpt::SOption TesterOptionDefs[] = // { OPT_EXTERNAL_CLIENT_LIBRARY, "--external-client-library", SO_REQ_SEP }, { OPT_NUM_FDB_THREADS, "--num-fdb-threads", SO_REQ_SEP } }; -} // namespace - -void TesterOptions::printProgramUsage(const char* execName) { +void printProgramUsage(const char* execName) { printf("usage: %s [OPTIONS]\n" "\n", execName); printf(" -C CONNFILE The path of a file containing the connection string for the\n" - " FoundationDB cluster. 
The default is first the value of the\n" - " FDB_CLUSTER_FILE environment variable, then `./fdb.cluster',\n" + " FoundationDB cluster. The default is `fdb.cluster',\n" " then `%s'.\n", - platform::getDefaultClusterFilePath().c_str()); + "fdb.cluster"); printf(" --log Enables trace file logging for the CLI session.\n" " --log-dir PATH Specifes the output directory for trace files. If\n" " unspecified, defaults to the current directory. Has\n" @@ -106,32 +99,6 @@ void TesterOptions::printProgramUsage(const char* execName) { " -h, --help Display this help and exit.\n"); } -bool TesterOptions::parseArgs(int argc, char** argv) { - // declare our options parser, pass in the arguments from main - // as well as our array of valid options. - CSimpleOpt args(argc, argv, TesterOptionDefs); - - // while there are arguments left to process - while (args.Next()) { - if (args.LastError() == SO_SUCCESS) { - if (args.OptionId() == OPT_HELP) { - printProgramUsage(argv[0]); - return false; - } - if (!processArg(args)) { - return false; - } - } else { - printf("Invalid argument: %s\n", args.OptionText()); - printProgramUsage(argv[0]); - return false; - } - } - return true; -} - -namespace { - bool processIntArg(const CSimpleOpt& args, int& res, int minVal, int maxVal) { char* endptr; res = strtol(args.OptionArg(), &endptr, 10); @@ -146,66 +113,104 @@ bool processIntArg(const CSimpleOpt& args, int& res, int minVal, int maxVal) { return true; } -} // namespace +// Extracts the key for command line arguments that are specified with a prefix (e.g. --knob-). +// This function converts any hyphens in the extracted key to underscores. 
+bool extractPrefixedArgument(std::string prefix, const std::string& arg, std::string& res) { + if (arg.size() <= prefix.size() || arg.find(prefix) != 0 || + (arg[prefix.size()] != '-' && arg[prefix.size()] != '_')) { + return false; + } -bool TesterOptions::processArg(const CSimpleOpt& args) { + res = arg.substr(prefix.size() + 1); + std::transform(res.begin(), res.end(), res.begin(), [](int c) { return c == '-' ? '_' : c; }); + return true; +} + +bool validateTraceFormat(std::string_view format) { + return format == "xml" || format == "json"; +} + +bool processArg(TesterOptions& options, const CSimpleOpt& args) { switch (args.OptionId()) { case OPT_CONNFILE: - clusterFile = args.OptionArg(); + options.clusterFile = args.OptionArg(); break; case OPT_API_VERSION: { // multi-version fdbcli only available after 7.0 - processIntArg(args, api_version, 700, FDB_API_VERSION); + processIntArg(args, options.api_version, 700, FDB_API_VERSION); break; } case OPT_TRACE: - trace = true; + options.trace = true; break; case OPT_TRACE_DIR: - traceDir = args.OptionArg(); + options.traceDir = args.OptionArg(); break; case OPT_LOGGROUP: - logGroup = args.OptionArg(); + options.logGroup = args.OptionArg(); break; case OPT_TRACE_FORMAT: if (!validateTraceFormat(args.OptionArg())) { fprintf(stderr, "WARNING: Unrecognized trace format `%s'\n", args.OptionArg()); } - traceFormat = args.OptionArg(); + options.traceFormat = args.OptionArg(); break; case OPT_KNOB: { - Optional knobName = extractPrefixedArgument("--knob", args.OptionSyntax()); - if (!knobName.present()) { + std::string knobName; + if (!extractPrefixedArgument("--knob", args.OptionSyntax(), knobName)) { fprintf(stderr, "ERROR: unable to parse knob option '%s'\n", args.OptionSyntax()); return false; } - knobs.emplace_back(knobName.get(), args.OptionArg()); + options.knobs.emplace_back(knobName, args.OptionArg()); break; } case OPT_BLOCK_ON_FUTURES: - blockOnFutures = true; + options.blockOnFutures = true; break; case 
OPT_NUM_CLIENT_THREADS: - processIntArg(args, numClientThreads, 1, 1000); + processIntArg(args, options.numClientThreads, 1, 1000); break; case OPT_NUM_DATABASES: - processIntArg(args, numDatabases, 1, 1000); + processIntArg(args, options.numDatabases, 1, 1000); break; case OPT_EXTERNAL_CLIENT_LIBRARY: - externalClientLibrary = args.OptionArg(); + options.externalClientLibrary = args.OptionArg(); break; case OPT_NUM_FDB_THREADS: - processIntArg(args, numFdbThreads, 1, 1000); + processIntArg(args, options.numFdbThreads, 1, 1000); break; } return true; } -namespace { +bool parseArgs(TesterOptions& options, int argc, char** argv) { + // declare our options parser, pass in the arguments from main + // as well as our array of valid options. + CSimpleOpt args(argc, argv, TesterOptionDefs); + + // while there are arguments left to process + while (args.Next()) { + if (args.LastError() == SO_SUCCESS) { + if (args.OptionId() == OPT_HELP) { + printProgramUsage(argv[0]); + return false; + } + if (!processArg(options, args)) { + return false; + } + } else { + printf("Invalid argument: %s\n", args.OptionText()); + printProgramUsage(argv[0]); + return false; + } + } + return true; +} + void fdb_check(fdb_error_t e) { if (e) { std::cerr << fdb_get_error(e) << std::endl; @@ -254,7 +259,7 @@ void runApiCorrectness(TesterOptions& options) { int main(int argc, char** argv) { TesterOptions options; - if (!options.parseArgs(argc, argv)) { + if (!parseArgs(options, argc, argv)) { return 1; } From 2fb8d6ac97592804706af21b16d414359934b57e Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Wed, 23 Feb 2022 16:51:48 +0100 Subject: [PATCH 016/138] SysTester: use unique_ptr instead of plain pointers --- .../test/system/SysTestCorrectnessWorkload.cpp | 5 +++-- bindings/c/test/system/SysTestScheduler.cpp | 5 +++-- bindings/c/test/system/SysTestScheduler.h | 2 +- .../test/system/SysTestTransactionExecutor.cpp | 5 +++-- .../c/test/system/SysTestTransactionExecutor.h | 2 +- 
bindings/c/test/system/fdb_c_system_tester.cpp | 18 ++++++++---------- 6 files changed, 19 insertions(+), 18 deletions(-) diff --git a/bindings/c/test/system/SysTestCorrectnessWorkload.cpp b/bindings/c/test/system/SysTestCorrectnessWorkload.cpp index ef229658cc..f5cfa8ccf6 100644 --- a/bindings/c/test/system/SysTestCorrectnessWorkload.cpp +++ b/bindings/c/test/system/SysTestCorrectnessWorkload.cpp @@ -18,6 +18,7 @@ * limitations under the License. */ #include "SysTestWorkload.h" +#include #include #include @@ -73,8 +74,8 @@ private: int numTxLeft; }; -IWorkload* createApiCorrectnessWorkload() { - return new ApiCorrectnessWorkload(); +std::unique_ptr createApiCorrectnessWorkload() { + return std::make_unique(); } } // namespace FDBSystemTester \ No newline at end of file diff --git a/bindings/c/test/system/SysTestScheduler.cpp b/bindings/c/test/system/SysTestScheduler.cpp index 145e3d2cf0..399d87d7f6 100644 --- a/bindings/c/test/system/SysTestScheduler.cpp +++ b/bindings/c/test/system/SysTestScheduler.cpp @@ -20,6 +20,7 @@ #include "SysTestScheduler.h" +#include #include #include #include @@ -56,9 +57,9 @@ private: any_io_executor work; }; -IScheduler* createScheduler(int numThreads) { +std::unique_ptr createScheduler(int numThreads) { assert(numThreads > 0 && numThreads <= 1000); - return new AsioScheduler(numThreads); + return std::make_unique(numThreads); } } // namespace FDBSystemTester \ No newline at end of file diff --git a/bindings/c/test/system/SysTestScheduler.h b/bindings/c/test/system/SysTestScheduler.h index e92bb8c5c4..687577e60e 100644 --- a/bindings/c/test/system/SysTestScheduler.h +++ b/bindings/c/test/system/SysTestScheduler.h @@ -38,7 +38,7 @@ public: virtual void join() = 0; }; -IScheduler* createScheduler(int numThreads); +std::unique_ptr createScheduler(int numThreads); } // namespace FDBSystemTester diff --git a/bindings/c/test/system/SysTestTransactionExecutor.cpp b/bindings/c/test/system/SysTestTransactionExecutor.cpp index 
a299007006..a9c39ed253 100644 --- a/bindings/c/test/system/SysTestTransactionExecutor.cpp +++ b/bindings/c/test/system/SysTestTransactionExecutor.cpp @@ -21,6 +21,7 @@ #include "SysTestTransactionExecutor.h" #include #include +#include #include namespace FDBSystemTester { @@ -176,8 +177,8 @@ private: std::mt19937 random; }; -ITransactionExecutor* createTransactionExecutor() { - return new TransactionExecutor(); +std::unique_ptr createTransactionExecutor() { + return std::make_unique(); } } // namespace FDBSystemTester \ No newline at end of file diff --git a/bindings/c/test/system/SysTestTransactionExecutor.h b/bindings/c/test/system/SysTestTransactionExecutor.h index e88219ed38..15383fd755 100644 --- a/bindings/c/test/system/SysTestTransactionExecutor.h +++ b/bindings/c/test/system/SysTestTransactionExecutor.h @@ -76,7 +76,7 @@ public: virtual void release() = 0; }; -ITransactionExecutor* createTransactionExecutor(); +std::unique_ptr createTransactionExecutor(); } // namespace FDBSystemTester diff --git a/bindings/c/test/system/fdb_c_system_tester.cpp b/bindings/c/test/system/fdb_c_system_tester.cpp index 0de9cf1232..257ed82658 100644 --- a/bindings/c/test/system/fdb_c_system_tester.cpp +++ b/bindings/c/test/system/fdb_c_system_tester.cpp @@ -23,6 +23,7 @@ #include "SysTestScheduler.h" #include "SysTestTransactionExecutor.h" #include +#include #include #include "flow/SimpleOpt.h" #include "bindings/c/foundationdb/fdb_c.h" @@ -219,7 +220,7 @@ void fdb_check(fdb_error_t e) { } } // namespace -IWorkload* createApiCorrectnessWorkload(); +std::unique_ptr createApiCorrectnessWorkload(); } // namespace FDBSystemTester @@ -243,18 +244,15 @@ void runApiCorrectness(TesterOptions& options) { txExecOptions.blockOnFutures = options.blockOnFutures; txExecOptions.numDatabases = options.numDatabases; - IScheduler* scheduler = createScheduler(options.numClientThreads); - ITransactionExecutor* txExecutor = createTransactionExecutor(); + std::unique_ptr scheduler = 
createScheduler(options.numClientThreads); + std::unique_ptr txExecutor = createTransactionExecutor(); scheduler->start(); - txExecutor->init(scheduler, options.clusterFile.c_str(), txExecOptions); - IWorkload* workload = createApiCorrectnessWorkload(); - workload->init(txExecutor, scheduler, [scheduler]() { scheduler->stop(); }); + txExecutor->init(scheduler.get(), options.clusterFile.c_str(), txExecOptions); + std::unique_ptr workload = createApiCorrectnessWorkload(); + IScheduler* schedPtr = scheduler.get(); + workload->init(txExecutor.get(), schedPtr, [schedPtr]() { schedPtr->stop(); }); workload->start(); scheduler->join(); - - delete workload; - delete txExecutor; - delete scheduler; } int main(int argc, char** argv) { From b61adc10bed852959f8e8f6365645830b53541c4 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Wed, 23 Feb 2022 17:51:58 +0100 Subject: [PATCH 017/138] SysTester: adding missing includes --- bindings/c/test/system/SysTestScheduler.h | 1 + bindings/c/test/system/SysTestTransactionExecutor.h | 1 + bindings/c/test/system/SysTestWorkload.h | 1 + 3 files changed, 3 insertions(+) diff --git a/bindings/c/test/system/SysTestScheduler.h b/bindings/c/test/system/SysTestScheduler.h index 687577e60e..b124959014 100644 --- a/bindings/c/test/system/SysTestScheduler.h +++ b/bindings/c/test/system/SysTestScheduler.h @@ -24,6 +24,7 @@ #define SYS_TEST_SCHEDULER_H #include +#include namespace FDBSystemTester { diff --git a/bindings/c/test/system/SysTestTransactionExecutor.h b/bindings/c/test/system/SysTestTransactionExecutor.h index 15383fd755..97cd0828e9 100644 --- a/bindings/c/test/system/SysTestTransactionExecutor.h +++ b/bindings/c/test/system/SysTestTransactionExecutor.h @@ -27,6 +27,7 @@ #include "SysTestApiWrapper.h" #include "SysTestScheduler.h" #include +#include namespace FDBSystemTester { diff --git a/bindings/c/test/system/SysTestWorkload.h b/bindings/c/test/system/SysTestWorkload.h index 5fbc338c7c..326264e126 100644 --- 
a/bindings/c/test/system/SysTestWorkload.h +++ b/bindings/c/test/system/SysTestWorkload.h @@ -24,6 +24,7 @@ #define SYS_TEST_WORKLOAD_H #include "SysTestTransactionExecutor.h" +#include namespace FDBSystemTester { From de46144af72d43c2b7c08bdee49f9c51c5419526 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Thu, 24 Feb 2022 13:59:12 +0100 Subject: [PATCH 018/138] SysTester: Use reference counting for transactions and futures; Define transaction workflow by lambdas --- bindings/c/test/system/SysTestApiWrapper.cpp | 39 ++++++--------- bindings/c/test/system/SysTestApiWrapper.h | 37 ++++---------- .../system/SysTestCorrectnessWorkload.cpp | 48 ++++++------------- .../system/SysTestTransactionExecutor.cpp | 28 ++++++----- .../test/system/SysTestTransactionExecutor.h | 16 ++++++- bindings/c/test/system/SysTestWorkload.cpp | 3 +- bindings/c/test/system/SysTestWorkload.h | 5 +- 7 files changed, 74 insertions(+), 102 deletions(-) diff --git a/bindings/c/test/system/SysTestApiWrapper.cpp b/bindings/c/test/system/SysTestApiWrapper.cpp index 3b39d5bf87..d0eff60647 100644 --- a/bindings/c/test/system/SysTestApiWrapper.cpp +++ b/bindings/c/test/system/SysTestApiWrapper.cpp @@ -34,56 +34,45 @@ void fdb_check(fdb_error_t e) { } // namespace -Future::~Future() { - if (future_) { - fdb_future_destroy(future_); - } -} +Future::Future(FDBFuture* f) : future_(f, fdb_future_destroy) {} void Future::reset() { - if (future_) { - fdb_future_destroy(future_); - future_ = nullptr; - } + future_.reset(); } -fdb_error_t Future::getError() { - return fdb_future_get_error(future_); +fdb_error_t Future::getError() const { + return fdb_future_get_error(future_.get()); } -std::optional ValueFuture::getValue() { +std::optional ValueFuture::getValue() const { int out_present; const std::uint8_t* val; int vallen; - fdb_check(fdb_future_get_value(future_, &out_present, &val, &vallen)); + fdb_check(fdb_future_get_value(future_.get(), &out_present, &val, &vallen)); return out_present ? 
std::make_optional(std::string((const char*)val, vallen)) : std::nullopt; } // Given an FDBDatabase, initializes a new transaction. -Transaction::Transaction(FDBTransaction* tx) : tx_(tx) {} +Transaction::Transaction(FDBTransaction* tx) : tx_(tx, fdb_transaction_destroy) {} ValueFuture Transaction::get(std::string_view key, fdb_bool_t snapshot) { - return ValueFuture(fdb_transaction_get(tx_, (const uint8_t*)key.data(), key.size(), snapshot)); + return ValueFuture(fdb_transaction_get(tx_.get(), (const uint8_t*)key.data(), key.size(), snapshot)); } void Transaction::set(std::string_view key, std::string_view value) { - fdb_transaction_set(tx_, (const uint8_t*)key.data(), key.size(), (const uint8_t*)value.data(), value.size()); + fdb_transaction_set(tx_.get(), (const uint8_t*)key.data(), key.size(), (const uint8_t*)value.data(), value.size()); } -EmptyFuture Transaction::commit() { - return EmptyFuture(fdb_transaction_commit(tx_)); +Future Transaction::commit() { + return Future(fdb_transaction_commit(tx_.get())); } -EmptyFuture Transaction::onError(fdb_error_t err) { - return EmptyFuture(fdb_transaction_on_error(tx_, err)); +Future Transaction::onError(fdb_error_t err) { + return Future(fdb_transaction_on_error(tx_.get(), err)); } void Transaction::reset() { - fdb_transaction_reset(tx_); -} - -Transaction::~Transaction() { - fdb_transaction_destroy(tx_); + fdb_transaction_reset(tx_.get()); } fdb_error_t FdbApi::setOption(FDBNetworkOption option, std::string_view value) { diff --git a/bindings/c/test/system/SysTestApiWrapper.h b/bindings/c/test/system/SysTestApiWrapper.h index 575aa5e373..d3a70f36cb 100644 --- a/bindings/c/test/system/SysTestApiWrapper.h +++ b/bindings/c/test/system/SysTestApiWrapper.h @@ -25,6 +25,7 @@ #include #include +#include #define FDB_API_VERSION 710 #include "bindings/c/foundationdb/fdb_c.h" @@ -36,54 +37,36 @@ namespace FDBSystemTester { class Future { public: Future() : future_(nullptr) {} - Future(FDBFuture* f) : future_(f) {} - virtual 
~Future(); + Future(FDBFuture* f); - Future& operator=(Future&& other) { - if (future_) { - reset(); - } - future_ = other.future_; - other.future_ = nullptr; - return *this; - } + FDBFuture* fdbFuture() { return future_.get(); }; - FDBFuture* fdbFuture() { return future_; }; - - fdb_error_t getError(); + fdb_error_t getError() const; void reset(); protected: - FDBFuture* future_; + std::shared_ptr future_; }; class ValueFuture : public Future { public: ValueFuture() = default; ValueFuture(FDBFuture* f) : Future(f) {} - std::optional getValue(); -}; - -class EmptyFuture : public Future { -public: - EmptyFuture() = default; - EmptyFuture(FDBFuture* f) : Future(f) {} + std::optional getValue() const; }; class Transaction { public: - // Given an FDBDatabase, initializes a new transaction. + Transaction(); Transaction(FDBTransaction* tx); - ~Transaction(); - ValueFuture get(std::string_view key, fdb_bool_t snapshot); void set(std::string_view key, std::string_view value); - EmptyFuture commit(); - EmptyFuture onError(fdb_error_t err); + Future commit(); + Future onError(fdb_error_t err); void reset(); private: - FDBTransaction* tx_; + std::shared_ptr tx_; }; class FdbApi { diff --git a/bindings/c/test/system/SysTestCorrectnessWorkload.cpp b/bindings/c/test/system/SysTestCorrectnessWorkload.cpp index f5cfa8ccf6..17b6014a3f 100644 --- a/bindings/c/test/system/SysTestCorrectnessWorkload.cpp +++ b/bindings/c/test/system/SysTestCorrectnessWorkload.cpp @@ -24,28 +24,6 @@ namespace FDBSystemTester { -namespace { - -class UpdateTxActor : public TransactionActorBase { -public: - ValueFuture fGet; - - void start() override { - fGet = tx()->get(dbKey("foo"), false); - ctx()->continueAfter(fGet, [this]() { this->step1(); }); - } - - void step1() { - std::optional optStr = fGet.getValue(); - tx()->set(dbKey("foo"), optStr.value_or("bar")); - commit(); - } - - void reset() override { fGet.reset(); } -}; - -} // namespace - class ApiCorrectnessWorkload : public WorkloadBase { public: 
ApiCorrectnessWorkload() : numTxLeft(10) {} @@ -56,19 +34,21 @@ public: private: void nextTransaction() { - if (numTxLeft > 0) { - numTxLeft--; - UpdateTxActor* tx = new UpdateTxActor(); - execTransaction(tx, [this, tx]() { transactionDone(tx); }); - std::cout << numTxLeft << " transactions left" << std::endl; - } else { - std::cout << "Last transaction completed" << std::endl; - } - } + std::cout << numTxLeft << " transactions left" << std::endl; + if (numTxLeft == 0) + return; - void transactionDone(UpdateTxActor* tx) { - delete tx; - nextTransaction(); + numTxLeft--; + execTransaction( + [](auto ctx) { + ValueFuture fGet = ctx->tx()->get(ctx->dbKey("foo"), false); + ctx->continueAfter(fGet, [fGet, ctx]() { + std::optional optStr = fGet.getValue(); + ctx->tx()->set(ctx->dbKey("foo"), optStr.value_or("bar")); + ctx->commit(); + }); + }, + [this]() { nextTransaction(); }); } int numTxLeft; diff --git a/bindings/c/test/system/SysTestTransactionExecutor.cpp b/bindings/c/test/system/SysTestTransactionExecutor.cpp index a9c39ed253..deb740ff30 100644 --- a/bindings/c/test/system/SysTestTransactionExecutor.cpp +++ b/bindings/c/test/system/SysTestTransactionExecutor.cpp @@ -40,14 +40,14 @@ void fdb_check(fdb_error_t e) { class TransactionContext : public ITransactionContext { public: TransactionContext(FDBTransaction* tx, - ITransactionActor* txActor, + std::shared_ptr txActor, TTaskFct cont, const TransactionExecutorOptions& options, IScheduler* scheduler) : options(options), fdbTx(tx), txActor(txActor), contAfterDone(cont), scheduler(scheduler), finalError(0) {} Transaction* tx() override { return &fdbTx; } - void continueAfter(Future& f, TTaskFct cont) override { doContinueAfter(f, cont); } + void continueAfter(Future f, TTaskFct cont) override { doContinueAfter(f, cont); } void commit() override { currFuture = fdbTx.commit(); doContinueAfter(currFuture, [this]() { done(); }); @@ -64,7 +64,7 @@ public: } private: - void doContinueAfter(Future& f, TTaskFct cont) { + 
void doContinueAfter(Future f, TTaskFct cont) { if (options.blockOnFutures) { blockingContinueAfter(f, cont); } else { @@ -72,11 +72,10 @@ private: } } - void blockingContinueAfter(Future& f, TTaskFct cont) { - Future* fptr = &f; - scheduler->schedule([this, fptr, cont]() { - fdb_check(fdb_future_block_until_ready(fptr->fdbFuture())); - fdb_error_t err = fptr->getError(); + void blockingContinueAfter(Future f, TTaskFct cont) { + scheduler->schedule([this, f, cont]() mutable { + fdb_check(fdb_future_block_until_ready(f.fdbFuture())); + fdb_error_t err = f.getError(); if (err) { currFuture = fdbTx.onError(err); fdb_check(fdb_future_block_until_ready(currFuture.fdbFuture())); @@ -87,8 +86,9 @@ private: }); } - void asyncContinueAfter(Future& f, TTaskFct cont) { + void asyncContinueAfter(Future f, TTaskFct cont) { currCont = cont; + currFuture = f; fdb_check(fdb_future_set_callback(f.fdbFuture(), futureReadyCallback, this)); } @@ -104,6 +104,8 @@ private: fdb_check(fdb_future_set_callback(currFuture.fdbFuture(), onErrorReadyCallback, this)); } else { scheduler->schedule(currCont); + currFuture.reset(); + currCont = TTaskFct(); } } @@ -118,6 +120,8 @@ private: void handleOnErrorResult() { fdb_error_t err = currFuture.getError(); + currFuture.reset(); + currCont = TTaskFct(); if (err) { finalError = err; done(); @@ -129,12 +133,12 @@ private: const TransactionExecutorOptions& options; Transaction fdbTx; - ITransactionActor* txActor; + std::shared_ptr txActor; TTaskFct currCont; TTaskFct contAfterDone; IScheduler* scheduler; fdb_error_t finalError; - EmptyFuture currFuture; + Future currFuture; }; class TransactionExecutor : public ITransactionExecutor { @@ -155,7 +159,7 @@ public: random.seed(dev()); } - void execute(ITransactionActor* txActor, TTaskFct cont) override { + void execute(std::shared_ptr txActor, TTaskFct cont) override { int idx = std::uniform_int_distribution<>(0, options.numDatabases - 1)(random); FDBTransaction* tx; 
fdb_check(fdb_database_create_transaction(databases[idx], &tx)); diff --git a/bindings/c/test/system/SysTestTransactionExecutor.h b/bindings/c/test/system/SysTestTransactionExecutor.h index 97cd0828e9..9c91e12521 100644 --- a/bindings/c/test/system/SysTestTransactionExecutor.h +++ b/bindings/c/test/system/SysTestTransactionExecutor.h @@ -35,7 +35,7 @@ class ITransactionContext { public: virtual ~ITransactionContext() {} virtual Transaction* tx() = 0; - virtual void continueAfter(Future& f, TTaskFct cont) = 0; + virtual void continueAfter(Future f, TTaskFct cont) = 0; virtual void commit() = 0; virtual void done() = 0; virtual std::string_view dbKey(std::string_view key) = 0; @@ -58,11 +58,23 @@ protected: Transaction* tx() { return ctx()->tx(); } std::string_view dbKey(std::string_view key) { return ctx()->dbKey(key); } void commit() { ctx()->commit(); } + void reset() override {} private: ITransactionContext* context = nullptr; }; +using TTxStartFct = std::function; + +class TransactionFct : public TransactionActorBase { +public: + TransactionFct(TTxStartFct startFct) : startFct(startFct) {} + void start() override { startFct(this->ctx()); } + +private: + TTxStartFct startFct; +}; + struct TransactionExecutorOptions { std::string prefix = ""; bool blockOnFutures = false; @@ -73,7 +85,7 @@ class ITransactionExecutor { public: virtual ~ITransactionExecutor() {} virtual void init(IScheduler* sched, const char* clusterFile, const TransactionExecutorOptions& options) = 0; - virtual void execute(ITransactionActor* tx, TTaskFct cont) = 0; + virtual void execute(std::shared_ptr tx, TTaskFct cont) = 0; virtual void release() = 0; }; diff --git a/bindings/c/test/system/SysTestWorkload.cpp b/bindings/c/test/system/SysTestWorkload.cpp index f58f5e589a..a38dccef6c 100644 --- a/bindings/c/test/system/SysTestWorkload.cpp +++ b/bindings/c/test/system/SysTestWorkload.cpp @@ -1,4 +1,5 @@ #include "SysTestWorkload.h" +#include namespace FDBSystemTester { @@ -17,7 +18,7 @@ void 
WorkloadBase::schedule(TTaskFct task) { }); } -void WorkloadBase::execTransaction(ITransactionActor* tx, TTaskFct cont) { +void WorkloadBase::execTransaction(std::shared_ptr tx, TTaskFct cont) { txRunning++; txExecutor->execute(tx, [this, cont]() { txRunning--; diff --git a/bindings/c/test/system/SysTestWorkload.h b/bindings/c/test/system/SysTestWorkload.h index 326264e126..0a65138acb 100644 --- a/bindings/c/test/system/SysTestWorkload.h +++ b/bindings/c/test/system/SysTestWorkload.h @@ -42,7 +42,10 @@ public: protected: void schedule(TTaskFct task); - void execTransaction(ITransactionActor* tx, TTaskFct cont); + void execTransaction(std::shared_ptr tx, TTaskFct cont); + void execTransaction(TTxStartFct start, TTaskFct cont) { + execTransaction(std::make_shared(start), cont); + } void contIfDone(); private: From 64873b6873fdd04f5e7f569d30db479b4d2c1a7e Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Fri, 25 Feb 2022 11:22:33 +0100 Subject: [PATCH 019/138] ApiTester: Rename files and namespace --- bindings/c/CMakeLists.txt | 30 +++++++++---------- .../TesterApiWrapper.cpp} | 8 ++--- .../TesterApiWrapper.h} | 6 ++-- .../TesterCorrectnessWorkload.cpp} | 8 ++--- .../TesterOptions.h} | 6 ++-- .../TesterScheduler.cpp} | 8 ++--- .../TesterScheduler.h} | 6 ++-- .../TesterTransactionExecutor.cpp} | 8 ++--- .../TesterTransactionExecutor.h} | 12 ++++---- .../TesterWorkload.cpp} | 6 ++-- .../TesterWorkload.h} | 8 ++--- .../fdb_c_api_tester.cpp} | 16 +++++----- 12 files changed, 61 insertions(+), 61 deletions(-) rename bindings/c/test/{system/SysTestApiWrapper.cpp => apitester/TesterApiWrapper.cpp} (95%) rename bindings/c/test/{system/SysTestApiWrapper.h => apitester/TesterApiWrapper.h} (95%) rename bindings/c/test/{system/SysTestCorrectnessWorkload.cpp => apitester/TesterCorrectnessWorkload.cpp} (93%) rename bindings/c/test/{system/SysTestOptions.h => apitester/TesterOptions.h} (94%) rename bindings/c/test/{system/SysTestScheduler.cpp => apitester/TesterScheduler.cpp} 
(93%) rename bindings/c/test/{system/SysTestScheduler.h => apitester/TesterScheduler.h} (92%) rename bindings/c/test/{system/SysTestTransactionExecutor.cpp => apitester/TesterTransactionExecutor.cpp} (97%) rename bindings/c/test/{system/SysTestTransactionExecutor.h => apitester/TesterTransactionExecutor.h} (93%) rename bindings/c/test/{system/SysTestWorkload.cpp => apitester/TesterWorkload.cpp} (88%) rename bindings/c/test/{system/SysTestWorkload.h => apitester/TesterWorkload.h} (92%) rename bindings/c/test/{system/fdb_c_system_tester.cpp => apitester/fdb_c_api_tester.cpp} (97%) diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index 880480e5ee..18c3ae8793 100644 --- a/bindings/c/CMakeLists.txt +++ b/bindings/c/CMakeLists.txt @@ -97,18 +97,18 @@ if(NOT WIN32 AND NOT IS_ARM_MAC) test/unit/fdb_api.cpp test/unit/fdb_api.hpp) - set(SYSTEM_TESTER_SRCS - test/system/fdb_c_system_tester.cpp - test/system/SysTestApiWrapper.cpp - test/system/SysTestApiWrapper.h - test/system/SysTestCorrectnessWorkload.cpp - test/system/SysTestOptions.h - test/system/SysTestScheduler.cpp - test/system/SysTestScheduler.h - test/system/SysTestTransactionExecutor.cpp - test/system/SysTestTransactionExecutor.h - test/system/SysTestWorkload.cpp - test/system/SysTestWorkload.h + set(API_TESTER_SRCS + test/apitester/fdb_c_api_tester.cpp + test/apitester/TesterApiWrapper.cpp + test/apitester/TesterApiWrapper.h + test/apitester/TesterCorrectnessWorkload.cpp + test/apitester/TesterOptions.h + test/apitester/TesterScheduler.cpp + test/apitester/TesterScheduler.h + test/apitester/TesterTransactionExecutor.cpp + test/apitester/TesterTransactionExecutor.h + test/apitester/TesterWorkload.cpp + test/apitester/TesterWorkload.h ../../flow/SimpleOpt.h ) @@ -122,7 +122,7 @@ if(NOT WIN32 AND NOT IS_ARM_MAC) add_library(fdb_c_unit_tests_version_510 OBJECT ${UNIT_TEST_VERSION_510_SRCS}) add_library(trace_partial_file_suffix_test OBJECT ${TRACE_PARTIAL_FILE_SUFFIX_TEST_SRCS}) 
add_library(disconnected_timeout_unit_tests OBJECT ${DISCONNECTED_TIMEOUT_UNIT_TEST_SRCS}) - add_library(fdb_c_system_tester OBJECT ${SYSTEM_TESTER_SRCS}) + add_library(fdb_c_api_tester OBJECT ${API_TESTER_SRCS}) else() add_executable(fdb_c_performance_test test/performance_test.c test/test.h) add_executable(fdb_c_ryw_benchmark test/ryw_benchmark.c test/test.h) @@ -133,7 +133,7 @@ if(NOT WIN32 AND NOT IS_ARM_MAC) add_executable(fdb_c_unit_tests_version_510 ${UNIT_TEST_VERSION_510_SRCS}) add_executable(trace_partial_file_suffix_test ${TRACE_PARTIAL_FILE_SUFFIX_TEST_SRCS}) add_executable(disconnected_timeout_unit_tests ${DISCONNECTED_TIMEOUT_UNIT_TEST_SRCS}) - add_executable(fdb_c_system_tester ${SYSTEM_TESTER_SRCS}) + add_executable(fdb_c_api_tester ${API_TESTER_SRCS}) strip_debug_symbols(fdb_c_performance_test) strip_debug_symbols(fdb_c_ryw_benchmark) strip_debug_symbols(fdb_c_txn_size_test) @@ -155,7 +155,7 @@ if(NOT WIN32 AND NOT IS_ARM_MAC) target_link_libraries(fdb_c_unit_tests_version_510 PRIVATE fdb_c Threads::Threads) target_link_libraries(trace_partial_file_suffix_test PRIVATE fdb_c Threads::Threads) target_link_libraries(disconnected_timeout_unit_tests PRIVATE fdb_c Threads::Threads) - target_link_libraries(fdb_c_system_tester PRIVATE fdb_c Threads::Threads) + target_link_libraries(fdb_c_api_tester PRIVATE fdb_c Threads::Threads) # do not set RPATH for mako set_property(TARGET mako PROPERTY SKIP_BUILD_RPATH TRUE) diff --git a/bindings/c/test/system/SysTestApiWrapper.cpp b/bindings/c/test/apitester/TesterApiWrapper.cpp similarity index 95% rename from bindings/c/test/system/SysTestApiWrapper.cpp rename to bindings/c/test/apitester/TesterApiWrapper.cpp index d0eff60647..764533d837 100644 --- a/bindings/c/test/system/SysTestApiWrapper.cpp +++ b/bindings/c/test/apitester/TesterApiWrapper.cpp @@ -1,5 +1,5 @@ /* - * SysTestApiWrapper.cpp + * TesterApiWrapper.cpp * * This source file is part of the FoundationDB open source project * @@ -17,11 +17,11 @@ * See the 
License for the specific language governing permissions and * limitations under the License. */ -#include "SysTestApiWrapper.h" +#include "TesterApiWrapper.h" #include #include -namespace FDBSystemTester { +namespace FdbApiTester { namespace { @@ -87,4 +87,4 @@ fdb_error_t FdbApi::setOption(FDBNetworkOption option) { return fdb_network_set_option(option, reinterpret_cast(""), 0); } -} // namespace FDBSystemTester \ No newline at end of file +} // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/system/SysTestApiWrapper.h b/bindings/c/test/apitester/TesterApiWrapper.h similarity index 95% rename from bindings/c/test/system/SysTestApiWrapper.h rename to bindings/c/test/apitester/TesterApiWrapper.h index d3a70f36cb..7b3769a536 100644 --- a/bindings/c/test/system/SysTestApiWrapper.h +++ b/bindings/c/test/apitester/TesterApiWrapper.h @@ -1,5 +1,5 @@ /* - * SysTestApiWrapper.h + * TesterApiWrapper.h * * This source file is part of the FoundationDB open source project * @@ -30,7 +30,7 @@ #define FDB_API_VERSION 710 #include "bindings/c/foundationdb/fdb_c.h" -namespace FDBSystemTester { +namespace FdbApiTester { // Wrapper parent class to manage memory of an FDBFuture pointer. Cleans up // FDBFuture when this instance goes out of scope. 
@@ -76,6 +76,6 @@ public: static fdb_error_t setOption(FDBNetworkOption option); }; -} // namespace FDBSystemTester +} // namespace FdbApiTester #endif \ No newline at end of file diff --git a/bindings/c/test/system/SysTestCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp similarity index 93% rename from bindings/c/test/system/SysTestCorrectnessWorkload.cpp rename to bindings/c/test/apitester/TesterCorrectnessWorkload.cpp index 17b6014a3f..a6a546666c 100644 --- a/bindings/c/test/system/SysTestCorrectnessWorkload.cpp +++ b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp @@ -1,5 +1,5 @@ /* - * SysTestCorrectnessWorkload.cpp + * TesterCorrectnessWorkload.cpp * * This source file is part of the FoundationDB open source project * @@ -17,12 +17,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "SysTestWorkload.h" +#include "TesterWorkload.h" #include #include #include -namespace FDBSystemTester { +namespace FdbApiTester { class ApiCorrectnessWorkload : public WorkloadBase { public: @@ -58,4 +58,4 @@ std::unique_ptr createApiCorrectnessWorkload() { return std::make_unique(); } -} // namespace FDBSystemTester \ No newline at end of file +} // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/system/SysTestOptions.h b/bindings/c/test/apitester/TesterOptions.h similarity index 94% rename from bindings/c/test/system/SysTestOptions.h rename to bindings/c/test/apitester/TesterOptions.h index 85be667843..3dfcf5e0e2 100644 --- a/bindings/c/test/system/SysTestOptions.h +++ b/bindings/c/test/apitester/TesterOptions.h @@ -1,5 +1,5 @@ /* - * SysTestOptions.h + * TesterOptions.h * * This source file is part of the FoundationDB open source project * @@ -28,7 +28,7 @@ #define FDB_API_VERSION 710 -namespace FDBSystemTester { +namespace FdbApiTester { class TesterOptions { public: @@ -49,6 +49,6 @@ public: int numFdbThreads = 1; }; -} // namespace 
FDBSystemTester +} // namespace FdbApiTester #endif diff --git a/bindings/c/test/system/SysTestScheduler.cpp b/bindings/c/test/apitester/TesterScheduler.cpp similarity index 93% rename from bindings/c/test/system/SysTestScheduler.cpp rename to bindings/c/test/apitester/TesterScheduler.cpp index 399d87d7f6..14661b04b2 100644 --- a/bindings/c/test/system/SysTestScheduler.cpp +++ b/bindings/c/test/apitester/TesterScheduler.cpp @@ -1,5 +1,5 @@ /* - * SysTestScheduler.cpp + * TesterScheduler.cpp * * This source file is part of the FoundationDB open source project * @@ -18,7 +18,7 @@ * limitations under the License. */ -#include "SysTestScheduler.h" +#include "TesterScheduler.h" #include #include @@ -27,7 +27,7 @@ using namespace boost::asio; -namespace FDBSystemTester { +namespace FdbApiTester { class AsioScheduler : public IScheduler { public: @@ -62,4 +62,4 @@ std::unique_ptr createScheduler(int numThreads) { return std::make_unique(numThreads); } -} // namespace FDBSystemTester \ No newline at end of file +} // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/system/SysTestScheduler.h b/bindings/c/test/apitester/TesterScheduler.h similarity index 92% rename from bindings/c/test/system/SysTestScheduler.h rename to bindings/c/test/apitester/TesterScheduler.h index b124959014..a680cd70ad 100644 --- a/bindings/c/test/system/SysTestScheduler.h +++ b/bindings/c/test/apitester/TesterScheduler.h @@ -1,5 +1,5 @@ /* - * SysTestScheduler.h + * TesterScheduler.h * * This source file is part of the FoundationDB open source project * @@ -26,7 +26,7 @@ #include #include -namespace FDBSystemTester { +namespace FdbApiTester { using TTaskFct = std::function; @@ -41,6 +41,6 @@ public: std::unique_ptr createScheduler(int numThreads); -} // namespace FDBSystemTester +} // namespace FdbApiTester #endif \ No newline at end of file diff --git a/bindings/c/test/system/SysTestTransactionExecutor.cpp b/bindings/c/test/apitester/TesterTransactionExecutor.cpp 
similarity index 97% rename from bindings/c/test/system/SysTestTransactionExecutor.cpp rename to bindings/c/test/apitester/TesterTransactionExecutor.cpp index deb740ff30..bee96f4fca 100644 --- a/bindings/c/test/system/SysTestTransactionExecutor.cpp +++ b/bindings/c/test/apitester/TesterTransactionExecutor.cpp @@ -1,5 +1,5 @@ /* - * SysTestTransactionExecutor.cpp + * TesterTransactionExecutor.cpp * * This source file is part of the FoundationDB open source project * @@ -18,13 +18,13 @@ * limitations under the License. */ -#include "SysTestTransactionExecutor.h" +#include "TesterTransactionExecutor.h" #include #include #include #include -namespace FDBSystemTester { +namespace FdbApiTester { namespace { @@ -185,4 +185,4 @@ std::unique_ptr createTransactionExecutor() { return std::make_unique(); } -} // namespace FDBSystemTester \ No newline at end of file +} // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/system/SysTestTransactionExecutor.h b/bindings/c/test/apitester/TesterTransactionExecutor.h similarity index 93% rename from bindings/c/test/system/SysTestTransactionExecutor.h rename to bindings/c/test/apitester/TesterTransactionExecutor.h index 9c91e12521..6ffe7aa47a 100644 --- a/bindings/c/test/system/SysTestTransactionExecutor.h +++ b/bindings/c/test/apitester/TesterTransactionExecutor.h @@ -1,5 +1,5 @@ /* - * SysTestTransactionExecutor.h + * TesterTransactionExecutor.h * * This source file is part of the FoundationDB open source project * @@ -23,13 +23,13 @@ #ifndef SYS_TEST_TRANSACTION_EXECUTOR_H #define SYS_TEST_TRANSACTION_EXECUTOR_H -#include "SysTestOptions.h" -#include "SysTestApiWrapper.h" -#include "SysTestScheduler.h" +#include "TesterOptions.h" +#include "TesterApiWrapper.h" +#include "TesterScheduler.h" #include #include -namespace FDBSystemTester { +namespace FdbApiTester { class ITransactionContext { public: @@ -91,6 +91,6 @@ public: std::unique_ptr createTransactionExecutor(); -} // namespace FDBSystemTester +} // 
namespace FdbApiTester #endif \ No newline at end of file diff --git a/bindings/c/test/system/SysTestWorkload.cpp b/bindings/c/test/apitester/TesterWorkload.cpp similarity index 88% rename from bindings/c/test/system/SysTestWorkload.cpp rename to bindings/c/test/apitester/TesterWorkload.cpp index a38dccef6c..17b9f6a29e 100644 --- a/bindings/c/test/system/SysTestWorkload.cpp +++ b/bindings/c/test/apitester/TesterWorkload.cpp @@ -1,7 +1,7 @@ -#include "SysTestWorkload.h" +#include "TesterWorkload.h" #include -namespace FDBSystemTester { +namespace FdbApiTester { void WorkloadBase::init(ITransactionExecutor* txExecutor, IScheduler* sched, TTaskFct cont) { this->txExecutor = txExecutor; @@ -33,4 +33,4 @@ void WorkloadBase::contIfDone() { } } -} // namespace FDBSystemTester \ No newline at end of file +} // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/system/SysTestWorkload.h b/bindings/c/test/apitester/TesterWorkload.h similarity index 92% rename from bindings/c/test/system/SysTestWorkload.h rename to bindings/c/test/apitester/TesterWorkload.h index 0a65138acb..42f55cb69f 100644 --- a/bindings/c/test/system/SysTestWorkload.h +++ b/bindings/c/test/apitester/TesterWorkload.h @@ -1,5 +1,5 @@ /* - * SysTestWorkload.h + * TesterWorkload.h * * This source file is part of the FoundationDB open source project * @@ -23,10 +23,10 @@ #ifndef SYS_TEST_WORKLOAD_H #define SYS_TEST_WORKLOAD_H -#include "SysTestTransactionExecutor.h" +#include "TesterTransactionExecutor.h" #include -namespace FDBSystemTester { +namespace FdbApiTester { class IWorkload { public: @@ -56,6 +56,6 @@ private: std::atomic txRunning; }; -} // namespace FDBSystemTester +} // namespace FdbApiTester #endif \ No newline at end of file diff --git a/bindings/c/test/system/fdb_c_system_tester.cpp b/bindings/c/test/apitester/fdb_c_api_tester.cpp similarity index 97% rename from bindings/c/test/system/fdb_c_system_tester.cpp rename to bindings/c/test/apitester/fdb_c_api_tester.cpp 
index 257ed82658..85994144ba 100644 --- a/bindings/c/test/system/fdb_c_system_tester.cpp +++ b/bindings/c/test/apitester/fdb_c_api_tester.cpp @@ -1,5 +1,5 @@ /* - * fdb_c_system_tester.cpp + * fdb_c_api_tester.cpp * * This source file is part of the FoundationDB open source project * @@ -18,17 +18,17 @@ * limitations under the License. */ -#include "SysTestOptions.h" -#include "SysTestWorkload.h" -#include "SysTestScheduler.h" -#include "SysTestTransactionExecutor.h" +#include "TesterOptions.h" +#include "TesterWorkload.h" +#include "TesterScheduler.h" +#include "TesterTransactionExecutor.h" #include #include #include #include "flow/SimpleOpt.h" #include "bindings/c/foundationdb/fdb_c.h" -namespace FDBSystemTester { +namespace FdbApiTester { namespace { @@ -222,9 +222,9 @@ void fdb_check(fdb_error_t e) { std::unique_ptr createApiCorrectnessWorkload(); -} // namespace FDBSystemTester +} // namespace FdbApiTester -using namespace FDBSystemTester; +using namespace FdbApiTester; void applyNetworkOptions(TesterOptions& options) { if (!options.externalClientLibrary.empty()) { From a32c3a2891d97c64c811584f54c56c0fcac2ebea Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Fri, 25 Feb 2022 03:13:58 -0800 Subject: [PATCH 020/138] Redwood now uses one global page cache eviction order and size limit for all page sizes, for real or virtual processes, and DecodeCache memory is counted against the global page cache budget. --- fdbserver/DeltaTree.h | 8 +- fdbserver/IPager.h | 4 + fdbserver/VersionedBTree.actor.cpp | 601 +++++++++++++++++------------ 3 files changed, 364 insertions(+), 249 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index 8f69c0d66e..9cd2e69b4c 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -1095,7 +1095,7 @@ public: // DecodedNodes are stored in a contiguous vector, which sometimes must be expanded, so care // must be taken to resolve DecodedNode pointers again after the DecodeCache has new entries added. 
struct DecodeCache : FastAllocated, ReferenceCounted { - DecodeCache(const T& lowerBound = T(), const T& upperBound = T(), size_t* pMemoryTracker = nullptr) + DecodeCache(const T& lowerBound = T(), const T& upperBound = T(), int64_t* pMemoryTracker = nullptr) : lowerBound(arena, lowerBound), upperBound(arena, upperBound), lastKnownUsedMemory(0), pMemoryTracker(pMemoryTracker) { decodedNodes.reserve(10); @@ -1121,8 +1121,8 @@ public: // DecodeCache destruction // Cursor destruction // as those are the most efficient times to publish an update. - size_t lastKnownUsedMemory; - size_t* pMemoryTracker; + int lastKnownUsedMemory; + int64_t* pMemoryTracker; // Index 0 is always the root std::vector decodedNodes; @@ -1130,7 +1130,7 @@ public: DecodedNode& get(int index) { return decodedNodes[index]; } void updateUsedMemory() { - size_t usedNow = arena.getSize(true) + (decodedNodes.capacity() * sizeof(DecodedNode)); + int usedNow = sizeof(DeltaTree2) + arena.getSize(true) + (decodedNodes.capacity() * sizeof(DecodedNode)); if (pMemoryTracker != nullptr) { *pMemoryTracker += (usedNow - lastKnownUsedMemory); } diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index ab179b10d0..bf14613cbd 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -309,6 +309,10 @@ public: // Advance the commit version and the oldest readble version and commit until the remap queue is empty. virtual Future clearRemapQueue() = 0; + // If set to a valid pointer, the page cache should behave as though the page cache size limit has been + // reduced by the target byte count. 
+ virtual int64_t* getPageCachePenaltySource() = 0; + protected: ~IPager2() {} // Destruction should be done using close()/dispose() from the IClosable interface }; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 66876ff63a..4e43c57a1b 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -1673,9 +1673,6 @@ struct RedwoodMetrics { Reference kvSizeReadByGetRange; double startTime; - // Absolute counters, not reset per time interval - size_t decodeCacheMemory = 0; - // Return number of pages read or written, from cache or disk unsigned int pageOps() const { // All page reads are either a cache hit, probe hit, or a disk read @@ -1722,112 +1719,7 @@ struct RedwoodMetrics { // This will populate a trace event and/or a string with Redwood metrics. // The string is a reasonably well formatted page of information - void getFields(TraceEvent* e, std::string* s = nullptr, bool skipZeroes = false) { - std::pair metrics[] = { { "BTreePreload", metric.btreeLeafPreload }, - { "BTreePreloadExt", metric.btreeLeafPreloadExt }, - { "", 0 }, - { "OpSet", metric.opSet }, - { "OpSetKeyBytes", metric.opSetKeyBytes }, - { "OpSetValueBytes", metric.opSetValueBytes }, - { "OpClear", metric.opClear }, - { "OpClearKey", metric.opClearKey }, - { "", 0 }, - { "OpGet", metric.opGet }, - { "OpGetRange", metric.opGetRange }, - { "OpCommit", metric.opCommit }, - { "", 0 }, - { "PagerDiskWrite", metric.pagerDiskWrite }, - { "PagerDiskRead", metric.pagerDiskRead }, - { "PagerCacheHit", metric.pagerCacheHit }, - { "PagerCacheMiss", metric.pagerCacheMiss }, - { "", 0 }, - { "PagerProbeHit", metric.pagerProbeHit }, - { "PagerProbeMiss", metric.pagerProbeMiss }, - { "PagerEvictUnhit", metric.pagerEvictUnhit }, - { "PagerEvictFail", metric.pagerEvictFail }, - { "", 0 }, - { "PagerRemapFree", metric.pagerRemapFree }, - { "PagerRemapCopy", metric.pagerRemapCopy }, - { "PagerRemapSkip", metric.pagerRemapSkip }, - { "", 0 } }; - 
- double elapsed = now() - startTime; - - if (e != nullptr) { - for (auto& m : metrics) { - char c = m.first[0]; - if (c != 0 && (!skipZeroes || m.second != 0)) { - e->detail(m.first, m.second); - } - } - levels[0].metrics.events.toTraceEvent(e, 0); - } - - if (s != nullptr) { - for (auto& m : metrics) { - if (*m.first == '\0') { - *s += "\n"; - } else if (!skipZeroes || m.second != 0) { - *s += format("%-15s %-8u %8" PRId64 "/s ", m.first, m.second, int64_t(m.second / elapsed)); - } - } - *s += levels[0].metrics.events.toString(0, elapsed); - } - - for (int i = 1; i < btreeLevels + 1; ++i) { - auto& metric = levels[i].metrics; - - std::pair metrics[] = { - { "PageBuild", metric.pageBuild }, - { "PageBuildExt", metric.pageBuildExt }, - { "PageModify", metric.pageModify }, - { "PageModifyExt", metric.pageModifyExt }, - { "", 0 }, - { "PageRead", metric.pageRead }, - { "PageReadExt", metric.pageReadExt }, - { "PageCommitStart", metric.pageCommitStart }, - { "", 0 }, - { "LazyClearInt", metric.lazyClearRequeue }, - { "LazyClearIntExt", metric.lazyClearRequeueExt }, - { "LazyClear", metric.lazyClearFree }, - { "LazyClearExt", metric.lazyClearFreeExt }, - { "", 0 }, - { "ForceUpdate", metric.forceUpdate }, - { "DetachChild", metric.detachChild }, - { "", 0 }, - }; - - if (e != nullptr) { - for (auto& m : metrics) { - char c = m.first[0]; - if (c != 0 && (!skipZeroes || m.second != 0)) { - e->detail(format("L%d%s", i, m.first + (c == '-' ? 1 : 0)), m.second); - } - } - metric.events.toTraceEvent(e, i); - } - - if (s != nullptr) { - *s += format("\nLevel %d\n\t", i); - - for (auto& m : metrics) { - const char* name = m.first; - bool rate = elapsed != 0; - if (*name == '-') { - ++name; - rate = false; - } - - if (*name == '\0') { - *s += "\n\t"; - } else if (!skipZeroes || m.second != 0) { - *s += format("%-15s %8u %8u/s ", name, m.second, rate ? 
int(m.second / elapsed) : 0); - } - } - *s += metric.events.toString(i, elapsed); - } - } - } + void getFields(TraceEvent* e, std::string* s = nullptr, bool skipZeroes = false); std::string toString(bool clearAfter) { std::string s; @@ -1876,6 +1768,8 @@ ACTOR Future redwoodMetricsLogger() { // Future onEvictable() const; // ready when entry can be evicted template class ObjectCache : NonCopyable { + struct Entry; + typedef std::unordered_map CacheT; struct Entry : public boost::intrusive::list_base_hook<> { Entry() : hits(0), size(0) {} @@ -1883,24 +1777,159 @@ class ObjectCache : NonCopyable { ObjectType item; int hits; int size; - bool evictionPrioritized; + bool ownedByEvictor; + CacheT* pCache; }; - typedef std::unordered_map CacheT; typedef boost::intrusive::list EvictionOrderT; public: - ObjectCache(int sizeLimit = 1) : sizeLimit(sizeLimit), currentSize(0) {} + // Object evictor, manages the eviction order for one or more ObjectCaches + // Not all objects tracked by the Evictor are in its evictionOrder, as ObjectCaches + // using this Evictor can temporarily remove entries to an external order but they + // must eventually give them back with moveIn() or remove them with reclaim(). 
+ class Evictor : NonCopyable { + public: + // Evictors are normally singletons, either one per real process or one per virtual process in simulation + static Evictor* getEvictor() { + static Evictor nonSimEvictor; + static std::map simEvictors; - int64_t getSizeUsed() const { return currentSize; } - int64_t getSizeLimit() const { return sizeLimit; } + if (g_network->isSimulated()) { + return &simEvictors[g_network->getLocalAddress()]; + } else { + return &nonSimEvictor; + } + } - void setSizeLimit(int n) { - ASSERT(n > 0); - sizeLimit = n; - cache.reserve(n); + // Move an entry to a different eviction order, stored outside of the Evictor, + // but the entry size is still counted against the evictor + void moveOut(Entry& e, EvictionOrderT& dest) { + ASSERT(e.ownedByEvictor); + dest.splice(dest.end(), evictionOrder, EvictionOrderT::s_iterator_to(e)); + e.ownedByEvictor = false; + ++movedOutCount; + } + + // Move an entry to the back of the eviction order if it is in the eviction order + void moveToBack(Entry& e) { + ASSERT(e.ownedByEvictor); + evictionOrder.splice(evictionOrder.end(), evictionOrder, EvictionOrderT::s_iterator_to(e)); + } + + // Move entire contents of an external eviction order containing entries whose size is part of + // this Evictor to the front of its eviction order. 
+ void moveIn(EvictionOrderT& otherOrder) { + for (auto& e : otherOrder) { + ASSERT(!e.ownedByEvictor); + e.ownedByEvictor = true; + --movedOutCount; + } + evictionOrder.splice(evictionOrder.begin(), otherOrder); + } + + // Add a new item to the back of the eviction order + void addNew(Entry& e) { + sizeUsed += e.size; + evictionOrder.push_back(e); + e.ownedByEvictor = true; + } + + // Claim ownership of an entry, removing its size from the current size and removing it + // from the eviction order if it exists there + void reclaim(Entry& e) { + sizeUsed -= e.size; + // If e is in evictionOrder then remove it + if (e.ownedByEvictor) { + evictionOrder.erase(EvictionOrderT::s_iterator_to(e)); + e.ownedByEvictor = false; + } else { + // Otherwise, it wasn't so it had to be a movedOut item so decrement the count + --movedOutCount; + } + } + + void trim(int additionalSpaceNeeded = 0) { + // While the cache is too big, evict the oldest entry until the oldest entry can't be evicted. + while (sizeUsed > (sizeLimit - reservedSize - additionalSpaceNeeded)) { + Entry& toEvict = evictionOrder.front(); + + debug_printf("Evictor count=%" PRId64 " sizeUsed=%" PRId64 " sizeLimit=%" PRId64 " sizePenalty=%" PRId64 + " needed=%d Trying to evict %s\n", + evictionOrder.size(), + sizeUsed, + sizeLimit, + reservedSize, + additionalSpaceNeeded, + ::toString(toEvict.index).c_str()); + + if (!toEvict.item.evictable()) { + // shift the front to the back + evictionOrder.shift_forward(1); + ++g_redwoodMetrics.metric.pagerEvictFail; + break; + } else { + if (toEvict.hits == 0) { + ++g_redwoodMetrics.metric.pagerEvictUnhit; + } + sizeUsed -= toEvict.size; + debug_printf("Evicting %s\n", ::toString(toEvict.index).c_str()); + evictionOrder.pop_front(); + toEvict.pCache->erase(toEvict.index); + } + } + } + + int64_t getCountUsed() const { return evictionOrder.size() + movedOutCount; } + int64_t getCountMoved() const { return movedOutCount; } + int64_t getSizeUsed() const { return sizeUsed + 
reservedSize; } + + // Only to be used in tests at a point where all ObjectCache instances should be destroyed. + bool empty() const { return reservedSize == 0 && sizeUsed == 0 && getCountUsed() == 0; } + + std::string toString() const { + std::string s = format("Evictor {sizeLimit=%" PRId64 " sizeUsed=%" PRId64 " countUsed=%" PRId64 + " sizePenalty=%" PRId64 " movedOutCount=%" PRId64, + sizeLimit, + sizeUsed, + getCountUsed(), + reservedSize, + movedOutCount); + for (auto& entry : evictionOrder) { + s += format("\n\tindex %s size %d evictable %d\n", + ::toString(entry.index).c_str(), + entry.size, + entry.item.evictable()); + } + s += "}\n"; + return s; + } + + // Any external data strutures whose memory usage should be counted as part of the object cache + // budget should add their usage to this total and keep it updated. + int64_t reservedSize = 0; + int64_t sizeLimit = 0; + + private: + EvictionOrderT evictionOrder; + // Size of all entries in the eviction order or held in external eviction orders + int64_t sizeUsed = 0; + // Number of items that have been moveOut()'d to other evictionOrders and aren't back yet + int64_t movedOutCount = 0; + }; + + ObjectCache(Evictor* evictor = nullptr) : pEvictor(evictor) { + if (pEvictor == nullptr) { + pEvictor = Evictor::getEvictor(); + } } + Evictor& evictor() const { return *pEvictor; } + + int64_t getCount() const { return cache.size(); } + + void reserveCount(int count) { cache.reserve(count); } + // Get the object for i if it exists, else return nullptr. // If the object exists, its eviction order will NOT change as this is not a cache hit. ObjectType* getIfExists(const IndexType& index) { @@ -1915,10 +1944,8 @@ public: // If index is in cache and not on the prioritized eviction order list, move it there. 
void prioritizeEviction(const IndexType& index) { auto i = cache.find(index); - if (i != cache.end() && !i->second.evictionPrioritized) { - prioritizedEvictions.splice( - prioritizedEvictions.end(), evictionOrder, EvictionOrderT::s_iterator_to(i->second)); - i->second.evictionPrioritized = true; + if (i != cache.end() && i->second.ownedByEvictor) { + pEvictor->moveOut(i->second, prioritizedEvictions); } } @@ -1929,61 +1956,27 @@ public: ObjectType& get(const IndexType& index, int size, bool noHit = false) { Entry& entry = cache[index]; - // If entry is linked into evictionOrder then move it to the back of the order + // If entry is linked into an evictionOrder if (entry.is_linked()) { + // If this access is meant to be a hit if (!noHit) { ++entry.hits; - // If item eviction is not prioritized, move to back of eviction order - if (!entry.evictionPrioritized) { - evictionOrder.splice(evictionOrder.end(), evictionOrder, EvictionOrderT::s_iterator_to(entry)); + // If item eviction is not prioritized, move to end of eviction order + if (entry.ownedByEvictor) { + pEvictor->moveToBack(entry); } } } else { // Otherwise it was a cache miss + // Finish initializing entry entry.index = index; + entry.pCache = &cache; entry.hits = 0; entry.size = size; - currentSize += size; - // Insert the newly created Entry at the back of the eviction order - evictionOrder.push_back(entry); - entry.evictionPrioritized = false; - // While the cache is too big, evict the oldest entry until the oldest entry can't be evicted. - while (currentSize > sizeLimit) { - Entry& toEvict = evictionOrder.front(); - - // It's critical that we do not evict the item we just added because it would cause the reference - // returned to be invalid. An eviction could happen with a no-hit access to a cache resident page - // that is currently evictable and exists in the oversized portion of the cache eviction order due - // to previously failed evictions. 
- if (&entry == &toEvict) { - debug_printf("Cannot evict target index %s\n", toString(index).c_str()); - break; - } - - debug_printf("currentSize is %u and input size is %u. Trying to evict %s to make room for %s\n", - currentSize, - size, - toString(toEvict.index).c_str(), - toString(index).c_str()); - - if (!toEvict.item.evictable()) { - // shift the front to the back - evictionOrder.shift_forward(1); - ++g_redwoodMetrics.metric.pagerEvictFail; - break; - } else { - if (toEvict.hits == 0) { - ++g_redwoodMetrics.metric.pagerEvictUnhit; - } - currentSize -= toEvict.size; - debug_printf( - "Evicting %s to make room for %s\n", toString(toEvict.index).c_str(), toString(index).c_str()); - evictionOrder.pop_front(); - cache.erase(toEvict.index); - } - } + pEvictor->trim(entry.size); + pEvictor->addNew(entry); } return entry.item; @@ -1991,52 +1984,34 @@ public: // Clears the cache, saving the entries to second cache, then waits for each item to be evictable and evicts it. ACTOR static Future clear_impl(ObjectCache* self) { - state ObjectCache::CacheT cache; - state EvictionOrderT evictionOrder; - state int64_t currentSize; + // Claim ownership of all of our cached items, removing them from the evictor's control and quota. + for (auto& ie : self->cache) { + self->pEvictor->reclaim(ie.second); + } - // Flush all prioritized evictions to the main eviction order - self->flushPrioritizedEvictions(); - ASSERT(cache.size() == evictionOrder.size()); + // All items are in the cache so we don't need the prioritized eviction order anymore, and the cache is about + // to be destroyed so the prioritizedEvictions head/tail will become invalid. + self->prioritizedEvictions.clear(); - // Swap cache contents to local state vars - // After this, no more entries will be added to or read from these - // structures so we know for sure that no page will become unevictable - // after it is either evictable or onEvictable() is ready. 
- cache.swap(self->cache); - currentSize = self->currentSize; - evictionOrder.swap(self->evictionOrder); - - state typename EvictionOrderT::iterator i = evictionOrder.begin(); - state typename EvictionOrderT::iterator iEnd = evictionOrder.end(); - while (i != iEnd) { - if (!i->item.evictable()) { - wait(i->item.onEvictable()); + state typename CacheT::iterator i = self->cache.begin(); + while (i != self->cache.end()) { + if (!i->second.item.evictable()) { + wait(i->second.item.onEvictable()); } - currentSize -= i->size; - self->currentSize -= i->size; ++i; } - evictionOrder.clear(); - cache.clear(); - ASSERT(currentSize == 0); return Void(); } - Future clear() { - ASSERT(evictionOrder.size() + prioritizedEvictions.size() == cache.size()); - return clear_impl(this); - } + Future clear() { return clear_impl(this); } // Move the prioritized evictions queued to the front of the eviction order - void flushPrioritizedEvictions() { evictionOrder.splice(evictionOrder.begin(), prioritizedEvictions); } + void flushPrioritizedEvictions() { pEvictor->moveIn(prioritizedEvictions); } private: - int64_t sizeLimit; - int64_t currentSize; + Evictor* pEvictor; CacheT cache; - EvictionOrderT evictionOrder; EvictionOrderT prioritizedEvictions; }; @@ -2072,6 +2047,26 @@ public: typedef FIFOQueue LogicalPageQueueT; typedef std::map VersionToPageMapT; typedef std::unordered_map PageToVersionedMapT; + struct PageCacheEntry { + Future> readFuture; + Future writeFuture; + + bool initialized() const { return readFuture.isValid(); } + + bool reading() const { return !readFuture.isReady(); } + + bool writing() const { return !writeFuture.isReady(); } + + bool evictable() const { + // Don't evict if a page is still being read or written + return !reading() && !writing(); + } + + Future onEvictable() const { return ready(readFuture) && writeFuture; } + }; + typedef ObjectCache PageCacheT; + + int64_t* getPageCachePenaltySource() override { return &pageCache.evictor().reservedSize; } #pragma 
pack(push, 1) struct DelayedFreePage { @@ -2138,7 +2133,7 @@ public: // If the file already exists, pageSize might be different than desiredPageSize // Use pageCacheSizeBytes == 0 to use default from flow knobs - // If filename is empty, the pager will exist only in memory and once the cache is full writes will fail. + // If memoryOnly is true, the pager will exist only in memory and once the cache is full writes will fail. DWALPager(int desiredPageSize, int desiredExtentSize, std::string filename, @@ -2152,6 +2147,9 @@ public: desiredExtentSize(desiredExtentSize), filename(filename), memoryOnly(memoryOnly), errorPromise(errorPromise), remapCleanupWindow(remapCleanupWindow), concurrentExtentReads(new FlowLock(concurrentExtentReads)) { + // This sets the page cache size for all PageCacheT instances using the same evictor + pageCache.evictor().sizeLimit = pageCacheBytes; + if (!g_redwoodMetricsActor.isValid()) { g_redwoodMetricsActor = redwoodMetricsLogger(); } @@ -2161,7 +2159,8 @@ public: } void setPageSize(int size) { - g_redwoodMetrics.updateMaxRecordCount(315 * size / 4096); + // Conservative maximum for number of records that can fit in this page size + g_redwoodMetrics.updateMaxRecordCount(315.0 * size / 4096); logicalPageSize = size; // Physical page size is the total size of the smallest number of physical blocks needed to store @@ -2171,7 +2170,6 @@ public: if (pHeader != nullptr) { pHeader->pageSize = logicalPageSize; } - pageCache.setSizeLimit(1 + ((pageCacheBytes - 1) / physicalPageSize)); } void setExtentSize(int size) { @@ -2187,9 +2185,6 @@ public: if (pHeader != nullptr) { pHeader->extentSize = size; } - - // TODO: How should this cache be sized - not really a cache. it should hold all extentIDs? - extentCache.setSizeLimit(100000); } void updateCommittedHeader() { @@ -2688,7 +2683,7 @@ public: // or as a cache miss because there is no benefit to the page already being in cache // Similarly, this does not count as a point lookup for reason. 
ASSERT(pageIDs.front() != invalidLogicalPageID); - PageCacheEntry& cacheEntry = pageCache.get(pageIDs.front(), pageIDs.size(), true); + PageCacheEntry& cacheEntry = pageCache.get(pageIDs.front(), pageIDs.size() * physicalPageSize, true); debug_printf("DWALPager(%s) op=write %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageIDs).c_str(), @@ -3002,7 +2997,7 @@ public: debug_printf("DWALPager(%s) op=readUncachedMiss %s\n", filename.c_str(), toString(pageID).c_str()); return forwardError(readPhysicalPage(this, pageID, priority, false), errorPromise); } - PageCacheEntry& cacheEntry = pageCache.get(pageID, 1, noHit); + PageCacheEntry& cacheEntry = pageCache.get(pageID, physicalPageSize, noHit); debug_printf("DWALPager(%s) op=read %s cached=%d reading=%d writing=%d noHit=%d\n", filename.c_str(), toString(pageID).c_str(), @@ -3048,7 +3043,7 @@ public: return forwardError(readPhysicalMultiPage(this, pageIDs, priority), errorPromise); } - PageCacheEntry& cacheEntry = pageCache.get(pageIDs.front(), pageIDs.size(), noHit); + PageCacheEntry& cacheEntry = pageCache.get(pageIDs.front(), pageIDs.size() * physicalPageSize, noHit); debug_printf("DWALPager(%s) op=read %s cached=%d reading=%d writing=%d noHit=%d\n", filename.c_str(), toString(pageIDs).c_str(), @@ -3667,8 +3662,8 @@ public: int64_t free; int64_t total; if (memoryOnly) { - total = pageCacheBytes; - free = pageCacheBytes - (pageCache.getSizeUsed() * physicalPageSize); + total = pageCache.evictor().sizeLimit; + free = pageCache.evictor().getSizeUsed(); } else { g_network->getDiskBytes(parentDirectory(filename), free, total); } @@ -3691,9 +3686,9 @@ public: return StorageBytes(free, total, pagerSize - reusable, free + reusable, temp); } - int64_t getPageCacheCount() override { return pageCache.getSizeUsed(); } + int64_t getPageCacheCount() override { return pageCache.getCount(); } int64_t getPageCount() override { return pHeader->pageCount; } - int64_t getExtentCacheCount() override { return 
extentCache.getSizeUsed(); } + int64_t getExtentCacheCount() override { return extentCache.getCount(); } ACTOR static Future getUserPageCount_cleanup(DWALPager* self) { // Wait for the remap eraser to finish all of its work (not triggering stop) @@ -3780,24 +3775,6 @@ private: }; #pragma pack(pop) - struct PageCacheEntry { - Future> readFuture; - Future writeFuture; - - bool initialized() const { return readFuture.isValid(); } - - bool reading() const { return !readFuture.isReady(); } - - bool writing() const { return !writeFuture.isReady(); } - - bool evictable() const { - // Don't evict if a page is still being read or written - return !reading() && !writing(); - } - - Future onEvictable() const { return ready(readFuture) && writeFuture; } - }; - ACTOR static Future clearRemapQueue_impl(DWALPager* self) { // Wait for outstanding commit. wait(self->commitFuture); @@ -3820,6 +3797,7 @@ private: Future clearRemapQueue() override { return clearRemapQueue_impl(this); } +private: // Physical page sizes will always be a multiple of 4k because AsyncFileNonDurable requires // this in simulation, and it also makes sense for current SSDs. // Allowing a smaller 'logical' page size is very useful for testing. 
@@ -3831,7 +3809,6 @@ private: int physicalExtentSize; int pagesPerExtent; -private: PriorityMultiLock ioLock; int64_t pageCacheBytes; @@ -3857,11 +3834,8 @@ private: std::string filename; bool memoryOnly; - typedef ObjectCache PageCacheT; PageCacheT pageCache; - - typedef ObjectCache ExtentCacheT; - ExtentCacheT extentCache; + PageCacheT extentCache; Promise closedPromise; Promise errorPromise; @@ -4827,6 +4801,7 @@ public: VersionedBTree(IPager2* pager, std::string name) : m_pager(pager), m_pBuffer(nullptr), m_mutationCount(0), m_name(name), m_pHeader(nullptr), m_headerSpace(0) { + m_pDecodeCacheMemory = m_pager->getPageCachePenaltySource(); m_lazyClearActor = 0; m_init = init_impl(this); m_latestCommit = m_init; @@ -4886,7 +4861,7 @@ public: // Iterate over page entries, skipping key decoding using BTreePage::ValueTree which uses // RedwoodRecordRef::DeltaValueOnly as the delta type type to skip key decoding - BTreePage::ValueTree::DecodeCache cache(dbBegin, dbEnd, &g_redwoodMetrics.decodeCacheMemory); + BTreePage::ValueTree::DecodeCache cache(dbBegin, dbEnd); BTreePage::ValueTree::Cursor c(&cache, btPage.valueTree()); ASSERT(c.moveFirst()); Version v = entry.version; @@ -5267,6 +5242,9 @@ private: IPager2* m_pager; + // Counter to update with DecodeCache memory usage + int64_t* m_pDecodeCacheMemory = nullptr; + // The mutation buffer currently being written to std::unique_ptr m_pBuffer; int64_t m_mutationCount; @@ -5732,7 +5710,7 @@ private: } // Get cursor into a BTree node, creating decode cache from boundaries if needed - static BTreePage::BinaryTree::Cursor getCursor(Reference page, + inline BTreePage::BinaryTree::Cursor getCursor(const ArenaPage* page, const RedwoodRecordRef& lowerBound, const RedwoodRecordRef& upperBound) { if (page->userData == nullptr) { @@ -5742,7 +5720,7 @@ private: upperBound.toString().c_str()); BTreePage::BinaryTree::DecodeCache* cache = - new BTreePage::BinaryTree::DecodeCache(lowerBound, upperBound, 
&g_redwoodMetrics.decodeCacheMemory); + new BTreePage::BinaryTree::DecodeCache(lowerBound, upperBound, m_pDecodeCacheMemory); page->userData = cache; page->userDataDestructor = [](void* cache) { ((BTreePage::BinaryTree::DecodeCache*)cache)->delref(); }; } @@ -5752,8 +5730,7 @@ private: } // Get cursor into a BTree node from a child link - static BTreePage::BinaryTree::Cursor getCursor(const Reference& page, - const BTreePage::BinaryTree::Cursor& link) { + inline BTreePage::BinaryTree::Cursor getCursor(const ArenaPage* page, const BTreePage::BinaryTree::Cursor& link) { if (page->userData == nullptr) { return getCursor(page, link.get(), link.next().getOrUpperBound()); } @@ -6200,8 +6177,9 @@ private: false, rootID, batch->snapshot->getVersion(), update->decodeLowerBound, update->decodeUpperBound) .c_str()); - state BTreePage::BinaryTree::Cursor cursor = - update->cBegin.valid() ? getCursor(page, update->cBegin) : getCursor(page, dbBegin, dbEnd); + state BTreePage::BinaryTree::Cursor cursor = update->cBegin.valid() + ? 
self->getCursor(page.getPtr(), update->cBegin) + : self->getCursor(page.getPtr(), dbBegin, dbEnd); if (REDWOOD_DEBUG) { debug_printf("%s ---------MUTATION BUFFER SLICE ---------------------\n", context.c_str()); @@ -7058,9 +7036,9 @@ public: true), [=](Reference p) { #if REDWOOD_DEBUG - path.push_back({ p, getCursor(p, link), link.get().getChildPage() }); + path.push_back({ p, btree->getCursor(p.getPtr(), link), link.get().getChildPage() }); #else - path.push_back({ p, getCursor(p, link) }); + path.push_back({ p, btree->getCursor(p.getPtr(), link) }); #endif return Void(); }); @@ -7071,9 +7049,9 @@ public: return map(readPage(reason, btree->m_pHeader->height, pager.getPtr(), id, ioMaxPriority, false, true), [=](Reference p) { #if REDWOOD_DEBUG - path.push_back({ p, getCursor(p, dbBegin, dbEnd), id }); + path.push_back({ p, btree->getCursor(p.getPtr(), dbBegin, dbEnd), id }); #else - path.push_back({ p, getCursor(p, dbBegin, dbEnd) }); + path.push_back({ p, btree->getCursor(p.getPtr(), dbBegin, dbEnd) }); #endif return Void(); }); @@ -8078,6 +8056,133 @@ RedwoodRecordRef randomRedwoodRecordRef(const std::string& keyBuffer, const std: return rec; } +void RedwoodMetrics::getFields(TraceEvent* e, std::string* s, bool skipZeroes) { + std::pair metrics[] = { { "BTreePreload", metric.btreeLeafPreload }, + { "BTreePreloadExt", metric.btreeLeafPreloadExt }, + { "", 0 }, + { "OpSet", metric.opSet }, + { "OpSetKeyBytes", metric.opSetKeyBytes }, + { "OpSetValueBytes", metric.opSetValueBytes }, + { "OpClear", metric.opClear }, + { "OpClearKey", metric.opClearKey }, + { "", 0 }, + { "OpGet", metric.opGet }, + { "OpGetRange", metric.opGetRange }, + { "OpCommit", metric.opCommit }, + { "", 0 }, + { "PagerDiskWrite", metric.pagerDiskWrite }, + { "PagerDiskRead", metric.pagerDiskRead }, + { "PagerCacheHit", metric.pagerCacheHit }, + { "PagerCacheMiss", metric.pagerCacheMiss }, + { "", 0 }, + { "PagerProbeHit", metric.pagerProbeHit }, + { "PagerProbeMiss", metric.pagerProbeMiss }, + 
{ "PagerEvictUnhit", metric.pagerEvictUnhit }, + { "PagerEvictFail", metric.pagerEvictFail }, + { "", 0 }, + { "PagerRemapFree", metric.pagerRemapFree }, + { "PagerRemapCopy", metric.pagerRemapCopy }, + { "PagerRemapSkip", metric.pagerRemapSkip }, + { "", 0 } }; + + double elapsed = now() - startTime; + + if (e != nullptr) { + for (auto& m : metrics) { + char c = m.first[0]; + if (c != 0 && (!skipZeroes || m.second != 0)) { + e->detail(m.first, m.second); + } + } + levels[0].metrics.events.toTraceEvent(e, 0); + } + + if (s != nullptr) { + for (auto& m : metrics) { + if (*m.first == '\0') { + *s += "\n"; + } else if (!skipZeroes || m.second != 0) { + *s += format("%-15s %-8u %8" PRId64 "/s ", m.first, m.second, int64_t(m.second / elapsed)); + } + } + *s += levels[0].metrics.events.toString(0, elapsed); + } + + auto const& evictor = DWALPager::PageCacheT::Evictor::getEvictor(); + + std::pair cacheMetrics[] = { { "PageCacheCount", evictor->getCountUsed() }, + { "PageCacheMoved", evictor->getCountMoved() }, + { "PageCacheSize", evictor->getSizeUsed() }, + { "DecodeCacheSize", evictor->reservedSize } }; + + if (e != nullptr) { + for (auto& m : cacheMetrics) { + e->detail(m.first, m.second); + } + } + + if (s != nullptr) { + for (auto& m : cacheMetrics) { + *s += format("%-15s %-14" PRId64 " ", m.first, m.second); + } + *s += "\n"; + } + + for (int i = 1; i < btreeLevels + 1; ++i) { + auto& metric = levels[i].metrics; + + std::pair metrics[] = { + { "PageBuild", metric.pageBuild }, + { "PageBuildExt", metric.pageBuildExt }, + { "PageModify", metric.pageModify }, + { "PageModifyExt", metric.pageModifyExt }, + { "", 0 }, + { "PageRead", metric.pageRead }, + { "PageReadExt", metric.pageReadExt }, + { "PageCommitStart", metric.pageCommitStart }, + { "", 0 }, + { "LazyClearInt", metric.lazyClearRequeue }, + { "LazyClearIntExt", metric.lazyClearRequeueExt }, + { "LazyClear", metric.lazyClearFree }, + { "LazyClearExt", metric.lazyClearFreeExt }, + { "", 0 }, + { "ForceUpdate", 
metric.forceUpdate }, + { "DetachChild", metric.detachChild }, + { "", 0 }, + }; + + if (e != nullptr) { + for (auto& m : metrics) { + char c = m.first[0]; + if (c != 0 && (!skipZeroes || m.second != 0)) { + e->detail(format("L%d%s", i, m.first + (c == '-' ? 1 : 0)), m.second); + } + } + metric.events.toTraceEvent(e, i); + } + + if (s != nullptr) { + *s += format("\nLevel %d\n\t", i); + + for (auto& m : metrics) { + const char* name = m.first; + bool rate = elapsed != 0; + if (*name == '-') { + ++name; + rate = false; + } + + if (*name == '\0') { + *s += "\n\t"; + } else if (!skipZeroes || m.second != 0) { + *s += format("%-15s %8u %8u/s ", name, m.second, rate ? int(m.second / elapsed) : 0); + } + } + *s += metric.events.toString(i, elapsed); + } + } +} + TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") { ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[0] == 3); ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[1] == 4); @@ -9219,7 +9324,7 @@ TEST_CASE("Lredwood/correctness/btree") { state int64_t cacheSizeBytes = params.getInt("cacheSizeBytes") .orDefault(pagerMemoryOnly ? 2e9 - : (pageSize * deterministicRandom()->randomInt(1, (BUGGIFY ? 2 : 10000) + 1))); + : (pageSize * deterministicRandom()->randomInt(1, (BUGGIFY ? 
10 : 10000) + 1))); state Version versionIncrement = params.getInt("versionIncrement").orDefault(deterministicRandom()->randomInt64(1, 1e8)); state Version remapCleanupWindow = @@ -9517,6 +9622,9 @@ TEST_CASE("Lredwood/correctness/btree") { debug_printf("Closing.\n"); wait(closedFuture); + wait(delay(0)); + ASSERT(DWALPager::PageCacheT::Evictor::getEvictor()->empty()); + return Void(); } @@ -9982,6 +10090,9 @@ TEST_CASE(":/redwood/performance/set") { btree->close(); wait(closedFuture); + wait(delay(0)); + ASSERT(DWALPager::PageCacheT::Evictor::getEvictor()->empty()); + return Void(); } From c3e6eea41a24baeb1c3a29014bb652c6a4018f23 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Sat, 26 Feb 2022 00:07:17 +0100 Subject: [PATCH 021/138] ApiTester: In memory key-value store --- bindings/c/CMakeLists.txt | 2 + .../c/test/apitester/TesterApiWrapper.cpp | 2 +- bindings/c/test/apitester/TesterApiWrapper.h | 6 +- .../apitester/TesterCorrectnessWorkload.cpp | 2 +- .../c/test/apitester/TesterKeyValueStore.cpp | 149 ++++++++++++++++++ .../c/test/apitester/TesterKeyValueStore.h | 74 +++++++++ bindings/c/test/apitester/TesterOptions.h | 4 +- bindings/c/test/apitester/TesterScheduler.h | 4 +- .../apitester/TesterTransactionExecutor.h | 4 +- bindings/c/test/apitester/TesterWorkload.cpp | 20 +++ bindings/c/test/apitester/TesterWorkload.h | 4 +- .../c/test/apitester/fdb_c_api_tester.cpp | 2 +- 12 files changed, 259 insertions(+), 14 deletions(-) create mode 100644 bindings/c/test/apitester/TesterKeyValueStore.cpp create mode 100644 bindings/c/test/apitester/TesterKeyValueStore.h diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index 18c3ae8793..1a05ba88c6 100644 --- a/bindings/c/CMakeLists.txt +++ b/bindings/c/CMakeLists.txt @@ -102,6 +102,8 @@ if(NOT WIN32 AND NOT IS_ARM_MAC) test/apitester/TesterApiWrapper.cpp test/apitester/TesterApiWrapper.h test/apitester/TesterCorrectnessWorkload.cpp + test/apitester/TesterKeyValueStore.cpp + 
test/apitester/TesterKeyValueStore.h test/apitester/TesterOptions.h test/apitester/TesterScheduler.cpp test/apitester/TesterScheduler.h diff --git a/bindings/c/test/apitester/TesterApiWrapper.cpp b/bindings/c/test/apitester/TesterApiWrapper.cpp index 764533d837..710040a2cb 100644 --- a/bindings/c/test/apitester/TesterApiWrapper.cpp +++ b/bindings/c/test/apitester/TesterApiWrapper.cpp @@ -3,7 +3,7 @@ * * This source file is part of the FoundationDB open source project * - * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/bindings/c/test/apitester/TesterApiWrapper.h b/bindings/c/test/apitester/TesterApiWrapper.h index 7b3769a536..c7d7571dbc 100644 --- a/bindings/c/test/apitester/TesterApiWrapper.h +++ b/bindings/c/test/apitester/TesterApiWrapper.h @@ -3,7 +3,7 @@ * * This source file is part of the FoundationDB open source project * - * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,8 +20,8 @@ #pragma once -#ifndef SYS_TEST_API_WRAPPER_H -#define SYS_TEST_API_WRAPPER_H +#ifndef APITESTER_API_WRAPPER_H +#define APITESTER_API_WRAPPER_H #include #include diff --git a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp index a6a546666c..3ab3c5ab71 100644 --- a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp +++ b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp @@ -3,7 +3,7 @@ * * This source file is part of the FoundationDB open source project * - * Copyright 2013-2021 Apple Inc. 
and the FoundationDB project authors + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/bindings/c/test/apitester/TesterKeyValueStore.cpp b/bindings/c/test/apitester/TesterKeyValueStore.cpp new file mode 100644 index 0000000000..f40f01f134 --- /dev/null +++ b/bindings/c/test/apitester/TesterKeyValueStore.cpp @@ -0,0 +1,149 @@ +/* + * TesterKeyValueStore.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "TesterKeyValueStore.h" + +// Get the value associated with a key +std::optional KeyValueStore::get(std::string_view key) const { + std::map::const_iterator value = store.find(std::string(key)); + if (value != store.end()) + return value->second; + else + return std::optional(); +} + +// Returns the key designated by a key selector +std::string KeyValueStore::getKey(std::string_view keyName, bool orEqual, int offset) const { + // Begin by getting the start key referenced by the key selector + std::map::const_iterator mapItr = store.lower_bound(keyName); + + // Update the iterator position if necessary based on the value of orEqual + int count = 0; + if (offset <= 0) { + if (mapItr == store.end() || keyName != mapItr->first || !orEqual) { + if (mapItr == store.begin()) + return startKey(); + + mapItr--; + } + } else { + if (mapItr == store.end()) + return endKey(); + + if (keyName == mapItr->first && orEqual) { + mapItr++; + } + + count++; + } + + // Increment the map iterator until the desired offset is reached + for (; count < abs(offset); count++) { + if (offset < 0) { + if (mapItr == store.begin()) + break; + + mapItr--; + } else { + if (mapItr == store.end()) + break; + + mapItr++; + } + } + + if (mapItr == store.end()) + return endKey(); + else if (count == abs(offset)) + return mapItr->first; + else + return startKey(); +} + +// Gets a range of key-value pairs, returning a maximum of results +std::vector KeyValueStore::getRange(std::string_view begin, + std::string_view end, + int limit, + bool reverse) const { + std::vector results; + if (!reverse) { + std::map::const_iterator mapItr = store.lower_bound(begin); + + for (; mapItr != store.end() && mapItr->first < end && results.size() < limit; mapItr++) + results.push_back(KeyValue{ mapItr->first, mapItr->second }); + } + + // Support for reverse getRange queries is supported, but not tested at this time. 
This is because reverse range + // queries have been disallowed by the database at the API level + else { + std::map::const_iterator mapItr = store.lower_bound(end); + if (mapItr == store.begin()) + return results; + + for (--mapItr; mapItr->first >= begin && results.size() < abs(limit); mapItr--) { + results.push_back(KeyValue{ mapItr->first, mapItr->second }); + if (mapItr == store.begin()) + break; + } + } + + return results; +} + +// Stores a key-value pair in the database +void KeyValueStore::set(std::string_view key, std::string_view value) { + store[std::string(key)] = value; +} + +// Removes a key from the database +void KeyValueStore::clear(std::string_view key) { + auto iter = store.find(key); + if (iter != store.end()) { + store.erase(iter); + } +} + +// Removes a range of keys from the database +void KeyValueStore::clear(std::string_view begin, std::string_view end) { + store.erase(store.lower_bound(begin), store.lower_bound(end)); +} + +// The number of keys in the database +uint64_t KeyValueStore::size() const { + return store.size(); +} + +// The first key in the database; returned by key selectors that choose a key off the front +std::string KeyValueStore::startKey() const { + return ""; +} + +// The last key in the database; returned by key selectors that choose a key off the back +std::string KeyValueStore::endKey() const { + return "\xff"; +} + +// Debugging function that prints all key-value pairs +void KeyValueStore::printContents() const { + printf("Contents:\n"); + std::map::const_iterator mapItr; + for (mapItr = store.begin(); mapItr != store.end(); mapItr++) + printf("%s\n", mapItr->first.c_str()); +} diff --git a/bindings/c/test/apitester/TesterKeyValueStore.h b/bindings/c/test/apitester/TesterKeyValueStore.h new file mode 100644 index 0000000000..9dbd35b0de --- /dev/null +++ b/bindings/c/test/apitester/TesterKeyValueStore.h @@ -0,0 +1,74 @@ +/* + * TesterKeyValueStore.h + * + * This source file is part of the FoundationDB open source 
project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifndef APITESTER_KEY_VALUE_STORE_H +#define APITESTER_KEY_VALUE_STORE_H + +#include +#include +#include +#include +#include + +class KeyValueStore { +public: + struct KeyValue { + std::string key; + std::string value; + }; + + // Get the value associated with a key + std::optional get(std::string_view key) const; + + // Returns the key designated by a key selector + std::string getKey(std::string_view keyName, bool orEqual, int offset) const; + + // Gets a range of key-value pairs, returning a maximum of results + std::vector getRange(std::string_view begin, std::string_view end, int limit, bool reverse) const; + + // Stores a key-value pair in the database + void set(std::string_view key, std::string_view value); + + // Removes a key from the database + void clear(std::string_view key); + + // Removes a range of keys from the database + void clear(std::string_view begin, std::string_view end); + + // The number of keys in the database + uint64_t size() const; + + // The first key in the database; returned by key selectors that choose a key off the front + std::string startKey() const; + + // The last key in the database; returned by key selectors that choose a key off the back + std::string endKey() const; + + // Debugging function that prints all key-value pairs + void printContents() const; + 
+private: + // A map holding the key-value pairs + std::map> store; +}; + +#endif \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterOptions.h b/bindings/c/test/apitester/TesterOptions.h index 3dfcf5e0e2..999c1cafac 100644 --- a/bindings/c/test/apitester/TesterOptions.h +++ b/bindings/c/test/apitester/TesterOptions.h @@ -20,8 +20,8 @@ #pragma once -#ifndef SYS_TEST_OPTIONS_TESTER_OPTIONS_H -#define SYSTEM_TESTER_TESTER_OPTIONS_H +#ifndef APITESTER_TESTER_OPTIONS_H +#define APITESTER_TESTER_OPTIONS_H #include #include diff --git a/bindings/c/test/apitester/TesterScheduler.h b/bindings/c/test/apitester/TesterScheduler.h index a680cd70ad..1486bfba02 100644 --- a/bindings/c/test/apitester/TesterScheduler.h +++ b/bindings/c/test/apitester/TesterScheduler.h @@ -20,8 +20,8 @@ #pragma once -#ifndef SYS_TEST_SCHEDULER_H -#define SYS_TEST_SCHEDULER_H +#ifndef APITESTER_SCHEDULER_H +#define APITESTER_SCHEDULER_H #include #include diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.h b/bindings/c/test/apitester/TesterTransactionExecutor.h index 6ffe7aa47a..d7b077c180 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.h +++ b/bindings/c/test/apitester/TesterTransactionExecutor.h @@ -20,8 +20,8 @@ #pragma once -#ifndef SYS_TEST_TRANSACTION_EXECUTOR_H -#define SYS_TEST_TRANSACTION_EXECUTOR_H +#ifndef APITESTER_TRANSACTION_EXECUTOR_H +#define APITESTER_TRANSACTION_EXECUTOR_H #include "TesterOptions.h" #include "TesterApiWrapper.h" diff --git a/bindings/c/test/apitester/TesterWorkload.cpp b/bindings/c/test/apitester/TesterWorkload.cpp index 17b9f6a29e..c4269dd37b 100644 --- a/bindings/c/test/apitester/TesterWorkload.cpp +++ b/bindings/c/test/apitester/TesterWorkload.cpp @@ -1,3 +1,23 @@ +/* + * TesterWorkload.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #include "TesterWorkload.h" #include diff --git a/bindings/c/test/apitester/TesterWorkload.h b/bindings/c/test/apitester/TesterWorkload.h index 42f55cb69f..b6f57ced1b 100644 --- a/bindings/c/test/apitester/TesterWorkload.h +++ b/bindings/c/test/apitester/TesterWorkload.h @@ -20,8 +20,8 @@ #pragma once -#ifndef SYS_TEST_WORKLOAD_H -#define SYS_TEST_WORKLOAD_H +#ifndef APITESTER_WORKLOAD_H +#define APITESTER_WORKLOAD_H #include "TesterTransactionExecutor.h" #include diff --git a/bindings/c/test/apitester/fdb_c_api_tester.cpp b/bindings/c/test/apitester/fdb_c_api_tester.cpp index 85994144ba..134ec85281 100644 --- a/bindings/c/test/apitester/fdb_c_api_tester.cpp +++ b/bindings/c/test/apitester/fdb_c_api_tester.cpp @@ -3,7 +3,7 @@ * * This source file is part of the FoundationDB open source project * - * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From 3967f9ed1482c72a88cf2fcbe847ca2f65b8df1f Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Sat, 26 Feb 2022 00:09:37 +0100 Subject: [PATCH 022/138] ApiTester: Introduce workload manager --- .../apitester/TesterCorrectnessWorkload.cpp | 10 ++-- bindings/c/test/apitester/TesterScheduler.cpp | 2 + bindings/c/test/apitester/TesterScheduler.h | 2 + bindings/c/test/apitester/TesterWorkload.cpp | 49 +++++++++++++++---- bindings/c/test/apitester/TesterWorkload.h | 41 +++++++++++++--- .../c/test/apitester/fdb_c_api_tester.cpp | 14 +++--- 6 files changed, 91 insertions(+), 27 deletions(-) diff --git a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp index 3ab3c5ab71..1b90e4b5a6 100644 --- a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp +++ b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp @@ -26,7 +26,7 @@ namespace FdbApiTester { class ApiCorrectnessWorkload : public WorkloadBase { public: - ApiCorrectnessWorkload() : numTxLeft(10) {} + ApiCorrectnessWorkload() : numTxLeft(1000) {} void start() override { schedule([this]() { nextTransaction(); }); @@ -34,7 +34,9 @@ public: private: void nextTransaction() { - std::cout << numTxLeft << " transactions left" << std::endl; + if (numTxLeft % 100 == 0) { + std::cout << numTxLeft << " transactions left" << std::endl; + } if (numTxLeft == 0) return; @@ -54,8 +56,8 @@ private: int numTxLeft; }; -std::unique_ptr createApiCorrectnessWorkload() { - return std::make_unique(); +std::shared_ptr createApiCorrectnessWorkload() { + return std::make_shared(); } } // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterScheduler.cpp b/bindings/c/test/apitester/TesterScheduler.cpp index 14661b04b2..7038cd85bd 100644 --- a/bindings/c/test/apitester/TesterScheduler.cpp +++ b/bindings/c/test/apitester/TesterScheduler.cpp @@ -29,6 +29,8 @@ using namespace boost::asio; namespace FdbApiTester { +const TTaskFct 
NO_OP_TASK = []() {}; + class AsioScheduler : public IScheduler { public: AsioScheduler(int numThreads) : numThreads(numThreads) {} diff --git a/bindings/c/test/apitester/TesterScheduler.h b/bindings/c/test/apitester/TesterScheduler.h index 1486bfba02..676384fc9c 100644 --- a/bindings/c/test/apitester/TesterScheduler.h +++ b/bindings/c/test/apitester/TesterScheduler.h @@ -30,6 +30,8 @@ namespace FdbApiTester { using TTaskFct = std::function; +extern const TTaskFct NO_OP_TASK; + class IScheduler { public: virtual ~IScheduler() {} diff --git a/bindings/c/test/apitester/TesterWorkload.cpp b/bindings/c/test/apitester/TesterWorkload.cpp index c4269dd37b..5406c65635 100644 --- a/bindings/c/test/apitester/TesterWorkload.cpp +++ b/bindings/c/test/apitester/TesterWorkload.cpp @@ -20,36 +20,65 @@ #include "TesterWorkload.h" #include +#include namespace FdbApiTester { -void WorkloadBase::init(ITransactionExecutor* txExecutor, IScheduler* sched, TTaskFct cont) { - this->txExecutor = txExecutor; - this->scheduler = sched; - this->doneCont = cont; +void WorkloadBase::init(WorkloadManager* manager) { + this->manager = manager; } void WorkloadBase::schedule(TTaskFct task) { tasksScheduled++; - scheduler->schedule([this, task]() { + manager->scheduler->schedule([this, task]() { tasksScheduled--; task(); - contIfDone(); + checkIfDone(); }); } void WorkloadBase::execTransaction(std::shared_ptr tx, TTaskFct cont) { txRunning++; - txExecutor->execute(tx, [this, cont]() { + manager->txExecutor->execute(tx, [this, cont]() { txRunning--; cont(); - contIfDone(); + checkIfDone(); }); } -void WorkloadBase::contIfDone() { +void WorkloadBase::checkIfDone() { if (txRunning == 0 && tasksScheduled == 0) { - doneCont(); + manager->workloadDone(this); + } +} + +void WorkloadManager::add(std::shared_ptr workload, TTaskFct cont) { + std::unique_lock lock(mutex); + workloads[workload.get()] = WorkloadInfo{ workload, cont }; +} + +void WorkloadManager::run() { + for (auto iter : workloads) { + 
iter.first->init(this); + } + for (auto iter : workloads) { + iter.first->start(); + } + scheduler->join(); +} + +void WorkloadManager::workloadDone(IWorkload* workload) { + std::unique_lock lock(mutex); + auto iter = workloads.find(workload); + assert(iter != workloads.end()); + lock.unlock(); + iter->second.cont(); + lock.lock(); + workloads.erase(iter); + bool done = workloads.empty(); + lock.unlock(); + if (done) { + scheduler->stop(); } } diff --git a/bindings/c/test/apitester/TesterWorkload.h b/bindings/c/test/apitester/TesterWorkload.h index b6f57ced1b..53cc8637fb 100644 --- a/bindings/c/test/apitester/TesterWorkload.h +++ b/bindings/c/test/apitester/TesterWorkload.h @@ -25,20 +25,24 @@ #include "TesterTransactionExecutor.h" #include +#include +#include namespace FdbApiTester { +class WorkloadManager; + class IWorkload { public: virtual ~IWorkload() {} - virtual void init(ITransactionExecutor* txExecutor, IScheduler* sched, TTaskFct cont) = 0; + virtual void init(WorkloadManager* manager) = 0; virtual void start() = 0; }; class WorkloadBase : public IWorkload { public: - WorkloadBase() : txExecutor(nullptr), scheduler(nullptr), tasksScheduled(0), txRunning(0) {} - void init(ITransactionExecutor* txExecutor, IScheduler* sched, TTaskFct cont) override; + WorkloadBase() : manager(nullptr), tasksScheduled(0), txRunning(0) {} + void init(WorkloadManager* manager) override; protected: void schedule(TTaskFct task); @@ -46,16 +50,39 @@ protected: void execTransaction(TTxStartFct start, TTaskFct cont) { execTransaction(std::make_shared(start), cont); } - void contIfDone(); + void checkIfDone(); private: - ITransactionExecutor* txExecutor; - IScheduler* scheduler; - TTaskFct doneCont; + WorkloadManager* manager; std::atomic tasksScheduled; std::atomic txRunning; }; +class WorkloadManager { +public: + WorkloadManager(ITransactionExecutor* txExecutor, IScheduler* scheduler) + : txExecutor(txExecutor), scheduler(scheduler) {} + + void add(std::shared_ptr workload, 
TTaskFct cont = NO_OP_TASK); + void run(); + +private: + friend WorkloadBase; + + struct WorkloadInfo { + std::shared_ptr ref; + TTaskFct cont; + }; + + void workloadDone(IWorkload* workload); + + ITransactionExecutor* txExecutor; + IScheduler* scheduler; + + std::mutex mutex; + std::unordered_map workloads; +}; + } // namespace FdbApiTester #endif \ No newline at end of file diff --git a/bindings/c/test/apitester/fdb_c_api_tester.cpp b/bindings/c/test/apitester/fdb_c_api_tester.cpp index 134ec85281..8334149351 100644 --- a/bindings/c/test/apitester/fdb_c_api_tester.cpp +++ b/bindings/c/test/apitester/fdb_c_api_tester.cpp @@ -220,7 +220,7 @@ void fdb_check(fdb_error_t e) { } } // namespace -std::unique_ptr createApiCorrectnessWorkload(); +std::shared_ptr createApiCorrectnessWorkload(); } // namespace FdbApiTester @@ -248,11 +248,13 @@ void runApiCorrectness(TesterOptions& options) { std::unique_ptr txExecutor = createTransactionExecutor(); scheduler->start(); txExecutor->init(scheduler.get(), options.clusterFile.c_str(), txExecOptions); - std::unique_ptr workload = createApiCorrectnessWorkload(); - IScheduler* schedPtr = scheduler.get(); - workload->init(txExecutor.get(), schedPtr, [schedPtr]() { schedPtr->stop(); }); - workload->start(); - scheduler->join(); + + WorkloadManager workloadMgr(txExecutor.get(), scheduler.get()); + for (int i = 0; i < 10; i++) { + std::shared_ptr workload = createApiCorrectnessWorkload(); + workloadMgr.add(workload); + } + workloadMgr.run(); } int main(int argc, char** argv) { From a47f481bae5745a1b093fae51c4b4cf786ea45fe Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Fri, 25 Feb 2022 19:21:11 -0800 Subject: [PATCH 023/138] Bug fix, the extent cache used during recovery is not allowed to evict anything so it now uses a private Evictor with no size limit. 
--- fdbserver/VersionedBTree.actor.cpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 4e43c57a1b..27b88e7c43 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -1790,6 +1790,8 @@ public: // must eventually give them back with moveIn() or remove them with reclaim(). class Evictor : NonCopyable { public: + Evictor(int64_t sizeLimit = 0) : sizeLimit(sizeLimit) {} + // Evictors are normally singletons, either one per real process or one per virtual process in simulation static Evictor* getEvictor() { static Evictor nonSimEvictor; @@ -1850,18 +1852,21 @@ public: } void trim(int additionalSpaceNeeded = 0) { + int attemptsLeft = FLOW_KNOBS->MAX_EVICT_ATTEMPTS; // While the cache is too big, evict the oldest entry until the oldest entry can't be evicted. - while (sizeUsed > (sizeLimit - reservedSize - additionalSpaceNeeded)) { + while (attemptsLeft-- > 0 && sizeUsed > (sizeLimit - reservedSize - additionalSpaceNeeded) && + !evictionOrder.empty()) { Entry& toEvict = evictionOrder.front(); debug_printf("Evictor count=%" PRId64 " sizeUsed=%" PRId64 " sizeLimit=%" PRId64 " sizePenalty=%" PRId64 - " needed=%d Trying to evict %s\n", + " needed=%d Trying to evict %s evictable %d\n", evictionOrder.size(), sizeUsed, sizeLimit, reservedSize, additionalSpaceNeeded, - ::toString(toEvict.index).c_str()); + ::toString(toEvict.index).c_str(), + toEvict.item.evictable()); if (!toEvict.item.evictable()) { // shift the front to the back @@ -1908,7 +1913,7 @@ public: // Any external data strutures whose memory usage should be counted as part of the object cache // budget should add their usage to this total and keep it updated. 
int64_t reservedSize = 0; - int64_t sizeLimit = 0; + int64_t sizeLimit; private: EvictionOrderT evictionOrder; @@ -3835,7 +3840,11 @@ private: bool memoryOnly; PageCacheT pageCache; - PageCacheT extentCache; + + // The extent cache isn't a normal cache, it isn't allowed to evict things. It is populated + // during recovery with remap queue extents and then cleared. + PageCacheT::Evictor extentCacheDummyEvictor{ std::numeric_limits::max() }; + PageCacheT extentCache{ &extentCacheDummyEvictor }; Promise closedPromise; Promise errorPromise; From c122cf1ce00d7a984213fcea34044c6e6c520a85 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sat, 26 Feb 2022 00:10:46 -0800 Subject: [PATCH 024/138] Bug fix: Avoid having a local temporary Cursor destructed after the Reference containing its DecodeCache has been dropped by using the BTreeCursor's leaf cursor by reference. --- fdbserver/VersionedBTree.actor.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 27b88e7c43..9ccfba5225 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -7411,8 +7411,9 @@ public: } while (cur.isValid()) { - // Read page contents without using waits - BTreePage::BinaryTree::Cursor leafCursor = cur.back().cursor; + // Read leaf page contents without using waits by using the leaf page cursor directly + // and advancing it until it is no longer valid + BTreePage::BinaryTree::Cursor& leafCursor = cur.back().cursor; // we can bypass the bounds check for each key in the leaf if the entire leaf is in range // > because both query end and page upper bound are exclusive of the query results and page contents, @@ -7466,8 +7467,9 @@ public: } while (cur.isValid()) { - // Read page contents without using waits - BTreePage::BinaryTree::Cursor leafCursor = cur.back().cursor; + // Read leaf page contents without using waits by using the leaf page cursor directly + // 
and advancing it until it is no longer valid + BTreePage::BinaryTree::Cursor& leafCursor = cur.back().cursor; // we can bypass the bounds check for each key in the leaf if the entire leaf is in range // < because both query begin and page lower bound are inclusive of the query results and page contents, From 3a666610ad7d5bc5ee55833186853ab3dd161b70 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sat, 26 Feb 2022 20:06:21 -0800 Subject: [PATCH 025/138] Tweaked Redwood unit test ops limits and added total record read count to prevent test instances that run too long. --- fdbserver/VersionedBTree.actor.cpp | 81 ++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 27 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 2ce8bc3fc7..a7cf97f057 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -7623,7 +7623,8 @@ ACTOR Future verifyRangeBTreeCursor(VersionedBTree* btree, Key start, Key end, Version v, - std::map, Optional>* written) { + std::map, Optional>* written, + int64_t* pRecordsRead) { if (end <= start) end = keyAfter(start); @@ -7658,6 +7659,9 @@ ACTOR Future verifyRangeBTreeCursor(VersionedBTree* btree, while (cur.isValid() && cur.get().key < end) { // Find the next written kv pair that would be present at this version while (1) { + // Since the written map grows, range scans become less efficient so count all records written + // at any version against the records read count + ++*pRecordsRead; iLast = i; if (i == iEnd) break; @@ -7747,6 +7751,7 @@ ACTOR Future verifyRangeBTreeCursor(VersionedBTree* btree, state std::reverse_iterator r = results.rbegin(); while (cur.isValid() && cur.get().key >= start) { + ++*pRecordsRead; if (r == results.rend()) { printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR:BTree key '%s' vs nothing in written map.\n", v, @@ -7793,10 +7798,11 @@ ACTOR Future verifyRangeBTreeCursor(VersionedBTree* btree, return Void(); } -// 
Verify the result of point reads for every set or cleared key at the given version +// Verify the result of point reads for every set or cleared key change made at exactly v ACTOR Future seekAllBTreeCursor(VersionedBTree* btree, Version v, - std::map, Optional>* written) { + std::map, Optional>* written, + int64_t* pRecordsRead) { state std::map, Optional>::const_iterator i = written->cbegin(); state std::map, Optional>::const_iterator iEnd = written->cend(); state VersionedBTree::BTreeCursor cur; @@ -7804,6 +7810,8 @@ ACTOR Future seekAllBTreeCursor(VersionedBTree* btree, wait(btree->initBTreeCursor(&cur, v, PagerEventReasons::RangeRead)); while (i != iEnd) { + // Since the written map gets larger and takes longer to scan each time, count visits to all written recs + ++*pRecordsRead; state std::string key = i->first.first; state Version ver = i->first.second; if (ver == v) { @@ -7852,6 +7860,7 @@ ACTOR Future seekAllBTreeCursor(VersionedBTree* btree, ACTOR Future verify(VersionedBTree* btree, FutureStream vStream, std::map, Optional>* written, + int64_t* pRecordsRead, bool serial) { // Queue of committed versions still readable from btree @@ -7883,8 +7892,8 @@ ACTOR Future verify(VersionedBTree* btree, wait(btree->initBTreeCursor(&cur, v, PagerEventReasons::RangeRead)); debug_printf("Verifying entire key range at version %" PRId64 "\n", v); - state Future fRangeAll = - verifyRangeBTreeCursor(btree, LiteralStringRef(""), LiteralStringRef("\xff\xff"), v, written); + state Future fRangeAll = verifyRangeBTreeCursor( + btree, LiteralStringRef(""), LiteralStringRef("\xff\xff"), v, written, pRecordsRead); if (serial) { wait(fRangeAll); } @@ -7894,13 +7903,13 @@ ACTOR Future verify(VersionedBTree* btree, debug_printf( "Verifying range (%s, %s) at version %" PRId64 "\n", toString(begin).c_str(), toString(end).c_str(), v); - state Future fRangeRandom = verifyRangeBTreeCursor(btree, begin, end, v, written); + state Future fRangeRandom = verifyRangeBTreeCursor(btree, begin, 
end, v, written, pRecordsRead); if (serial) { wait(fRangeRandom); } debug_printf("Verifying seeks to each changed key at version %" PRId64 "\n", v); - state Future fSeekAll = seekAllBTreeCursor(btree, v, written); + state Future fSeekAll = seekAllBTreeCursor(btree, v, written, pRecordsRead); if (serial) { wait(fSeekAll); } @@ -7918,7 +7927,7 @@ ACTOR Future verify(VersionedBTree* btree, } // Does a random range read, doesn't trap/report errors -ACTOR Future randomReader(VersionedBTree* btree) { +ACTOR Future randomReader(VersionedBTree* btree, int64_t* pRecordsRead) { try { state VersionedBTree::BTreeCursor cur; @@ -7933,6 +7942,7 @@ ACTOR Future randomReader(VersionedBTree* btree) { state int c = deterministicRandom()->randomInt(0, 100); state bool direction = deterministicRandom()->coinflip(); while (cur.isValid() && c-- > 0) { + ++*pRecordsRead; wait(success(direction ? cur.moveNext() : cur.movePrev())); wait(yield()); } @@ -9310,7 +9320,6 @@ TEST_CASE("Lredwood/correctness/btree") { params.getInt("extentSize") .orDefault(deterministicRandom()->coinflip() ? SERVER_KNOBS->REDWOOD_DEFAULT_EXTENT_SIZE : deterministicRandom()->randomInt(4096, 32768)); - state int64_t targetPageOps = params.getInt("targetPageOps").orDefault(shortTest ? 50000 : 1000000); state bool pagerMemoryOnly = params.getInt("pagerMemoryOnly").orDefault(shortTest && (deterministicRandom()->random01() < .001)); state int maxKeySize = params.getInt("maxKeySize").orDefault(deterministicRandom()->randomInt(1, pageSize * 2)); @@ -9341,13 +9350,21 @@ TEST_CASE("Lredwood/correctness/btree") { state Version remapCleanupWindow = params.getInt("remapCleanupWindow") .orDefault(BUGGIFY ? 
0 : deterministicRandom()->randomInt64(1, versionIncrement * 50)); - state int maxVerificationMapEntries = params.getInt("maxVerificationMapEntries").orDefault(300e3); state int concurrentExtentReads = params.getInt("concurrentExtentReads").orDefault(SERVER_KNOBS->REDWOOD_EXTENT_CONCURRENT_READS); + // These settings are an attempt to keep the test execution real reasonably short + state int64_t maxPageOps = params.getInt("maxPageOps").orDefault((shortTest || serialTest) ? 50e3 : 1e6); + state int maxVerificationMapEntries = + params.getInt("maxVerificationMapEntries").orDefault((1.0 - coldStartProbability) * 300e3); + // Max number of records in the BTree or the versioned written map to visit + state int64_t maxRecordsRead = 300e6; + printf("\n"); printf("file: %s\n", file.c_str()); - printf("targetPageOps: %" PRId64 "\n", targetPageOps); + printf("maxPageOps: %" PRId64 "\n", maxPageOps); + printf("maxVerificationMapEntries: %d\n", maxVerificationMapEntries); + printf("maxRecordsRead: %" PRId64 "\n", maxRecordsRead); printf("pagerMemoryOnly: %d\n", pagerMemoryOnly); printf("serialTest: %d\n", serialTest); printf("shortTest: %d\n", shortTest); @@ -9366,7 +9383,6 @@ TEST_CASE("Lredwood/correctness/btree") { printf("cacheSizeBytes: %s\n", cacheSizeBytes == 0 ? 
"default" : format("%" PRId64, cacheSizeBytes).c_str()); printf("versionIncrement: %" PRId64 "\n", versionIncrement); printf("remapCleanupWindow: %" PRId64 "\n", remapCleanupWindow); - printf("maxVerificationMapEntries: %d\n", maxVerificationMapEntries); printf("\n"); printf("Deleting existing test data...\n"); @@ -9379,6 +9395,7 @@ TEST_CASE("Lredwood/correctness/btree") { wait(btree->init()); state std::map, Optional> written; + state int64_t totalRecordsRead = 0; state std::set keys; state Version lastVer = btree->getLastCommittedVersion(); @@ -9396,8 +9413,9 @@ TEST_CASE("Lredwood/correctness/btree") { state int mutationBytesTargetThisCommit = randomSize(maxCommitSize); state PromiseStream committedVersions; - state Future verifyTask = verify(btree, committedVersions.getFuture(), &written, serialTest); - state Future randomTask = serialTest ? Void() : (randomReader(btree) || btree->getError()); + state Future verifyTask = + verify(btree, committedVersions.getFuture(), &written, &totalRecordsRead, serialTest); + state Future randomTask = serialTest ? 
Void() : (randomReader(btree, &totalRecordsRead) || btree->getError()); committedVersions.send(lastVer); // Sometimes do zero-change commit at last version @@ -9408,7 +9426,13 @@ TEST_CASE("Lredwood/correctness/btree") { state Future commit = Void(); state int64_t totalPageOps = 0; - while (totalPageOps < targetPageOps && written.size() < maxVerificationMapEntries) { + // Check test op limits + state std::function testFinished = [=]() { + return !(totalPageOps < maxPageOps && written.size() < maxVerificationMapEntries && + totalRecordsRead < maxRecordsRead); + }; + + while (!testFinished()) { // Sometimes increment the version if (deterministicRandom()->random01() < 0.10) { ++version; @@ -9510,8 +9534,7 @@ TEST_CASE("Lredwood/correctness/btree") { } // Commit after any limits for this commit or the total test are reached - if (totalPageOps >= targetPageOps || written.size() >= maxVerificationMapEntries || - mutationBytesThisCommit >= mutationBytesTargetThisCommit) { + if (mutationBytesThisCommit >= mutationBytesTargetThisCommit || testFinished()) { // Wait for previous commit to finish wait(commit); printf("Commit complete. 
Next commit %d bytes, %" PRId64 " bytes committed so far.", @@ -9531,17 +9554,21 @@ TEST_CASE("Lredwood/correctness/btree") { 0, btree->getLastCommittedVersion() - btree->getOldestReadableVersion() + 1)); } - commit = map(btree->commit(version), [=, &ops = totalPageOps, v = version](Void) { + commit = map(btree->commit(version), [&, v = version](Void) { // Update pager ops before clearing metrics - ops += g_redwoodMetrics.pageOps(); - fmt::print("Committed {0} PageOps {1}/{2} ({3:.2f}) VerificationMapEntries {4}/{5} ({6:.2f})\n", + totalPageOps += g_redwoodMetrics.pageOps(); + fmt::print("Committed {0} PageOps {1}/{2} ({3:.2f}%) VerificationMapEntries {4}/{5} ({6:.2f}%) " + "RecordsRead {7}/{8} ({9:.2f}%)\n", toString(v).c_str(), - ops, - targetPageOps, - ops * 100.0 / targetPageOps, + totalPageOps, + maxPageOps, + totalPageOps * 100.0 / maxPageOps, written.size(), maxVerificationMapEntries, - written.size() * 100.0 / maxVerificationMapEntries); + written.size() * 100.0 / maxVerificationMapEntries, + totalRecordsRead, + maxRecordsRead, + totalRecordsRead * 100.0 / maxRecordsRead); printf("Committed:\n%s\n", g_redwoodMetrics.toString(true).c_str()); // Notify the background verifier that version is committed and therefore readable @@ -9556,7 +9583,7 @@ TEST_CASE("Lredwood/correctness/btree") { debug_printf("Waiting for verification to complete.\n"); wait(verifyTask); committedVersions = PromiseStream(); - verifyTask = verify(btree, committedVersions.getFuture(), &written, serialTest); + verifyTask = verify(btree, committedVersions.getFuture(), &written, &totalRecordsRead, serialTest); } mutationBytesThisCommit = 0; @@ -9600,9 +9627,9 @@ TEST_CASE("Lredwood/correctness/btree") { // Create new promise stream and start the verifier again committedVersions = PromiseStream(); - verifyTask = verify(btree, committedVersions.getFuture(), &written, serialTest); + verifyTask = verify(btree, committedVersions.getFuture(), &written, &totalRecordsRead, serialTest); if 
(!serialTest) { - randomTask = randomReader(btree) || btree->getError(); + randomTask = randomReader(btree, &totalRecordsRead) || btree->getError(); } committedVersions.send(version); } From 73a81a86da41dc6c91d3b88eb83c0030c7de07f6 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sat, 26 Feb 2022 23:37:30 -0800 Subject: [PATCH 026/138] Added Redwood metrics printing during set unit test using new generic repeatEvery() actor. --- fdbserver/VersionedBTree.actor.cpp | 12 ++++++------ flow/genericactors.actor.h | 9 +++++++++ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index a7cf97f057..9a4ba64d58 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -10093,6 +10093,10 @@ TEST_CASE(":/redwood/performance/set") { printf("StorageBytes=%s\n", btree->getStorageBytes().toString().c_str()); } + state Future stats = + traceMetrics ? Void() + : repeatEvery(1.0, [&]() { printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); }); + if (scans > 0) { printf("Parallel scans, concurrency=%d, scans=%d, scanWidth=%d, scanPreftchBytes=%d ...\n", concurrentScans, @@ -10104,9 +10108,6 @@ TEST_CASE(":/redwood/performance/set") { randomScans(btree, scans / concurrentScans, scanWidth, scanPrefetchBytes, firstKeyChar, lastKeyChar)); } wait(actors.signalAndReset()); - if (!traceMetrics) { - printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); - } } if (seeks > 0) { @@ -10115,11 +10116,10 @@ TEST_CASE(":/redwood/performance/set") { actors.add(randomSeeks(btree, seeks / concurrentSeeks, firstKeyChar, lastKeyChar)); } wait(actors.signalAndReset()); - if (!traceMetrics) { - printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); - } } + stats.cancel(); + if (destructiveSanityCheck) { wait(btree->clearAllAndCheckSanity()); } diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 8d1465b756..2209a8f3c3 100644 --- 
a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -2042,6 +2042,15 @@ private: Reference data; }; +// Call a lambda every seconds +ACTOR template +Future repeatEvery(double interval, Fn fn) { + loop { + wait(delay(interval)); + fn(); + } +} + #include "flow/unactorcompiler.h" #endif From 398374c7d715a11d7254533a314602ecb51defce Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sun, 27 Feb 2022 00:36:23 -0800 Subject: [PATCH 027/138] Rename cacheSizeBytes to pageCacheBytes to match other tests. --- fdbserver/VersionedBTree.actor.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 9a4ba64d58..555dcbd1e8 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -9341,8 +9341,8 @@ TEST_CASE("Lredwood/correctness/btree") { params.getDouble("coldStartProbability").orDefault(pagerMemoryOnly ? 0 : (deterministicRandom()->random01())); state double advanceOldVersionProbability = params.getDouble("advanceOldVersionProbability").orDefault(deterministicRandom()->random01()); - state int64_t cacheSizeBytes = - params.getInt("cacheSizeBytes") + state int64_t pageCacheBytes = + params.getInt("pageCacheBytes") .orDefault(pagerMemoryOnly ? 2e9 : (pageSize * deterministicRandom()->randomInt(1, (BUGGIFY ? 10 : 10000) + 1))); state Version versionIncrement = @@ -9380,7 +9380,7 @@ TEST_CASE("Lredwood/correctness/btree") { printf("clearPostSetProbability: %f\n", clearPostSetProbability); printf("coldStartProbability: %f\n", coldStartProbability); printf("advanceOldVersionProbability: %f\n", advanceOldVersionProbability); - printf("cacheSizeBytes: %s\n", cacheSizeBytes == 0 ? "default" : format("%" PRId64, cacheSizeBytes).c_str()); + printf("pageCacheBytes: %s\n", pageCacheBytes == 0 ? 
"default" : format("%" PRId64, pageCacheBytes).c_str()); printf("versionIncrement: %" PRId64 "\n", versionIncrement); printf("remapCleanupWindow: %" PRId64 "\n", remapCleanupWindow); printf("\n"); @@ -9390,7 +9390,7 @@ TEST_CASE("Lredwood/correctness/btree") { printf("Initializing...\n"); pager = new DWALPager( - pageSize, extentSize, file, cacheSizeBytes, remapCleanupWindow, concurrentExtentReads, pagerMemoryOnly); + pageSize, extentSize, file, pageCacheBytes, remapCleanupWindow, concurrentExtentReads, pagerMemoryOnly); state VersionedBTree* btree = new VersionedBTree(pager, file); wait(btree->init()); @@ -9609,7 +9609,7 @@ TEST_CASE("Lredwood/correctness/btree") { printf("Reopening btree from disk.\n"); IPager2* pager = new DWALPager( - pageSize, extentSize, file, cacheSizeBytes, remapCleanupWindow, concurrentExtentReads); + pageSize, extentSize, file, pageCacheBytes, remapCleanupWindow, concurrentExtentReads); btree = new VersionedBTree(pager, file); wait(btree->init()); @@ -9650,7 +9650,7 @@ TEST_CASE("Lredwood/correctness/btree") { btree->close(); wait(closedFuture); btree = - new VersionedBTree(new DWALPager(pageSize, extentSize, file, cacheSizeBytes, 0, concurrentExtentReads), file); + new VersionedBTree(new DWALPager(pageSize, extentSize, file, pageCacheBytes, 0, concurrentExtentReads), file); wait(btree->init()); wait(btree->clearAllAndCheckSanity()); From ee5030d06cc16016a5f947c9849cab68486025c9 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 28 Feb 2022 00:16:10 -0800 Subject: [PATCH 028/138] Fixed memory "leak" where child node update tracker entries for freed internal pages were not removed so the map would just grow. 
--- fdbserver/VersionedBTree.actor.cpp | 50 +++++++++++++++++------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 555dcbd1e8..072fbe9876 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -39,6 +39,7 @@ #include "flow/ActorCollection.h" #include #include +#include #include #include "fdbclient/CommitTransaction.h" #include "fdbserver/IKeyValueStore.h" @@ -4610,13 +4611,6 @@ struct BTreePage { } }; -static void makeEmptyRoot(Reference page) { - BTreePage* btpage = (BTreePage*)page->begin(); - btpage->height = 1; - btpage->kvBytes = 0; - btpage->tree()->build(page->size(), nullptr, nullptr, nullptr, nullptr); -} - struct BoundaryRefAndPage { Standalone lowerBound; Reference firstPage; @@ -4716,7 +4710,7 @@ public: } } - bool maybeUpdated(LogicalPageID child) { return (mask(child) & bits) != 0; } + bool maybeUpdated(LogicalPageID child) const { return (mask(child) & bits) != 0; } uint32_t bits; int count; @@ -4879,8 +4873,8 @@ public: BTreePageIDRef btChildPageID = c.get().getChildPage(); // If this page is height 2, then the children are leaves so free them directly if (entry.height == 2) { - debug_printf("LazyClear: freeing child %s\n", toString(btChildPageID).c_str()); - self->freeBTreePage(btChildPageID, v); + debug_printf("LazyClear: freeing leaf child %s\n", toString(btChildPageID).c_str()); + self->freeBTreePage(1, btChildPageID, v); freedPages += btChildPageID.size(); metrics.lazyClearFree += 1; metrics.lazyClearFreeExt += (btChildPageID.size() - 1); @@ -4900,7 +4894,7 @@ public: // Free the page, now that its children have either been freed or queued debug_printf("LazyClear: freeing queue entry %s\n", toString(entry.pageID).c_str()); - self->freeBTreePage(entry.pageID, v); + self->freeBTreePage(entry.height, entry.pageID, v); freedPages += entry.pageID.size(); metrics.lazyClearFree += 1; metrics.lazyClearFreeExt += 
entry.pageID.size() - 1; @@ -4946,7 +4940,7 @@ public: self->m_pHeader->root.set(newRoot, self->m_headerSpace - sizeof(MetaKey)); self->m_pHeader->height = 1; Reference page = self->m_pager->newPageBuffer(); - makeEmptyRoot(page); + self->makeEmptyRoot(id, page); self->m_pager->updatePage(PagerEventReasons::MetaData, nonBtreeLevel, newRoot, page); LogicalPageID newQueuePage = wait(self->m_pager->newPageID()); @@ -5467,6 +5461,13 @@ private: return pages; } + void makeEmptyRoot(LogicalPageID id, Reference page) { + BTreePage* btpage = (BTreePage*)page->begin(); + btpage->height = 1; + btpage->kvBytes = 0; + btpage->tree()->build(page->size(), nullptr, nullptr, nullptr, nullptr); + } + // Writes entries to 1 or more pages and return a vector of boundary keys with their ArenaPage(s) ACTOR static Future>> writePages(VersionedBTree* self, const RedwoodRecordRef* lowerBound, @@ -5612,7 +5613,7 @@ private: // Free the old IDs, but only once (before the first output record is added). if (records.empty()) { - self->freeBTreePage(previousID, v); + self->freeBTreePage(height, previousID, v); } state Standalone> emptyPages; @@ -5760,15 +5761,21 @@ private: } } - void freeBTreePage(BTreePageIDRef btPageID, Version v) { + void freeBTreePage(int height, BTreePageIDRef btPageID, Version v) { // Free individual pages at v for (LogicalPageID id : btPageID) { m_pager->freePage(id, v); } + + // Stop tracking child updates for deleted internal nodes + if (height > 1 && !btPageID.empty()) { + childUpdateTracker.erase(btPageID.front()); + } } // Write new version of pageID at version v using page as its data. - // Attempts to reuse original id(s) in btPageID, returns BTreePageID. + // If oldID size is 1, attempts to keep logical page ID via an atomic page update. 
+ // Returns resulting BTreePageID which might be the same as the input // updateBTreePage is only called from commitSubTree function so write reason is always btree commit ACTOR static Future updateBTreePage(VersionedBTree* self, BTreePageIDRef oldID, @@ -5811,7 +5818,7 @@ private: newID[i] = id; ++i; } - self->freeBTreePage(oldID, writeVersion); + self->freeBTreePage(height, oldID, writeVersion); return newID; } @@ -6458,7 +6465,7 @@ private: // If the tree is now empty, delete the page if (cursor.tree->numItems == 0) { update->cleared(); - self->freeBTreePage(rootID, batch->writeVersion); + self->freeBTreePage(height, rootID, batch->writeVersion); debug_printf("%s Page updates cleared all entries, returning %s\n", context.c_str(), toString(*update).c_str()); @@ -6477,7 +6484,7 @@ private: // If everything in the page was deleted then this page should be deleted as of the new version if (merged.empty()) { update->cleared(); - self->freeBTreePage(rootID, batch->writeVersion); + self->freeBTreePage(height, rootID, batch->writeVersion); debug_printf("%s All leaf page contents were cleared, returning %s\n", context.c_str(), @@ -6637,7 +6644,7 @@ private: debug_printf("%s: freeing child page in cleared subtree range: %s\n", context.c_str(), ::toString(rec.getChildPage()).c_str()); - self->freeBTreePage(rec.getChildPage(), batch->writeVersion); + self->freeBTreePage(height, rec.getChildPage(), batch->writeVersion); } else { debug_printf("%s: queuing subtree deletion cleared subtree range: %s\n", context.c_str(), @@ -6740,8 +6747,7 @@ private: debug_printf("%s All internal page children were deleted so deleting this page too, returning %s\n", context.c_str(), toString(*update).c_str()); - self->freeBTreePage(rootID, batch->writeVersion); - self->childUpdateTracker.erase(rootID.front()); + self->freeBTreePage(height, rootID, batch->writeVersion); } else { if (modifier.updating) { // Page was updated in place (or being forced to be updated in place to update child page 
ids) @@ -6918,7 +6924,7 @@ private: debug_printf("Writing new empty root.\n"); LogicalPageID newRootID = wait(self->m_pager->newPageID()); Reference page = self->m_pager->newPageBuffer(); - makeEmptyRoot(page); + self->makeEmptyRoot(newRootID, page); self->m_pHeader->height = 1; VectorRef rootID((LogicalPageID*)&newRootID, 1); self->m_pager->updatePage(PagerEventReasons::Commit, self->m_pHeader->height, rootID, page); From bd1bf723048d5ff7417b5e1bca74cc9016d08f98 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Mon, 28 Feb 2022 15:53:29 +0100 Subject: [PATCH 029/138] ApiTester: Test mixed insert/get workload; support concurrent execution of multiple gets --- bindings/c/CMakeLists.txt | 2 + .../c/test/apitester/TesterApiWrapper.cpp | 2 +- bindings/c/test/apitester/TesterApiWrapper.h | 3 +- .../apitester/TesterCorrectnessWorkload.cpp | 174 +++++++++++++++--- .../c/test/apitester/TesterKeyValueStore.cpp | 29 ++- .../c/test/apitester/TesterKeyValueStore.h | 19 +- bindings/c/test/apitester/TesterOptions.h | 2 +- bindings/c/test/apitester/TesterScheduler.cpp | 6 +- bindings/c/test/apitester/TesterScheduler.h | 2 +- .../apitester/TesterTransactionExecutor.cpp | 87 +++++---- .../apitester/TesterTransactionExecutor.h | 5 +- bindings/c/test/apitester/TesterUtil.cpp | 96 ++++++++++ bindings/c/test/apitester/TesterUtil.h | 66 +++++++ bindings/c/test/apitester/TesterWorkload.cpp | 14 +- bindings/c/test/apitester/TesterWorkload.h | 25 ++- .../c/test/apitester/fdb_c_api_tester.cpp | 5 +- 16 files changed, 451 insertions(+), 86 deletions(-) create mode 100644 bindings/c/test/apitester/TesterUtil.cpp create mode 100644 bindings/c/test/apitester/TesterUtil.h diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index 1a05ba88c6..ff91b09a21 100644 --- a/bindings/c/CMakeLists.txt +++ b/bindings/c/CMakeLists.txt @@ -109,6 +109,8 @@ if(NOT WIN32 AND NOT IS_ARM_MAC) test/apitester/TesterScheduler.h test/apitester/TesterTransactionExecutor.cpp 
test/apitester/TesterTransactionExecutor.h + test/apitester/TesterUtil.cpp + test/apitester/TesterUtil.h test/apitester/TesterWorkload.cpp test/apitester/TesterWorkload.h ../../flow/SimpleOpt.h diff --git a/bindings/c/test/apitester/TesterApiWrapper.cpp b/bindings/c/test/apitester/TesterApiWrapper.cpp index 710040a2cb..34b1fd802c 100644 --- a/bindings/c/test/apitester/TesterApiWrapper.cpp +++ b/bindings/c/test/apitester/TesterApiWrapper.cpp @@ -44,7 +44,7 @@ fdb_error_t Future::getError() const { return fdb_future_get_error(future_.get()); } -std::optional ValueFuture::getValue() const { +std::optional ValueFuture::getValue() const { int out_present; const std::uint8_t* val; int vallen; diff --git a/bindings/c/test/apitester/TesterApiWrapper.h b/bindings/c/test/apitester/TesterApiWrapper.h index c7d7571dbc..0eb7191b75 100644 --- a/bindings/c/test/apitester/TesterApiWrapper.h +++ b/bindings/c/test/apitester/TesterApiWrapper.h @@ -42,6 +42,7 @@ public: FDBFuture* fdbFuture() { return future_.get(); }; fdb_error_t getError() const; + explicit operator bool() const { return future_ != nullptr; }; void reset(); protected: @@ -52,7 +53,7 @@ class ValueFuture : public Future { public: ValueFuture() = default; ValueFuture(FDBFuture* f) : Future(f) {} - std::optional getValue() const; + std::optional getValue() const; }; class Transaction { diff --git a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp index 1b90e4b5a6..3f2ee3573d 100644 --- a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp +++ b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp @@ -18,46 +18,176 @@ * limitations under the License. 
*/ #include "TesterWorkload.h" +#include "TesterUtil.h" +#include "TesterKeyValueStore.h" +#include "test/apitester/TesterScheduler.h" #include #include #include +#include namespace FdbApiTester { class ApiCorrectnessWorkload : public WorkloadBase { public: - ApiCorrectnessWorkload() : numTxLeft(1000) {} + enum OpType { OP_INSERT, OP_GET, OP_LAST = OP_GET }; + + // The minimum length of a key + int minKeyLength; + + // The maximum length of a key + int maxKeyLength; + + // The minimum length of a value + int minValueLength; + + // The maximum length of a value + int maxValueLength; + + // Maximum number of keys to be accessed by a transaction + int maxKeysPerTransaction; + + // The number of operations to be executed + int numOperations; + + // The ratio of reading existing keys + double readExistingKeysRatio; + + // Key prefix + std::string keyPrefix; + + ApiCorrectnessWorkload(std::string_view prefix) { + minKeyLength = 1; + maxKeyLength = 64; + minValueLength = 1; + maxValueLength = 1000; + maxKeysPerTransaction = 50; + numOperations = 1000; + readExistingKeysRatio = 0.9; + keyPrefix = prefix; + numOpLeft = numOperations; + } void start() override { schedule([this]() { nextTransaction(); }); } -private: - void nextTransaction() { - if (numTxLeft % 100 == 0) { - std::cout << numTxLeft << " transactions left" << std::endl; - } - if (numTxLeft == 0) - return; + std::string randomKeyName() { return keyPrefix + random.randomStringLowerCase(minKeyLength, maxKeyLength); } - numTxLeft--; - execTransaction( - [](auto ctx) { - ValueFuture fGet = ctx->tx()->get(ctx->dbKey("foo"), false); - ctx->continueAfter(fGet, [fGet, ctx]() { - std::optional optStr = fGet.getValue(); - ctx->tx()->set(ctx->dbKey("foo"), optStr.value_or("bar")); - ctx->commit(); - }); - }, - [this]() { nextTransaction(); }); + std::string randomValue() { return random.randomStringLowerCase(minValueLength, maxValueLength); } + + std::string randomNotExistingKey() { + while (true) { + std::string key = 
randomKeyName(); + if (!store.exists(key)) { + return key; + } + } } - int numTxLeft; + std::string randomExistingKey() { + std::string genKey = randomKeyName(); + std::string key = store.getKey(genKey, true, 1); + if (key != store.endKey()) { + return key; + } else { + return store.getKey(genKey, true, 0); + } + } + + std::string randomKey(double existingKeyRatio) { + if (random.randomBool(existingKeyRatio)) { + return randomExistingKey(); + } else { + return randomNotExistingKey(); + } + } + + void randomInsertOp(TTaskFct cont) { + int numKeys = random.randomInt(1, maxKeysPerTransaction); + auto kvPairs = std::make_shared>(); + for (int i = 0; i < numKeys; i++) { + kvPairs->push_back(KeyValue{ randomNotExistingKey(), randomValue() }); + } + execTransaction( + [kvPairs](auto ctx) { + for (const KeyValue& kv : *kvPairs) { + ctx->tx()->set(kv.key, kv.value); + } + ctx->commit(); + }, + [this, kvPairs, cont]() { + for (const KeyValue& kv : *kvPairs) { + store.set(kv.key, kv.value); + } + cont(); + }); + } + + void randomGetOp(TTaskFct cont) { + int numKeys = random.randomInt(1, maxKeysPerTransaction); + auto keys = std::make_shared>(); + auto results = std::make_shared>>(); + for (int i = 0; i < numKeys; i++) { + keys->push_back(randomKey(readExistingKeysRatio)); + } + execTransaction( + [keys, results](auto ctx) { + auto futures = std::make_shared>(); + for (const auto& key : *keys) { + futures->push_back(ctx->tx()->get(key, false)); + } + ctx->continueAfterAll(futures, [ctx, futures, results]() { + for (auto& f : *futures) { + results->push_back(((ValueFuture&)f).getValue()); + } + ctx->done(); + }); + }, + [this, keys, results, cont]() { + ASSERT(results->size() == keys->size()); + for (int i = 0; i < keys->size(); i++) { + auto expected = store.get((*keys)[i]); + if ((*results)[i] != expected) { + std::cout << "randomGetOp mismatch. 
expected: " << expected << " actual: " << (*results)[i] + << std::endl; + } + } + cont(); + }); + } + + void randomOperation(TTaskFct cont) { + OpType txType = (OpType)random.randomInt(0, OP_LAST); + switch (txType) { + case OP_INSERT: + randomInsertOp(cont); + break; + case OP_GET: + randomGetOp(cont); + break; + } + } + +private: + void nextTransaction() { + if (numOpLeft % 100 == 0) { + std::cout << numOpLeft << " transactions left" << std::endl; + } + if (numOpLeft == 0) + return; + + numOpLeft--; + randomOperation([this]() { schedule([this]() { nextTransaction(); }); }); + } + + int numOpLeft; + Random random; + KeyValueStore store; }; -std::shared_ptr createApiCorrectnessWorkload() { - return std::make_shared(); +std::shared_ptr createApiCorrectnessWorkload(std::string_view prefix) { + return std::make_shared(prefix); } } // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterKeyValueStore.cpp b/bindings/c/test/apitester/TesterKeyValueStore.cpp index f40f01f134..3c221bb47c 100644 --- a/bindings/c/test/apitester/TesterKeyValueStore.cpp +++ b/bindings/c/test/apitester/TesterKeyValueStore.cpp @@ -19,18 +19,29 @@ */ #include "TesterKeyValueStore.h" +#include + +namespace FdbApiTester { // Get the value associated with a key std::optional KeyValueStore::get(std::string_view key) const { - std::map::const_iterator value = store.find(std::string(key)); + std::unique_lock lock(mutex); + auto value = store.find(std::string(key)); if (value != store.end()) return value->second; else return std::optional(); } +// Checks if the key exists +bool KeyValueStore::exists(std::string_view key) { + std::unique_lock lock(mutex); + return (store.find(std::string(key)) != store.end()); +} + // Returns the key designated by a key selector std::string KeyValueStore::getKey(std::string_view keyName, bool orEqual, int offset) const { + std::unique_lock lock(mutex); // Begin by getting the start key referenced by the key selector 
std::map::const_iterator mapItr = store.lower_bound(keyName); @@ -78,10 +89,11 @@ std::string KeyValueStore::getKey(std::string_view keyName, bool orEqual, int of } // Gets a range of key-value pairs, returning a maximum of results -std::vector KeyValueStore::getRange(std::string_view begin, - std::string_view end, - int limit, - bool reverse) const { +std::vector KeyValueStore::getRange(std::string_view begin, + std::string_view end, + int limit, + bool reverse) const { + std::unique_lock lock(mutex); std::vector results; if (!reverse) { std::map::const_iterator mapItr = store.lower_bound(begin); @@ -109,11 +121,13 @@ std::vector KeyValueStore::getRange(std::string_view be // Stores a key-value pair in the database void KeyValueStore::set(std::string_view key, std::string_view value) { + std::unique_lock lock(mutex); store[std::string(key)] = value; } // Removes a key from the database void KeyValueStore::clear(std::string_view key) { + std::unique_lock lock(mutex); auto iter = store.find(key); if (iter != store.end()) { store.erase(iter); @@ -122,11 +136,13 @@ void KeyValueStore::clear(std::string_view key) { // Removes a range of keys from the database void KeyValueStore::clear(std::string_view begin, std::string_view end) { + std::unique_lock lock(mutex); store.erase(store.lower_bound(begin), store.lower_bound(end)); } // The number of keys in the database uint64_t KeyValueStore::size() const { + std::unique_lock lock(mutex); return store.size(); } @@ -142,8 +158,11 @@ std::string KeyValueStore::endKey() const { // Debugging function that prints all key-value pairs void KeyValueStore::printContents() const { + std::unique_lock lock(mutex); printf("Contents:\n"); std::map::const_iterator mapItr; for (mapItr = store.begin(); mapItr != store.end(); mapItr++) printf("%s\n", mapItr->first.c_str()); } + +} // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterKeyValueStore.h 
b/bindings/c/test/apitester/TesterKeyValueStore.h index 9dbd35b0de..c7474d7831 100644 --- a/bindings/c/test/apitester/TesterKeyValueStore.h +++ b/bindings/c/test/apitester/TesterKeyValueStore.h @@ -28,17 +28,23 @@ #include #include #include +#include + +namespace FdbApiTester { + +struct KeyValue { + std::string key; + std::string value; +}; class KeyValueStore { public: - struct KeyValue { - std::string key; - std::string value; - }; - // Get the value associated with a key std::optional get(std::string_view key) const; + // Checks if the key exists + bool exists(std::string_view key); + // Returns the key designated by a key selector std::string getKey(std::string_view keyName, bool orEqual, int offset) const; @@ -69,6 +75,9 @@ public: private: // A map holding the key-value pairs std::map> store; + mutable std::mutex mutex; }; +} // namespace FdbApiTester + #endif \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterOptions.h b/bindings/c/test/apitester/TesterOptions.h index 999c1cafac..595ee79530 100644 --- a/bindings/c/test/apitester/TesterOptions.h +++ b/bindings/c/test/apitester/TesterOptions.h @@ -3,7 +3,7 @@ * * This source file is part of the FoundationDB open source project * - * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/bindings/c/test/apitester/TesterScheduler.cpp b/bindings/c/test/apitester/TesterScheduler.cpp index 7038cd85bd..ac39f430bd 100644 --- a/bindings/c/test/apitester/TesterScheduler.cpp +++ b/bindings/c/test/apitester/TesterScheduler.cpp @@ -3,7 +3,7 @@ * * This source file is part of the FoundationDB open source project * - * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2022 Apple Inc. 
and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,10 +19,10 @@ */ #include "TesterScheduler.h" +#include "TesterUtil.h" #include #include -#include #include using namespace boost::asio; @@ -60,7 +60,7 @@ private: }; std::unique_ptr createScheduler(int numThreads) { - assert(numThreads > 0 && numThreads <= 1000); + ASSERT(numThreads > 0 && numThreads <= 1000); return std::make_unique(numThreads); } diff --git a/bindings/c/test/apitester/TesterScheduler.h b/bindings/c/test/apitester/TesterScheduler.h index 676384fc9c..491aef568b 100644 --- a/bindings/c/test/apitester/TesterScheduler.h +++ b/bindings/c/test/apitester/TesterScheduler.h @@ -3,7 +3,7 @@ * * This source file is part of the FoundationDB open source project * - * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.cpp b/bindings/c/test/apitester/TesterTransactionExecutor.cpp index bee96f4fca..2f4ba49046 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.cpp +++ b/bindings/c/test/apitester/TesterTransactionExecutor.cpp @@ -3,7 +3,7 @@ * * This source file is part of the FoundationDB open source project * - * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,10 +19,11 @@ */ #include "TesterTransactionExecutor.h" +#include "TesterUtil.h" +#include "test/apitester/TesterScheduler.h" #include -#include #include -#include +#include namespace FdbApiTester { @@ -37,6 +38,17 @@ void fdb_check(fdb_error_t e) { } // namespace +void ITransactionContext::continueAfterAll(std::shared_ptr> futures, TTaskFct cont) { + auto counter = std::make_shared>(futures->size()); + for (auto& f : *futures) { + continueAfter(f, [counter, cont]() { + if (--(*counter) == 0) { + cont(); + } + }); + } +} + class TransactionContext : public ITransactionContext { public: TransactionContext(FDBTransaction* tx, @@ -49,19 +61,14 @@ public: Transaction* tx() override { return &fdbTx; } void continueAfter(Future f, TTaskFct cont) override { doContinueAfter(f, cont); } void commit() override { - currFuture = fdbTx.commit(); - doContinueAfter(currFuture, [this]() { done(); }); + Future f = fdbTx.commit(); + doContinueAfter(f, [this]() { done(); }); } void done() override { TTaskFct cont = contAfterDone; delete this; cont(); } - std::string_view dbKey(std::string_view key) override { - std::string keyWithPrefix(options.prefix); - keyWithPrefix.append(key); - return key; - } private: void doContinueAfter(Future f, TTaskFct cont) { @@ -77,19 +84,25 @@ private: fdb_check(fdb_future_block_until_ready(f.fdbFuture())); fdb_error_t err = f.getError(); if (err) { - currFuture = fdbTx.onError(err); - fdb_check(fdb_future_block_until_ready(currFuture.fdbFuture())); - handleOnErrorResult(); + std::unique_lock lock(mutex); + if (!onErrorFuture) { + onErrorFuture = fdbTx.onError(err); + fdb_check(fdb_future_block_until_ready(onErrorFuture.fdbFuture())); + scheduler->schedule([this]() { handleOnErrorResult(); }); + } } else { - cont(); + scheduler->schedule([cont]() { cont(); }); } }); } void asyncContinueAfter(Future f, TTaskFct cont) { - currCont = cont; - currFuture = f; - fdb_check(fdb_future_set_callback(f.fdbFuture(), futureReadyCallback, this)); + 
std::unique_lock lock(mutex); + if (!onErrorFuture) { + waitMap[f.fdbFuture()] = WaitInfo{ f, cont }; + lock.unlock(); + fdb_check(fdb_future_set_callback(f.fdbFuture(), futureReadyCallback, this)); + } } static void futureReadyCallback(FDBFuture* f, void* param) { @@ -98,14 +111,21 @@ private: } void onFutureReady(FDBFuture* f) { + std::unique_lock lock(mutex); + auto iter = waitMap.find(f); + if (iter == waitMap.end()) { + return; + } fdb_error_t err = fdb_future_get_error(f); + TTaskFct cont = iter->second.cont; + waitMap.erase(iter); if (err) { - currFuture = tx()->onError(err); - fdb_check(fdb_future_set_callback(currFuture.fdbFuture(), onErrorReadyCallback, this)); + waitMap.clear(); + onErrorFuture = tx()->onError(err); + lock.unlock(); + fdb_check(fdb_future_set_callback(onErrorFuture.fdbFuture(), onErrorReadyCallback, this)); } else { - scheduler->schedule(currCont); - currFuture.reset(); - currCont = TTaskFct(); + scheduler->schedule(cont); } } @@ -119,26 +139,33 @@ private: } void handleOnErrorResult() { - fdb_error_t err = currFuture.getError(); - currFuture.reset(); - currCont = TTaskFct(); + std::unique_lock lock(mutex); + fdb_error_t err = onErrorFuture.getError(); + onErrorFuture.reset(); if (err) { finalError = err; done(); } else { + lock.unlock(); txActor->reset(); txActor->start(); } } + struct WaitInfo { + Future future; + TTaskFct cont; + }; + const TransactionExecutorOptions& options; Transaction fdbTx; std::shared_ptr txActor; - TTaskFct currCont; + std::mutex mutex; + std::unordered_map waitMap; + Future onErrorFuture; TTaskFct contAfterDone; IScheduler* scheduler; fdb_error_t finalError; - Future currFuture; }; class TransactionExecutor : public ITransactionExecutor { @@ -155,12 +182,10 @@ public: fdb_check(fdb_create_database(clusterFile, &db)); databases.push_back(db); } - std::random_device dev; - random.seed(dev()); } void execute(std::shared_ptr txActor, TTaskFct cont) override { - int idx = std::uniform_int_distribution<>(0, 
options.numDatabases - 1)(random); + int idx = random.randomInt(0, options.numDatabases - 1); FDBTransaction* tx; fdb_check(fdb_database_create_transaction(databases[idx], &tx)); TransactionContext* ctx = new TransactionContext(tx, txActor, cont, options, scheduler); @@ -178,7 +203,7 @@ private: std::vector databases; TransactionExecutorOptions options; IScheduler* scheduler; - std::mt19937 random; + Random random; }; std::unique_ptr createTransactionExecutor() { diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.h b/bindings/c/test/apitester/TesterTransactionExecutor.h index d7b077c180..ef01838205 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.h +++ b/bindings/c/test/apitester/TesterTransactionExecutor.h @@ -3,7 +3,7 @@ * * This source file is part of the FoundationDB open source project * - * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -38,7 +38,7 @@ public: virtual void continueAfter(Future f, TTaskFct cont) = 0; virtual void commit() = 0; virtual void done() = 0; - virtual std::string_view dbKey(std::string_view key) = 0; + virtual void continueAfterAll(std::shared_ptr> futures, TTaskFct cont); }; class ITransactionActor { @@ -56,7 +56,6 @@ public: protected: ITransactionContext* ctx() { return context; } Transaction* tx() { return ctx()->tx(); } - std::string_view dbKey(std::string_view key) { return ctx()->dbKey(key); } void commit() { ctx()->commit(); } void reset() override {} diff --git a/bindings/c/test/apitester/TesterUtil.cpp b/bindings/c/test/apitester/TesterUtil.cpp new file mode 100644 index 0000000000..641df8bcc2 --- /dev/null +++ b/bindings/c/test/apitester/TesterUtil.cpp @@ -0,0 +1,96 @@ +/* + * TesterUtil.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "TesterUtil.h" +#include + +namespace FdbApiTester { + +Random::Random() { + std::random_device dev; + random.seed(dev()); +} + +int Random::randomInt(int min, int max) { + return std::uniform_int_distribution(min, max)(random); +} + +std::string Random::randomStringLowerCase(int minLength, int maxLength) { + int length = randomInt(minLength, maxLength); + std::string str; + str.reserve(length); + for (int i = 0; i < length; i++) { + str += (char)randomInt('a', 'z'); + } + return str; +} + +bool Random::randomBool(double trueRatio) { + return std::uniform_real_distribution(0.0, 1.0)(random) <= trueRatio; +} + +int vsformat(std::string& outputString, const char* form, va_list args) { + char buf[200]; + + va_list args2; + va_copy(args2, args); + int size = vsnprintf(buf, sizeof(buf), form, args2); + va_end(args2); + + if (size >= 0 && size < sizeof(buf)) { + outputString = std::string(buf, size); + return size; + } + +#ifdef _WIN32 + // Microsoft's non-standard vsnprintf doesn't return a correct size, but just an error, so determine the necessary + // size + va_copy(args2, args); + size = _vscprintf(form, args2); + va_end(args2); +#endif + + if (size < 0) { + return -1; + } + + outputString.resize(size + 1); + size = vsnprintf(&outputString[0], outputString.size(), form, args); + if (size < 0 || size >= outputString.size()) { + return -1; + } + + outputString.resize(size); + return size; +} + +std::string format(const char* form, ...) 
{ + va_list args; + va_start(args, form); + + std::string str; + int result = vsformat(str, form, args); + va_end(args); + + ASSERT(result >= 0); + return str; +} + +} // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterUtil.h b/bindings/c/test/apitester/TesterUtil.h new file mode 100644 index 0000000000..6cd68ce9be --- /dev/null +++ b/bindings/c/test/apitester/TesterUtil.h @@ -0,0 +1,66 @@ +/* + * TesterUtil.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#ifndef APITESTER_UTIL_H +#define APITESTER_UTIL_H + +#include +#include +#include + +namespace FdbApiTester { + +class Random { +public: + Random(); + + int randomInt(int min, int max); + + std::string randomStringLowerCase(int minLength, int maxLength); + + bool randomBool(double trueRatio); + + std::mt19937 random; +}; + +template +std::ostream& operator<<(std::ostream& os, const std::optional& obj) { + if (obj.has_value()) { + os << obj.value(); + } else { + os << ""; + } + return os; +} + +std::string format(const char* form, ...); + +#define ASSERT(condition) \ + do { \ + if (!(condition)) { \ + abort(); \ + } \ + } while (false) // For use in destructors, where throwing exceptions is extremely dangerous + +} // namespace FdbApiTester + +#endif \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterWorkload.cpp b/bindings/c/test/apitester/TesterWorkload.cpp index 5406c65635..745083f8c7 100644 --- a/bindings/c/test/apitester/TesterWorkload.cpp +++ b/bindings/c/test/apitester/TesterWorkload.cpp @@ -3,7 +3,7 @@ * * This source file is part of the FoundationDB open source project * - * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,8 +19,8 @@ */ #include "TesterWorkload.h" +#include "TesterUtil.h" #include -#include namespace FdbApiTester { @@ -31,23 +31,23 @@ void WorkloadBase::init(WorkloadManager* manager) { void WorkloadBase::schedule(TTaskFct task) { tasksScheduled++; manager->scheduler->schedule([this, task]() { - tasksScheduled--; task(); + tasksScheduled--; checkIfDone(); }); } void WorkloadBase::execTransaction(std::shared_ptr tx, TTaskFct cont) { - txRunning++; + tasksScheduled++; manager->txExecutor->execute(tx, [this, cont]() { - txRunning--; cont(); + tasksScheduled--; checkIfDone(); }); } void WorkloadBase::checkIfDone() { - if (txRunning == 0 && tasksScheduled == 0) { + if (tasksScheduled == 0) { manager->workloadDone(this); } } @@ -70,7 +70,7 @@ void WorkloadManager::run() { void WorkloadManager::workloadDone(IWorkload* workload) { std::unique_lock lock(mutex); auto iter = workloads.find(workload); - assert(iter != workloads.end()); + ASSERT(iter != workloads.end()); lock.unlock(); iter->second.cont(); lock.lock(); diff --git a/bindings/c/test/apitester/TesterWorkload.h b/bindings/c/test/apitester/TesterWorkload.h index 53cc8637fb..f65e88261b 100644 --- a/bindings/c/test/apitester/TesterWorkload.h +++ b/bindings/c/test/apitester/TesterWorkload.h @@ -3,7 +3,7 @@ * * This source file is part of the FoundationDB open source project * - * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -39,31 +39,48 @@ public: virtual void start() = 0; }; +// A base class for test workloads +// Tracks if workload is active, notifies the workload manager when the workload completes class WorkloadBase : public IWorkload { public: - WorkloadBase() : manager(nullptr), tasksScheduled(0), txRunning(0) {} + WorkloadBase() : manager(nullptr), tasksScheduled(0) {} void init(WorkloadManager* manager) override; protected: + // Schedule the a task as a part of the workload void schedule(TTaskFct task); + + // Execute a transaction within the workload void execTransaction(std::shared_ptr tx, TTaskFct cont); + + // Execute a transaction within the workload, a convenience method for tranasactions defined by a single lambda void execTransaction(TTxStartFct start, TTaskFct cont) { execTransaction(std::make_shared(start), cont); } - void checkIfDone(); private: WorkloadManager* manager; + + // Check if workload is done and notify the workload manager + void checkIfDone(); + + // Keep track of tasks scheduled by the workload + // End workload when this number falls to 0 std::atomic tasksScheduled; - std::atomic txRunning; }; +// Workload manager +// Keeps track of active workoads, stops the scheduler after all workloads complete class WorkloadManager { public: WorkloadManager(ITransactionExecutor* txExecutor, IScheduler* scheduler) : txExecutor(txExecutor), scheduler(scheduler) {} + // Add a workload + // A continuation is to be specified for subworkloads void add(std::shared_ptr workload, TTaskFct cont = NO_OP_TASK); + + // Run all workloads. 
Blocks until all workloads complete void run(); private: diff --git a/bindings/c/test/apitester/fdb_c_api_tester.cpp b/bindings/c/test/apitester/fdb_c_api_tester.cpp index 8334149351..96933dd5aa 100644 --- a/bindings/c/test/apitester/fdb_c_api_tester.cpp +++ b/bindings/c/test/apitester/fdb_c_api_tester.cpp @@ -22,6 +22,7 @@ #include "TesterWorkload.h" #include "TesterScheduler.h" #include "TesterTransactionExecutor.h" +#include "TesterUtil.h" #include #include #include @@ -220,7 +221,7 @@ void fdb_check(fdb_error_t e) { } } // namespace -std::shared_ptr createApiCorrectnessWorkload(); +std::shared_ptr createApiCorrectnessWorkload(std::string_view prefix); } // namespace FdbApiTester @@ -251,7 +252,7 @@ void runApiCorrectness(TesterOptions& options) { WorkloadManager workloadMgr(txExecutor.get(), scheduler.get()); for (int i = 0; i < 10; i++) { - std::shared_ptr workload = createApiCorrectnessWorkload(); + std::shared_ptr workload = createApiCorrectnessWorkload(format("workload%d/", i)); workloadMgr.add(workload); } workloadMgr.run(); From 99f210e19805a39693c01e85eef20b0ea30a30e0 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Mon, 28 Feb 2022 19:01:31 +0100 Subject: [PATCH 030/138] ApiTester: Buggify option; Fixing transaction executor in error paths --- bindings/c/test/apitester/TesterApiWrapper.h | 5 +++ bindings/c/test/apitester/TesterOptions.h | 1 + .../apitester/TesterTransactionExecutor.cpp | 35 +++++++++++-------- .../c/test/apitester/fdb_c_api_tester.cpp | 16 +++++++-- 4 files changed, 41 insertions(+), 16 deletions(-) diff --git a/bindings/c/test/apitester/TesterApiWrapper.h b/bindings/c/test/apitester/TesterApiWrapper.h index 0eb7191b75..df6c3817dc 100644 --- a/bindings/c/test/apitester/TesterApiWrapper.h +++ b/bindings/c/test/apitester/TesterApiWrapper.h @@ -30,6 +30,11 @@ #define FDB_API_VERSION 710 #include "bindings/c/foundationdb/fdb_c.h" +#undef ERROR +#define ERROR(name, number, description) enum { error_code_##name = number }; + +#include 
"flow/error_definitions.h" + namespace FdbApiTester { // Wrapper parent class to manage memory of an FDBFuture pointer. Cleans up diff --git a/bindings/c/test/apitester/TesterOptions.h b/bindings/c/test/apitester/TesterOptions.h index 595ee79530..349f13eac6 100644 --- a/bindings/c/test/apitester/TesterOptions.h +++ b/bindings/c/test/apitester/TesterOptions.h @@ -47,6 +47,7 @@ public: int numDatabases = 1; std::string externalClientLibrary; int numFdbThreads = 1; + bool buggify = false; }; } // namespace FdbApiTester diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.cpp b/bindings/c/test/apitester/TesterTransactionExecutor.cpp index 2f4ba49046..740b234c40 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.cpp +++ b/bindings/c/test/apitester/TesterTransactionExecutor.cpp @@ -66,6 +66,8 @@ public: } void done() override { TTaskFct cont = contAfterDone; + ASSERT(!onErrorFuture); + ASSERT(waitMap.empty()); delete this; cont(); } @@ -81,17 +83,19 @@ private: void blockingContinueAfter(Future f, TTaskFct cont) { scheduler->schedule([this, f, cont]() mutable { - fdb_check(fdb_future_block_until_ready(f.fdbFuture())); - fdb_error_t err = f.getError(); - if (err) { - std::unique_lock lock(mutex); - if (!onErrorFuture) { - onErrorFuture = fdbTx.onError(err); - fdb_check(fdb_future_block_until_ready(onErrorFuture.fdbFuture())); - scheduler->schedule([this]() { handleOnErrorResult(); }); + std::unique_lock lock(mutex); + if (!onErrorFuture) { + fdb_check(fdb_future_block_until_ready(f.fdbFuture())); + fdb_error_t err = f.getError(); + if (err) { + if (err != error_code_transaction_cancelled) { + onErrorFuture = fdbTx.onError(err); + fdb_check(fdb_future_block_until_ready(onErrorFuture.fdbFuture())); + scheduler->schedule([this]() { handleOnErrorResult(); }); + } + } else { + scheduler->schedule([cont]() { cont(); }); } - } else { - scheduler->schedule([cont]() { cont(); }); } }); } @@ -120,10 +124,12 @@ private: TTaskFct cont = iter->second.cont; 
waitMap.erase(iter); if (err) { - waitMap.clear(); - onErrorFuture = tx()->onError(err); - lock.unlock(); - fdb_check(fdb_future_set_callback(onErrorFuture.fdbFuture(), onErrorReadyCallback, this)); + if (err != error_code_transaction_cancelled) { + waitMap.clear(); + onErrorFuture = tx()->onError(err); + lock.unlock(); + fdb_check(fdb_future_set_callback(onErrorFuture.fdbFuture(), onErrorReadyCallback, this)); + } } else { scheduler->schedule(cont); } @@ -144,6 +150,7 @@ private: onErrorFuture.reset(); if (err) { finalError = err; + ASSERT(false); done(); } else { lock.unlock(); diff --git a/bindings/c/test/apitester/fdb_c_api_tester.cpp b/bindings/c/test/apitester/fdb_c_api_tester.cpp index 96933dd5aa..46b11ac5bd 100644 --- a/bindings/c/test/apitester/fdb_c_api_tester.cpp +++ b/bindings/c/test/apitester/fdb_c_api_tester.cpp @@ -46,7 +46,8 @@ enum TesterOptionId { OPT_NUM_CLIENT_THREADS, OPT_NUM_DATABASES, OPT_EXTERNAL_CLIENT_LIBRARY, - OPT_NUM_FDB_THREADS + OPT_NUM_FDB_THREADS, + OPT_BUGGIFY }; CSimpleOpt::SOption TesterOptionDefs[] = // @@ -64,7 +65,8 @@ CSimpleOpt::SOption TesterOptionDefs[] = // { OPT_NUM_CLIENT_THREADS, "--num-client-threads", SO_REQ_SEP }, { OPT_NUM_DATABASES, "--num-databases", SO_REQ_SEP }, { OPT_EXTERNAL_CLIENT_LIBRARY, "--external-client-library", SO_REQ_SEP }, - { OPT_NUM_FDB_THREADS, "--num-fdb-threads", SO_REQ_SEP } }; + { OPT_NUM_FDB_THREADS, "--num-fdb-threads", SO_REQ_SEP }, + { OPT_BUGGIFY, "--buggify", SO_NONE } }; void printProgramUsage(const char* execName) { printf("usage: %s [OPTIONS]\n" @@ -98,6 +100,8 @@ void printProgramUsage(const char* execName) { " Path to the external client library.\n" " --num-fdb-threads NUMBER\n" " Number of FDB client threads to be created.\n" + " --buggify\n" + " Enable injection of client errors.\n" " -h, --help Display this help and exit.\n"); } @@ -185,6 +189,10 @@ bool processArg(TesterOptions& options, const CSimpleOpt& args) { case OPT_NUM_FDB_THREADS: processIntArg(args, 
options.numFdbThreads, 1, 1000); break; + + case OPT_BUGGIFY: + options.buggify = true; + break; } return true; } @@ -238,6 +246,10 @@ void applyNetworkOptions(TesterOptions& options) { fdb_check( FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_THREADS_PER_VERSION, options.numFdbThreads)); } + + if (options.buggify) { + fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_BUGGIFY_ENABLE)); + } } void runApiCorrectness(TesterOptions& options) { From e89eba3b15db57a798de7892f81cbafc32d9ae45 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Mon, 28 Feb 2022 20:00:27 +0100 Subject: [PATCH 031/138] ApiTester: Apply client knob settings --- bindings/c/test/apitester/fdb_c_api_tester.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bindings/c/test/apitester/fdb_c_api_tester.cpp b/bindings/c/test/apitester/fdb_c_api_tester.cpp index 46b11ac5bd..ad10dd9fd0 100644 --- a/bindings/c/test/apitester/fdb_c_api_tester.cpp +++ b/bindings/c/test/apitester/fdb_c_api_tester.cpp @@ -250,6 +250,11 @@ void applyNetworkOptions(TesterOptions& options) { if (options.buggify) { fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_BUGGIFY_ENABLE)); } + + for (auto knob : options.knobs) { + FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_KNOB, + format("%s=%s", knob.first.c_str(), knob.second.c_str())); + } } void runApiCorrectness(TesterOptions& options) { From e3de8c6847253f43b22dc09a8691fa7807e28584 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Tue, 1 Mar 2022 18:32:50 +0100 Subject: [PATCH 032/138] ApiTester: Prepopulating database; commit-read operation for testing GRV correctness --- .../c/test/apitester/TesterApiWrapper.cpp | 4 + bindings/c/test/apitester/TesterApiWrapper.h | 1 + .../apitester/TesterCorrectnessWorkload.cpp | 98 ++++++++++++++++--- 3 files changed, 90 insertions(+), 13 deletions(-) diff --git a/bindings/c/test/apitester/TesterApiWrapper.cpp b/bindings/c/test/apitester/TesterApiWrapper.cpp index 
34b1fd802c..0955278d6b 100644 --- a/bindings/c/test/apitester/TesterApiWrapper.cpp +++ b/bindings/c/test/apitester/TesterApiWrapper.cpp @@ -75,6 +75,10 @@ void Transaction::reset() { fdb_transaction_reset(tx_.get()); } +fdb_error_t Transaction::setOption(FDBTransactionOption option) { + return fdb_transaction_set_option(tx_.get(), option, reinterpret_cast(""), 0); +} + fdb_error_t FdbApi::setOption(FDBNetworkOption option, std::string_view value) { return fdb_network_set_option(option, reinterpret_cast(value.data()), value.size()); } diff --git a/bindings/c/test/apitester/TesterApiWrapper.h b/bindings/c/test/apitester/TesterApiWrapper.h index df6c3817dc..266358d022 100644 --- a/bindings/c/test/apitester/TesterApiWrapper.h +++ b/bindings/c/test/apitester/TesterApiWrapper.h @@ -70,6 +70,7 @@ public: Future commit(); Future onError(fdb_error_t err); void reset(); + fdb_error_t setOption(FDBTransactionOption option); private: std::shared_ptr tx_; diff --git a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp index 3f2ee3573d..f01920cc3d 100644 --- a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp +++ b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp @@ -30,7 +30,7 @@ namespace FdbApiTester { class ApiCorrectnessWorkload : public WorkloadBase { public: - enum OpType { OP_INSERT, OP_GET, OP_LAST = OP_GET }; + enum OpType { OP_INSERT, OP_GET, OP_COMMIT_READ, OP_LAST = OP_COMMIT_READ }; // The minimum length of a key int minKeyLength; @@ -47,6 +47,9 @@ public: // Maximum number of keys to be accessed by a transaction int maxKeysPerTransaction; + // Initial data size (number of key-value pairs) + int initialSize; + // The number of operations to be executed int numOperations; @@ -59,9 +62,10 @@ public: ApiCorrectnessWorkload(std::string_view prefix) { minKeyLength = 1; maxKeyLength = 64; - minValueLength = 1; - maxValueLength = 1000; + minValueLength = 5; + maxValueLength = 10; 
maxKeysPerTransaction = 50; + initialSize = 100; numOperations = 1000; readExistingKeysRatio = 0.9; keyPrefix = prefix; @@ -69,9 +73,16 @@ public: } void start() override { - schedule([this]() { nextTransaction(); }); + schedule([this]() { + // 1. Populate initial data + populateData([this]() { + // 2. Generate random workload + randomOperations(); + }); + }); } +private: std::string randomKeyName() { return keyPrefix + random.randomStringLowerCase(minKeyLength, maxKeyLength); } std::string randomValue() { return random.randomStringLowerCase(minValueLength, maxValueLength); } @@ -90,9 +101,13 @@ public: std::string key = store.getKey(genKey, true, 1); if (key != store.endKey()) { return key; - } else { - return store.getKey(genKey, true, 0); } + key = store.getKey(genKey, true, 0); + if (key != store.startKey()) { + return key; + } + std::cout << "No existing key found, using a new random key." << std::endl; + return genKey; } std::string randomKey(double existingKeyRatio) { @@ -120,7 +135,54 @@ public: for (const KeyValue& kv : *kvPairs) { store.set(kv.key, kv.value); } - cont(); + schedule(cont); + }); + } + + void randomCommitReadOp(TTaskFct cont) { + int numKeys = random.randomInt(1, maxKeysPerTransaction); + auto kvPairs = std::make_shared>(); + for (int i = 0; i < numKeys; i++) { + kvPairs->push_back(KeyValue{ randomKey(readExistingKeysRatio), randomValue() }); + } + execTransaction( + [kvPairs](auto ctx) { + for (const KeyValue& kv : *kvPairs) { + ctx->tx()->set(kv.key, kv.value); + } + ctx->commit(); + }, + [this, kvPairs, cont]() { + for (const KeyValue& kv : *kvPairs) { + store.set(kv.key, kv.value); + } + auto results = std::make_shared>>(); + execTransaction( + [kvPairs, results](auto ctx) { + // TODO: Enable after merging with GRV caching + // ctx->tx()->setOption(FDB_TR_OPTION_USE_GRV_CACHE); + auto futures = std::make_shared>(); + for (const auto& kv : *kvPairs) { + futures->push_back(ctx->tx()->get(kv.key, false)); + } + 
ctx->continueAfterAll(futures, [ctx, futures, results]() { + for (auto& f : *futures) { + results->push_back(((ValueFuture&)f).getValue()); + } + ctx->done(); + }); + }, + [this, kvPairs, results, cont]() { + ASSERT(results->size() == kvPairs->size()); + for (int i = 0; i < kvPairs->size(); i++) { + auto expected = store.get((*kvPairs)[i].key); + if ((*results)[i] != expected) { + std::cout << "randomCommitReadOp mismatch. key: " << (*kvPairs)[i].key + << " expected: " << expected << " actual: " << (*results)[i] << std::endl; + } + } + schedule(cont); + }); }); } @@ -149,11 +211,11 @@ public: for (int i = 0; i < keys->size(); i++) { auto expected = store.get((*keys)[i]); if ((*results)[i] != expected) { - std::cout << "randomGetOp mismatch. expected: " << expected << " actual: " << (*results)[i] - << std::endl; + std::cout << "randomGetOp mismatch. key :" << (*keys)[i] << " expected: " << expected + << " actual: " << (*results)[i] << std::endl; } } - cont(); + schedule(cont); }); } @@ -166,11 +228,21 @@ public: case OP_GET: randomGetOp(cont); break; + case OP_COMMIT_READ: + randomCommitReadOp(cont); + break; } } -private: - void nextTransaction() { + void populateData(TTaskFct cont) { + if (store.size() < initialSize) { + randomInsertOp([this, cont]() { populateData(cont); }); + } else { + schedule(cont); + } + } + + void randomOperations() { if (numOpLeft % 100 == 0) { std::cout << numOpLeft << " transactions left" << std::endl; } @@ -178,7 +250,7 @@ private: return; numOpLeft--; - randomOperation([this]() { schedule([this]() { nextTransaction(); }); }); + randomOperation([this]() { randomOperations(); }); } int numOpLeft; From 9f520ae8eb7162e507d35b7749e8d93d05f91ed1 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Tue, 1 Mar 2022 20:58:52 +0100 Subject: [PATCH 033/138] ApiTester: testing clear & clearRange --- .../c/test/apitester/TesterApiWrapper.cpp | 9 +++ bindings/c/test/apitester/TesterApiWrapper.h | 2 + .../apitester/TesterCorrectnessWorkload.cpp | 73 
+++++++++++++++++-- .../apitester/TesterTransactionExecutor.cpp | 1 + 4 files changed, 77 insertions(+), 8 deletions(-) diff --git a/bindings/c/test/apitester/TesterApiWrapper.cpp b/bindings/c/test/apitester/TesterApiWrapper.cpp index 0955278d6b..50751d70ee 100644 --- a/bindings/c/test/apitester/TesterApiWrapper.cpp +++ b/bindings/c/test/apitester/TesterApiWrapper.cpp @@ -63,6 +63,15 @@ void Transaction::set(std::string_view key, std::string_view value) { fdb_transaction_set(tx_.get(), (const uint8_t*)key.data(), key.size(), (const uint8_t*)value.data(), value.size()); } +void Transaction::clear(std::string_view key) { + fdb_transaction_clear(tx_.get(), (const uint8_t*)key.data(), key.size()); +} + +void Transaction::clearRange(std::string_view begin, std::string_view end) { + fdb_transaction_clear_range( + tx_.get(), (const uint8_t*)begin.data(), begin.size(), (const uint8_t*)end.data(), end.size()); +} + Future Transaction::commit() { return Future(fdb_transaction_commit(tx_.get())); } diff --git a/bindings/c/test/apitester/TesterApiWrapper.h b/bindings/c/test/apitester/TesterApiWrapper.h index 266358d022..d4a2793ffc 100644 --- a/bindings/c/test/apitester/TesterApiWrapper.h +++ b/bindings/c/test/apitester/TesterApiWrapper.h @@ -67,6 +67,8 @@ public: Transaction(FDBTransaction* tx); ValueFuture get(std::string_view key, fdb_bool_t snapshot); void set(std::string_view key, std::string_view value); + void clear(std::string_view key); + void clearRange(std::string_view begin, std::string_view end); Future commit(); Future onError(fdb_error_t err); void reset(); diff --git a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp index f01920cc3d..d077bae15c 100644 --- a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp +++ b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp @@ -30,7 +30,7 @@ namespace FdbApiTester { class ApiCorrectnessWorkload : public WorkloadBase { public: - enum OpType { 
OP_INSERT, OP_GET, OP_COMMIT_READ, OP_LAST = OP_COMMIT_READ }; + enum OpType { OP_INSERT, OP_GET, OP_CLEAR, OP_CLEAR_RANGE, OP_COMMIT_READ, OP_LAST = OP_COMMIT_READ }; // The minimum length of a key int minKeyLength; @@ -65,7 +65,7 @@ public: minValueLength = 5; maxValueLength = 10; maxKeysPerTransaction = 50; - initialSize = 100; + initialSize = 1000; numOperations = 1000; readExistingKeysRatio = 0.9; keyPrefix = prefix; @@ -74,10 +74,13 @@ public: void start() override { schedule([this]() { - // 1. Populate initial data - populateData([this]() { - // 2. Generate random workload - randomOperations(); + // 1. Clear data + clearData([this]() { + // 2. Populate initial data + populateData([this]() { + // 3. Generate random workload + randomOperations(); + }); }); }); } @@ -106,7 +109,7 @@ private: if (key != store.startKey()) { return key; } - std::cout << "No existing key found, using a new random key." << std::endl; + std::cout << "WARNING: No existing key found, using a new random key." 
<< std::endl; return genKey; } @@ -219,8 +222,46 @@ private: }); } + void randomClearOp(TTaskFct cont) { + int numKeys = random.randomInt(1, maxKeysPerTransaction); + auto keys = std::make_shared>(); + for (int i = 0; i < numKeys; i++) { + keys->push_back(randomExistingKey()); + } + execTransaction( + [keys](auto ctx) { + for (const auto& key : *keys) { + ctx->tx()->clear(key); + } + ctx->commit(); + }, + [this, keys, cont]() { + for (const auto& key : *keys) { + store.clear(key); + } + schedule(cont); + }); + } + + void randomClearRangeOp(TTaskFct cont) { + std::string begin = randomKeyName(); + std::string end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execTransaction( + [begin, end](auto ctx) { + ctx->tx()->clearRange(begin, end); + ctx->commit(); + }, + [this, begin, end, cont]() { + store.clear(begin, end); + schedule(cont); + }); + } + void randomOperation(TTaskFct cont) { - OpType txType = (OpType)random.randomInt(0, OP_LAST); + OpType txType = (store.size() == 0) ? 
OP_INSERT : (OpType)random.randomInt(0, OP_LAST); switch (txType) { case OP_INSERT: randomInsertOp(cont); @@ -228,16 +269,32 @@ private: case OP_GET: randomGetOp(cont); break; + case OP_CLEAR: + randomClearOp(cont); + break; + case OP_CLEAR_RANGE: + randomClearRangeOp(cont); + break; case OP_COMMIT_READ: randomCommitReadOp(cont); break; } } + void clearData(TTaskFct cont) { + execTransaction( + [this](auto ctx) { + ctx->tx()->clearRange(keyPrefix, format("%s\xff", keyPrefix.c_str())); + ctx->commit(); + }, + [this, cont]() { schedule(cont); }); + } + void populateData(TTaskFct cont) { if (store.size() < initialSize) { randomInsertOp([this, cont]() { populateData(cont); }); } else { + std::cout << "Data population completed" << std::endl; schedule(cont); } } diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.cpp b/bindings/c/test/apitester/TesterTransactionExecutor.cpp index 740b234c40..992a04c408 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.cpp +++ b/bindings/c/test/apitester/TesterTransactionExecutor.cpp @@ -150,6 +150,7 @@ private: onErrorFuture.reset(); if (err) { finalError = err; + std::cout << "Fatal error: " << fdb_get_error(finalError) << std::endl; ASSERT(false); done(); } else { From 9f1d4580e48d6622d2939c67fa8e6150978fca0e Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Tue, 1 Mar 2022 23:46:38 -0800 Subject: [PATCH 034/138] Added simulation-only verification of DeltaTree initialization boundaries between writes and reads over time. 
--- fdbserver/VersionedBTree.actor.cpp | 109 +++++++++++++++++++++++++++-- 1 file changed, 103 insertions(+), 6 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 072fbe9876..b483a67a84 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -4647,6 +4647,70 @@ struct InPlaceArray { }; #pragma pack(pop) +// DecodeBoundaryVerifier provides simulation-only verification of DeltaTree boundaries between +// reads and writes by using a static structure to track boundaries used during DeltaTree generation +// for all writes and updates across cold starts and virtual process restarts. +struct DecodeBoundaryVerifier { + struct DecodeBoundaries { + Key lower; + Key upper; + bool empty() const { return lower.empty() && upper.empty(); } + }; + + typedef std::map BoundariesByVersion; + std::unordered_map boundariesByPageID; + + static DecodeBoundaryVerifier* getVerifier(std::string name) { + static std::map verifiers; + return g_network->isSimulated() ? 
&verifiers[name] : nullptr; + } + + void update(BTreePageIDRef id, Version v, Key lowerBound, Key upperBound) { + debug_printf("decodeBoundariesUpdate %s %s '%s' to '%s'\n", + ::toString(id).c_str(), + ::toString(v).c_str(), + lowerBound.toString().c_str(), + upperBound.toString().c_str()); + + auto& b = boundariesByPageID[id.front()][v]; + ASSERT(b.empty()); + b = { lowerBound, upperBound }; + } + + bool verify(LogicalPageID id, Version v, Key lowerBound, Key upperBound) { + auto i = boundariesByPageID.find(id); + ASSERT(i != boundariesByPageID.end()); + ASSERT(!i->second.empty()); + + auto b = i->second.upper_bound(v); + --b; + if (b->second.lower != lowerBound || b->second.upper != upperBound) { + fprintf(stderr, + "Boundary mismatch on %s %s\nFound :%s %s\nExpected:%s %s\n", + ::toString(id).c_str(), + ::toString(v).c_str(), + lowerBound.toString().c_str(), + upperBound.toString().c_str(), + b->second.lower.toString().c_str(), + b->second.upper.toString().c_str()); + return false; + } + return true; + } + + void update(Version v, LogicalPageID oldID, LogicalPageID newID) { + debug_printf("decodeBoundariesUpdate copy %s %s to %s\n", + ::toString(v).c_str(), + ::toString(oldID).c_str(), + ::toString(newID).c_str()); + auto& old = boundariesByPageID[oldID]; + ASSERT(!old.empty()); + auto i = old.end(); + --i; + boundariesByPageID[newID][v] = i->second; + } +}; + class VersionedBTree { public: // The first possible internal record possible in the tree @@ -4804,6 +4868,7 @@ public: VersionedBTree(IPager2* pager, std::string name) : m_pager(pager), m_pBuffer(nullptr), m_mutationCount(0), m_name(name), m_pHeader(nullptr), m_headerSpace(0) { + m_pBoundaryVerifier = DecodeBoundaryVerifier::getVerifier(name); m_pDecodeCacheMemory = m_pager->getPageCachePenaltySource(); m_lazyClearActor = 0; m_init = init_impl(this); @@ -4940,7 +5005,7 @@ public: self->m_pHeader->root.set(newRoot, self->m_headerSpace - sizeof(MetaKey)); self->m_pHeader->height = 1; Reference page = 
self->m_pager->newPageBuffer(); - self->makeEmptyRoot(id, page); + self->makeEmptyRoot(page); self->m_pager->updatePage(PagerEventReasons::MetaData, nonBtreeLevel, newRoot, page); LogicalPageID newQueuePage = wait(self->m_pager->newPageID()); @@ -5251,6 +5316,7 @@ private: // The mutation buffer currently being written to std::unique_ptr m_pBuffer; int64_t m_mutationCount; + DecodeBoundaryVerifier* m_pBoundaryVerifier; struct CommitBatch { Version readVersion; @@ -5461,7 +5527,7 @@ private: return pages; } - void makeEmptyRoot(LogicalPageID id, Reference page) { + void makeEmptyRoot(Reference page) { BTreePage* btpage = (BTreePage*)page->begin(); btpage->height = 1; btpage->kvBytes = 0; @@ -5630,6 +5696,10 @@ private: } } + if (self->m_pBoundaryVerifier != nullptr) { + self->m_pBoundaryVerifier->update(childPageID, v, pageLowerBound.key, pageUpperBound.key); + } + if (++sinceYield > 100) { sinceYield = 0; wait(yield()); @@ -5726,8 +5796,8 @@ private: if (page->userData == nullptr) { debug_printf("Creating DecodeCache for ptr=%p lower=%s upper=%s\n", page->begin(), - lowerBound.toString().c_str(), - upperBound.toString().c_str()); + lowerBound.toString(false).c_str(), + upperBound.toString(false).c_str()); BTreePage::BinaryTree::DecodeCache* cache = new BTreePage::BinaryTree::DecodeCache(lowerBound, upperBound, m_pDecodeCacheMemory); @@ -5818,6 +5888,11 @@ private: newID[i] = id; ++i; } + + if (self->m_pBoundaryVerifier != nullptr) { + self->m_pBoundaryVerifier->update(writeVersion, oldID.front(), newID.front()); + } + self->freeBTreePage(height, oldID, writeVersion); return newID; } @@ -6197,6 +6272,15 @@ private: ? 
self->getCursor(page.getPtr(), update->cBegin) : self->getCursor(page.getPtr(), dbBegin, dbEnd); + if (self->m_pBoundaryVerifier != nullptr) { + if (update->cBegin.valid()) { + ASSERT(self->m_pBoundaryVerifier->verify(rootID.front(), + batch->snapshot->getVersion(), + update->cBegin.get().key, + update->cBegin.next().getOrUpperBound().key)); + } + } + if (REDWOOD_DEBUG) { debug_printf("%s ---------MUTATION BUFFER SLICE ---------------------\n", context.c_str()); auto begin = mBegin; @@ -6770,6 +6854,9 @@ private: self->m_pager->detachRemappedPage(p, batch->writeVersion); if (newID != invalidLogicalPageID) { debug_printf("%s Detach updated %u -> %u\n", context.c_str(), p, newID); + if (self->m_pBoundaryVerifier != nullptr) { + self->m_pBoundaryVerifier->update(batch->writeVersion, p, newID); + } p = newID; ++stats.metrics.detachChild; ++detached; @@ -6832,6 +6919,9 @@ private: rec.setChildPage(newPages); } debug_printf("%s Detach updated %u -> %u\n", context.c_str(), p, newID); + if (self->m_pBoundaryVerifier != nullptr) { + self->m_pBoundaryVerifier->update(batch->writeVersion, p, newID); + } newPages[i] = newID; ++stats.metrics.detachChild; } @@ -6924,7 +7014,7 @@ private: debug_printf("Writing new empty root.\n"); LogicalPageID newRootID = wait(self->m_pager->newPageID()); Reference page = self->m_pager->newPageBuffer(); - self->makeEmptyRoot(newRootID, page); + self->makeEmptyRoot(page); self->m_pHeader->height = 1; VectorRef rootID((LogicalPageID*)&newRootID, 1); self->m_pager->updatePage(PagerEventReasons::Commit, self->m_pHeader->height, rootID, page); @@ -7053,8 +7143,15 @@ public: #if REDWOOD_DEBUG path.push_back({ p, btree->getCursor(p.getPtr(), link), link.get().getChildPage() }); #else - path.push_back({ p, btree->getCursor(p.getPtr(), link) }); + path.push_back({ p, btree->getCursor(p.getPtr(), link) }); #endif + + if (btree->m_pBoundaryVerifier != nullptr) { + ASSERT(btree->m_pBoundaryVerifier->verify(link.get().getChildPage().front(), + 
pager->getVersion(), + link.get().key, + link.next().getOrUpperBound().key)); + } return Void(); }); } From 5d15a2d62381cb328fc5e80c70a538ae00aecaa7 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Wed, 2 Mar 2022 22:51:56 +0100 Subject: [PATCH 035/138] ApiTester: Loading test configuration from files, calling from ctest --- bindings/c/CMakeLists.txt | 17 +- .../apitester/TesterCorrectnessWorkload.cpp | 8 +- bindings/c/test/apitester/TesterOptions.h | 22 +-- bindings/c/test/apitester/TesterTestSpec.cpp | 161 ++++++++++++++++ bindings/c/test/apitester/TesterTestSpec.h | 62 +++++++ bindings/c/test/apitester/TesterUtil.cpp | 4 + bindings/c/test/apitester/TesterUtil.h | 13 ++ bindings/c/test/apitester/TesterWorkload.cpp | 12 ++ bindings/c/test/apitester/TesterWorkload.h | 23 +++ .../c/test/apitester/fdb_c_api_tester.cpp | 173 ++++++++---------- bindings/c/test/apitester/run_c_api_tests.py | 123 +++++++++++++ .../tests/CApiCorrectnessBlocking.toml | 14 ++ .../tests/CApiCorrectnessBuggify.toml | 13 ++ .../tests/CApiCorrectnessMultiThr.toml | 12 ++ .../tests/CApiCorrectnessSingleThr.toml | 5 + 15 files changed, 544 insertions(+), 118 deletions(-) create mode 100644 bindings/c/test/apitester/TesterTestSpec.cpp create mode 100644 bindings/c/test/apitester/TesterTestSpec.h create mode 100755 bindings/c/test/apitester/run_c_api_tests.py create mode 100644 bindings/c/test/apitester/tests/CApiCorrectnessBlocking.toml create mode 100644 bindings/c/test/apitester/tests/CApiCorrectnessBuggify.toml create mode 100644 bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml create mode 100644 bindings/c/test/apitester/tests/CApiCorrectnessSingleThr.toml diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index a239f5bc53..2963fa3286 100644 --- a/bindings/c/CMakeLists.txt +++ b/bindings/c/CMakeLists.txt @@ -103,6 +103,8 @@ if(NOT WIN32 AND NOT IS_ARM_MAC) test/apitester/fdb_c_api_tester.cpp test/apitester/TesterApiWrapper.cpp test/apitester/TesterApiWrapper.h + 
test/apitester/TesterTestSpec.cpp + test/apitester/TesterTestSpec.h test/apitester/TesterCorrectnessWorkload.cpp test/apitester/TesterKeyValueStore.cpp test/apitester/TesterKeyValueStore.h @@ -161,7 +163,7 @@ if(NOT WIN32 AND NOT IS_ARM_MAC) target_link_libraries(fdb_c_unit_tests_version_510 PRIVATE fdb_c Threads::Threads) target_link_libraries(trace_partial_file_suffix_test PRIVATE fdb_c Threads::Threads) target_link_libraries(disconnected_timeout_unit_tests PRIVATE fdb_c Threads::Threads) - target_link_libraries(fdb_c_api_tester PRIVATE fdb_c Threads::Threads) + target_link_libraries(fdb_c_api_tester PRIVATE fdb_c toml11_target Threads::Threads) # do not set RPATH for mako set_property(TARGET mako PROPERTY SKIP_BUILD_RPATH TRUE) @@ -188,6 +190,7 @@ if(NOT WIN32 AND NOT IS_ARM_MAC) add_custom_target(external_client DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so) add_dependencies(fdb_c_unit_tests external_client) add_dependencies(disconnected_timeout_unit_tests external_client) + add_dependencies(fdb_c_api_tester external_client) add_fdbclient_test( NAME fdb_c_setup_tests @@ -225,6 +228,18 @@ if(NOT WIN32 AND NOT IS_ARM_MAC) @CLUSTER_FILE@ ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so ) + add_fdbclient_test( + NAME fdb_c_api_tests + COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py + --cluster-file + @CLUSTER_FILE@ + --tester-binary + $ + --external-client-library + ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so + --test-dir + ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests + ) endif() set(c_workloads_srcs diff --git a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp index d077bae15c..15a2292a3c 100644 --- a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp +++ b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp @@ -59,7 +59,7 @@ public: // Key prefix std::string keyPrefix; - ApiCorrectnessWorkload(std::string_view prefix) { + 
ApiCorrectnessWorkload(const WorkloadConfig& config) { minKeyLength = 1; maxKeyLength = 64; minValueLength = 5; @@ -68,7 +68,7 @@ public: initialSize = 1000; numOperations = 1000; readExistingKeysRatio = 0.9; - keyPrefix = prefix; + keyPrefix = format("ApiCorrectness%d/", config.clientId); numOpLeft = numOperations; } @@ -315,8 +315,6 @@ private: KeyValueStore store; }; -std::shared_ptr createApiCorrectnessWorkload(std::string_view prefix) { - return std::make_shared(prefix); -} +WorkloadFactory ApiCorrectnessWorkloadFactory("ApiCorrectness"); } // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterOptions.h b/bindings/c/test/apitester/TesterOptions.h index 349f13eac6..0f60ae436f 100644 --- a/bindings/c/test/apitester/TesterOptions.h +++ b/bindings/c/test/apitester/TesterOptions.h @@ -23,10 +23,7 @@ #ifndef APITESTER_TESTER_OPTIONS_H #define APITESTER_TESTER_OPTIONS_H -#include -#include - -#define FDB_API_VERSION 710 +#include "TesterTestSpec.h" namespace FdbApiTester { @@ -37,17 +34,14 @@ public: std::string traceDir; std::string traceFormat; std::string logGroup; - bool initialStatusCheck = true; - bool cliHints = true; - std::vector> knobs; - // api version, using the latest version by default - int api_version = FDB_API_VERSION; - bool blockOnFutures = false; - int numClientThreads = 1; - int numDatabases = 1; std::string externalClientLibrary; - int numFdbThreads = 1; - bool buggify = false; + std::string testFile; + int numFdbThreads; + int numClientThreads; + int numDatabases; + int numClients; + std::vector> knobs; + TestSpec testSpec; }; } // namespace FdbApiTester diff --git a/bindings/c/test/apitester/TesterTestSpec.cpp b/bindings/c/test/apitester/TesterTestSpec.cpp new file mode 100644 index 0000000000..7c0da5b59c --- /dev/null +++ b/bindings/c/test/apitester/TesterTestSpec.cpp @@ -0,0 +1,161 @@ +/* + * TesterTestSpec.cpp + * + * This source file is part of the FoundationDB open source project + * + * 
Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TesterTestSpec.h" +#include "TesterUtil.h" +#include +#include + +namespace FdbApiTester { + +namespace { + +void processIntOption(const std::string& value, const std::string& optionName, int& res, int minVal, int maxVal) { + char* endptr; + res = strtol(value.c_str(), &endptr, 10); + if (*endptr != '\0') { + throw TesterError(format("Invalid test file. Invalid value %s for %s\n", value.c_str(), optionName.c_str())); + } + if (res < minVal || res > maxVal) { + throw TesterError( + format("Invalid test file. 
Value for %s must be between %d and %d\n", optionName.c_str(), minVal, maxVal)); + } +} + +std::unordered_map> testSpecTestKeys = { + { "title", + [](const std::string& value, TestSpec* spec) { // + spec->title = value; + } }, + { "apiVersion", + [](const std::string& value, TestSpec* spec) { // + processIntOption(value, "apiVersion", spec->apiVersion, 700, 710); + } }, + { "blockOnFutures", + [](const std::string& value, TestSpec* spec) { // + spec->blockOnFutures = (value == "true"); + } }, + { "buggify", + [](const std::string& value, TestSpec* spec) { // + spec->buggify = (value == "true"); + } }, + { "multiThreaded", + [](const std::string& value, TestSpec* spec) { // + spec->multiThreaded = (value == "true"); + } }, + { "minFdbThreads", + [](const std::string& value, TestSpec* spec) { // + processIntOption(value, "minFdbThreads", spec->minFdbThreads, 1, 1000); + } }, + { "maxFdbThreads", + [](const std::string& value, TestSpec* spec) { // + processIntOption(value, "maxFdbThreads", spec->maxFdbThreads, 1, 1000); + } }, + { "minClientThreads", + [](const std::string& value, TestSpec* spec) { // + processIntOption(value, "minClientThreads", spec->minClientThreads, 1, 1000); + } }, + { "maxClientThreads", + [](const std::string& value, TestSpec* spec) { // + processIntOption(value, "maxClientThreads", spec->maxClientThreads, 1, 1000); + } }, + { "minDatabases", + [](const std::string& value, TestSpec* spec) { // + processIntOption(value, "minDatabases", spec->minDatabases, 1, 1000); + } }, + { "maxDatabases", + [](const std::string& value, TestSpec* spec) { // + processIntOption(value, "maxDatabases", spec->maxDatabases, 1, 1000); + } }, + { "minClients", + [](const std::string& value, TestSpec* spec) { // + processIntOption(value, "minClients", spec->minClients, 1, 1000); + } }, + { "maxClients", + [](const std::string& value, TestSpec* spec) { // + processIntOption(value, "maxClients", spec->maxClients, 1, 1000); + } } +}; + +template +std::string 
toml_to_string(const T& value) { + // TOML formatting converts numbers to strings exactly how they're in the file + // and thus, is equivalent to testspec. However, strings are quoted, so we + // must remove the quotes. + if (value.type() == toml::value_t::string) { + const std::string& formatted = toml::format(value); + return formatted.substr(1, formatted.size() - 2); + } else { + return toml::format(value); + } +} + +} // namespace + +TestSpec readTomlTestSpec(std::string fileName) { + TestSpec spec; + WorkloadSpec workloadSpec; + + const toml::value& conf = toml::parse(fileName); + + // Then parse each test + const toml::array& tests = toml::find(conf, "test").as_array(); + if (tests.size() == 0) { + throw TesterError("Invalid test file. No [test] section found"); + } else if (tests.size() > 1) { + throw TesterError("Invalid test file. More than one [test] section found"); + } + + const toml::value& test = tests[0]; + + // First handle all test-level settings + for (const auto& [k, v] : test.as_table()) { + if (k == "workload") { + continue; + } + if (testSpecTestKeys.find(k) != testSpecTestKeys.end()) { + testSpecTestKeys[k](toml_to_string(v), &spec); + } else { + throw TesterError(format("Invalid test file. Unrecognized test parameter. Name: %s, value %s", + k.c_str(), + toml_to_string(v).c_str())); + } + } + + // And then copy the workload attributes to spec.options + const toml::array& workloads = toml::find(test, "workload").as_array(); + for (const toml::value& workload : workloads) { + workloadSpec = WorkloadSpec(); + auto& options = workloadSpec.options; + for (const auto& [attrib, v] : workload.as_table()) { + options[attrib] = toml_to_string(v); + } + auto itr = options.find("name"); + if (itr == options.end()) { + throw TesterError("Invalid test file. 
Unspecified workload name."); + } + workloadSpec.name = itr->second; + spec.workloads.push_back(workloadSpec); + } + + return spec; +} + +} // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterTestSpec.h b/bindings/c/test/apitester/TesterTestSpec.h new file mode 100644 index 0000000000..cbeb787b84 --- /dev/null +++ b/bindings/c/test/apitester/TesterTestSpec.h @@ -0,0 +1,62 @@ +/* + * TesterTestSpec.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#ifndef APITESTER_CONFIG_READER_H +#define APITESTER_CONFIG_READER_H + +#include +#include +#include + +#define FDB_API_VERSION 710 + +namespace FdbApiTester { + +struct WorkloadSpec { + std::string name; + std::unordered_map options; +}; + +struct TestSpec { + std::string title; + // api version, using the latest version by default + int apiVersion = FDB_API_VERSION; + bool blockOnFutures = false; + bool multiThreaded = false; + bool buggify = false; + int minFdbThreads = 1; + int maxFdbThreads = 1; + int minClientThreads = 1; + int maxClientThreads = 1; + int minDatabases = 1; + int maxDatabases = 1; + int minClients = 1; + int maxClients = 10; + std::string testFile; + std::vector workloads; +}; + +TestSpec readTomlTestSpec(std::string fileName); + +} // namespace FdbApiTester + +#endif \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterUtil.cpp b/bindings/c/test/apitester/TesterUtil.cpp index 641df8bcc2..95b5890dc6 100644 --- a/bindings/c/test/apitester/TesterUtil.cpp +++ b/bindings/c/test/apitester/TesterUtil.cpp @@ -93,4 +93,8 @@ std::string format(const char* form, ...) 
{ return str; } +void print_internal_error(const char* msg, const char* file, int line) { + fprintf(stderr, "Assertion %s failed @ %s %d:\n", msg, file, line); +} + } // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterUtil.h b/bindings/c/test/apitester/TesterUtil.h index 6cd68ce9be..e0936143fc 100644 --- a/bindings/c/test/apitester/TesterUtil.h +++ b/bindings/c/test/apitester/TesterUtil.h @@ -54,9 +54,22 @@ std::ostream& operator<<(std::ostream& os, const std::optional& obj) { std::string format(const char* form, ...); +class TesterError : public std::runtime_error { +public: + explicit TesterError(const char* message) : std::runtime_error(message) {} + explicit TesterError(const std::string& message) : std::runtime_error(message) {} + TesterError(const TesterError&) = default; + TesterError& operator=(const TesterError&) = default; + TesterError(TesterError&&) = default; + TesterError& operator=(TesterError&&) = default; +}; + +void print_internal_error(const char* msg, const char* file, int line); + #define ASSERT(condition) \ do { \ if (!(condition)) { \ + print_internal_error(#condition, __FILE__, __LINE__); \ abort(); \ } \ } while (false) // For use in destructors, where throwing exceptions is extremely dangerous diff --git a/bindings/c/test/apitester/TesterWorkload.cpp b/bindings/c/test/apitester/TesterWorkload.cpp index 745083f8c7..f299e7328e 100644 --- a/bindings/c/test/apitester/TesterWorkload.cpp +++ b/bindings/c/test/apitester/TesterWorkload.cpp @@ -82,4 +82,16 @@ void WorkloadManager::workloadDone(IWorkload* workload) { } } +std::shared_ptr IWorkloadFactory::create(std::string const& name, const WorkloadConfig& config) { + auto it = factories().find(name); + if (it == factories().end()) + return {}; // or throw? 
+ return it->second->create(config); +} + +std::unordered_map& IWorkloadFactory::factories() { + static std::unordered_map theFactories; + return theFactories; +} + } // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterWorkload.h b/bindings/c/test/apitester/TesterWorkload.h index f65e88261b..f542bc1d00 100644 --- a/bindings/c/test/apitester/TesterWorkload.h +++ b/bindings/c/test/apitester/TesterWorkload.h @@ -20,6 +20,7 @@ #pragma once +#include #ifndef APITESTER_WORKLOAD_H #define APITESTER_WORKLOAD_H @@ -39,6 +40,12 @@ public: virtual void start() = 0; }; +struct WorkloadConfig { + int clientId; + int numClients; + std::unordered_map options; +}; + // A base class for test workloads // Tracks if workload is active, notifies the workload manager when the workload completes class WorkloadBase : public IWorkload { @@ -100,6 +107,22 @@ private: std::unordered_map workloads; }; +struct IWorkloadFactory { + static std::shared_ptr create(std::string const& name, const WorkloadConfig& config); + static std::unordered_map& factories(); + + virtual ~IWorkloadFactory() = default; + virtual std::shared_ptr create(const WorkloadConfig& config) = 0; +}; + +template +struct WorkloadFactory : IWorkloadFactory { + WorkloadFactory(const char* name) { factories()[name] = this; } + std::shared_ptr create(const WorkloadConfig& config) override { + return std::make_shared(config); + } +}; + } // namespace FdbApiTester #endif \ No newline at end of file diff --git a/bindings/c/test/apitester/fdb_c_api_tester.cpp b/bindings/c/test/apitester/fdb_c_api_tester.cpp index ad10dd9fd0..537b393f43 100644 --- a/bindings/c/test/apitester/fdb_c_api_tester.cpp +++ b/bindings/c/test/apitester/fdb_c_api_tester.cpp @@ -22,9 +22,11 @@ #include "TesterWorkload.h" #include "TesterScheduler.h" #include "TesterTransactionExecutor.h" +#include "TesterTestSpec.h" #include "TesterUtil.h" #include #include +#include #include #include "flow/SimpleOpt.h" #include 
"bindings/c/foundationdb/fdb_c.h" @@ -41,13 +43,8 @@ enum TesterOptionId { OPT_LOGGROUP, OPT_TRACE_FORMAT, OPT_KNOB, - OPT_API_VERSION, - OPT_BLOCK_ON_FUTURES, - OPT_NUM_CLIENT_THREADS, - OPT_NUM_DATABASES, OPT_EXTERNAL_CLIENT_LIBRARY, - OPT_NUM_FDB_THREADS, - OPT_BUGGIFY + OPT_TEST_FILE }; CSimpleOpt::SOption TesterOptionDefs[] = // @@ -60,23 +57,19 @@ CSimpleOpt::SOption TesterOptionDefs[] = // { OPT_HELP, "--help", SO_NONE }, { OPT_TRACE_FORMAT, "--trace-format", SO_REQ_SEP }, { OPT_KNOB, "--knob-", SO_REQ_SEP }, - { OPT_API_VERSION, "--api-version", SO_REQ_SEP }, - { OPT_BLOCK_ON_FUTURES, "--block-on-futures", SO_NONE }, - { OPT_NUM_CLIENT_THREADS, "--num-client-threads", SO_REQ_SEP }, - { OPT_NUM_DATABASES, "--num-databases", SO_REQ_SEP }, { OPT_EXTERNAL_CLIENT_LIBRARY, "--external-client-library", SO_REQ_SEP }, - { OPT_NUM_FDB_THREADS, "--num-fdb-threads", SO_REQ_SEP }, - { OPT_BUGGIFY, "--buggify", SO_NONE } }; + { OPT_TEST_FILE, "-f", SO_REQ_SEP }, + { OPT_TEST_FILE, "--test-file", SO_REQ_SEP }, + SO_END_OF_OPTIONS }; void printProgramUsage(const char* execName) { printf("usage: %s [OPTIONS]\n" "\n", execName); - printf(" -C CONNFILE The path of a file containing the connection string for the\n" - " FoundationDB cluster. The default is `fdb.cluster',\n" - " then `%s'.\n", - "fdb.cluster"); - printf(" --log Enables trace file logging for the CLI session.\n" + printf(" -C, --cluster-file FILE\n" + " The path of a file containing the connection string for the\n" + " FoundationDB cluster. The default is `fdb.cluster'\n" + " --log Enables trace file logging for the CLI session.\n" " --log-dir PATH Specifes the output directory for trace files. If\n" " unspecified, defaults to the current directory. Has\n" " no effect unless --log is specified.\n" @@ -86,39 +79,15 @@ void printProgramUsage(const char* execName) { " --trace-format FORMAT\n" " Select the format of the log files. xml (the default) and json\n" " are supported. 
Has no effect unless --log is specified.\n" - " --api-version APIVERSION\n" - " Specifies the version of the API for the CLI to use.\n" " --knob-KNOBNAME KNOBVALUE\n" " Changes a knob option. KNOBNAME should be lowercase.\n" - " --block-on-futures\n" - " Use blocking waits on futures instead of scheduling callbacks.\n" - " --num-client-threads NUMBER\n" - " Number of threads to be used for execution of client workloads.\n" - " --num-databases NUMBER\n" - " Number of database connections to be used concurrently.\n" - " --external-client-library FILE_PATH\n" + " --external-client-library FILE\n" " Path to the external client library.\n" - " --num-fdb-threads NUMBER\n" - " Number of FDB client threads to be created.\n" - " --buggify\n" - " Enable injection of client errors.\n" + " -f, --test-file FILE\n" + " Test file to run.\n" " -h, --help Display this help and exit.\n"); } -bool processIntArg(const CSimpleOpt& args, int& res, int minVal, int maxVal) { - char* endptr; - res = strtol(args.OptionArg(), &endptr, 10); - if (*endptr != '\0') { - fprintf(stderr, "ERROR: invalid value %s for %s\n", args.OptionArg(), args.OptionText()); - return false; - } - if (res < minVal || res > maxVal) { - fprintf(stderr, "ERROR: value for %s must be between %d and %d\n", args.OptionText(), minVal, maxVal); - return false; - } - return true; -} - // Extracts the key for command line arguments that are specified with a prefix (e.g. --knob-). // This function converts any hyphens in the extracted key to underscores. 
bool extractPrefixedArgument(std::string prefix, const std::string& arg, std::string& res) { @@ -141,11 +110,6 @@ bool processArg(TesterOptions& options, const CSimpleOpt& args) { case OPT_CONNFILE: options.clusterFile = args.OptionArg(); break; - case OPT_API_VERSION: { - // multi-version fdbcli only available after 7.0 - processIntArg(args, options.api_version, 700, FDB_API_VERSION); - break; - } case OPT_TRACE: options.trace = true; break; @@ -157,7 +121,8 @@ bool processArg(TesterOptions& options, const CSimpleOpt& args) { break; case OPT_TRACE_FORMAT: if (!validateTraceFormat(args.OptionArg())) { - fprintf(stderr, "WARNING: Unrecognized trace format `%s'\n", args.OptionArg()); + fprintf(stderr, "ERROR: Unrecognized trace format `%s'\n", args.OptionArg()); + return false; } options.traceFormat = args.OptionArg(); break; @@ -170,28 +135,13 @@ bool processArg(TesterOptions& options, const CSimpleOpt& args) { options.knobs.emplace_back(knobName, args.OptionArg()); break; } - case OPT_BLOCK_ON_FUTURES: - options.blockOnFutures = true; - break; - - case OPT_NUM_CLIENT_THREADS: - processIntArg(args, options.numClientThreads, 1, 1000); - break; - - case OPT_NUM_DATABASES: - processIntArg(args, options.numDatabases, 1, 1000); - break; - case OPT_EXTERNAL_CLIENT_LIBRARY: options.externalClientLibrary = args.OptionArg(); break; - case OPT_NUM_FDB_THREADS: - processIntArg(args, options.numFdbThreads, 1, 1000); - break; - - case OPT_BUGGIFY: - options.buggify = true; + case OPT_TEST_FILE: + options.testFile = args.OptionArg(); + options.testSpec = readTomlTestSpec(options.testFile); break; } return true; @@ -227,13 +177,6 @@ void fdb_check(fdb_error_t e) { std::abort(); } } -} // namespace - -std::shared_ptr createApiCorrectnessWorkload(std::string_view prefix); - -} // namespace FdbApiTester - -using namespace FdbApiTester; void applyNetworkOptions(TesterOptions& options) { if (!options.externalClientLibrary.empty()) { @@ -242,24 +185,37 @@ void 
applyNetworkOptions(TesterOptions& options) { FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_EXTERNAL_CLIENT_LIBRARY, options.externalClientLibrary)); } - if (options.numFdbThreads > 1) { - fdb_check( - FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_THREADS_PER_VERSION, options.numFdbThreads)); + if (options.testSpec.multiThreaded) { + FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_THREADS_PER_VERSION, options.numFdbThreads); } - if (options.buggify) { + if (options.testSpec.buggify) { fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_BUGGIFY_ENABLE)); } + if (options.trace) { + fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_TRACE_ENABLE, options.traceDir)); + fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_TRACE_FORMAT, options.traceFormat)); + fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_TRACE_LOG_GROUP, options.logGroup)); + } + for (auto knob : options.knobs) { FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_KNOB, format("%s=%s", knob.first.c_str(), knob.second.c_str())); } } -void runApiCorrectness(TesterOptions& options) { +void randomizeOptions(TesterOptions& options) { + Random random; + options.numFdbThreads = random.randomInt(options.testSpec.minFdbThreads, options.testSpec.maxFdbThreads); + options.numClientThreads = random.randomInt(options.testSpec.minClientThreads, options.testSpec.maxClientThreads); + options.numDatabases = random.randomInt(options.testSpec.minDatabases, options.testSpec.maxDatabases); + options.numClients = random.randomInt(options.testSpec.minClients, options.testSpec.maxClients); +} + +void runWorkloads(TesterOptions& options) { TransactionExecutorOptions txExecOptions; - txExecOptions.blockOnFutures = options.blockOnFutures; + txExecOptions.blockOnFutures = options.testSpec.blockOnFutures; txExecOptions.numDatabases = options.numDatabases; std::unique_ptr scheduler = createScheduler(options.numClientThreads); @@ -268,28 +224,49 @@ void 
runApiCorrectness(TesterOptions& options) { txExecutor->init(scheduler.get(), options.clusterFile.c_str(), txExecOptions); WorkloadManager workloadMgr(txExecutor.get(), scheduler.get()); - for (int i = 0; i < 10; i++) { - std::shared_ptr workload = createApiCorrectnessWorkload(format("workload%d/", i)); - workloadMgr.add(workload); + for (const auto& workloadSpec : options.testSpec.workloads) { + for (int i = 0; i < options.numClients; i++) { + WorkloadConfig config; + config.options = workloadSpec.options; + config.clientId = i; + config.numClients = options.numClients; + std::shared_ptr workload = IWorkloadFactory::create(workloadSpec.name, config); + if (!workload) { + throw TesterError(format("Unknown workload '%s'", workloadSpec.name.c_str())); + } + workloadMgr.add(workload); + } } + workloadMgr.run(); } +} // namespace +} // namespace FdbApiTester + +using namespace FdbApiTester; + int main(int argc, char** argv) { - TesterOptions options; - if (!parseArgs(options, argc, argv)) { + try { + TesterOptions options; + if (!parseArgs(options, argc, argv)) { + return 1; + } + randomizeOptions(options); + + fdb_check(fdb_select_api_version(options.testSpec.apiVersion)); + applyNetworkOptions(options); + fdb_check(fdb_setup_network()); + + std::thread network_thread{ &fdb_run_network }; + + runWorkloads(options); + + fdb_check(fdb_stop_network()); + network_thread.join(); + return 0; + } catch (const std::runtime_error& err) { + std::cerr << "ERROR: " << err.what() << std::endl; return 1; } - - fdb_check(fdb_select_api_version(options.api_version)); - applyNetworkOptions(options); - fdb_check(fdb_setup_network()); - - std::thread network_thread{ &fdb_run_network }; - - runApiCorrectness(options); - - fdb_check(fdb_stop_network()); - network_thread.join(); - return 0; } diff --git a/bindings/c/test/apitester/run_c_api_tests.py b/bindings/c/test/apitester/run_c_api_tests.py new file mode 100755 index 0000000000..f61cbd151d --- /dev/null +++ 
b/bindings/c/test/apitester/run_c_api_tests.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +# +# run_c_api_tests.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys +import subprocess +import argparse +import os +from subprocess import Popen, TimeoutExpired +import logging +import signal + + +def get_logger(): + return logging.getLogger('foundationdb.run_c_api_tests') + + +def initialize_logger_level(logging_level): + logger = get_logger() + + assert logging_level in ['DEBUG', 'INFO', 'WARNING', 'ERROR'] + + logging.basicConfig(format='%(message)s') + if logging_level == 'DEBUG': + logger.setLevel(logging.DEBUG) + elif logging_level == 'INFO': + logger.setLevel(logging.INFO) + elif logging_level == 'WARNING': + logger.setLevel(logging.WARNING) + elif logging_level == 'ERROR': + logger.setLevel(logging.ERROR) + + +def run_tester(args, test_file): + cmd = [args.tester_binary, "--cluster-file", + args.cluster_file, "--test-file", test_file] + if args.external_client_library is not None: + cmd += ["--external-client-library", args.external_client_library] + + get_logger().info('\nRunning tester \'%s\'...' 
% ' '.join(cmd)) + proc = Popen(cmd, stdout=sys.stdout, stderr=sys.stderr) + timed_out = False + try: + ret_code = proc.wait(args.timeout) + except TimeoutExpired: + proc.kill() + timed_out = True + except Exception as e: + raise Exception('Unable to run tester (%s)' % e) + + if ret_code != 0: + if ret_code < 0: + reason = signal.Signals(-ret_code).name + else: + reason = 'exit code: %d' % ret_code + if timed_out: + reason = 'timed out after %d seconds' % args.timeout + ret_code = 1 + get_logger().error('\n\'%s\' did not complete succesfully (%s)' % + (cmd[0], reason)) + + get_logger().info('') + return ret_code + + +def run_tests(args): + num_failed = 0 + test_files = [f for f in os.listdir(args.test_dir) + if os.path.isfile(os.path.join(args.test_dir, f)) and f.endswith(".toml")] + + for test_file in test_files: + get_logger().info('Running test %s' % test_file) + ret_code = run_tester(args, os.path.join(args.test_dir, test_file)) + if ret_code != 0: + num_failed += 1 + + return num_failed + + +def parse_args(argv): + parser = argparse.ArgumentParser(description='FoundationDB C API Tester') + + parser.add_argument('--cluster-file', type=str, default="fdb.cluster", + help='The cluster file for the cluster being connected to. (default: fdb.cluster)') + parser.add_argument('--tester-binary', type=str, default="fdb_c_api_tester", + help='Path to the fdb_c_api_tester executable. (default: fdb_c_api_tester)') + parser.add_argument('--external-client-library', type=str, default=None, + help='Path to the external client library. (default: None)') + parser.add_argument('--test-dir', type=str, default="./", + help='Path to a directory with test definitions. (default: ./)') + parser.add_argument('--timeout', type=int, default=300, + help='The timeout in seconds for running each individual test. 
(default 300)') + parser.add_argument('--logging-level', type=str, default='INFO', + choices=['ERROR', 'WARNING', 'INFO', 'DEBUG'], help='Specifies the level of detail in the tester output (default=\'INFO\').') + + return parser.parse_args(argv) + + +def main(argv): + args = parse_args(argv) + initialize_logger_level(args.logging_level) + return run_tests(args) + + +if __name__ == '__main__': + sys.exit(main(sys.argv[1:])) diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessBlocking.toml b/bindings/c/test/apitester/tests/CApiCorrectnessBlocking.toml new file mode 100644 index 0000000000..a77908ce79 --- /dev/null +++ b/bindings/c/test/apitester/tests/CApiCorrectnessBlocking.toml @@ -0,0 +1,14 @@ +[[test]] +title = 'API Correctness Blocking' +multiThreaded = true +minFdbThreads = 2 +maxFdbThreads = 8 +minDatabases = 2 +maxDatabases = 8 +minClientThreads = 2 +maxClientThreads = 8 +buggify = true +blockOnFutures = true + + [[test.workload]] + name = 'ApiCorrectness' \ No newline at end of file diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessBuggify.toml b/bindings/c/test/apitester/tests/CApiCorrectnessBuggify.toml new file mode 100644 index 0000000000..f8b6d31c95 --- /dev/null +++ b/bindings/c/test/apitester/tests/CApiCorrectnessBuggify.toml @@ -0,0 +1,13 @@ +[[test]] +title = 'API Correctness Buggify' +multiThreaded = true +minFdbThreads = 2 +maxFdbThreads = 8 +minDatabases = 2 +maxDatabases = 8 +minClientThreads = 2 +maxClientThreads = 8 +buggify = true + + [[test.workload]] + name = 'ApiCorrectness' \ No newline at end of file diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml b/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml new file mode 100644 index 0000000000..6d292fff4d --- /dev/null +++ b/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml @@ -0,0 +1,12 @@ +[[test]] +title = 'API Correctness Multi Threaded' +multiThreaded = true +minFdbThreads = 2 +maxFdbThreads = 8 +minDatabases = 2 
+maxDatabases = 8 +minClientThreads = 2 +maxClientThreads = 8 + + [[test.workload]] + name = 'ApiCorrectness' \ No newline at end of file diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessSingleThr.toml b/bindings/c/test/apitester/tests/CApiCorrectnessSingleThr.toml new file mode 100644 index 0000000000..bb57f92fa8 --- /dev/null +++ b/bindings/c/test/apitester/tests/CApiCorrectnessSingleThr.toml @@ -0,0 +1,5 @@ +[[test]] +title = 'API Correctness Single Threaded' + + [[test.workload]] + name = 'ApiCorrectness' \ No newline at end of file From b8386f15d6240de04cdc9ceaba0a3d385eebe6eb Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Fri, 4 Mar 2022 16:22:49 +0100 Subject: [PATCH 036/138] ApiTester: configuration for ApiCorrectness workload; Better error handing and reporting --- bindings/c/CMakeLists.txt | 1 + .../c/test/apitester/TesterApiWrapper.cpp | 4 +- .../apitester/TesterCorrectnessWorkload.cpp | 48 +++++----- .../c/test/apitester/TesterKeyValueStore.cpp | 1 - bindings/c/test/apitester/TesterTestSpec.cpp | 11 +-- .../apitester/TesterTransactionExecutor.cpp | 76 +++++++++------ .../apitester/TesterTransactionExecutor.h | 3 + bindings/c/test/apitester/TesterUtil.cpp | 47 ---------- bindings/c/test/apitester/TesterUtil.h | 30 +++--- bindings/c/test/apitester/TesterWorkload.cpp | 93 ++++++++++++++++++- bindings/c/test/apitester/TesterWorkload.h | 37 ++++++-- .../c/test/apitester/fdb_c_api_tester.cpp | 39 ++++---- bindings/c/test/apitester/run_c_api_tests.py | 2 + .../tests/CApiCorrectnessBlocking.toml | 12 ++- .../tests/CApiCorrectnessBuggify.toml | 12 ++- .../tests/CApiCorrectnessMultiThr.toml | 12 ++- .../tests/CApiCorrectnessSingleThr.toml | 13 ++- cmake/AddFdbTest.cmake | 35 ++++--- tests/TestRunner/tmp_cluster.py | 13 ++- 19 files changed, 317 insertions(+), 172 deletions(-) diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index 2963fa3286..3c1d987ae4 100644 --- a/bindings/c/CMakeLists.txt +++ b/bindings/c/CMakeLists.txt @@ 
-230,6 +230,7 @@ if(NOT WIN32 AND NOT IS_ARM_MAC) ) add_fdbclient_test( NAME fdb_c_api_tests + DISABLE_LOG_DUMP COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py --cluster-file @CLUSTER_FILE@ diff --git a/bindings/c/test/apitester/TesterApiWrapper.cpp b/bindings/c/test/apitester/TesterApiWrapper.cpp index 50751d70ee..9097d13858 100644 --- a/bindings/c/test/apitester/TesterApiWrapper.cpp +++ b/bindings/c/test/apitester/TesterApiWrapper.cpp @@ -19,7 +19,7 @@ */ #include "TesterApiWrapper.h" #include -#include +#include namespace FdbApiTester { @@ -27,7 +27,7 @@ namespace { void fdb_check(fdb_error_t e) { if (e) { - std::cerr << fdb_get_error(e) << std::endl; + fmt::print(stderr, "Unexpected error: %s\n", fdb_get_error(e)); std::abort(); } } diff --git a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp index 15a2292a3c..c41eb9e72c 100644 --- a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp +++ b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp @@ -23,8 +23,8 @@ #include "test/apitester/TesterScheduler.h" #include #include -#include #include +#include namespace FdbApiTester { @@ -51,7 +51,7 @@ public: int initialSize; // The number of operations to be executed - int numOperations; + int numRandomOperations; // The ratio of reading existing keys double readExistingKeysRatio; @@ -59,17 +59,17 @@ public: // Key prefix std::string keyPrefix; - ApiCorrectnessWorkload(const WorkloadConfig& config) { - minKeyLength = 1; - maxKeyLength = 64; - minValueLength = 5; - maxValueLength = 10; - maxKeysPerTransaction = 50; - initialSize = 1000; - numOperations = 1000; - readExistingKeysRatio = 0.9; - keyPrefix = format("ApiCorrectness%d/", config.clientId); - numOpLeft = numOperations; + ApiCorrectnessWorkload(const WorkloadConfig& config) : WorkloadBase(config) { + minKeyLength = config.getIntOption("minKeyLength", 1); + maxKeyLength = config.getIntOption("maxKeyLength", 64); + 
minValueLength = config.getIntOption("minValueLength", 1); + maxValueLength = config.getIntOption("maxValueLength", 1000); + maxKeysPerTransaction = config.getIntOption("maxKeysPerTransaction", 50); + initialSize = config.getIntOption("initialSize", 1000); + numRandomOperations = config.getIntOption("numRandomOperations", 1000); + readExistingKeysRatio = config.getFloatOption("readExistingKeysRatio", 0.9); + keyPrefix = fmt::format("{}/", workloadId); + numOpLeft = numRandomOperations; } void start() override { @@ -109,7 +109,7 @@ private: if (key != store.startKey()) { return key; } - std::cout << "WARNING: No existing key found, using a new random key." << std::endl; + info("No existing key found, using a new random key."); return genKey; } @@ -180,8 +180,11 @@ private: for (int i = 0; i < kvPairs->size(); i++) { auto expected = store.get((*kvPairs)[i].key); if ((*results)[i] != expected) { - std::cout << "randomCommitReadOp mismatch. key: " << (*kvPairs)[i].key - << " expected: " << expected << " actual: " << (*results)[i] << std::endl; + error( + fmt::format("randomCommitReadOp mismatch. key: {} expected: {:.80} actual: {:.80}", + (*kvPairs)[i].key, + expected, + (*results)[i])); } } schedule(cont); @@ -214,8 +217,10 @@ private: for (int i = 0; i < keys->size(); i++) { auto expected = store.get((*keys)[i]); if ((*results)[i] != expected) { - std::cout << "randomGetOp mismatch. key :" << (*keys)[i] << " expected: " << expected - << " actual: " << (*results)[i] << std::endl; + error(fmt::format("randomGetOp mismatch. 
key: {} expected: {:.80} actual: {:.80}", + (*keys)[i], + expected, + (*results)[i])); } } schedule(cont); @@ -284,7 +289,7 @@ private: void clearData(TTaskFct cont) { execTransaction( [this](auto ctx) { - ctx->tx()->clearRange(keyPrefix, format("%s\xff", keyPrefix.c_str())); + ctx->tx()->clearRange(keyPrefix, fmt::format("{}\xff", keyPrefix)); ctx->commit(); }, [this, cont]() { schedule(cont); }); @@ -294,15 +299,12 @@ private: if (store.size() < initialSize) { randomInsertOp([this, cont]() { populateData(cont); }); } else { - std::cout << "Data population completed" << std::endl; + info("Data population completed"); schedule(cont); } } void randomOperations() { - if (numOpLeft % 100 == 0) { - std::cout << numOpLeft << " transactions left" << std::endl; - } if (numOpLeft == 0) return; diff --git a/bindings/c/test/apitester/TesterKeyValueStore.cpp b/bindings/c/test/apitester/TesterKeyValueStore.cpp index 3c221bb47c..1d9b8d63d2 100644 --- a/bindings/c/test/apitester/TesterKeyValueStore.cpp +++ b/bindings/c/test/apitester/TesterKeyValueStore.cpp @@ -19,7 +19,6 @@ */ #include "TesterKeyValueStore.h" -#include namespace FdbApiTester { diff --git a/bindings/c/test/apitester/TesterTestSpec.cpp b/bindings/c/test/apitester/TesterTestSpec.cpp index 7c0da5b59c..936e92216e 100644 --- a/bindings/c/test/apitester/TesterTestSpec.cpp +++ b/bindings/c/test/apitester/TesterTestSpec.cpp @@ -21,7 +21,7 @@ #include "TesterTestSpec.h" #include "TesterUtil.h" #include -#include +#include namespace FdbApiTester { @@ -31,11 +31,11 @@ void processIntOption(const std::string& value, const std::string& optionName, i char* endptr; res = strtol(value.c_str(), &endptr, 10); if (*endptr != '\0') { - throw TesterError(format("Invalid test file. Invalid value %s for %s\n", value.c_str(), optionName.c_str())); + throw TesterError(fmt::format("Invalid test file. Invalid value {} for {}", value, optionName)); } if (res < minVal || res > maxVal) { throw TesterError( - format("Invalid test file. 
Value for %s must be between %d and %d\n", optionName.c_str(), minVal, maxVal)); + fmt::format("Invalid test file. Value for {} must be between {} and {}", optionName, minVal, maxVal)); } } @@ -133,9 +133,8 @@ TestSpec readTomlTestSpec(std::string fileName) { if (testSpecTestKeys.find(k) != testSpecTestKeys.end()) { testSpecTestKeys[k](toml_to_string(v), &spec); } else { - throw TesterError(format("Invalid test file. Unrecognized test parameter. Name: %s, value %s", - k.c_str(), - toml_to_string(v).c_str())); + throw TesterError(fmt::format( + "Invalid test file. Unrecognized test parameter. Name: {}, value {}", k, toml_to_string(v))); } } diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.cpp b/bindings/c/test/apitester/TesterTransactionExecutor.cpp index 992a04c408..08b825406a 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.cpp +++ b/bindings/c/test/apitester/TesterTransactionExecutor.cpp @@ -21,23 +21,11 @@ #include "TesterTransactionExecutor.h" #include "TesterUtil.h" #include "test/apitester/TesterScheduler.h" -#include #include #include namespace FdbApiTester { -namespace { - -void fdb_check(fdb_error_t e) { - if (e) { - std::cerr << fdb_get_error(e) << std::endl; - std::abort(); - } -} - -} // namespace - void ITransactionContext::continueAfterAll(std::shared_ptr> futures, TTaskFct cont) { auto counter = std::make_shared>(futures->size()); for (auto& f : *futures) { @@ -56,7 +44,7 @@ public: TTaskFct cont, const TransactionExecutorOptions& options, IScheduler* scheduler) - : options(options), fdbTx(tx), txActor(txActor), contAfterDone(cont), scheduler(scheduler), finalError(0) {} + : options(options), fdbTx(tx), txActor(txActor), contAfterDone(cont), scheduler(scheduler) {} Transaction* tx() override { return &fdbTx; } void continueAfter(Future f, TTaskFct cont) override { doContinueAfter(f, cont); } @@ -85,12 +73,22 @@ private: scheduler->schedule([this, f, cont]() mutable { std::unique_lock lock(mutex); if (!onErrorFuture) { 
- fdb_check(fdb_future_block_until_ready(f.fdbFuture())); - fdb_error_t err = f.getError(); + fdb_error_t err = fdb_future_block_until_ready(f.fdbFuture()); + if (err) { + lock.unlock(); + transactionFailed(err); + return; + } + err = f.getError(); if (err) { if (err != error_code_transaction_cancelled) { onErrorFuture = fdbTx.onError(err); - fdb_check(fdb_future_block_until_ready(onErrorFuture.fdbFuture())); + fdb_error_t err2 = fdb_future_block_until_ready(onErrorFuture.fdbFuture()); + if (err2) { + lock.unlock(); + transactionFailed(err2); + return; + } scheduler->schedule([this]() { handleOnErrorResult(); }); } } else { @@ -105,7 +103,10 @@ private: if (!onErrorFuture) { waitMap[f.fdbFuture()] = WaitInfo{ f, cont }; lock.unlock(); - fdb_check(fdb_future_set_callback(f.fdbFuture(), futureReadyCallback, this)); + fdb_error_t err = fdb_future_set_callback(f.fdbFuture(), futureReadyCallback, this); + if (err) { + transactionFailed(err); + } } } @@ -128,7 +129,10 @@ private: waitMap.clear(); onErrorFuture = tx()->onError(err); lock.unlock(); - fdb_check(fdb_future_set_callback(onErrorFuture.fdbFuture(), onErrorReadyCallback, this)); + fdb_error_t err = fdb_future_set_callback(onErrorFuture.fdbFuture(), onErrorReadyCallback, this); + if (err) { + transactionFailed(err); + } } } else { scheduler->schedule(cont); @@ -149,10 +153,7 @@ private: fdb_error_t err = onErrorFuture.getError(); onErrorFuture.reset(); if (err) { - finalError = err; - std::cout << "Fatal error: " << fdb_get_error(finalError) << std::endl; - ASSERT(false); - done(); + transactionFailed(err); } else { lock.unlock(); txActor->reset(); @@ -165,6 +166,15 @@ private: TTaskFct cont; }; + void transactionFailed(fdb_error_t err) { + std::unique_lock lock(mutex); + onErrorFuture.reset(); + waitMap.clear(); + lock.unlock(); + txActor->setError(err); + done(); + } + const TransactionExecutorOptions& options; Transaction fdbTx; std::shared_ptr txActor; @@ -173,7 +183,6 @@ private: Future onErrorFuture; 
TTaskFct contAfterDone; IScheduler* scheduler; - fdb_error_t finalError; }; class TransactionExecutor : public ITransactionExecutor { @@ -187,7 +196,13 @@ public: this->options = options; for (int i = 0; i < options.numDatabases; i++) { FDBDatabase* db; - fdb_check(fdb_create_database(clusterFile, &db)); + fdb_error_t err = fdb_create_database(clusterFile, &db); + if (err != error_code_success) { + throw TesterError(fmt::format("Failed create database with the culster file '{}'. Error: {}({})", + clusterFile, + err, + fdb_get_error(err))); + } databases.push_back(db); } } @@ -195,10 +210,15 @@ public: void execute(std::shared_ptr txActor, TTaskFct cont) override { int idx = random.randomInt(0, options.numDatabases - 1); FDBTransaction* tx; - fdb_check(fdb_database_create_transaction(databases[idx], &tx)); - TransactionContext* ctx = new TransactionContext(tx, txActor, cont, options, scheduler); - txActor->init(ctx); - txActor->start(); + fdb_error_t err = fdb_database_create_transaction(databases[idx], &tx); + if (err != error_code_success) { + txActor->setError(err); + cont(); + } else { + TransactionContext* ctx = new TransactionContext(tx, txActor, cont, options, scheduler); + txActor->init(ctx); + txActor->start(); + } } void release() override { diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.h b/bindings/c/test/apitester/TesterTransactionExecutor.h index ef01838205..01d5bb581c 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.h +++ b/bindings/c/test/apitester/TesterTransactionExecutor.h @@ -47,6 +47,9 @@ public: virtual void init(ITransactionContext* ctx) = 0; virtual void start() = 0; virtual void reset() = 0; + virtual void setError(fdb_error_t err) { error = err; } + + fdb_error_t error = error_code_success; }; class TransactionActorBase : public ITransactionActor { diff --git a/bindings/c/test/apitester/TesterUtil.cpp b/bindings/c/test/apitester/TesterUtil.cpp index 95b5890dc6..4ab572b980 100644 --- 
a/bindings/c/test/apitester/TesterUtil.cpp +++ b/bindings/c/test/apitester/TesterUtil.cpp @@ -46,53 +46,6 @@ bool Random::randomBool(double trueRatio) { return std::uniform_real_distribution(0.0, 1.0)(random) <= trueRatio; } -int vsformat(std::string& outputString, const char* form, va_list args) { - char buf[200]; - - va_list args2; - va_copy(args2, args); - int size = vsnprintf(buf, sizeof(buf), form, args2); - va_end(args2); - - if (size >= 0 && size < sizeof(buf)) { - outputString = std::string(buf, size); - return size; - } - -#ifdef _WIN32 - // Microsoft's non-standard vsnprintf doesn't return a correct size, but just an error, so determine the necessary - // size - va_copy(args2, args); - size = _vscprintf(form, args2); - va_end(args2); -#endif - - if (size < 0) { - return -1; - } - - outputString.resize(size + 1); - size = vsnprintf(&outputString[0], outputString.size(), form, args); - if (size < 0 || size >= outputString.size()) { - return -1; - } - - outputString.resize(size); - return size; -} - -std::string format(const char* form, ...) 
{ - va_list args; - va_start(args, form); - - std::string str; - int result = vsformat(str, form, args); - va_end(args); - - ASSERT(result >= 0); - return str; -} - void print_internal_error(const char* msg, const char* file, int line) { fprintf(stderr, "Assertion %s failed @ %s %d:\n", msg, file, line); } diff --git a/bindings/c/test/apitester/TesterUtil.h b/bindings/c/test/apitester/TesterUtil.h index e0936143fc..6b10a6ffb7 100644 --- a/bindings/c/test/apitester/TesterUtil.h +++ b/bindings/c/test/apitester/TesterUtil.h @@ -26,6 +26,24 @@ #include #include #include +#include + +namespace fmt { + +template +struct formatter> : fmt::formatter { + + template + auto format(const std::optional& opt, FormatContext& ctx) { + if (opt) { + fmt::formatter::format(*opt, ctx); + return ctx.out(); + } + return fmt::format_to(ctx.out(), ""); + } +}; + +} // namespace fmt namespace FdbApiTester { @@ -42,18 +60,6 @@ public: std::mt19937 random; }; -template -std::ostream& operator<<(std::ostream& os, const std::optional& obj) { - if (obj.has_value()) { - os << obj.value(); - } else { - os << ""; - } - return os; -} - -std::string format(const char* form, ...); - class TesterError : public std::runtime_error { public: explicit TesterError(const char* message) : std::runtime_error(message) {} diff --git a/bindings/c/test/apitester/TesterWorkload.cpp b/bindings/c/test/apitester/TesterWorkload.cpp index f299e7328e..270bfb7c8c 100644 --- a/bindings/c/test/apitester/TesterWorkload.cpp +++ b/bindings/c/test/apitester/TesterWorkload.cpp @@ -20,15 +20,57 @@ #include "TesterWorkload.h" #include "TesterUtil.h" +#include #include +#include namespace FdbApiTester { +int WorkloadConfig::getIntOption(const std::string& name, int defaultVal) const { + auto iter = options.find(name); + if (iter == options.end()) { + return defaultVal; + } else { + char* endptr; + int intVal = strtol(iter->second.c_str(), &endptr, 10); + if (*endptr != '\0') { + throw TesterError( + fmt::format("Invalid workload 
configuration. Invalid value {} for {}", iter->second, name)); + } + return intVal; + } +} + +double WorkloadConfig::getFloatOption(const std::string& name, double defaultVal) const { + auto iter = options.find(name); + if (iter == options.end()) { + return defaultVal; + } else { + char* endptr; + double floatVal = strtod(iter->second.c_str(), &endptr); + if (*endptr != '\0') { + throw TesterError( + fmt::format("Invalid workload configuration. Invalid value {} for {}", iter->second, name)); + } + return floatVal; + } +} + +WorkloadBase::WorkloadBase(const WorkloadConfig& config) + : manager(nullptr), tasksScheduled(0), numErrors(0), clientId(config.clientId), numClients(config.numClients), + failed(false) { + maxErrors = config.getIntOption("maxErrors", 10); + workloadId = fmt::format("{}{}", config.name, clientId); +} + void WorkloadBase::init(WorkloadManager* manager) { this->manager = manager; } void WorkloadBase::schedule(TTaskFct task) { + if (failed) { + return; + } tasksScheduled++; manager->scheduler->schedule([this, task]() { task(); @@ -37,18 +79,51 @@ void WorkloadBase::schedule(TTaskFct task) { }); } -void WorkloadBase::execTransaction(std::shared_ptr tx, TTaskFct cont) { +void WorkloadBase::execTransaction(std::shared_ptr tx, TTaskFct cont, bool failOnError) { + if (failed) { + return; + } tasksScheduled++; - manager->txExecutor->execute(tx, [this, cont]() { - cont(); + manager->txExecutor->execute(tx, [this, tx, cont, failOnError]() { + if (tx->error == error_code_success) { + cont(); + } else { + std::string msg = + fmt::format("Transaction failed with error: {} ({}})", tx->error, fdb_get_error(tx->error)); + if (failOnError) { + error(msg); + failed = true; + } else { + info(msg); + cont(); + } + } tasksScheduled--; checkIfDone(); }); } +void WorkloadBase::info(const std::string& msg) { + fmt::print(stderr, "[{}] {}\n", workloadId, msg); +} + +void WorkloadBase::error(const std::string& msg) { + fmt::print(stderr, "[{}] ERROR: {}\n", workloadId, 
msg); + numErrors++; + if (numErrors > maxErrors && !failed) { + fmt::print(stderr, "[{}] ERROR: Stopping workload after {} errors\n", workloadId, numErrors); + failed = true; + } +} + void WorkloadBase::checkIfDone() { if (tasksScheduled == 0) { - manager->workloadDone(this); + if (numErrors > 0) { + error(fmt::format("Workload failed with {} errors", numErrors.load())); + } else { + info("Workload successfully completed"); + } + manager->workloadDone(this, numErrors > 0); } } @@ -65,9 +140,14 @@ void WorkloadManager::run() { iter.first->start(); } scheduler->join(); + if (numWorkloadsFailed > 0) { + fmt::print(stderr, "{} workloads failed", numWorkloadsFailed); + } else { + fprintf(stderr, "All workloads succesfully completed"); + } } -void WorkloadManager::workloadDone(IWorkload* workload) { +void WorkloadManager::workloadDone(IWorkload* workload, bool failed) { std::unique_lock lock(mutex); auto iter = workloads.find(workload); ASSERT(iter != workloads.end()); @@ -75,6 +155,9 @@ void WorkloadManager::workloadDone(IWorkload* workload) { iter->second.cont(); lock.lock(); workloads.erase(iter); + if (failed) { + numWorkloadsFailed++; + } bool done = workloads.empty(); lock.unlock(); if (done) { diff --git a/bindings/c/test/apitester/TesterWorkload.h b/bindings/c/test/apitester/TesterWorkload.h index f542bc1d00..aa4b5cc3ca 100644 --- a/bindings/c/test/apitester/TesterWorkload.h +++ b/bindings/c/test/apitester/TesterWorkload.h @@ -25,6 +25,7 @@ #define APITESTER_WORKLOAD_H #include "TesterTransactionExecutor.h" +#include "TesterUtil.h" #include #include #include @@ -41,16 +42,20 @@ public: }; struct WorkloadConfig { + std::string name; int clientId; int numClients; std::unordered_map options; + + int getIntOption(const std::string& name, int defaultVal) const; + double getFloatOption(const std::string& name, double defaultVal) const; }; // A base class for test workloads // Tracks if workload is active, notifies the workload manager when the workload completes class 
WorkloadBase : public IWorkload { public: - WorkloadBase() : manager(nullptr), tasksScheduled(0) {} + WorkloadBase(const WorkloadConfig& config); void init(WorkloadManager* manager) override; protected: @@ -58,13 +63,19 @@ protected: void schedule(TTaskFct task); // Execute a transaction within the workload - void execTransaction(std::shared_ptr tx, TTaskFct cont); + void execTransaction(std::shared_ptr tx, TTaskFct cont, bool failOnError = true); // Execute a transaction within the workload, a convenience method for tranasactions defined by a single lambda - void execTransaction(TTxStartFct start, TTaskFct cont) { - execTransaction(std::make_shared(start), cont); + void execTransaction(TTxStartFct start, TTaskFct cont, bool failOnError = true) { + execTransaction(std::make_shared(start), cont, failOnError); } + // Log an error message + void error(const std::string& msg); + + // Log an info message + void info(const std::string& msg); + private: WorkloadManager* manager; @@ -74,6 +85,14 @@ private: // Keep track of tasks scheduled by the workload // End workload when this number falls to 0 std::atomic tasksScheduled; + std::atomic numErrors; + +protected: + int clientId; + int numClients; + int maxErrors; + std::string workloadId; + std::atomic failed; }; // Workload manager @@ -81,7 +100,7 @@ private: class WorkloadManager { public: WorkloadManager(ITransactionExecutor* txExecutor, IScheduler* scheduler) - : txExecutor(txExecutor), scheduler(scheduler) {} + : txExecutor(txExecutor), scheduler(scheduler), numWorkloadsFailed(0) {} // Add a workload // A continuation is to be specified for subworkloads @@ -90,6 +109,11 @@ public: // Run all workloads. 
Blocks until all workloads complete void run(); + bool failed() { + std::unique_lock lock(mutex); + return numWorkloadsFailed > 0; + } + private: friend WorkloadBase; @@ -98,13 +122,14 @@ private: TTaskFct cont; }; - void workloadDone(IWorkload* workload); + void workloadDone(IWorkload* workload, bool failed); ITransactionExecutor* txExecutor; IScheduler* scheduler; std::mutex mutex; std::unordered_map workloads; + int numWorkloadsFailed; }; struct IWorkloadFactory { diff --git a/bindings/c/test/apitester/fdb_c_api_tester.cpp b/bindings/c/test/apitester/fdb_c_api_tester.cpp index 537b393f43..a08dbc6571 100644 --- a/bindings/c/test/apitester/fdb_c_api_tester.cpp +++ b/bindings/c/test/apitester/fdb_c_api_tester.cpp @@ -24,12 +24,13 @@ #include "TesterTransactionExecutor.h" #include "TesterTestSpec.h" #include "TesterUtil.h" -#include +#include "flow/SimpleOpt.h" +#include "bindings/c/foundationdb/fdb_c.h" + #include #include #include -#include "flow/SimpleOpt.h" -#include "bindings/c/foundationdb/fdb_c.h" +#include namespace FdbApiTester { @@ -121,7 +122,7 @@ bool processArg(TesterOptions& options, const CSimpleOpt& args) { break; case OPT_TRACE_FORMAT: if (!validateTraceFormat(args.OptionArg())) { - fprintf(stderr, "ERROR: Unrecognized trace format `%s'\n", args.OptionArg()); + fmt::print(stderr, "ERROR: Unrecognized trace format `{}'\n", args.OptionArg()); return false; } options.traceFormat = args.OptionArg(); @@ -129,7 +130,7 @@ bool processArg(TesterOptions& options, const CSimpleOpt& args) { case OPT_KNOB: { std::string knobName; if (!extractPrefixedArgument("--knob", args.OptionSyntax(), knobName)) { - fprintf(stderr, "ERROR: unable to parse knob option '%s'\n", args.OptionSyntax()); + fmt::print(stderr, "ERROR: unable to parse knob option '{}'\n", args.OptionSyntax()); return false; } options.knobs.emplace_back(knobName, args.OptionArg()); @@ -163,7 +164,7 @@ bool parseArgs(TesterOptions& options, int argc, char** argv) { return false; } } else { - 
printf("Invalid argument: %s\n", args.OptionText()); + fmt::print(stderr, "ERROR: Invalid argument: {}\n", args.OptionText()); printProgramUsage(argv[0]); return false; } @@ -173,7 +174,7 @@ bool parseArgs(TesterOptions& options, int argc, char** argv) { void fdb_check(fdb_error_t e) { if (e) { - std::cerr << fdb_get_error(e) << std::endl; + fmt::print(stderr, "Unexpected FDB error: {}({})\n", e, fdb_get_error(e)); std::abort(); } } @@ -186,7 +187,8 @@ void applyNetworkOptions(TesterOptions& options) { } if (options.testSpec.multiThreaded) { - FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_THREADS_PER_VERSION, options.numFdbThreads); + fdb_check( + FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_THREADS_PER_VERSION, options.numFdbThreads)); } if (options.testSpec.buggify) { @@ -200,8 +202,8 @@ void applyNetworkOptions(TesterOptions& options) { } for (auto knob : options.knobs) { - FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_KNOB, - format("%s=%s", knob.first.c_str(), knob.second.c_str())); + fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_KNOB, + fmt::format("{}={}", knob.first.c_str(), knob.second.c_str()))); } } @@ -213,7 +215,7 @@ void randomizeOptions(TesterOptions& options) { options.numClients = random.randomInt(options.testSpec.minClients, options.testSpec.maxClients); } -void runWorkloads(TesterOptions& options) { +bool runWorkloads(TesterOptions& options) { TransactionExecutorOptions txExecOptions; txExecOptions.blockOnFutures = options.testSpec.blockOnFutures; txExecOptions.numDatabases = options.numDatabases; @@ -227,18 +229,20 @@ void runWorkloads(TesterOptions& options) { for (const auto& workloadSpec : options.testSpec.workloads) { for (int i = 0; i < options.numClients; i++) { WorkloadConfig config; + config.name = workloadSpec.name; config.options = workloadSpec.options; config.clientId = i; config.numClients = options.numClients; std::shared_ptr workload = IWorkloadFactory::create(workloadSpec.name, config); 
if (!workload) { - throw TesterError(format("Unknown workload '%s'", workloadSpec.name.c_str())); + throw TesterError(fmt::format("Unknown workload '{}'", workloadSpec.name)); } workloadMgr.add(workload); } } workloadMgr.run(); + return !workloadMgr.failed(); } } // namespace @@ -247,6 +251,7 @@ void runWorkloads(TesterOptions& options) { using namespace FdbApiTester; int main(int argc, char** argv) { + int retCode = 0; try { TesterOptions options; if (!parseArgs(options, argc, argv)) { @@ -260,13 +265,15 @@ int main(int argc, char** argv) { std::thread network_thread{ &fdb_run_network }; - runWorkloads(options); + if (!runWorkloads(options)) { + retCode = 1; + } fdb_check(fdb_stop_network()); network_thread.join(); - return 0; } catch (const std::runtime_error& err) { - std::cerr << "ERROR: " << err.what() << std::endl; - return 1; + fmt::print(stderr, "ERROR: {}\n", err.what()); + retCode = 1; } + return retCode; } diff --git a/bindings/c/test/apitester/run_c_api_tests.py b/bindings/c/test/apitester/run_c_api_tests.py index f61cbd151d..8f79e6d8b1 100755 --- a/bindings/c/test/apitester/run_c_api_tests.py +++ b/bindings/c/test/apitester/run_c_api_tests.py @@ -86,7 +86,9 @@ def run_tests(args): if os.path.isfile(os.path.join(args.test_dir, f)) and f.endswith(".toml")] for test_file in test_files: + get_logger().info('=========================================================') get_logger().info('Running test %s' % test_file) + get_logger().info('=========================================================') ret_code = run_tester(args, os.path.join(args.test_dir, test_file)) if ret_code != 0: num_failed += 1 diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessBlocking.toml b/bindings/c/test/apitester/tests/CApiCorrectnessBlocking.toml index a77908ce79..4c23c2ac85 100644 --- a/bindings/c/test/apitester/tests/CApiCorrectnessBlocking.toml +++ b/bindings/c/test/apitester/tests/CApiCorrectnessBlocking.toml @@ -7,8 +7,18 @@ minDatabases = 2 maxDatabases = 8 
minClientThreads = 2 maxClientThreads = 8 +minClients = 2 +maxClients = 8 buggify = true blockOnFutures = true [[test.workload]] - name = 'ApiCorrectness' \ No newline at end of file + name = 'ApiCorrectness' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 + readExistingKeysRatio = 0.9 \ No newline at end of file diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessBuggify.toml b/bindings/c/test/apitester/tests/CApiCorrectnessBuggify.toml index f8b6d31c95..33e523c43c 100644 --- a/bindings/c/test/apitester/tests/CApiCorrectnessBuggify.toml +++ b/bindings/c/test/apitester/tests/CApiCorrectnessBuggify.toml @@ -7,7 +7,17 @@ minDatabases = 2 maxDatabases = 8 minClientThreads = 2 maxClientThreads = 8 +minClients = 2 +maxClients = 8 buggify = true [[test.workload]] - name = 'ApiCorrectness' \ No newline at end of file + name = 'ApiCorrectness' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 + readExistingKeysRatio = 0.9 \ No newline at end of file diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml b/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml index 6d292fff4d..af778764ed 100644 --- a/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml +++ b/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml @@ -7,6 +7,16 @@ minDatabases = 2 maxDatabases = 8 minClientThreads = 2 maxClientThreads = 8 +minClients = 2 +maxClients = 8 [[test.workload]] - name = 'ApiCorrectness' \ No newline at end of file + name = 'ApiCorrectness' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 + readExistingKeysRatio = 0.9 \ No newline at end of file diff --git 
a/bindings/c/test/apitester/tests/CApiCorrectnessSingleThr.toml b/bindings/c/test/apitester/tests/CApiCorrectnessSingleThr.toml index bb57f92fa8..9e27c27353 100644 --- a/bindings/c/test/apitester/tests/CApiCorrectnessSingleThr.toml +++ b/bindings/c/test/apitester/tests/CApiCorrectnessSingleThr.toml @@ -1,5 +1,16 @@ [[test]] title = 'API Correctness Single Threaded' +minClients = 1 +maxClients = 3 +multiThreaded = false [[test.workload]] - name = 'ApiCorrectness' \ No newline at end of file + name = 'ApiCorrectness' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 + readExistingKeysRatio = 0.9 \ No newline at end of file diff --git a/cmake/AddFdbTest.cmake b/cmake/AddFdbTest.cmake index 896cf3802a..c81ef2485d 100644 --- a/cmake/AddFdbTest.cmake +++ b/cmake/AddFdbTest.cmake @@ -129,7 +129,7 @@ function(add_fdb_test) -n ${test_name} -b ${PROJECT_BINARY_DIR} -t ${test_type} - -O ${OLD_FDBSERVER_BINARY} + -O ${OLD_FDBSERVER_BINARY} --config "@CTEST_CONFIGURATION_TYPE@" --crash --aggregate-traces ${TEST_AGGREGATE_TRACES} @@ -404,7 +404,7 @@ endfunction() # Creates a single cluster before running the specified command (usually a ctest test) function(add_fdbclient_test) - set(options DISABLED ENABLED) + set(options DISABLED ENABLED DISABLE_LOG_DUMP) set(oneValueArgs NAME PROCESS_NUMBER TEST_TIMEOUT WORKING_DIRECTORY) set(multiValueArgs COMMAND) cmake_parse_arguments(T "${options}" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}") @@ -423,23 +423,20 @@ function(add_fdbclient_test) if(NOT T_COMMAND) message(FATAL_ERROR "COMMAND is a required argument for add_fdbclient_test") endif() - message(STATUS "Adding Client test ${T_NAME}") - if (T_PROCESS_NUMBER) - add_test(NAME "${T_NAME}" - WORKING_DIRECTORY ${T_WORKING_DIRECTORY} - COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests/TestRunner/tmp_cluster.py - --build-dir ${CMAKE_BINARY_DIR} - --process-number 
${T_PROCESS_NUMBER} - -- - ${T_COMMAND}) - else() - add_test(NAME "${T_NAME}" - WORKING_DIRECTORY ${T_WORKING_DIRECTORY} - COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests/TestRunner/tmp_cluster.py - --build-dir ${CMAKE_BINARY_DIR} - -- - ${T_COMMAND}) + set(TMP_CLUSTER_CMD ${CMAKE_SOURCE_DIR}/tests/TestRunner/tmp_cluster.py + --build-dir ${CMAKE_BINARY_DIR}) + if(T_PROCESS_NUMBER) + list(APPEND TMP_CLUSTER_CMD --process-number ${T_PROCESS_NUMBER}) endif() + if(T_DISABLE_LOG_DUMP) + list(APPEND TMP_CLUSTER_CMD --disable-log-dump) + endif() + message(STATUS "Adding Client test ${T_NAME}") + add_test(NAME "${T_NAME}" + WORKING_DIRECTORY ${T_WORKING_DIRECTORY} + COMMAND ${Python_EXECUTABLE} ${TMP_CLUSTER_CMD} + -- + ${T_COMMAND}) if (T_TEST_TIMEOUT) set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT ${T_TEST_TIMEOUT}) else() @@ -449,7 +446,7 @@ function(add_fdbclient_test) set_tests_properties("${T_NAME}" PROPERTIES ENVIRONMENT UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=1) endfunction() -# Creates a cluster file for a nonexistent cluster before running the specified command +# Creates a cluster file for a nonexistent cluster before running the specified command # (usually a ctest test) function(add_unavailable_fdbclient_test) set(options DISABLED ENABLED) diff --git a/tests/TestRunner/tmp_cluster.py b/tests/TestRunner/tmp_cluster.py index eec0535bc4..21ec4547ad 100755 --- a/tests/TestRunner/tmp_cluster.py +++ b/tests/TestRunner/tmp_cluster.py @@ -18,7 +18,8 @@ class TempCluster: assert self.build_dir.is_dir(), "{} is not a directory".format(build_dir) tmp_dir = self.build_dir.joinpath( "tmp", - "".join(choice(LocalCluster.valid_letters_for_secret) for i in range(16)), + "".join(choice(LocalCluster.valid_letters_for_secret) + for i in range(16)), ) tmp_dir.mkdir(parents=True) self.cluster = LocalCluster( @@ -75,7 +76,8 @@ if __name__ == "__main__": help="FDB build directory", required=True, ) - parser.add_argument("cmd", metavar="COMMAND", nargs="+", 
help="The command to run") + parser.add_argument("cmd", metavar="COMMAND", + nargs="+", help="The command to run") parser.add_argument( "--process-number", "-p", @@ -83,6 +85,11 @@ if __name__ == "__main__": type=int, default=1, ) + parser.add_argument( + '--disable-log-dump', + help='Do not dump cluster log on error', + action="store_true" + ) args = parser.parse_args() errcode = 1 with TempCluster(args.build_dir, args.process_number) as cluster: @@ -128,7 +135,7 @@ if __name__ == "__main__": errcode = 1 break - if errcode: + if errcode and not args.disable_log_dump: for etc_file in glob.glob(os.path.join(cluster.etc, "*")): print(">>>>>>>>>>>>>>>>>>>> Contents of {}:".format(etc_file)) with open(etc_file, "r") as f: From 20c1e893c750799a108e2b2d5ff34bb0958c7281 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Fri, 4 Mar 2022 16:50:50 +0100 Subject: [PATCH 037/138] ApiTester: fix build dependencies --- bindings/c/CMakeLists.txt | 2 +- bindings/c/test/apitester/TesterApiWrapper.cpp | 2 +- bindings/c/test/apitester/TesterTestSpec.cpp | 1 + bindings/c/test/apitester/TesterTransactionExecutor.cpp | 2 ++ bindings/c/test/apitester/fdb_c_api_tester.cpp | 2 +- 5 files changed, 6 insertions(+), 3 deletions(-) diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index 59107033f8..f440de842f 100644 --- a/bindings/c/CMakeLists.txt +++ b/bindings/c/CMakeLists.txt @@ -163,7 +163,7 @@ if(NOT WIN32 AND NOT IS_ARM_MAC) target_link_libraries(fdb_c_unit_tests_version_510 PRIVATE fdb_c Threads::Threads) target_link_libraries(trace_partial_file_suffix_test PRIVATE fdb_c Threads::Threads flow) target_link_libraries(disconnected_timeout_unit_tests PRIVATE fdb_c Threads::Threads) - target_link_libraries(fdb_c_api_tester PRIVATE fdb_c toml11_target Threads::Threads) + target_link_libraries(fdb_c_api_tester PRIVATE fdb_c toml11_target Threads::Threads fmt::fmt boost_target) # do not set RPATH for mako set_property(TARGET mako PROPERTY SKIP_BUILD_RPATH TRUE) diff --git 
a/bindings/c/test/apitester/TesterApiWrapper.cpp b/bindings/c/test/apitester/TesterApiWrapper.cpp index 9097d13858..a8cf50b0d4 100644 --- a/bindings/c/test/apitester/TesterApiWrapper.cpp +++ b/bindings/c/test/apitester/TesterApiWrapper.cpp @@ -19,7 +19,7 @@ */ #include "TesterApiWrapper.h" #include -#include +#include namespace FdbApiTester { diff --git a/bindings/c/test/apitester/TesterTestSpec.cpp b/bindings/c/test/apitester/TesterTestSpec.cpp index 936e92216e..5f008f3dba 100644 --- a/bindings/c/test/apitester/TesterTestSpec.cpp +++ b/bindings/c/test/apitester/TesterTestSpec.cpp @@ -22,6 +22,7 @@ #include "TesterUtil.h" #include #include +#include namespace FdbApiTester { diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.cpp b/bindings/c/test/apitester/TesterTransactionExecutor.cpp index 08b825406a..fdecf205b8 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.cpp +++ b/bindings/c/test/apitester/TesterTransactionExecutor.cpp @@ -23,6 +23,8 @@ #include "test/apitester/TesterScheduler.h" #include #include +#include +#include namespace FdbApiTester { diff --git a/bindings/c/test/apitester/fdb_c_api_tester.cpp b/bindings/c/test/apitester/fdb_c_api_tester.cpp index a08dbc6571..1770b671dc 100644 --- a/bindings/c/test/apitester/fdb_c_api_tester.cpp +++ b/bindings/c/test/apitester/fdb_c_api_tester.cpp @@ -30,7 +30,7 @@ #include #include #include -#include +#include namespace FdbApiTester { From 1e75ffd8805dddb98a8d9309e00db4f1ab16bfdd Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Fri, 4 Mar 2022 18:34:36 +0100 Subject: [PATCH 038/138] ApiTester: Options to run each transaction in new database & to execute callbacks on external threads --- bindings/c/test/apitester/TesterTestSpec.cpp | 8 ++ bindings/c/test/apitester/TesterTestSpec.h | 2 + .../apitester/TesterTransactionExecutor.cpp | 81 ++++++++++++++----- .../apitester/TesterTransactionExecutor.h | 6 +- .../c/test/apitester/fdb_c_api_tester.cpp | 9 ++- 
.../tests/CApiCorrectnessBlocking.toml | 5 +- .../CApiCorrectnessCallbacksOnExtThr.toml | 24 ++++++ ...ggify.toml => CApiCorrectnessDBPerTX.toml} | 5 +- .../tests/CApiCorrectnessMultiThr.toml | 1 + 9 files changed, 111 insertions(+), 30 deletions(-) create mode 100644 bindings/c/test/apitester/tests/CApiCorrectnessCallbacksOnExtThr.toml rename bindings/c/test/apitester/tests/{CApiCorrectnessBuggify.toml => CApiCorrectnessDBPerTX.toml} (78%) diff --git a/bindings/c/test/apitester/TesterTestSpec.cpp b/bindings/c/test/apitester/TesterTestSpec.cpp index 5f008f3dba..dcb837dfb9 100644 --- a/bindings/c/test/apitester/TesterTestSpec.cpp +++ b/bindings/c/test/apitester/TesterTestSpec.cpp @@ -61,6 +61,14 @@ std::unordered_mapmultiThreaded = (value == "true"); } }, + { "fdbCallbacksOnExternalThreads", + [](const std::string& value, TestSpec* spec) { // + spec->fdbCallbacksOnExternalThreads = (value == "true"); + } }, + { "databasePerTransaction", + [](const std::string& value, TestSpec* spec) { // + spec->databasePerTransaction = (value == "true"); + } }, { "minFdbThreads", [](const std::string& value, TestSpec* spec) { // processIntOption(value, "minFdbThreads", spec->minFdbThreads, 1, 1000); diff --git a/bindings/c/test/apitester/TesterTestSpec.h b/bindings/c/test/apitester/TesterTestSpec.h index cbeb787b84..5c7c162756 100644 --- a/bindings/c/test/apitester/TesterTestSpec.h +++ b/bindings/c/test/apitester/TesterTestSpec.h @@ -43,6 +43,8 @@ struct TestSpec { bool blockOnFutures = false; bool multiThreaded = false; bool buggify = false; + bool fdbCallbacksOnExternalThreads = false; + bool databasePerTransaction = false; int minFdbThreads = 1; int maxFdbThreads = 1; int minClientThreads = 1; diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.cpp b/bindings/c/test/apitester/TesterTransactionExecutor.cpp index fdecf205b8..e1192d6d1a 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.cpp +++ b/bindings/c/test/apitester/TesterTransactionExecutor.cpp @@ 
-187,15 +187,43 @@ private: IScheduler* scheduler; }; -class TransactionExecutor : public ITransactionExecutor { +class TransactionExecutorBase : public ITransactionExecutor { public: - TransactionExecutor() : scheduler(nullptr) {} + TransactionExecutorBase(const TransactionExecutorOptions& options) : options(options), scheduler(nullptr) {} - ~TransactionExecutor() { release(); } - - void init(IScheduler* scheduler, const char* clusterFile, const TransactionExecutorOptions& options) override { + void init(IScheduler* scheduler, const char* clusterFile) override { this->scheduler = scheduler; - this->options = options; + this->clusterFile = clusterFile; + } + +protected: + void executeWithDatabase(FDBDatabase* db, std::shared_ptr txActor, TTaskFct cont) { + FDBTransaction* tx; + fdb_error_t err = fdb_database_create_transaction(db, &tx); + if (err != error_code_success) { + txActor->setError(err); + cont(); + } else { + TransactionContext* ctx = new TransactionContext(tx, txActor, cont, options, scheduler); + txActor->init(ctx); + txActor->start(); + } + } + +protected: + TransactionExecutorOptions options; + std::string clusterFile; + IScheduler* scheduler; +}; + +class DBPoolTransactionExecutor : public TransactionExecutorBase { +public: + DBPoolTransactionExecutor(const TransactionExecutorOptions& options) : TransactionExecutorBase(options) {} + + ~DBPoolTransactionExecutor() override { release(); } + + void init(IScheduler* scheduler, const char* clusterFile) override { + TransactionExecutorBase::init(scheduler, clusterFile); for (int i = 0; i < options.numDatabases; i++) { FDBDatabase* db; fdb_error_t err = fdb_create_database(clusterFile, &db); @@ -211,19 +239,10 @@ public: void execute(std::shared_ptr txActor, TTaskFct cont) override { int idx = random.randomInt(0, options.numDatabases - 1); - FDBTransaction* tx; - fdb_error_t err = fdb_database_create_transaction(databases[idx], &tx); - if (err != error_code_success) { - txActor->setError(err); - cont(); - } 
else { - TransactionContext* ctx = new TransactionContext(tx, txActor, cont, options, scheduler); - txActor->init(ctx); - txActor->start(); - } + executeWithDatabase(databases[idx], txActor, cont); } - void release() override { + void release() { for (FDBDatabase* db : databases) { fdb_database_destroy(db); } @@ -231,13 +250,33 @@ public: private: std::vector databases; - TransactionExecutorOptions options; - IScheduler* scheduler; Random random; }; -std::unique_ptr createTransactionExecutor() { - return std::make_unique(); +class DBPerTransactionExecutor : public TransactionExecutorBase { +public: + DBPerTransactionExecutor(const TransactionExecutorOptions& options) : TransactionExecutorBase(options) {} + + void execute(std::shared_ptr txActor, TTaskFct cont) override { + FDBDatabase* db = nullptr; + fdb_error_t err = fdb_create_database(clusterFile.c_str(), &db); + if (err != error_code_success) { + txActor->setError(err); + cont(); + } + executeWithDatabase(db, txActor, [cont, db]() { + fdb_database_destroy(db); + cont(); + }); + } +}; + +std::unique_ptr createTransactionExecutor(const TransactionExecutorOptions& options) { + if (options.databasePerTransaction) { + return std::make_unique(options); + } else { + return std::make_unique(options); + } } } // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.h b/bindings/c/test/apitester/TesterTransactionExecutor.h index 01d5bb581c..899f812cfc 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.h +++ b/bindings/c/test/apitester/TesterTransactionExecutor.h @@ -80,18 +80,18 @@ private: struct TransactionExecutorOptions { std::string prefix = ""; bool blockOnFutures = false; + bool databasePerTransaction = false; int numDatabases = 1; }; class ITransactionExecutor { public: virtual ~ITransactionExecutor() {} - virtual void init(IScheduler* sched, const char* clusterFile, const TransactionExecutorOptions& options) = 0; + virtual void 
init(IScheduler* sched, const char* clusterFile) = 0; virtual void execute(std::shared_ptr tx, TTaskFct cont) = 0; - virtual void release() = 0; }; -std::unique_ptr createTransactionExecutor(); +std::unique_ptr createTransactionExecutor(const TransactionExecutorOptions& options); } // namespace FdbApiTester diff --git a/bindings/c/test/apitester/fdb_c_api_tester.cpp b/bindings/c/test/apitester/fdb_c_api_tester.cpp index 1770b671dc..1dc5cb6da4 100644 --- a/bindings/c/test/apitester/fdb_c_api_tester.cpp +++ b/bindings/c/test/apitester/fdb_c_api_tester.cpp @@ -191,6 +191,10 @@ void applyNetworkOptions(TesterOptions& options) { FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_THREADS_PER_VERSION, options.numFdbThreads)); } + if (options.testSpec.fdbCallbacksOnExternalThreads) { + fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_CALLBACKS_ON_EXTERNAL_THREADS)); + } + if (options.testSpec.buggify) { fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_BUGGIFY_ENABLE)); } @@ -219,11 +223,12 @@ bool runWorkloads(TesterOptions& options) { TransactionExecutorOptions txExecOptions; txExecOptions.blockOnFutures = options.testSpec.blockOnFutures; txExecOptions.numDatabases = options.numDatabases; + txExecOptions.databasePerTransaction = options.testSpec.databasePerTransaction; std::unique_ptr scheduler = createScheduler(options.numClientThreads); - std::unique_ptr txExecutor = createTransactionExecutor(); + std::unique_ptr txExecutor = createTransactionExecutor(txExecOptions); scheduler->start(); - txExecutor->init(scheduler.get(), options.clusterFile.c_str(), txExecOptions); + txExecutor->init(scheduler.get(), options.clusterFile.c_str()); WorkloadManager workloadMgr(txExecutor.get(), scheduler.get()); for (const auto& workloadSpec : options.testSpec.workloads) { diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessBlocking.toml b/bindings/c/test/apitester/tests/CApiCorrectnessBlocking.toml index 4c23c2ac85..a55d484616 100644 --- 
a/bindings/c/test/apitester/tests/CApiCorrectnessBlocking.toml +++ b/bindings/c/test/apitester/tests/CApiCorrectnessBlocking.toml @@ -1,6 +1,8 @@ [[test]] title = 'API Correctness Blocking' multiThreaded = true +buggify = true +blockOnFutures = true minFdbThreads = 2 maxFdbThreads = 8 minDatabases = 2 @@ -9,8 +11,7 @@ minClientThreads = 2 maxClientThreads = 8 minClients = 2 maxClients = 8 -buggify = true -blockOnFutures = true + [[test.workload]] name = 'ApiCorrectness' diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessCallbacksOnExtThr.toml b/bindings/c/test/apitester/tests/CApiCorrectnessCallbacksOnExtThr.toml new file mode 100644 index 0000000000..3c609624ce --- /dev/null +++ b/bindings/c/test/apitester/tests/CApiCorrectnessCallbacksOnExtThr.toml @@ -0,0 +1,24 @@ +[[test]] +title = 'API Correctness Callbacks On External Threads' +multiThreaded = true +fdbCallbacksOnExternalThreads = true +buggify = true +minFdbThreads = 2 +maxFdbThreads = 8 +minDatabases = 2 +maxDatabases = 8 +minClientThreads = 2 +maxClientThreads = 8 +minClients = 2 +maxClients = 8 + + [[test.workload]] + name = 'ApiCorrectness' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 + readExistingKeysRatio = 0.9 \ No newline at end of file diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessBuggify.toml b/bindings/c/test/apitester/tests/CApiCorrectnessDBPerTX.toml similarity index 78% rename from bindings/c/test/apitester/tests/CApiCorrectnessBuggify.toml rename to bindings/c/test/apitester/tests/CApiCorrectnessDBPerTX.toml index 33e523c43c..0f7f25e494 100644 --- a/bindings/c/test/apitester/tests/CApiCorrectnessBuggify.toml +++ b/bindings/c/test/apitester/tests/CApiCorrectnessDBPerTX.toml @@ -1,6 +1,8 @@ [[test]] -title = 'API Correctness Buggify' +title = 'API Correctness Database Per Transaction' multiThreaded = true +buggify = true +databasePerTransaction = true 
minFdbThreads = 2 maxFdbThreads = 8 minDatabases = 2 @@ -9,7 +11,6 @@ minClientThreads = 2 maxClientThreads = 8 minClients = 2 maxClients = 8 -buggify = true [[test.workload]] name = 'ApiCorrectness' diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml b/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml index af778764ed..e0b07b09e1 100644 --- a/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml +++ b/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml @@ -1,6 +1,7 @@ [[test]] title = 'API Correctness Multi Threaded' multiThreaded = true +buggify = true minFdbThreads = 2 maxFdbThreads = 8 minDatabases = 2 From 892538e233af49ef8913de74debab3afd19d1002 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Fri, 4 Mar 2022 20:04:17 +0100 Subject: [PATCH 039/138] ApiTester: Adding some comments to the headers --- bindings/c/test/apitester/TesterScheduler.h | 12 ++++ bindings/c/test/apitester/TesterTestSpec.h | 30 +++++++++- .../apitester/TesterTransactionExecutor.h | 47 +++++++++++++++- bindings/c/test/apitester/TesterWorkload.h | 55 ++++++++++++++++++- 4 files changed, 139 insertions(+), 5 deletions(-) diff --git a/bindings/c/test/apitester/TesterScheduler.h b/bindings/c/test/apitester/TesterScheduler.h index 491aef568b..c48183b3e0 100644 --- a/bindings/c/test/apitester/TesterScheduler.h +++ b/bindings/c/test/apitester/TesterScheduler.h @@ -32,15 +32,27 @@ using TTaskFct = std::function; extern const TTaskFct NO_OP_TASK; +/** + * Scheduler for asynchronous execution of tasks on a pool of threads + */ class IScheduler { public: virtual ~IScheduler() {} + + // Create scheduler threads and begin accepting tasks virtual void start() = 0; + + // Schedule a task for asynchronous execution virtual void schedule(TTaskFct task) = 0; + + // Gracefully stop the scheduler. 
Waits for already running tasks to be finish virtual void stop() = 0; + + // Join with all threads of the scheduler virtual void join() = 0; }; +// create a scheduler using given number of threads std::unique_ptr createScheduler(int numThreads); } // namespace FdbApiTester diff --git a/bindings/c/test/apitester/TesterTestSpec.h b/bindings/c/test/apitester/TesterTestSpec.h index 5c7c162756..7467a7d59a 100644 --- a/bindings/c/test/apitester/TesterTestSpec.h +++ b/bindings/c/test/apitester/TesterTestSpec.h @@ -31,32 +31,58 @@ namespace FdbApiTester { +/// Workload specification struct WorkloadSpec { std::string name; std::unordered_map options; }; +// Test speficification loaded from a *.toml file struct TestSpec { + // Title of the test std::string title; - // api version, using the latest version by default + + // FDB API version, using the latest version by default int apiVersion = FDB_API_VERSION; + + // Use blocking waits on futures instead of scheduling callbacks bool blockOnFutures = false; + + // Use multi-threaded FDB client bool multiThreaded = false; + + // Enable injection of errors in FDB client bool buggify = false; + + // Execute future callbacks on the threads of the external FDB library + // rather than on the main thread of the local FDB client library bool fdbCallbacksOnExternalThreads = false; + + // Execute each transaction in a separate database instance bool databasePerTransaction = false; + + // Size of the FDB client thread pool (a random number in the [min,max] range) int minFdbThreads = 1; int maxFdbThreads = 1; + + // Size of the thread pool for test workloads (a random number in the [min,max] range) int minClientThreads = 1; int maxClientThreads = 1; + + // Size of the database instance pool (a random number in the [min,max] range) + // Each transaction is assigned randomly to one of the databases in the pool int minDatabases = 1; int maxDatabases = 1; + + // Number of workload clients (a random number in the [min,max] range) int 
minClients = 1; int maxClients = 10; - std::string testFile; + + // List of workloads with their options std::vector workloads; }; +// Read the test specfication from a *.toml file TestSpec readTomlTestSpec(std::string fileName); } // namespace FdbApiTester diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.h b/bindings/c/test/apitester/TesterTransactionExecutor.h index 899f812cfc..3797d6df9e 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.h +++ b/bindings/c/test/apitester/TesterTransactionExecutor.h @@ -31,27 +31,55 @@ namespace FdbApiTester { +/** + * Interface to be used for implementation of a concrete transaction + */ class ITransactionContext { public: virtual ~ITransactionContext() {} + + // Current FDB transaction virtual Transaction* tx() = 0; + + // Schedule a continuation to be executed when the future gets ready virtual void continueAfter(Future f, TTaskFct cont) = 0; + + // Commit the transaction virtual void commit() = 0; + + // Mark the transaction as completed without committing it (for read transactions) virtual void done() = 0; + + // A continuation to be executed when all of the given futures get ready virtual void continueAfterAll(std::shared_ptr> futures, TTaskFct cont); }; +/** + * Interface of an actor object implementing a concrete transaction + */ class ITransactionActor { public: virtual ~ITransactionActor() {} + + // Initialize with the given transaction context virtual void init(ITransactionContext* ctx) = 0; + + // Start execution of the transaction, also called on retries virtual void start() = 0; + + // Reset the transaction state virtual void reset() = 0; + + // Abort the transaction with an unretriable error virtual void setError(fdb_error_t err) { error = err; } + // Unretriable error, set if the transaction has failed fdb_error_t error = error_code_success; }; +/** + * A helper base class for transaction actors + */ class TransactionActorBase : public ITransactionActor { public: void 
init(ITransactionContext* ctx) override { context = ctx; } @@ -66,8 +94,12 @@ private: ITransactionContext* context = nullptr; }; +// Type of the lambda functions implementing a transaction using TTxStartFct = std::function; +/** + * A wrapper class for transactions implemented by lambda functions + */ class TransactionFct : public TransactionActorBase { public: TransactionFct(TTxStartFct startFct) : startFct(startFct) {} @@ -77,13 +109,25 @@ private: TTxStartFct startFct; }; +/** + * Configuration of transaction execution mode + */ struct TransactionExecutorOptions { - std::string prefix = ""; + // Use blocking waits on futures bool blockOnFutures = false; + + // Create each transaction in a separate database instance bool databasePerTransaction = false; + + // The size of the database instance pool int numDatabases = 1; }; +/** + * Transaction executor provides an interface for executing transactions + * It is responsible for instantiating FDB databases and transactions and managing their lifecycle + * according to the provided options + */ class ITransactionExecutor { public: virtual ~ITransactionExecutor() {} @@ -91,6 +135,7 @@ public: virtual void execute(std::shared_ptr tx, TTaskFct cont) = 0; }; +// Create a transaction executor for the given options std::unique_ptr createTransactionExecutor(const TransactionExecutorOptions& options); } // namespace FdbApiTester diff --git a/bindings/c/test/apitester/TesterWorkload.h b/bindings/c/test/apitester/TesterWorkload.h index aa4b5cc3ca..023a443757 100644 --- a/bindings/c/test/apitester/TesterWorkload.h +++ b/bindings/c/test/apitester/TesterWorkload.h @@ -34,19 +34,33 @@ namespace FdbApiTester { class WorkloadManager; +// Workoad interface class IWorkload { public: virtual ~IWorkload() {} + + // Intialize the workload virtual void init(WorkloadManager* manager) = 0; + + // Start executing the workload virtual void start() = 0; }; +// Workload configuration struct WorkloadConfig { + // Workoad name std::string name; + 
+ // Client ID assigned to the workload (a number from 0 to numClients-1) int clientId; + + // Total number of clients int numClients; + + // Workload options: as key-value pairs std::unordered_map options; + // Get option of a certain type by name. Throws an exception if the values is of a wrong type int getIntOption(const std::string& name, int defaultVal) const; double getFloatOption(const std::string& name, double defaultVal) const; }; @@ -56,6 +70,8 @@ struct WorkloadConfig { class WorkloadBase : public IWorkload { public: WorkloadBase(const WorkloadConfig& config); + + // Initialize the workload void init(WorkloadManager* manager) override; protected: @@ -65,12 +81,12 @@ protected: // Execute a transaction within the workload void execTransaction(std::shared_ptr tx, TTaskFct cont, bool failOnError = true); - // Execute a transaction within the workload, a convenience method for tranasactions defined by a single lambda + // Execute a transaction within the workload, a convenience method for a tranasaction defined by a lambda function void execTransaction(TTxStartFct start, TTaskFct cont, bool failOnError = true) { execTransaction(std::make_shared(start), cont, failOnError); } - // Log an error message + // Log an error message, increase error counter void error(const std::string& msg); // Log an info message @@ -85,13 +101,24 @@ private: // Keep track of tasks scheduled by the workload // End workload when this number falls to 0 std::atomic tasksScheduled; + + // Number of errors logged std::atomic numErrors; protected: + // Client ID assigned to the workload (a number from 0 to numClients-1) int clientId; + + // Total number of clients int numClients; + + // The maximum number of errors before stoppoing the workload int maxErrors; + + // Workload identifier, consisting of workload name and client ID std::string workloadId; + + // Workload is failed, no further transactions or continuations will be scheduled by the workload std::atomic failed; }; @@ -109,6 
+136,7 @@ public: // Run all workloads. Blocks until all workloads complete void run(); + // True if at least one workload has failed bool failed() { std::unique_lock lock(mutex); return numWorkloadsFailed > 0; @@ -117,29 +145,52 @@ public: private: friend WorkloadBase; + // Info about a running workload struct WorkloadInfo { + // Reference to the workoad for ownership std::shared_ptr ref; + // Continuation to be executed after completing the workload TTaskFct cont; }; + // To be called by a workload to notify that it is done void workloadDone(IWorkload* workload, bool failed); + // Transaction executor to be used by the workloads ITransactionExecutor* txExecutor; + + // A scheduler to be used by the workloads IScheduler* scheduler; + // Mutex protects access to workloads & numWorkloadsFailed std::mutex mutex; + + // A map of currently running workloads std::unordered_map workloads; + + // Number of workloads failed int numWorkloadsFailed; }; +// A workload factory struct IWorkloadFactory { + // create a workload by name static std::shared_ptr create(std::string const& name, const WorkloadConfig& config); + + // a singleton registry of workload factories static std::unordered_map& factories(); + // Interface to be implemented by a workload factory virtual ~IWorkloadFactory() = default; virtual std::shared_ptr create(const WorkloadConfig& config) = 0; }; +/** + * A template for a workload factory for creating workloads of a certain type + * + * Declare a global instance of the factory for a workload type as follows: + * WorkloadFactory MyWorkloadFactory("myWorkload"); + */ template struct WorkloadFactory : IWorkloadFactory { WorkloadFactory(const char* name) { factories()[name] = this; } From f03c0b8c3caa0c9898f1fff408a7ec26def7999b Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Fri, 4 Mar 2022 17:19:46 -0800 Subject: [PATCH 040/138] Added ISimulated::restarted for detecting a restarted simulation test. 
--- fdbrpc/simulator.h | 1 + fdbserver/SimulatedCluster.actor.cpp | 2 ++ 2 files changed, 3 insertions(+) diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index e03f16cfef..86c0b8c6b2 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -427,6 +427,7 @@ public: bool speedUpSimulation; BackupAgentType backupAgents; BackupAgentType drAgents; + bool restarted = false; bool hasDiffProtocolProcess; // true if simulator is testing a process with a different version bool setDiffProtocol; // true if a process with a different protocol version has been started diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index e7cb21e74a..8c173ef617 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -1179,6 +1179,8 @@ ACTOR Future restartSimulatedSystem(std::vector>* systemActor json_spirit::write_string(json_spirit::mValue(regionArr), json_spirit::Output_options::none); } + g_simulator.restarted = true; + TraceEvent("RestartSimulatorSettings") .detail("DesiredCoordinators", g_simulator.desiredCoordinators) .detail("ProcessesPerMachine", g_simulator.processesPerMachine) From 11f25bc08f8350872eb6bbba95f35fcbce78765c Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Fri, 4 Mar 2022 17:20:31 -0800 Subject: [PATCH 041/138] Initialize decode boundary verifier to nullptr for restart tests, also disable check for now as the feature is not finished. --- fdbserver/VersionedBTree.actor.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index b483a67a84..f77c7aa8de 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -4662,7 +4662,13 @@ struct DecodeBoundaryVerifier { static DecodeBoundaryVerifier* getVerifier(std::string name) { static std::map verifiers; - return g_network->isSimulated() ? 
&verifiers[name] : nullptr; + // Verifier disabled due to not being finished + // + // Only use verifier in a non-restarted simulation so that all page writes are captured + // if (g_network->isSimulated() && !g_simulator.restarted) { + // return &verifiers[name]; + // } + return nullptr; } void update(BTreePageIDRef id, Version v, Key lowerBound, Key upperBound) { From 8d2c3a4773956c21399c0ab9bf70b0f48ffaf06c Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Wed, 9 Mar 2022 13:28:12 +0100 Subject: [PATCH 042/138] ApiTester: Use thread-local random generators --- .../apitester/TesterCorrectnessWorkload.cpp | 17 ++++++++--------- .../apitester/TesterTransactionExecutor.cpp | 3 +-- bindings/c/test/apitester/TesterUtil.cpp | 5 +++++ bindings/c/test/apitester/TesterUtil.h | 2 ++ bindings/c/test/apitester/fdb_c_api_tester.cpp | 2 +- 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp index c41eb9e72c..b597645665 100644 --- a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp +++ b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp @@ -86,9 +86,9 @@ public: } private: - std::string randomKeyName() { return keyPrefix + random.randomStringLowerCase(minKeyLength, maxKeyLength); } + std::string randomKeyName() { return keyPrefix + Random::get().randomStringLowerCase(minKeyLength, maxKeyLength); } - std::string randomValue() { return random.randomStringLowerCase(minValueLength, maxValueLength); } + std::string randomValue() { return Random::get().randomStringLowerCase(minValueLength, maxValueLength); } std::string randomNotExistingKey() { while (true) { @@ -114,7 +114,7 @@ private: } std::string randomKey(double existingKeyRatio) { - if (random.randomBool(existingKeyRatio)) { + if (Random::get().randomBool(existingKeyRatio)) { return randomExistingKey(); } else { return randomNotExistingKey(); @@ -122,7 +122,7 @@ private: } void 
randomInsertOp(TTaskFct cont) { - int numKeys = random.randomInt(1, maxKeysPerTransaction); + int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); auto kvPairs = std::make_shared>(); for (int i = 0; i < numKeys; i++) { kvPairs->push_back(KeyValue{ randomNotExistingKey(), randomValue() }); @@ -143,7 +143,7 @@ private: } void randomCommitReadOp(TTaskFct cont) { - int numKeys = random.randomInt(1, maxKeysPerTransaction); + int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); auto kvPairs = std::make_shared>(); for (int i = 0; i < numKeys; i++) { kvPairs->push_back(KeyValue{ randomKey(readExistingKeysRatio), randomValue() }); @@ -193,7 +193,7 @@ private: } void randomGetOp(TTaskFct cont) { - int numKeys = random.randomInt(1, maxKeysPerTransaction); + int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); auto keys = std::make_shared>(); auto results = std::make_shared>>(); for (int i = 0; i < numKeys; i++) { @@ -228,7 +228,7 @@ private: } void randomClearOp(TTaskFct cont) { - int numKeys = random.randomInt(1, maxKeysPerTransaction); + int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); auto keys = std::make_shared>(); for (int i = 0; i < numKeys; i++) { keys->push_back(randomExistingKey()); @@ -266,7 +266,7 @@ private: } void randomOperation(TTaskFct cont) { - OpType txType = (store.size() == 0) ? OP_INSERT : (OpType)random.randomInt(0, OP_LAST); + OpType txType = (store.size() == 0) ? 
OP_INSERT : (OpType)Random::get().randomInt(0, OP_LAST); switch (txType) { case OP_INSERT: randomInsertOp(cont); @@ -313,7 +313,6 @@ private: } int numOpLeft; - Random random; KeyValueStore store; }; diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.cpp b/bindings/c/test/apitester/TesterTransactionExecutor.cpp index e1192d6d1a..36f8e43ec9 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.cpp +++ b/bindings/c/test/apitester/TesterTransactionExecutor.cpp @@ -238,7 +238,7 @@ public: } void execute(std::shared_ptr txActor, TTaskFct cont) override { - int idx = random.randomInt(0, options.numDatabases - 1); + int idx = Random::get().randomInt(0, options.numDatabases - 1); executeWithDatabase(databases[idx], txActor, cont); } @@ -250,7 +250,6 @@ public: private: std::vector databases; - Random random; }; class DBPerTransactionExecutor : public TransactionExecutorBase { diff --git a/bindings/c/test/apitester/TesterUtil.cpp b/bindings/c/test/apitester/TesterUtil.cpp index 4ab572b980..42f9eb218f 100644 --- a/bindings/c/test/apitester/TesterUtil.cpp +++ b/bindings/c/test/apitester/TesterUtil.cpp @@ -32,6 +32,11 @@ int Random::randomInt(int min, int max) { return std::uniform_int_distribution(min, max)(random); } +Random& Random::get() { + static thread_local Random random; + return random; +} + std::string Random::randomStringLowerCase(int minLength, int maxLength) { int length = randomInt(minLength, maxLength); std::string str; diff --git a/bindings/c/test/apitester/TesterUtil.h b/bindings/c/test/apitester/TesterUtil.h index 6b10a6ffb7..c6d7de92bc 100644 --- a/bindings/c/test/apitester/TesterUtil.h +++ b/bindings/c/test/apitester/TesterUtil.h @@ -51,6 +51,8 @@ class Random { public: Random(); + static Random& get(); + int randomInt(int min, int max); std::string randomStringLowerCase(int minLength, int maxLength); diff --git a/bindings/c/test/apitester/fdb_c_api_tester.cpp b/bindings/c/test/apitester/fdb_c_api_tester.cpp index 
1dc5cb6da4..062ffb95f7 100644 --- a/bindings/c/test/apitester/fdb_c_api_tester.cpp +++ b/bindings/c/test/apitester/fdb_c_api_tester.cpp @@ -212,7 +212,7 @@ void applyNetworkOptions(TesterOptions& options) { } void randomizeOptions(TesterOptions& options) { - Random random; + Random& random = Random::get(); options.numFdbThreads = random.randomInt(options.testSpec.minFdbThreads, options.testSpec.maxFdbThreads); options.numClientThreads = random.randomInt(options.testSpec.minClientThreads, options.testSpec.maxClientThreads); options.numDatabases = random.randomInt(options.testSpec.minDatabases, options.testSpec.maxDatabases); From bd0bf1cfc20a449cd1b14a58e9a8da09bf4d8824 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Wed, 9 Mar 2022 14:38:07 +0100 Subject: [PATCH 043/138] ApiTester: Address concurrency issues in the workload management --- .../apitester/TesterTransactionExecutor.cpp | 2 +- bindings/c/test/apitester/TesterWorkload.cpp | 24 +++++++++++-------- bindings/c/test/apitester/TesterWorkload.h | 5 ++-- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.cpp b/bindings/c/test/apitester/TesterTransactionExecutor.cpp index 36f8e43ec9..16e2659a58 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.cpp +++ b/bindings/c/test/apitester/TesterTransactionExecutor.cpp @@ -228,7 +228,7 @@ public: FDBDatabase* db; fdb_error_t err = fdb_create_database(clusterFile, &db); if (err != error_code_success) { - throw TesterError(fmt::format("Failed create database with the culster file '{}'. Error: {}({})", + throw TesterError(fmt::format("Failed create database with the cluster file '{}'. 
Error: {}({})", clusterFile, err, fdb_get_error(err))); diff --git a/bindings/c/test/apitester/TesterWorkload.cpp b/bindings/c/test/apitester/TesterWorkload.cpp index 270bfb7c8c..a374e5a09c 100644 --- a/bindings/c/test/apitester/TesterWorkload.cpp +++ b/bindings/c/test/apitester/TesterWorkload.cpp @@ -20,9 +20,11 @@ #include "TesterWorkload.h" #include "TesterUtil.h" +#include "test/apitester/TesterScheduler.h" #include #include #include +#include namespace FdbApiTester { @@ -74,8 +76,7 @@ void WorkloadBase::schedule(TTaskFct task) { tasksScheduled++; manager->scheduler->schedule([this, task]() { task(); - tasksScheduled--; - checkIfDone(); + scheduledTaskDone(); }); } @@ -98,8 +99,7 @@ void WorkloadBase::execTransaction(std::shared_ptr tx, TTaskF cont(); } } - tasksScheduled--; - checkIfDone(); + scheduledTaskDone(); }); } @@ -116,8 +116,8 @@ void WorkloadBase::error(const std::string& msg) { } } -void WorkloadBase::checkIfDone() { - if (tasksScheduled == 0) { +void WorkloadBase::scheduledTaskDone() { + if (--tasksScheduled == 0) { if (numErrors > 0) { error(fmt::format("Workload failed with {} errors", numErrors.load())); } else { @@ -133,14 +133,18 @@ void WorkloadManager::add(std::shared_ptr workload, TTaskFct cont) { } void WorkloadManager::run() { + std::vector> initialWorkloads; for (auto iter : workloads) { - iter.first->init(this); + initialWorkloads.push_back(iter.second.ref); } - for (auto iter : workloads) { - iter.first->start(); + for (auto iter : initialWorkloads) { + iter->init(this); + } + for (auto iter : initialWorkloads) { + iter->start(); } scheduler->join(); - if (numWorkloadsFailed > 0) { + if (failed()) { fmt::print(stderr, "{} workloads failed", numWorkloadsFailed); } else { fprintf(stderr, "All workloads succesfully completed"); diff --git a/bindings/c/test/apitester/TesterWorkload.h b/bindings/c/test/apitester/TesterWorkload.h index 023a443757..d1e9f94c3d 100644 --- a/bindings/c/test/apitester/TesterWorkload.h +++ 
b/bindings/c/test/apitester/TesterWorkload.h @@ -95,8 +95,9 @@ protected: private: WorkloadManager* manager; - // Check if workload is done and notify the workload manager - void checkIfDone(); + // Decrease scheduled task counter, notify the workload manager + // that the task is done if no more tasks schedule + void scheduledTaskDone(); // Keep track of tasks scheduled by the workload // End workload when this number falls to 0 From e6950abae6914dab989ec00a4b4b648bb9d8af52 Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Fri, 4 Mar 2022 17:09:06 -0800 Subject: [PATCH 044/138] Use jemalloc for SQLite/Redwood page cache allocation --- cmake/Jemalloc.cmake | 1 + fdbrpc/AsyncFileCached.actor.cpp | 8 ++++++++ fdbrpc/AsyncFileCached.actor.h | 8 ++++++++ flow/FastAlloc.h | 4 ++++ 4 files changed, 21 insertions(+) diff --git a/cmake/Jemalloc.cmake b/cmake/Jemalloc.cmake index 176c88c9b6..723ca6b081 100644 --- a/cmake/Jemalloc.cmake +++ b/cmake/Jemalloc.cmake @@ -8,6 +8,7 @@ if(USE_SANITIZER OR WIN32 OR (CMAKE_SYSTEM_NAME STREQUAL "FreeBSD") OR APPLE) return() endif() +add_definitions(-DUSE_JEMALLOC) find_path(JEMALLOC_INCLUDE_DIR NAMES jemalloc/jemalloc.h diff --git a/fdbrpc/AsyncFileCached.actor.cpp b/fdbrpc/AsyncFileCached.actor.cpp index 6354e55cd0..f91e4ab10d 100644 --- a/fdbrpc/AsyncFileCached.actor.cpp +++ b/fdbrpc/AsyncFileCached.actor.cpp @@ -29,10 +29,14 @@ static std::map, Referen EvictablePage::~EvictablePage() { if (data) { +#if defined(USE_JEMALLOC) + aligned_free(data); +#else if (pageCache->pageSize == 4096) FastAllocator<4096>::release(data); else aligned_free(data); +#endif } if (EvictablePageCache::RANDOM == pageCache->cacheEvictionType) { if (index > -1) { @@ -169,10 +173,14 @@ void AsyncFileCached::releaseZeroCopy(void* data, int length, int64_t offset) { if (o != orphanedPages.end()) { if (o->second == 1) { if (data) { +#if defined(USE_JEMALLOC) + aligned_free(data); +#else if (length == 4096) FastAllocator<4096>::release(data); else aligned_free(data); 
+#endif } } else { --o->second; diff --git a/fdbrpc/AsyncFileCached.actor.h b/fdbrpc/AsyncFileCached.actor.h index ea4d0c9e97..47a7b9d282 100644 --- a/fdbrpc/AsyncFileCached.actor.h +++ b/fdbrpc/AsyncFileCached.actor.h @@ -79,7 +79,11 @@ struct EvictablePageCache : ReferenceCounted { void allocate(EvictablePage* page) { try_evict(); try_evict(); +#if defined(USE_JEMALLOC) + page->data = aligned_alloc(4096, pageSize); +#else page->data = pageSize == 4096 ? FastAllocator<4096>::allocate() : aligned_alloc(4096, pageSize); +#endif if (RANDOM == cacheEvictionType) { page->index = pages.size(); pages.push_back(page); @@ -387,7 +391,11 @@ struct AFCPage : public EvictablePage, public FastAllocated { owner->orphanedPages[data] = zeroCopyRefCount; zeroCopyRefCount = 0; notReading = Void(); +#if defined(USE_JEMALLOC) + data = aligned_alloc(4096, pageCache->pageSize); +#else data = pageCache->pageSize == 4096 ? FastAllocator<4096>::allocate() : aligned_alloc(4096, pageCache->pageSize); +#endif } Future write(void const* data, int length, int offset) { diff --git a/flow/FastAlloc.h b/flow/FastAlloc.h index 6fbc6ce0af..a91102d12e 100644 --- a/flow/FastAlloc.h +++ b/flow/FastAlloc.h @@ -278,6 +278,7 @@ inline void freeFast(int size, void* ptr) { } [[nodiscard]] inline void* allocateFast4kAligned(int size) { +#if !defined(USE_JEMALLOC) // Use FastAllocator for sizes it supports to avoid internal fragmentation in some implementations of aligned_alloc if (size <= 4096) return FastAllocator<4096>::allocate(); @@ -285,10 +286,12 @@ inline void freeFast(int size, void* ptr) { return FastAllocator<8192>::allocate(); if (size <= 16384) return FastAllocator<16384>::allocate(); +#endif return aligned_alloc(4096, size); } inline void freeFast4kAligned(int size, void* ptr) { +#if !defined(USE_JEMALLOC) // Sizes supported by FastAllocator must be release via FastAllocator if (size <= 4096) return FastAllocator<4096>::release(ptr); @@ -296,6 +299,7 @@ inline void freeFast4kAligned(int size, 
void* ptr) { return FastAllocator<8192>::release(ptr); if (size <= 16384) return FastAllocator<16384>::release(ptr); +#endif aligned_free(ptr); } From 92ce0de40429ce1b99bb677c7a6ce8b5f15d0d69 Mon Sep 17 00:00:00 2001 From: Renxuan Wang Date: Wed, 9 Mar 2022 10:59:11 -0800 Subject: [PATCH 045/138] When storing coordinators string, we should concatenate by comma. We are splitting by comma (https://github.com/apple/foundationdb/blob/402fa4dd9e2151807fc8a89d6471f2da41e41d4a/fdbclient/SpecialKeySpace.actor.cpp#L1686). So when we concatenating, if we use ", ", there will be an extra space, causing hostname parsing error. NetworkAddress happens not to have the same issue because it uses sscanf (https://github.com/apple/foundationdb/blob/402fa4dd9e2151807fc8a89d6471f2da41e41d4a/flow/network.cpp#L119). --- fdbcli/CoordinatorsCommand.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbcli/CoordinatorsCommand.actor.cpp b/fdbcli/CoordinatorsCommand.actor.cpp index f274f17df7..a62848861c 100644 --- a/fdbcli/CoordinatorsCommand.actor.cpp +++ b/fdbcli/CoordinatorsCommand.actor.cpp @@ -132,7 +132,7 @@ ACTOR Future changeCoordinators(Reference db, std::vectorset(fdb_cli::coordinatorsProcessSpecialKey, new_coordinators_str); } wait(safeThreadFutureToFuture(tr->commit())); From e2c7c30faf9515b03269730541605cb77d38221a Mon Sep 17 00:00:00 2001 From: Tao Lin Date: Thu, 10 Mar 2022 10:05:44 -0800 Subject: [PATCH 046/138] GetMappedRange support serializable & check RYW & continuation (#6181) --- bindings/c/fdb_c.cpp | 82 ++-- bindings/c/foundationdb/fdb_c.h | 98 ++++- bindings/c/test/unit/fdb_api.cpp | 12 +- bindings/c/test/unit/fdb_api.hpp | 14 +- bindings/c/test/unit/unit_tests.cpp | 129 ++++-- bindings/java/CMakeLists.txt | 7 + bindings/java/fdbJNI.cpp | 296 +++++++++++-- ...a => MappedRangeQueryIntegrationTest.java} | 132 +++--- .../foundationdb/FakeFDBTransaction.java | 2 - .../foundationdb/DirectBufferIterator.java | 30 +- 
.../apple/foundationdb/FDBTransaction.java | 57 ++- .../foundationdb/FutureMappedResults.java | 87 ++++ .../com/apple/foundationdb/FutureResults.java | 2 +- .../main/com/apple/foundationdb/KeyValue.java | 11 + .../apple/foundationdb/MappedKeyValue.java | 96 +++++ .../apple/foundationdb/MappedRangeQuery.java | 333 ++++++++++++++ .../apple/foundationdb/MappedRangeResult.java | 64 +++ ...MappedRangeResultDirectBufferIterator.java | 71 +++ .../foundationdb/MappedRangeResultInfo.java | 29 ++ .../com/apple/foundationdb/RangeQuery.java | 20 +- .../com/apple/foundationdb/RangeResult.java | 2 +- .../RangeResultDirectBufferIterator.java | 62 +++ .../foundationdb/RangeResultSummary.java | 12 + .../apple/foundationdb/ReadTransaction.java | 4 +- bindings/java/src/tests.cmake | 2 +- fdbclient/DatabaseContext.h | 2 +- fdbclient/FDBTypes.h | 113 +++++ fdbclient/IClientApi.h | 12 +- fdbclient/ISingleThreadTransaction.h | 12 +- fdbclient/MultiVersionTransaction.actor.cpp | 80 ++-- fdbclient/MultiVersionTransaction.h | 97 +++-- fdbclient/NativeAPI.actor.cpp | 211 +++++---- fdbclient/NativeAPI.actor.h | 22 +- fdbclient/PaxosConfigTransaction.h | 12 +- fdbclient/RYWIterator.h | 6 +- fdbclient/ReadYourWrites.actor.cpp | 270 ++++++++---- fdbclient/ReadYourWrites.h | 15 +- fdbclient/ServerKnobs.cpp | 6 +- fdbclient/ServerKnobs.h | 2 + fdbclient/SimpleConfigTransaction.h | 12 +- fdbclient/StorageServerInterface.cpp | 18 +- fdbclient/StorageServerInterface.h | 23 +- fdbclient/ThreadSafeTransaction.cpp | 16 +- fdbclient/ThreadSafeTransaction.h | 12 +- fdbrpc/TSSComparison.h | 12 +- fdbserver/CMakeLists.txt | 2 +- fdbserver/storageserver.actor.cpp | 188 ++++---- fdbserver/worker.actor.cpp | 10 +- fdbserver/workloads/ApiWorkload.actor.cpp | 1 + fdbserver/workloads/ApiWorkload.h | 28 ++ fdbserver/workloads/GetMappedRange.actor.cpp | 406 ++++++++++++++++++ fdbserver/workloads/GetRangeAndMap.actor.cpp | 186 -------- flow/Arena.h | 2 + flow/error_definitions.h | 3 +- tests/CMakeLists.txt | 2 +- 
tests/fast/GetMappedRange.toml | 6 + tests/fast/GetRangeAndMap.toml | 6 - 57 files changed, 2559 insertions(+), 888 deletions(-) rename bindings/java/src/integration/com/apple/foundationdb/{RangeAndFlatMapQueryIntegrationTest.java => MappedRangeQueryIntegrationTest.java} (63%) create mode 100644 bindings/java/src/main/com/apple/foundationdb/FutureMappedResults.java create mode 100644 bindings/java/src/main/com/apple/foundationdb/MappedKeyValue.java create mode 100644 bindings/java/src/main/com/apple/foundationdb/MappedRangeQuery.java create mode 100644 bindings/java/src/main/com/apple/foundationdb/MappedRangeResult.java create mode 100644 bindings/java/src/main/com/apple/foundationdb/MappedRangeResultDirectBufferIterator.java create mode 100644 bindings/java/src/main/com/apple/foundationdb/MappedRangeResultInfo.java create mode 100644 bindings/java/src/main/com/apple/foundationdb/RangeResultDirectBufferIterator.java create mode 100644 fdbserver/workloads/GetMappedRange.actor.cpp delete mode 100644 fdbserver/workloads/GetRangeAndMap.actor.cpp create mode 100644 tests/fast/GetMappedRange.toml delete mode 100644 tests/fast/GetRangeAndMap.toml diff --git a/bindings/c/fdb_c.cpp b/bindings/c/fdb_c.cpp index baa4caf521..6bbc360a1f 100644 --- a/bindings/c/fdb_c.cpp +++ b/bindings/c/fdb_c.cpp @@ -281,6 +281,16 @@ fdb_error_t fdb_future_get_keyvalue_array_v13(FDBFuture* f, FDBKeyValue const** *out_count = rrr.size();); } +extern "C" DLLEXPORT fdb_error_t fdb_future_get_mappedkeyvalue_array(FDBFuture* f, + FDBMappedKeyValue const** out_kvm, + int* out_count, + fdb_bool_t* out_more) { + CATCH_AND_RETURN(Standalone rrr = TSAV(Standalone, f)->get(); + *out_kvm = (FDBMappedKeyValue*)rrr.begin(); + *out_count = rrr.size(); + *out_more = rrr.more;); +} + extern "C" DLLEXPORT fdb_error_t fdb_future_get_string_array(FDBFuture* f, const char*** out_strings, int* out_count) { CATCH_AND_RETURN(Standalone> na = TSAV(Standalone>, f)->get(); *out_strings = (const char**)na.begin(); @@ 
-571,29 +581,29 @@ FDBFuture* fdb_transaction_get_range_impl(FDBTransaction* tr, .extractPtr()); } -FDBFuture* fdb_transaction_get_range_and_flat_map_impl(FDBTransaction* tr, - uint8_t const* begin_key_name, - int begin_key_name_length, - fdb_bool_t begin_or_equal, - int begin_offset, - uint8_t const* end_key_name, - int end_key_name_length, - fdb_bool_t end_or_equal, - int end_offset, - uint8_t const* mapper_name, - int mapper_name_length, - int limit, - int target_bytes, - FDBStreamingMode mode, - int iteration, - fdb_bool_t snapshot, - fdb_bool_t reverse) { +FDBFuture* fdb_transaction_get_mapped_range_impl(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + fdb_bool_t begin_or_equal, + int begin_offset, + uint8_t const* end_key_name, + int end_key_name_length, + fdb_bool_t end_or_equal, + int end_offset, + uint8_t const* mapper_name, + int mapper_name_length, + int limit, + int target_bytes, + FDBStreamingMode mode, + int iteration, + fdb_bool_t snapshot, + fdb_bool_t reverse) { FDBFuture* r = validate_and_update_parameters(limit, target_bytes, mode, iteration, reverse); if (r != nullptr) return r; return ( FDBFuture*)(TXN(tr) - ->getRangeAndFlatMap( + ->getMappedRange( KeySelectorRef(KeyRef(begin_key_name, begin_key_name_length), begin_or_equal, begin_offset), KeySelectorRef(KeyRef(end_key_name, end_key_name_length), end_or_equal, end_offset), StringRef(mapper_name, mapper_name_length), @@ -604,23 +614,23 @@ FDBFuture* fdb_transaction_get_range_and_flat_map_impl(FDBTransaction* tr, } // TODO: Support FDB_API_ADDED in generate_asm.py and then this can be replaced with fdb_api_ptr_unimpl. 
-FDBFuture* fdb_transaction_get_range_and_flat_map_v699(FDBTransaction* tr, - uint8_t const* begin_key_name, - int begin_key_name_length, - fdb_bool_t begin_or_equal, - int begin_offset, - uint8_t const* end_key_name, - int end_key_name_length, - fdb_bool_t end_or_equal, - int end_offset, - uint8_t const* mapper_name, - int mapper_name_length, - int limit, - int target_bytes, - FDBStreamingMode mode, - int iteration, - fdb_bool_t snapshot, - fdb_bool_t reverse) { +FDBFuture* fdb_transaction_get_mapped_range_v699(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + fdb_bool_t begin_or_equal, + int begin_offset, + uint8_t const* end_key_name, + int end_key_name_length, + fdb_bool_t end_or_equal, + int end_offset, + uint8_t const* mapper_name, + int mapper_name_length, + int limit, + int target_bytes, + FDBStreamingMode mode, + int iteration, + fdb_bool_t snapshot, + fdb_bool_t reverse) { fprintf(stderr, "UNIMPLEMENTED FDB API FUNCTION\n"); abort(); } @@ -857,7 +867,7 @@ extern "C" DLLEXPORT fdb_error_t fdb_select_api_version_impl(int runtime_version // WARNING: use caution when implementing removed functions by calling public API functions. This can lead to // undesired behavior when using the multi-version API. Instead, it is better to have both the removed and public // functions call an internal implementation function. See fdb_create_database_impl for an example. 
- FDB_API_CHANGED(fdb_transaction_get_range_and_flat_map, 700); + FDB_API_CHANGED(fdb_transaction_get_mapped_range, 700); FDB_API_REMOVED(fdb_future_get_version, 620); FDB_API_REMOVED(fdb_create_cluster, 610); FDB_API_REMOVED(fdb_cluster_create_database, 610); diff --git a/bindings/c/foundationdb/fdb_c.h b/bindings/c/foundationdb/fdb_c.h index 9779582b75..5fc64eb741 100644 --- a/bindings/c/foundationdb/fdb_c.h +++ b/bindings/c/foundationdb/fdb_c.h @@ -113,6 +113,64 @@ typedef struct keyvalue { int value_length; } FDBKeyValue; #endif + +#pragma pack(pop) + +/* Memory layout of KeySelectorRef. */ +typedef struct keyselector { + FDBKey key; + /* orEqual and offset have not be tested in C binding. Just a placeholder. */ + fdb_bool_t orEqual; + int offset; +} FDBKeySelector; + +/* Memory layout of GetRangeReqAndResultRef. */ +typedef struct getrangereqandresult { + FDBKeySelector begin; + FDBKeySelector end; + FDBKeyValue* data; + int m_size, m_capacity; +} FDBGetRangeReqAndResult; + +/* Memory layout of MappedKeyValueRef. + +Total 112 bytes +- key (12 bytes) +:74:8F:8E:5F:AE:7F:00:00 +:4A:00:00:00 +- value (12 bytes) +:70:8F:8E:5F:AE:7F:00:00 +:00:00:00:00 +- begin selector (20 bytes) +:30:8F:8E:5F:AE:7F:00:00 +:2D:00:00:00 +:00:7F:00:00 +:01:00:00:00 +- end selector (20 bytes) +:EC:8E:8E:5F:AE:7F:00:00 +:2D:00:00:00 +:00:2B:3C:60 +:01:00:00:00 +- vector (16 bytes) +:74:94:8E:5F:AE:7F:00:00 +:01:00:00:00 +:01:00:00:00 +- buffer (32 bytes) +:00:20:D1:61:00:00:00:00 +:00:00:00:00:00:00:00:00 +:00:00:00:00:00:00:00:00 +:01:00:00:00:AE:7F:00:00 +*/ +typedef struct mappedkeyvalue { + FDBKey key; + FDBKey value; + /* It's complicated to map a std::variant to C. For now we assume the underlying requests are always getRange and + * take the shortcut. 
*/ + FDBGetRangeReqAndResult getRange; + unsigned char buffer[32]; +} FDBMappedKeyValue; + +#pragma pack(push, 4) typedef struct keyrange { const uint8_t* begin_key; int begin_key_length; @@ -176,6 +234,12 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_keyvalue_array(FDBFuture int* out_count, fdb_bool_t* out_more); #endif + +DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_mappedkeyvalue_array(FDBFuture* f, + FDBMappedKeyValue const** out_kv, + int* out_count, + fdb_bool_t* out_more); + DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_key_array(FDBFuture* f, FDBKey const** out_key_array, int* out_count); @@ -283,23 +347,23 @@ DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_range(FDBTransaction fdb_bool_t reverse); #endif -DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_range_and_flat_map(FDBTransaction* tr, - uint8_t const* begin_key_name, - int begin_key_name_length, - fdb_bool_t begin_or_equal, - int begin_offset, - uint8_t const* end_key_name, - int end_key_name_length, - fdb_bool_t end_or_equal, - int end_offset, - uint8_t const* mapper_name, - int mapper_name_length, - int limit, - int target_bytes, - FDBStreamingMode mode, - int iteration, - fdb_bool_t snapshot, - fdb_bool_t reverse); +DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_mapped_range(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + fdb_bool_t begin_or_equal, + int begin_offset, + uint8_t const* end_key_name, + int end_key_name_length, + fdb_bool_t end_or_equal, + int end_offset, + uint8_t const* mapper_name, + int mapper_name_length, + int limit, + int target_bytes, + FDBStreamingMode mode, + int iteration, + fdb_bool_t snapshot, + fdb_bool_t reverse); DLLEXPORT void fdb_transaction_set(FDBTransaction* tr, uint8_t const* key_name, diff --git a/bindings/c/test/unit/fdb_api.cpp b/bindings/c/test/unit/fdb_api.cpp index f91868e14a..301cc5832b 100644 --- a/bindings/c/test/unit/fdb_api.cpp +++ 
b/bindings/c/test/unit/fdb_api.cpp @@ -90,6 +90,14 @@ void Future::cancel() { return fdb_future_get_keyvalue_array(future_, out_kv, out_count, out_more); } +// MappedKeyValueArrayFuture + +[[nodiscard]] fdb_error_t MappedKeyValueArrayFuture::get(const FDBMappedKeyValue** out_kv, + int* out_count, + fdb_bool_t* out_more) { + return fdb_future_get_mappedkeyvalue_array(future_, out_kv, out_count, out_more); +} + // Result Result::~Result() { @@ -210,7 +218,7 @@ KeyValueArrayFuture Transaction::get_range(const uint8_t* begin_key_name, reverse)); } -KeyValueArrayFuture Transaction::get_range_and_flat_map(const uint8_t* begin_key_name, +MappedKeyValueArrayFuture Transaction::get_mapped_range(const uint8_t* begin_key_name, int begin_key_name_length, fdb_bool_t begin_or_equal, int begin_offset, @@ -226,7 +234,7 @@ KeyValueArrayFuture Transaction::get_range_and_flat_map(const uint8_t* begin_key int iteration, fdb_bool_t snapshot, fdb_bool_t reverse) { - return KeyValueArrayFuture(fdb_transaction_get_range_and_flat_map(tr_, + return MappedKeyValueArrayFuture(fdb_transaction_get_mapped_range(tr_, begin_key_name, begin_key_name_length, begin_or_equal, diff --git a/bindings/c/test/unit/fdb_api.hpp b/bindings/c/test/unit/fdb_api.hpp index a583e33fbb..63ee9573c8 100644 --- a/bindings/c/test/unit/fdb_api.hpp +++ b/bindings/c/test/unit/fdb_api.hpp @@ -135,6 +135,18 @@ private: KeyValueArrayFuture(FDBFuture* f) : Future(f) {} }; +class MappedKeyValueArrayFuture : public Future { +public: + // Call this function instead of fdb_future_get_mappedkeyvalue_array when using + // the MappedKeyValueArrayFuture type. Its behavior is identical to + // fdb_future_get_mappedkeyvalue_array. 
+ fdb_error_t get(const FDBMappedKeyValue** out_kv, int* out_count, fdb_bool_t* out_more); + +private: + friend class Transaction; + MappedKeyValueArrayFuture(FDBFuture* f) : Future(f) {} +}; + class KeyRangeArrayFuture : public Future { public: // Call this function instead of fdb_future_get_keyrange_array when using @@ -254,7 +266,7 @@ public: // WARNING: This feature is considered experimental at this time. It is only allowed when using snapshot isolation // AND disabling read-your-writes. Returns a future which will be set to an FDBKeyValue array. - KeyValueArrayFuture get_range_and_flat_map(const uint8_t* begin_key_name, + MappedKeyValueArrayFuture get_mapped_range(const uint8_t* begin_key_name, int begin_key_name_length, fdb_bool_t begin_or_equal, int begin_offset, diff --git a/bindings/c/test/unit/unit_tests.cpp b/bindings/c/test/unit/unit_tests.cpp index 420a16dc3c..d96f5ccdfc 100644 --- a/bindings/c/test/unit/unit_tests.cpp +++ b/bindings/c/test/unit/unit_tests.cpp @@ -173,6 +173,20 @@ struct GetRangeResult { fdb_error_t err; }; +struct GetMappedRangeResult { + std::vector> // range results + >> + mkvs; + // True if values remain in the key range requested. + bool more; + // Set to a non-zero value if an error occurred during the transaction. + fdb_error_t err; +}; + // Helper function to get a range of kv pairs. Returns a GetRangeResult struct // containing the results of the range read. Caller is responsible for checking // error on failure and retrying if necessary. 
@@ -225,7 +239,11 @@ GetRangeResult get_range(fdb::Transaction& tr, return GetRangeResult{ results, out_more != 0, 0 }; } -GetRangeResult get_range_and_flat_map(fdb::Transaction& tr, +static inline std::string extractString(FDBKey key) { + return std::string((const char*)key.key, key.key_length); +} + +GetMappedRangeResult get_mapped_range(fdb::Transaction& tr, const uint8_t* begin_key_name, int begin_key_name_length, fdb_bool_t begin_or_equal, @@ -242,7 +260,7 @@ GetRangeResult get_range_and_flat_map(fdb::Transaction& tr, int iteration, fdb_bool_t snapshot, fdb_bool_t reverse) { - fdb::KeyValueArrayFuture f1 = tr.get_range_and_flat_map(begin_key_name, + fdb::MappedKeyValueArrayFuture f1 = tr.get_mapped_range(begin_key_name, begin_key_name_length, begin_or_equal, begin_offset, @@ -261,21 +279,41 @@ GetRangeResult get_range_and_flat_map(fdb::Transaction& tr, fdb_error_t err = wait_future(f1); if (err) { - return GetRangeResult{ {}, false, err }; + return GetMappedRangeResult{ {}, false, err }; } - const FDBKeyValue* out_kv; + const FDBMappedKeyValue* out_mkv; int out_count; fdb_bool_t out_more; - fdb_check(f1.get(&out_kv, &out_count, &out_more)); - std::vector> results; + fdb_check(f1.get(&out_mkv, &out_count, &out_more)); + + GetMappedRangeResult result; + result.more = (out_more != 0); + result.err = 0; + + // std::cout << "out_count:" << out_count << " out_more:" << out_more << " out_mkv:" << (void*)out_mkv << + // std::endl; + for (int i = 0; i < out_count; ++i) { - std::string key((const char*)out_kv[i].key, out_kv[i].key_length); - std::string value((const char*)out_kv[i].value, out_kv[i].value_length); - results.emplace_back(key, value); + FDBMappedKeyValue mkv = out_mkv[i]; + auto key = extractString(mkv.key); + auto value = extractString(mkv.value); + auto begin = extractString(mkv.getRange.begin.key); + auto end = extractString(mkv.getRange.end.key); + // std::cout << "key:" << key << " value:" << value << " begin:" << begin << " end:" << end << std::endl; 
+ + std::vector> range_results; + for (int i = 0; i < mkv.getRange.m_size; ++i) { + const auto& kv = mkv.getRange.data[i]; + std::string k((const char*)kv.key, kv.key_length); + std::string v((const char*)kv.value, kv.value_length); + range_results.emplace_back(k, v); + // std::cout << "[" << i << "]" << k << " -> " << v << std::endl; + } + result.mkvs.emplace_back(key, value, begin, end, range_results); } - return GetRangeResult{ results, out_more != 0, 0 }; + return result; } // Clears all data in the database. @@ -888,32 +926,35 @@ static Value dataOfRecord(const int i) { static std::string indexEntryKey(const int i) { return Tuple().append(StringRef(prefix)).append(INDEX).append(indexKey(i)).append(primaryKey(i)).pack().toString(); } -static std::string recordKey(const int i) { - return Tuple().append(prefix).append(RECORD).append(primaryKey(i)).pack().toString(); +static std::string recordKey(const int i, const int split) { + return Tuple().append(prefix).append(RECORD).append(primaryKey(i)).append(split).pack().toString(); } -static std::string recordValue(const int i) { - return Tuple().append(dataOfRecord(i)).pack().toString(); +static std::string recordValue(const int i, const int split) { + return Tuple().append(dataOfRecord(i)).append(split).pack().toString(); } +const static int SPLIT_SIZE = 3; std::map fillInRecords(int n) { // Note: The user requested `prefix` should be added as the first element of the tuple that forms the key, rather // than the prefix of the key. So we don't use key() or create_data() in this test. 
std::map data; for (int i = 0; i < n; i++) { data[indexEntryKey(i)] = EMPTY; - data[recordKey(i)] = recordValue(i); + for (int split = 0; split < SPLIT_SIZE; split++) { + data[recordKey(i, split)] = recordValue(i, split); + } } insert_data(db, data); return data; } -GetRangeResult getIndexEntriesAndMap(int beginId, int endId, fdb::Transaction& tr) { +GetMappedRangeResult getMappedIndexEntries(int beginId, int endId, fdb::Transaction& tr) { std::string indexEntryKeyBegin = indexEntryKey(beginId); std::string indexEntryKeyEnd = indexEntryKey(endId); - std::string mapper = Tuple().append(prefix).append(RECORD).append("{K[3]}"_sr).pack().toString(); + std::string mapper = Tuple().append(prefix).append(RECORD).append("{K[3]}"_sr).append("{...}"_sr).pack().toString(); - return get_range_and_flat_map( + return get_mapped_range( tr, FDB_KEYSEL_FIRST_GREATER_OR_EQUAL((const uint8_t*)indexEntryKeyBegin.c_str(), indexEntryKeyBegin.size()), FDB_KEYSEL_FIRST_GREATER_OR_EQUAL((const uint8_t*)indexEntryKeyEnd.c_str(), indexEntryKeyEnd.size()), @@ -923,20 +964,20 @@ GetRangeResult getIndexEntriesAndMap(int beginId, int endId, fdb::Transaction& t /* target_bytes */ 0, /* FDBStreamingMode */ FDB_STREAMING_MODE_WANT_ALL, /* iteration */ 0, - /* snapshot */ true, + /* snapshot */ false, /* reverse */ 0); } -TEST_CASE("fdb_transaction_get_range_and_flat_map") { - fillInRecords(20); +TEST_CASE("fdb_transaction_get_mapped_range") { + const int TOTAL_RECORDS = 20; + fillInRecords(TOTAL_RECORDS); fdb::Transaction tr(db); - // get_range_and_flat_map is only support without RYW. This is a must!!! - fdb_check(tr.set_option(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE, nullptr, 0)); + // RYW should be enabled. 
while (1) { int beginId = 1; int endId = 19; - auto result = getIndexEntriesAndMap(beginId, endId, tr); + auto result = getMappedIndexEntries(beginId, endId, tr); if (result.err) { fdb::EmptyFuture f1 = tr.on_error(result.err); @@ -945,32 +986,30 @@ TEST_CASE("fdb_transaction_get_range_and_flat_map") { } int expectSize = endId - beginId; - CHECK(result.kvs.size() == expectSize); + CHECK(result.mkvs.size() == expectSize); CHECK(!result.more); int id = beginId; - for (int i = 0; i < result.kvs.size(); i++, id++) { - const auto& [key, value] = result.kvs[i]; - CHECK(recordKey(id).compare(key) == 0); - CHECK(recordValue(id).compare(value) == 0); + for (int i = 0; i < expectSize; i++, id++) { + const auto& [key, value, begin, end, range_results] = result.mkvs[i]; + CHECK(indexEntryKey(id).compare(key) == 0); + CHECK(EMPTY.compare(value) == 0); + CHECK(range_results.size() == SPLIT_SIZE); + for (int split = 0; split < SPLIT_SIZE; split++) { + auto& [k, v] = range_results[split]; + CHECK(recordKey(id, split).compare(k) == 0); + CHECK(recordValue(id, split).compare(v) == 0); + } } break; } } -TEST_CASE("fdb_transaction_get_range_and_flat_map get_key_values_and_map_has_more") { - fillInRecords(2000); - fdb::Transaction tr(db); - fdb_check(tr.set_option(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE, nullptr, 0)); - auto result = getIndexEntriesAndMap(100, 1900, tr); - CHECK(result.err == error_code_get_key_values_and_map_has_more); -} - -TEST_CASE("fdb_transaction_get_range_and_flat_map_restricted_to_snapshot") { +TEST_CASE("fdb_transaction_get_mapped_range_restricted_to_serializable") { std::string mapper = Tuple().append(prefix).append(RECORD).append("{K[3]}"_sr).pack().toString(); fdb::Transaction tr(db); fdb_check(tr.set_option(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE, nullptr, 0)); - auto result = get_range_and_flat_map( + auto result = get_mapped_range( tr, FDB_KEYSEL_FIRST_GREATER_OR_EQUAL((const uint8_t*)indexEntryKey(0).c_str(), indexEntryKey(0).size()), 
FDB_KEYSEL_FIRST_GREATER_THAN((const uint8_t*)indexEntryKey(1).c_str(), indexEntryKey(1).size()), @@ -980,16 +1019,16 @@ TEST_CASE("fdb_transaction_get_range_and_flat_map_restricted_to_snapshot") { /* target_bytes */ 0, /* FDBStreamingMode */ FDB_STREAMING_MODE_WANT_ALL, /* iteration */ 0, - /* snapshot */ false, // Set snapshot to false + /* snapshot */ true, // Set snapshot to true /* reverse */ 0); - ASSERT(result.err == error_code_client_invalid_operation); + ASSERT(result.err == error_code_unsupported_operation); } -TEST_CASE("fdb_transaction_get_range_and_flat_map_restricted_to_ryw_disable") { +TEST_CASE("fdb_transaction_get_mapped_range_restricted_to_ryw_enable") { std::string mapper = Tuple().append(prefix).append(RECORD).append("{K[3]}"_sr).pack().toString(); fdb::Transaction tr(db); - // Not set FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE. - auto result = get_range_and_flat_map( + fdb_check(tr.set_option(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE, nullptr, 0)); // Not disable RYW + auto result = get_mapped_range( tr, FDB_KEYSEL_FIRST_GREATER_OR_EQUAL((const uint8_t*)indexEntryKey(0).c_str(), indexEntryKey(0).size()), FDB_KEYSEL_FIRST_GREATER_THAN((const uint8_t*)indexEntryKey(1).c_str(), indexEntryKey(1).size()), @@ -1001,7 +1040,7 @@ TEST_CASE("fdb_transaction_get_range_and_flat_map_restricted_to_ryw_disable") { /* iteration */ 0, /* snapshot */ true, /* reverse */ 0); - ASSERT(result.err == error_code_client_invalid_operation); + ASSERT(result.err == error_code_unsupported_operation); } TEST_CASE("fdb_transaction_get_range reverse") { diff --git a/bindings/java/CMakeLists.txt b/bindings/java/CMakeLists.txt index 9dfb349040..9adf24a2f7 100644 --- a/bindings/java/CMakeLists.txt +++ b/bindings/java/CMakeLists.txt @@ -27,6 +27,8 @@ set(JAVA_BINDING_SRCS src/main/com/apple/foundationdb/directory/package-info.java src/main/com/apple/foundationdb/directory/PathUtil.java src/main/com/apple/foundationdb/DirectBufferIterator.java + 
src/main/com/apple/foundationdb/RangeResultDirectBufferIterator.java + src/main/com/apple/foundationdb/MappedRangeResultDirectBufferIterator.java src/main/com/apple/foundationdb/DirectBufferPool.java src/main/com/apple/foundationdb/FDB.java src/main/com/apple/foundationdb/FDBDatabase.java @@ -36,11 +38,13 @@ set(JAVA_BINDING_SRCS src/main/com/apple/foundationdb/FutureKeyArray.java src/main/com/apple/foundationdb/FutureResult.java src/main/com/apple/foundationdb/FutureResults.java + src/main/com/apple/foundationdb/FutureMappedResults.java src/main/com/apple/foundationdb/FutureStrings.java src/main/com/apple/foundationdb/FutureVoid.java src/main/com/apple/foundationdb/JNIUtil.java src/main/com/apple/foundationdb/KeySelector.java src/main/com/apple/foundationdb/KeyValue.java + src/main/com/apple/foundationdb/MappedKeyValue.java src/main/com/apple/foundationdb/LocalityUtil.java src/main/com/apple/foundationdb/NativeFuture.java src/main/com/apple/foundationdb/NativeObjectWrapper.java @@ -49,9 +53,12 @@ set(JAVA_BINDING_SRCS src/main/com/apple/foundationdb/package-info.java src/main/com/apple/foundationdb/Range.java src/main/com/apple/foundationdb/RangeQuery.java + src/main/com/apple/foundationdb/MappedRangeQuery.java src/main/com/apple/foundationdb/KeyArrayResult.java src/main/com/apple/foundationdb/RangeResult.java + src/main/com/apple/foundationdb/MappedRangeResult.java src/main/com/apple/foundationdb/RangeResultInfo.java + src/main/com/apple/foundationdb/MappedRangeResultInfo.java src/main/com/apple/foundationdb/RangeResultSummary.java src/main/com/apple/foundationdb/ReadTransaction.java src/main/com/apple/foundationdb/ReadTransactionContext.java diff --git a/bindings/java/fdbJNI.cpp b/bindings/java/fdbJNI.cpp index b330b210b9..1032c418e2 100644 --- a/bindings/java/fdbJNI.cpp +++ b/bindings/java/fdbJNI.cpp @@ -20,6 +20,7 @@ #include #include +#include #include "com_apple_foundationdb_FDB.h" #include "com_apple_foundationdb_FDBDatabase.h" @@ -50,10 +51,14 @@ static 
thread_local jmethodID g_IFutureCallback_call_methodID = JNI_NULL; static thread_local bool is_external = false; static jclass range_result_summary_class; static jclass range_result_class; +static jclass mapped_range_result_class; +static jclass mapped_key_value_class; static jclass string_class; static jclass key_array_result_class; static jmethodID key_array_result_init; static jmethodID range_result_init; +static jmethodID mapped_range_result_init; +static jmethodID mapped_key_value_from_bytes; static jmethodID range_result_summary_init; void detachIfExternalThread(void* ignore) { @@ -478,6 +483,127 @@ JNIEXPORT jobject JNICALL Java_com_apple_foundationdb_FutureResults_FutureResult return result; } +class ExecuteOnLeave { + std::function func; + +public: + explicit ExecuteOnLeave(std::function func) : func(func) {} + ~ExecuteOnLeave() { func(); } +}; + +void cpBytesAndLengthInner(uint8_t*& pByte, jint*& pLength, const uint8_t* data, const int& length) { + *pLength = length; + pLength++; + + memcpy(pByte, data, length); + pByte += length; +} + +void cpBytesAndLength(uint8_t*& pByte, jint*& pLength, const FDBKey& key) { + cpBytesAndLengthInner(pByte, pLength, key.key, key.key_length); +} + +JNIEXPORT jobject JNICALL Java_com_apple_foundationdb_FutureMappedResults_FutureMappedResults_1get(JNIEnv* jenv, + jobject, + jlong future) { + if (!future) { + throwParamNotNull(jenv); + return JNI_NULL; + } + + FDBFuture* f = (FDBFuture*)future; + + const FDBMappedKeyValue* kvms; + int count; + fdb_bool_t more; + fdb_error_t err = fdb_future_get_mappedkeyvalue_array(f, &kvms, &count, &more); + if (err) { + safeThrow(jenv, getThrowable(jenv, err)); + return JNI_NULL; + } + + jobjectArray mrr_values = jenv->NewObjectArray(count, mapped_key_value_class, NULL); + if (!mrr_values) { + if (!jenv->ExceptionOccurred()) + throwOutOfMem(jenv); + return JNI_NULL; + } + + for (int i = 0; i < count; i++) { + FDBMappedKeyValue kvm = kvms[i]; + int kvm_count = kvm.getRange.m_size; + + const 
int totalLengths = 4 + kvm_count * 2; + + int totalBytes = kvm.key.key_length + kvm.value.key_length + kvm.getRange.begin.key.key_length + + kvm.getRange.end.key.key_length; + for (int i = 0; i < kvm_count; i++) { + auto kv = kvm.getRange.data[i]; + totalBytes += kv.key_length + kv.value_length; + } + + jbyteArray bytesArray = jenv->NewByteArray(totalBytes); + if (!bytesArray) { + if (!jenv->ExceptionOccurred()) + throwOutOfMem(jenv); + return JNI_NULL; + } + + jintArray lengthArray = jenv->NewIntArray(totalLengths); + if (!lengthArray) { + if (!jenv->ExceptionOccurred()) + throwOutOfMem(jenv); + return JNI_NULL; + } + + uint8_t* bytes_barr = (uint8_t*)jenv->GetByteArrayElements(bytesArray, JNI_NULL); + if (!bytes_barr) { + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return JNI_NULL; + } + { + ExecuteOnLeave e([&]() { jenv->ReleaseByteArrayElements(bytesArray, (jbyte*)bytes_barr, 0); }); + + jint* length_barr = jenv->GetIntArrayElements(lengthArray, JNI_NULL); + if (!length_barr) { + if (!jenv->ExceptionOccurred()) + throwOutOfMem(jenv); + return JNI_NULL; + } + { + ExecuteOnLeave e([&]() { jenv->ReleaseIntArrayElements(lengthArray, length_barr, 0); }); + + uint8_t* pByte = bytes_barr; + jint* pLength = length_barr; + + cpBytesAndLength(pByte, pLength, kvm.key); + cpBytesAndLength(pByte, pLength, kvm.value); + cpBytesAndLength(pByte, pLength, kvm.getRange.begin.key); + cpBytesAndLength(pByte, pLength, kvm.getRange.end.key); + for (int kvm_i = 0; kvm_i < kvm_count; kvm_i++) { + auto kv = kvm.getRange.data[kvm_i]; + cpBytesAndLengthInner(pByte, pLength, kv.key, kv.key_length); + cpBytesAndLengthInner(pByte, pLength, kv.value, kv.value_length); + } + } + } + // After native arrays are released + jobject mkv = jenv->CallStaticObjectMethod( + mapped_key_value_class, mapped_key_value_from_bytes, (jbyteArray)bytesArray, (jintArray)lengthArray); + if (jenv->ExceptionOccurred()) + return JNI_NULL; + jenv->SetObjectArrayElement(mrr_values, i, mkv); + 
if (jenv->ExceptionOccurred()) + return JNI_NULL; + } + + jobject mrr = jenv->NewObject(mapped_range_result_class, mapped_range_result_init, mrr_values, (jboolean)more); + if (jenv->ExceptionOccurred()) + return JNI_NULL; + + return mrr; +} + // SOMEDAY: explore doing this more efficiently with Direct ByteBuffers JNIEXPORT jbyteArray JNICALL Java_com_apple_foundationdb_FutureResult_FutureResult_1get(JNIEnv* jenv, jobject, @@ -767,23 +893,22 @@ JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBTransaction_Transaction_1 return (jlong)f; } -JNIEXPORT jlong JNICALL -Java_com_apple_foundationdb_FDBTransaction_Transaction_1getRangeAndFlatMap(JNIEnv* jenv, - jobject, - jlong tPtr, - jbyteArray keyBeginBytes, - jboolean orEqualBegin, - jint offsetBegin, - jbyteArray keyEndBytes, - jboolean orEqualEnd, - jint offsetEnd, - jbyteArray mapperBytes, - jint rowLimit, - jint targetBytes, - jint streamingMode, - jint iteration, - jboolean snapshot, - jboolean reverse) { +JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBTransaction_Transaction_1getMappedRange(JNIEnv* jenv, + jobject, + jlong tPtr, + jbyteArray keyBeginBytes, + jboolean orEqualBegin, + jint offsetBegin, + jbyteArray keyEndBytes, + jboolean orEqualEnd, + jint offsetEnd, + jbyteArray mapperBytes, + jint rowLimit, + jint targetBytes, + jint streamingMode, + jint iteration, + jboolean snapshot, + jboolean reverse) { if (!tPtr || !keyBeginBytes || !keyEndBytes || !mapperBytes) { throwParamNotNull(jenv); return 0; @@ -814,23 +939,23 @@ Java_com_apple_foundationdb_FDBTransaction_Transaction_1getRangeAndFlatMap(JNIEn return 0; } - FDBFuture* f = fdb_transaction_get_range_and_flat_map(tr, - barrBegin, - jenv->GetArrayLength(keyBeginBytes), - orEqualBegin, - offsetBegin, - barrEnd, - jenv->GetArrayLength(keyEndBytes), - orEqualEnd, - offsetEnd, - barrMapper, - jenv->GetArrayLength(mapperBytes), - rowLimit, - targetBytes, - (FDBStreamingMode)streamingMode, - iteration, - snapshot, - reverse); + FDBFuture* f = 
fdb_transaction_get_mapped_range(tr, + barrBegin, + jenv->GetArrayLength(keyBeginBytes), + orEqualBegin, + offsetBegin, + barrEnd, + jenv->GetArrayLength(keyEndBytes), + orEqualEnd, + offsetEnd, + barrMapper, + jenv->GetArrayLength(mapperBytes), + rowLimit, + targetBytes, + (FDBStreamingMode)streamingMode, + iteration, + snapshot, + reverse); jenv->ReleaseByteArrayElements(keyBeginBytes, (jbyte*)barrBegin, JNI_ABORT); jenv->ReleaseByteArrayElements(keyEndBytes, (jbyte*)barrEnd, JNI_ABORT); jenv->ReleaseByteArrayElements(mapperBytes, (jbyte*)barrMapper, JNI_ABORT); @@ -842,7 +967,6 @@ JNIEXPORT void JNICALL Java_com_apple_foundationdb_FutureResults_FutureResults_1 jlong future, jobject jbuffer, jint bufferCapacity) { - if (!future) { throwParamNotNull(jenv); return; @@ -902,6 +1026,92 @@ JNIEXPORT void JNICALL Java_com_apple_foundationdb_FutureResults_FutureResults_1 } } +void memcpyStringInner(uint8_t* buffer, int& offset, const uint8_t* data, const int& length) { + memcpy(buffer + offset, &length, sizeof(jint)); + offset += sizeof(jint); + memcpy(buffer + offset, data, length); + offset += length; +} + +void memcpyString(uint8_t* buffer, int& offset, const FDBKey& key) { + memcpyStringInner(buffer, offset, key.key, key.key_length); +} + +JNIEXPORT void JNICALL +Java_com_apple_foundationdb_FutureMappedResults_FutureMappedResults_1getDirect(JNIEnv* jenv, + jobject, + jlong future, + jobject jbuffer, + jint bufferCapacity) { + + if (!future) { + throwParamNotNull(jenv); + return; + } + + uint8_t* buffer = (uint8_t*)jenv->GetDirectBufferAddress(jbuffer); + if (!buffer) { + if (!jenv->ExceptionOccurred()) + throwRuntimeEx(jenv, "Error getting handle to native resources"); + return; + } + + FDBFuture* f = (FDBFuture*)future; + const FDBMappedKeyValue* kvms; + int count; + fdb_bool_t more; + fdb_error_t err = fdb_future_get_mappedkeyvalue_array(f, &kvms, &count, &more); + if (err) { + safeThrow(jenv, getThrowable(jenv, err)); + return; + } + + int totalCapacityNeeded = 2 
* sizeof(jint); + for (int i = 0; i < count; i++) { + const FDBMappedKeyValue& kvm = kvms[i]; + totalCapacityNeeded += kvm.key.key_length + kvm.value.key_length + kvm.getRange.begin.key.key_length + + kvm.getRange.end.key.key_length + + 5 * sizeof(jint); // Besides the 4 lengths above, also one for kvm_count. + int kvm_count = kvm.getRange.m_size; + for (int i = 0; i < kvm_count; i++) { + auto kv = kvm.getRange.data[i]; + totalCapacityNeeded += kv.key_length + kv.value_length + 2 * sizeof(jint); + } + if (bufferCapacity < totalCapacityNeeded) { + count = i; /* Only fit first `i` K/V pairs */ + more = true; + break; + } + } + + int offset = 0; + + // First copy RangeResultSummary, i.e. [keyCount, more] + memcpy(buffer + offset, &count, sizeof(jint)); + offset += sizeof(jint); + + memcpy(buffer + offset, &more, sizeof(jint)); + offset += sizeof(jint); + + for (int i = 0; i < count; i++) { + const FDBMappedKeyValue& kvm = kvms[i]; + memcpyString(buffer, offset, kvm.key); + memcpyString(buffer, offset, kvm.value); + memcpyString(buffer, offset, kvm.getRange.begin.key); + memcpyString(buffer, offset, kvm.getRange.end.key); + + int kvm_count = kvm.getRange.m_size; + memcpy(buffer + offset, &kvm_count, sizeof(jint)); + offset += sizeof(jint); + + for (int i = 0; i < kvm_count; i++) { + auto kv = kvm.getRange.data[i]; + memcpyStringInner(buffer, offset, kv.key, kv.key_length); + memcpyStringInner(buffer, offset, kv.value, kv.value_length); + } + } +} + JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBTransaction_Transaction_1getEstimatedRangeSizeBytes(JNIEnv* jenv, jobject, @@ -1396,6 +1606,16 @@ jint JNI_OnLoad(JavaVM* vm, void* reserved) { range_result_init = env->GetMethodID(local_range_result_class, "", "([B[IZ)V"); range_result_class = (jclass)(env)->NewGlobalRef(local_range_result_class); + jclass local_mapped_range_result_class = env->FindClass("com/apple/foundationdb/MappedRangeResult"); + mapped_range_result_init = + 
env->GetMethodID(local_mapped_range_result_class, "", "([Lcom/apple/foundationdb/MappedKeyValue;Z)V"); + mapped_range_result_class = (jclass)(env)->NewGlobalRef(local_mapped_range_result_class); + + jclass local_mapped_key_value_class = env->FindClass("com/apple/foundationdb/MappedKeyValue"); + mapped_key_value_from_bytes = env->GetStaticMethodID( + local_mapped_key_value_class, "fromBytes", "([B[I)Lcom/apple/foundationdb/MappedKeyValue;"); + mapped_key_value_class = (jclass)(env)->NewGlobalRef(local_mapped_key_value_class); + jclass local_key_array_result_class = env->FindClass("com/apple/foundationdb/KeyArrayResult"); key_array_result_init = env->GetMethodID(local_key_array_result_class, "", "([B[I)V"); key_array_result_class = (jclass)(env)->NewGlobalRef(local_key_array_result_class); @@ -1424,6 +1644,12 @@ void JNI_OnUnload(JavaVM* vm, void* reserved) { if (range_result_class != JNI_NULL) { env->DeleteGlobalRef(range_result_class); } + if (mapped_range_result_class != JNI_NULL) { + env->DeleteGlobalRef(mapped_range_result_class); + } + if (mapped_key_value_class != JNI_NULL) { + env->DeleteGlobalRef(mapped_key_value_class); + } if (string_class != JNI_NULL) { env->DeleteGlobalRef(string_class); } diff --git a/bindings/java/src/integration/com/apple/foundationdb/RangeAndFlatMapQueryIntegrationTest.java b/bindings/java/src/integration/com/apple/foundationdb/MappedRangeQueryIntegrationTest.java similarity index 63% rename from bindings/java/src/integration/com/apple/foundationdb/RangeAndFlatMapQueryIntegrationTest.java rename to bindings/java/src/integration/com/apple/foundationdb/MappedRangeQueryIntegrationTest.java index c97ce1f750..f8661f716c 100644 --- a/bindings/java/src/integration/com/apple/foundationdb/RangeAndFlatMapQueryIntegrationTest.java +++ b/bindings/java/src/integration/com/apple/foundationdb/MappedRangeQueryIntegrationTest.java @@ -1,5 +1,5 @@ /* - * RangeAndFlatMapQueryIntegrationTest.java + * MappedRangeQueryIntegrationTest.java * * This source 
file is part of the FoundationDB open source project * @@ -40,7 +40,7 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; @ExtendWith(RequiresDatabase.class) -class RangeAndFlatMapQueryIntegrationTest { +class MappedRangeQueryIntegrationTest { private static final FDB fdb = FDB.selectAPIVersion(710); public String databaseArg = null; private Database openFDB() { return fdb.open(databaseArg); } @@ -67,16 +67,27 @@ class RangeAndFlatMapQueryIntegrationTest { static private String indexKey(int i) { return String.format("index-key-of-record-%08d", i); } static private String dataOfRecord(int i) { return String.format("data-of-record-%08d", i); } - static byte[] MAPPER = Tuple.from(PREFIX, RECORD, "{K[3]}").pack(); + static byte[] MAPPER = Tuple.from(PREFIX, RECORD, "{K[3]}", "{...}").pack(); + static int SPLIT_SIZE = 3; + static private byte[] indexEntryKey(final int i) { return Tuple.from(PREFIX, INDEX, indexKey(i), primaryKey(i)).pack(); } - static private byte[] recordKey(final int i) { return Tuple.from(PREFIX, RECORD, primaryKey(i)).pack(); } - static private byte[] recordValue(final int i) { return Tuple.from(dataOfRecord(i)).pack(); } + static private byte[] recordKeyPrefix(final int i) { + return Tuple.from(PREFIX, RECORD, primaryKey(i)).pack(); + } + static private byte[] recordKey(final int i, final int split) { + return Tuple.from(PREFIX, RECORD, primaryKey(i), split).pack(); + } + static private byte[] recordValue(final int i, final int split) { + return Tuple.from(dataOfRecord(i), split).pack(); + } static private void insertRecordWithIndex(final Transaction tr, final int i) { tr.set(indexEntryKey(i), EMPTY); - tr.set(recordKey(i), recordValue(i)); + for (int split = 0; split < SPLIT_SIZE; split++) { + tr.set(recordKey(i, split), recordValue(i, split)); + } } private static String getArgFromEnv() { @@ -86,7 +97,7 @@ class RangeAndFlatMapQueryIntegrationTest { return cluster; } public static void main(String[] args) 
throws Exception { - final RangeAndFlatMapQueryIntegrationTest test = new RangeAndFlatMapQueryIntegrationTest(); + final MappedRangeQueryIntegrationTest test = new MappedRangeQueryIntegrationTest(); test.databaseArg = getArgFromEnv(); test.clearDatabase(); test.comparePerformance(); @@ -94,21 +105,21 @@ class RangeAndFlatMapQueryIntegrationTest { } int numRecords = 10000; - int numQueries = 10000; + int numQueries = 1; int numRecordsPerQuery = 100; - boolean validate = false; + boolean validate = true; @Test void comparePerformance() { FDB fdb = FDB.selectAPIVersion(710); try (Database db = openFDB()) { insertRecordsWithIndexes(numRecords, db); - instrument(rangeQueryAndGet, "rangeQueryAndGet", db); - instrument(rangeQueryAndFlatMap, "rangeQueryAndFlatMap", db); + instrument(rangeQueryAndThenRangeQueries, "rangeQueryAndThenRangeQueries", db); + instrument(mappedRangeQuery, "mappedRangeQuery", db); } } private void instrument(final RangeQueryWithIndex query, final String name, final Database db) { - System.out.printf("Starting %s (numQueries:%d, numRecordsPerQuery:%d)\n", name, numQueries, numRecordsPerQuery); + System.out.printf("Starting %s (numQueries:%d, numRecordsPerQuery:%d, validation:%s)\n", name, numQueries, numRecordsPerQuery, validate ? 
"on" : "off"); long startTime = System.currentTimeMillis(); for (int queryId = 0; queryId < numQueries; queryId++) { int begin = ThreadLocalRandom.current().nextInt(numRecords - numRecordsPerQuery); @@ -140,7 +151,7 @@ class RangeAndFlatMapQueryIntegrationTest { void run(int begin, int end, Database db); } - RangeQueryWithIndex rangeQueryAndGet = (int begin, int end, Database db) -> db.run(tr -> { + RangeQueryWithIndex rangeQueryAndThenRangeQueries = (int begin, int end, Database db) -> db.run(tr -> { try { List kvs = tr.getRange(KeySelector.firstGreaterOrEqual(indexEntryKey(begin)), KeySelector.firstGreaterOrEqual(indexEntryKey(end)), @@ -150,22 +161,25 @@ class RangeAndFlatMapQueryIntegrationTest { Assertions.assertEquals(end - begin, kvs.size()); // Get the records of each index entry IN PARALLEL. - List> resultFutures = new ArrayList<>(); + List>> resultFutures = new ArrayList<>(); // In reality, we need to get the record key by parsing the index entry key. But considering this is a // performance test, we just ignore the returned key and simply generate it from recordKey. 
for (int id = begin; id < end; id++) { - resultFutures.add(tr.get(recordKey(id))); + resultFutures.add(tr.getRange(Range.startsWith(recordKeyPrefix(id)), + ReadTransaction.ROW_LIMIT_UNLIMITED, false, StreamingMode.WANT_ALL).asList()); } AsyncUtil.whenAll(resultFutures).get(); if (validate) { final Iterator indexes = kvs.iterator(); - final Iterator> records = resultFutures.iterator(); + final Iterator>> records = resultFutures.iterator(); for (int id = begin; id < end; id++) { Assertions.assertTrue(indexes.hasNext()); assertByteArrayEquals(indexEntryKey(id), indexes.next().getKey()); + Assertions.assertTrue(records.hasNext()); - assertByteArrayEquals(recordValue(id), records.next().get()); + List rangeResult = records.next().get(); + validateRangeResult(id, rangeResult); } Assertions.assertFalse(indexes.hasNext()); Assertions.assertFalse(records.hasNext()); @@ -176,23 +190,32 @@ class RangeAndFlatMapQueryIntegrationTest { return null; }); - RangeQueryWithIndex rangeQueryAndFlatMap = (int begin, int end, Database db) -> db.run(tr -> { + RangeQueryWithIndex mappedRangeQuery = (int begin, int end, Database db) -> db.run(tr -> { try { - tr.options().setReadYourWritesDisable(); - List kvs = - tr.snapshot() - .getRangeAndFlatMap(KeySelector.firstGreaterOrEqual(indexEntryKey(begin)), - KeySelector.firstGreaterOrEqual(indexEntryKey(end)), MAPPER, - ReadTransaction.ROW_LIMIT_UNLIMITED, false, StreamingMode.WANT_ALL) + List kvs = + tr.getMappedRange(KeySelector.firstGreaterOrEqual(indexEntryKey(begin)), + KeySelector.firstGreaterOrEqual(indexEntryKey(end)), MAPPER, + ReadTransaction.ROW_LIMIT_UNLIMITED, false, StreamingMode.WANT_ALL) .asList() .get(); Assertions.assertEquals(end - begin, kvs.size()); if (validate) { - final Iterator results = kvs.iterator(); + final Iterator results = kvs.iterator(); for (int id = begin; id < end; id++) { Assertions.assertTrue(results.hasNext()); - assertByteArrayEquals(recordValue(id), results.next().getValue()); + MappedKeyValue 
mappedKeyValue = results.next(); + assertByteArrayEquals(indexEntryKey(id), mappedKeyValue.getKey()); + assertByteArrayEquals(EMPTY, mappedKeyValue.getValue()); + assertByteArrayEquals(indexEntryKey(id), mappedKeyValue.getKey()); + + byte[] prefix = recordKeyPrefix(id); + assertByteArrayEquals(prefix, mappedKeyValue.getRangeBegin()); + prefix[prefix.length - 1] = (byte)0x01; + assertByteArrayEquals(prefix, mappedKeyValue.getRangeEnd()); + + List rangeResult = mappedKeyValue.getRangeResult(); + validateRangeResult(id, rangeResult); } Assertions.assertFalse(results.hasNext()); } @@ -202,55 +225,16 @@ class RangeAndFlatMapQueryIntegrationTest { return null; }); + void validateRangeResult(int id, List rangeResult) { + Assertions.assertEquals(rangeResult.size(), SPLIT_SIZE); + for (int split = 0; split < SPLIT_SIZE; split++) { + KeyValue keyValue = rangeResult.get(split); + assertByteArrayEquals(recordKey(id, split), keyValue.getKey()); + assertByteArrayEquals(recordValue(id, split), keyValue.getValue()); + } + } + void assertByteArrayEquals(byte[] expected, byte[] actual) { Assertions.assertEquals(ByteArrayUtil.printable(expected), ByteArrayUtil.printable(actual)); } - - @Test - void rangeAndFlatMapQueryOverMultipleRows() throws Exception { - try (Database db = openFDB()) { - insertRecordsWithIndexes(3, db); - - List expected_data_of_records = new ArrayList<>(); - for (int i = 0; i <= 1; i++) { - expected_data_of_records.add(recordValue(i)); - } - - db.run(tr -> { - // getRangeAndFlatMap is only support without RYW. This is a must!!! - tr.options().setReadYourWritesDisable(); - - // getRangeAndFlatMap is only supported with snapshot. 
- Iterator kvs = - tr.snapshot() - .getRangeAndFlatMap(KeySelector.firstGreaterOrEqual(indexEntryKey(0)), - KeySelector.firstGreaterThan(indexEntryKey(1)), MAPPER, - ReadTransaction.ROW_LIMIT_UNLIMITED, false, StreamingMode.WANT_ALL) - .iterator(); - Iterator expected_data_of_records_iter = expected_data_of_records.iterator(); - while (expected_data_of_records_iter.hasNext()) { - Assertions.assertTrue(kvs.hasNext(), "iterator ended too early"); - KeyValue kv = kvs.next(); - byte[] actual_data_of_record = kv.getValue(); - byte[] expected_data_of_record = expected_data_of_records_iter.next(); - - // System.out.println("result key:" + ByteArrayUtil.printable(kv.getKey()) + " value:" + - // ByteArrayUtil.printable(kv.getValue())); Output: - // result - // key:\x02prefix\x00\x02INDEX\x00\x02index-key-of-record-0\x00\x02primary-key-of-record-0\x00 - // value:\x02data-of-record-0\x00 - // result - // key:\x02prefix\x00\x02INDEX\x00\x02index-key-of-record-1\x00\x02primary-key-of-record-1\x00 - // value:\x02data-of-record-1\x00 - - // For now, we don't guarantee what that the returned keys mean. - Assertions.assertArrayEquals(expected_data_of_record, actual_data_of_record, - "Incorrect data of record!"); - } - Assertions.assertFalse(kvs.hasNext(), "Iterator returned too much data"); - - return null; - }); - } - } } diff --git a/bindings/java/src/junit/com/apple/foundationdb/FakeFDBTransaction.java b/bindings/java/src/junit/com/apple/foundationdb/FakeFDBTransaction.java index 0c5a121c64..f557b6f7a1 100644 --- a/bindings/java/src/junit/com/apple/foundationdb/FakeFDBTransaction.java +++ b/bindings/java/src/junit/com/apple/foundationdb/FakeFDBTransaction.java @@ -89,8 +89,6 @@ public class FakeFDBTransaction extends FDBTransaction { @Override protected FutureResults getRange_internal(KeySelector begin, KeySelector end, - // TODO: map is not supported in FakeFDBTransaction yet. 
- byte[] mapper, // Nullable int rowLimit, int targetBytes, int streamingMode, int iteration, boolean isSnapshot, boolean reverse) { numRangeCalls++; diff --git a/bindings/java/src/main/com/apple/foundationdb/DirectBufferIterator.java b/bindings/java/src/main/com/apple/foundationdb/DirectBufferIterator.java index 289f0f0432..ac2eb2b2dc 100644 --- a/bindings/java/src/main/com/apple/foundationdb/DirectBufferIterator.java +++ b/bindings/java/src/main/com/apple/foundationdb/DirectBufferIterator.java @@ -32,11 +32,11 @@ import java.util.NoSuchElementException; * The serialization format of result is => * [int keyCount, boolean more, ListOf<(int keyLen, int valueLen, byte[] key, byte[] value)>] */ -class DirectBufferIterator implements Iterator, AutoCloseable { - private ByteBuffer byteBuffer; - private int current = 0; - private int keyCount = -1; - private boolean more = false; +abstract class DirectBufferIterator implements AutoCloseable { + protected ByteBuffer byteBuffer; + protected int current = 0; + protected int keyCount = -1; + protected boolean more = false; public DirectBufferIterator(ByteBuffer buffer) { byteBuffer = buffer; @@ -55,31 +55,11 @@ class DirectBufferIterator implements Iterator, AutoCloseable { return keyCount > -1; } - @Override public boolean hasNext() { assert (hasResultReady()); return current < keyCount; } - @Override - public KeyValue next() { - assert (hasResultReady()); // Must be called once its ready. 
- if (!hasNext()) { - throw new NoSuchElementException(); - } - - final int keyLen = byteBuffer.getInt(); - final int valueLen = byteBuffer.getInt(); - byte[] key = new byte[keyLen]; - byteBuffer.get(key); - - byte[] value = new byte[valueLen]; - byteBuffer.get(value); - - current += 1; - return new KeyValue(key, value); - } - public ByteBuffer getBuffer() { return byteBuffer; } diff --git a/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java b/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java index 8a30280a4d..65a1e9f254 100644 --- a/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java +++ b/bindings/java/src/main/com/apple/foundationdb/FDBTransaction.java @@ -92,12 +92,10 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC } @Override - public AsyncIterable getRangeAndFlatMap(KeySelector begin, KeySelector end, byte[] mapper, int limit, - boolean reverse, StreamingMode mode) { - if (mapper == null) { - throw new IllegalArgumentException("Mapper must be non-null"); - } - return new RangeQuery(FDBTransaction.this, true, begin, end, mapper, limit, reverse, mode, eventKeeper); + public AsyncIterable getMappedRange(KeySelector begin, KeySelector end, byte[] mapper, + int limit, boolean reverse, StreamingMode mode) { + + throw new UnsupportedOperationException("getMappedRange is only supported in serializable"); } /////////////////// @@ -348,9 +346,12 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC } @Override - public AsyncIterable getRangeAndFlatMap(KeySelector begin, KeySelector end, byte[] mapper, int limit, - boolean reverse, StreamingMode mode) { - throw new UnsupportedOperationException("getRangeAndFlatMap is only supported in snapshot"); + public AsyncIterable getMappedRange(KeySelector begin, KeySelector end, byte[] mapper, + int limit, boolean reverse, StreamingMode mode) { + if (mapper == null) { + throw new IllegalArgumentException("Mapper must be 
non-null"); + } + return new MappedRangeQuery(FDBTransaction.this, false, begin, end, mapper, limit, reverse, mode, eventKeeper); } /////////////////// @@ -431,7 +432,6 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC // Users of this function must close the returned FutureResults when finished protected FutureResults getRange_internal(KeySelector begin, KeySelector end, - byte[] mapper, // Nullable int rowLimit, int targetBytes, int streamingMode, int iteration, boolean isSnapshot, boolean reverse) { if (eventKeeper != null) { @@ -443,14 +443,33 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC " -- range get: (%s, %s) limit: %d, bytes: %d, mode: %d, iteration: %d, snap: %s, reverse %s", begin.toString(), end.toString(), rowLimit, targetBytes, streamingMode, iteration, Boolean.toString(isSnapshot), Boolean.toString(reverse)));*/ - return new FutureResults( - mapper == null - ? Transaction_getRange(getPtr(), begin.getKey(), begin.orEqual(), begin.getOffset(), end.getKey(), - end.orEqual(), end.getOffset(), rowLimit, targetBytes, streamingMode, - iteration, isSnapshot, reverse) - : Transaction_getRangeAndFlatMap(getPtr(), begin.getKey(), begin.orEqual(), begin.getOffset(), - end.getKey(), end.orEqual(), end.getOffset(), mapper, rowLimit, - targetBytes, streamingMode, iteration, isSnapshot, reverse), + return new FutureResults(Transaction_getRange(getPtr(), begin.getKey(), begin.orEqual(), begin.getOffset(), + end.getKey(), end.orEqual(), end.getOffset(), rowLimit, + targetBytes, streamingMode, iteration, isSnapshot, reverse), + FDB.instance().isDirectBufferQueriesEnabled(), executor, eventKeeper); + } finally { + pointerReadLock.unlock(); + } + } + + // Users of this function must close the returned FutureResults when finished + protected FutureMappedResults getMappedRange_internal(KeySelector begin, KeySelector end, + byte[] mapper, // Nullable + int rowLimit, int targetBytes, int streamingMode, + 
int iteration, boolean isSnapshot, boolean reverse) { + if (eventKeeper != null) { + eventKeeper.increment(Events.JNI_CALL); + } + pointerReadLock.lock(); + try { + /*System.out.println(String.format( + " -- range get: (%s, %s) limit: %d, bytes: %d, mode: %d, iteration: %d, snap: %s, reverse %s", + begin.toString(), end.toString(), rowLimit, targetBytes, streamingMode, + iteration, Boolean.toString(isSnapshot), Boolean.toString(reverse)));*/ + return new FutureMappedResults( + Transaction_getMappedRange(getPtr(), begin.getKey(), begin.orEqual(), begin.getOffset(), + end.getKey(), end.orEqual(), end.getOffset(), mapper, rowLimit, + targetBytes, streamingMode, iteration, isSnapshot, reverse), FDB.instance().isDirectBufferQueriesEnabled(), executor, eventKeeper); } finally { pointerReadLock.unlock(); @@ -790,7 +809,7 @@ class FDBTransaction extends NativeObjectWrapper implements Transaction, OptionC byte[] keyEnd, boolean orEqualEnd, int offsetEnd, int rowLimit, int targetBytes, int streamingMode, int iteration, boolean isSnapshot, boolean reverse); - private native long Transaction_getRangeAndFlatMap(long cPtr, byte[] keyBegin, boolean orEqualBegin, + private native long Transaction_getMappedRange(long cPtr, byte[] keyBegin, boolean orEqualBegin, int offsetBegin, byte[] keyEnd, boolean orEqualEnd, int offsetEnd, byte[] mapper, // Nonnull diff --git a/bindings/java/src/main/com/apple/foundationdb/FutureMappedResults.java b/bindings/java/src/main/com/apple/foundationdb/FutureMappedResults.java new file mode 100644 index 0000000000..8d4436d6f7 --- /dev/null +++ b/bindings/java/src/main/com/apple/foundationdb/FutureMappedResults.java @@ -0,0 +1,87 @@ +/* + * FutureMappedResults.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.apple.foundationdb; + +import java.nio.ByteBuffer; +import java.util.concurrent.Executor; + +import com.apple.foundationdb.EventKeeper.Events; + +class FutureMappedResults extends NativeFuture { + private final EventKeeper eventKeeper; + FutureMappedResults(long cPtr, boolean enableDirectBufferQueries, Executor executor, EventKeeper eventKeeper) { + super(cPtr); + registerMarshalCallback(executor); + this.enableDirectBufferQueries = enableDirectBufferQueries; + this.eventKeeper = eventKeeper; + } + + @Override + protected void postMarshal(MappedRangeResultInfo rri) { + // We can't close because this class actually marshals on-demand + } + + @Override + protected MappedRangeResultInfo getIfDone_internal(long cPtr) throws FDBException { + if (eventKeeper != null) { + eventKeeper.increment(Events.JNI_CALL); + } + FDBException err = Future_getError(cPtr); + + if (err != null && !err.isSuccess()) { + throw err; + } + + return new MappedRangeResultInfo(this); + } + + public MappedRangeResult getResults() { + ByteBuffer buffer = enableDirectBufferQueries ? 
DirectBufferPool.getInstance().poll() : null; + if (buffer != null && eventKeeper != null) { + eventKeeper.increment(Events.RANGE_QUERY_DIRECT_BUFFER_HIT); + eventKeeper.increment(Events.JNI_CALL); + } else if (eventKeeper != null) { + eventKeeper.increment(Events.RANGE_QUERY_DIRECT_BUFFER_MISS); + eventKeeper.increment(Events.JNI_CALL); + } + + try { + pointerReadLock.lock(); + if (buffer != null) { + try (MappedRangeResultDirectBufferIterator directIterator = + new MappedRangeResultDirectBufferIterator(buffer)) { + FutureMappedResults_getDirect(getPtr(), directIterator.getBuffer(), + directIterator.getBuffer().capacity()); + return new MappedRangeResult(directIterator); + } + } else { + return FutureMappedResults_get(getPtr()); + } + } finally { + pointerReadLock.unlock(); + } + } + + private boolean enableDirectBufferQueries = false; + + private native MappedRangeResult FutureMappedResults_get(long cPtr) throws FDBException; + private native void FutureMappedResults_getDirect(long cPtr, ByteBuffer buffer, int capacity) throws FDBException; +} diff --git a/bindings/java/src/main/com/apple/foundationdb/FutureResults.java b/bindings/java/src/main/com/apple/foundationdb/FutureResults.java index d941a06582..989c376620 100644 --- a/bindings/java/src/main/com/apple/foundationdb/FutureResults.java +++ b/bindings/java/src/main/com/apple/foundationdb/FutureResults.java @@ -66,7 +66,7 @@ class FutureResults extends NativeFuture { try { pointerReadLock.lock(); if (buffer != null) { - try (DirectBufferIterator directIterator = new DirectBufferIterator(buffer)) { + try (RangeResultDirectBufferIterator directIterator = new RangeResultDirectBufferIterator(buffer)) { FutureResults_getDirect(getPtr(), directIterator.getBuffer(), directIterator.getBuffer().capacity()); return new RangeResult(directIterator); } diff --git a/bindings/java/src/main/com/apple/foundationdb/KeyValue.java b/bindings/java/src/main/com/apple/foundationdb/KeyValue.java index b3594c5903..e1e22b76a3 100644 
--- a/bindings/java/src/main/com/apple/foundationdb/KeyValue.java +++ b/bindings/java/src/main/com/apple/foundationdb/KeyValue.java @@ -20,6 +20,8 @@ package com.apple.foundationdb; +import com.apple.foundationdb.tuple.ByteArrayUtil; + import java.util.Arrays; /** @@ -77,4 +79,13 @@ public class KeyValue { public int hashCode() { return 17 + (37 * Arrays.hashCode(key) + Arrays.hashCode(value)); } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("KeyValue{"); + sb.append("key=").append(ByteArrayUtil.printable(key)); + sb.append(", value=").append(ByteArrayUtil.printable(value)); + sb.append('}'); + return sb.toString(); + } } \ No newline at end of file diff --git a/bindings/java/src/main/com/apple/foundationdb/MappedKeyValue.java b/bindings/java/src/main/com/apple/foundationdb/MappedKeyValue.java new file mode 100644 index 0000000000..71bad2caa9 --- /dev/null +++ b/bindings/java/src/main/com/apple/foundationdb/MappedKeyValue.java @@ -0,0 +1,96 @@ +/* + * MappedKeyValue.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.apple.foundationdb; + +import com.apple.foundationdb.tuple.ByteArrayUtil; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; + +public class MappedKeyValue extends KeyValue { + private final byte[] rangeBegin; + private final byte[] rangeEnd; + private final List rangeResult; + + MappedKeyValue(byte[] key, byte[] value, byte[] rangeBegin, byte[] rangeEnd, List rangeResult) { + super(key, value); + this.rangeBegin = rangeBegin; + this.rangeEnd = rangeEnd; + this.rangeResult = rangeResult; + } + + public byte[] getRangeBegin() { return rangeBegin; } + + public byte[] getRangeEnd() { return rangeEnd; } + + public List getRangeResult() { return rangeResult; } + + public static MappedKeyValue fromBytes(byte[] bytes, int[] lengths) { + // Lengths include: key, value, rangeBegin, rangeEnd, count * (underlying key, underlying value) + if (lengths.length < 4) { + throw new IllegalArgumentException("There needs to be at least 4 lengths to cover the metadata"); + } + + Offset offset = new Offset(); + byte[] key = takeBytes(offset, bytes, lengths); + byte[] value = takeBytes(offset, bytes, lengths); + byte[] rangeBegin = takeBytes(offset, bytes, lengths); + byte[] rangeEnd = takeBytes(offset, bytes, lengths); + + if ((lengths.length - 4) % 2 != 0) { + throw new IllegalArgumentException("There needs to be an even number of lengths!"); + } + int count = (lengths.length - 4) / 2; + List rangeResult = new ArrayList<>(count); + for (int i = 0; i < count; i++) { + byte[] k = takeBytes(offset, bytes, lengths); + byte[] v = takeBytes(offset, bytes, lengths); + rangeResult.add(new KeyValue(k, v)); + } + return new MappedKeyValue(key, value, rangeBegin, rangeEnd, rangeResult); + } + + static class Offset { + int bytes = 0; + int lengths = 0; + } + + static byte[] takeBytes(Offset offset, byte[] bytes, int[] lengths) { + int len = lengths[offset.lengths]; + byte[] b = new byte[len]; + System.arraycopy(bytes, 
offset.bytes, b, 0, len); + offset.lengths++; + offset.bytes += len; + return b; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("MappedKeyValue{"); + sb.append("rangeBegin=").append(ByteArrayUtil.printable(rangeBegin)); + sb.append(", rangeEnd=").append(ByteArrayUtil.printable(rangeEnd)); + sb.append(", rangeResult=").append(rangeResult); + sb.append('}'); + return super.toString() + "->" + sb.toString(); + } +} \ No newline at end of file diff --git a/bindings/java/src/main/com/apple/foundationdb/MappedRangeQuery.java b/bindings/java/src/main/com/apple/foundationdb/MappedRangeQuery.java new file mode 100644 index 0000000000..9de3753ec3 --- /dev/null +++ b/bindings/java/src/main/com/apple/foundationdb/MappedRangeQuery.java @@ -0,0 +1,333 @@ +/* + * RangeQuery.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.apple.foundationdb; + +import com.apple.foundationdb.EventKeeper.Events; +import com.apple.foundationdb.async.AsyncIterable; +import com.apple.foundationdb.async.AsyncIterator; +import com.apple.foundationdb.async.AsyncUtil; + +import java.util.List; +import java.util.NoSuchElementException; +import java.util.concurrent.CancellationException; +import java.util.concurrent.CompletableFuture; +import java.util.function.BiConsumer; + +// TODO: Share code with RangeQuery? +/** + * Represents a query against FoundationDB for a range of keys. The + * result of this query can be iterated over in a blocking fashion with a call to + * {@link #iterator()} (as specified by {@link Iterable}). + * If the calling program uses an asynchronous paradigm, a non-blocking + * {@link AsyncIterator} is returned from {@link #iterator()}. Both of these + * constructions will not begin to query the database until the first call to + * {@code hasNext()}. As the query uses its {@link Transaction} of origin to fetch + * all the data, the use of this query object must not span more than a few seconds. + * + *

NOTE: although resulting {@code Iterator}s do support the {@code remove()} + * operation, the remove is not durable until {@code commit()} on the {@code Transaction} + * that yielded this query returns true. + */ +class MappedRangeQuery implements AsyncIterable { + private final FDBTransaction tr; + private final KeySelector begin; + private final KeySelector end; + private final byte[] mapper; // Nonnull + private final boolean snapshot; + private final int rowLimit; + private final boolean reverse; + private final StreamingMode streamingMode; + private final EventKeeper eventKeeper; + + MappedRangeQuery(FDBTransaction transaction, boolean isSnapshot, KeySelector begin, KeySelector end, byte[] mapper, + int rowLimit, boolean reverse, StreamingMode streamingMode, EventKeeper eventKeeper) { + this.tr = transaction; + this.begin = begin; + this.end = end; + this.mapper = mapper; + this.snapshot = isSnapshot; + this.rowLimit = rowLimit; + this.reverse = reverse; + this.streamingMode = streamingMode; + this.eventKeeper = eventKeeper; + } + + /** + * Returns all the results from the range requested as a {@code List}. If there were no + * limits on the original query and there is a large amount of data in the database + * this call could use a very large amount of memory. + * + * @return a {@code CompletableFuture} that will be set to the contents of the database + * constrained by the query parameters. + */ + @Override + public CompletableFuture> asList() { + StreamingMode mode = this.streamingMode; + if (mode == StreamingMode.ITERATOR) mode = (this.rowLimit == 0) ? 
StreamingMode.WANT_ALL : StreamingMode.EXACT; + + // if the streaming mode is EXACT, try and grab things as one chunk + if (mode == StreamingMode.EXACT) { + + FutureMappedResults range = + tr.getMappedRange_internal(this.begin, this.end, this.mapper, this.rowLimit, 0, + StreamingMode.EXACT.code(), 1, this.snapshot, this.reverse); + return range.thenApply(result -> result.get().values).whenComplete((result, e) -> range.close()); + } + + // If the streaming mode is not EXACT, simply collect the results of an + // iteration into a list + return AsyncUtil.collect( + new MappedRangeQuery(tr, snapshot, begin, end, mapper, rowLimit, reverse, mode, eventKeeper), + tr.getExecutor()); + } + + /** + * Returns an {@code Iterator} over the results of this query against FoundationDB. + * + * @return an {@code Iterator} over type {@code MappedKeyValue}. + */ + @Override + public AsyncRangeIterator iterator() { + return new AsyncRangeIterator(this.rowLimit, this.reverse, this.streamingMode); + } + + private class AsyncRangeIterator implements AsyncIterator { + // immutable aspects of this iterator + private final boolean rowsLimited; + private final boolean reverse; + private final StreamingMode streamingMode; + + // There is the chance for parallelism in the two "chunks" for fetched data + private MappedRangeResult chunk = null; + private MappedRangeResult nextChunk = null; + private boolean fetchOutstanding = false; + private byte[] prevKey = null; + private int index = 0; + private int iteration = 0; + private KeySelector begin; + private KeySelector end; + + private int rowsRemaining; + + private FutureMappedResults fetchingChunk; + private CompletableFuture nextFuture; + private boolean isCancelled = false; + + private AsyncRangeIterator(int rowLimit, boolean reverse, StreamingMode streamingMode) { + this.begin = MappedRangeQuery.this.begin; + this.end = MappedRangeQuery.this.end; + this.rowsLimited = rowLimit != 0; + this.rowsRemaining = rowLimit; + this.reverse = reverse; + 
this.streamingMode = streamingMode; + + startNextFetch(); + } + + private synchronized boolean mainChunkIsTheLast() { return !chunk.more || (rowsLimited && rowsRemaining < 1); } + + class FetchComplete implements BiConsumer { + final FutureMappedResults fetchingChunk; + final CompletableFuture promise; + + FetchComplete(FutureMappedResults fetch, CompletableFuture promise) { + this.fetchingChunk = fetch; + this.promise = promise; + } + + @Override + public void accept(MappedRangeResultInfo data, Throwable error) { + try { + if (error != null) { + if (eventKeeper != null) { + eventKeeper.increment(Events.RANGE_QUERY_CHUNK_FAILED); + } + promise.completeExceptionally(error); + if (error instanceof Error) { + throw(Error) error; + } + + return; + } + + final MappedRangeResult rangeResult = data.get(); + final RangeResultSummary summary = rangeResult.getSummary(); + if (summary.lastKey == null) { + promise.complete(Boolean.FALSE); + return; + } + + synchronized (MappedRangeQuery.AsyncRangeIterator.this) { + fetchOutstanding = false; + + // adjust the total number of rows we should ever fetch + rowsRemaining -= summary.keyCount; + + // set up the next fetch + if (reverse) { + end = KeySelector.firstGreaterOrEqual(summary.lastKey); + } else { + begin = KeySelector.firstGreaterThan(summary.lastKey); + } + + // If this is the first fetch or the main chunk is exhausted + if (chunk == null || index == chunk.values.size()) { + nextChunk = null; + chunk = rangeResult; + index = 0; + } else { + nextChunk = rangeResult; + } + } + + promise.complete(Boolean.TRUE); + } finally { + fetchingChunk.close(); + } + } + } + + private synchronized void startNextFetch() { + if (fetchOutstanding) + throw new IllegalStateException("Reentrant call not allowed"); // This can not be called reentrantly + if (isCancelled) return; + + if (chunk != null && mainChunkIsTheLast()) return; + + fetchOutstanding = true; + nextChunk = null; + + nextFuture = new CompletableFuture<>(); + final long sTime = 
System.nanoTime(); + fetchingChunk = tr.getMappedRange_internal(begin, end, mapper, rowsLimited ? rowsRemaining : 0, 0, + streamingMode.code(), ++iteration, snapshot, reverse); + + BiConsumer cons = new FetchComplete(fetchingChunk, nextFuture); + if (eventKeeper != null) { + eventKeeper.increment(Events.RANGE_QUERY_FETCHES); + cons = cons.andThen((r, t) -> { + eventKeeper.timeNanos(Events.RANGE_QUERY_FETCH_TIME_NANOS, System.nanoTime() - sTime); + }); + } + + fetchingChunk.whenComplete(cons); + } + + @Override + public synchronized CompletableFuture onHasNext() { + if (isCancelled) throw new CancellationException(); + + // This will only happen before the first fetch has completed + if (chunk == null) { + return nextFuture; + } + + // We have a chunk and are still working though it + if (index < chunk.values.size()) { + return AsyncUtil.READY_TRUE; + } + + // If we are at the end of the current chunk there is either: + // - no more data -or- + // - we are already fetching the next block + return mainChunkIsTheLast() ? 
AsyncUtil.READY_FALSE : nextFuture; + } + + @Override + public boolean hasNext() { + return onHasNext().join(); + } + + @Override + public MappedKeyValue next() { + CompletableFuture nextFuture; + synchronized (this) { + if (isCancelled) throw new CancellationException(); + + // at least the first chunk has been fetched and there is at least one + // available result + if (chunk != null && index < chunk.values.size()) { + // If this is the first call to next() on a chunk, then we will want to + // start fetching the data for the next block + boolean initialNext = index == 0; + + MappedKeyValue result = chunk.values.get(index); + prevKey = result.getKey(); + index++; + + if (eventKeeper != null) { + // We record the BYTES_FETCHED here, rather than at a lower level, + // because some parts of the construction of a MappedRangeResult occur underneath + // the JNI boundary, and we don't want to pass the eventKeeper down there + // (note: account for the length fields as well when recording the bytes + // fetched) + eventKeeper.count(Events.BYTES_FETCHED, result.getKey().length + result.getValue().length + 8); + eventKeeper.increment(Events.RANGE_QUERY_RECORDS_FETCHED); + } + + // If this is the first call to next() on a chunk there cannot + // be another waiting, since we could not have issued a request + assert (!(initialNext && nextChunk != null)); + + // we are at the end of the current chunk and there is more to be had already + if (index == chunk.values.size() && nextChunk != null) { + index = 0; + chunk = nextChunk; + nextChunk = null; + } + + if (initialNext) { + startNextFetch(); + } + + return result; + } + + nextFuture = onHasNext(); + } + + // If there was no result ready then we need to wait on the future + // and return the proper result, throwing if there are no more elements + return nextFuture + .thenApply(hasNext -> { + if (hasNext) { + return next(); + } + throw new NoSuchElementException(); + }) + .join(); + } + + @Override + public synchronized void 
remove() { + if (prevKey == null) throw new IllegalStateException("No value has been fetched from database"); + + tr.clear(prevKey); + } + + @Override + public synchronized void cancel() { + isCancelled = true; + nextFuture.cancel(true); + fetchingChunk.cancel(true); + } + } +} diff --git a/bindings/java/src/main/com/apple/foundationdb/MappedRangeResult.java b/bindings/java/src/main/com/apple/foundationdb/MappedRangeResult.java new file mode 100644 index 0000000000..e629984d7a --- /dev/null +++ b/bindings/java/src/main/com/apple/foundationdb/MappedRangeResult.java @@ -0,0 +1,64 @@ +/* + * MappedRangeResult.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.apple.foundationdb; + +import com.apple.foundationdb.tuple.ByteArrayUtil; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +class MappedRangeResult { + final List values; + final boolean more; + + public MappedRangeResult(MappedKeyValue[] values, boolean more) { + this.values = Arrays.asList(values); + this.more = more; + } + + MappedRangeResult(MappedRangeResultDirectBufferIterator iterator) { + iterator.readResultsSummary(); + more = iterator.hasMore(); + + int count = iterator.count(); + values = new ArrayList<>(count); + + for (int i = 0; i < count; ++i) { + values.add(iterator.next()); + } + } + + public RangeResultSummary getSummary() { + final int keyCount = values.size(); + final byte[] lastKey = keyCount > 0 ? values.get(keyCount - 1).getKey() : null; + return new RangeResultSummary(lastKey, keyCount, more); + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("MappedRangeResult{"); + sb.append("values=").append(values); + sb.append(", more=").append(more); + sb.append('}'); + return sb.toString(); + } +} diff --git a/bindings/java/src/main/com/apple/foundationdb/MappedRangeResultDirectBufferIterator.java b/bindings/java/src/main/com/apple/foundationdb/MappedRangeResultDirectBufferIterator.java new file mode 100644 index 0000000000..169cef42e0 --- /dev/null +++ b/bindings/java/src/main/com/apple/foundationdb/MappedRangeResultDirectBufferIterator.java @@ -0,0 +1,71 @@ +/* + * MappedRangeResultDirectBufferIterator.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2015-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.apple.foundationdb; + +import java.io.Closeable; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.NoSuchElementException; + +/** + * Holds the direct buffer that is shared with JNI wrapper. + */ +class MappedRangeResultDirectBufferIterator extends DirectBufferIterator implements Iterator { + + MappedRangeResultDirectBufferIterator(ByteBuffer buffer) { super(buffer); } + + @Override + public boolean hasNext() { + return super.hasNext(); + } + + @Override + public MappedKeyValue next() { + assert (hasResultReady()); // Must be called once its ready. 
+ if (!hasNext()) { + throw new NoSuchElementException(); + } + + final byte[] key = getString(); + final byte[] value = getString(); + final byte[] rangeBegin = getString(); + final byte[] rangeEnd = getString(); + final int rangeResultSize = byteBuffer.getInt(); + List rangeResult = new ArrayList(); + for (int i = 0; i < rangeResultSize; i++) { + final byte[] k = getString(); + final byte[] v = getString(); + rangeResult.add(new KeyValue(k, v)); + } + current += 1; + return new MappedKeyValue(key, value, rangeBegin, rangeEnd, rangeResult); + } + + private byte[] getString() { + final int len = byteBuffer.getInt(); + byte[] s = new byte[len]; + byteBuffer.get(s); + return s; + } +} \ No newline at end of file diff --git a/bindings/java/src/main/com/apple/foundationdb/MappedRangeResultInfo.java b/bindings/java/src/main/com/apple/foundationdb/MappedRangeResultInfo.java new file mode 100644 index 0000000000..6f5ce16d2b --- /dev/null +++ b/bindings/java/src/main/com/apple/foundationdb/MappedRangeResultInfo.java @@ -0,0 +1,29 @@ +/* + * MappedRangeResultInfo.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.apple.foundationdb; + +class MappedRangeResultInfo { + MappedRangeResult get() { return f.getResults(); } + + MappedRangeResultInfo(FutureMappedResults f) { this.f = f; } + + private FutureMappedResults f; +} diff --git a/bindings/java/src/main/com/apple/foundationdb/RangeQuery.java b/bindings/java/src/main/com/apple/foundationdb/RangeQuery.java index f91b00471a..77e6242735 100644 --- a/bindings/java/src/main/com/apple/foundationdb/RangeQuery.java +++ b/bindings/java/src/main/com/apple/foundationdb/RangeQuery.java @@ -49,19 +49,17 @@ class RangeQuery implements AsyncIterable { private final FDBTransaction tr; private final KeySelector begin; private final KeySelector end; - private final byte[] mapper; // Nullable private final boolean snapshot; private final int rowLimit; private final boolean reverse; private final StreamingMode streamingMode; private final EventKeeper eventKeeper; - RangeQuery(FDBTransaction transaction, boolean isSnapshot, KeySelector begin, KeySelector end, byte[] mapper, - int rowLimit, boolean reverse, StreamingMode streamingMode, EventKeeper eventKeeper) { + RangeQuery(FDBTransaction transaction, boolean isSnapshot, KeySelector begin, KeySelector end, int rowLimit, + boolean reverse, StreamingMode streamingMode, EventKeeper eventKeeper) { this.tr = transaction; this.begin = begin; this.end = end; - this.mapper = mapper; this.snapshot = isSnapshot; this.rowLimit = rowLimit; this.reverse = reverse; @@ -69,12 +67,6 @@ class RangeQuery implements AsyncIterable { this.eventKeeper = eventKeeper; } - // RangeQueryAndFlatMap - RangeQuery(FDBTransaction transaction, boolean isSnapshot, KeySelector begin, KeySelector end, int rowLimit, - boolean reverse, StreamingMode streamingMode, EventKeeper eventKeeper) { - this(transaction, isSnapshot, begin, end, null, rowLimit, reverse, streamingMode, eventKeeper); - } - /** * Returns all the results from the range requested as a {@code List}. 
If there were no * limits on the original query and there is a large amount of data in the database @@ -92,7 +84,7 @@ class RangeQuery implements AsyncIterable { // if the streaming mode is EXACT, try and grab things as one chunk if(mode == StreamingMode.EXACT) { - FutureResults range = tr.getRange_internal(this.begin, this.end, this.mapper, this.rowLimit, 0, + FutureResults range = tr.getRange_internal(this.begin, this.end, this.rowLimit, 0, StreamingMode.EXACT.code(), 1, this.snapshot, this.reverse); return range.thenApply(result -> result.get().values) .whenComplete((result, e) -> range.close()); @@ -100,7 +92,7 @@ class RangeQuery implements AsyncIterable { // If the streaming mode is not EXACT, simply collect the results of an // iteration into a list - return AsyncUtil.collect(new RangeQuery(tr, snapshot, begin, end, mapper, rowLimit, reverse, mode, eventKeeper), + return AsyncUtil.collect(new RangeQuery(tr, snapshot, begin, end, rowLimit, reverse, mode, eventKeeper), tr.getExecutor()); } @@ -229,8 +221,8 @@ class RangeQuery implements AsyncIterable { nextFuture = new CompletableFuture<>(); final long sTime = System.nanoTime(); - fetchingChunk = tr.getRange_internal(begin, end, mapper, rowsLimited ? rowsRemaining : 0, 0, - streamingMode.code(), ++iteration, snapshot, reverse); + fetchingChunk = tr.getRange_internal(begin, end, rowsLimited ? 
rowsRemaining : 0, 0, streamingMode.code(), + ++iteration, snapshot, reverse); BiConsumer cons = new FetchComplete(fetchingChunk,nextFuture); if(eventKeeper!=null){ diff --git a/bindings/java/src/main/com/apple/foundationdb/RangeResult.java b/bindings/java/src/main/com/apple/foundationdb/RangeResult.java index c20c1556e0..7c9ffaf25e 100644 --- a/bindings/java/src/main/com/apple/foundationdb/RangeResult.java +++ b/bindings/java/src/main/com/apple/foundationdb/RangeResult.java @@ -58,7 +58,7 @@ class RangeResult { this.more = more; } - RangeResult(DirectBufferIterator iterator) { + RangeResult(RangeResultDirectBufferIterator iterator) { iterator.readResultsSummary(); more = iterator.hasMore(); diff --git a/bindings/java/src/main/com/apple/foundationdb/RangeResultDirectBufferIterator.java b/bindings/java/src/main/com/apple/foundationdb/RangeResultDirectBufferIterator.java new file mode 100644 index 0000000000..c2c66e1227 --- /dev/null +++ b/bindings/java/src/main/com/apple/foundationdb/RangeResultDirectBufferIterator.java @@ -0,0 +1,62 @@ +/* + * RangeResultDirectBufferIterator.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2015-2020 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.apple.foundationdb; + +import java.io.Closeable; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Iterator; +import java.util.NoSuchElementException; + +/** + * Holds the direct buffer that is shared with JNI wrapper. A typical usage is as follows: + * + * The serialization format of result is => + * [int keyCount, boolean more, ListOf<(int keyLen, int valueLen, byte[] key, byte[] value)>] + */ +class RangeResultDirectBufferIterator extends DirectBufferIterator implements Iterator { + + RangeResultDirectBufferIterator(ByteBuffer buffer) { super(buffer); } + + @Override + public boolean hasNext() { + return super.hasNext(); + } + + @Override + public KeyValue next() { + assert (hasResultReady()); // Must be called once its ready. + if (!hasNext()) { + throw new NoSuchElementException(); + } + + final int keyLen = byteBuffer.getInt(); + final int valueLen = byteBuffer.getInt(); + byte[] key = new byte[keyLen]; + byteBuffer.get(key); + + byte[] value = new byte[valueLen]; + byteBuffer.get(value); + + current += 1; + return new KeyValue(key, value); + } +} diff --git a/bindings/java/src/main/com/apple/foundationdb/RangeResultSummary.java b/bindings/java/src/main/com/apple/foundationdb/RangeResultSummary.java index 67d50c1823..5cbfca04af 100644 --- a/bindings/java/src/main/com/apple/foundationdb/RangeResultSummary.java +++ b/bindings/java/src/main/com/apple/foundationdb/RangeResultSummary.java @@ -20,6 +20,8 @@ package com.apple.foundationdb; +import com.apple.foundationdb.tuple.ByteArrayUtil; + class RangeResultSummary { final byte[] lastKey; final int keyCount; @@ -30,4 +32,14 @@ class RangeResultSummary { this.keyCount = keyCount; this.more = more; } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("RangeResultSummary{"); + sb.append("lastKey=").append(ByteArrayUtil.printable(lastKey)); + sb.append(", keyCount=").append(keyCount); + sb.append(", more=").append(more); + 
sb.append('}'); + return sb.toString(); + } } diff --git a/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java b/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java index b2b81553ef..417068441d 100644 --- a/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java +++ b/bindings/java/src/main/com/apple/foundationdb/ReadTransaction.java @@ -457,8 +457,8 @@ public interface ReadTransaction extends ReadTransactionContext { *

* @return a handle to access the results of the asynchronous call */ - AsyncIterable getRangeAndFlatMap(KeySelector begin, KeySelector end, byte[] mapper, int limit, - boolean reverse, StreamingMode mode); + AsyncIterable getMappedRange(KeySelector begin, KeySelector end, byte[] mapper, int limit, + boolean reverse, StreamingMode mode); /** * Gets an estimate for the number of bytes stored in the given range. diff --git a/bindings/java/src/tests.cmake b/bindings/java/src/tests.cmake index b84c148ac2..40a097da5d 100644 --- a/bindings/java/src/tests.cmake +++ b/bindings/java/src/tests.cmake @@ -52,7 +52,7 @@ set(JAVA_INTEGRATION_TESTS src/integration/com/apple/foundationdb/CycleMultiClientIntegrationTest.java src/integration/com/apple/foundationdb/SidebandMultiThreadClientTest.java src/integration/com/apple/foundationdb/RepeatableReadMultiThreadClientTest.java - src/integration/com/apple/foundationdb/RangeAndFlatMapQueryIntegrationTest.java + src/integration/com/apple/foundationdb/MappedRangeQueryIntegrationTest.java ) # Resources that are used in integration testing, but are not explicitly test files (JUnit rules, diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index f98aabcb64..f8c8fb58b3 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -446,7 +446,7 @@ public: Counter transactionGetKeyRequests; Counter transactionGetValueRequests; Counter transactionGetRangeRequests; - Counter transactionGetRangeAndFlatMapRequests; + Counter transactionGetMappedRangeRequests; Counter transactionGetRangeStreamRequests; Counter transactionWatchRequests; Counter transactionGetAddressesForKeyRequests; diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index 48a8d6d3e3..f405fa7d13 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -475,6 +475,7 @@ using KeyRange = Standalone; using KeyValue = Standalone; using KeySelector = Standalone; using RangeResult = Standalone; +using MappedRangeResult = Standalone; enum { 
invalidVersion = -1, latestVersion = -2, MAX_VERSION = std::numeric_limits::max() }; @@ -616,6 +617,8 @@ KeyRangeWith keyRangeWith(const KeyRangeRef& range, const Val& value) { return KeyRangeWith(range, value); } +struct MappedKeyValueRef; + struct GetRangeLimits { enum { ROW_LIMIT_UNLIMITED = -1, BYTE_LIMIT_UNLIMITED = -1 }; @@ -629,6 +632,8 @@ struct GetRangeLimits { void decrement(VectorRef const& data); void decrement(KeyValueRef const& data); + void decrement(VectorRef const& data); + void decrement(MappedKeyValueRef const& data); // True if either the row or byte limit has been reached bool isReached(); @@ -689,6 +694,114 @@ struct Traceable : std::true_type { } }; +// Similar to KeyValueRef, but result can be empty. +struct GetValueReqAndResultRef { + KeyRef key; + Optional result; + + GetValueReqAndResultRef() {} + GetValueReqAndResultRef(Arena& a, const GetValueReqAndResultRef& copyFrom) + : key(a, copyFrom.key), result(a, copyFrom.result) {} + + bool operator==(const GetValueReqAndResultRef& rhs) const { return key == rhs.key && result == rhs.result; } + bool operator!=(const GetValueReqAndResultRef& rhs) const { return !(rhs == *this); } + int expectedSize() const { return key.expectedSize() + result.expectedSize(); } + + template + void serialize(Ar& ar) { + serializer(ar, key, result); + } +}; + +struct GetRangeReqAndResultRef { + KeySelectorRef begin, end; + RangeResultRef result; + + GetRangeReqAndResultRef() {} + // KeyValueRef(const KeyRef& key, const ValueRef& value) : key(key), value(value) {} + GetRangeReqAndResultRef(Arena& a, const GetRangeReqAndResultRef& copyFrom) + : begin(a, copyFrom.begin), end(a, copyFrom.end), result(a, copyFrom.result) {} + + bool operator==(const GetRangeReqAndResultRef& rhs) const { + return begin == rhs.begin && end == rhs.end && result == rhs.result; + } + bool operator!=(const GetRangeReqAndResultRef& rhs) const { return !(rhs == *this); } + + template + void serialize(Ar& ar) { + serializer(ar, begin, end, 
result); + } +}; + +using MappedReqAndResultRef = std::variant; + +struct MappedKeyValueRef : KeyValueRef { + // Save the original key value at the base (KeyValueRef). + + MappedReqAndResultRef reqAndResult; + + MappedKeyValueRef() = default; + MappedKeyValueRef(Arena& a, const MappedKeyValueRef& copyFrom) : KeyValueRef(a, copyFrom) { + const auto& reqAndResultCopyFrom = copyFrom.reqAndResult; + if (std::holds_alternative(reqAndResultCopyFrom)) { + auto getValue = std::get(reqAndResultCopyFrom); + reqAndResult = GetValueReqAndResultRef(a, getValue); + } else if (std::holds_alternative(reqAndResultCopyFrom)) { + auto getRange = std::get(reqAndResultCopyFrom); + reqAndResult = GetRangeReqAndResultRef(a, getRange); + } else { + throw internal_error(); + } + } + + bool operator==(const MappedKeyValueRef& rhs) const { + return static_cast(*this) == static_cast(rhs) && + reqAndResult == rhs.reqAndResult; + } + bool operator!=(const MappedKeyValueRef& rhs) const { return !(rhs == *this); } + + // It relies on the base to provide the expectedSize. TODO: Consider add the underlying request and key values into + // expected size? + // int expectedSize() const { return ((KeyValueRef*)this)->expectedSisze() + reqA } + + template + void serialize(Ar& ar) { + serializer(ar, ((KeyValueRef&)*this), reqAndResult); + } +}; + +struct MappedRangeResultRef : VectorRef { + // Additional information on range result. See comments on RangeResultRef. + bool more; + Optional readThrough; + bool readToBegin; + bool readThroughEnd; + + MappedRangeResultRef() : more(false), readToBegin(false), readThroughEnd(false) {} + MappedRangeResultRef(Arena& p, const MappedRangeResultRef& toCopy) + : VectorRef(p, toCopy), more(toCopy.more), + readThrough(toCopy.readThrough.present() ? 
KeyRef(p, toCopy.readThrough.get()) : Optional()), + readToBegin(toCopy.readToBegin), readThroughEnd(toCopy.readThroughEnd) {} + MappedRangeResultRef(const VectorRef& value, + bool more, + Optional readThrough = Optional()) + : VectorRef(value), more(more), readThrough(readThrough), readToBegin(false), + readThroughEnd(false) {} + MappedRangeResultRef(bool readToBegin, bool readThroughEnd) + : more(false), readToBegin(readToBegin), readThroughEnd(readThroughEnd) {} + + template + void serialize(Ar& ar) { + serializer(ar, ((VectorRef&)*this), more, readThrough, readToBegin, readThroughEnd); + } + + std::string toString() const { + return "more:" + std::to_string(more) + + " readThrough:" + (readThrough.present() ? readThrough.get().toString() : "[unset]") + + " readToBegin:" + std::to_string(readToBegin) + " readThroughEnd:" + std::to_string(readThroughEnd); + } +}; + struct KeyValueStoreType { constexpr static FileIdentifier file_identifier = 6560359; // These enumerated values are stored in the database configuration, so should NEVER be changed. 
diff --git a/fdbclient/IClientApi.h b/fdbclient/IClientApi.h index f62d588591..34ab01b445 100644 --- a/fdbclient/IClientApi.h +++ b/fdbclient/IClientApi.h @@ -59,12 +59,12 @@ public: GetRangeLimits limits, bool snapshot = false, bool reverse = false) = 0; - virtual ThreadFuture getRangeAndFlatMap(const KeySelectorRef& begin, - const KeySelectorRef& end, - const StringRef& mapper, - GetRangeLimits limits, - bool snapshot = false, - bool reverse = false) = 0; + virtual ThreadFuture getMappedRange(const KeySelectorRef& begin, + const KeySelectorRef& end, + const StringRef& mapper, + GetRangeLimits limits, + bool snapshot = false, + bool reverse = false) = 0; virtual ThreadFuture>> getAddressesForKey(const KeyRef& key) = 0; virtual ThreadFuture> getVersionstamp() = 0; diff --git a/fdbclient/ISingleThreadTransaction.h b/fdbclient/ISingleThreadTransaction.h index d575bb221a..23448e4579 100644 --- a/fdbclient/ISingleThreadTransaction.h +++ b/fdbclient/ISingleThreadTransaction.h @@ -63,12 +63,12 @@ public: GetRangeLimits limits, Snapshot = Snapshot::False, Reverse = Reverse::False) = 0; - virtual Future getRangeAndFlatMap(KeySelector begin, - KeySelector end, - Key mapper, - GetRangeLimits limits, - Snapshot = Snapshot::False, - Reverse = Reverse::False) = 0; + virtual Future getMappedRange(KeySelector begin, + KeySelector end, + Key mapper, + GetRangeLimits limits, + Snapshot = Snapshot::False, + Reverse = Reverse::False) = 0; virtual Future>> getAddressesForKey(Key const& key) = 0; virtual Future>> getRangeSplitPoints(KeyRange const& range, int64_t chunkSize) = 0; virtual Future getEstimatedRangeSizeBytes(KeyRange const& keys) = 0; diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index 39dac56bf9..6719a43d7d 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -146,38 +146,39 @@ ThreadFuture DLTransaction::getRange(const KeyRangeRef& keys, return 
getRange(firstGreaterOrEqual(keys.begin), firstGreaterOrEqual(keys.end), limits, snapshot, reverse); } -ThreadFuture DLTransaction::getRangeAndFlatMap(const KeySelectorRef& begin, - const KeySelectorRef& end, - const StringRef& mapper, - GetRangeLimits limits, - bool snapshot, - bool reverse) { - FdbCApi::FDBFuture* f = api->transactionGetRangeAndFlatMap(tr, - begin.getKey().begin(), - begin.getKey().size(), - begin.orEqual, - begin.offset, - end.getKey().begin(), - end.getKey().size(), - end.orEqual, - end.offset, - mapper.begin(), - mapper.size(), - limits.rows, - limits.bytes, - FDB_STREAMING_MODE_EXACT, - 0, - snapshot, - reverse); - return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { - const FdbCApi::FDBKeyValue* kvs; +ThreadFuture DLTransaction::getMappedRange(const KeySelectorRef& begin, + const KeySelectorRef& end, + const StringRef& mapper, + GetRangeLimits limits, + bool snapshot, + bool reverse) { + FdbCApi::FDBFuture* f = api->transactionGetMappedRange(tr, + begin.getKey().begin(), + begin.getKey().size(), + begin.orEqual, + begin.offset, + end.getKey().begin(), + end.getKey().size(), + end.orEqual, + end.offset, + mapper.begin(), + mapper.size(), + limits.rows, + limits.bytes, + FDB_STREAMING_MODE_EXACT, + 0, + snapshot, + reverse); + return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { + const FdbCApi::FDBMappedKeyValue* kvms; int count; FdbCApi::fdb_bool_t more; - FdbCApi::fdb_error_t error = api->futureGetKeyValueArray(f, &kvs, &count, &more); + FdbCApi::fdb_error_t error = api->futureGetMappedKeyValueArray(f, &kvms, &count, &more); ASSERT(!error); // The memory for this is stored in the FDBFuture and is released when the future gets destroyed - return RangeResult(RangeResultRef(VectorRef((KeyValueRef*)kvs, count), more), Arena()); + return MappedRangeResult( + MappedRangeResultRef(VectorRef((MappedKeyValueRef*)kvms, count), more), Arena()); }); } @@ -555,11 +556,8 @@ void DLApi::init() { 
"fdb_transaction_get_addresses_for_key", headerVersion >= 0); loadClientFunction(&api->transactionGetRange, lib, fdbCPath, "fdb_transaction_get_range", headerVersion >= 0); - loadClientFunction(&api->transactionGetRangeAndFlatMap, - lib, - fdbCPath, - "fdb_transaction_get_range_and_flat_map", - headerVersion >= 700); + loadClientFunction( + &api->transactionGetMappedRange, lib, fdbCPath, "fdb_transaction_get_mapped_range", headerVersion >= 700); loadClientFunction( &api->transactionGetVersionstamp, lib, fdbCPath, "fdb_transaction_get_versionstamp", headerVersion >= 410); loadClientFunction(&api->transactionSet, lib, fdbCPath, "fdb_transaction_set", headerVersion >= 0); @@ -616,6 +614,8 @@ void DLApi::init() { loadClientFunction(&api->futureGetKeyArray, lib, fdbCPath, "fdb_future_get_key_array", headerVersion >= 700); loadClientFunction( &api->futureGetKeyValueArray, lib, fdbCPath, "fdb_future_get_keyvalue_array", headerVersion >= 0); + loadClientFunction( + &api->futureGetMappedKeyValueArray, lib, fdbCPath, "fdb_future_get_mappedkeyvalue_array", headerVersion >= 700); loadClientFunction(&api->futureSetCallback, lib, fdbCPath, "fdb_future_set_callback", headerVersion >= 0); loadClientFunction(&api->futureCancel, lib, fdbCPath, "fdb_future_cancel", headerVersion >= 0); loadClientFunction(&api->futureDestroy, lib, fdbCPath, "fdb_future_destroy", headerVersion >= 0); @@ -861,15 +861,15 @@ ThreadFuture MultiVersionTransaction::getRange(const KeyRangeRef& k return abortableFuture(f, tr.onChange); } -ThreadFuture MultiVersionTransaction::getRangeAndFlatMap(const KeySelectorRef& begin, - const KeySelectorRef& end, - const StringRef& mapper, - GetRangeLimits limits, - bool snapshot, - bool reverse) { +ThreadFuture MultiVersionTransaction::getMappedRange(const KeySelectorRef& begin, + const KeySelectorRef& end, + const StringRef& mapper, + GetRangeLimits limits, + bool snapshot, + bool reverse) { auto tr = getTransaction(); - auto f = tr.transaction ? 
tr.transaction->getRangeAndFlatMap(begin, end, mapper, limits, snapshot, reverse) - : makeTimeout(); + auto f = tr.transaction ? tr.transaction->getMappedRange(begin, end, mapper, limits, snapshot, reverse) + : makeTimeout(); return abortableFuture(f, tr.onChange); } diff --git a/fdbclient/MultiVersionTransaction.h b/fdbclient/MultiVersionTransaction.h index f8bde8c000..87556b4c67 100644 --- a/fdbclient/MultiVersionTransaction.h +++ b/fdbclient/MultiVersionTransaction.h @@ -38,6 +38,9 @@ struct FdbCApi : public ThreadSafeReferenceCounted { typedef struct FDB_database FDBDatabase; typedef struct FDB_transaction FDBTransaction; + typedef int fdb_error_t; + typedef int fdb_bool_t; + #pragma pack(push, 4) typedef struct key { const uint8_t* key; @@ -49,6 +52,35 @@ struct FdbCApi : public ThreadSafeReferenceCounted { const void* value; int valueLength; } FDBKeyValue; + +#pragma pack(pop) + + /* Memory layout of KeySelectorRef. */ + typedef struct keyselector { + FDBKey key; + /* orEqual and offset have not be tested in C binding. Just a placeholder. */ + fdb_bool_t orEqual; + int offset; + } FDBKeySelector; + + /* Memory layout of GetRangeReqAndResultRef. */ + typedef struct getrangereqandresult { + FDBKeySelector begin; + FDBKeySelector end; + FDBKeyValue* data; + int m_size, m_capacity; + } FDBGetRangeReqAndResult; + + typedef struct mappedkeyvalue { + FDBKey key; + FDBKey value; + /* It's complicated to map a std::variant to C. For now we assume the underlying requests are always getRange + * and take the shortcut. 
*/ + FDBGetRangeReqAndResult getRange; + unsigned char buffer[32]; + } FDBMappedKeyValue; + +#pragma pack(push, 4) typedef struct keyrange { const void* beginKey; int beginKeyLength; @@ -57,9 +89,6 @@ struct FdbCApi : public ThreadSafeReferenceCounted { } FDBKeyRange; #pragma pack(pop) - typedef int fdb_error_t; - typedef int fdb_bool_t; - typedef struct readgranulecontext { // User context to pass along to functions void* userContext; @@ -144,23 +173,23 @@ struct FdbCApi : public ThreadSafeReferenceCounted { int iteration, fdb_bool_t snapshot, fdb_bool_t reverse); - FDBFuture* (*transactionGetRangeAndFlatMap)(FDBTransaction* tr, - uint8_t const* beginKeyName, - int beginKeyNameLength, - fdb_bool_t beginOrEqual, - int beginOffset, - uint8_t const* endKeyName, - int endKeyNameLength, - fdb_bool_t endOrEqual, - int endOffset, - uint8_t const* mapper_name, - int mapper_name_length, - int limit, - int targetBytes, - FDBStreamingMode mode, - int iteration, - fdb_bool_t snapshot, - fdb_bool_t reverse); + FDBFuture* (*transactionGetMappedRange)(FDBTransaction* tr, + uint8_t const* beginKeyName, + int beginKeyNameLength, + fdb_bool_t beginOrEqual, + int beginOffset, + uint8_t const* endKeyName, + int endKeyNameLength, + fdb_bool_t endOrEqual, + int endOffset, + uint8_t const* mapper_name, + int mapper_name_length, + int limit, + int targetBytes, + FDBStreamingMode mode, + int iteration, + fdb_bool_t snapshot, + fdb_bool_t reverse); FDBFuture* (*transactionGetVersionstamp)(FDBTransaction* tr); void (*transactionSet)(FDBTransaction* tr, @@ -236,6 +265,10 @@ struct FdbCApi : public ThreadSafeReferenceCounted { fdb_error_t (*futureGetKeyRangeArray)(FDBFuture* f, const FDBKeyRange** out_keyranges, int* outCount); fdb_error_t (*futureGetKeyArray)(FDBFuture* f, FDBKey const** outKeys, int* outCount); fdb_error_t (*futureGetKeyValueArray)(FDBFuture* f, FDBKeyValue const** outKV, int* outCount, fdb_bool_t* outMore); + fdb_error_t (*futureGetMappedKeyValueArray)(FDBFuture* f, + 
FDBMappedKeyValue const** outKVM, + int* outCount, + fdb_bool_t* outMore); fdb_error_t (*futureSetCallback)(FDBFuture* f, FDBCallback callback, void* callback_parameter); void (*futureCancel)(FDBFuture* f); void (*futureDestroy)(FDBFuture* f); @@ -281,12 +314,12 @@ public: GetRangeLimits limits, bool snapshot = false, bool reverse = false) override; - ThreadFuture getRangeAndFlatMap(const KeySelectorRef& begin, - const KeySelectorRef& end, - const StringRef& mapper, - GetRangeLimits limits, - bool snapshot, - bool reverse) override; + ThreadFuture getMappedRange(const KeySelectorRef& begin, + const KeySelectorRef& end, + const StringRef& mapper, + GetRangeLimits limits, + bool snapshot, + bool reverse) override; ThreadFuture>> getAddressesForKey(const KeyRef& key) override; ThreadFuture> getVersionstamp() override; ThreadFuture getEstimatedRangeSizeBytes(const KeyRangeRef& keys) override; @@ -434,12 +467,12 @@ public: GetRangeLimits limits, bool snapshot = false, bool reverse = false) override; - ThreadFuture getRangeAndFlatMap(const KeySelectorRef& begin, - const KeySelectorRef& end, - const StringRef& mapper, - GetRangeLimits limits, - bool snapshot, - bool reverse) override; + ThreadFuture getMappedRange(const KeySelectorRef& begin, + const KeySelectorRef& end, + const StringRef& mapper, + GetRangeLimits limits, + bool snapshot, + bool reverse) override; ThreadFuture>> getAddressesForKey(const KeyRef& key) override; ThreadFuture> getVersionstamp() override; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 8d094401a4..1d4c898925 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -171,8 +171,8 @@ void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageSe TSSEndpointData(tssi.id(), tssi.getKey.getEndpoint(), metrics)); queueModel.updateTssEndpoint(ssi.getKeyValues.getEndpoint().token.first(), TSSEndpointData(tssi.id(), tssi.getKeyValues.getEndpoint(), metrics)); - 
queueModel.updateTssEndpoint(ssi.getKeyValuesAndFlatMap.getEndpoint().token.first(), - TSSEndpointData(tssi.id(), tssi.getKeyValuesAndFlatMap.getEndpoint(), metrics)); + queueModel.updateTssEndpoint(ssi.getMappedKeyValues.getEndpoint().token.first(), + TSSEndpointData(tssi.id(), tssi.getMappedKeyValues.getEndpoint(), metrics)); queueModel.updateTssEndpoint(ssi.getKeyValuesStream.getEndpoint().token.first(), TSSEndpointData(tssi.id(), tssi.getKeyValuesStream.getEndpoint(), metrics)); @@ -196,7 +196,7 @@ void DatabaseContext::removeTssMapping(StorageServerInterface const& ssi) { queueModel.removeTssEndpoint(ssi.getValue.getEndpoint().token.first()); queueModel.removeTssEndpoint(ssi.getKey.getEndpoint().token.first()); queueModel.removeTssEndpoint(ssi.getKeyValues.getEndpoint().token.first()); - queueModel.removeTssEndpoint(ssi.getKeyValuesAndFlatMap.getEndpoint().token.first()); + queueModel.removeTssEndpoint(ssi.getMappedKeyValues.getEndpoint().token.first()); queueModel.removeTssEndpoint(ssi.getKeyValuesStream.getEndpoint().token.first()); queueModel.removeTssEndpoint(ssi.watchValue.getEndpoint().token.first()); @@ -476,9 +476,9 @@ ACTOR Future tssLogger(DatabaseContext* cx) { tssEv, "GetKeyValuesLatency", it.second->SSgetKeyValuesLatency, it.second->TSSgetKeyValuesLatency); traceTSSPercentiles(tssEv, "GetKeyLatency", it.second->SSgetKeyLatency, it.second->TSSgetKeyLatency); traceTSSPercentiles(tssEv, - "GetKeyValuesAndFlatMapLatency", - it.second->SSgetKeyValuesAndFlatMapLatency, - it.second->TSSgetKeyValuesAndFlatMapLatency); + "GetMappedKeyValuesLatency", + it.second->SSgetMappedKeyValuesLatency, + it.second->TSSgetMappedKeyValuesLatency); it.second->clear(); } @@ -1314,7 +1314,7 @@ DatabaseContext::DatabaseContext(Reference const& data) { + if (rows != GetRangeLimits::ROW_LIMIT_UNLIMITED) { + ASSERT(data.size() <= rows); + rows -= data.size(); + } + + minRows = std::max(0, minRows - data.size()); + + // TODO: For now, expectedSize only considers the size of the 
original key values, but not the underlying queries or + // results. Also, double check it is correct when dealing with sizeof(MappedKeyValueRef). + if (bytes != GetRangeLimits::BYTE_LIMIT_UNLIMITED) + bytes = std::max(0, bytes - (int)data.expectedSize() - (8 - (int)sizeof(MappedKeyValueRef)) * data.size()); +} + +void GetRangeLimits::decrement(MappedKeyValueRef const& data) { + minRows = std::max(0, minRows - 1); + if (rows != GetRangeLimits::ROW_LIMIT_UNLIMITED) + rows--; + // TODO: For now, expectedSize only considers the size of the original key values, but not the underlying queries or + // results. Also, double check it is correct when dealing with sizeof(MappedKeyValueRef). + if (bytes != GetRangeLimits::BYTE_LIMIT_UNLIMITED) + bytes = std::max(0, bytes - (int)8 - (int)data.expectedSize()); +} + // True if either the row or byte limit has been reached bool GetRangeLimits::isReached() { return rows == 0 || (bytes == 0 && minRows == 0); @@ -3355,21 +3379,21 @@ template RequestStream StorageServerInterface::*getRangeRequestStream() { if constexpr (std::is_same::value) { return &StorageServerInterface::getKeyValues; - } else if (std::is_same::value) { - return &StorageServerInterface::getKeyValuesAndFlatMap; + } else if (std::is_same::value) { + return &StorageServerInterface::getMappedKeyValues; } else { UNREACHABLE(); } } -ACTOR template -Future getExactRange(Reference trState, - Version version, - KeyRange keys, - Key mapper, - GetRangeLimits limits, - Reverse reverse) { - state RangeResult output; +ACTOR template +Future getExactRange(Reference trState, + Version version, + KeyRange keys, + Key mapper, + GetRangeLimits limits, + Reverse reverse) { + state RangeResultFamily output; state Span span("NAPI:getExactRange"_loc, trState->spanID); // printf("getExactRange( '%s', '%s' )\n", keys.begin.toString().c_str(), keys.end.toString().c_str()); @@ -3547,14 +3571,14 @@ Future resolveKey(Reference trState, KeySelector const& k return getKey(trState, key, 
version); } -ACTOR template -Future getRangeFallback(Reference trState, - Version version, - KeySelector begin, - KeySelector end, - Key mapper, - GetRangeLimits limits, - Reverse reverse) { +ACTOR template +Future getRangeFallback(Reference trState, + Version version, + KeySelector begin, + KeySelector end, + Key mapper, + GetRangeLimits limits, + Reverse reverse) { if (version == latestVersion) { state Transaction transaction(trState->cx); transaction.setOption(FDBTransactionOptions::CAUSAL_READ_RISKY); @@ -3570,16 +3594,16 @@ Future getRangeFallback(Reference trState, state Key b = wait(fb); state Key e = wait(fe); if (b >= e) { - return RangeResult(); + return RangeResultFamily(); } // if e is allKeys.end, we have read through the end of the database // if b is allKeys.begin, we have either read through the beginning of the database, // or allKeys.begin exists in the database and will be part of the conflict range anyways - RangeResult _r = wait(getExactRange( + RangeResultFamily _r = wait(getExactRange( trState, version, KeyRangeRef(b, e), mapper, limits, reverse)); - RangeResult r = _r; + RangeResultFamily r = _r; if (b == allKeys.begin && ((reverse && !r.more) || !reverse)) r.readToBegin = true; @@ -3603,7 +3627,31 @@ Future getRangeFallback(Reference trState, return r; } +int64_t inline getRangeResultFamilyBytes(RangeResultRef result) { + return result.expectedSize(); +} + +int64_t inline getRangeResultFamilyBytes(MappedRangeResultRef result) { + int64_t bytes = 0; + for (const MappedKeyValueRef& mappedKeyValue : result) { + bytes += mappedKeyValue.key.size() + mappedKeyValue.value.size(); + + auto& reqAndResult = mappedKeyValue.reqAndResult; + if (std::holds_alternative(reqAndResult)) { + auto getValue = std::get(reqAndResult); + bytes += getValue.expectedSize(); + } else if (std::holds_alternative(reqAndResult)) { + auto getRange = std::get(reqAndResult); + bytes += getRange.result.expectedSize(); + } else { + throw internal_error(); + } + } + return 
bytes; +} + // TODO: Client should add mapped keys to conflict ranges. +ACTOR template // RangeResult or MappedRangeResult void getRangeFinished(Reference trState, double startTime, KeySelector begin, @@ -3611,11 +3659,8 @@ void getRangeFinished(Reference trState, Snapshot snapshot, Promise> conflictRange, Reverse reverse, - RangeResult result) { - int64_t bytes = 0; - for (const KeyValueRef& kv : result) { - bytes += kv.key.size() + kv.value.size(); - } + RangeResultFamily result) { + int64_t bytes = getRangeResultFamilyBytes(result); trState->cx->transactionBytesRead += bytes; trState->cx->transactionKeysRead += result.size(); @@ -3657,24 +3702,26 @@ void getRangeFinished(Reference trState, } } -// GetKeyValuesFamilyRequest: GetKeyValuesRequest or GetKeyValuesAndFlatMapRequest -// GetKeyValuesFamilyReply: GetKeyValuesReply or GetKeyValuesAndFlatMapReply -// Sadly we need GetKeyValuesFamilyReply because cannot do something like: state -// REPLY_TYPE(GetKeyValuesFamilyRequest) rep; -ACTOR template -Future getRange(Reference trState, - Future fVersion, - KeySelector begin, - KeySelector end, - Key mapper, - GetRangeLimits limits, - Promise> conflictRange, - Snapshot snapshot, - Reverse reverse) { +ACTOR template +Future getRange(Reference trState, + Future fVersion, + KeySelector begin, + KeySelector end, + Key mapper, + GetRangeLimits limits, + Promise> conflictRange, + Snapshot snapshot, + Reverse reverse) { + // state using RangeResultRefFamily = typename RangeResultFamily::RefType; state GetRangeLimits originalLimits(limits); state KeySelector originalBegin = begin; state KeySelector originalEnd = end; - state RangeResult output; + state RangeResultFamily output; state Span span("NAPI:getRange"_loc, trState->spanID); try { @@ -3822,15 +3869,16 @@ Future getRange(Reference trState, bool readToBegin = output.readToBegin; bool readThroughEnd = output.readThroughEnd; - output = RangeResult(RangeResultRef(rep.data, modifiedSelectors || limits.isReached() || 
rep.more), - rep.arena); + using RangeResultRefFamily = typename RangeResultFamily::RefType; + output = RangeResultFamily( + RangeResultRefFamily(rep.data, modifiedSelectors || limits.isReached() || rep.more), rep.arena); output.readToBegin = readToBegin; output.readThroughEnd = readThroughEnd; if (BUGGIFY && limits.hasByteLimit() && output.size() > std::max(1, originalLimits.minRows)) { // Copy instead of resizing because TSS maybe be using output's arena for comparison. This only // happens in simulation so it's fine - RangeResult copy; + RangeResultFamily copy; int newSize = deterministicRandom()->randomInt(std::max(1, originalLimits.minRows), output.size()); for (int i = 0; i < newSize; i++) { @@ -3876,8 +3924,9 @@ Future getRange(Reference trState, TEST(true); // !GetKeyValuesFamilyReply.more and modifiedSelectors in getRange if (!rep.data.size()) { - RangeResult result = wait(getRangeFallback( - trState, version, originalBegin, originalEnd, mapper, originalLimits, reverse)); + RangeResultFamily result = wait( + getRangeFallback( + trState, version, originalBegin, originalEnd, mapper, originalLimits, reverse)); getRangeFinished( trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, result); return result; @@ -3907,8 +3956,9 @@ Future getRange(Reference trState, Reverse{ reverse ? 
(end - 1).isBackward() : begin.isBackward() }); if (e.code() == error_code_wrong_shard_server) { - RangeResult result = wait(getRangeFallback( - trState, version, originalBegin, originalEnd, mapper, originalLimits, reverse)); + RangeResultFamily result = wait( + getRangeFallback( + trState, version, originalBegin, originalEnd, mapper, originalLimits, reverse)); getRangeFinished( trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, result); return result; @@ -4461,7 +4511,7 @@ Future getRange(Reference const& trState, KeySelector const& end, GetRangeLimits const& limits, Reverse const& reverse) { - return getRange( + return getRange( trState, fVersion, begin, end, ""_sr, limits, Promise>(), Snapshot::True, reverse); } @@ -4755,25 +4805,25 @@ template void increaseCounterForRequest(Database cx) { if constexpr (std::is_same::value) { ++cx->transactionGetRangeRequests; - } else if (std::is_same::value) { - ++cx->transactionGetRangeAndFlatMapRequests; + } else if (std::is_same::value) { + ++cx->transactionGetMappedRangeRequests; } else { UNREACHABLE(); } } -template -Future Transaction::getRangeInternal(const KeySelector& begin, - const KeySelector& end, - const Key& mapper, - GetRangeLimits limits, - Snapshot snapshot, - Reverse reverse) { +template +Future Transaction::getRangeInternal(const KeySelector& begin, + const KeySelector& end, + const Key& mapper, + GetRangeLimits limits, + Snapshot snapshot, + Reverse reverse) { ++trState->cx->transactionLogicalReads; increaseCounterForRequest(trState->cx); if (limits.isReached()) - return RangeResult(); + return RangeResultFamily(); if (!limits.isValid()) return range_limits_invalid(); @@ -4794,15 +4844,21 @@ Future Transaction::getRangeInternal(const KeySelector& begin, if (b.offset >= e.offset && b.getKey() >= e.getKey()) { TEST(true); // Native range inverted - return RangeResult(); + return RangeResultFamily(); } + if (!snapshot && !std::is_same_v) { + // Currently, NativeAPI does not 
support serialization for getMappedRange. You should consider use + // ReadYourWrites APIs which wraps around NativeAPI and provides serialization for getMappedRange. (Even if + // you don't want RYW, you may use ReadYourWrites APIs with RYW disabled.) + throw unsupported_operation(); + } Promise> conflictRange; if (!snapshot) { extraConflictRanges.push_back(conflictRange.getFuture()); } - return ::getRange( + return ::getRange( trState, getReadVersion(), b, e, mapper, limits, conflictRange, snapshot, reverse); } @@ -4811,16 +4867,17 @@ Future Transaction::getRange(const KeySelector& begin, GetRangeLimits limits, Snapshot snapshot, Reverse reverse) { - return getRangeInternal(begin, end, ""_sr, limits, snapshot, reverse); + return getRangeInternal( + begin, end, ""_sr, limits, snapshot, reverse); } -Future Transaction::getRangeAndFlatMap(const KeySelector& begin, - const KeySelector& end, - const Key& mapper, - GetRangeLimits limits, - Snapshot snapshot, - Reverse reverse) { - return getRangeInternal( +Future Transaction::getMappedRange(const KeySelector& begin, + const KeySelector& end, + const Key& mapper, + GetRangeLimits limits, + Snapshot snapshot, + Reverse reverse) { + return getRangeInternal( begin, end, mapper, limits, snapshot, reverse); } diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index 29e2dda610..d2ee31d20b 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -309,13 +309,23 @@ public: reverse); } - [[nodiscard]] Future getRangeAndFlatMap(const KeySelector& begin, - const KeySelector& end, - const Key& mapper, - GetRangeLimits limits, - Snapshot = Snapshot::False, - Reverse = Reverse::False); + [[nodiscard]] Future getMappedRange(const KeySelector& begin, + const KeySelector& end, + const Key& mapper, + GetRangeLimits limits, + Snapshot = Snapshot::False, + Reverse = Reverse::False); +private: + template + Future getRangeInternal(const KeySelector& begin, + const KeySelector& end, + const Key& 
mapper, + GetRangeLimits limits, + Snapshot snapshot, + Reverse reverse); + +public: // A method for streaming data from the storage server that is more efficient than getRange when reading large // amounts of data [[nodiscard]] Future getRangeStream(const PromiseStream>& results, diff --git a/fdbclient/PaxosConfigTransaction.h b/fdbclient/PaxosConfigTransaction.h index 3854d4be96..276450c8a4 100644 --- a/fdbclient/PaxosConfigTransaction.h +++ b/fdbclient/PaxosConfigTransaction.h @@ -50,12 +50,12 @@ public: GetRangeLimits limits, Snapshot = Snapshot::False, Reverse = Reverse::False) override; - Future getRangeAndFlatMap(KeySelector begin, - KeySelector end, - Key mapper, - GetRangeLimits limits, - Snapshot = Snapshot::False, - Reverse = Reverse::False) override { + Future getMappedRange(KeySelector begin, + KeySelector end, + Key mapper, + GetRangeLimits limits, + Snapshot = Snapshot::False, + Reverse = Reverse::False) override { throw client_invalid_operation(); } void set(KeyRef const& key, ValueRef const& value) override; diff --git a/fdbclient/RYWIterator.h b/fdbclient/RYWIterator.h index 90ab1884e0..fb5b4768bd 100644 --- a/fdbclient/RYWIterator.h +++ b/fdbclient/RYWIterator.h @@ -44,7 +44,7 @@ public: ExtStringRef beginKey(); ExtStringRef endKey(); - const KeyValueRef* kv(Arena& arena); + virtual const KeyValueRef* kv(Arena& arena); RYWIterator& operator++(); @@ -61,14 +61,14 @@ public: void bypassUnreadableProtection() { bypassUnreadable = true; } - WriteMap::iterator& extractWriteMapIterator(); + virtual WriteMap::iterator& extractWriteMapIterator(); // Really this should return an iterator by value, but for performance it's convenient to actually grab the internal // one. Consider copying the return value if performance isn't critical. 
If you modify the returned iterator, it // invalidates this iterator until the next call to skip() void dbg(); -private: +protected: int begin_key_cmp; // -1 if cache.beginKey() < writes.beginKey(), 0 if ==, +1 if > int end_key_cmp; // SnapshotCache::iterator cache; diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index ee021b6a05..bc12302472 100644 --- a/fdbclient/ReadYourWrites.actor.cpp +++ b/fdbclient/ReadYourWrites.actor.cpp @@ -75,13 +75,13 @@ public: }; template - struct GetRangeAndFlatMapReq { - GetRangeAndFlatMapReq(KeySelector begin, KeySelector end, Key mapper, GetRangeLimits limits) + struct GetMappedRangeReq { + GetMappedRangeReq(KeySelector begin, KeySelector end, Key mapper, GetRangeLimits limits) : begin(begin), end(end), mapper(mapper), limits(limits) {} KeySelector begin, end; Key mapper; GetRangeLimits limits; - using Result = RangeResult; + using Result = MappedRangeResult; }; // read() Performs a read (get, getKey, getRange, etc), in the context of the given transaction. Snapshot or RYW @@ -213,46 +213,17 @@ public: return v; } - ACTOR template - static Future readThroughAndFlatMap(ReadYourWritesTransaction* ryw, - GetRangeAndFlatMapReq read, - Snapshot snapshot) { - if (backwards && read.end.offset > 1) { - // FIXME: Optimistically assume that this will not run into the system keys, and only reissue if the result - // actually does. - Key key = wait(ryw->tr.getKey(read.end, snapshot)); - if (key > ryw->getMaxReadKey()) - read.end = firstGreaterOrEqual(ryw->getMaxReadKey()); - else - read.end = KeySelector(firstGreaterOrEqual(key), key.arena()); - } - - RangeResult v = wait(ryw->tr.getRangeAndFlatMap( - read.begin, read.end, read.mapper, read.limits, snapshot, backwards ? 
Reverse::True : Reverse::False)); - KeyRef maxKey = ryw->getMaxReadKey(); - if (v.size() > 0) { - if (!backwards && v[v.size() - 1].key >= maxKey) { - state RangeResult _v = v; - int i = _v.size() - 2; - for (; i >= 0 && _v[i].key >= maxKey; --i) { - } - return RangeResult(RangeResultRef(VectorRef(&_v[0], i + 1), false), _v.arena()); - } - } - - return v; - } - // addConflictRange(ryw,read,result) is called after a serializable read and is responsible for adding the relevant // conflict range + template static void addConflictRange(ReadYourWritesTransaction* ryw, GetValueReq read, WriteMap::iterator& it, Optional result) { // it will already point to the right segment (see the calling code in read()), so we don't need to skip // read.key will be copied into ryw->arena inside of updateConflictMap if it is being added - ryw->updateConflictMap(read.key, it); + updateConflictMap(ryw, read.key, it); } static void addConflictRange(ReadYourWritesTransaction* ryw, GetKeyReq read, WriteMap::iterator& it, Key result) { @@ -270,10 +241,11 @@ public: ryw->updateConflictMap(readRange, it); } + template static void addConflictRange(ReadYourWritesTransaction* ryw, GetRangeReq read, WriteMap::iterator& it, - RangeResult const& result) { + RangeResultFamily& result) { KeyRef rangeBegin, rangeEnd; bool endInArena = false; @@ -302,13 +274,15 @@ public: KeyRangeRef readRange = KeyRangeRef(KeyRef(ryw->arena, rangeBegin), endInArena ? rangeEnd : KeyRef(ryw->arena, rangeEnd)); it.skip(readRange.begin); - ryw->updateConflictMap(readRange, it); + updateConflictMap(ryw, readRange, it); } + // In the case where RangeResultFamily is MappedRangeResult, it only adds the primary range to conflict. 
+ template static void addConflictRange(ReadYourWritesTransaction* ryw, GetRangeReq read, WriteMap::iterator& it, - RangeResult const& result) { + RangeResultFamily& result) { KeyRef rangeBegin, rangeEnd; bool endInArena = false; @@ -336,7 +310,39 @@ public: KeyRangeRef readRange = KeyRangeRef(KeyRef(ryw->arena, rangeBegin), endInArena ? rangeEnd : KeyRef(ryw->arena, rangeEnd)); it.skip(readRange.begin); - ryw->updateConflictMap(readRange, it); + updateConflictMap(ryw, readRange, it); + } + + template + static void updateConflictMap(ReadYourWritesTransaction* ryw, KeyRef const& key, WriteMap::iterator& it) { + // it.skip( key ); + // ASSERT( it.beginKey() <= key && key < it.endKey() ); + if (mustUnmodified && !it.is_unmodified_range()) { + throw get_mapped_range_reads_your_writes(); + } + if (it.is_unmodified_range() || (it.is_operation() && !it.is_independent())) { + ryw->approximateSize += 2 * key.expectedSize() + 1 + sizeof(KeyRangeRef); + ryw->readConflicts.insert(singleKeyRange(key, ryw->arena), true); + } + } + + template + static void updateConflictMap(ReadYourWritesTransaction* ryw, KeyRangeRef const& keys, WriteMap::iterator& it) { + // it.skip( keys.begin ); + // ASSERT( it.beginKey() <= keys.begin && keys.begin < it.endKey() ); + for (; it.beginKey() < keys.end; ++it) { + if (mustUnmodified && !it.is_unmodified_range()) { + throw get_mapped_range_reads_your_writes(); + } + if (it.is_unmodified_range() || (it.is_operation() && !it.is_independent())) { + KeyRangeRef insert_range = KeyRangeRef(std::max(keys.begin, it.beginKey().toArenaOrRef(ryw->arena)), + std::min(keys.end, it.endKey().toArenaOrRef(ryw->arena))); + if (!insert_range.empty()) { + ryw->approximateSize += keys.expectedSize() + sizeof(KeyRangeRef); + ryw->readConflicts.insert(insert_range, true); + } + } + } } ACTOR template @@ -349,15 +355,6 @@ public: } } ACTOR template - static Future readWithConflictRangeThroughAndFlatMap(ReadYourWritesTransaction* ryw, - Req req, - Snapshot snapshot) { - 
choose { - when(typename Req::Result result = wait(readThroughAndFlatMap(ryw, req, snapshot))) { return result; } - when(wait(ryw->resetPromise.getFuture())) { throw internal_error(); } - } - } - ACTOR template static Future readWithConflictRangeSnapshot(ReadYourWritesTransaction* ryw, Req req) { state SnapshotCache::iterator it(&ryw->cache, &ryw->writes); choose { @@ -393,19 +390,6 @@ public: return readWithConflictRangeRYW(ryw, req, snapshot); } - template - static inline Future readWithConflictRangeAndFlatMap(ReadYourWritesTransaction* ryw, - Req const& req, - Snapshot snapshot) { - // For now, getRangeAndFlatMap is only supported if transaction use snapshot isolation AND read-your-writes is - // disabled. - if (snapshot && ryw->options.readYourWritesDisabled) { - return readWithConflictRangeThroughAndFlatMap(ryw, req, snapshot); - } - TEST(true); // readWithConflictRangeRYW not supported for getRangeAndFlatMap - throw client_invalid_operation(); - } - template static void resolveKeySelectorFromCache(KeySelector& key, Iter& it, @@ -1126,6 +1110,119 @@ public: return result; } +#ifndef __INTEL_COMPILER +#pragma region GetMappedRange +#endif + + template + static Future read(ReadYourWritesTransaction* ryw, GetMappedRangeReq read, Iter* it) { + return getMappedRangeValue(ryw, read.begin, read.end, read.mapper, read.limits, it); + }; + + template + static Future read(ReadYourWritesTransaction* ryw, GetMappedRangeReq read, Iter* it) { + throw unsupported_operation(); + // TODO: Support reverse. return getMappedRangeValueBack(ryw, read.begin, read.end, read.mapper, + // read.limits, it); + }; + + ACTOR template + static Future readThrough(ReadYourWritesTransaction* ryw, + GetMappedRangeReq read, + Snapshot snapshot) { + if (backwards && read.end.offset > 1) { + // FIXME: Optimistically assume that this will not run into the system keys, and only reissue if the result + // actually does. 
+ Key key = wait(ryw->tr.getKey(read.end, snapshot)); + if (key > ryw->getMaxReadKey()) + read.end = firstGreaterOrEqual(ryw->getMaxReadKey()); + else + read.end = KeySelector(firstGreaterOrEqual(key), key.arena()); + } + + MappedRangeResult v = wait(ryw->tr.getMappedRange( + read.begin, read.end, read.mapper, read.limits, snapshot, backwards ? Reverse::True : Reverse::False)); + return v; + } + + template + static void addConflictRangeAndMustUnmodified(ReadYourWritesTransaction* ryw, + GetMappedRangeReq read, + WriteMap::iterator& it, + MappedRangeResult result) { + // Primary getRange. + addConflictRange( + ryw, GetRangeReq(read.begin, read.end, read.limits), it, result); + + // Secondary getValue/getRanges. + for (const auto& mappedKeyValue : result) { + const auto& reqAndResult = mappedKeyValue.reqAndResult; + if (std::holds_alternative(reqAndResult)) { + auto getValue = std::get(reqAndResult); + // GetValueReq variation of addConflictRange require it to point at the right segment. + it.skip(getValue.key); + // The result is not used in GetValueReq variation of addConflictRange. Let's just pass in a + // placeholder. + addConflictRange(ryw, GetValueReq(getValue.key), it, Optional()); + } else if (std::holds_alternative(reqAndResult)) { + auto getRange = std::get(reqAndResult); + // We only support forward scan for secondary getRange requests. + // The limits are not used in addConflictRange. Let's just pass in a placeholder. + addConflictRange( + ryw, GetRangeReq(getRange.begin, getRange.end, GetRangeLimits()), it, getRange.result); + } else { + throw internal_error(); + } + } + } + + // For Snapshot::True and NOT readYourWritesDisabled. 
+ ACTOR template + static Future readWithConflictRangeRYW(ReadYourWritesTransaction* ryw, + GetMappedRangeReq req, + Snapshot snapshot) { + choose { + when(MappedRangeResult result = wait(readThrough(ryw, req, Snapshot::True))) { + // Insert read conflicts (so that it supported Snapshot::True) and check it is not modified (so it masks + // sure not break RYW semantic while not implementing RYW) for both the primary getRange and all + // underlying getValue/getRanges. + WriteMap::iterator writes(&ryw->writes); + addConflictRangeAndMustUnmodified(ryw, req, writes, result); + return result; + } + when(wait(ryw->resetPromise.getFuture())) { throw internal_error(); } + } + } + + template + static inline Future readWithConflictRangeForGetMappedRange( + ReadYourWritesTransaction* ryw, + GetMappedRangeReq const& req, + Snapshot snapshot) { + // For now, getMappedRange requires serializable isolation. (Technically it is trivial to add snapshot + // isolation support. But it is not default and is rarely used. So we disallow it until we have thorough test + // coverage for it.) + if (snapshot) { + TEST(true); // getMappedRange not supported for snapshot. + throw unsupported_operation(); + } + // For now, getMappedRange requires read-your-writes being NOT disabled. But the support of RYW is limited + // to throwing get_mapped_range_reads_your_writes error when getMappedRange actually reads your own writes. + // Applications should fall back in their own ways. This is different from what is usually expected from RYW, + // which returns the written value transparently. In another word, it makes sure not break RYW semantics without + // actually implementing reading from the writes. + if (ryw->options.readYourWritesDisabled) { + TEST(true); // getMappedRange not supported for read-your-writes disabled. 
+ throw unsupported_operation(); + } + + return readWithConflictRangeRYW(ryw, req, snapshot); + } + +#ifndef __INTEL_COMPILER +#pragma endregion +#endif + static void triggerWatches(ReadYourWritesTransaction* ryw, KeyRangeRef range, Optional val, @@ -1571,16 +1668,16 @@ Future ReadYourWritesTransaction::getRange(const KeySelector& begin return getRange(begin, end, GetRangeLimits(limit), snapshot, reverse); } -Future ReadYourWritesTransaction::getRangeAndFlatMap(KeySelector begin, - KeySelector end, - Key mapper, - GetRangeLimits limits, - Snapshot snapshot, - Reverse reverse) { +Future ReadYourWritesTransaction::getMappedRange(KeySelector begin, + KeySelector end, + Key mapper, + GetRangeLimits limits, + Snapshot snapshot, + Reverse reverse) { if (getDatabase()->apiVersionAtLeast(630)) { if (specialKeys.contains(begin.getKey()) && specialKeys.begin <= end.getKey() && end.getKey() <= specialKeys.end) { - TEST(true); // Special key space get range (FlatMap) + TEST(true); // Special key space get range (getMappedRange) throw client_invalid_operation(); // Not support special keys. } } else { @@ -1602,8 +1699,8 @@ Future ReadYourWritesTransaction::getRangeAndFlatMap(KeySelector be // This optimization prevents nullptr operations from being added to the conflict range if (limits.isReached()) { - TEST(true); // RYW range read limit 0 (FlatMap) - return RangeResult(); + TEST(true); // RYW range read limit 0 (getMappedRange) + return MappedRangeResult(); } if (!limits.isValid()) @@ -1616,17 +1713,16 @@ Future ReadYourWritesTransaction::getRangeAndFlatMap(KeySelector be end.removeOrEqual(end.arena()); if (begin.offset >= end.offset && begin.getKey() >= end.getKey()) { - TEST(true); // RYW range inverted (FlatMap) - return RangeResult(); + TEST(true); // RYW range inverted (getMappedRange) + return MappedRangeResult(); } - Future result = - reverse ? 
RYWImpl::readWithConflictRangeAndFlatMap( - this, RYWImpl::GetRangeAndFlatMapReq(begin, end, mapper, limits), snapshot) - : RYWImpl::readWithConflictRangeAndFlatMap( - this, RYWImpl::GetRangeAndFlatMapReq(begin, end, mapper, limits), snapshot); + Future result = + reverse ? RYWImpl::readWithConflictRangeForGetMappedRange( + this, RYWImpl::GetMappedRangeReq(begin, end, mapper, limits), snapshot) + : RYWImpl::readWithConflictRangeForGetMappedRange( + this, RYWImpl::GetMappedRangeReq(begin, end, mapper, limits), snapshot); - reading.add(success(result)); return result; } @@ -1761,27 +1857,11 @@ void ReadYourWritesTransaction::addReadConflictRange(KeyRangeRef const& keys) { } void ReadYourWritesTransaction::updateConflictMap(KeyRef const& key, WriteMap::iterator& it) { - // it.skip( key ); - // ASSERT( it.beginKey() <= key && key < it.endKey() ); - if (it.is_unmodified_range() || (it.is_operation() && !it.is_independent())) { - approximateSize += 2 * key.expectedSize() + 1 + sizeof(KeyRangeRef); - readConflicts.insert(singleKeyRange(key, arena), true); - } + RYWImpl::updateConflictMap(this, key, it); } void ReadYourWritesTransaction::updateConflictMap(KeyRangeRef const& keys, WriteMap::iterator& it) { - // it.skip( keys.begin ); - // ASSERT( it.beginKey() <= keys.begin && keys.begin < it.endKey() ); - for (; it.beginKey() < keys.end; ++it) { - if (it.is_unmodified_range() || (it.is_operation() && !it.is_independent())) { - KeyRangeRef insert_range = KeyRangeRef(std::max(keys.begin, it.beginKey().toArenaOrRef(arena)), - std::min(keys.end, it.endKey().toArenaOrRef(arena))); - if (!insert_range.empty()) { - approximateSize += keys.expectedSize() + sizeof(KeyRangeRef); - readConflicts.insert(insert_range, true); - } - } - } + RYWImpl::updateConflictMap(this, keys, it); } void ReadYourWritesTransaction::writeRangeToNativeTransaction(KeyRangeRef const& keys) { diff --git a/fdbclient/ReadYourWrites.h b/fdbclient/ReadYourWrites.h index b8ccd23e54..84bc05e4ef 100644 --- 
a/fdbclient/ReadYourWrites.h +++ b/fdbclient/ReadYourWrites.h @@ -61,6 +61,9 @@ struct TransactionDebugInfo : public ReferenceCounted { // Values returned by a ReadYourWritesTransaction will contain a reference to the transaction's arena. Therefore, // keeping a reference to a value longer than its creating transaction would hold all of the memory generated by the // transaction +// If options.readYourWritesDisabled, rely on NativeAPI to handle everything. Otherwise, read NativeAPI with +// Snapshot::True and handle read conflicts at ReadYourWritesTransaction, write NativeAPI with AddConflictRange::False +// and handle write conflicts at ReadYourWritesTransaction, eventually send this information to NativeAPI on commit. class ReadYourWritesTransaction final : NonCopyable, public ISingleThreadTransaction, public FastAllocated { @@ -104,12 +107,12 @@ public: snapshot, reverse); } - Future getRangeAndFlatMap(KeySelector begin, - KeySelector end, - Key mapper, - GetRangeLimits limits, - Snapshot = Snapshot::False, - Reverse = Reverse::False) override; + Future getMappedRange(KeySelector begin, + KeySelector end, + Key mapper, + GetRangeLimits limits, + Snapshot = Snapshot::False, + Reverse = Reverse::False) override; [[nodiscard]] Future>> getAddressesForKey(const Key& key) override; Future>> getRangeSplitPoints(const KeyRange& range, int64_t chunkSize) override; diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 6ec7778f8e..a65099edd5 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -676,8 +676,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( MAX_STORAGE_COMMIT_TIME, 120.0 ); //The max fsync stall time on the storage server and tlog before marking a disk as failed init( RANGESTREAM_LIMIT_BYTES, 2e6 ); if( randomize && BUGGIFY ) RANGESTREAM_LIMIT_BYTES = 1; init( ENABLE_CLEAR_RANGE_EAGER_READS, true ); - init( QUICK_GET_VALUE_FALLBACK, false ); - init( 
QUICK_GET_KEY_VALUES_FALLBACK, false ); + init( QUICK_GET_VALUE_FALLBACK, true ); + init( QUICK_GET_KEY_VALUES_FALLBACK, true ); + init( QUICK_GET_KEY_VALUES_LIMIT, 2000 ); + init( QUICK_GET_KEY_VALUES_LIMIT_BYTES, 1e7 ); //Wait Failure init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2; diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index 3e25bbe7fe..f962ec78b9 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -615,6 +615,8 @@ public: bool ENABLE_CLEAR_RANGE_EAGER_READS; bool QUICK_GET_VALUE_FALLBACK; bool QUICK_GET_KEY_VALUES_FALLBACK; + int QUICK_GET_KEY_VALUES_LIMIT; + int QUICK_GET_KEY_VALUES_LIMIT_BYTES; // Wait Failure int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS; diff --git a/fdbclient/SimpleConfigTransaction.h b/fdbclient/SimpleConfigTransaction.h index 168b1a6c29..871d0efa97 100644 --- a/fdbclient/SimpleConfigTransaction.h +++ b/fdbclient/SimpleConfigTransaction.h @@ -59,12 +59,12 @@ public: GetRangeLimits limits, Snapshot = Snapshot::False, Reverse = Reverse::False) override; - Future getRangeAndFlatMap(KeySelector begin, - KeySelector end, - Key mapper, - GetRangeLimits limits, - Snapshot = Snapshot::False, - Reverse = Reverse::False) override { + Future getMappedRange(KeySelector begin, + KeySelector end, + Key mapper, + GetRangeLimits limits, + Snapshot = Snapshot::False, + Reverse = Reverse::False) override { throw client_invalid_operation(); } Future commit() override; diff --git a/fdbclient/StorageServerInterface.cpp b/fdbclient/StorageServerInterface.cpp index 3c389b5ab4..dce4df7736 100644 --- a/fdbclient/StorageServerInterface.cpp +++ b/fdbclient/StorageServerInterface.cpp @@ -156,20 +156,20 @@ void TSS_traceMismatch(TraceEvent& event, // range reads and flat map template <> -bool TSS_doCompare(const GetKeyValuesAndFlatMapReply& src, const GetKeyValuesAndFlatMapReply& tss) { +bool TSS_doCompare(const GetMappedKeyValuesReply& src, const 
GetMappedKeyValuesReply& tss) { return src.more == tss.more && src.data == tss.data; } template <> -const char* TSS_mismatchTraceName(const GetKeyValuesAndFlatMapRequest& req) { - return "TSSMismatchGetKeyValuesAndFlatMap"; +const char* TSS_mismatchTraceName(const GetMappedKeyValuesRequest& req) { + return "TSSMismatchGetMappedKeyValues"; } template <> void TSS_traceMismatch(TraceEvent& event, - const GetKeyValuesAndFlatMapRequest& req, - const GetKeyValuesAndFlatMapReply& src, - const GetKeyValuesAndFlatMapReply& tss) { + const GetMappedKeyValuesRequest& req, + const GetMappedKeyValuesReply& src, + const GetMappedKeyValuesReply& tss) { std::string ssResultsString = format("(%d)%s:\n", src.data.size(), src.more ? "+" : ""); for (auto& it : src.data) { ssResultsString += "\n" + it.key.printable() + "=" + traceChecksumValue(it.value); @@ -400,9 +400,9 @@ void TSSMetrics::recordLatency(const GetKeyValuesRequest& req, double ssLatency, } template <> -void TSSMetrics::recordLatency(const GetKeyValuesAndFlatMapRequest& req, double ssLatency, double tssLatency) { - SSgetKeyValuesAndFlatMapLatency.addSample(ssLatency); - TSSgetKeyValuesAndFlatMapLatency.addSample(tssLatency); +void TSSMetrics::recordLatency(const GetMappedKeyValuesRequest& req, double ssLatency, double tssLatency) { + SSgetMappedKeyValuesLatency.addSample(ssLatency); + TSSgetMappedKeyValuesLatency.addSample(tssLatency); } template <> diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h index aef7486c3d..75c9411f18 100644 --- a/fdbclient/StorageServerInterface.h +++ b/fdbclient/StorageServerInterface.h @@ -67,7 +67,7 @@ struct StorageServerInterface { // Throws a wrong_shard_server if the keys in the request or result depend on data outside this server OR if a large // selector offset prevents all data from being read in one range read RequestStream getKeyValues; - RequestStream getKeyValuesAndFlatMap; + RequestStream getMappedKeyValues; RequestStream getShardState; 
RequestStream waitMetrics; @@ -127,8 +127,8 @@ struct StorageServerInterface { RequestStream(getValue.getEndpoint().getAdjustedEndpoint(12)); getKeyValuesStream = RequestStream(getValue.getEndpoint().getAdjustedEndpoint(13)); - getKeyValuesAndFlatMap = - RequestStream(getValue.getEndpoint().getAdjustedEndpoint(14)); + getMappedKeyValues = + RequestStream(getValue.getEndpoint().getAdjustedEndpoint(14)); changeFeedStream = RequestStream(getValue.getEndpoint().getAdjustedEndpoint(15)); overlappingChangeFeeds = @@ -179,7 +179,7 @@ struct StorageServerInterface { streams.push_back(getReadHotRanges.getReceiver()); streams.push_back(getRangeSplitPoints.getReceiver()); streams.push_back(getKeyValuesStream.getReceiver(TaskPriority::LoadBalancedEndpoint)); - streams.push_back(getKeyValuesAndFlatMap.getReceiver(TaskPriority::LoadBalancedEndpoint)); + streams.push_back(getMappedKeyValues.getReceiver(TaskPriority::LoadBalancedEndpoint)); streams.push_back(changeFeedStream.getReceiver()); streams.push_back(overlappingChangeFeeds.getReceiver()); streams.push_back(changeFeedPop.getReceiver()); @@ -362,15 +362,17 @@ struct GetKeyValuesRequest : TimedRequest { } }; -struct GetKeyValuesAndFlatMapReply : public LoadBalancedReply { +struct GetMappedKeyValuesReply : public LoadBalancedReply { constexpr static FileIdentifier file_identifier = 1783067; Arena arena; - VectorRef data; + // MappedKeyValueRef is not string_serialized_traits, so we have to use FlatBuffers. 
+ VectorRef data; + Version version; // useful when latestVersion was requested bool more; bool cached = false; - GetKeyValuesAndFlatMapReply() : version(invalidVersion), more(false), cached(false) {} + GetMappedKeyValuesReply() : version(invalidVersion), more(false), cached(false) {} template void serialize(Ar& ar) { @@ -378,7 +380,7 @@ struct GetKeyValuesAndFlatMapReply : public LoadBalancedReply { } }; -struct GetKeyValuesAndFlatMapRequest : TimedRequest { +struct GetMappedKeyValuesRequest : TimedRequest { constexpr static FileIdentifier file_identifier = 6795747; SpanID spanContext; Arena arena; @@ -390,10 +392,9 @@ struct GetKeyValuesAndFlatMapRequest : TimedRequest { bool isFetchKeys; Optional tags; Optional debugID; - ReplyPromise reply; - - GetKeyValuesAndFlatMapRequest() : isFetchKeys(false) {} + ReplyPromise reply; + GetMappedKeyValuesRequest() : isFetchKeys(false) {} template void serialize(Ar& ar) { serializer(ar, diff --git a/fdbclient/ThreadSafeTransaction.cpp b/fdbclient/ThreadSafeTransaction.cpp index eb50b0f16f..ce2d6c39c4 100644 --- a/fdbclient/ThreadSafeTransaction.cpp +++ b/fdbclient/ThreadSafeTransaction.cpp @@ -258,20 +258,20 @@ ThreadFuture ThreadSafeTransaction::getRange(const KeySelectorRef& }); } -ThreadFuture ThreadSafeTransaction::getRangeAndFlatMap(const KeySelectorRef& begin, - const KeySelectorRef& end, - const StringRef& mapper, - GetRangeLimits limits, - bool snapshot, - bool reverse) { +ThreadFuture ThreadSafeTransaction::getMappedRange(const KeySelectorRef& begin, + const KeySelectorRef& end, + const StringRef& mapper, + GetRangeLimits limits, + bool snapshot, + bool reverse) { KeySelector b = begin; KeySelector e = end; Key h = mapper; ISingleThreadTransaction* tr = this->tr; - return onMainThread([tr, b, e, h, limits, snapshot, reverse]() -> Future { + return onMainThread([tr, b, e, h, limits, snapshot, reverse]() -> Future { tr->checkDeferredError(); - return tr->getRangeAndFlatMap(b, e, h, limits, Snapshot{ snapshot }, 
Reverse{ reverse }); + return tr->getMappedRange(b, e, h, limits, Snapshot{ snapshot }, Reverse{ reverse }); }); } diff --git a/fdbclient/ThreadSafeTransaction.h b/fdbclient/ThreadSafeTransaction.h index f61e0de2e8..6c03262891 100644 --- a/fdbclient/ThreadSafeTransaction.h +++ b/fdbclient/ThreadSafeTransaction.h @@ -106,12 +106,12 @@ public: bool reverse = false) override { return getRange(firstGreaterOrEqual(keys.begin), firstGreaterOrEqual(keys.end), limits, snapshot, reverse); } - ThreadFuture getRangeAndFlatMap(const KeySelectorRef& begin, - const KeySelectorRef& end, - const StringRef& mapper, - GetRangeLimits limits, - bool snapshot, - bool reverse) override; + ThreadFuture getMappedRange(const KeySelectorRef& begin, + const KeySelectorRef& end, + const StringRef& mapper, + GetRangeLimits limits, + bool snapshot, + bool reverse) override; ThreadFuture>> getAddressesForKey(const KeyRef& key) override; ThreadFuture> getVersionstamp() override; ThreadFuture getEstimatedRangeSizeBytes(const KeyRangeRef& keys) override; diff --git a/fdbrpc/TSSComparison.h b/fdbrpc/TSSComparison.h index 88c453a7b0..fcf3286784 100644 --- a/fdbrpc/TSSComparison.h +++ b/fdbrpc/TSSComparison.h @@ -51,12 +51,12 @@ struct TSSMetrics : ReferenceCounted, NonCopyable { ContinuousSample SSgetValueLatency; ContinuousSample SSgetKeyLatency; ContinuousSample SSgetKeyValuesLatency; - ContinuousSample SSgetKeyValuesAndFlatMapLatency; + ContinuousSample SSgetMappedKeyValuesLatency; ContinuousSample TSSgetValueLatency; ContinuousSample TSSgetKeyLatency; ContinuousSample TSSgetKeyValuesLatency; - ContinuousSample TSSgetKeyValuesAndFlatMapLatency; + ContinuousSample TSSgetMappedKeyValuesLatency; std::unordered_map ssErrorsByCode; std::unordered_map tssErrorsByCode; @@ -90,12 +90,12 @@ struct TSSMetrics : ReferenceCounted, NonCopyable { SSgetValueLatency.clear(); SSgetKeyLatency.clear(); SSgetKeyValuesLatency.clear(); - SSgetKeyValuesAndFlatMapLatency.clear(); + SSgetMappedKeyValuesLatency.clear(); 
TSSgetValueLatency.clear(); TSSgetKeyLatency.clear(); TSSgetKeyValuesLatency.clear(); - TSSgetKeyValuesAndFlatMapLatency.clear(); + TSSgetMappedKeyValuesLatency.clear(); tssErrorsByCode.clear(); ssErrorsByCode.clear(); @@ -107,8 +107,8 @@ struct TSSMetrics : ReferenceCounted, NonCopyable { : cc("TSSClientMetrics"), requests("Requests", cc), streamComparisons("StreamComparisons", cc), ssErrors("SSErrors", cc), tssErrors("TSSErrors", cc), tssTimeouts("TSSTimeouts", cc), mismatches("Mismatches", cc), SSgetValueLatency(1000), SSgetKeyLatency(1000), SSgetKeyValuesLatency(1000), - SSgetKeyValuesAndFlatMapLatency(1000), TSSgetValueLatency(1000), TSSgetKeyLatency(1000), - TSSgetKeyValuesLatency(1000), TSSgetKeyValuesAndFlatMapLatency(1000) {} + SSgetMappedKeyValuesLatency(1000), TSSgetValueLatency(1000), TSSgetKeyLatency(1000), + TSSgetKeyValuesLatency(1000), TSSgetMappedKeyValuesLatency(1000) {} }; template diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index 731eb0521a..dd83764289 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -224,7 +224,7 @@ set(FDBSERVER_SRCS workloads/MetricLogging.actor.cpp workloads/MiniCycle.actor.cpp workloads/MutationLogReaderCorrectness.actor.cpp - workloads/GetRangeAndMap.actor.cpp + workloads/GetMappedRange.actor.cpp workloads/ParallelRestore.actor.cpp workloads/Performance.actor.cpp workloads/Ping.actor.cpp diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index cf85b5be77..d4760dd982 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -88,14 +88,14 @@ bool canReplyWith(Error e) { case error_code_server_overloaded: case error_code_tenant_name_required: case error_code_unknown_tenant: - // getRangeAndMap related exceptions that are not retriable: + // getMappedRange related exceptions that are not retriable: case error_code_mapper_bad_index: case error_code_mapper_no_such_key: case error_code_mapper_bad_range_decriptor: case 
error_code_quick_get_key_values_has_more: case error_code_quick_get_value_miss: case error_code_quick_get_key_values_miss: - case error_code_get_key_values_and_map_has_more: + case error_code_get_mapped_key_values_has_more: // case error_code_all_alternatives_failed: return true; default: @@ -820,7 +820,7 @@ public: struct Counters { CounterCollection cc; - Counter allQueries, getKeyQueries, getValueQueries, getRangeQueries, getRangeAndFlatMapQueries, + Counter allQueries, getKeyQueries, getValueQueries, getRangeQueries, getMappedRangeQueries, getRangeStreamQueries, finishedQueries, lowPriorityQueries, rowsQueried, bytesQueried, watchQueries, emptyQueries; @@ -862,7 +862,7 @@ public: Counter wrongShardServer; Counter fetchedVersions; Counter fetchesFromLogs; - // The following counters measure how many of lookups in the getRangeAndFlatMapQueries are effective. "Miss" + // The following counters measure how many of lookups in the getMappedRangeQueries are effective. "Miss" // means fallback if fallback is enabled, otherwise means failure (so that another layer could implement // fallback). 
Counter quickGetValueHit, quickGetValueMiss, quickGetKeyValuesHit, quickGetKeyValuesMiss; @@ -886,7 +886,7 @@ public: Counters(StorageServer* self) : cc("StorageServer", self->thisServerID.toString()), allQueries("QueryQueue", cc), getKeyQueries("GetKeyQueries", cc), getValueQueries("GetValueQueries", cc), - getRangeQueries("GetRangeQueries", cc), getRangeAndFlatMapQueries("GetRangeAndFlatMapQueries", cc), + getRangeQueries("GetRangeQueries", cc), getMappedRangeQueries("GetMappedRangeQueries", cc), getRangeStreamQueries("GetRangeStreamQueries", cc), finishedQueries("FinishedQueries", cc), lowPriorityQueries("LowPriorityQueries", cc), rowsQueried("RowsQueried", cc), bytesQueried("BytesQueried", cc), watchQueries("WatchQueries", cc), emptyQueries("EmptyQueries", cc), @@ -2203,11 +2203,24 @@ void merge(Arena& arena, } } -ACTOR Future> quickGetValue(StorageServer* data, - StringRef key, - Version version, - // To provide span context, tags, debug ID to underlying lookups. - GetKeyValuesAndFlatMapRequest* pOriginalReq) { +static inline void copyOptionalValue(Arena* a, + GetValueReqAndResultRef& getValue, + const Optional& optionalValue) { + std::function contents = [](Value value) { return value.contents(); }; + getValue.result = optionalValue.map(contents); + if (optionalValue.present()) { + a->dependsOn(optionalValue.get().arena()); + } +} +ACTOR Future quickGetValue(StorageServer* data, + StringRef key, + Version version, + Arena* a, + // To provide span context, tags, debug ID to underlying lookups. + GetMappedKeyValuesRequest* pOriginalReq) { + state GetValueReqAndResultRef getValue; + getValue.key = key; + if (data->shards[key]->isReadable()) { try { // TODO: Use a lower level API may be better? Or tweak priorities? 
@@ -2220,7 +2233,8 @@ ACTOR Future> quickGetValue(StorageServer* data, GetValueReply reply = wait(req.reply.getFuture()); if (!reply.error.present()) { ++data->counters.quickGetValueHit; - return reply.value; + copyOptionalValue(a, getValue, reply.value); + return getValue; } // Otherwise fallback. } catch (Error& e) { @@ -2237,8 +2251,9 @@ ACTOR Future> quickGetValue(StorageServer* data, tr.trState->taskID = TaskPriority::DefaultPromiseEndpoint; Future> valueFuture = tr.get(key, Snapshot::True); // TODO: async in case it needs to read from other servers. - state Optional valueOption = wait(valueFuture); - return valueOption; + Optional valueOption = wait(valueFuture); + copyOptionalValue(a, getValue, valueOption); + return getValue; } else { throw quick_get_value_miss(); } @@ -2783,19 +2798,29 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) return Void(); } -ACTOR Future quickGetKeyValues(StorageServer* data, - StringRef prefix, - Version version, - // To provide span context, tags, debug ID to underlying lookups. - GetKeyValuesAndFlatMapRequest* pOriginalReq) { +ACTOR Future quickGetKeyValues( + StorageServer* data, + StringRef prefix, + Version version, + Arena* a, + // To provide span context, tags, debug ID to underlying lookups. + GetMappedKeyValuesRequest* pOriginalReq) { + state GetRangeReqAndResultRef getRange; + getRange.begin = firstGreaterOrEqual(KeyRef(*a, prefix)); + getRange.end = firstGreaterOrEqual(strinc(prefix, *a)); try { // TODO: Use a lower level API may be better? Or tweak priorities? GetKeyValuesRequest req; req.spanContext = pOriginalReq->spanContext; - req.arena = Arena(); - req.begin = firstGreaterOrEqual(KeyRef(req.arena, prefix)); - req.end = firstGreaterOrEqual(strinc(prefix, req.arena)); + req.arena = *a; + req.begin = getRange.begin; + req.end = getRange.end; req.version = version; + // TODO: Validate when the underlying range query exceeds the limit. 
+ // TODO: Use remainingLimit, remainingLimitBytes rather than separate knobs. + req.limit = SERVER_KNOBS->QUICK_GET_KEY_VALUES_LIMIT; + req.limitBytes = SERVER_KNOBS->QUICK_GET_KEY_VALUES_LIMIT_BYTES; + req.isFetchKeys = false; req.tags = pOriginalReq->tags; req.debugID = pOriginalReq->debugID; @@ -2807,7 +2832,9 @@ ACTOR Future quickGetKeyValues(StorageServer* data, if (!reply.error.present()) { ++data->counters.quickGetKeyValuesHit; // Convert GetKeyValuesReply to RangeResult. - return RangeResult(RangeResultRef(reply.data, reply.more), reply.arena); + a->dependsOn(reply.arena); + getRange.result = RangeResultRef(reply.data, reply.more); + return getRange; } // Otherwise fallback. } catch (Error& e) { @@ -2823,7 +2850,9 @@ ACTOR Future quickGetKeyValues(StorageServer* data, Future rangeResultFuture = tr.getRange(prefixRange(prefix), Snapshot::True); // TODO: async in case it needs to read from other servers. RangeResult rangeResult = wait(rangeResultFuture); - return rangeResult; + a->dependsOn(rangeResult.arena()); + getRange.result = rangeResult; + return getRange; } else { throw quick_get_key_values_miss(); } @@ -3039,73 +3068,59 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") { return Void(); } -ACTOR Future flatMap(StorageServer* data, - GetKeyValuesReply input, - StringRef mapper, - // To provide span context, tags, debug ID to underlying lookups. - GetKeyValuesAndFlatMapRequest* pOriginalReq, - Optional tenantPrefix) { - state GetKeyValuesAndFlatMapReply result; +ACTOR Future mapKeyValues(StorageServer* data, + GetKeyValuesReply input, + StringRef mapper, + // To provide span context, tags, debug ID to underlying lookups. 
+ GetMappedKeyValuesRequest* pOriginalReq, + Optional tenantPrefix) { + state GetMappedKeyValuesReply result; result.version = input.version; - if (input.more) { - throw get_key_values_and_map_has_more(); - } result.more = input.more; result.cached = input.cached; result.arena.dependsOn(input.arena); result.data.reserve(result.arena, input.data.size()); - state bool isRangeQuery = false; + state Tuple mappedKeyFormatTuple = Tuple::unpack(mapper); state KeyValueRef* it = input.data.begin(); for (; it != input.data.end(); it++) { + state MappedKeyValueRef kvm; + kvm.key = it->key; + kvm.value = it->value; + + state bool isRangeQuery = false; state Key mappedKey = constructMappedKey(it, mappedKeyFormatTuple, isRangeQuery, tenantPrefix); // Make sure the mappedKey is always available, so that it's good even we want to get key asynchronously. result.arena.dependsOn(mappedKey.arena()); + // std::cout << "key:" << printable(kvm.key) << ", value:" << printable(kvm.value) + // << ", mappedKey:" << printable(mappedKey) << std::endl; + if (isRangeQuery) { // Use the mappedKey as the prefix of the range query. - RangeResult rangeResult = wait(quickGetKeyValues(data, mappedKey, input.version, pOriginalReq)); - - if (rangeResult.more) { - // Probably the fan out is too large. The user should use the old way to query. - throw quick_get_key_values_has_more(); - } - result.arena.dependsOn(rangeResult.arena()); - for (int i = 0; i < rangeResult.size(); i++) { - KeyRef key = rangeResult[i].key; - if (tenantPrefix.present()) { - key = key.removePrefix(tenantPrefix.get()); - } - result.data.emplace_back(result.arena, key, rangeResult[i].value); - } + GetRangeReqAndResultRef getRange = + wait(quickGetKeyValues(data, mappedKey, input.version, &(result.arena), pOriginalReq)); + // TODO: Remove tenant prefixes in the keys if they haven't been removed? 
+ kvm.reqAndResult = getRange; } else { - Optional valueOption = wait(quickGetValue(data, mappedKey, input.version, pOriginalReq)); - - if (valueOption.present()) { - Value value = valueOption.get(); - result.arena.dependsOn(value.arena()); - - KeyRef key = mappedKey; - if (tenantPrefix.present()) { - key = key.removePrefix(tenantPrefix.get()); - } - result.data.emplace_back(result.arena, key, value); - } else { - // TODO: Shall we throw exception if the key doesn't exist or the range is empty? - } + GetValueReqAndResultRef getValue = + wait(quickGetValue(data, mappedKey, input.version, &(result.arena), pOriginalReq)); + // TODO: Remove tenant prefixes in the keys if they haven't been removed? + kvm.reqAndResult = getValue; } + result.data.push_back(result.arena, kvm); } return result; } // Most of the actor is copied from getKeyValuesQ. I tried to use templates but things become nearly impossible after // combining actor shenanigans with template shenanigans. -ACTOR Future getKeyValuesAndFlatMapQ(StorageServer* data, GetKeyValuesAndFlatMapRequest req) +ACTOR Future getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRequest req) // Throws a wrong_shard_server if the keys in the request or result depend on data outside this server OR if a large // selector offset prevents all data from being read in one range read { - state Span span("SS:getKeyValuesAndFlatMap"_loc, { req.spanContext }); + state Span span("SS:getMappedKeyValues"_loc, { req.spanContext }); state int64_t resultSize = 0; state IKeyValueStore::ReadType type = req.isFetchKeys ? 
IKeyValueStore::ReadType::FETCH : IKeyValueStore::ReadType::NORMAL; @@ -3116,7 +3131,7 @@ ACTOR Future getKeyValuesAndFlatMapQ(StorageServer* data, GetKeyValuesAndF getCurrentLineage()->modify(&TransactionLineage::txID) = req.spanContext.first(); - ++data->counters.getRangeAndFlatMapQueries; + ++data->counters.getMappedRangeQueries; ++data->counters.allQueries; ++data->readQueueSizeMetric; data->maxQueryQueue = std::max( @@ -3133,7 +3148,7 @@ ACTOR Future getKeyValuesAndFlatMapQ(StorageServer* data, GetKeyValuesAndF try { if (req.debugID.present()) g_traceBatch.addEvent( - "TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValuesAndFlatMap.Before"); + "TransactionDebug", req.debugID.get().first(), "storageserver.getMappedKeyValues.Before"); state Version version = wait(waitForVersion(data, req.version, span.context)); state Optional tenantEntry = data->getTenantEntry(req.version, req.tenantInfo); @@ -3149,16 +3164,16 @@ ACTOR Future getKeyValuesAndFlatMapQ(StorageServer* data, GetKeyValuesAndF if (req.debugID.present()) g_traceBatch.addEvent( - "TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValuesAndFlatMap.AfterVersion"); + "TransactionDebug", req.debugID.get().first(), "storageserver.getMappedKeyValues.AfterVersion"); //.detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end); //} catch (Error& e) { TraceEvent("WrongShardServer", data->thisServerID).detail("Begin", // req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("Shard", - //"None").detail("In", "getKeyValuesAndFlatMap>getShardKeyRange"); throw e; } + //"None").detail("In", "getMappedKeyValues>getShardKeyRange"); throw e; } if (!selectorInRange(req.end, shard) && !(req.end.isFirstGreaterOrEqual() && req.end.getKey() == shard.end)) { // TraceEvent("WrongShardServer1", data->thisServerID).detail("Begin", // req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("ShardBegin", - // 
shard.begin).detail("ShardEnd", shard.end).detail("In", "getKeyValuesAndFlatMap>checkShardExtents"); + // shard.begin).detail("ShardEnd", shard.end).detail("In", "getMappedKeyValues>checkShardExtents"); throw wrong_shard_server(); } @@ -3196,7 +3211,7 @@ ACTOR Future getKeyValuesAndFlatMapQ(StorageServer* data, GetKeyValuesAndF if (req.debugID.present()) g_traceBatch.addEvent( - "TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValuesAndFlatMap.AfterKeys"); + "TransactionDebug", req.debugID.get().first(), "storageserver.getMappedKeyValues.AfterKeys"); //.detail("Off1",offset1).detail("Off2",offset2).detail("ReqBegin",req.begin.getKey()).detail("ReqEnd",req.end.getKey()); // Offsets of zero indicate begin/end keys in this shard, which obviously means we can answer the query @@ -3204,22 +3219,22 @@ ACTOR Future getKeyValuesAndFlatMapQ(StorageServer* data, GetKeyValuesAndF // end the last actual key returned must be from this shard. A begin offset of 1 is also OK because then either // begin is past end or equal to end (so the result is definitely empty) if ((offset1 && offset1 != 1) || (offset2 && offset2 != 1)) { - TEST(true); // wrong_shard_server due to offset in getKeyValuesAndFlatMapQ + TEST(true); // wrong_shard_server due to offset in getMappedKeyValuesQ // We could detect when offset1 takes us off the beginning of the database or offset2 takes us off the end, // and return a clipped range rather than an error (since that is what the NativeAPI.getRange will do anyway // via its "slow path"), but we would have to add some flags to the response to encode whether we went off // the beginning and the end, since it needs that information. 
- //TraceEvent("WrongShardServer2", data->thisServerID).detail("Begin", req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end).detail("In", "getKeyValuesAndFlatMap>checkOffsets").detail("BeginKey", begin).detail("EndKey", end).detail("BeginOffset", offset1).detail("EndOffset", offset2); + //TraceEvent("WrongShardServer2", data->thisServerID).detail("Begin", req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end).detail("In", "getMappedKeyValues>checkOffsets").detail("BeginKey", begin).detail("EndKey", end).detail("BeginOffset", offset1).detail("EndOffset", offset2); throw wrong_shard_server(); } if (begin >= end) { if (req.debugID.present()) g_traceBatch.addEvent( - "TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValuesAndFlatMap.Send"); + "TransactionDebug", req.debugID.get().first(), "storageserver.getMappedKeyValues.Send"); //.detail("Begin",begin).detail("End",end); - GetKeyValuesAndFlatMapReply none; + GetMappedKeyValuesReply none; none.version = version; none.more = false; none.penalty = data->getPenalty(); @@ -3240,27 +3255,27 @@ ACTOR Future getKeyValuesAndFlatMapQ(StorageServer* data, GetKeyValuesAndF type, tenantPrefix)); - state GetKeyValuesAndFlatMapReply r; + state GetMappedKeyValuesReply r; try { // Map the scanned range to another list of keys and look up. 
- GetKeyValuesAndFlatMapReply _r = wait(flatMap(data, getKeyValuesReply, req.mapper, &req, tenantPrefix)); + GetMappedKeyValuesReply _r = + wait(mapKeyValues(data, getKeyValuesReply, req.mapper, &req, tenantPrefix)); r = _r; } catch (Error& e) { - TraceEvent("FlatMapError").error(e); + TraceEvent("MapError").error(e); throw; } if (req.debugID.present()) - g_traceBatch.addEvent("TransactionDebug", - req.debugID.get().first(), - "storageserver.getKeyValuesAndFlatMap.AfterReadRange"); + g_traceBatch.addEvent( + "TransactionDebug", req.debugID.get().first(), "storageserver.getMappedKeyValues.AfterReadRange"); //.detail("Begin",begin).detail("End",end).detail("SizeOf",r.data.size()); data->checkChangeCounter( changeCounter, KeyRangeRef(std::min(begin, std::min(req.begin.getKey(), req.end.getKey())), std::max(end, std::max(req.begin.getKey(), req.end.getKey())))); if (EXPENSIVE_VALIDATION) { - // TODO: GetKeyValuesWithFlatMapRequest doesn't respect limit yet. + // TODO: GetMappedKeyValuesRequest doesn't respect limit yet. // ASSERT(r.data.size() <= std::abs(req.limit)); } @@ -6813,17 +6828,16 @@ ACTOR Future serveGetKeyValuesRequests(StorageServer* self, FutureStream serveGetKeyValuesAndFlatMapRequests( - StorageServer* self, - FutureStream getKeyValuesAndFlatMap) { +ACTOR Future serveGetMappedKeyValuesRequests(StorageServer* self, + FutureStream getMappedKeyValues) { // TODO: Is it fine to keep TransactionLineage::Operation::GetKeyValues here? 
getCurrentLineage()->modify(&TransactionLineage::operation) = TransactionLineage::Operation::GetKeyValues; loop { - GetKeyValuesAndFlatMapRequest req = waitNext(getKeyValuesAndFlatMap); + GetMappedKeyValuesRequest req = waitNext(getMappedKeyValues); // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade // before doing real work - self->actors.add(self->readGuard(req, getKeyValuesAndFlatMapQ)); + self->actors.add(self->readGuard(req, getMappedKeyValuesQ)); } } @@ -7049,7 +7063,7 @@ ACTOR Future storageServerCore(StorageServer* self, StorageServerInterface self->actors.add(checkBehind(self)); self->actors.add(serveGetValueRequests(self, ssi.getValue.getFuture())); self->actors.add(serveGetKeyValuesRequests(self, ssi.getKeyValues.getFuture())); - self->actors.add(serveGetKeyValuesAndFlatMapRequests(self, ssi.getKeyValuesAndFlatMap.getFuture())); + self->actors.add(serveGetMappedKeyValuesRequests(self, ssi.getMappedKeyValues.getFuture())); self->actors.add(serveGetKeyValuesStreamRequests(self, ssi.getKeyValuesStream.getFuture())); self->actors.add(serveGetKeyRequests(self, ssi.getKey.getFuture())); self->actors.add(serveWatchValueRequests(self, ssi.watchValue.getFuture())); diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 326852950f..faad491488 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -1126,7 +1126,7 @@ ACTOR Future storageServerRollbackRebooter(std::set storageServerRollbackRebooter(std::set(), Reference(nullptr)); @@ -1540,7 +1540,7 @@ ACTOR Future workerServer(Reference connRecord, DUMPTOKEN(recruited.getKeyValueStoreType); DUMPTOKEN(recruited.watchValue); DUMPTOKEN(recruited.getKeyValuesStream); - DUMPTOKEN(recruited.getKeyValuesAndFlatMap); + DUMPTOKEN(recruited.getMappedKeyValues); Promise recovery; Future f = storageServer(kv, recruited, dbInfo, folder, recovery, connRecord); @@ -1636,7 +1636,7 @@ ACTOR Future workerServer(Reference connRecord, 
DUMPTOKEN(recruited.getValue); DUMPTOKEN(recruited.getKey); DUMPTOKEN(recruited.getKeyValues); - DUMPTOKEN(recruited.getKeyValuesAndFlatMap); + DUMPTOKEN(recruited.getMappedKeyValues); DUMPTOKEN(recruited.getShardState); DUMPTOKEN(recruited.waitMetrics); DUMPTOKEN(recruited.splitMetrics); @@ -2039,7 +2039,7 @@ ACTOR Future workerServer(Reference connRecord, DUMPTOKEN(recruited.getKeyValueStoreType); DUMPTOKEN(recruited.watchValue); DUMPTOKEN(recruited.getKeyValuesStream); - DUMPTOKEN(recruited.getKeyValuesAndFlatMap); + DUMPTOKEN(recruited.getMappedKeyValues); // printf("Recruited as storageServer\n"); std::string filename = diff --git a/fdbserver/workloads/ApiWorkload.actor.cpp b/fdbserver/workloads/ApiWorkload.actor.cpp index f04c0d59cf..86c4809b79 100644 --- a/fdbserver/workloads/ApiWorkload.actor.cpp +++ b/fdbserver/workloads/ApiWorkload.actor.cpp @@ -287,6 +287,7 @@ Value ApiWorkload::generateValue() { // Creates a random transaction factory to produce transaction of one of the TransactionType choices ACTOR Future chooseTransactionFactory(Database cx, std::vector choices, ApiWorkload* self) { TransactionType transactionType = deterministicRandom()->randomChoice(choices); + self->transactionType = transactionType; if (transactionType == NATIVE) { printf("client %d: Running NativeAPI Transactions\n", self->clientPrefixInt); diff --git a/fdbserver/workloads/ApiWorkload.h b/fdbserver/workloads/ApiWorkload.h index b9e9a16247..adc5324290 100644 --- a/fdbserver/workloads/ApiWorkload.h +++ b/fdbserver/workloads/ApiWorkload.h @@ -52,6 +52,13 @@ struct TransactionWrapper : public ReferenceCounted { // Gets a range of key-value pairs from the database specified by a pair of key selectors virtual Future getRange(KeySelectorRef& begin, KeySelectorRef& end, int limit, Reverse reverse) = 0; + virtual Future getMappedRange(KeySelector& begin, + KeySelector& end, + Key& mapper, + GetRangeLimits limits, + Snapshot snapshot, + Reverse reverse) = 0; + // Gets the key from the 
database specified by a given key selector virtual Future getKey(KeySelectorRef& key) = 0; @@ -111,6 +118,15 @@ struct FlowTransactionWrapper : public TransactionWrapper { return transaction.getRange(begin, end, limit, Snapshot::False, reverse); } + Future getMappedRange(KeySelector& begin, + KeySelector& end, + Key& mapper, + GetRangeLimits limits, + Snapshot snapshot, + Reverse reverse) override { + return transaction.getMappedRange(begin, end, mapper, limits, snapshot, reverse); + } + // Gets the key from the database specified by a given key selector Future getKey(KeySelectorRef& key) override { return transaction.getKey(key); } @@ -171,6 +187,15 @@ struct ThreadTransactionWrapper : public TransactionWrapper { return unsafeThreadFutureToFuture(transaction->getRange(begin, end, limit, Snapshot::False, reverse)); } + Future getMappedRange(KeySelector& begin, + KeySelector& end, + Key& mapper, + GetRangeLimits limits, + Snapshot snapshot, + Reverse reverse) override { + return unsafeThreadFutureToFuture(transaction->getMappedRange(begin, end, mapper, limits, snapshot, reverse)); + } + // Gets the key from the database specified by a given key selector Future getKey(KeySelectorRef& key) override { return unsafeThreadFutureToFuture(transaction->getKey(key)); } @@ -347,6 +372,9 @@ struct ApiWorkload : TestWorkload { // The transaction factory used to create transactions in this run Reference transactionFactory; + + // Transaction type of the transaction factory above. + TransactionType transactionType; }; #include "flow/unactorcompiler.h" diff --git a/fdbserver/workloads/GetMappedRange.actor.cpp b/fdbserver/workloads/GetMappedRange.actor.cpp new file mode 100644 index 0000000000..197ccbb29c --- /dev/null +++ b/fdbserver/workloads/GetMappedRange.actor.cpp @@ -0,0 +1,406 @@ +/* + * GetMappedRange.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include "fdbrpc/simulator.h" +#include "fdbclient/MutationLogReader.actor.h" +#include "fdbclient/Tuple.h" +#include "fdbserver/workloads/ApiWorkload.h" +#include "fdbserver/workloads/workloads.actor.h" +#include "fdbserver/Knobs.h" +#include "flow/Error.h" +#include "flow/IRandom.h" +#include "flow/flow.h" +#include "flow/actorcompiler.h" // This must be the last #include. + +const Value EMPTY = Tuple().pack(); +ValueRef SOMETHING = "SOMETHING"_sr; +const KeyRef prefix = "prefix"_sr; +const KeyRef RECORD = "RECORD"_sr; +const KeyRef INDEX = "INDEX"_sr; + +struct GetMappedRangeWorkload : ApiWorkload { + bool enabled; + Snapshot snapshot = Snapshot::False; + + // const bool BAD_MAPPER = deterministicRandom()->random01() < 0.1; + const bool BAD_MAPPER = false; + // const bool SPLIT_RECORDS = deterministicRandom()->random01() < 0.5; + const bool SPLIT_RECORDS = true; + const static int SPLIT_SIZE = 3; + + GetMappedRangeWorkload(WorkloadContext const& wcx) : ApiWorkload(wcx) { + enabled = !clientId; // only do this on the "first" client + } + + std::string description() const override { return "GetMappedRange"; } + + Future start(Database const& cx) override { + // This workload is generated different from typical ApiWorkload. So don't use ApiWorkload::_start. 
+ if (enabled) { + return GetMappedRangeWorkload::_start(cx, this); + } + return Void(); + } + + ACTOR Future performSetup(Database cx, GetMappedRangeWorkload* self) { + std::vector types; + types.push_back(NATIVE); + types.push_back(READ_YOUR_WRITES); + + wait(self->chooseTransactionFactory(cx, types)); + return Void(); + } + + Future performSetup(Database const& cx) override { return performSetup(cx, this); } + + Future performTest(Database const& cx, Standalone> const& data) override { + // Ignore this because we are not using ApiWorkload's default ::start. + return Future(); + } + + static Key primaryKey(int i) { return Key(format("primary-key-of-record-%08d", i)); } + static Key indexKey(int i) { return Key(format("index-key-of-record-%08d", i)); } + static Value dataOfRecord(int i) { return Key(format("data-of-record-%08d", i)); } + static Value dataOfRecord(int i, int split) { return Key(format("data-of-record-%08d-split-%08d", i, split)); } + + static Key indexEntryKey(int i) { + return Tuple().append(prefix).append(INDEX).append(indexKey(i)).append(primaryKey(i)).pack(); + } + static Key recordKey(int i) { return Tuple().append(prefix).append(RECORD).append(primaryKey(i)).pack(); } + static Key recordKey(int i, int split) { + return Tuple().append(prefix).append(RECORD).append(primaryKey(i)).append(split).pack(); + } + static Value recordValue(int i) { return Tuple().append(dataOfRecord(i)).pack(); } + static Value recordValue(int i, int split) { return Tuple().append(dataOfRecord(i, split)).pack(); } + + ACTOR Future fillInRecords(Database cx, int n, GetMappedRangeWorkload* self) { + state Transaction tr(cx); + loop { + std::cout << "start fillInRecords n=" << n << std::endl; + // TODO: When n is large, split into multiple transactions. 
+ try { + for (int i = 0; i < n; i++) { + if (self->SPLIT_RECORDS) { + for (int split = 0; split < SPLIT_SIZE; split++) { + tr.set(recordKey(i, split), recordValue(i, split)); + } + } else { + tr.set(recordKey(i), recordValue(i)); + } + tr.set(indexEntryKey(i), EMPTY); + } + wait(tr.commit()); + std::cout << "finished fillInRecords with version " << tr.getCommittedVersion() << std::endl; + break; + } catch (Error& e) { + std::cout << "failed fillInRecords, retry" << std::endl; + wait(tr.onError(e)); + } + } + return Void(); + } + + static void showResult(const RangeResult& result) { + std::cout << "result size: " << result.size() << std::endl; + for (const KeyValueRef* it = result.begin(); it != result.end(); it++) { + std::cout << "key=" << it->key.printable() << ", value=" << it->value.printable() << std::endl; + } + } + + ACTOR Future scanRange(Database cx, KeyRangeRef range) { + std::cout << "start scanRange " << range.toString() << std::endl; + // TODO: When n is large, split into multiple transactions. 
+ state Transaction tr(cx); + loop { + try { + RangeResult result = wait(tr.getRange(range, CLIENT_KNOBS->TOO_MANY)); + // showResult(result); + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + std::cout << "finished scanRange" << std::endl; + return Void(); + } + + static void validateRecord(int expectedId, const MappedKeyValueRef* it, GetMappedRangeWorkload* self) { + // std::cout << "validateRecord expectedId " << expectedId << " it->key " << printable(it->key) << " + // indexEntryKey(expectedId) " << printable(indexEntryKey(expectedId)) << std::endl; + ASSERT(it->key == indexEntryKey(expectedId)); + ASSERT(it->value == EMPTY); + + if (self->SPLIT_RECORDS) { + ASSERT(std::holds_alternative(it->reqAndResult)); + auto& getRange = std::get(it->reqAndResult); + auto& rangeResult = getRange.result; + // std::cout << "rangeResult.size()=" << rangeResult.size() << std::endl; + ASSERT(rangeResult.more == false); + ASSERT(rangeResult.size() == SPLIT_SIZE); + for (int split = 0; split < SPLIT_SIZE; split++) { + auto& kv = rangeResult[split]; + // std::cout << "kv.key=" << printable(kv.key) + // << ", recordKey(id, split)=" << printable(recordKey(id, split)) << + // std::endl; std::cout << "kv.value=" << printable(kv.value) + // << ", recordValue(id, split)=" << printable(recordValue(id, split)) << + // std::endl; + ASSERT(kv.key == recordKey(expectedId, split)); + ASSERT(kv.value == recordValue(expectedId, split)); + } + } else { + ASSERT(std::holds_alternative(it->reqAndResult)); + auto& getValue = std::get(it->reqAndResult); + ASSERT(getValue.key == recordKey(expectedId)); + ASSERT(getValue.result.present()); + ASSERT(getValue.result.get() == recordValue(expectedId)); + } + } + + ACTOR Future scanMappedRangeWithLimits(Database cx, + KeySelector beginSelector, + KeySelector endSelector, + Key mapper, + int limit, + int expectedBeginId, + GetMappedRangeWorkload* self) { + + std::cout << "start scanMappedRangeWithLimits beginSelector:" << 
beginSelector.toString() + << " endSelector:" << endSelector.toString() << " expectedBeginId:" << expectedBeginId + << " limit:" << limit << std::endl; + loop { + state Reference tr = self->createTransaction(); + try { + MappedRangeResult result = wait(tr->getMappedRange( + beginSelector, endSelector, mapper, GetRangeLimits(limit), self->snapshot, Reverse::False)); + // showResult(result); + if (self->BAD_MAPPER) { + TraceEvent("GetMappedRangeWorkloadShouldNotReachable").detail("ResultSize", result.size()); + } + std::cout << "result.size()=" << result.size() << std::endl; + std::cout << "result.more=" << result.more << std::endl; + ASSERT(result.size() <= limit); + int expectedId = expectedBeginId; + for (const MappedKeyValueRef* it = result.begin(); it != result.end(); it++) { + validateRecord(expectedId, it, self); + expectedId++; + } + std::cout << "finished scanMappedRangeWithLimits" << std::endl; + return result; + } catch (Error& e) { + if ((self->BAD_MAPPER && e.code() == error_code_mapper_bad_index) || + (!SERVER_KNOBS->QUICK_GET_VALUE_FALLBACK && e.code() == error_code_quick_get_value_miss) || + (!SERVER_KNOBS->QUICK_GET_KEY_VALUES_FALLBACK && + e.code() == error_code_quick_get_key_values_miss)) { + TraceEvent("GetMappedRangeWorkloadExpectedErrorDetected").error(e); + return MappedRangeResult(); + } else { + std::cout << "error " << e.what() << std::endl; + wait(tr->onError(e)); + } + std::cout << "failed scanMappedRangeWithLimits" << std::endl; + } + } + } + + ACTOR Future scanMappedRange(Database cx, int beginId, int endId, Key mapper, GetMappedRangeWorkload* self) { + Key beginTuple = Tuple().append(prefix).append(INDEX).append(indexKey(beginId)).getDataAsStandalone(); + state KeySelector beginSelector = KeySelector(firstGreaterOrEqual(beginTuple)); + Key endTuple = Tuple().append(prefix).append(INDEX).append(indexKey(endId)).getDataAsStandalone(); + state KeySelector endSelector = KeySelector(firstGreaterOrEqual(endTuple)); + state int limit = 100; + 
state int expectedBeginId = beginId; + while (true) { + MappedRangeResult result = wait( + self->scanMappedRangeWithLimits(cx, beginSelector, endSelector, mapper, limit, expectedBeginId, self)); + expectedBeginId += result.size(); + if (result.more) { + if (result.empty()) { + // This is usually not expected. + std::cout << "not result but have more, try again" << std::endl; + } else { + beginSelector = KeySelector(firstGreaterThan(result.back().key)); + } + } else { + // No more, finished. + break; + } + } + ASSERT(expectedBeginId == endId); + return Void(); + } + + static void conflictWriteOnRecord(int conflictRecordId, + Reference& tr, + GetMappedRangeWorkload* self) { + Key writeKey; + if (deterministicRandom()->random01() < 0.5) { + // Concurrent write to the primary scanned range + writeKey = indexEntryKey(conflictRecordId); + } else { + // Concurrent write to the underlying scanned ranges/keys + if (self->SPLIT_RECORDS) { + // Update one of the splits is sufficient. + writeKey = recordKey(conflictRecordId, 0); + } else { + writeKey = recordKey(conflictRecordId); + } + } + tr->set(writeKey, SOMETHING); + std::cout << "conflict write to " << printable(writeKey) << std::endl; + } + + static Future runGetMappedRange(int beginId, + int endId, + Reference& tr, + GetMappedRangeWorkload* self) { + Key mapper = getMapper(self); + Key beginTuple = Tuple().append(prefix).append(INDEX).append(indexKey(beginId)).getDataAsStandalone(); + KeySelector beginSelector = KeySelector(firstGreaterOrEqual(beginTuple)); + Key endTuple = Tuple().append(prefix).append(INDEX).append(indexKey(endId)).getDataAsStandalone(); + KeySelector endSelector = KeySelector(firstGreaterOrEqual(endTuple)); + return tr->getMappedRange(beginSelector, + endSelector, + mapper, + GetRangeLimits(GetRangeLimits::ROW_LIMIT_UNLIMITED), + self->snapshot, + Reverse::False); + } + + // If another transaction writes to our read set (the scanned ranges) before we commit, the transaction should + // fail. 
+ ACTOR Future testSerializableConflicts(GetMappedRangeWorkload* self) { + std::cout << "testSerializableConflicts" << std::endl; + + loop { + state Reference tr1 = self->createTransaction(); + try { + MappedRangeResult result = wait(runGetMappedRange(5, 10, tr1, self)); + + // Commit another transaction that has conflict writes. + loop { + state Reference tr2 = self->createTransaction(); + try { + conflictWriteOnRecord(7, tr2, self); + wait(tr2->commit()); + break; + } catch (Error& e) { + std::cout << "tr2 error " << e.what() << std::endl; + wait(tr2->onError(e)); + } + } + + // Do some writes so that tr1 is not read-only. + tr1->set(SOMETHING, SOMETHING); + wait(tr1->commit()); + UNREACHABLE(); + } catch (Error& e) { + if (e.code() == error_code_not_committed) { + std::cout << "tr1 failed because of conflicts (as expected)" << std::endl; + TraceEvent("GetMappedRangeWorkloadExpectedErrorDetected").error(e); + return Void(); + } else { + std::cout << "tr1 error " << e.what() << std::endl; + wait(tr1->onError(e)); + } + } + } + } + + // If the same transaction writes to the read set (the scanned ranges) before reading, it should throw read your + // write exception. + ACTOR Future testRYW(GetMappedRangeWorkload* self) { + std::cout << "testRYW" << std::endl; + loop { + state Reference tr1 = self->createTransaction(); + try { + // Write something that will be read in getMappedRange. 
+ conflictWriteOnRecord(7, tr1, self); + MappedRangeResult result = wait(runGetMappedRange(5, 10, tr1, self)); + UNREACHABLE(); + } catch (Error& e) { + if (e.code() == error_code_get_mapped_range_reads_your_writes) { + std::cout << "tr1 failed because of read your writes (as expected)" << std::endl; + TraceEvent("GetMappedRangeWorkloadExpectedErrorDetected").error(e); + return Void(); + } else { + std::cout << "tr1 error " << e.what() << std::endl; + wait(tr1->onError(e)); + } + } + } + } + + ACTOR Future _start(Database cx, GetMappedRangeWorkload* self) { + TraceEvent("GetMappedRangeWorkloadConfig").detail("BadMapper", self->BAD_MAPPER); + + // TODO: Use toml to config + wait(self->fillInRecords(cx, 500, self)); + + if (self->transactionType == NATIVE) { + self->snapshot = Snapshot::True; + } else if (self->transactionType == READ_YOUR_WRITES) { + self->snapshot = Snapshot::False; + const double rand = deterministicRandom()->random01(); + if (rand < 0.1) { + wait(self->testSerializableConflicts(self)); + return Void(); + } else if (rand < 0.2) { + wait(self->testRYW(self)); + return Void(); + } else { + // Test the happy path where there is no conflicts or RYW + } + } else { + UNREACHABLE(); + } + + std::cout << "Test configuration: transactionType:" << self->transactionType << " snapshot:" << self->snapshot + << "bad_mapper:" << self->BAD_MAPPER << std::endl; + + Key mapper = getMapper(self); + // The scanned range cannot be too large to hit get_mapped_key_values_has_more. We have a unit validating the + // error is thrown when the range is large. 
+ wait(self->scanMappedRange(cx, 10, 490, mapper, self)); + return Void(); + } + + static Key getMapper(GetMappedRangeWorkload* self) { + Tuple mapperTuple; + if (self->BAD_MAPPER) { + mapperTuple << prefix << RECORD << "{K[xxx]}"_sr; + } else { + mapperTuple << prefix << RECORD << "{K[3]}"_sr; + if (self->SPLIT_RECORDS) { + mapperTuple << "{...}"_sr; + } + } + Key mapper = mapperTuple.getDataAsStandalone(); + return mapper; + } + + Future check(Database const& cx) override { return true; } + + void getMetrics(std::vector& m) override {} +}; + +WorkloadFactory GetMappedRangeWorkloadFactory("GetMappedRange"); diff --git a/fdbserver/workloads/GetRangeAndMap.actor.cpp b/fdbserver/workloads/GetRangeAndMap.actor.cpp deleted file mode 100644 index 7d0bcb955a..0000000000 --- a/fdbserver/workloads/GetRangeAndMap.actor.cpp +++ /dev/null @@ -1,186 +0,0 @@ -/* - * GetRangeAndMap.actor.cpp - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include "fdbrpc/simulator.h" -#include "fdbclient/MutationLogReader.actor.h" -#include "fdbclient/Tuple.h" -#include "fdbserver/workloads/workloads.actor.h" -#include "fdbserver/Knobs.h" -#include "flow/Error.h" -#include "flow/IRandom.h" -#include "flow/flow.h" -#include "flow/actorcompiler.h" // This must be the last #include. 
- -const Value EMPTY = Tuple().pack(); -const KeyRef prefix = "prefix"_sr; -const KeyRef RECORD = "RECORD"_sr; -const KeyRef INDEX = "INDEX"_sr; - -struct GetRangeAndMapWorkload : TestWorkload { - bool enabled; - const bool BAD_MAPPER = deterministicRandom()->random01() < 0.1; - - GetRangeAndMapWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { - enabled = !clientId; // only do this on the "first" client - } - - std::string description() const override { return "GetRangeAndMap"; } - - Future start(Database const& cx) override { - if (enabled) { - return _start(cx, this); - } - return Void(); - } - - static Key primaryKey(int i) { return Key(format("primary-key-of-record-%08d", i)); } - static Key indexKey(int i) { return Key(format("index-key-of-record-%08d", i)); } - static Value dataOfRecord(int i) { return Key(format("data-of-record-%08d", i)); } - - static Key indexEntryKey(int i) { - return Tuple().append(prefix).append(INDEX).append(indexKey(i)).append(primaryKey(i)).pack(); - } - static Key recordKey(int i) { return Tuple().append(prefix).append(RECORD).append(primaryKey(i)).pack(); } - static Value recordValue(int i) { return Tuple().append(dataOfRecord(i)).pack(); } - - ACTOR Future fillInRecords(Database cx, int n) { - loop { - std::cout << "start fillInRecords n=" << n << std::endl; - // TODO: When n is large, split into multiple transactions. 
- state Transaction tr(cx); - try { - tr.reset(); - for (int i = 0; i < n; i++) { - tr.set(recordKey(i), recordValue(i)); - tr.set(indexEntryKey(i), EMPTY); - } - wait(tr.commit()); - std::cout << "finished fillInRecords with version " << tr.getCommittedVersion() << std::endl; - break; - } catch (Error& e) { - std::cout << "failed fillInRecords, retry" << std::endl; - wait(tr.onError(e)); - } - } - return Void(); - } - - static void showResult(const RangeResult& result) { - std::cout << "result size: " << result.size() << std::endl; - for (const KeyValueRef* it = result.begin(); it != result.end(); it++) { - std::cout << "key=" << it->key.printable() << ", value=" << it->value.printable() << std::endl; - } - } - - ACTOR Future scanRange(Database cx, KeyRangeRef range) { - std::cout << "start scanRange " << range.toString() << std::endl; - // TODO: When n is large, split into multiple transactions. - state Transaction tr(cx); - try { - tr.reset(); - RangeResult result = wait(tr.getRange(range, CLIENT_KNOBS->TOO_MANY)); - // showResult(result); - } catch (Error& e) { - wait(tr.onError(e)); - } - std::cout << "finished scanRange" << std::endl; - return Void(); - } - - ACTOR Future scanRangeAndFlatMap(Database cx, - int beginId, - int endId, - Key mapper, - GetRangeAndMapWorkload* self) { - Key someIndexesBegin = Tuple().append(prefix).append(INDEX).append(indexKey(beginId)).getDataAsStandalone(); - Key someIndexesEnd = Tuple().append(prefix).append(INDEX).append(indexKey(endId)).getDataAsStandalone(); - state KeyRange range = KeyRangeRef(someIndexesBegin, someIndexesEnd); - - std::cout << "start scanRangeAndFlatMap " << range.toString() << std::endl; - // TODO: When n is large, split into multiple transactions. 
- state Transaction tr(cx); - try { - tr.reset(); - RangeResult result = - wait(tr.getRangeAndFlatMap(KeySelector(firstGreaterOrEqual(range.begin), range.arena()), - KeySelector(firstGreaterOrEqual(range.end), range.arena()), - mapper, - GetRangeLimits(CLIENT_KNOBS->TOO_MANY), - Snapshot::True)); - // showResult(result); - if (self->BAD_MAPPER) { - TraceEvent("GetRangeAndMapWorkloadShouldNotReachable").detail("ResultSize", result.size()); - } - // Examples: - // key=\x01prefix\x00\x01RECORD\x00\x01primary-key-of-record-2\x00, value=\x01data-of-record-2\x00 - // key=\x01prefix\x00\x01RECORD\x00\x01primary-key-of-record-3\x00, value=\x01data-of-record-3\x00 - std::cout << "result.size()=" << result.size() << std::endl; - std::cout << "result.more=" << result.more << std::endl; - ASSERT(result.size() == endId - beginId); - int id = beginId; - for (const KeyValueRef* it = result.begin(); it != result.end(); it++) { - ASSERT(it->key == recordKey(id)); - ASSERT(it->value == recordValue(id)); - id++; - } - } catch (Error& e) { - if ((self->BAD_MAPPER && e.code() == error_code_mapper_bad_index) || - (!SERVER_KNOBS->QUICK_GET_VALUE_FALLBACK && e.code() == error_code_quick_get_value_miss) || - (!SERVER_KNOBS->QUICK_GET_KEY_VALUES_FALLBACK && e.code() == error_code_quick_get_key_values_miss)) { - TraceEvent("GetRangeAndMapWorkloadExpectedErrorDetected").error(e); - } else { - wait(tr.onError(e)); - } - } - std::cout << "finished scanRangeAndFlatMap" << std::endl; - return Void(); - } - - ACTOR Future _start(Database cx, GetRangeAndMapWorkload* self) { - TraceEvent("GetRangeAndMapWorkloadConfig").detail("BadMapper", self->BAD_MAPPER); - - // TODO: Use toml to config - wait(self->fillInRecords(cx, 200)); - - wait(self->scanRange(cx, normalKeys)); - - // wait(self->scanRange(cx, someIndexes)); - - Tuple mapperTuple; - if (self->BAD_MAPPER) { - mapperTuple << prefix << RECORD << "{K[xxx]}"_sr; - } else { - mapperTuple << prefix << RECORD << "{K[3]}"_sr; - } - Key mapper = 
mapperTuple.getDataAsStandalone(); - // The scanned range cannot be too large to hit get_key_values_and_map_has_more. We have a unit validating the - // error is thrown when the range is large. - wait(self->scanRangeAndFlatMap(cx, 10, 190, mapper, self)); - return Void(); - } - - Future check(Database const& cx) override { return true; } - - void getMetrics(std::vector& m) override {} -}; - -WorkloadFactory GetRangeAndMapWorkloadFactory("GetRangeAndMap"); diff --git a/flow/Arena.h b/flow/Arena.h index c61f4bff4c..ef7708b274 100644 --- a/flow/Arena.h +++ b/flow/Arena.h @@ -349,6 +349,8 @@ struct union_like_traits> : std::true_type { template class Standalone : private Arena, public T { public: + using RefType = T; + // T must have no destructor Arena& arena() { return *(Arena*)this; } const Arena& arena() const { return *(const Arena*)this; } diff --git a/flow/error_definitions.h b/flow/error_definitions.h index 15a3df8fde..5864a31563 100755 --- a/flow/error_definitions.h +++ b/flow/error_definitions.h @@ -172,7 +172,8 @@ ERROR( quick_get_value_miss, 2034, "Found a mapped key that is not served in the ERROR( quick_get_key_values_miss, 2035, "Found a mapped range that is not served in the same SS" ) ERROR( blob_granule_no_ryw, 2036, "Blob Granule Read Transactions must be specified as ryw-disabled" ) ERROR( blob_granule_not_materialized, 2037, "Blob Granule Read Transactions must be specified as ryw-disabled" ) -ERROR( get_key_values_and_map_has_more, 2038, "getRangeAndFlatMap does not support continuation for now" ) +ERROR( get_mapped_key_values_has_more, 2038, "getMappedRange does not support continuation for now" ) +ERROR( get_mapped_range_reads_your_writes, 2039, "getMappedRange tries to read data that were previously written in the transaction" ) ERROR( incompatible_protocol_version, 2100, "Incompatible protocol version" ) ERROR( transaction_too_large, 2101, "Transaction exceeds byte limit" ) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 
48315c5f94..1c6d535706 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -155,7 +155,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/MemoryLifetime.toml) add_fdb_test(TEST_FILES fast/MoveKeysCycle.toml) add_fdb_test(TEST_FILES fast/MutationLogReaderCorrectness.toml) - add_fdb_test(TEST_FILES fast/GetRangeAndMap.toml) + add_fdb_test(TEST_FILES fast/GetMappedRange.toml) add_fdb_test(TEST_FILES fast/ProtocolVersion.toml) add_fdb_test(TEST_FILES fast/RandomSelector.toml) add_fdb_test(TEST_FILES fast/RandomUnitTests.toml) diff --git a/tests/fast/GetMappedRange.toml b/tests/fast/GetMappedRange.toml new file mode 100644 index 0000000000..2dd4672592 --- /dev/null +++ b/tests/fast/GetMappedRange.toml @@ -0,0 +1,6 @@ +[[test]] +testTitle = 'GetMappedRange' +useDB = true + + [[test.workload]] + testName = 'GetMappedRange' diff --git a/tests/fast/GetRangeAndMap.toml b/tests/fast/GetRangeAndMap.toml deleted file mode 100644 index 3864546ef0..0000000000 --- a/tests/fast/GetRangeAndMap.toml +++ /dev/null @@ -1,6 +0,0 @@ -[[test]] -testTitle = 'GetRangeAndMap' -useDB = true - - [[test.workload]] - testName = 'GetRangeAndMap' From 9e52456eeb3b9ff9d30959522d65dc8247d2929b Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Thu, 10 Mar 2022 19:06:56 +0100 Subject: [PATCH 047/138] ApiTester: Test cancelling transaction with pending futures; Refactor transaction executor to improve concurrency handing and memory management --- bindings/c/CMakeLists.txt | 3 + .../c/test/apitester/TesterApiWorkload.cpp | 129 ++++++ bindings/c/test/apitester/TesterApiWorkload.h | 89 ++++ .../c/test/apitester/TesterApiWrapper.cpp | 4 + bindings/c/test/apitester/TesterApiWrapper.h | 5 +- .../TesterCancelTransactionWorkload.cpp | 113 +++++ .../apitester/TesterCorrectnessWorkload.cpp | 126 +----- .../apitester/TesterTransactionExecutor.cpp | 414 ++++++++++++------ .../apitester/TesterTransactionExecutor.h | 29 +- bindings/c/test/apitester/TesterWorkload.cpp | 6 +- 
.../tests/CApiCancelTransactionBlocking.toml | 24 + .../tests/CApiCancelTransactionCB.toml | 23 + .../tests/CApiCancelTransactionDBPerTX.toml | 24 + 13 files changed, 729 insertions(+), 260 deletions(-) create mode 100644 bindings/c/test/apitester/TesterApiWorkload.cpp create mode 100644 bindings/c/test/apitester/TesterApiWorkload.h create mode 100644 bindings/c/test/apitester/TesterCancelTransactionWorkload.cpp create mode 100644 bindings/c/test/apitester/tests/CApiCancelTransactionBlocking.toml create mode 100644 bindings/c/test/apitester/tests/CApiCancelTransactionCB.toml create mode 100644 bindings/c/test/apitester/tests/CApiCancelTransactionDBPerTX.toml diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index f440de842f..587b18048b 100644 --- a/bindings/c/CMakeLists.txt +++ b/bindings/c/CMakeLists.txt @@ -101,10 +101,13 @@ if(NOT WIN32 AND NOT IS_ARM_MAC) set(API_TESTER_SRCS test/apitester/fdb_c_api_tester.cpp + test/apitester/TesterApiWorkload.cpp + test/apitester/TesterApiWorkload.h test/apitester/TesterApiWrapper.cpp test/apitester/TesterApiWrapper.h test/apitester/TesterTestSpec.cpp test/apitester/TesterTestSpec.h + test/apitester/TesterCancelTransactionWorkload.cpp test/apitester/TesterCorrectnessWorkload.cpp test/apitester/TesterKeyValueStore.cpp test/apitester/TesterKeyValueStore.h diff --git a/bindings/c/test/apitester/TesterApiWorkload.cpp b/bindings/c/test/apitester/TesterApiWorkload.cpp new file mode 100644 index 0000000000..899329f2f6 --- /dev/null +++ b/bindings/c/test/apitester/TesterApiWorkload.cpp @@ -0,0 +1,129 @@ +/* + * TesterApiWorkload.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TesterApiWorkload.h" +#include "TesterUtil.h" +#include + +namespace FdbApiTester { + +ApiWorkload::ApiWorkload(const WorkloadConfig& config) : WorkloadBase(config) { + minKeyLength = config.getIntOption("minKeyLength", 1); + maxKeyLength = config.getIntOption("maxKeyLength", 64); + minValueLength = config.getIntOption("minValueLength", 1); + maxValueLength = config.getIntOption("maxValueLength", 1000); + maxKeysPerTransaction = config.getIntOption("maxKeysPerTransaction", 50); + initialSize = config.getIntOption("initialSize", 1000); + readExistingKeysRatio = config.getFloatOption("readExistingKeysRatio", 0.9); + keyPrefix = fmt::format("{}/", workloadId); +} + +void ApiWorkload::start() { + schedule([this]() { + // 1. Clear data + clearData([this]() { + // 2. Populate initial data + populateData([this]() { + // 3. 
Generate random workload + runTests(); + }); + }); + }); +} + +std::string ApiWorkload::randomKeyName() { + return keyPrefix + Random::get().randomStringLowerCase(minKeyLength, maxKeyLength); +} + +std::string ApiWorkload::randomValue() { + return Random::get().randomStringLowerCase(minValueLength, maxValueLength); +} + +std::string ApiWorkload::randomNotExistingKey() { + while (true) { + std::string key = randomKeyName(); + if (!store.exists(key)) { + return key; + } + } +} + +std::string ApiWorkload::randomExistingKey() { + std::string genKey = randomKeyName(); + std::string key = store.getKey(genKey, true, 1); + if (key != store.endKey()) { + return key; + } + key = store.getKey(genKey, true, 0); + if (key != store.startKey()) { + return key; + } + info("No existing key found, using a new random key."); + return genKey; +} + +std::string ApiWorkload::randomKey(double existingKeyRatio) { + if (Random::get().randomBool(existingKeyRatio)) { + return randomExistingKey(); + } else { + return randomNotExistingKey(); + } +} + +void ApiWorkload::populateDataTx(TTaskFct cont) { + int numKeys = maxKeysPerTransaction; + auto kvPairs = std::make_shared>(); + for (int i = 0; i < numKeys; i++) { + kvPairs->push_back(KeyValue{ randomNotExistingKey(), randomValue() }); + } + execTransaction( + [kvPairs](auto ctx) { + for (const KeyValue& kv : *kvPairs) { + ctx->tx()->set(kv.key, kv.value); + } + ctx->commit(); + }, + [this, kvPairs, cont]() { + for (const KeyValue& kv : *kvPairs) { + store.set(kv.key, kv.value); + } + schedule(cont); + }); +} + +void ApiWorkload::clearData(TTaskFct cont) { + execTransaction( + [this](auto ctx) { + ctx->tx()->clearRange(keyPrefix, fmt::format("{}\xff", keyPrefix)); + ctx->commit(); + }, + [this, cont]() { schedule(cont); }); +} + +void ApiWorkload::populateData(TTaskFct cont) { + if (store.size() < initialSize) { + populateDataTx([this, cont]() { populateData(cont); }); + } else { + info("Data population completed"); + schedule(cont); + } +} + 
+} // namespace FdbApiTester diff --git a/bindings/c/test/apitester/TesterApiWorkload.h b/bindings/c/test/apitester/TesterApiWorkload.h new file mode 100644 index 0000000000..fb9df2dcda --- /dev/null +++ b/bindings/c/test/apitester/TesterApiWorkload.h @@ -0,0 +1,89 @@ +/* + * TesterApiWorkload.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef APITESTER_API_WORKLOAD_H +#define APITESTER_API_WORKLOAD_H + +#include "TesterWorkload.h" +#include "TesterKeyValueStore.h" + +namespace FdbApiTester { + +/** + * Base class for implementing API testing workloads. 
+ * Provides various helper methods and reusable configuration parameters + */ +class ApiWorkload : public WorkloadBase { +public: + void start() override; + + // Method to be overridden to run specific tests + virtual void runTests() = 0; + +protected: + // The minimum length of a key + int minKeyLength; + + // The maximum length of a key + int maxKeyLength; + + // The minimum length of a value + int minValueLength; + + // The maximum length of a value + int maxValueLength; + + // Maximum number of keys to be accessed by a transaction + int maxKeysPerTransaction; + + // Initial data size (number of key-value pairs) + int initialSize; + + // The ratio of reading existing keys + double readExistingKeysRatio; + + // Key prefix + std::string keyPrefix; + + // In-memory store maintaining expected database state + KeyValueStore store; + + ApiWorkload(const WorkloadConfig& config); + + // Methods for generating random keys and values + std::string randomKeyName(); + std::string randomValue(); + std::string randomNotExistingKey(); + std::string randomExistingKey(); + std::string randomKey(double existingKeyRatio); + + // Generate initial random data for the workload + void populateData(TTaskFct cont); + + // Clear the data of the workload + void clearData(TTaskFct cont); + +private: + void populateDataTx(TTaskFct cont); +}; + +} // namespace FdbApiTester + +#endif \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterApiWrapper.cpp b/bindings/c/test/apitester/TesterApiWrapper.cpp index a8cf50b0d4..869b02d89c 100644 --- a/bindings/c/test/apitester/TesterApiWrapper.cpp +++ b/bindings/c/test/apitester/TesterApiWrapper.cpp @@ -40,6 +40,10 @@ void Future::reset() { future_.reset(); } +void Future::cancel() { + fdb_future_cancel(future_.get()); +} + fdb_error_t Future::getError() const { return fdb_future_get_error(future_.get()); } diff --git a/bindings/c/test/apitester/TesterApiWrapper.h b/bindings/c/test/apitester/TesterApiWrapper.h index 
d4a2793ffc..61275f1109 100644 --- a/bindings/c/test/apitester/TesterApiWrapper.h +++ b/bindings/c/test/apitester/TesterApiWrapper.h @@ -41,7 +41,7 @@ namespace FdbApiTester { // FDBFuture when this instance goes out of scope. class Future { public: - Future() : future_(nullptr) {} + Future() = default; Future(FDBFuture* f); FDBFuture* fdbFuture() { return future_.get(); }; @@ -49,6 +49,7 @@ public: fdb_error_t getError() const; explicit operator bool() const { return future_ != nullptr; }; void reset(); + void cancel(); protected: std::shared_ptr future_; @@ -63,7 +64,7 @@ public: class Transaction { public: - Transaction(); + Transaction() = default; Transaction(FDBTransaction* tx); ValueFuture get(std::string_view key, fdb_bool_t snapshot); void set(std::string_view key, std::string_view value); diff --git a/bindings/c/test/apitester/TesterCancelTransactionWorkload.cpp b/bindings/c/test/apitester/TesterCancelTransactionWorkload.cpp new file mode 100644 index 0000000000..6812841f80 --- /dev/null +++ b/bindings/c/test/apitester/TesterCancelTransactionWorkload.cpp @@ -0,0 +1,113 @@ +/* + * TesterCancelTransactionWorkload.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "TesterApiWorkload.h" +#include "TesterUtil.h" + +namespace FdbApiTester { + +class CancelTransactionWorkload : public ApiWorkload { +public: + CancelTransactionWorkload(const WorkloadConfig& config) : ApiWorkload(config) { + numRandomOperations = config.getIntOption("numRandomOperations", 1000); + numOpLeft = numRandomOperations; + } + + void runTests() override { randomOperations(); } + +private: + enum OpType { OP_CANCEL_GET, OP_CANCEL_AFTER_FIRST_GET, OP_LAST = OP_CANCEL_AFTER_FIRST_GET }; + + // The number of operations to be executed + int numRandomOperations; + + // Operations counter + int numOpLeft; + + // Start multiple concurrent gets and cancel the transaction + void randomCancelGetTx(TTaskFct cont) { + int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); + auto keys = std::make_shared>(); + for (int i = 0; i < numKeys; i++) { + keys->push_back(randomKey(readExistingKeysRatio)); + } + execTransaction( + [keys](auto ctx) { + std::vector futures; + for (const auto& key : *keys) { + futures.push_back(ctx->tx()->get(key, false)); + } + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + // Start multiple concurrent gets and cancel the transaction after the first get returns + void randomCancelAfterFirstResTx(TTaskFct cont) { + int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); + auto keys = std::make_shared>(); + for (int i = 0; i < numKeys; i++) { + keys->push_back(randomKey(readExistingKeysRatio)); + } + execTransaction( + [this, keys](auto ctx) { + std::vector futures; + for (const auto& key : *keys) { + futures.push_back(ctx->tx()->get(key, false)); + } + for (int i = 0; i < keys->size(); i++) { + ValueFuture f = futures[i]; + auto expectedVal = store.get((*keys)[i]); + ctx->continueAfter(f, [expectedVal, f, this, ctx]() { + auto val = f.getValue(); + if (expectedVal != val) { + error(fmt::format( + "cancelAfterFirstResTx mismatch. 
expected: {:.80} actual: {:.80}", expectedVal, val)); + } + ctx->done(); + }); + } + }, + [this, cont]() { schedule(cont); }); + } + + void randomOperation(TTaskFct cont) { + OpType txType = (OpType)Random::get().randomInt(0, OP_LAST); + switch (txType) { + case OP_CANCEL_GET: + randomCancelGetTx(cont); + break; + case OP_CANCEL_AFTER_FIRST_GET: + randomCancelAfterFirstResTx(cont); + break; + } + } + + void randomOperations() { + if (numOpLeft == 0) + return; + + numOpLeft--; + randomOperation([this]() { randomOperations(); }); + } +}; + +WorkloadFactory MiscTestWorkloadFactory("CancelTransaction"); + +} // namespace FdbApiTester \ No newline at end of file diff --git a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp index b597645665..6da940c155 100644 --- a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp +++ b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp @@ -17,109 +17,30 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "TesterWorkload.h" +#include "TesterApiWorkload.h" #include "TesterUtil.h" -#include "TesterKeyValueStore.h" -#include "test/apitester/TesterScheduler.h" #include -#include -#include #include namespace FdbApiTester { -class ApiCorrectnessWorkload : public WorkloadBase { +class ApiCorrectnessWorkload : public ApiWorkload { public: + ApiCorrectnessWorkload(const WorkloadConfig& config) : ApiWorkload(config) { + numRandomOperations = config.getIntOption("numRandomOperations", 1000); + numOpLeft = numRandomOperations; + } + + void runTests() override { randomOperations(); } + +private: enum OpType { OP_INSERT, OP_GET, OP_CLEAR, OP_CLEAR_RANGE, OP_COMMIT_READ, OP_LAST = OP_COMMIT_READ }; - // The minimum length of a key - int minKeyLength; - - // The maximum length of a key - int maxKeyLength; - - // The minimum length of a value - int minValueLength; - - // The maximum length of a value - int maxValueLength; - - // Maximum number of keys to be accessed by a transaction - int maxKeysPerTransaction; - - // Initial data size (number of key-value pairs) - int initialSize; - // The number of operations to be executed int numRandomOperations; - // The ratio of reading existing keys - double readExistingKeysRatio; - - // Key prefix - std::string keyPrefix; - - ApiCorrectnessWorkload(const WorkloadConfig& config) : WorkloadBase(config) { - minKeyLength = config.getIntOption("minKeyLength", 1); - maxKeyLength = config.getIntOption("maxKeyLength", 64); - minValueLength = config.getIntOption("minValueLength", 1); - maxValueLength = config.getIntOption("maxValueLength", 1000); - maxKeysPerTransaction = config.getIntOption("maxKeysPerTransaction", 50); - initialSize = config.getIntOption("initialSize", 1000); - numRandomOperations = config.getIntOption("numRandomOperations", 1000); - readExistingKeysRatio = config.getFloatOption("readExistingKeysRatio", 0.9); - keyPrefix = fmt::format("{}/", workloadId); - numOpLeft = numRandomOperations; - } - - void start() override 
{ - schedule([this]() { - // 1. Clear data - clearData([this]() { - // 2. Populate initial data - populateData([this]() { - // 3. Generate random workload - randomOperations(); - }); - }); - }); - } - -private: - std::string randomKeyName() { return keyPrefix + Random::get().randomStringLowerCase(minKeyLength, maxKeyLength); } - - std::string randomValue() { return Random::get().randomStringLowerCase(minValueLength, maxValueLength); } - - std::string randomNotExistingKey() { - while (true) { - std::string key = randomKeyName(); - if (!store.exists(key)) { - return key; - } - } - } - - std::string randomExistingKey() { - std::string genKey = randomKeyName(); - std::string key = store.getKey(genKey, true, 1); - if (key != store.endKey()) { - return key; - } - key = store.getKey(genKey, true, 0); - if (key != store.startKey()) { - return key; - } - info("No existing key found, using a new random key."); - return genKey; - } - - std::string randomKey(double existingKeyRatio) { - if (Random::get().randomBool(existingKeyRatio)) { - return randomExistingKey(); - } else { - return randomNotExistingKey(); - } - } + // Operations counter + int numOpLeft; void randomInsertOp(TTaskFct cont) { int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); @@ -169,6 +90,7 @@ private: futures->push_back(ctx->tx()->get(kv.key, false)); } ctx->continueAfterAll(futures, [ctx, futures, results]() { + results->clear(); for (auto& f : *futures) { results->push_back(((ValueFuture&)f).getValue()); } @@ -206,6 +128,7 @@ private: futures->push_back(ctx->tx()->get(key, false)); } ctx->continueAfterAll(futures, [ctx, futures, results]() { + results->clear(); for (auto& f : *futures) { results->push_back(((ValueFuture&)f).getValue()); } @@ -286,24 +209,6 @@ private: } } - void clearData(TTaskFct cont) { - execTransaction( - [this](auto ctx) { - ctx->tx()->clearRange(keyPrefix, fmt::format("{}\xff", keyPrefix)); - ctx->commit(); - }, - [this, cont]() { schedule(cont); }); - } - - void 
populateData(TTaskFct cont) { - if (store.size() < initialSize) { - randomInsertOp([this, cont]() { populateData(cont); }); - } else { - info("Data population completed"); - schedule(cont); - } - } - void randomOperations() { if (numOpLeft == 0) return; @@ -311,9 +216,6 @@ private: numOpLeft--; randomOperation([this]() { randomOperations(); }); } - - int numOpLeft; - KeyValueStore store; }; WorkloadFactory ApiCorrectnessWorkloadFactory("ApiCorrectness"); diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.cpp b/bindings/c/test/apitester/TesterTransactionExecutor.cpp index 16e2659a58..0fdf913a2b 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.cpp +++ b/bindings/c/test/apitester/TesterTransactionExecutor.cpp @@ -25,9 +25,16 @@ #include #include #include +#include +#include namespace FdbApiTester { +void TransactionActorBase::complete(fdb_error_t err) { + error = err; + context = {}; +} + void ITransactionContext::continueAfterAll(std::shared_ptr> futures, TTaskFct cont) { auto counter = std::make_shared>(futures->size()); for (auto& f : *futures) { @@ -39,154 +46,295 @@ void ITransactionContext::continueAfterAll(std::shared_ptr> } } -class TransactionContext : public ITransactionContext { +/** + * Transaction context base class, containing reusable functionality + */ +class TransactionContextBase : public ITransactionContext, public std::enable_shared_from_this { public: - TransactionContext(FDBTransaction* tx, - std::shared_ptr txActor, - TTaskFct cont, - const TransactionExecutorOptions& options, - IScheduler* scheduler) - : options(options), fdbTx(tx), txActor(txActor), contAfterDone(cont), scheduler(scheduler) {} + TransactionContextBase(FDBTransaction* tx, + std::shared_ptr txActor, + TTaskFct cont, + IScheduler* scheduler) + : fdbTx(tx), txActor(txActor), contAfterDone(cont), scheduler(scheduler), txState(TxState::IN_PROGRESS) {} + + // A state machine: + // IN_PROGRESS -> (ON_ERROR -> IN_PROGRESS)* [-> ON_ERROR] -> DONE + enum 
class TxState { IN_PROGRESS, ON_ERROR, DONE }; Transaction* tx() override { return &fdbTx; } + + // Set a continuation to be executed when a future gets ready void continueAfter(Future f, TTaskFct cont) override { doContinueAfter(f, cont); } + + // Complete the transaction with a commit void commit() override { - Future f = fdbTx.commit(); - doContinueAfter(f, [this]() { done(); }); - } - void done() override { - TTaskFct cont = contAfterDone; - ASSERT(!onErrorFuture); - ASSERT(waitMap.empty()); - delete this; - cont(); - } - -private: - void doContinueAfter(Future f, TTaskFct cont) { - if (options.blockOnFutures) { - blockingContinueAfter(f, cont); - } else { - asyncContinueAfter(f, cont); - } - } - - void blockingContinueAfter(Future f, TTaskFct cont) { - scheduler->schedule([this, f, cont]() mutable { - std::unique_lock lock(mutex); - if (!onErrorFuture) { - fdb_error_t err = fdb_future_block_until_ready(f.fdbFuture()); - if (err) { - lock.unlock(); - transactionFailed(err); - return; - } - err = f.getError(); - if (err) { - if (err != error_code_transaction_cancelled) { - onErrorFuture = fdbTx.onError(err); - fdb_error_t err2 = fdb_future_block_until_ready(onErrorFuture.fdbFuture()); - if (err2) { - lock.unlock(); - transactionFailed(err2); - return; - } - scheduler->schedule([this]() { handleOnErrorResult(); }); - } - } else { - scheduler->schedule([cont]() { cont(); }); - } - } - }); - } - - void asyncContinueAfter(Future f, TTaskFct cont) { std::unique_lock lock(mutex); - if (!onErrorFuture) { - waitMap[f.fdbFuture()] = WaitInfo{ f, cont }; - lock.unlock(); - fdb_error_t err = fdb_future_set_callback(f.fdbFuture(), futureReadyCallback, this); - if (err) { - transactionFailed(err); - } - } - } - - static void futureReadyCallback(FDBFuture* f, void* param) { - TransactionContext* txCtx = (TransactionContext*)param; - txCtx->onFutureReady(f); - } - - void onFutureReady(FDBFuture* f) { - std::unique_lock lock(mutex); - auto iter = waitMap.find(f); - if (iter == 
waitMap.end()) { + if (txState != TxState::IN_PROGRESS) { return; } - fdb_error_t err = fdb_future_get_error(f); - TTaskFct cont = iter->second.cont; - waitMap.erase(iter); - if (err) { - if (err != error_code_transaction_cancelled) { - waitMap.clear(); - onErrorFuture = tx()->onError(err); - lock.unlock(); - fdb_error_t err = fdb_future_set_callback(onErrorFuture.fdbFuture(), onErrorReadyCallback, this); - if (err) { - transactionFailed(err); - } - } - } else { - scheduler->schedule(cont); - } + lock.unlock(); + Future f = fdbTx.commit(); + auto thisRef = shared_from_this(); + doContinueAfter(f, [thisRef]() { thisRef->done(); }); } - static void onErrorReadyCallback(FDBFuture* f, void* param) { - TransactionContext* txCtx = (TransactionContext*)param; - txCtx->onErrorReady(f); - } - - void onErrorReady(FDBFuture* f) { - scheduler->schedule([this]() { handleOnErrorResult(); }); - } - - void handleOnErrorResult() { + // Complete the transaction without a commit (for read transactions) + void done() override { std::unique_lock lock(mutex); + if (txState != TxState::IN_PROGRESS) { + return; + } + txState = TxState::DONE; + lock.unlock(); + txActor->complete(error_code_success); + cleanUp(); + contAfterDone(); + } + +protected: + virtual void doContinueAfter(Future f, TTaskFct cont) = 0; + + // Clean up transaction state after completing the transaction + // Note that the object may live longer, because it is referenced + // by not yet triggered callbacks + virtual void cleanUp() { + ASSERT(txState == TxState::DONE); + ASSERT(!onErrorFuture); + txActor = {}; + fdbTx = {}; + } + + // Complete the transaction with an (unretriable) error + void transactionFailed(fdb_error_t err) { + ASSERT(err != error_code_success); + std::unique_lock lock(mutex); + if (txState == TxState::DONE) { + return; + } + txState = TxState::DONE; + lock.unlock(); + txActor->complete(err); + cleanUp(); + contAfterDone(); + } + + // Handle result of an a transaction onError call + void 
handleOnErrorResult() { + ASSERT(txState == TxState::ON_ERROR); fdb_error_t err = onErrorFuture.getError(); - onErrorFuture.reset(); + onErrorFuture = {}; if (err) { transactionFailed(err); } else { + std::unique_lock lock(mutex); + txState = TxState::IN_PROGRESS; lock.unlock(); - txActor->reset(); txActor->start(); } } - struct WaitInfo { - Future future; - TTaskFct cont; - }; - - void transactionFailed(fdb_error_t err) { - std::unique_lock lock(mutex); - onErrorFuture.reset(); - waitMap.clear(); - lock.unlock(); - txActor->setError(err); - done(); - } - - const TransactionExecutorOptions& options; + // FDB transaction Transaction fdbTx; + + // Actor implementing the transaction worklflow std::shared_ptr txActor; + + // Mutex protecting access to shared mutable state std::mutex mutex; - std::unordered_map waitMap; - Future onErrorFuture; + + // Continuation to be called after completion of the transaction TTaskFct contAfterDone; + + // Reference to the scheduler IScheduler* scheduler; + + // Transaction execution state + TxState txState; + + // onError future used in ON_ERROR state + Future onErrorFuture; }; +/** + * Transaction context using blocking waits to implement continuations on futures + */ +class BlockingTransactionContext : public TransactionContextBase { +public: + BlockingTransactionContext(FDBTransaction* tx, + std::shared_ptr txActor, + TTaskFct cont, + IScheduler* scheduler) + : TransactionContextBase(tx, txActor, cont, scheduler) {} + +protected: + void doContinueAfter(Future f, TTaskFct cont) override { + auto thisRef = std::static_pointer_cast(shared_from_this()); + scheduler->schedule([thisRef, f, cont]() mutable { thisRef->blockingContinueAfter(f, cont); }); + } + + void blockingContinueAfter(Future f, TTaskFct cont) { + std::unique_lock lock(mutex); + if (txState != TxState::IN_PROGRESS) { + return; + } + lock.unlock(); + fdb_error_t err = fdb_future_block_until_ready(f.fdbFuture()); + if (err) { + transactionFailed(err); + return; + } + err 
= f.getError(); + if (err == error_code_transaction_cancelled) { + return; + } + if (err == error_code_success) { + scheduler->schedule([cont]() { cont(); }); + return; + } + + lock.lock(); + if (txState != TxState::IN_PROGRESS) { + // Ignore further errors, if the transaction is in the error handing mode or completed + return; + } + txState = TxState::ON_ERROR; + lock.unlock(); + + ASSERT(!onErrorFuture); + onErrorFuture = fdbTx.onError(err); + fdb_error_t err2 = fdb_future_block_until_ready(onErrorFuture.fdbFuture()); + if (err2) { + transactionFailed(err2); + return; + } + auto thisRef = std::static_pointer_cast(shared_from_this()); + scheduler->schedule([thisRef]() { thisRef->handleOnErrorResult(); }); + } +}; + +/** + * Transaction context using callbacks to implement continuations on futures + */ +class AsyncTransactionContext : public TransactionContextBase { +public: + AsyncTransactionContext(FDBTransaction* tx, + std::shared_ptr txActor, + TTaskFct cont, + IScheduler* scheduler) + : TransactionContextBase(tx, txActor, cont, scheduler) {} + +protected: + void doContinueAfter(Future f, TTaskFct cont) override { + std::unique_lock lock(mutex); + if (txState != TxState::IN_PROGRESS) { + return; + } + callbackMap[f.fdbFuture()] = CallbackInfo{ f, cont, shared_from_this() }; + lock.unlock(); + fdb_error_t err = fdb_future_set_callback(f.fdbFuture(), futureReadyCallback, this); + if (err) { + lock.lock(); + callbackMap.erase(f.fdbFuture()); + lock.unlock(); + transactionFailed(err); + } + } + + static void futureReadyCallback(FDBFuture* f, void* param) { + AsyncTransactionContext* txCtx = (AsyncTransactionContext*)param; + txCtx->onFutureReady(f); + } + + void onFutureReady(FDBFuture* f) { + injectRandomSleep(); + std::unique_lock lock(mutex); + auto iter = callbackMap.find(f); + ASSERT(iter != callbackMap.end()); + CallbackInfo cbInfo = iter->second; + callbackMap.erase(iter); + if (txState != TxState::IN_PROGRESS) { + return; + } + lock.unlock(); + fdb_error_t 
err = fdb_future_get_error(f); + if (err == error_code_transaction_cancelled) { + return; + } + if (err == error_code_success) { + scheduler->schedule(cbInfo.cont); + return; + } + + lock.lock(); + if (txState != TxState::IN_PROGRESS) { + // Ignore further errors, if the transaction is in the error handing mode or completed + return; + } + txState = TxState::ON_ERROR; + lock.unlock(); + + ASSERT(!onErrorFuture); + onErrorFuture = tx()->onError(err); + onErrorThisRef = std::static_pointer_cast(shared_from_this()); + fdb_error_t err2 = fdb_future_set_callback(onErrorFuture.fdbFuture(), onErrorReadyCallback, this); + if (err2) { + onErrorFuture = {}; + transactionFailed(err2); + } + } + + static void onErrorReadyCallback(FDBFuture* f, void* param) { + AsyncTransactionContext* txCtx = (AsyncTransactionContext*)param; + txCtx->onErrorReady(f); + } + + void onErrorReady(FDBFuture* f) { + injectRandomSleep(); + auto thisRef = onErrorThisRef; + onErrorThisRef = {}; + scheduler->schedule([thisRef]() { thisRef->handleOnErrorResult(); }); + } + + void cleanUp() override { + TransactionContextBase::cleanUp(); + + // Cancel all pending operations + // Note that the callbacks of the cancelled futures will still be called + std::unique_lock lock(mutex); + std::vector futures; + for (auto& iter : callbackMap) { + futures.push_back(iter.second.future); + } + lock.unlock(); + for (auto& f : futures) { + f.cancel(); + } + } + + // Inject a random sleep with a low probability + void injectRandomSleep() { + if (Random::get().randomBool(0.01)) { + std::this_thread::sleep_for(std::chrono::milliseconds(Random::get().randomInt(1, 5))); + } + } + + // Object references for a future callback + struct CallbackInfo { + Future future; + TTaskFct cont; + std::shared_ptr thisRef; + }; + + // Map for keeping track of future waits and holding necessary object references + std::unordered_map callbackMap; + + // Holding reference to this for onError future C callback + std::shared_ptr onErrorThisRef; 
+}; + +/** + * Transaction executor base class, containing reusable functionality + */ class TransactionExecutorBase : public ITransactionExecutor { public: TransactionExecutorBase(const TransactionExecutorOptions& options) : options(options), scheduler(nullptr) {} @@ -197,14 +345,20 @@ public: } protected: - void executeWithDatabase(FDBDatabase* db, std::shared_ptr txActor, TTaskFct cont) { + // Execute the transaction on the given database instance + void executeOnDatabase(FDBDatabase* db, std::shared_ptr txActor, TTaskFct cont) { FDBTransaction* tx; fdb_error_t err = fdb_database_create_transaction(db, &tx); if (err != error_code_success) { - txActor->setError(err); + txActor->complete(err); cont(); } else { - TransactionContext* ctx = new TransactionContext(tx, txActor, cont, options, scheduler); + std::shared_ptr ctx; + if (options.blockOnFutures) { + ctx = std::make_shared(tx, txActor, cont, scheduler); + } else { + ctx = std::make_shared(tx, txActor, cont, scheduler); + } txActor->init(ctx); txActor->start(); } @@ -216,6 +370,9 @@ protected: IScheduler* scheduler; }; +/** + * Transaction executor load balancing transactions over a fixed pool of databases + */ class DBPoolTransactionExecutor : public TransactionExecutorBase { public: DBPoolTransactionExecutor(const TransactionExecutorOptions& options) : TransactionExecutorBase(options) {} @@ -239,7 +396,7 @@ public: void execute(std::shared_ptr txActor, TTaskFct cont) override { int idx = Random::get().randomInt(0, options.numDatabases - 1); - executeWithDatabase(databases[idx], txActor, cont); + executeOnDatabase(databases[idx], txActor, cont); } void release() { @@ -252,6 +409,9 @@ private: std::vector databases; }; +/** + * Transaction executor executing each transaction on a separate database + */ class DBPerTransactionExecutor : public TransactionExecutorBase { public: DBPerTransactionExecutor(const TransactionExecutorOptions& options) : TransactionExecutorBase(options) {} @@ -260,10 +420,10 @@ public: 
FDBDatabase* db = nullptr; fdb_error_t err = fdb_create_database(clusterFile.c_str(), &db); if (err != error_code_success) { - txActor->setError(err); + txActor->complete(err); cont(); } - executeWithDatabase(db, txActor, [cont, db]() { + executeOnDatabase(db, txActor, [cont, db]() { fdb_database_destroy(db); cont(); }); diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.h b/bindings/c/test/apitester/TesterTransactionExecutor.h index 3797d6df9e..8a279ba198 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.h +++ b/bindings/c/test/apitester/TesterTransactionExecutor.h @@ -44,7 +44,7 @@ public: // Schedule a continuation to be executed when the future gets ready virtual void continueAfter(Future f, TTaskFct cont) = 0; - // Commit the transaction + // Complete the transaction with a commit virtual void commit() = 0; // Mark the transaction as completed without committing it (for read transactions) @@ -62,19 +62,16 @@ public: virtual ~ITransactionActor() {} // Initialize with the given transaction context - virtual void init(ITransactionContext* ctx) = 0; + virtual void init(std::shared_ptr ctx) = 0; // Start execution of the transaction, also called on retries virtual void start() = 0; - // Reset the transaction state - virtual void reset() = 0; + // Transaction completion result (error_code_success in case of success) + virtual fdb_error_t getErrorCode() = 0; - // Abort the transaction with an unretriable error - virtual void setError(fdb_error_t err) { error = err; } - - // Unretriable error, set if the transaction has failed - fdb_error_t error = error_code_success; + // Notification about the completion of the transaction + virtual void complete(fdb_error_t err) = 0; }; /** @@ -82,20 +79,20 @@ public: */ class TransactionActorBase : public ITransactionActor { public: - void init(ITransactionContext* ctx) override { context = ctx; } + void init(std::shared_ptr ctx) override { context = ctx; } + fdb_error_t getErrorCode() override { 
return error; } + void complete(fdb_error_t err) override; protected: - ITransactionContext* ctx() { return context; } - Transaction* tx() { return ctx()->tx(); } - void commit() { ctx()->commit(); } - void reset() override {} + std::shared_ptr ctx() { return context; } private: - ITransactionContext* context = nullptr; + std::shared_ptr context; + fdb_error_t error = error_code_success; }; // Type of the lambda functions implementing a transaction -using TTxStartFct = std::function; +using TTxStartFct = std::function)>; /** * A wrapper class for transactions implemented by lambda functions diff --git a/bindings/c/test/apitester/TesterWorkload.cpp b/bindings/c/test/apitester/TesterWorkload.cpp index a374e5a09c..ce269ca824 100644 --- a/bindings/c/test/apitester/TesterWorkload.cpp +++ b/bindings/c/test/apitester/TesterWorkload.cpp @@ -86,11 +86,11 @@ void WorkloadBase::execTransaction(std::shared_ptr tx, TTaskF } tasksScheduled++; manager->txExecutor->execute(tx, [this, tx, cont, failOnError]() { - if (tx->error == error_code_success) { + fdb_error_t err = tx->getErrorCode(); + if (tx->getErrorCode() == error_code_success) { cont(); } else { - std::string msg = - fmt::format("Transaction failed with error: {} ({}})", tx->error, fdb_get_error(tx->error)); + std::string msg = fmt::format("Transaction failed with error: {} ({}})", err, fdb_get_error(err)); if (failOnError) { error(msg); failed = true; diff --git a/bindings/c/test/apitester/tests/CApiCancelTransactionBlocking.toml b/bindings/c/test/apitester/tests/CApiCancelTransactionBlocking.toml new file mode 100644 index 0000000000..9f153645e7 --- /dev/null +++ b/bindings/c/test/apitester/tests/CApiCancelTransactionBlocking.toml @@ -0,0 +1,24 @@ +[[test]] +title = 'Cancel Transaction with Blocking Waits' +multiThreaded = true +buggify = true +blockOnFutures = true +minFdbThreads = 2 +maxFdbThreads = 8 +minDatabases = 2 +maxDatabases = 8 +minClientThreads = 2 +maxClientThreads = 8 +minClients = 2 +maxClients = 8 + + 
[[test.workload]] + name = 'CancelTransaction' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 + readExistingKeysRatio = 0.9 \ No newline at end of file diff --git a/bindings/c/test/apitester/tests/CApiCancelTransactionCB.toml b/bindings/c/test/apitester/tests/CApiCancelTransactionCB.toml new file mode 100644 index 0000000000..96108c69b1 --- /dev/null +++ b/bindings/c/test/apitester/tests/CApiCancelTransactionCB.toml @@ -0,0 +1,23 @@ +[[test]] +title = 'Cancel Transactions with Future Callbacks' +multiThreaded = true +buggify = true +minFdbThreads = 2 +maxFdbThreads = 8 +minDatabases = 2 +maxDatabases = 8 +minClientThreads = 2 +maxClientThreads = 8 +minClients = 2 +maxClients = 8 + + [[test.workload]] + name = 'CancelTransaction' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 + readExistingKeysRatio = 0.9 \ No newline at end of file diff --git a/bindings/c/test/apitester/tests/CApiCancelTransactionDBPerTX.toml b/bindings/c/test/apitester/tests/CApiCancelTransactionDBPerTX.toml new file mode 100644 index 0000000000..ae40fbf696 --- /dev/null +++ b/bindings/c/test/apitester/tests/CApiCancelTransactionDBPerTX.toml @@ -0,0 +1,24 @@ +[[test]] +title = 'Cancel Transaction with Database per Transaction' +multiThreaded = true +buggify = true +databasePerTransaction = true +minFdbThreads = 2 +maxFdbThreads = 8 +minDatabases = 2 +maxDatabases = 8 +minClientThreads = 2 +maxClientThreads = 8 +minClients = 2 +maxClients = 8 + + [[test.workload]] + name = 'CancelTransaction' + minKeyLength = 1 + maxKeyLength = 64 + minValueLength = 1 + maxValueLength = 1000 + maxKeysPerTransaction = 50 + initialSize = 100 + numRandomOperations = 100 + readExistingKeysRatio = 0.9 \ No newline at end of file From a9ee2b99e27002176a978a2adbf8ec9a90bee3ab Mon Sep 17 00:00:00 
2001 From: Vaidas Gasiunas Date: Thu, 10 Mar 2022 19:30:22 +0100 Subject: [PATCH 048/138] ApiTester: fix boost target for sanitizer builds --- bindings/c/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt index d03d29937d..213b3daa30 100644 --- a/bindings/c/CMakeLists.txt +++ b/bindings/c/CMakeLists.txt @@ -163,7 +163,12 @@ if(NOT WIN32) target_link_libraries(fdb_c_unit_tests_version_510 PRIVATE fdb_c Threads::Threads) target_link_libraries(trace_partial_file_suffix_test PRIVATE fdb_c Threads::Threads flow) target_link_libraries(disconnected_timeout_unit_tests PRIVATE fdb_c Threads::Threads) + +if(USE_SANITIZER) + target_link_libraries(fdb_c_api_tester PRIVATE fdb_c toml11_target Threads::Threads fmt::fmt boost_asan) +else() target_link_libraries(fdb_c_api_tester PRIVATE fdb_c toml11_target Threads::Threads fmt::fmt boost_target) +endif() # do not set RPATH for mako set_property(TARGET mako PROPERTY SKIP_BUILD_RPATH TRUE) From 944ec484153c3a8c40dc6d8dd3b3e5f87563cb60 Mon Sep 17 00:00:00 2001 From: Ata E Husain Bohra Date: Thu, 10 Mar 2022 12:06:49 -0800 Subject: [PATCH 049/138] Introduce a simulated EncryptKeyVaultProxy interface (#6576) Description Major changes proposed are: 1. Rename ServerKnob->ENABLE_ENCRYPT_KEY_PROXY to ServerKnob->ENABLE_ENCRYPTION. This approach simplifies controlling the encryption code change using a single knob (desirable) 2. Implement an EncryptKeyVaultProxy simulated interface to assist in validating encryption workflows in simulation runs. The interface is leveraged to satisfy "encryption keys" lookup which otherwise gets satisfied by integrating an organization-preferred Encryption Key Management solution.
Testing Unit test to validate the newly added code --- fdbclient/ServerKnobs.cpp | 2 +- fdbclient/ServerKnobs.h | 2 +- fdbserver/CMakeLists.txt | 2 + fdbserver/ClusterController.actor.cpp | 19 ++- fdbserver/EncryptKeyProxy.actor.cpp | 15 ++ fdbserver/EncryptKeyProxyInterface.h | 1 + fdbserver/SimEncryptVaultProxy.actor.cpp | 153 ++++++++++++++++++ fdbserver/SimEncryptVaultProxy.actor.h | 132 +++++++++++++++ fdbserver/Status.actor.cpp | 2 +- .../workloads/ConsistencyCheck.actor.cpp | 2 +- fdbserver/workloads/UnitTests.actor.cpp | 2 + 11 files changed, 318 insertions(+), 14 deletions(-) create mode 100644 fdbserver/SimEncryptVaultProxy.actor.cpp create mode 100644 fdbserver/SimEncryptVaultProxy.actor.h diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index a65099edd5..38b8fa5f55 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -816,7 +816,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init ( CLUSTER_RECOVERY_EVENT_NAME_PREFIX, "Master"); // encrypt key proxy - init( ENABLE_ENCRYPT_KEY_PROXY, false ); + init( ENABLE_ENCRYPTION, false ); // Blob granlues init( BG_URL, "" ); // TODO: store in system key space, eventually diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index f962ec78b9..bf12da046a 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -765,7 +765,7 @@ public: std::string CLUSTER_RECOVERY_EVENT_NAME_PREFIX; // encrypt key proxy - bool ENABLE_ENCRYPT_KEY_PROXY; + bool ENABLE_ENCRYPTION; // blob granule stuff // FIXME: configure url with database configuration instead of knob eventually diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index dd83764289..728b4f9ab1 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -117,6 +117,8 @@ set(FDBSERVER_SRCS ServerDBInfo.actor.h ServerDBInfo.h SigStack.cpp + SimEncryptVaultProxy.actor.h + SimEncryptVaultProxy.actor.cpp SimpleConfigConsumer.actor.cpp 
SimpleConfigConsumer.h SimulatedCluster.actor.cpp diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index 753ec0e2ef..b5c5c67348 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -599,7 +599,7 @@ void checkBetterSingletons(ClusterControllerData* self) { } WorkerDetails newEKPWorker; - if (SERVER_KNOBS->ENABLE_ENCRYPT_KEY_PROXY) { + if (SERVER_KNOBS->ENABLE_ENCRYPTION) { newEKPWorker = findNewProcessForSingleton(self, ProcessClass::EncryptKeyProxy, id_used); } @@ -613,7 +613,7 @@ void checkBetterSingletons(ClusterControllerData* self) { } ProcessClass::Fitness bestFitnessForEKP; - if (SERVER_KNOBS->ENABLE_ENCRYPT_KEY_PROXY) { + if (SERVER_KNOBS->ENABLE_ENCRYPTION) { bestFitnessForEKP = findBestFitnessForSingleton(self, newEKPWorker, ProcessClass::EncryptKeyProxy); } @@ -638,7 +638,7 @@ void checkBetterSingletons(ClusterControllerData* self) { } bool ekpHealthy = true; - if (SERVER_KNOBS->ENABLE_ENCRYPT_KEY_PROXY) { + if (SERVER_KNOBS->ENABLE_ENCRYPTION) { ekpHealthy = isHealthySingleton( self, newEKPWorker, ekpSingleton, bestFitnessForEKP, self->recruitingEncryptKeyProxyID); } @@ -662,7 +662,7 @@ void checkBetterSingletons(ClusterControllerData* self) { } Optional> currEKPProcessId, newEKPProcessId; - if (SERVER_KNOBS->ENABLE_ENCRYPT_KEY_PROXY) { + if (SERVER_KNOBS->ENABLE_ENCRYPTION) { currEKPProcessId = ekpSingleton.interface.get().locality.processId(); newEKPProcessId = newEKPWorker.interf.locality.processId(); } @@ -674,7 +674,7 @@ void checkBetterSingletons(ClusterControllerData* self) { newPids.emplace_back(newBMProcessId); } - if (SERVER_KNOBS->ENABLE_ENCRYPT_KEY_PROXY) { + if (SERVER_KNOBS->ENABLE_ENCRYPTION) { currPids.emplace_back(currEKPProcessId); newPids.emplace_back(newEKPProcessId); } @@ -689,7 +689,7 @@ void checkBetterSingletons(ClusterControllerData* self) { } // if the knob is disabled, the EKP coloc counts should have no affect on the coloc counts check below 
- if (!SERVER_KNOBS->ENABLE_ENCRYPT_KEY_PROXY) { + if (!SERVER_KNOBS->ENABLE_ENCRYPTION) { ASSERT(currColocMap[currEKPProcessId] == 0); ASSERT(newColocMap[newEKPProcessId] == 0); } @@ -706,8 +706,7 @@ void checkBetterSingletons(ClusterControllerData* self) { ddSingleton.recruit(self); } else if (CLIENT_KNOBS->ENABLE_BLOB_GRANULES && newColocMap[newBMProcessId] < currColocMap[currBMProcessId]) { bmSingleton.recruit(self); - } else if (SERVER_KNOBS->ENABLE_ENCRYPT_KEY_PROXY && - newColocMap[newEKPProcessId] < currColocMap[currEKPProcessId]) { + } else if (SERVER_KNOBS->ENABLE_ENCRYPTION && newColocMap[newEKPProcessId] < currColocMap[currEKPProcessId]) { ekpSingleton.recruit(self); } } @@ -1240,7 +1239,7 @@ void registerWorker(RegisterWorkerRequest req, self, w, currSingleton, registeringSingleton, self->recruitingBlobManagerID); } - if (SERVER_KNOBS->ENABLE_ENCRYPT_KEY_PROXY && req.encryptKeyProxyInterf.present()) { + if (SERVER_KNOBS->ENABLE_ENCRYPTION && req.encryptKeyProxyInterf.present()) { auto currSingleton = EncryptKeyProxySingleton(self->db.serverInfo->get().encryptKeyProxy); auto registeringSingleton = EncryptKeyProxySingleton(req.encryptKeyProxyInterf); haltRegisteringOrCurrentSingleton( @@ -2416,7 +2415,7 @@ ACTOR Future clusterControllerCore(ClusterControllerFullInterface interf, state Future> error = errorOr(actorCollection(self.addActor.getFuture())); // EncryptKeyProxy is necessary for TLog recovery, recruit it as the first process - if (SERVER_KNOBS->ENABLE_ENCRYPT_KEY_PROXY) { + if (SERVER_KNOBS->ENABLE_ENCRYPTION) { self.addActor.send(monitorEncryptKeyProxy(&self)); } self.addActor.send(clusterWatchDatabase(&self, &self.db, coordinators, leaderFail)); // Start the master database diff --git a/fdbserver/EncryptKeyProxy.actor.cpp b/fdbserver/EncryptKeyProxy.actor.cpp index 0f666b0f53..4fca28d2f3 100644 --- a/fdbserver/EncryptKeyProxy.actor.cpp +++ b/fdbserver/EncryptKeyProxy.actor.cpp @@ -19,14 +19,18 @@ */ #include 
"fdbserver/EncryptKeyProxyInterface.h" +#include "fdbserver/SimEncryptVaultProxy.actor.h" #include "fdbserver/WorkerInterface.actor.h" #include "fdbserver/ServerDBInfo.h" #include "flow/Arena.h" #include "flow/Error.h" #include "flow/EventTypes.actor.h" #include "flow/FastRef.h" +#include "flow/IRandom.h" #include "flow/Trace.h" #include "flow/genericactors.actor.h" +#include "flow/network.h" + #include "flow/actorcompiler.h" // This must be the last #include. struct EncryptKeyProxyData : NonCopyable, ReferenceCounted { @@ -42,6 +46,17 @@ ACTOR Future encryptKeyProxyServer(EncryptKeyProxyInterface ekpInterface, state Future collection = actorCollection(self->addActor.getFuture()); self->addActor.send(traceRole(Role::ENCRYPT_KEY_PROXY, ekpInterface.id())); + SimEncryptVaultProxyInterface simEncryptVaultProxyInf; + if (g_network->isSimulated()) { + + // In simulation construct an EncryptVaultProxy actor to satisfy encryption keys lookups otherwise satisfied by + // integrating external Encryption Key Management solutions. 
+ + const uint32_t maxEncryptKeys = deterministicRandom()->randomInt(1024, 2048); + simEncryptVaultProxyInf.initEndpoints(); + self->addActor.send(simEncryptVaultProxyCore(simEncryptVaultProxyInf, maxEncryptKeys)); + } + TraceEvent("EKP_Start", self->myId).log(); // TODO(ahusain): skeleton implementation, more to come diff --git a/fdbserver/EncryptKeyProxyInterface.h b/fdbserver/EncryptKeyProxyInterface.h index 0123bc4edd..ad2da2f8ba 100644 --- a/fdbserver/EncryptKeyProxyInterface.h +++ b/fdbserver/EncryptKeyProxyInterface.h @@ -20,6 +20,7 @@ #ifndef FDBSERVER_ENCRYPTKEYPROXYINTERFACE_H #define FDBSERVER_ENCRYPTKEYPROXYINTERFACE_H +#include "flow/FileIdentifier.h" #include "flow/network.h" #pragma once diff --git a/fdbserver/SimEncryptVaultProxy.actor.cpp b/fdbserver/SimEncryptVaultProxy.actor.cpp new file mode 100644 index 0000000000..ad9ffdbcd0 --- /dev/null +++ b/fdbserver/SimEncryptVaultProxy.actor.cpp @@ -0,0 +1,153 @@ +/* + * SimEncryptVaultProxy.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include +#include + +#include "fdbrpc/sim_validation.h" +#include "fdbserver/SimEncryptVaultProxy.actor.h" +#include "flow/ActorCollection.h" +#include "flow/Error.h" +#include "flow/IRandom.h" +#include "flow/ITrace.h" +#include "flow/StreamCipher.h" +#include "flow/UnitTest.h" +#include "flow/actorcompiler.h" // This must be the last #include. + +struct SimEncryptKeyCtx { + SimEncryptKeyId id; + SimEncryptKey key; + + SimEncryptKeyCtx() : id(0) {} + explicit SimEncryptKeyCtx(SimEncryptKeyId kId, const char* data) : id(kId), key(data) {} +}; + +struct SimEncyrptVaultProxyContext { + uint32_t maxEncryptionKeys; + std::unordered_map> simEncryptKeyStore; + + SimEncyrptVaultProxyContext() : maxEncryptionKeys(0) {} + explicit SimEncyrptVaultProxyContext(uint32_t keyCount) : maxEncryptionKeys(keyCount) { + uint8_t buffer[AES_256_KEY_LENGTH]; + + // Construct encryption keyStore. + for (int i = 0; i < maxEncryptionKeys; i++) { + generateRandomData(&buffer[0], AES_256_KEY_LENGTH); + SimEncryptKeyCtx ctx(i, reinterpret_cast(buffer)); + simEncryptKeyStore[i] = std::make_unique(i, reinterpret_cast(buffer)); + } + } +}; + +ACTOR Future simEncryptVaultProxyCore(SimEncryptVaultProxyInterface interf, uint32_t maxEncryptKeys) { + state SimEncyrptVaultProxyContext vaultProxyCtx(maxEncryptKeys); + + ASSERT(vaultProxyCtx.simEncryptKeyStore.size() == maxEncryptKeys); + + TraceEvent("SimEncryptVaultProxy_Init", interf.id()).detail("MaxEncrptKeys", maxEncryptKeys); + + loop { + choose { + when(SimGetEncryptKeyByKeyIdRequest req = waitNext(interf.encryptKeyLookupByKeyId.getFuture())) { + SimGetEncryptKeyByKeyIdReply reply; + + // Lookup corresponding EncryptKeyCtx for input keyId + if (vaultProxyCtx.simEncryptKeyStore.find(req.encryptKeyId) != vaultProxyCtx.simEncryptKeyStore.end()) { + reply.encryptKey = StringRef(vaultProxyCtx.simEncryptKeyStore[req.encryptKeyId].get()->key); + req.reply.send(reply); + } else { + req.reply.sendError(key_not_found()); + } + } + 
when(SimGetEncryptKeyByDomainIdRequest req = waitNext(interf.encryptKeyLookupByDomainId.getFuture())) { + SimGetEncryptKeyByDomainIdReply reply; + + // Map encryptionDomainId to corresponding EncryptKeyCtx element using a modulo operation. This would + // mean multiple domains get mapped to the same encryption key which is fine, the EncryptKeyStore + // guarantees that keyId -> plaintext encryptKey mapping is idempotent. + + reply.encryptKeyId = req.encryptDomainId % maxEncryptKeys; + reply.encryptKey = StringRef(vaultProxyCtx.simEncryptKeyStore[reply.encryptKeyId].get()->key); + req.reply.send(reply); + } + } + } +} + +void forceLinkSimEncryptVaultProxyTests() {} + +namespace { + +ACTOR Future testRunWorkload(SimEncryptVaultProxyInterface inf, uint32_t nEncryptionKeys) { + state uint32_t maxEncryptionKeys = nEncryptionKeys; + state int maxDomainIds = deterministicRandom()->randomInt(121, 295); + state int maxIterations = deterministicRandom()->randomInt(786, 1786); + state std::unordered_map> domainIdKeyMap; + state int i = 0; + + TraceEvent("RunWorkloadStart").detail("MaxDomainIds", maxDomainIds).detail("MaxIterations", maxIterations); + + { + // construct domainId to EncryptKeyCtx map + for (i = 0; i < maxDomainIds; i++) { + SimGetEncryptKeyByDomainIdRequest req; + req.encryptDomainId = i; + SimGetEncryptKeyByDomainIdReply reply = wait(inf.encryptKeyLookupByDomainId.getReply(req)); + domainIdKeyMap[i] = + std::make_unique(reply.encryptKeyId, reply.encryptKey.toString().c_str()); + } + + // randomly pick any domainId and validate if lookupByKeyId result matches + for (i = 0; i < maxIterations; i++) { + state int idx = deterministicRandom()->randomInt(0, maxDomainIds); + state SimEncryptKeyCtx* ctx = domainIdKeyMap[idx].get(); + SimGetEncryptKeyByKeyIdRequest req(ctx->id); + SimGetEncryptKeyByKeyIdReply reply = wait(inf.encryptKeyLookupByKeyId.getReply(req)); + ASSERT(reply.encryptKey.compare(ctx->key) == 0); + } + } + + { + // Verify unknown key access returns
the error + state SimGetEncryptKeyByKeyIdRequest req; + req.encryptKeyId = maxEncryptionKeys + 1; + try { + SimGetEncryptKeyByKeyIdReply reply = wait(inf.encryptKeyLookupByKeyId.getReply(req)); + } catch (Error& e) { + ASSERT(e.code() == error_code_key_not_found); + } + } + + TraceEvent("RunWorkloadDone").log(); + return Void(); +} + +} // namespace + +TEST_CASE("fdbserver/SimEncryptVaultProxy") { + state SimEncryptVaultProxyInterface inf; + state uint32_t maxEncryptKeys = 64; + + loop choose { + when(wait(simEncryptVaultProxyCore(inf, maxEncryptKeys))) { throw internal_error(); } + when(wait(testRunWorkload(inf, maxEncryptKeys))) { break; } + } + return Void(); +} \ No newline at end of file diff --git a/fdbserver/SimEncryptVaultProxy.actor.h b/fdbserver/SimEncryptVaultProxy.actor.h new file mode 100644 index 0000000000..169cc351cf --- /dev/null +++ b/fdbserver/SimEncryptVaultProxy.actor.h @@ -0,0 +1,132 @@ +/* + * SimEncryptVaultProxy.actor.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_SIMENCRYPTVAULTPROXY_ACTOR_G_H) +#define FDBSERVER_SIMENCRYPTVAULTPROXY_ACTOR_G_H +#include "fdbserver/SimEncryptVaultProxy.actor.g.h" +#elif !defined(FDBSERVER_SIMENCRYPTVAULTPROXY_ACTOR_H) +#define FDBSERVER_SIMENCRYPTVAULTPROXY_ACTOR_H + +#include "fdbclient/FDBTypes.h" +#include "fdbrpc/fdbrpc.h" +#include "flow/FileIdentifier.h" +#include "flow/Trace.h" +#include "flow/flow.h" +#include "flow/network.h" +#include "flow/actorcompiler.h" // This must be the last #include. + +using SimEncryptKeyId = uint64_t; +using SimEncryptDomainId = uint64_t; +using SimEncryptKey = std::string; + +struct SimEncryptVaultProxyInterface { + constexpr static FileIdentifier file_identifier = 2416711; + RequestStream> waitFailure; + RequestStream encryptKeyLookupByKeyId; + RequestStream encryptKeyLookupByDomainId; + + SimEncryptVaultProxyInterface() {} + + UID id() const { return encryptKeyLookupByKeyId.getEndpoint().token; } + template + void serialize(Archive& ar) { + if constexpr (!is_fb_function) { + ASSERT(ar.protocolVersion().isValid()); + } + serializer(ar, waitFailure); + if (Archive::isDeserializing) { + encryptKeyLookupByKeyId = + RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(1)); + encryptKeyLookupByDomainId = + RequestStream(waitFailure.getEndpoint().getAdjustedEndpoint(2)); + } + } + + void initEndpoints() { + std::vector> streams; + streams.push_back(waitFailure.getReceiver()); + streams.push_back(encryptKeyLookupByKeyId.getReceiver(TaskPriority::DefaultPromiseEndpoint)); + streams.push_back(encryptKeyLookupByDomainId.getReceiver(TaskPriority::DefaultPromiseEndpoint)); + FlowTransport::transport().addEndpoints(streams); + } +}; + +struct SimGetEncryptKeyByKeyIdReply { + constexpr static FileIdentifier file_identifier = 2313778; + Standalone encryptKey; + + SimGetEncryptKeyByKeyIdReply() : encryptKey(StringRef()) {} + explicit SimGetEncryptKeyByKeyIdReply(Standalone key) : 
encryptKey(key) {} + + template + void serialize(Ar& ar) { + serializer(ar, encryptKey); + } +}; + +struct SimGetEncryptKeyByKeyIdRequest { + constexpr static FileIdentifier file_identifier = 6913396; + SimEncryptKeyId encryptKeyId; + ReplyPromise reply; + + SimGetEncryptKeyByKeyIdRequest() : encryptKeyId(0) {} + explicit SimGetEncryptKeyByKeyIdRequest(SimEncryptKeyId keyId) : encryptKeyId(keyId) {} + + template + void serialize(Ar& ar) { + serializer(ar, encryptKeyId, reply); + } +}; + +struct SimGetEncryptKeyByDomainIdReply { + constexpr static FileIdentifier file_identifier = 3009025; + SimEncryptDomainId encryptKeyId; + Standalone encryptKey; + + SimGetEncryptKeyByDomainIdReply() : encryptKeyId(0), encryptKey(StringRef()) {} + explicit SimGetEncryptKeyByDomainIdReply(SimEncryptKeyId keyId, Standalone key) + : encryptKeyId(keyId), encryptKey(key) {} + + template + void serialize(Ar& ar) { + serializer(ar, encryptKeyId, encryptKey); + } +}; + +struct SimGetEncryptKeyByDomainIdRequest { + constexpr static FileIdentifier file_identifier = 9918682; + SimEncryptDomainId encryptDomainId; + ReplyPromise reply; + + SimGetEncryptKeyByDomainIdRequest() : encryptDomainId(0) {} + explicit SimGetEncryptKeyByDomainIdRequest(SimEncryptDomainId domainId) : encryptDomainId(domainId) {} + + template + void serialize(Ar& ar) { + serializer(ar, encryptDomainId, reply); + } +}; + +ACTOR Future simEncryptVaultProxyCore(struct SimEncryptVaultProxyInterface interf, uint32_t maxEncryptKeys); + +#include "flow/unactorcompiler.h" +#endif // FDBSERVER_SIMENCRYPTVAULTPROXY_ACTOR_H diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index fd43618170..bd6dcc70c1 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -813,7 +813,7 @@ ACTOR static Future processStatusFetcher( roles.addRole("blob_manager", db->get().blobManager.get()); } - if (SERVER_KNOBS->ENABLE_ENCRYPT_KEY_PROXY && db->get().encryptKeyProxy.present()) { + if 
(SERVER_KNOBS->ENABLE_ENCRYPTION && db->get().encryptKeyProxy.present()) { roles.addRole("encrypt_key_proxy", db->get().encryptKeyProxy.get()); } diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 5ee5e3c8f5..b64850f544 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -2374,7 +2374,7 @@ struct ConsistencyCheckWorkload : TestWorkload { } // Check EncryptKeyProxy - if (SERVER_KNOBS->ENABLE_ENCRYPT_KEY_PROXY && db.encryptKeyProxy.present() && + if (SERVER_KNOBS->ENABLE_ENCRYPTION && db.encryptKeyProxy.present() && (!nonExcludedWorkerProcessMap.count(db.encryptKeyProxy.get().address()) || nonExcludedWorkerProcessMap[db.encryptKeyProxy.get().address()].processClass.machineClassFitness( ProcessClass::EncryptKeyProxy) > fitnessLowerBound)) { diff --git a/fdbserver/workloads/UnitTests.actor.cpp b/fdbserver/workloads/UnitTests.actor.cpp index cffd6aaad3..8127b0ae87 100644 --- a/fdbserver/workloads/UnitTests.actor.cpp +++ b/fdbserver/workloads/UnitTests.actor.cpp @@ -34,6 +34,7 @@ void forceLinkStreamCipherTests(); void forceLinkParallelStreamTests(); void forceLinkSimExternalConnectionTests(); void forceLinkMutationLogReaderTests(); +void forceLinkSimEncryptVaultProxyTests(); void forceLinkIThreadPoolTests(); struct UnitTestWorkload : TestWorkload { @@ -79,6 +80,7 @@ struct UnitTestWorkload : TestWorkload { forceLinkParallelStreamTests(); forceLinkSimExternalConnectionTests(); forceLinkMutationLogReaderTests(); + forceLinkSimEncryptVaultProxyTests(); forceLinkIThreadPoolTests(); } From 30957c3a43460eab10931fd808e13a0a061b230f Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Thu, 10 Mar 2022 14:22:45 -0800 Subject: [PATCH 050/138] Fix unclear comment. 
--- fdbserver/IPager.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index bf14613cbd..ca3000dd33 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -309,8 +309,9 @@ public: // Advance the commit version and the oldest readble version and commit until the remap queue is empty. virtual Future clearRemapQueue() = 0; - // If set to a valid pointer, the page cache should behave as though the page cache size limit has been - // reduced by the target byte count. + // Get a pointer to an integer representing a byte count penalty the pager should apply against usable page cache + // memory. This is used to track significant memory usage external to the pager. Such usages should + // increment/decrement the value at this pointer based on their memory footprint. virtual int64_t* getPageCachePenaltySource() = 0; protected: From a7bce7798794674f266fc20452b3923f5496a3c1 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Fri, 11 Mar 2022 00:35:25 -0400 Subject: [PATCH 051/138] Mark Smoother::smooth* methods const --- fdbrpc/Smoother.h | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/fdbrpc/Smoother.h b/fdbrpc/Smoother.h index 667c4c60ed..aab53ebd8b 100644 --- a/fdbrpc/Smoother.h +++ b/fdbrpc/Smoother.h @@ -25,9 +25,22 @@ #include "flow/flow.h" #include -struct Smoother { +class Smoother { // Times (t) are expected to be nondecreasing + double eFoldingTime; + double total; + mutable double time, estimate; + + void update(double t) const { + double elapsed = t - time; + if (elapsed) { + time = t; + estimate += (total - estimate) * (1 - exp(-elapsed / eFoldingTime)); + } + } + +public: explicit Smoother(double eFoldingTime) : eFoldingTime(eFoldingTime) { reset(0); } void reset(double value) { time = 0; @@ -41,28 +54,18 @@ struct Smoother { total += delta; } // smoothTotal() is a continuous (under)estimate of the sum of all addDeltas() - double 
smoothTotal(double t = now()) { + double smoothTotal(double t = now()) const { update(t); return estimate; } // smoothRate() is d/dt[smoothTotal], and is NOT continuous - double smoothRate(double t = now()) { + double smoothRate(double t = now()) const { update(t); return (total - estimate) / eFoldingTime; } - - void update(double t) { - double elapsed = t - time; - if (elapsed) { - time = t; - estimate += (total - estimate) * (1 - exp(-elapsed / eFoldingTime)); - } - } - - double eFoldingTime; - double time, total, estimate; }; +// TODO: Improve encapsulation struct TimerSmoother { // Times (t) are expected to be nondecreasing From 9aca5f80112fab9da667e5c0579834ffbad1902b Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Fri, 11 Mar 2022 00:47:35 -0400 Subject: [PATCH 052/138] Mark several methods const --- fdbclient/DatabaseContext.h | 6 +++--- fdbrpc/LoadBalance.actor.h | 4 ++-- fdbrpc/QueueModel.cpp | 2 +- fdbrpc/QueueModel.h | 2 +- fdbserver/TagThrottler.actor.cpp | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index f8c8fb58b3..a03c25d50b 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -107,13 +107,13 @@ public: void addReleased(int released) { smoothReleased.addDelta(released); } - bool expired() { return expiration <= now(); } + bool expired() const { return expiration <= now(); } void updateChecked() { lastCheck = now(); } - bool canRecheck() { return lastCheck < now() - CLIENT_KNOBS->TAG_THROTTLE_RECHECK_INTERVAL; } + bool canRecheck() const { return lastCheck < now() - CLIENT_KNOBS->TAG_THROTTLE_RECHECK_INTERVAL; } - double throttleDuration() { + double throttleDuration() const { if (expiration <= now()) { return 0.0; } diff --git a/fdbrpc/LoadBalance.actor.h b/fdbrpc/LoadBalance.actor.h index 838a1762b8..cb9321d3d1 100644 --- a/fdbrpc/LoadBalance.actor.h +++ b/fdbrpc/LoadBalance.actor.h @@ -490,7 +490,7 @@ Future loadBalance( 
RequestStream const* thisStream = &alternatives->get(i, channel); if (!IFailureMonitor::failureMonitor().getState(thisStream->getEndpoint()).failed) { - auto& qd = model->getMeasurement(thisStream->getEndpoint().token.first()); + auto const& qd = model->getMeasurement(thisStream->getEndpoint().token.first()); if (now() > qd.failedUntil) { double thisMetric = qd.smoothOutstanding.smoothTotal(); double thisTime = qd.latency; @@ -529,7 +529,7 @@ Future loadBalance( for (int i = alternatives->countBest(); i < alternatives->size(); i++) { RequestStream const* thisStream = &alternatives->get(i, channel); if (!IFailureMonitor::failureMonitor().getState(thisStream->getEndpoint()).failed) { - auto& qd = model->getMeasurement(thisStream->getEndpoint().token.first()); + auto const& qd = model->getMeasurement(thisStream->getEndpoint().token.first()); if (now() > qd.failedUntil) { double thisMetric = qd.smoothOutstanding.smoothTotal(); double thisTime = qd.latency; diff --git a/fdbrpc/QueueModel.cpp b/fdbrpc/QueueModel.cpp index 0fceb8929d..e9c2ddca4e 100644 --- a/fdbrpc/QueueModel.cpp +++ b/fdbrpc/QueueModel.cpp @@ -50,7 +50,7 @@ void QueueModel::endRequest(uint64_t id, double latency, double penalty, double } } -QueueData& QueueModel::getMeasurement(uint64_t id) { +QueueData const& QueueModel::getMeasurement(uint64_t id) { return data[id]; // return smoothed penalty } diff --git a/fdbrpc/QueueModel.h b/fdbrpc/QueueModel.h index 06d883c5e6..f7eedc6fa8 100644 --- a/fdbrpc/QueueModel.h +++ b/fdbrpc/QueueModel.h @@ -93,7 +93,7 @@ public: // - futureVersion: indicates whether there was "future version" error or // not. void endRequest(uint64_t id, double latency, double penalty, double delta, bool clean, bool futureVersion); - QueueData& getMeasurement(uint64_t id); + QueueData const& getMeasurement(uint64_t id); // Starts a new request to storage server with `id`. 
If the storage // server contains a penalty, add it to the queue size, and return the diff --git a/fdbserver/TagThrottler.actor.cpp b/fdbserver/TagThrottler.actor.cpp index 2e215a7c9b..e83f16aaf7 100644 --- a/fdbserver/TagThrottler.actor.cpp +++ b/fdbserver/TagThrottler.actor.cpp @@ -38,7 +38,7 @@ class RkTagThrottleCollection : NonCopyable { RkTagThrottleData() : clientRate(CLIENT_KNOBS->TAG_THROTTLE_SMOOTHING_WINDOW) {} - double getTargetRate(Optional requestRate) { + double getTargetRate(Optional requestRate) const { if (limits.tpsRate == 0.0 || !requestRate.present() || requestRate.get() == 0.0 || !rateSet) { return limits.tpsRate; } else { From e00820cdd71768b847479fe4fb003e2699b88b01 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Fri, 11 Mar 2022 01:04:13 -0400 Subject: [PATCH 053/138] Reduce mutable access to *QueueInfo objects in Ratekeeper.actor.cpp --- fdbserver/Ratekeeper.actor.cpp | 8 ++++---- fdbserver/TagThrottler.actor.cpp | 4 ++-- fdbserver/TagThrottler.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index 8edb87a0fb..9b8b62e5ac 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -511,8 +511,8 @@ void Ratekeeper::updateRate(RatekeeperLimits* limits) { int64_t limitingStorageQueueStorageServer = 0; int64_t worstDurabilityLag = 0; - std::multimap storageTpsLimitReverseIndex; - std::multimap storageDurabilityLagReverseIndex; + std::multimap storageTpsLimitReverseIndex; + std::multimap storageDurabilityLagReverseIndex; std::map ssReasons; @@ -522,7 +522,7 @@ void Ratekeeper::updateRate(RatekeeperLimits* limits) { // Look at each storage server's write queue and local rate, compute and store the desired rate ratio for (auto i = storageQueueInfo.begin(); i != storageQueueInfo.end(); ++i) { - auto& ss = i->value; + auto const& ss = i->value; if (!ss.valid || (remoteDC.present() && ss.locality.dcId() == remoteDC)) continue; 
++sscount; @@ -779,7 +779,7 @@ void Ratekeeper::updateRate(RatekeeperLimits* limits) { int64_t worstStorageQueueTLog = 0; int tlcount = 0; for (auto& it : tlogQueueInfo) { - auto& tl = it.value; + auto const& tl = it.value; if (!tl.valid) continue; ++tlcount; diff --git a/fdbserver/TagThrottler.actor.cpp b/fdbserver/TagThrottler.actor.cpp index e83f16aaf7..5eaad8b072 100644 --- a/fdbserver/TagThrottler.actor.cpp +++ b/fdbserver/TagThrottler.actor.cpp @@ -538,7 +538,7 @@ public: int64_t manualThrottleCount() const { return throttledTags.manualThrottleCount(); } bool isAutoThrottlingEnabled() const { return autoThrottlingEnabled; } - Future tryAutoThrottleTag(StorageQueueInfo& ss, int64_t storageQueue, int64_t storageDurabilityLag) { + Future tryAutoThrottleTag(StorageQueueInfo const& ss, int64_t storageQueue, int64_t storageDurabilityLag) { // NOTE: we just keep it simple and don't differentiate write-saturation and read-saturation at the moment. In // most of situation, this works. More indicators besides queue size and durability lag could be investigated in // the future @@ -591,7 +591,7 @@ int64_t TagThrottler::manualThrottleCount() const { bool TagThrottler::isAutoThrottlingEnabled() const { return impl->isAutoThrottlingEnabled(); } -Future TagThrottler::tryAutoThrottleTag(StorageQueueInfo& ss, +Future TagThrottler::tryAutoThrottleTag(StorageQueueInfo const& ss, int64_t storageQueue, int64_t storageDurabilityLag) { return impl->tryAutoThrottleTag(ss, storageQueue, storageDurabilityLag); diff --git a/fdbserver/TagThrottler.h b/fdbserver/TagThrottler.h index 4070162545..29989a5083 100644 --- a/fdbserver/TagThrottler.h +++ b/fdbserver/TagThrottler.h @@ -38,5 +38,5 @@ public: uint32_t busyWriteTagCount() const; int64_t manualThrottleCount() const; bool isAutoThrottlingEnabled() const; - Future tryAutoThrottleTag(StorageQueueInfo&, int64_t storageQueue, int64_t storageDurabilityLag); + Future tryAutoThrottleTag(StorageQueueInfo const&, int64_t storageQueue, int64_t 
storageDurabilityLag); }; From 9e81945301d03bdca5fcdaf653490958ec6339d2 Mon Sep 17 00:00:00 2001 From: Mohamed Oulmahdi Date: Fri, 11 Mar 2022 10:06:07 +0100 Subject: [PATCH 054/138] Enable encryption for Windows --- fdbrpc/CMakeLists.txt | 2 +- flow/CMakeLists.txt | 2 +- flow/StreamCipher.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbrpc/CMakeLists.txt b/fdbrpc/CMakeLists.txt index 046ba4ff46..baff60b4f0 100644 --- a/fdbrpc/CMakeLists.txt +++ b/fdbrpc/CMakeLists.txt @@ -36,7 +36,7 @@ set(FDBRPC_SRCS TraceFileIO.cpp TSSComparison.h) -if(WITH_TLS AND NOT WIN32) +if(WITH_TLS) set(FDBRPC_SRCS ${FDBRPC_SRCS} AsyncFileEncrypted.actor.cpp) diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index a64b028974..96a8842bbf 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -100,7 +100,7 @@ set(FLOW_SRCS xxhash.c xxhash.h) -if(WITH_TLS AND NOT WIN32) +if(WITH_TLS) set(FLOW_SRCS ${FLOW_SRCS} StreamCipher.cpp) diff --git a/flow/StreamCipher.h b/flow/StreamCipher.h index 2586ad8392..d963472d08 100644 --- a/flow/StreamCipher.h +++ b/flow/StreamCipher.h @@ -20,7 +20,7 @@ #pragma once -#if (!defined(TLS_DISABLED) && !defined(_WIN32)) +#if (!defined(TLS_DISABLED)) #define ENCRYPTION_ENABLED 1 #else #define ENCRYPTION_ENABLED 0 From 3a6568580a60a8f267e782698f6fb680f5c00683 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Fri, 11 Mar 2022 07:14:03 -0400 Subject: [PATCH 055/138] Improve encapsulation of TimerSmoother --- fdbrpc/Smoother.h | 27 +++++++++++---------- fdbserver/DataDistribution.actor.h | 38 +++++++++++++----------------- 2 files changed, 31 insertions(+), 34 deletions(-) diff --git a/fdbrpc/Smoother.h b/fdbrpc/Smoother.h index aab53ebd8b..b6b2e010cd 100644 --- a/fdbrpc/Smoother.h +++ b/fdbrpc/Smoother.h @@ -65,10 +65,20 @@ public: } }; -// TODO: Improve encapsulation -struct TimerSmoother { +class TimerSmoother { // Times (t) are expected to be nondecreasing + double eFoldingTime; + double total; + mutable double time, 
estimate; + + void update(double t) const { + double elapsed = t - time; + time = t; + estimate += (total - estimate) * (1 - exp(-elapsed / eFoldingTime)); + } + +public: explicit TimerSmoother(double eFoldingTime) : eFoldingTime(eFoldingTime) { reset(0); } void reset(double value) { time = 0; @@ -82,24 +92,17 @@ struct TimerSmoother { total += delta; } // smoothTotal() is a continuous (under)estimate of the sum of all addDeltas() - double smoothTotal(double t = timer()) { + double smoothTotal(double t = timer()) const { update(t); return estimate; } // smoothRate() is d/dt[smoothTotal], and is NOT continuous - double smoothRate(double t = timer()) { + double smoothRate(double t = timer()) const { update(t); return (total - estimate) / eFoldingTime; } - void update(double t) { - double elapsed = t - time; - time = t; - estimate += (total - estimate) * (1 - exp(-elapsed / eFoldingTime)); - } - - double eFoldingTime; - double time, total, estimate; + double getTotal() const { return total; } }; #endif diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index 3adca0b20e..6b391cc414 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -315,29 +315,23 @@ struct StorageWiggleMetrics { template void serialize(Ar& ar) { + double step_total, round_total; + if (!ar.isDeserializing) { + step_total = smoothed_wiggle_duration.getTotal(); + round_total = smoothed_round_duration.getTotal(); + } + serializer(ar, + last_wiggle_start, + last_wiggle_finish, + step_total, + finished_wiggle, + last_round_start, + last_round_finish, + round_total, + finished_round); if (ar.isDeserializing) { - double step_total, round_total; - serializer(ar, - last_wiggle_start, - last_wiggle_finish, - step_total, - finished_wiggle, - last_round_start, - last_round_finish, - round_total, - finished_round); smoothed_round_duration.reset(round_total); smoothed_wiggle_duration.reset(step_total); - } else { - serializer(ar, - 
last_wiggle_start, - last_wiggle_finish, - smoothed_wiggle_duration.total, - finished_wiggle, - last_round_start, - last_round_finish, - smoothed_round_duration.total, - finished_round); } } @@ -375,14 +369,14 @@ struct StorageWiggleMetrics { result["last_round_finish_datetime"] = timerIntToGmt(last_round_finish); result["last_round_start_timestamp"] = last_round_start; result["last_round_finish_timestamp"] = last_round_finish; - result["smoothed_round_seconds"] = smoothed_round_duration.estimate; + result["smoothed_round_seconds"] = smoothed_round_duration.smoothTotal(); result["finished_round"] = finished_round; result["last_wiggle_start_datetime"] = timerIntToGmt(last_wiggle_start); result["last_wiggle_finish_datetime"] = timerIntToGmt(last_wiggle_finish); result["last_wiggle_start_timestamp"] = last_wiggle_start; result["last_wiggle_finish_timestamp"] = last_wiggle_finish; - result["smoothed_wiggle_seconds"] = smoothed_wiggle_duration.estimate; + result["smoothed_wiggle_seconds"] = smoothed_wiggle_duration.smoothTotal(); result["finished_wiggle"] = finished_wiggle; return result; } From 82f709c9d3b4deb5885da4767b5bea990834f94e Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Fri, 11 Mar 2022 08:40:24 -0400 Subject: [PATCH 056/138] Use CRTP to remove duplicate code from Smoother.h --- fdbrpc/Smoother.h | 67 ++++++++++++++--------------------------------- 1 file changed, 19 insertions(+), 48 deletions(-) diff --git a/fdbrpc/Smoother.h b/fdbrpc/Smoother.h index b6b2e010cd..80b9a6bbc3 100644 --- a/fdbrpc/Smoother.h +++ b/fdbrpc/Smoother.h @@ -18,14 +18,13 @@ * limitations under the License. 
*/ -#ifndef FLOW_SMOOTHER_H -#define FLOW_SMOOTHER_H #pragma once #include "flow/flow.h" #include -class Smoother { +template +class SmootherImpl { // Times (t) are expected to be nondecreasing double eFoldingTime; @@ -40,64 +39,27 @@ class Smoother { } } +protected: + explicit SmootherImpl(double eFoldingTime) : eFoldingTime(eFoldingTime) { reset(0); } + public: - explicit Smoother(double eFoldingTime) : eFoldingTime(eFoldingTime) { reset(0); } void reset(double value) { time = 0; total = value; estimate = value; } - - void setTotal(double total, double t = now()) { addDelta(total - this->total, t); } - void addDelta(double delta, double t = now()) { + void setTotal(double total, double t = T::now()) { addDelta(total - this->total, t); } + void addDelta(double delta, double t = T::now()) { update(t); total += delta; } // smoothTotal() is a continuous (under)estimate of the sum of all addDeltas() - double smoothTotal(double t = now()) const { + double smoothTotal(double t = T::now()) const { update(t); return estimate; } // smoothRate() is d/dt[smoothTotal], and is NOT continuous - double smoothRate(double t = now()) const { - update(t); - return (total - estimate) / eFoldingTime; - } -}; - -class TimerSmoother { - // Times (t) are expected to be nondecreasing - - double eFoldingTime; - double total; - mutable double time, estimate; - - void update(double t) const { - double elapsed = t - time; - time = t; - estimate += (total - estimate) * (1 - exp(-elapsed / eFoldingTime)); - } - -public: - explicit TimerSmoother(double eFoldingTime) : eFoldingTime(eFoldingTime) { reset(0); } - void reset(double value) { - time = 0; - total = value; - estimate = value; - } - - void setTotal(double total, double t = timer()) { addDelta(total - this->total, t); } - void addDelta(double delta, double t = timer()) { - update(t); - total += delta; - } - // smoothTotal() is a continuous (under)estimate of the sum of all addDeltas() - double smoothTotal(double t = timer()) const { - 
update(t); - return estimate; - } - // smoothRate() is d/dt[smoothTotal], and is NOT continuous - double smoothRate(double t = timer()) const { + double smoothRate(double t = T::now()) const { update(t); return (total - estimate) / eFoldingTime; } @@ -105,4 +67,13 @@ public: double getTotal() const { return total; } }; -#endif +class Smoother : public SmootherImpl { +public: + static double now() { return ::now(); } + explicit Smoother(double eFoldingTime) : SmootherImpl(eFoldingTime) {} +}; +class TimerSmoother : public SmootherImpl { +public: + static double now() { return timer(); } + explicit TimerSmoother(double eFoldingTime) : SmootherImpl(eFoldingTime) {} +}; From 184f0d75863efce5b02f578a7ce4425425b1a9dc Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Fri, 11 Mar 2022 09:20:53 -0400 Subject: [PATCH 057/138] Improve const correctness of storage wiggler classes --- fdbserver/DataDistribution.actor.h | 16 ++++++++-------- fdbserver/DataDistributionTracker.actor.cpp | 10 ++++++---- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index 6b391cc414..e9bd7da53e 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -170,7 +170,7 @@ public: // intersecting shards. 
int getNumberOfShards(UID ssID) const; - std::vector getShardsFor(Team team); + std::vector getShardsFor(Team team) const; bool hasShards(Team team) const; // The first element of the pair is either the source for non-moving shards or the destination team for in-flight @@ -180,7 +180,7 @@ public: void defineShard(KeyRangeRef keys); void moveShard(KeyRangeRef keys, std::vector destinationTeam); void finishMove(KeyRangeRef keys); - void check(); + void check() const; private: struct OrderByTeamKey { @@ -363,7 +363,7 @@ struct StorageWiggleMetrics { }); } - StatusObject toJSON() { + StatusObject toJSON() const { StatusObject result; result["last_round_start_datetime"] = timerIntToGmt(last_round_start); result["last_round_finish_datetime"] = timerIntToGmt(last_round_finish); @@ -383,7 +383,7 @@ struct StorageWiggleMetrics { }; struct StorageWiggler : ReferenceCounted { - DDTeamCollection* teamCollection; + DDTeamCollection const* teamCollection; StorageWiggleMetrics metrics; // data structures @@ -410,8 +410,8 @@ struct StorageWiggler : ReferenceCounted { void removeServer(const UID& serverId); // update metadata and adjust priority_queue void updateMetadata(const UID& serverId, const StorageMetadataType& metadata); - bool contains(const UID& serverId) { return pq_handles.count(serverId) > 0; } - bool empty() { return wiggle_pq.empty(); } + bool contains(const UID& serverId) const { return pq_handles.count(serverId) > 0; } + bool empty() const { return wiggle_pq.empty(); } Optional getNextServerId(); // -- statistic update @@ -423,8 +423,8 @@ struct StorageWiggler : ReferenceCounted { // called when start wiggling a SS Future startWiggle(); Future finishWiggle(); - bool shouldStartNewRound() { return metrics.last_round_finish >= metrics.last_round_start; } - bool shouldFinishRound() { + bool shouldStartNewRound() const { return metrics.last_round_finish >= metrics.last_round_start; } + bool shouldFinishRound() const { if (wiggle_pq.empty()) return true; return 
(wiggle_pq.top().first.createdTime >= metrics.last_round_start); diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index 11b8cdbf6f..d20cc65bec 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -973,7 +973,7 @@ ACTOR Future dataDistributionTracker(Reference in } } -std::vector ShardsAffectedByTeamFailure::getShardsFor(Team team) { +std::vector ShardsAffectedByTeamFailure::getShardsFor(Team team) const { std::vector r; for (auto it = team_shards.lower_bound(std::pair(team, KeyRangeRef())); it != team_shards.end() && it->first == team; @@ -1106,7 +1106,7 @@ void ShardsAffectedByTeamFailure::finishMove(KeyRangeRef keys) { } } -void ShardsAffectedByTeamFailure::check() { +void ShardsAffectedByTeamFailure::check() const { if (EXPENSIVE_VALIDATION) { for (auto t = team_shards.begin(); t != team_shards.end(); ++t) { auto i = shard_teams.rangeContaining(t->second.begin); @@ -1115,8 +1115,8 @@ void ShardsAffectedByTeamFailure::check() { } } auto rs = shard_teams.ranges(); - for (auto i = rs.begin(); i != rs.end(); ++i) - for (std::vector::iterator t = i->value().first.begin(); t != i->value().first.end(); ++t) + for (auto i = rs.begin(); i != rs.end(); ++i) { + for (auto t = i->value().first.begin(); t != i->value().first.end(); ++t) { if (!team_shards.count(std::make_pair(*t, i->range()))) { std::string teamDesc, shards; for (int k = 0; k < t->servers.size(); k++) @@ -1132,5 +1132,7 @@ void ShardsAffectedByTeamFailure::check() { .detail("Shards", shards); ASSERT(false); } + } + } } } From 3f2c9e5ff8dce8cd364e4b0ae2bd7c5dce1cc955 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Fri, 11 Mar 2022 18:00:31 +0100 Subject: [PATCH 058/138] ApiTester: Pass future vector per copy to continueAfterAll to avoid race conditions in error handing path --- bindings/c/test/apitester/TesterCorrectnessWorkload.cpp | 6 ++++-- 
bindings/c/test/apitester/TesterTransactionExecutor.cpp | 7 ++++--- bindings/c/test/apitester/TesterTransactionExecutor.h | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp index 6da940c155..d65fd3d1d3 100644 --- a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp +++ b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp @@ -89,11 +89,12 @@ private: for (const auto& kv : *kvPairs) { futures->push_back(ctx->tx()->get(kv.key, false)); } - ctx->continueAfterAll(futures, [ctx, futures, results]() { + ctx->continueAfterAll(*futures, [ctx, futures, results]() { results->clear(); for (auto& f : *futures) { results->push_back(((ValueFuture&)f).getValue()); } + ASSERT(results->size() == futures->size()); ctx->done(); }); }, @@ -127,11 +128,12 @@ private: for (const auto& key : *keys) { futures->push_back(ctx->tx()->get(key, false)); } - ctx->continueAfterAll(futures, [ctx, futures, results]() { + ctx->continueAfterAll(*futures, [ctx, futures, results]() { results->clear(); for (auto& f : *futures) { results->push_back(((ValueFuture&)f).getValue()); } + ASSERT(results->size() == futures->size()); ctx->done(); }); }, diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.cpp b/bindings/c/test/apitester/TesterTransactionExecutor.cpp index 0fdf913a2b..c50a6fd198 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.cpp +++ b/bindings/c/test/apitester/TesterTransactionExecutor.cpp @@ -27,6 +27,7 @@ #include #include #include +#include namespace FdbApiTester { @@ -35,9 +36,9 @@ void TransactionActorBase::complete(fdb_error_t err) { context = {}; } -void ITransactionContext::continueAfterAll(std::shared_ptr> futures, TTaskFct cont) { - auto counter = std::make_shared>(futures->size()); - for (auto& f : *futures) { +void ITransactionContext::continueAfterAll(std::vector futures, TTaskFct cont) { + auto counter = 
std::make_shared>(futures.size()); + for (auto& f : futures) { continueAfter(f, [counter, cont]() { if (--(*counter) == 0) { cont(); diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.h b/bindings/c/test/apitester/TesterTransactionExecutor.h index 8a279ba198..e78223bf63 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.h +++ b/bindings/c/test/apitester/TesterTransactionExecutor.h @@ -51,7 +51,7 @@ public: virtual void done() = 0; // A continuation to be executed when all of the given futures get ready - virtual void continueAfterAll(std::shared_ptr> futures, TTaskFct cont); + virtual void continueAfterAll(std::vector futures, TTaskFct cont); }; /** From 26e95d43ef85fcb96382dec05019f7dbb89708d2 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Tue, 22 Feb 2022 12:11:25 -0600 Subject: [PATCH 059/138] SS needs to persist its TSS pair ID to make metrics accurate (and avoid buggify issues in simulation) --- fdbserver/storageserver.actor.cpp | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index d4760dd982..9d5f43207e 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -4174,6 +4174,7 @@ static const KeyRangeRef persistFormatReadableRange(LiteralStringRef("Foundation LiteralStringRef("FoundationDB/StorageServer/1/5")); static const KeyRef persistID = LiteralStringRef(PERSIST_PREFIX "ID"); static const KeyRef persistTssPairID = LiteralStringRef(PERSIST_PREFIX "tssPairID"); +static const KeyRef persistSSPairID = LiteralStringRef(PERSIST_PREFIX "ssWithTSSPairID"); static const KeyRef persistTssQuarantine = LiteralStringRef(PERSIST_PREFIX "tssQ"); static const KeyRef persistClusterIdKey = LiteralStringRef(PERSIST_PREFIX "clusterId"); @@ -5178,6 +5179,10 @@ private: if (!data->isTss() && m.type == MutationRef::ClearRange && data->ssPairID.present() && serverTagKey == data->ssPairID.get()) { 
data->clearSSWithTssPair(); + // Add ss pair id change to mutation log to make durable + auto& mLV = data->addVersionToMutationLog(data->data().getLatestVersion()); + data->addMutationToMutationLog( + mLV, MutationRef(MutationRef::ClearRange, persistSSPairID, keyAfter(persistSSPairID))); } } else if (m.type == MutationRef::SetValue && m.param1 == rebootWhenDurablePrivateKey) { data->rebootAfterDurableVersion = currentVersion; @@ -5281,11 +5286,19 @@ private: if (!data->isTss()) { UID ssId = Codec::unpack(Tuple::unpack(m.param1.substr(1).removePrefix(tssMappingKeys.begin))); ASSERT(ssId == data->thisServerID); + // Add ss pair id change to mutation log to make durable + auto& mLV = data->addVersionToMutationLog(data->data().getLatestVersion()); if (m.type == MutationRef::SetValue) { UID tssId = Codec::unpack(Tuple::unpack(m.param2)); data->setSSWithTssPair(tssId); + data->addMutationToMutationLog(mLV, + MutationRef(MutationRef::SetValue, + persistSSPairID, + BinaryWriter::toValue(tssId, Unversioned()))); } else { data->clearSSWithTssPair(); + data->addMutationToMutationLog( + mLV, MutationRef(MutationRef::ClearRange, persistSSPairID, keyAfter(persistSSPairID))); } } } else if (m.param1.substr(1).startsWith(tssQuarantineKeys.begin) && @@ -6276,6 +6289,7 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor state Future> fID = storage->readValue(persistID); state Future> fClusterID = storage->readValue(persistClusterIdKey); state Future> ftssPairID = storage->readValue(persistTssPairID); + state Future> fssPairID = storage->readValue(persistSSPairID); state Future> fTssQuarantine = storage->readValue(persistTssQuarantine); state Future> fVersion = storage->readValue(persistVersion); state Future> fLogProtocol = storage->readValue(persistLogProtocol); @@ -6291,8 +6305,8 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor restoreByteSample(data, storage, byteSampleSampleRecovered, startByteSampleRestore.getFuture()); 
TraceEvent("ReadingDurableState", data->thisServerID).log(); - wait(waitForAll( - std::vector{ fFormat, fID, fClusterID, ftssPairID, fTssQuarantine, fVersion, fLogProtocol, fPrimaryLocality })); + wait(waitForAll(std::vector{ + fFormat, fID, fClusterID, ftssPairID, fssPairID, fTssQuarantine, fVersion, fLogProtocol, fPrimaryLocality })); wait(waitForAll(std::vector{ fShardAssigned, fShardAvailable, fChangeFeeds, fTenantMap })); wait(byteSampleSampleRecovered.getFuture()); TraceEvent("RestoringDurableState", data->thisServerID).log(); @@ -6319,6 +6333,11 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor data->bytesRestored += ftssPairID.get().expectedSize(); } + if (fssPairID.get().present()) { + data->setSSWithTssPair(BinaryReader::fromStringRef(fssPairID.get().get(), Unversioned())); + data->bytesRestored += fssPairID.get().expectedSize(); + } + if (fClusterID.get().present()) { data->clusterId.send(BinaryReader::fromStringRef(fClusterID.get().get(), Unversioned())); data->bytesRestored += fClusterID.get().expectedSize(); From 5d1affa2c2bee05d17d58eeda84a3aba407b6c51 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Tue, 22 Feb 2022 12:44:42 -0600 Subject: [PATCH 060/138] Cleaning up and improving TSS debugging traces for range read mismatches --- fdbclient/StorageServerInterface.cpp | 158 +++++++++++++++------------ 1 file changed, 91 insertions(+), 67 deletions(-) diff --git a/fdbclient/StorageServerInterface.cpp b/fdbclient/StorageServerInterface.cpp index dce4df7736..ed71b1be96 100644 --- a/fdbclient/StorageServerInterface.cpp +++ b/fdbclient/StorageServerInterface.cpp @@ -28,7 +28,7 @@ // if size + hex of checksum is shorter than value, record that instead of actual value. break-even point is 12 // characters -std::string traceChecksumValue(ValueRef s) { +std::string traceChecksumValue(const ValueRef& s) { return s.size() > 12 ? 
format("(%d)%08x", s.size(), crc32c_append(0, s.begin(), s.size())) : s.toString(); } @@ -49,6 +49,7 @@ void TSS_traceMismatch(TraceEvent& event, const GetValueReply& src, const GetValueReply& tss) { event.detail("Key", req.key.printable()) + .detail("Tenant", req.tenantInfo.name) .detail("Version", req.version) .detail("SSReply", src.value.present() ? traceChecksumValue(src.value.get()) : "missing") .detail("TSSReply", tss.value.present() ? traceChecksumValue(tss.value.get()) : "missing"); @@ -125,33 +126,77 @@ const char* TSS_mismatchTraceName(const GetKeyValuesRequest& req) { return "TSSMismatchGetKeyValues"; } +static void traceKeyValuesSummary(TraceEvent& event, + const KeySelectorRef& begin, + const KeySelectorRef& end, + Optional tenant, + Version version, + int limit, + int limitBytes, + size_t ssSize, + bool ssMore, + size_t tssSize, + bool tssMore) { + std::string ssSummaryString = format("(%d)%s", ssSize, ssMore ? "+" : ""); + std::string tssSummaryString = format("(%d)%s", tssSize, tssMore ? "+" : ""); + event.detail("Begin", format("%s%s:%d", begin.orEqual ? "=" : "", begin.getKey().printable().c_str(), begin.offset)) + .detail("End", format("%s%s:%d", end.orEqual ? 
"=" : "", end.getKey().printable().c_str(), end.offset)) + .detail("Tenant", tenant) + .detail("Version", version) + .detail("Limit", limit) + .detail("LimitBytes", limitBytes) + .detail("SSReplySummary", ssSummaryString) + .detail("TSSReplySummary", tssSummaryString); +} + +static void traceKeyValuesDiff(TraceEvent& event, + const KeySelectorRef& begin, + const KeySelectorRef& end, + Optional tenant, + Version version, + int limit, + int limitBytes, + const VectorRef& ssKV, + bool ssMore, + const VectorRef& tssKV, + bool tssMore) { + traceKeyValuesSummary( + event, begin, end, tenant, version, limit, limitBytes, ssKV.size(), ssMore, tssKV.size(), tssMore); + bool mismatchFound = false; + for (int i = 0; i < std::max(ssKV.size(), tssKV.size()); i++) { + if (i >= ssKV.size() || i >= tssKV.size() || ssKV[i] != tssKV[i]) { + event.detail("MismatchIndex", i); + if (i >= ssKV.size() || i >= tssKV.size() || ssKV[i].key != tssKV[i].key) { + event.detail("MismatchSSKey", i < ssKV.size() ? ssKV[i].key.printable() : "missing"); + event.detail("MismatchTSSKey", i < tssKV.size() ? tssKV[i].key.printable() : "missing"); + } else { + event.detail("MismatchKey", ssKV[i].key.printable()); + event.detail("MismatchSSValue", traceChecksumValue(ssKV[i].value)); + event.detail("MismatchTSSValue", traceChecksumValue(tssKV[i].value)); + } + mismatchFound = true; + break; + } + } + ASSERT(mismatchFound); +} + template <> void TSS_traceMismatch(TraceEvent& event, const GetKeyValuesRequest& req, const GetKeyValuesReply& src, const GetKeyValuesReply& tss) { - std::string ssResultsString = format("(%d)%s:\n", src.data.size(), src.more ? "+" : ""); - for (auto& it : src.data) { - ssResultsString += "\n" + it.key.printable() + "=" + traceChecksumValue(it.value); - } - - std::string tssResultsString = format("(%d)%s:\n", tss.data.size(), tss.more ? 
"+" : ""); - for (auto& it : tss.data) { - tssResultsString += "\n" + it.key.printable() + "=" + traceChecksumValue(it.value); - } - event - .detail( - "Begin", - format("%s%s:%d", req.begin.orEqual ? "=" : "", req.begin.getKey().printable().c_str(), req.begin.offset)) - .detail("End", - format("%s%s:%d", req.end.orEqual ? "=" : "", req.end.getKey().printable().c_str(), req.end.offset)) - .detail("Tenant", req.tenantInfo.name) - .detail("Version", req.version) - .detail("Limit", req.limit) - .detail("LimitBytes", req.limitBytes) - .setMaxFieldLength(FLOW_KNOBS->TSS_LARGE_TRACE_SIZE * 4 / 10) - .detail("SSReply", ssResultsString) - .detail("TSSReply", tssResultsString); + traceKeyValuesDiff(event, + req.begin, + req.end, + req.tenantInfo.name, + req.version, + req.limit, + req.limitBytes, + src.data, + src.more, + tss.data, + tss.more); } // range reads and flat map @@ -170,28 +215,18 @@ void TSS_traceMismatch(TraceEvent& event, const GetMappedKeyValuesRequest& req, const GetMappedKeyValuesReply& src, const GetMappedKeyValuesReply& tss) { - std::string ssResultsString = format("(%d)%s:\n", src.data.size(), src.more ? "+" : ""); - for (auto& it : src.data) { - ssResultsString += "\n" + it.key.printable() + "=" + traceChecksumValue(it.value); - } - - std::string tssResultsString = format("(%d)%s:\n", tss.data.size(), tss.more ? "+" : ""); - for (auto& it : tss.data) { - tssResultsString += "\n" + it.key.printable() + "=" + traceChecksumValue(it.value); - } - event - .detail( - "Begin", - format("%s%s:%d", req.begin.orEqual ? "=" : "", req.begin.getKey().printable().c_str(), req.begin.offset)) - .detail("End", - format("%s%s:%d", req.end.orEqual ? 
"=" : "", req.end.getKey().printable().c_str(), req.end.offset)) - .detail("Tenant", req.tenantInfo.name) - .detail("Version", req.version) - .detail("Limit", req.limit) - .detail("LimitBytes", req.limitBytes) - .setMaxFieldLength(FLOW_KNOBS->TSS_LARGE_TRACE_SIZE * 4 / 10) - .detail("SSReply", ssResultsString) - .detail("TSSReply", tssResultsString); + traceKeyValuesSummary(event, + req.begin, + req.end, + req.tenantInfo.name, + req.version, + req.limit, + req.limitBytes, + src.data.size(), + src.more, + tss.data.size(), + tss.more); + // FIXME: trace details for TSS mismatch of mapped data } // streaming range reads @@ -211,28 +246,17 @@ void TSS_traceMismatch(TraceEvent& event, const GetKeyValuesStreamRequest& req, const GetKeyValuesStreamReply& src, const GetKeyValuesStreamReply& tss) { - std::string ssResultsString = format("(%d)%s:\n", src.data.size(), src.more ? "+" : ""); - for (auto& it : src.data) { - ssResultsString += "\n" + it.key.printable() + "=" + traceChecksumValue(it.value); - } - - std::string tssResultsString = format("(%d)%s:\n", tss.data.size(), tss.more ? "+" : ""); - for (auto& it : tss.data) { - tssResultsString += "\n" + it.key.printable() + "=" + traceChecksumValue(it.value); - } - event - .detail( - "Begin", - format("%s%s:%d", req.begin.orEqual ? "=" : "", req.begin.getKey().printable().c_str(), req.begin.offset)) - .detail("End", - format("%s%s:%d", req.end.orEqual ? 
"=" : "", req.end.getKey().printable().c_str(), req.end.offset)) - .detail("Tenant", req.tenantInfo.name) - .detail("Version", req.version) - .detail("Limit", req.limit) - .detail("LimitBytes", req.limitBytes) - .setMaxFieldLength(FLOW_KNOBS->TSS_LARGE_TRACE_SIZE * 4 / 10) - .detail("SSReply", ssResultsString) - .detail("TSSReply", tssResultsString); + traceKeyValuesDiff(event, + req.begin, + req.end, + req.tenantInfo.name, + req.version, + req.limit, + req.limitBytes, + src.data, + src.more, + tss.data, + tss.more); } template <> From 32f2731773010e5c8a6f831af85199c0b8b76dd1 Mon Sep 17 00:00:00 2001 From: Ben Collins Date: Fri, 11 Mar 2022 14:02:35 -0500 Subject: [PATCH 061/138] Fix static initialization order for maxAllowedVersion (#6570) * Fix static initialization order for maxAllowedVersion As it was defined, maxAllowedVerion was statically and globally initialized with a dependency on CLIENT_KNOBS. Initialization order was not spelled out and therefore this relied on chance specifics of order. In some cases these two were constructed in the wrong order and led to a segfault on startup. Fix by deferring initialization of maxAllowedVerion. 
* Fix formatting with clang-format --- fdbclient/SpecialKeySpace.actor.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index 78f9c93e50..2516901a09 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -41,10 +41,6 @@ namespace { const std::string kTracingTransactionIdKey = "transaction_id"; const std::string kTracingTokenKey = "token"; -// Max version we can set for minRequiredCommitVersionKey, -// making sure the cluster can still be alive for 1000 years after the recovery -const Version maxAllowedVerion = - std::numeric_limits::max() - 1 - CLIENT_KNOBS->VERSIONS_PER_SECOND * 3600 * 24 * 365 * 1000; static bool isAlphaNumeric(const std::string& key) { // [A-Za-z0-9_]+ @@ -1865,6 +1861,11 @@ Future AdvanceVersionImpl::getRange(ReadYourWritesTransaction* ryw, } ACTOR static Future> advanceVersionCommitActor(ReadYourWritesTransaction* ryw, Version v) { + // Max version we can set for minRequiredCommitVersionKey, + // making sure the cluster can still be alive for 1000 years after the recovery + static const Version maxAllowedVerion = + std::numeric_limits::max() - 1 - CLIENT_KNOBS->VERSIONS_PER_SECOND * 3600 * 24 * 365 * 1000; + ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE); ryw->getTransaction().setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); TraceEvent(SevDebug, "AdvanceVersion").detail("MaxAllowedVersion", maxAllowedVerion); From 8ba3c107ffad11fb5fd751c9cd2b47305dd66ad3 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Sat, 12 Mar 2022 14:12:37 +0100 Subject: [PATCH 062/138] ApiTester: cancel transaction instead of deleting to avoid crashes when it is accessed by pending callbacks --- bindings/c/test/apitester/TesterApiWrapper.cpp | 17 +++++++++++++++++ bindings/c/test/apitester/TesterApiWrapper.h | 1 + .../apitester/TesterTransactionExecutor.cpp | 4 +++- 
bindings/c/test/apitester/TesterWorkload.cpp | 4 ++-- 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/bindings/c/test/apitester/TesterApiWrapper.cpp b/bindings/c/test/apitester/TesterApiWrapper.cpp index 869b02d89c..4fc3b79c9a 100644 --- a/bindings/c/test/apitester/TesterApiWrapper.cpp +++ b/bindings/c/test/apitester/TesterApiWrapper.cpp @@ -18,6 +18,7 @@ * limitations under the License. */ #include "TesterApiWrapper.h" +#include "TesterUtil.h" #include #include @@ -41,14 +42,17 @@ void Future::reset() { } void Future::cancel() { + ASSERT(future_); fdb_future_cancel(future_.get()); } fdb_error_t Future::getError() const { + ASSERT(future_); return fdb_future_get_error(future_.get()); } std::optional ValueFuture::getValue() const { + ASSERT(future_); int out_present; const std::uint8_t* val; int vallen; @@ -60,35 +64,48 @@ std::optional ValueFuture::getValue() const { Transaction::Transaction(FDBTransaction* tx) : tx_(tx, fdb_transaction_destroy) {} ValueFuture Transaction::get(std::string_view key, fdb_bool_t snapshot) { + ASSERT(tx_); return ValueFuture(fdb_transaction_get(tx_.get(), (const uint8_t*)key.data(), key.size(), snapshot)); } void Transaction::set(std::string_view key, std::string_view value) { + ASSERT(tx_); fdb_transaction_set(tx_.get(), (const uint8_t*)key.data(), key.size(), (const uint8_t*)value.data(), value.size()); } void Transaction::clear(std::string_view key) { + ASSERT(tx_); fdb_transaction_clear(tx_.get(), (const uint8_t*)key.data(), key.size()); } void Transaction::clearRange(std::string_view begin, std::string_view end) { + ASSERT(tx_); fdb_transaction_clear_range( tx_.get(), (const uint8_t*)begin.data(), begin.size(), (const uint8_t*)end.data(), end.size()); } Future Transaction::commit() { + ASSERT(tx_); return Future(fdb_transaction_commit(tx_.get())); } +void Transaction::cancel() { + ASSERT(tx_); + fdb_transaction_cancel(tx_.get()); +} + Future Transaction::onError(fdb_error_t err) { + ASSERT(tx_); return 
Future(fdb_transaction_on_error(tx_.get(), err)); } void Transaction::reset() { + ASSERT(tx_); fdb_transaction_reset(tx_.get()); } fdb_error_t Transaction::setOption(FDBTransactionOption option) { + ASSERT(tx_); return fdb_transaction_set_option(tx_.get(), option, reinterpret_cast(""), 0); } diff --git a/bindings/c/test/apitester/TesterApiWrapper.h b/bindings/c/test/apitester/TesterApiWrapper.h index 61275f1109..567d9348c1 100644 --- a/bindings/c/test/apitester/TesterApiWrapper.h +++ b/bindings/c/test/apitester/TesterApiWrapper.h @@ -71,6 +71,7 @@ public: void clear(std::string_view key); void clearRange(std::string_view begin, std::string_view end); Future commit(); + void cancel(); Future onError(fdb_error_t err); void reset(); fdb_error_t setOption(FDBTransactionOption option); diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.cpp b/bindings/c/test/apitester/TesterTransactionExecutor.cpp index c50a6fd198..563350acb9 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.cpp +++ b/bindings/c/test/apitester/TesterTransactionExecutor.cpp @@ -87,6 +87,9 @@ public: } txState = TxState::DONE; lock.unlock(); + // cancel transaction so that any pending operations on it + // fail gracefully + fdbTx.cancel(); txActor->complete(error_code_success); cleanUp(); contAfterDone(); @@ -102,7 +105,6 @@ protected: ASSERT(txState == TxState::DONE); ASSERT(!onErrorFuture); txActor = {}; - fdbTx = {}; } // Complete the transaction with an (unretriable) error diff --git a/bindings/c/test/apitester/TesterWorkload.cpp b/bindings/c/test/apitester/TesterWorkload.cpp index ce269ca824..19b28731e9 100644 --- a/bindings/c/test/apitester/TesterWorkload.cpp +++ b/bindings/c/test/apitester/TesterWorkload.cpp @@ -145,9 +145,9 @@ void WorkloadManager::run() { } scheduler->join(); if (failed()) { - fmt::print(stderr, "{} workloads failed", numWorkloadsFailed); + fmt::print(stderr, "{} workloads failed\n", numWorkloadsFailed); } else { - fprintf(stderr, "All workloads 
succesfully completed"); + fprintf(stderr, "All workloads succesfully completed\n"); } } From 7cb5c8957c5ac82ae4d1a4902fee43b5cd7c309e Mon Sep 17 00:00:00 2001 From: "Bharadwaj V.R" Date: Tue, 1 Mar 2022 10:56:24 -0800 Subject: [PATCH 063/138] Change dd doc to match recent encapsulation changes to TC data structures --- design/data-distributor-internals.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/design/data-distributor-internals.md b/design/data-distributor-internals.md index ce432bfe67..4e67534dfa 100644 --- a/design/data-distributor-internals.md +++ b/design/data-distributor-internals.md @@ -6,13 +6,13 @@ Data distribution manages the lifetime of storage servers, decides which storage ## Components -**Storage server (`struct TCServerInfo`):** DD creates a TCServerInfo object for each storage server (SS). The TCServerInfo includes: (1) the SS’ locality, which includes the processID that is unique to ip:port, the zoneId that specifies which rack the SS is on, and the dcId that specifies which DC the SS is in; (2) the server’s teams, which will be discussed in the following paragraph; (3) the tracker that monitor the status of the server; and (4) extra information related to the server’s interface and preference. A server is healthy if its storage engine on the process is the same with the configured storage engine, and it is marked as desired by DD. +**Storage server (`class TCServerInfo`):** DD creates a TCServerInfo object for each storage server (SS). The TCServerInfo includes: (1) the SS’ locality, which includes the processID that is unique to ip:port, the zoneId that specifies which rack the SS is on, and the dcId that specifies which DC the SS is in; (2) the server’s teams, which will be discussed in the following paragraph; (3) the tracker that monitor the status of the server; and (4) extra information related to the server’s interface and preference. 
A server is healthy if its storage engine on the process is the same with the configured storage engine, and it is marked as desired by DD. -**Machine (`struct TCMachineInfo`)**: A machine in FDB is considered as a rack, because a typical FDB cluster will only use one physical host from each rack in the datacenter to reduce the impact of regular rack-maintenance events on the cluster. All servers on the same rack belong to the same machine. A machine is healthy if there exists a healthy server on the machine. +**Machine (`class TCMachineInfo`)**: A machine in FDB is considered as a rack, because a typical FDB cluster will only use one physical host from each rack in the datacenter to reduce the impact of regular rack-maintenance events on the cluster. All servers on the same rack belong to the same machine. A machine is healthy if there exists a healthy server on the machine. -**Server team (`struct TCTeamInfo`)**: A server team is a group of *k* servers that host the same key ranges, where *k* is the replication factor that is usually three. A server team is healthy if every server in the team is healthy and those servers’ localities satisfy the replication requirement. Servers are grouped into server teams to reduce the possibility of data unavailability events at the event of *k* server failures. +**Server team (`class TCTeamInfo`)**: A server team is a group of *k* servers that host the same key ranges, where *k* is the replication factor that is usually three. A server team is healthy if every server in the team is healthy and those servers’ localities satisfy the replication requirement. Servers are grouped into server teams to reduce the possibility of data unavailability events at the event of *k* server failures. -**Machine team (`struct TCMachineTeamInfo`)**: A machine team is a group of k machines, where k is the replication factor. 
Each server team must be on a machine team, meaning that each server in the server team is on a machine in the machine team and that no two servers are on the same machine. Similar to the purpose of server teams, machine teams are used to reduce the possibility of data unavailability events at the event of *k* machine failures. A machine team is healthy if every machine on the team is healthy and machines’ localities satisfy the replication policy. +**Machine team (`class TCMachineTeamInfo`)**: A machine team is a group of k machines, where k is the replication factor. Each server team must be on a machine team, meaning that each server in the server team is on a machine in the machine team and that no two servers are on the same machine. Similar to the purpose of server teams, machine teams are used to reduce the possibility of data unavailability events at the event of *k* machine failures. A machine team is healthy if every machine on the team is healthy and machines’ localities satisfy the replication policy. **`TeamCollection`**: It has a global view of all servers and server teams, machines and machine teams. With the information, it creates server teams and machine teams. It also maintains the configuration settings for DD, which is used to create teams and decide which type of storage servers to recruit. @@ -30,7 +30,7 @@ Data distribution manages the lifetime of storage servers, decides which storage *`moveKeysLockOwnerKey`* (`\xff/moveKeysLock/Owner`) and *moveKeysLockWriteKey* (`\xff/moveKeysLock/Write`): When DD moves keys, it must grab the moveKeysLock, which consists of an owner key and a write key. The owner key (i.e., `moveKeysLockOwnerKey`) specifies which DD currently owns the lock. The write key (i.e., `moveKeysLockWriteKey`) specifies which DD is currently changing the mapping between keys and servers (i.e., operating on serverKeys and keyServers subspace). 
If DD finds it does not own both keys when it tries to move keys, it will kill itself by throwing an error. The cluster controller will recruit a new one. -When a new DD is initialized, it will set itself as the owner by setting its random UID to the `moveKeysLockOwnerKey`. Since the owner key has only one value, at most one DD can own the DD-related system subspace. This avoids the potential race condition between multiple DDs which may co-exit during DD recruitment. +When a new DD is initialized, it will set itself as the owner by setting its random UID to the `moveKeysLockOwnerKey`. Since the owner key has only one value, at most one DD can own the DD-related system subspace. This avoids the potential race condition between multiple DDs which may co-exist during DD recruitment. **Transaction State Store (txnStateStore)**: It is a replica of the special keyspace that stores the cluster’s states, such as which SS is responsible for which shard. Because commit proxies use txnStateStore to decide which tLog and SS should receive a mutation, commit proxies must have a consistent view of txnStateStore. Therefore, changes to txnStateStore must be populated to all commit proxies in total order. To achieve that, we use the special transaction (`applyMetaMutations`) to update txnStateStore and use resolvers to ensure the total ordering (serializable snapshot isolation). 
From c5a17f48c0b558deabf32d3783511e4c3d9826bc Mon Sep 17 00:00:00 2001 From: "Bharadwaj V.R" Date: Wed, 2 Mar 2022 12:56:19 -0800 Subject: [PATCH 064/138] Make some TCServerInfo methods that are only used in DDTCUnitTest class private --- fdbserver/TCInfo.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fdbserver/TCInfo.h b/fdbserver/TCInfo.h index 0170de1870..1e3a4cf943 100644 --- a/fdbserver/TCInfo.h +++ b/fdbserver/TCInfo.h @@ -28,6 +28,7 @@ class TCMachineTeamInfo; class TCServerInfo : public ReferenceCounted { friend class TCServerInfoImpl; + friend class DDTeamCollectionUnitTest; UID id; bool inDesiredDC; DDTeamCollection* collection; @@ -47,6 +48,9 @@ class TCServerInfo : public ReferenceCounted { std::vector> teams; ErrorOr serverMetrics; + void setServerMetrics(GetStorageMetricsReply serverMetrics) { this->serverMetrics = serverMetrics; } + void markTeamUnhealthy(int teamIndex); + public: Reference machine; Promise> interfaceChanged; @@ -99,12 +103,6 @@ public: static Future updateServerMetrics(Reference server); Future serverMetricsPolling(); - // FIXME: Public for testing only: - void setServerMetrics(GetStorageMetricsReply serverMetrics) { this->serverMetrics = serverMetrics; } - - // FIXME: Public for testing only: - void markTeamUnhealthy(int teamIndex); - ~TCServerInfo(); }; From 2d2f7ed2c68bc44410fa501518b5d095a4ab10b1 Mon Sep 17 00:00:00 2001 From: "Bharadwaj V.R" Date: Wed, 2 Mar 2022 13:42:56 -0800 Subject: [PATCH 065/138] Refactor team space methods and create new TCServerInfo method to expose space metrics --- fdbserver/TCInfo.actor.cpp | 39 ++++++++++++++++++-------------------- fdbserver/TCInfo.h | 1 + 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/fdbserver/TCInfo.actor.cpp b/fdbserver/TCInfo.actor.cpp index 5cf7f6b2dd..eaa0e2253b 100644 --- a/fdbserver/TCInfo.actor.cpp +++ b/fdbserver/TCInfo.actor.cpp @@ -198,6 +198,19 @@ void TCServerInfo::removeTeamsContainingServer(UID removedServer) { } } 
+std::pair TCServerInfo::spaceInBytes(bool includeInFlight) const { + auto& metrics = getServerMetrics(); + ASSERT(metrics.capacity.bytes >= 0); + ASSERT(metrics.available.bytes >= 0); + + int64_t bytesAvailable = metrics.available.bytes; + if (includeInFlight) { + bytesAvailable -= getDataInFlightToServer(); + } + + return std::make_pair(bytesAvailable, metrics.capacity.bytes); // bytesAvailable could be negative +} + void TCServerInfo::removeTeam(Reference team) { for (int t = 0; t < teams.size(); t++) { if (teams[t] == team) { @@ -357,16 +370,7 @@ int64_t TCTeamInfo::getMinAvailableSpace(bool includeInFlight) const { int64_t minAvailableSpace = std::numeric_limits::max(); for (const auto& server : servers) { if (server->serverMetricsPresent()) { - auto& replyValue = server->getServerMetrics(); - - ASSERT(replyValue.available.bytes >= 0); - ASSERT(replyValue.capacity.bytes >= 0); - - int64_t bytesAvailable = replyValue.available.bytes; - if (includeInFlight) { - bytesAvailable -= server->getDataInFlightToServer(); - } - + const auto [bytesAvailable, bytesCapacity] = server->spaceInBytes(includeInFlight); minAvailableSpace = std::min(bytesAvailable, minAvailableSpace); } } @@ -378,20 +382,13 @@ double TCTeamInfo::getMinAvailableSpaceRatio(bool includeInFlight) const { double minRatio = 1.0; for (const auto& server : servers) { if (server->serverMetricsPresent()) { - auto const& replyValue = server->getServerMetrics(); + auto [bytesAvailable, bytesCapacity] = server->spaceInBytes(includeInFlight); + bytesAvailable = std::max((int64_t)0, bytesAvailable); - ASSERT(replyValue.available.bytes >= 0); - ASSERT(replyValue.capacity.bytes >= 0); - - int64_t bytesAvailable = replyValue.available.bytes; - if (includeInFlight) { - bytesAvailable = std::max((int64_t)0, bytesAvailable - server->getDataInFlightToServer()); - } - - if (replyValue.capacity.bytes == 0) + if (bytesCapacity == 0) minRatio = 0; else - minRatio = std::min(minRatio, ((double)bytesAvailable) / 
replyValue.capacity.bytes); + minRatio = std::min(minRatio, ((double)bytesAvailable) / bytesCapacity); } } diff --git a/fdbserver/TCInfo.h b/fdbserver/TCInfo.h index 1e3a4cf943..21b6046d45 100644 --- a/fdbserver/TCInfo.h +++ b/fdbserver/TCInfo.h @@ -97,6 +97,7 @@ public: return (storeType == configStoreType || storeType == KeyValueStoreType::END); } + std::pair spaceInBytes(bool includeInFlight = true) const; bool hasHealthyAvailableSpace(double minAvailableSpaceRatio) const; Future updateServerMetrics(); From 7bad6c509347c2aacf92ebedc04eec74ecee8f7d Mon Sep 17 00:00:00 2001 From: "Bharadwaj V.R" Date: Wed, 2 Mar 2022 14:29:43 -0800 Subject: [PATCH 066/138] Make server API for load metrics --- fdbserver/TCInfo.actor.cpp | 17 +++++++++++------ fdbserver/TCInfo.h | 3 ++- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/fdbserver/TCInfo.actor.cpp b/fdbserver/TCInfo.actor.cpp index eaa0e2253b..cf9d3e7004 100644 --- a/fdbserver/TCInfo.actor.cpp +++ b/fdbserver/TCInfo.actor.cpp @@ -198,7 +198,7 @@ void TCServerInfo::removeTeamsContainingServer(UID removedServer) { } } -std::pair TCServerInfo::spaceInBytes(bool includeInFlight) const { +std::pair TCServerInfo::spaceBytes(bool includeInFlight) const { auto& metrics = getServerMetrics(); ASSERT(metrics.capacity.bytes >= 0); ASSERT(metrics.available.bytes >= 0); @@ -211,6 +211,10 @@ std::pair TCServerInfo::spaceInBytes(bool includeInFlight) con return std::make_pair(bytesAvailable, metrics.capacity.bytes); // bytesAvailable could be negative } +int64_t TCServerInfo::loadBytes() const { + return getServerMetrics().load.bytes; +} + void TCServerInfo::removeTeam(Reference team) { for (int t = 0; t < teams.size(); t++) { if (teams[t] == team) { @@ -370,7 +374,7 @@ int64_t TCTeamInfo::getMinAvailableSpace(bool includeInFlight) const { int64_t minAvailableSpace = std::numeric_limits::max(); for (const auto& server : servers) { if (server->serverMetricsPresent()) { - const auto [bytesAvailable, bytesCapacity] = 
server->spaceInBytes(includeInFlight); + const auto [bytesAvailable, bytesCapacity] = server->spaceBytes(includeInFlight); minAvailableSpace = std::min(bytesAvailable, minAvailableSpace); } } @@ -382,7 +386,7 @@ double TCTeamInfo::getMinAvailableSpaceRatio(bool includeInFlight) const { double minRatio = 1.0; for (const auto& server : servers) { if (server->serverMetricsPresent()) { - auto [bytesAvailable, bytesCapacity] = server->spaceInBytes(includeInFlight); + auto [bytesAvailable, bytesCapacity] = server->spaceBytes(includeInFlight); bytesAvailable = std::max((int64_t)0, bytesAvailable); if (bytesCapacity == 0) @@ -437,11 +441,12 @@ void TCTeamInfo::addServers(const std::vector& servers) { int64_t TCTeamInfo::getLoadAverage() const { int64_t bytesSum = 0; int added = 0; - for (int i = 0; i < servers.size(); i++) - if (servers[i]->serverMetricsPresent()) { + for (const auto& server : servers) { + if (server->serverMetricsPresent()) { added++; - bytesSum += servers[i]->getServerMetrics().load.bytes; + bytesSum += server->loadBytes(); } + } if (added < servers.size()) bytesSum *= 2; diff --git a/fdbserver/TCInfo.h b/fdbserver/TCInfo.h index 21b6046d45..e5f9a86929 100644 --- a/fdbserver/TCInfo.h +++ b/fdbserver/TCInfo.h @@ -97,7 +97,8 @@ public: return (storeType == configStoreType || storeType == KeyValueStoreType::END); } - std::pair spaceInBytes(bool includeInFlight = true) const; + std::pair spaceBytes(bool includeInFlight = true) const; + int64_t loadBytes() const; bool hasHealthyAvailableSpace(double minAvailableSpaceRatio) const; Future updateServerMetrics(); From 8be519a5d88c18ca81aefe711e4e7d00c71d3691 Mon Sep 17 00:00:00 2001 From: "Bharadwaj V.R" Date: Wed, 2 Mar 2022 15:14:29 -0800 Subject: [PATCH 067/138] Make getServerMetrics private to TCServerInfo class and rename some methods to remove redundant reference to 'server' --- fdbserver/DDTeamCollection.actor.cpp | 50 ++++++++++++++-------------- fdbserver/TCInfo.actor.cpp | 30 ++++++++--------- 
fdbserver/TCInfo.h | 9 ++--- 3 files changed, 45 insertions(+), 44 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 75747c9c65..58507ffdf6 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -5316,10 +5316,10 @@ public: collection->disableBuildingTeams(); collection->setCheckTeamDelay(); - collection->server_info[UID(1, 0)]->setServerMetrics(mid_avail); - collection->server_info[UID(2, 0)]->setServerMetrics(high_avail); - collection->server_info[UID(3, 0)]->setServerMetrics(high_avail); - collection->server_info[UID(4, 0)]->setServerMetrics(high_avail); + collection->server_info[UID(1, 0)]->setMetrics(mid_avail); + collection->server_info[UID(2, 0)]->setMetrics(high_avail); + collection->server_info[UID(3, 0)]->setMetrics(high_avail); + collection->server_info[UID(4, 0)]->setMetrics(high_avail); /* * Suppose 1, 2 and 3 are complete sources, i.e., they have all shards in @@ -5372,10 +5372,10 @@ public: collection->disableBuildingTeams(); collection->setCheckTeamDelay(); - collection->server_info[UID(1, 0)]->setServerMetrics(mid_avail); - collection->server_info[UID(2, 0)]->setServerMetrics(high_avail); - collection->server_info[UID(3, 0)]->setServerMetrics(high_avail); - collection->server_info[UID(4, 0)]->setServerMetrics(high_avail); + collection->server_info[UID(1, 0)]->setMetrics(mid_avail); + collection->server_info[UID(2, 0)]->setMetrics(high_avail); + collection->server_info[UID(3, 0)]->setMetrics(high_avail); + collection->server_info[UID(4, 0)]->setMetrics(high_avail); collection->server_info[UID(1, 0)]->markTeamUnhealthy(0); /* @@ -5435,10 +5435,10 @@ public: * least utilized, if the caller says they preferLowerUtilization. 
*/ - collection->server_info[UID(1, 0)]->setServerMetrics(mid_avail); - collection->server_info[UID(2, 0)]->setServerMetrics(high_avail); - collection->server_info[UID(3, 0)]->setServerMetrics(high_avail); - collection->server_info[UID(4, 0)]->setServerMetrics(high_avail); + collection->server_info[UID(1, 0)]->setMetrics(mid_avail); + collection->server_info[UID(2, 0)]->setMetrics(high_avail); + collection->server_info[UID(3, 0)]->setMetrics(high_avail); + collection->server_info[UID(4, 0)]->setMetrics(high_avail); bool wantsNewServers = true; bool wantsTrueBest = true; @@ -5485,10 +5485,10 @@ public: collection->disableBuildingTeams(); collection->setCheckTeamDelay(); - collection->server_info[UID(1, 0)]->setServerMetrics(mid_avail); - collection->server_info[UID(2, 0)]->setServerMetrics(high_avail); - collection->server_info[UID(3, 0)]->setServerMetrics(high_avail); - collection->server_info[UID(4, 0)]->setServerMetrics(high_avail); + collection->server_info[UID(1, 0)]->setMetrics(mid_avail); + collection->server_info[UID(2, 0)]->setMetrics(high_avail); + collection->server_info[UID(3, 0)]->setMetrics(high_avail); + collection->server_info[UID(4, 0)]->setMetrics(high_avail); /* * Among server teams that have healthy space available, pick the team that is @@ -5539,10 +5539,10 @@ public: collection->disableBuildingTeams(); collection->setCheckTeamDelay(); - collection->server_info[UID(1, 0)]->setServerMetrics(high_avail); - collection->server_info[UID(2, 0)]->setServerMetrics(low_avail); - collection->server_info[UID(3, 0)]->setServerMetrics(high_avail); - collection->server_info[UID(4, 0)]->setServerMetrics(low_avail); + collection->server_info[UID(1, 0)]->setMetrics(high_avail); + collection->server_info[UID(2, 0)]->setMetrics(low_avail); + collection->server_info[UID(3, 0)]->setMetrics(high_avail); + collection->server_info[UID(4, 0)]->setMetrics(low_avail); collection->server_info[UID(1, 0)]->markTeamUnhealthy(0); /* @@ -5599,11 +5599,11 @@ public: 
collection->disableBuildingTeams(); collection->setCheckTeamDelay(); - collection->server_info[UID(1, 0)]->setServerMetrics(high_avail); - collection->server_info[UID(2, 0)]->setServerMetrics(low_avail); - collection->server_info[UID(3, 0)]->setServerMetrics(high_avail); - collection->server_info[UID(4, 0)]->setServerMetrics(low_avail); - collection->server_info[UID(5, 0)]->setServerMetrics(high_avail); + collection->server_info[UID(1, 0)]->setMetrics(high_avail); + collection->server_info[UID(2, 0)]->setMetrics(low_avail); + collection->server_info[UID(3, 0)]->setMetrics(high_avail); + collection->server_info[UID(4, 0)]->setMetrics(low_avail); + collection->server_info[UID(5, 0)]->setMetrics(high_avail); collection->server_info[UID(1, 0)]->markTeamUnhealthy(0); /* diff --git a/fdbserver/TCInfo.actor.cpp b/fdbserver/TCInfo.actor.cpp index cf9d3e7004..439a71255e 100644 --- a/fdbserver/TCInfo.actor.cpp +++ b/fdbserver/TCInfo.actor.cpp @@ -37,7 +37,7 @@ public: choose { when(ErrorOr rep = wait(metricsRequest)) { if (rep.present()) { - server->serverMetrics = rep; + server->metrics = rep; if (server->updated.canBeSet()) { server->updated.send(Void()); } @@ -65,27 +65,27 @@ public: } } - if (server->serverMetrics.get().lastUpdate < now() - SERVER_KNOBS->DD_SS_STUCK_TIME_LIMIT) { + if (server->metrics.get().lastUpdate < now() - SERVER_KNOBS->DD_SS_STUCK_TIME_LIMIT) { if (server->ssVersionTooFarBehind.get() == false) { TraceEvent("StorageServerStuck", server->collection->getDistributorId()) .detail("ServerId", server->id.toString()) - .detail("LastUpdate", server->serverMetrics.get().lastUpdate); + .detail("LastUpdate", server->metrics.get().lastUpdate); server->ssVersionTooFarBehind.set(true); server->collection->addLaggingStorageServer(server->lastKnownInterface.locality.zoneId().get()); } - } else if (server->serverMetrics.get().versionLag > SERVER_KNOBS->DD_SS_FAILURE_VERSIONLAG) { + } else if (server->metrics.get().versionLag > SERVER_KNOBS->DD_SS_FAILURE_VERSIONLAG) 
{ if (server->ssVersionTooFarBehind.get() == false) { TraceEvent(SevWarn, "SSVersionDiffLarge", server->collection->getDistributorId()) .detail("ServerId", server->id.toString()) - .detail("VersionLag", server->serverMetrics.get().versionLag); + .detail("VersionLag", server->metrics.get().versionLag); server->ssVersionTooFarBehind.set(true); server->collection->addLaggingStorageServer(server->lastKnownInterface.locality.zoneId().get()); } - } else if (server->serverMetrics.get().versionLag < SERVER_KNOBS->DD_SS_ALLOWED_VERSIONLAG) { + } else if (server->metrics.get().versionLag < SERVER_KNOBS->DD_SS_ALLOWED_VERSIONLAG) { if (server->ssVersionTooFarBehind.get() == true) { TraceEvent("SSVersionDiffNormal", server->collection->getDistributorId()) .detail("ServerId", server->id.toString()) - .detail("VersionLag", server->serverMetrics.get().versionLag); + .detail("VersionLag", server->metrics.get().versionLag); server->ssVersionTooFarBehind.set(false); server->collection->removeLaggingStorageServer(server->lastKnownInterface.locality.zoneId().get()); } @@ -138,9 +138,9 @@ TCServerInfo::TCServerInfo(StorageServerInterface ssi, } bool TCServerInfo::hasHealthyAvailableSpace(double minAvailableSpaceRatio) const { - ASSERT(serverMetricsPresent()); + ASSERT(metricsPresent()); - auto& metrics = getServerMetrics(); + auto& metrics = getMetrics(); ASSERT(metrics.available.bytes >= 0); ASSERT(metrics.capacity.bytes >= 0); @@ -199,7 +199,7 @@ void TCServerInfo::removeTeamsContainingServer(UID removedServer) { } std::pair TCServerInfo::spaceBytes(bool includeInFlight) const { - auto& metrics = getServerMetrics(); + auto& metrics = getMetrics(); ASSERT(metrics.capacity.bytes >= 0); ASSERT(metrics.available.bytes >= 0); @@ -212,7 +212,7 @@ std::pair TCServerInfo::spaceBytes(bool includeInFlight) const } int64_t TCServerInfo::loadBytes() const { - return getServerMetrics().load.bytes; + return getMetrics().load.bytes; } void TCServerInfo::removeTeam(Reference team) { @@ -373,7 +373,7 
@@ int64_t TCTeamInfo::getLoadBytes(bool includeInFlight, double inflightPenalty) c int64_t TCTeamInfo::getMinAvailableSpace(bool includeInFlight) const { int64_t minAvailableSpace = std::numeric_limits::max(); for (const auto& server : servers) { - if (server->serverMetricsPresent()) { + if (server->metricsPresent()) { const auto [bytesAvailable, bytesCapacity] = server->spaceBytes(includeInFlight); minAvailableSpace = std::min(bytesAvailable, minAvailableSpace); } @@ -385,7 +385,7 @@ int64_t TCTeamInfo::getMinAvailableSpace(bool includeInFlight) const { double TCTeamInfo::getMinAvailableSpaceRatio(bool includeInFlight) const { double minRatio = 1.0; for (const auto& server : servers) { - if (server->serverMetricsPresent()) { + if (server->metricsPresent()) { auto [bytesAvailable, bytesCapacity] = server->spaceBytes(includeInFlight); bytesAvailable = std::max((int64_t)0, bytesAvailable); @@ -404,7 +404,7 @@ bool TCTeamInfo::allServersHaveHealthyAvailableSpace() const { double minAvailableSpaceRatio = SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO + SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO_SAFETY_BUFFER; for (const auto& server : servers) { - if (!server->serverMetricsPresent() || !server->hasHealthyAvailableSpace(minAvailableSpaceRatio)) { + if (!server->metricsPresent() || !server->hasHealthyAvailableSpace(minAvailableSpaceRatio)) { result = false; break; } @@ -442,7 +442,7 @@ int64_t TCTeamInfo::getLoadAverage() const { int64_t bytesSum = 0; int added = 0; for (const auto& server : servers) { - if (server->serverMetricsPresent()) { + if (server->metricsPresent()) { added++; bytesSum += server->loadBytes(); } diff --git a/fdbserver/TCInfo.h b/fdbserver/TCInfo.h index e5f9a86929..da18d345e7 100644 --- a/fdbserver/TCInfo.h +++ b/fdbserver/TCInfo.h @@ -46,9 +46,11 @@ class TCServerInfo : public ReferenceCounted { int64_t dataInFlightToServer; std::vector> teams; - ErrorOr serverMetrics; + ErrorOr metrics; - void setServerMetrics(GetStorageMetricsReply serverMetrics) { 
this->serverMetrics = serverMetrics; } + GetStorageMetricsReply const& getMetrics() const { return metrics.get(); } + + void setMetrics(GetStorageMetricsReply serverMetrics) { this->metrics = serverMetrics; } void markTeamUnhealthy(int teamIndex); public: @@ -88,8 +90,7 @@ public: void addTeam(Reference team) { teams.push_back(team); } void removeTeamsContainingServer(UID removedServer); void removeTeam(Reference); - GetStorageMetricsReply const& getServerMetrics() const { return serverMetrics.get(); } - bool serverMetricsPresent() const { return serverMetrics.present(); } + bool metricsPresent() const { return metrics.present(); } bool isCorrectStoreType(KeyValueStoreType configStoreType) const { // A new storage server's store type may not be set immediately. From 8dcac2f76d9a0da0005ccaa85ae0d06d96622703 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Sun, 13 Mar 2022 10:02:11 -0300 Subject: [PATCH 068/138] Fix typos --- bindings/c/test/mako/mako.rst | 2 +- .../com/apple/foundationdb/tuple/Versionstamp.java | 3 +-- documentation/sphinx/source/api-c.rst | 2 +- documentation/sphinx/source/backups.rst | 2 +- documentation/sphinx/source/client-testing.rst | 4 ++-- documentation/sphinx/source/developer-guide.rst | 2 +- documentation/sphinx/source/special-keys.rst | 4 ++-- documentation/sphinx/source/tss.rst | 10 +++++----- fdbbackup/backup.actor.cpp | 2 +- fdbcli/StatusCommand.actor.cpp | 4 ++-- fdbclient/BlobGranuleReader.actor.cpp | 2 +- fdbclient/ClusterInterface.h | 2 +- fdbclient/HTTP.actor.cpp | 2 +- fdbclient/NativeAPI.actor.cpp | 10 +++++----- fdbclient/NativeAPI.actor.h | 4 ++-- fdbclient/S3BlobStore.h | 2 +- fdbclient/SpecialKeySpace.actor.cpp | 2 +- fdbclient/SpecialKeySpace.actor.h | 2 +- fdbclient/SystemData.h | 2 +- fdbclient/TaskBucket.h | 2 +- fdbrpc/FlowTransport.actor.cpp | 2 +- fdbrpc/IRateControl.h | 4 ++-- fdbrpc/fdbrpc.h | 2 +- fdbrpc/sim2.actor.cpp | 2 +- fdbserver/ApplyMetadataMutation.cpp | 2 +- fdbserver/BlobWorker.actor.cpp | 2 +- 
fdbserver/GrvProxyServer.actor.cpp | 2 +- fdbserver/OldTLogServer_6_2.actor.cpp | 2 +- fdbserver/RestoreController.actor.cpp | 2 +- fdbserver/RestoreLoader.actor.cpp | 8 ++++---- fdbserver/RestoreRoleCommon.actor.cpp | 2 +- fdbserver/RestoreWorkerInterface.actor.h | 2 +- fdbserver/Status.actor.cpp | 4 ++-- fdbserver/VersionedBTree.actor.cpp | 2 +- fdbserver/tester.actor.cpp | 2 +- flow/Error.h | 2 +- flow/Knobs.h | 2 +- flow/TDMetric.actor.h | 2 +- flow/TypeTraits.h | 4 ++-- 39 files changed, 57 insertions(+), 58 deletions(-) diff --git a/bindings/c/test/mako/mako.rst b/bindings/c/test/mako/mako.rst index ea9603c234..f5b2150260 100644 --- a/bindings/c/test/mako/mako.rst +++ b/bindings/c/test/mako/mako.rst @@ -149,7 +149,7 @@ Format ------ | One operation type is defined as ```` or ``:``. | When Count is omitted, it's equivalent to setting it to 1. (e.g. ``g`` is equivalent to ``g1``) -| Multiple operation types within the same trancaction can be concatenated. (e.g. ``g9u1`` = 9 GETs and 1 update) +| Multiple operation types within the same transaction can be concatenated. (e.g. ``g9u1`` = 9 GETs and 1 update) Transaction Specification Examples ---------------------------------- diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/Versionstamp.java b/bindings/java/src/main/com/apple/foundationdb/tuple/Versionstamp.java index 07c3218eac..179ac19dc6 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/Versionstamp.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/Versionstamp.java @@ -24,7 +24,6 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.Arrays; - /** * Used to represent values written by versionstamp operations with a {@link Tuple}. * This wraps a single array which should contain twelve bytes. The first ten bytes @@ -37,7 +36,7 @@ import java.util.Arrays; * over time. The final two bytes are the "user" version and should be set by the client. 
* This allows the user to use this class to impose a total order of items across multiple * transactions in the database in a consistent and conflict-free way. The user can elect to - * ignore this parameter by instantiating the class with the paramaterless {@link #incomplete() incomplete()} + * ignore this parameter by instantiating the class with the parameterless {@link #incomplete() incomplete()} * and one-parameter {@link #complete(byte[]) complete} static initializers. If they do so, * then versions are written with a default (constant) user version. * diff --git a/documentation/sphinx/source/api-c.rst b/documentation/sphinx/source/api-c.rst index 5d746eff7a..c647489bee 100644 --- a/documentation/sphinx/source/api-c.rst +++ b/documentation/sphinx/source/api-c.rst @@ -454,7 +454,7 @@ An |database-blurb1| Modifications to a database are performed via transactions. The function will change the region configuration to have a positive priority for the chosen dcId, and a negative priority for all other dcIds. - In particular, no error will be thrown if the given dcId does not exist. It will just not attemp to force a recovery. + In particular, no error will be thrown if the given dcId does not exist. It will just not attempt to force a recovery. If the database has already recovered, the function does nothing. Thus it's safe to call it multiple times. diff --git a/documentation/sphinx/source/backups.rst b/documentation/sphinx/source/backups.rst index 6d09954998..740da17706 100644 --- a/documentation/sphinx/source/backups.rst +++ b/documentation/sphinx/source/backups.rst @@ -115,7 +115,7 @@ Here is a complete list of valid parameters: *request_timeout_min* (or *rtom*) - Minimum number of seconds to wait for a request to succeed after a connection is established. - *request_tries* (or *rt*) - Number of times to try each request until a parseable HTTP response other than 429 is received. 
+ *request_tries* (or *rt*) - Number of times to try each request until a parsable HTTP response other than 429 is received. *requests_per_second* (or *rps*) - Max number of requests to start per second. diff --git a/documentation/sphinx/source/client-testing.rst b/documentation/sphinx/source/client-testing.rst index be596dd11d..8aca24b7a6 100644 --- a/documentation/sphinx/source/client-testing.rst +++ b/documentation/sphinx/source/client-testing.rst @@ -11,7 +11,7 @@ Testing Error Handling with Buggify FoundationDB clients need to handle errors correctly. Wrong error handling can lead to many bugs - in the worst case it can lead to a corrupted database. Because of this it is important that an application or layer author tests properly their -application during failure scenarios. But this is non-trivial. In a developement environment cluster failures are very +application during failure scenarios. But this is non-trivial. In a development environment cluster failures are very unlikely and it is therefore possible that certain types of exceptions are never tested in a controlled environment. The simplest way of testing for these kind of errors is a simple mechanism called ``Buggify``. If this option is enabled @@ -327,7 +327,7 @@ processes with the class test. So above 2-step process becomes a bit more comple 1. Write the test (same as above). 2. Set up a cluster with as many test clients as you want. -3. Run the orchestor to actually execute the test. +3. Run the orchestrator to actually execute the test. Step 1. is explained further up. For step 2., please refer to the general FoundationDB configuration. 
The main difference to a normal FoundationDB cluster is that some processes diff --git a/documentation/sphinx/source/developer-guide.rst b/documentation/sphinx/source/developer-guide.rst index af2ff736f8..92b1041145 100644 --- a/documentation/sphinx/source/developer-guide.rst +++ b/documentation/sphinx/source/developer-guide.rst @@ -915,7 +915,7 @@ When using FoundationDB we strongly recommend users to use the retry-loop. In Py except FDBError as e: tr.on_error(e.code).wait() -This is also what the transaction decoration in python does, if you pass a ``Database`` object to a decorated function. There are some interesting properies of this retry loop: +This is also what the transaction decoration in python does, if you pass a ``Database`` object to a decorated function. There are some interesting properties of this retry loop: * We never create a new transaction within that loop. Instead ``tr.on_error`` will create a soft reset on the transaction. * ``tr.on_error`` returns a future. This is because ``on_error`` will do back off to make sure we don't overwhelm the cluster. diff --git a/documentation/sphinx/source/special-keys.rst b/documentation/sphinx/source/special-keys.rst index 0dd5840ab8..58978f1de7 100644 --- a/documentation/sphinx/source/special-keys.rst +++ b/documentation/sphinx/source/special-keys.rst @@ -121,8 +121,8 @@ Aggregate stats about cluster health. 
Reading this key alone is slightly cheaper **Field** **Type** **Description** ----------------------------------- -------- --------------- batch_limited boolean Whether or not the cluster is limiting batch priority transactions -limiting_storage_durability_lag number storage_durability_lag that ratekeeper is using to determing throttling (see the description for storage_durability_lag) -limiting_storage_queue number storage_queue that ratekeeper is using to determing throttling (see the description for storage_queue) +limiting_storage_durability_lag number storage_durability_lag that ratekeeper is using to determine throttling (see the description for storage_durability_lag) +limiting_storage_queue number storage_queue that ratekeeper is using to determine throttling (see the description for storage_queue) tps_limit number The rate at which normal priority transactions are allowed to start worst_storage_durability_lag number See the description for storage_durability_lag worst_storage_queue number See the description for storage_queue diff --git a/documentation/sphinx/source/tss.rst b/documentation/sphinx/source/tss.rst index 215e9967b9..0332e042a2 100644 --- a/documentation/sphinx/source/tss.rst +++ b/documentation/sphinx/source/tss.rst @@ -13,7 +13,7 @@ This document covers the operation and architecture of the Testing Storage Serve Summary ============ -The TSS feature allows FoundationDB to run an "untrusted" storage engine (the *testing storage engine*) directly in a QA or production envronment with identical workload to the current storage engine, with zero impact on durability or correctness, and minimal impact on performance. +The TSS feature allows FoundationDB to run an "untrusted" storage engine (the *testing storage engine*) directly in a QA or production environment with identical workload to the current storage engine, with zero impact on durability or correctness, and minimal impact on performance. 
This allows a FoundationDB cluster operator to validate the correctness and performance of a different storage engine on the exact cluster workload before migrating data to the different storage engine. @@ -44,10 +44,10 @@ The ``status`` command in the FDB :ref:`command line interface getLayerStatus(Reference tr return json; } -// Check for unparseable or expired statuses and delete them. +// Check for unparsable or expired statuses and delete them. // First checks the first doc in the key range, and if it is valid, alive and not "me" then // returns. Otherwise, checks the rest of the range as well. ACTOR Future cleanupStatus(Reference tr, diff --git a/fdbcli/StatusCommand.actor.cpp b/fdbcli/StatusCommand.actor.cpp index d1a772e0a0..a1f0f5faba 100644 --- a/fdbcli/StatusCommand.actor.cpp +++ b/fdbcli/StatusCommand.actor.cpp @@ -1201,7 +1201,7 @@ void printStatus(StatusObjectReader statusObj, // "db" is the handler to the multiversion database // localDb is the native Database object -// localDb is rarely needed except the "db" has not establised a connection to the cluster where the operation will +// localDb is rarely needed except the "db" has not established a connection to the cluster where the operation will // return Never as we expect status command to always return, we use "localDb" to return the default result ACTOR Future statusCommandActor(Reference db, Database localDb, @@ -1255,4 +1255,4 @@ CommandFactory statusFactory( "statistics.\n\nSpecifying `minimal' will provide a minimal description of the status of your " "database.\n\nSpecifying `details' will provide load information for individual " "workers.\n\nSpecifying `json' will provide status information in a machine readable JSON format.")); -} // namespace fdb_cli \ No newline at end of file +} // namespace fdb_cli diff --git a/fdbclient/BlobGranuleReader.actor.cpp b/fdbclient/BlobGranuleReader.actor.cpp index e07f100e32..bd723df2f3 100644 --- a/fdbclient/BlobGranuleReader.actor.cpp +++ 
b/fdbclient/BlobGranuleReader.actor.cpp @@ -30,7 +30,7 @@ #include "fdbclient/BlobWorkerInterface.h" #include "flow/actorcompiler.h" // This must be the last #include. -// TODO more efficient data structure besides std::map? PTree is unecessary since this isn't versioned, but some other +// TODO more efficient data structure besides std::map? PTree is unnecessary since this isn't versioned, but some other // sorted thing could work. And if it used arenas it'd probably be more efficient with allocations, since everything // else is in 1 arena and discarded at the end. diff --git a/fdbclient/ClusterInterface.h b/fdbclient/ClusterInterface.h index 01ac351f61..6b86322bb6 100644 --- a/fdbclient/ClusterInterface.h +++ b/fdbclient/ClusterInterface.h @@ -308,7 +308,7 @@ struct SplitShardReply { }; // Split keyrange [shard.begin, shard.end) into num shards. -// Split points are chosen as the arithmeticlly equal division points of the given range. +// Split points are chosen as the arithmetically equal division points of the given range. 
struct SplitShardRequest { constexpr static FileIdentifier file_identifier = 1384443; KeyRange shard; diff --git a/fdbclient/HTTP.actor.cpp b/fdbclient/HTTP.actor.cpp index d10445d38d..6ac53ecbbe 100644 --- a/fdbclient/HTTP.actor.cpp +++ b/fdbclient/HTTP.actor.cpp @@ -207,7 +207,7 @@ ACTOR Future read_http_response_headers(Reference conn, // Reads an HTTP response from a network connection // If the connection fails while being read the exception will emitted -// If the response is not parseable or complete in some way, http_bad_response will be thrown +// If the response is not parsable or complete in some way, http_bad_response will be thrown ACTOR Future read_http_response(Reference r, Reference conn, bool header_only) { state std::string buf; state size_t pos = 0; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 1d4c898925..d128824f55 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -555,7 +555,7 @@ ACTOR static Future transactionInfoCommitActor(Transaction* tr, std::vecto for (auto& chunk : *chunks) { tr->atomicOp(chunk.key, chunk.value, MutationRef::SetVersionstampedKey); numCommitBytes += chunk.key.size() + chunk.value.size() - - 4; // subtract number of bytes of key that denotes verstion stamp index + 4; // subtract number of bytes of key that denotes version stamp index } tr->atomicOp(clientLatencyAtomicCtr, StringRef((uint8_t*)&numCommitBytes, 8), MutationRef::AddValue); wait(tr->commit()); @@ -3263,7 +3263,7 @@ ACTOR Future sameVersionDiffValue(Database cx, Reference state Optional valSS = wait(tr.get(parameters->key)); Reference metadata = cx->getWatchMetadata(parameters->key.contents()); - // val_3 != val_1 (storage server value doesnt match value in map) + // val_3 != val_1 (storage server value doesn't match value in map) if (metadata.isValid() && valSS != metadata->parameters->value) { cx->deleteWatchMetadata(parameters->key.contents()); @@ -6582,9 +6582,9 @@ ACTOR Future>> 
getReadHotRanges(Da UseProvisionalProxies::False)); try { // TODO: how to handle this? - // This function is called whenever a shard becomes read-hot. But somehow the shard was splitted across more - // than one storage server after become read-hot and before this function is called, i.e. a race condition. - // Should we abort and wait the newly splitted shards to be hot again? + // This function is called whenever a shard becomes read-hot. But somehow the shard was split across more + // than one storage server after becoming read-hot and before this function is called, i.e. a race + // condition. Should we abort and wait for the newly split shards to be hot again? state int nLocs = locations.size(); // if (nLocs > 1) { // TraceEvent("RHDDebug") diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index d2ee31d20b..f9651a78bc 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -132,12 +132,12 @@ void setupNetwork(uint64_t transportId = 0, UseMetrics = UseMetrics::False); // call stopNetwork (from a non-networking thread) can cause the runNetwork() call to // return. // -// Throws network_already_setup if g_network has already been initalized +// Throws network_already_setup if g_network has already been initialized void runNetwork(); // See above. 
Can be called from a thread that is not the "networking thread" // -// Throws network_not_setup if g_network has not been initalized +// Throws network_not_setup if g_network has not been initialized void stopNetwork(); struct StorageMetrics; diff --git a/fdbclient/S3BlobStore.h b/fdbclient/S3BlobStore.h index 8153fa3b22..fab21225c1 100644 --- a/fdbclient/S3BlobStore.h +++ b/fdbclient/S3BlobStore.h @@ -68,7 +68,7 @@ public: "connect_tries (or ct) Number of times to try to connect for each request.", "connect_timeout (or cto) Number of seconds to wait for a connect request to succeed.", "max_connection_life (or mcl) Maximum number of seconds to use a single TCP connection.", - "request_tries (or rt) Number of times to try each request until a parseable HTTP " + "request_tries (or rt) Number of times to try each request until a parsable HTTP " "response other than 429 is received.", "request_timeout_min (or rtom) Number of seconds to wait for a request to succeed after a " "connection is established.", diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index 78f9c93e50..61481cb27b 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -2567,7 +2567,7 @@ void includeLocalities(ReadYourWritesTransaction* ryw) { } } -// Reads the excludedlocality and failed locality keys using managment api, +// Reads the excludedlocality and failed locality keys using management api, // parses them and returns the list. bool parseLocalitiesFromKeys(ReadYourWritesTransaction* ryw, bool failed, diff --git a/fdbclient/SpecialKeySpace.actor.h b/fdbclient/SpecialKeySpace.actor.h index 857c8c8ab0..2d9e206e21 100644 --- a/fdbclient/SpecialKeySpace.actor.h +++ b/fdbclient/SpecialKeySpace.actor.h @@ -281,7 +281,7 @@ public: // Use special key prefix "\xff\xff/transaction/conflicting_keys/", // to retrieve keys which caused latest not_committed(conflicting with another transaction) error. 
-// The returned key value pairs are interpretted as : +// The returned key value pairs are interpreted as : // prefix/ : '1' - any keys equal or larger than this key are (probably) conflicting keys // prefix/ : '0' - any keys equal or larger than this key are (definitely) not conflicting keys // Currently, the conflicting keyranges returned are original read_conflict_ranges or union of them. diff --git a/fdbclient/SystemData.h b/fdbclient/SystemData.h index 993efb1dba..b9cfa478e2 100644 --- a/fdbclient/SystemData.h +++ b/fdbclient/SystemData.h @@ -490,7 +490,7 @@ const Value healthyZoneValue(StringRef const& zoneId, Version version); std::pair decodeHealthyZoneValue(ValueRef const&); // All mutations done to this range are blindly copied into txnStateStore. -// Used to create artifically large txnStateStore instances in testing. +// Used to create artificially large txnStateStore instances in testing. extern const KeyRangeRef testOnlyTxnStateStorePrefixRange; // Snapshot + Incremental Restore diff --git a/fdbclient/TaskBucket.h b/fdbclient/TaskBucket.h index e492f26226..cfd4cd50ea 100644 --- a/fdbclient/TaskBucket.h +++ b/fdbclient/TaskBucket.h @@ -52,7 +52,7 @@ FDB_DECLARE_BOOLEAN_PARAM(UpdateParams); // 4. If the executor loses contact with FDB, another executor may begin at step 2. The first // Task execution can detect this by checking the result of keepRunning() periodically. // 5. Once a Task execution's _execute() call returns, the _finish() step is called. 
-// _finish() is transactional and is guaraunteed to never be called more than once for the +// _finish() is transactional and is guaranteed to never be called more than once for the // same Task class Task : public ReferenceCounted { public: diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 3e3b13709a..4cf799957e 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -921,7 +921,7 @@ ACTOR static void deliver(TransportData* self, bool inReadSocket) { // We want to run the task at the right priority. If the priority is higher than the current priority (which is // ReadSocket) we can just upgrade. Otherwise we'll context switch so that we don't block other tasks that might run - // with a higher priority. ReplyPromiseStream needs to guarentee that messages are recieved in the order they were + // with a higher priority. ReplyPromiseStream needs to guarantee that messages are received in the order they were // sent, so we are using orderedDelay. // NOTE: don't skip delay(0) when it's local deliver since it could cause out of order object deconstruction. 
if (priority < TaskPriority::ReadSocket || !inReadSocket) { diff --git a/fdbrpc/IRateControl.h b/fdbrpc/IRateControl.h index b2c3c43aa5..83ad94b478 100644 --- a/fdbrpc/IRateControl.h +++ b/fdbrpc/IRateControl.h @@ -36,7 +36,7 @@ public: virtual void delref() = 0; }; -// An IRateControl implemenation that allows at most hands out at most windowLimit units of 'credit' in windowSeconds +// An IRateControl implementation that allows at most hands out at most windowLimit units of 'credit' in windowSeconds // seconds class SpeedLimit final : public IRateControl, ReferenceCounted { public: @@ -89,7 +89,7 @@ private: Promise m_stop; }; -// An IRateControl implemenation that enforces no limit +// An IRateControl implementation that enforces no limit class Unlimited final : public IRateControl, ReferenceCounted { public: Unlimited() {} diff --git a/fdbrpc/fdbrpc.h b/fdbrpc/fdbrpc.h index 70e199289b..fd80249034 100644 --- a/fdbrpc/fdbrpc.h +++ b/fdbrpc/fdbrpc.h @@ -274,7 +274,7 @@ struct AcknowledgementReply { } }; -// Registered on the server to recieve acknowledgements that the client has received stream data. This prevents the +// Registered on the server to receive acknowledgements that the client has received stream data. This prevents the // server from sending too much data to the client if the client is not consuming it. struct AcknowledgementReceiver final : FlowReceiver, FastAllocated { using FastAllocated::operator new; diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index cff1b1d5ad..2de43a3bf7 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -865,7 +865,7 @@ public: if (!ordered && !currentProcess->rebooting && machine == currentProcess && !currentProcess->shutdownSignal.isSet() && FLOW_KNOBS->MAX_BUGGIFIED_DELAY > 0 && - deterministicRandom()->random01() < 0.25) { // FIXME: why doesnt this work when we are changing machines? + deterministicRandom()->random01() < 0.25) { // FIXME: why doesn't this work when we are changing machines? 
seconds += FLOW_KNOBS->MAX_BUGGIFIED_DELAY * pow(deterministicRandom()->random01(), 1000.0); } diff --git a/fdbserver/ApplyMetadataMutation.cpp b/fdbserver/ApplyMetadataMutation.cpp index 510f8ebc8e..471652dcda 100644 --- a/fdbserver/ApplyMetadataMutation.cpp +++ b/fdbserver/ApplyMetadataMutation.cpp @@ -703,7 +703,7 @@ private: } } // Might be a tss removal, which doesn't store a tag there. - // Chained if is a little verbose, but avoids unecessary work + // Chained if is a little verbose, but avoids unnecessary work if (toCommit && !initialCommit && !serverKeysCleared.size()) { KeyRangeRef maybeTssRange = range & serverTagKeys; if (maybeTssRange.singleKeyRange()) { diff --git a/fdbserver/BlobWorker.actor.cpp b/fdbserver/BlobWorker.actor.cpp index a8a066f6ea..7e497fd3cc 100644 --- a/fdbserver/BlobWorker.actor.cpp +++ b/fdbserver/BlobWorker.actor.cpp @@ -473,7 +473,7 @@ ACTOR Future> getGranuleSplitState(Tra } // writeDelta file writes speculatively in the common case to optimize throughput. It creates the s3 object even though -// the data in it may not yet be committed, and even though previous delta fiels with lower versioned data may still be +// the data in it may not yet be committed, and even though previous delta files with lower versioned data may still be // in flight. The synchronization happens after the s3 file is written, but before we update the FDB index of what files // exist. Before updating FDB, we ensure the version is committed and all previous delta files have updated FDB. 
ACTOR Future writeDeltaFile(Reference bwData, diff --git a/fdbserver/GrvProxyServer.actor.cpp b/fdbserver/GrvProxyServer.actor.cpp index ebfdb8dc55..1523acbd39 100644 --- a/fdbserver/GrvProxyServer.actor.cpp +++ b/fdbserver/GrvProxyServer.actor.cpp @@ -598,7 +598,7 @@ ACTOR Future getLiveCommittedVersion(SpanID parentSpan, return rep; } -// Returns the current read version (or minimum known committed verison if requested), +// Returns the current read version (or minimum known committed version if requested), // to each request in the provided list. Also check if the request should be throttled. // Update GRV statistics according to the request's priority. ACTOR Future sendGrvReplies(Future replyFuture, diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index 550defee1a..c2e5bbaa86 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -2531,7 +2531,7 @@ void removeLog(TLogData* self, Reference logData) { } } -// copy data from old gene to new gene without desiarlzing +// copy data from old gene to new gene without deserializing ACTOR Future pullAsyncData(TLogData* self, Reference logData, std::vector tags, diff --git a/fdbserver/RestoreController.actor.cpp b/fdbserver/RestoreController.actor.cpp index bbc5e8cd06..441ad49050 100644 --- a/fdbserver/RestoreController.actor.cpp +++ b/fdbserver/RestoreController.actor.cpp @@ -1140,7 +1140,7 @@ ACTOR static Future signalRestoreCompleted(Reference updateHeartbeatTime(Reference self) { wait(self->recruitedRoles.getFuture()); diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index 42135a7825..ec15168433 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -893,8 +893,8 @@ ACTOR Future sendMutationsToApplier( UID applierID = nodeIDs[splitMutationIndex]; DEBUG_MUTATION("RestoreLoaderSplitMutation", commitVersion.version, mutation) .detail("CommitVersion", 
commitVersion.toString()); - // CAREFUL: The splitted mutations' lifetime is shorter than the for-loop - // Must use deep copy for splitted mutations + // CAREFUL: The split mutations' lifetime is shorter than the for-loop + // Must use deep copy for split mutations applierVersionedMutationsBuffer[applierID].push_back_deep( applierVersionedMutationsBuffer[applierID].arena(), VersionedMutation(mutation, commitVersion)); msgSize += mutation.expectedSize(); @@ -986,7 +986,7 @@ ACTOR Future sendMutationsToApplier( return Void(); } -// Splits a clear range mutation for Appliers and puts results of splitted mutations and +// Splits a clear range mutation for Appliers and puts results of split mutations and // Applier IDs into "mvector" and "nodeIDs" on return. void splitMutation(const KeyRangeMap& krMap, MutationRef m, @@ -1180,7 +1180,7 @@ void _parseSerializedMutation(KeyRangeMap* pRangeVersions, } // Parsing the data blocks in a range file -// kvOpsIter: saves the parsed versioned-mutations for the sepcific LoadingParam; +// kvOpsIter: saves the parsed versioned-mutations for the specific LoadingParam; // samplesIter: saves the sampled mutations from the parsed versioned-mutations; // bc: backup container to read the backup file // version: the version the parsed mutations should be at diff --git a/fdbserver/RestoreRoleCommon.actor.cpp b/fdbserver/RestoreRoleCommon.actor.cpp index 5dc1de2f0a..6cef2b1a3e 100644 --- a/fdbserver/RestoreRoleCommon.actor.cpp +++ b/fdbserver/RestoreRoleCommon.actor.cpp @@ -105,7 +105,7 @@ void updateProcessStats(Reference self) { } } -// An actor is schedulable to run if the current worker has enough resourc, i.e., +// An actor is schedulable to run if the current worker has enough resources, i.e., // the worker's memory usage is below the threshold; // Exception: If the actor is working on the current version batch, we have to schedule // the actor to run to avoid dead-lock. 
diff --git a/fdbserver/RestoreWorkerInterface.actor.h b/fdbserver/RestoreWorkerInterface.actor.h index 02b84af808..cc052d18f8 100644 --- a/fdbserver/RestoreWorkerInterface.actor.h +++ b/fdbserver/RestoreWorkerInterface.actor.h @@ -251,7 +251,7 @@ struct RestoreControllerInterface : RestoreRoleInterface { // RestoreAsset uniquely identifies the work unit done by restore roles; // It is used to ensure exact-once processing on restore loader and applier; -// By combining all RestoreAssets across all verstion batches, restore should process all mutations in +// By combining all RestoreAssets across all version batches, restore should process all mutations in // backup range and log files up to the target restore version. struct RestoreAsset { UID uid; diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index bd6dcc70c1..fd87736eec 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1984,8 +1984,8 @@ ACTOR static Future>> getGrvP return results; } -// Returns the number of zones eligble for recruiting new tLogs after zone failures, to maintain the current replication -// factor. +// Returns the number of zones eligible for recruiting new tLogs after zone failures, to maintain the current +// replication factor. static int getExtraTLogEligibleZones(const std::vector& workers, const DatabaseConfiguration& configuration) { std::set allZones; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 7c4c21c57e..ae9ec44291 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -2061,7 +2061,7 @@ class DWALPagerSnapshot; // oldest pager version being maintained the remap can be "undone" by popping it from the remap queue, // copying the alternate page ID's data over top of the original page ID's data, and deleting the remap from memory. 
// This process basically describes a "Delayed" Write-Ahead-Log (DWAL) because the remap queue and the newly allocated -// alternate pages it references basically serve as a write ahead log for pages that will eventially be copied +// alternate pages it references basically serve as a write ahead log for pages that will eventually be copied // back to their original location once the original version is no longer needed. class DWALPager final : public IPager2 { public: diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 495f38cf48..b7af32cf1b 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -465,7 +465,7 @@ ACTOR Future databaseWarmer(Database cx) { } } -// Tries indefinitly to commit a simple, self conflicting transaction +// Tries indefinitely to commit a simple, self conflicting transaction ACTOR Future pingDatabase(Database cx) { state Transaction tr(cx); loop { diff --git a/flow/Error.h b/flow/Error.h index 56450ac467..590d2a747a 100644 --- a/flow/Error.h +++ b/flow/Error.h @@ -132,7 +132,7 @@ extern bool isAssertDisabled(int line); enum assert_op { EQ, NE, LT, GT, LE, GE }; -// TODO: magic so this works even if const-ness doesn not match. +// TODO: magic so this works even if const-ness does not match. template void assert_num_impl(char const* a_nm, T const& a, diff --git a/flow/Knobs.h b/flow/Knobs.h index 564f64530e..d9685953cb 100644 --- a/flow/Knobs.h +++ b/flow/Knobs.h @@ -33,7 +33,7 @@ #include // Helper macros to allow the init macro to be called with an optional third -// paramater, used to explicit set atomicity of knobs. +// parameter, used to explicitly set atomicity of knobs. #define KNOB_FN(_1, _2, _3, FN, ...) 
FN #define INIT_KNOB(knob, value) initKnob(knob, value, #knob) #define INIT_ATOMIC_KNOB(knob, value, atomic) initKnob(knob, value, #knob, atomic) diff --git a/flow/TDMetric.actor.h b/flow/TDMetric.actor.h index 00aa399f52..f2d30d1739 100644 --- a/flow/TDMetric.actor.h +++ b/flow/TDMetric.actor.h @@ -655,7 +655,7 @@ struct NullDescriptor { }; // Descriptor must have the methods name() and typeName(). They can be either static or member functions (such as for -// runtime configurability). Descriptor is inherited so that syntatically Descriptor::fn() works in either case and so +// runtime configurability). Descriptor is inherited so that syntactically Descriptor::fn() works in either case and so // that an empty Descriptor with static methods will take up 0 space. EventField() accepts an optional Descriptor // instance. template > diff --git a/flow/TypeTraits.h b/flow/TypeTraits.h index 9552f5f34e..261bb2e860 100644 --- a/flow/TypeTraits.h +++ b/flow/TypeTraits.h @@ -21,7 +21,7 @@ // This file, similar to `type_traits` in the standard library, contains utility types that can be used for template // metaprogramming. While they can be very useful and simplify certain things, please be aware that their use will // increase compilation times significantly. Therefore it is not recommended to use them in header file if not -// absosultely necessary. +// absolutely necessary. #pragma once #include @@ -50,6 +50,6 @@ struct variant_map_t, Fun> { }; // Helper definition for variant_map_t. Instead of using `typename variant_map<...>::type` one can simple use -// `varirant_map<...>` which is equivalent but shorter. +// `variant_map<...>` which is equivalent but shorter. 
template class Fun> using variant_map = typename variant_map_t::type; From 09bb37ce3ef6c96921afec93c830623c103f3330 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Sun, 13 Mar 2022 21:37:46 +0100 Subject: [PATCH 069/138] ApiTester: retry a transaction after all futures are ready --- .../apitester/TesterCorrectnessWorkload.cpp | 6 +- .../apitester/TesterTransactionExecutor.cpp | 61 +++++++++++++------ .../apitester/TesterTransactionExecutor.h | 9 ++- 3 files changed, 54 insertions(+), 22 deletions(-) diff --git a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp index d65fd3d1d3..732f1778cb 100644 --- a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp +++ b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp @@ -102,12 +102,14 @@ private: ASSERT(results->size() == kvPairs->size()); for (int i = 0; i < kvPairs->size(); i++) { auto expected = store.get((*kvPairs)[i].key); - if ((*results)[i] != expected) { + auto actual = (*results)[i]; + if (actual != expected) { error( fmt::format("randomCommitReadOp mismatch. 
key: {} expected: {:.80} actual: {:.80}", (*kvPairs)[i].key, expected, - (*results)[i])); + actual)); + ASSERT(false); } } schedule(cont); diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.cpp b/bindings/c/test/apitester/TesterTransactionExecutor.cpp index 563350acb9..505b55251e 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.cpp +++ b/bindings/c/test/apitester/TesterTransactionExecutor.cpp @@ -38,19 +38,33 @@ void TransactionActorBase::complete(fdb_error_t err) { void ITransactionContext::continueAfterAll(std::vector futures, TTaskFct cont) { auto counter = std::make_shared>(futures.size()); + auto errorCode = std::make_shared>(error_code_success); + auto thisPtr = shared_from_this(); for (auto& f : futures) { - continueAfter(f, [counter, cont]() { - if (--(*counter) == 0) { - cont(); - } - }); + continueAfter( + f, + [thisPtr, f, counter, errorCode, cont]() { + if (f.getError() != error_code_success) { + (*errorCode) = f.getError(); + } + if (--(*counter) == 0) { + if (*errorCode == error_code_success) { + // all futures successful -> continue + cont(); + } else { + // at least one future failed -> retry the transaction + thisPtr->onError(*errorCode); + } + } + }, + false); } } /** * Transaction context base class, containing reusable functionality */ -class TransactionContextBase : public ITransactionContext, public std::enable_shared_from_this { +class TransactionContextBase : public ITransactionContext { public: TransactionContextBase(FDBTransaction* tx, std::shared_ptr txActor, @@ -65,7 +79,7 @@ public: Transaction* tx() override { return &fdbTx; } // Set a continuation to be executed when a future gets ready - void continueAfter(Future f, TTaskFct cont) override { doContinueAfter(f, cont); } + void continueAfter(Future f, TTaskFct cont, bool retryOnError) override { doContinueAfter(f, cont, retryOnError); } // Complete the transaction with a commit void commit() override { @@ -76,7 +90,8 @@ public: lock.unlock(); Future f = 
fdbTx.commit(); auto thisRef = shared_from_this(); - doContinueAfter(f, [thisRef]() { thisRef->done(); }); + doContinueAfter( + f, [thisRef]() { thisRef->done(); }, true); } // Complete the transaction without a commit (for read transactions) @@ -96,7 +111,7 @@ public: } protected: - virtual void doContinueAfter(Future f, TTaskFct cont) = 0; + virtual void doContinueAfter(Future f, TTaskFct cont, bool retryOnError) = 0; // Clean up transaction state after completing the transaction // Note that the object may live longer, because it is referenced @@ -170,12 +185,13 @@ public: : TransactionContextBase(tx, txActor, cont, scheduler) {} protected: - void doContinueAfter(Future f, TTaskFct cont) override { + void doContinueAfter(Future f, TTaskFct cont, bool retryOnError) override { auto thisRef = std::static_pointer_cast(shared_from_this()); - scheduler->schedule([thisRef, f, cont]() mutable { thisRef->blockingContinueAfter(f, cont); }); + scheduler->schedule( + [thisRef, f, cont, retryOnError]() mutable { thisRef->blockingContinueAfter(f, cont, retryOnError); }); } - void blockingContinueAfter(Future f, TTaskFct cont) { + void blockingContinueAfter(Future f, TTaskFct cont, bool retryOnError) { std::unique_lock lock(mutex); if (txState != TxState::IN_PROGRESS) { return; @@ -190,12 +206,16 @@ protected: if (err == error_code_transaction_cancelled) { return; } - if (err == error_code_success) { + if (err == error_code_success || !retryOnError) { scheduler->schedule([cont]() { cont(); }); return; } - lock.lock(); + onError(err); + } + + virtual void onError(fdb_error_t err) override { + std::unique_lock lock(mutex); if (txState != TxState::IN_PROGRESS) { // Ignore further errors, if the transaction is in the error handing mode or completed return; @@ -227,12 +247,12 @@ public: : TransactionContextBase(tx, txActor, cont, scheduler) {} protected: - void doContinueAfter(Future f, TTaskFct cont) override { + void doContinueAfter(Future f, TTaskFct cont, bool retryOnError) 
override { std::unique_lock lock(mutex); if (txState != TxState::IN_PROGRESS) { return; } - callbackMap[f.fdbFuture()] = CallbackInfo{ f, cont, shared_from_this() }; + callbackMap[f.fdbFuture()] = CallbackInfo{ f, cont, shared_from_this(), retryOnError }; lock.unlock(); fdb_error_t err = fdb_future_set_callback(f.fdbFuture(), futureReadyCallback, this); if (err) { @@ -263,12 +283,16 @@ protected: if (err == error_code_transaction_cancelled) { return; } - if (err == error_code_success) { + if (err == error_code_success || !cbInfo.retryOnError) { scheduler->schedule(cbInfo.cont); return; } - lock.lock(); + onError(err); + } + + virtual void onError(fdb_error_t err) override { + std::unique_lock lock(mutex); if (txState != TxState::IN_PROGRESS) { // Ignore further errors, if the transaction is in the error handing mode or completed return; @@ -326,6 +350,7 @@ protected: Future future; TTaskFct cont; std::shared_ptr thisRef; + bool retryOnError; }; // Map for keeping track of future waits and holding necessary object references diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.h b/bindings/c/test/apitester/TesterTransactionExecutor.h index e78223bf63..f8f9234e50 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.h +++ b/bindings/c/test/apitester/TesterTransactionExecutor.h @@ -34,7 +34,7 @@ namespace FdbApiTester { /** * Interface to be used for implementation of a concrete transaction */ -class ITransactionContext { +class ITransactionContext : public std::enable_shared_from_this { public: virtual ~ITransactionContext() {} @@ -42,11 +42,16 @@ public: virtual Transaction* tx() = 0; // Schedule a continuation to be executed when the future gets ready - virtual void continueAfter(Future f, TTaskFct cont) = 0; + // retryOnError controls whether transaction is retried in case of an error instead + // of calling the continuation + virtual void continueAfter(Future f, TTaskFct cont, bool retryOnError = true) = 0; // Complete the transaction with 
a commit virtual void commit() = 0; + // retry transaction on error + virtual void onError(fdb_error_t err) = 0; + // Mark the transaction as completed without committing it (for read transactions) virtual void done() = 0; From 415d1958d467cf4ecb8f646c54a972bbccde3271 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 14 Mar 2022 11:00:34 -0700 Subject: [PATCH 070/138] Build configuration fix: USE_JEMALLOC=OFF had no effect, did not disable using jemalloc on platforms that support it. (#6590) --- cmake/ConfigureCompiler.cmake | 1 + cmake/Jemalloc.cmake | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/cmake/ConfigureCompiler.cmake b/cmake/ConfigureCompiler.cmake index f05ca279be..7d8f1bc45c 100644 --- a/cmake/ConfigureCompiler.cmake +++ b/cmake/ConfigureCompiler.cmake @@ -5,6 +5,7 @@ env_set(USE_DTRACE ON BOOL "Enable dtrace probes on supported platforms") env_set(USE_VALGRIND OFF BOOL "Compile for valgrind usage") env_set(USE_VALGRIND_FOR_CTEST ${USE_VALGRIND} BOOL "Use valgrind for ctest") env_set(ALLOC_INSTRUMENTATION OFF BOOL "Instrument alloc") +env_set(USE_JEMALLOC ON BOOL "Link with jemalloc") env_set(USE_ASAN OFF BOOL "Compile with address sanitizer") env_set(USE_GCOV OFF BOOL "Compile with gcov instrumentation") env_set(USE_MSAN OFF BOOL "Compile with memory sanitizer. To avoid false positives you need to dynamically link to a msan-instrumented libc++ and libc++abi, which you must compile separately. 
See https://github.com/google/sanitizers/wiki/MemorySanitizerLibcxxHowTo#instrumented-libc.") diff --git a/cmake/Jemalloc.cmake b/cmake/Jemalloc.cmake index 723ca6b081..c59a290d77 100644 --- a/cmake/Jemalloc.cmake +++ b/cmake/Jemalloc.cmake @@ -1,6 +1,5 @@ add_library(jemalloc INTERFACE) -set(USE_JEMALLOC ON) # We don't want to use jemalloc on Windows # Nor on FreeBSD, where jemalloc is the default system allocator if(USE_SANITIZER OR WIN32 OR (CMAKE_SYSTEM_NAME STREQUAL "FreeBSD") OR APPLE) @@ -8,6 +7,10 @@ if(USE_SANITIZER OR WIN32 OR (CMAKE_SYSTEM_NAME STREQUAL "FreeBSD") OR APPLE) return() endif() +if(NOT USE_JEMALLOC) + return() +endif() + add_definitions(-DUSE_JEMALLOC) find_path(JEMALLOC_INCLUDE_DIR NAMES From d0da6c63c1e189d568d93094ce561346080ad157 Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Sun, 13 Mar 2022 22:44:41 -0700 Subject: [PATCH 071/138] Rollforward out of date nodes, compaction fixes --- fdbserver/PaxosConfigConsumer.actor.cpp | 63 +++++++++++++++++++++---- 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/fdbserver/PaxosConfigConsumer.actor.cpp b/fdbserver/PaxosConfigConsumer.actor.cpp index cc39b91df8..875b34cae4 100644 --- a/fdbserver/PaxosConfigConsumer.actor.cpp +++ b/fdbserver/PaxosConfigConsumer.actor.cpp @@ -44,6 +44,7 @@ class GetCommittedVersionQuorum { std::vector cfis; std::map> replies; std::map priorVersions; + std::map committed; // Last durably committed version. Version lastSeenVersion; size_t totalRepliesReceived{ 0 }; @@ -95,18 +96,42 @@ class GetCommittedVersionQuorum { rollback, nodeVersion.lastCommitted, target, reply.changes, reply.annotations }), SERVER_KNOBS->GET_COMMITTED_VERSION_TIMEOUT)); } catch (Error& e) { - if (e.code() == error_code_transaction_too_old) { + if (e.code() == error_code_version_already_compacted || e.code() == error_code_process_behind) { + // In the case of an already_compacted or process_behind + // error, fetch the latest snapshot and attempt to roll the + // node forward. 
+ TEST(true); // PaxosConfigConsumer rollforward compacted or behind ConfigNode + + try { + ConfigFollowerGetSnapshotAndChangesReply reply = + wait(timeoutError(basicLoadBalance(quorumCfi, + &ConfigFollowerInterface::getSnapshotAndChanges, + ConfigFollowerGetSnapshotAndChangesRequest{ target }), + SERVER_KNOBS->GET_SNAPSHOT_AND_CHANGES_TIMEOUT)); + if (reply.changes.size() == 0 || reply.changes[reply.changes.size() - 1].version < target) { + return Void(); + } + + int64_t rollbackTo = reply.changes[0].version - 1; + if (rollback.present()) { + rollbackTo = std::min(rollbackTo, rollback.get()); + } + wait(timeoutError( + cfi.rollforward.getReply(ConfigFollowerRollforwardRequest{ + rollbackTo, nodeVersion.lastCommitted, target, reply.changes, reply.annotations }), + SERVER_KNOBS->GET_COMMITTED_VERSION_TIMEOUT)); + } catch (Error& e2) { + if (e2.code() != error_code_transaction_too_old) { + throw; + } + } + } else if (e.code() == error_code_transaction_too_old) { // Seeing this trace is not necessarily a problem. There // are legitimate scenarios where a ConfigNode could return // transaction_too_old in response to a rollforward // request. TraceEvent(SevInfo, "ConfigNodeRollforwardError").error(e); } else { - // In the case of an already_compacted error, the retry - // loop will fetch the latest snapshot and a rollforward - // request will eventually be resent. 
- TEST(e.code() == - error_code_version_already_compacted); // PaxosConfigConsumer rollforward compacted ConfigNode throw; } } @@ -119,6 +144,7 @@ class GetCommittedVersionQuorum { ConfigFollowerGetCommittedVersionReply reply = wait(timeoutError(cfi.getCommittedVersion.getReply(ConfigFollowerGetCommittedVersionRequest{}), SERVER_KNOBS->GET_COMMITTED_VERSION_TIMEOUT)); + self->committed[cfi.address()] = reply.lastCommitted; ++self->totalRepliesReceived; self->largestCommitted = std::max(self->largestCommitted, reply.lastCommitted); @@ -166,7 +192,11 @@ class GetCommittedVersionQuorum { } catch (Error& e) { // Count a timeout as a reply. ++self->totalRepliesReceived; - if (e.code() != error_code_timed_out) { + if (e.code() == error_code_version_already_compacted) { + if (self->quorumVersion.canBeSet()) { + self->quorumVersion.sendError(e); + } + } else if (e.code() != error_code_timed_out) { throw; } else if (self->totalRepliesReceived == self->cfis.size() && self->quorumVersion.canBeSet() && !self->quorumVersion.isError()) { @@ -217,6 +247,16 @@ public: ASSERT(isReady()); return replies.at(quorumVersion.getFuture().get().versions.lastCommitted); } + Version getSmallestCommitted() const { + if (committed.size() == cfis.size()) { + Version smallest = MAX_VERSION; + for (const auto& [key, value] : committed) { + smallest = std::min(smallest, value); + } + return smallest; + } + return invalidVersion; + } Future complete() const { return waitForAll(actors); } }; @@ -224,6 +264,7 @@ class PaxosConfigConsumerImpl { std::vector cfis; GetCommittedVersionQuorum getCommittedVersionQuorum; Version lastSeenVersion{ 0 }; + Version compactionVersion{ 0 }; double pollingInterval; Optional compactionInterval; UID id; @@ -236,13 +277,15 @@ class PaxosConfigConsumerImpl { return quorumVersion.versions.lastCommitted; } + // Periodically compact knob changes on the configuration nodes. All nodes + // must have received a version before it can be compacted. 
ACTOR static Future compactor(PaxosConfigConsumerImpl* self, ConfigBroadcaster* broadcaster) { if (!self->compactionInterval.present()) { wait(Never()); return Void(); } loop { - state Version compactionVersion = self->lastSeenVersion; + state Version compactionVersion = self->compactionVersion; wait(delayJittered(self->compactionInterval.get())); std::vector> compactionRequests; compactionRequests.reserve(compactionRequests.size()); @@ -277,6 +320,8 @@ class PaxosConfigConsumerImpl { .detail("AnnotationsSize", reply.annotations.size()); ASSERT_GE(committedVersion, self->lastSeenVersion); self->lastSeenVersion = committedVersion; + Version smallestCommitted = self->getCommittedVersionQuorum.getSmallestCommitted(); + self->compactionVersion = std::max(self->compactionVersion, smallestCommitted); broadcaster->applySnapshotAndChanges(std::move(reply.snapshot), reply.snapshotVersion, reply.changes, @@ -334,6 +379,8 @@ class PaxosConfigConsumerImpl { } } self->lastSeenVersion = committedVersion; + Version smallestCommitted = self->getCommittedVersionQuorum.getSmallestCommitted(); + self->compactionVersion = std::max(self->compactionVersion, smallestCommitted); broadcaster->applyChanges(reply.changes, committedVersion, reply.annotations, From 6a28bddd351bad4c6706c7cf17c36af6c85f061a Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Mon, 14 Mar 2022 17:24:25 -0300 Subject: [PATCH 072/138] Fix file names in copyright headers (#6578) --- bindings/c/test/workloads/SimpleWorkload.cpp | 2 +- bindings/c/test/workloads/workloads.cpp | 2 +- fdbserver/BackupProgress.actor.h | 2 +- fdbserver/ProxyCommitData.actor.h | 2 +- fdbserver/RestoreController.actor.h | 2 +- flow/ArgParseUtil.h | 4 ++-- flow/BooleanParam.h | 2 +- flow/folly_memcpy.h | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/bindings/c/test/workloads/SimpleWorkload.cpp b/bindings/c/test/workloads/SimpleWorkload.cpp index d869845a06..5cf7cf8344 100644 --- 
a/bindings/c/test/workloads/SimpleWorkload.cpp +++ b/bindings/c/test/workloads/SimpleWorkload.cpp @@ -1,5 +1,5 @@ /* - * workloads.h + * SimpleWorkload.cpp * * This source file is part of the FoundationDB open source project * diff --git a/bindings/c/test/workloads/workloads.cpp b/bindings/c/test/workloads/workloads.cpp index 1f01054e84..098f40ef28 100644 --- a/bindings/c/test/workloads/workloads.cpp +++ b/bindings/c/test/workloads/workloads.cpp @@ -1,5 +1,5 @@ /* - * workloads.h + * workloads.cpp * * This source file is part of the FoundationDB open source project * diff --git a/fdbserver/BackupProgress.actor.h b/fdbserver/BackupProgress.actor.h index 74f97d4d3d..aec6f8c5b9 100644 --- a/fdbserver/BackupProgress.actor.h +++ b/fdbserver/BackupProgress.actor.h @@ -1,5 +1,5 @@ /* - * BackupProgress.h + * BackupProgress.actor.h * * This source file is part of the FoundationDB open source project * diff --git a/fdbserver/ProxyCommitData.actor.h b/fdbserver/ProxyCommitData.actor.h index e4498f407a..75b84e5c38 100644 --- a/fdbserver/ProxyCommitData.actor.h +++ b/fdbserver/ProxyCommitData.actor.h @@ -1,5 +1,5 @@ /* - * ProxyCommitData.h + * ProxyCommitData.actor.h * * This source file is part of the FoundationDB open source project * diff --git a/fdbserver/RestoreController.actor.h b/fdbserver/RestoreController.actor.h index 5ff83895f2..1a91eb0b55 100644 --- a/fdbserver/RestoreController.actor.h +++ b/fdbserver/RestoreController.actor.h @@ -1,5 +1,5 @@ /* - * RestoreController.h + * RestoreController.actor.h * * This source file is part of the FoundationDB open source project * diff --git a/flow/ArgParseUtil.h b/flow/ArgParseUtil.h index bdb68ba30a..cc54be02d6 100644 --- a/flow/ArgParseUtil.h +++ b/flow/ArgParseUtil.h @@ -1,5 +1,5 @@ /* - * Arena.h + * ArgParseUtil.h * * This source file is part of the FoundationDB open source project * @@ -37,4 +37,4 @@ Optional extractPrefixedArgument(std::string prefix, std::string ar return arg; } -#endif \ No newline at end of file 
+#endif diff --git a/flow/BooleanParam.h b/flow/BooleanParam.h index d06faea79a..f6ef5744ce 100644 --- a/flow/BooleanParam.h +++ b/flow/BooleanParam.h @@ -1,5 +1,5 @@ /* - * Arena.h + * BooleanParam.h * * This source file is part of the FoundationDB open source project * diff --git a/flow/folly_memcpy.h b/flow/folly_memcpy.h index 7f1b8e443a..e6bdd17946 100644 --- a/flow/folly_memcpy.h +++ b/flow/folly_memcpy.h @@ -1,5 +1,5 @@ /* - * flow.h + * folly_memcpy.h * * This source file is part of the FoundationDB open source project * From 87640673f714a462c1f039adbdf3ddc247d9bc7e Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 14 Mar 2022 16:02:42 -0700 Subject: [PATCH 073/138] add hasWigglePausedServer method; add new sort criteria --- fdbserver/DDTeamCollection.actor.cpp | 26 ++++++++++++++++++++++++-- fdbserver/DDTeamCollection.h | 3 +++ fdbserver/TCInfo.actor.cpp | 12 ++++++++++++ fdbserver/TCInfo.h | 2 ++ 4 files changed, 41 insertions(+), 2 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 58507ffdf6..b145d0eb47 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -200,8 +200,9 @@ public: } int64_t bestLoadBytes = 0; + bool wigglingBestOption = false; // best option contains server in paused wiggle state Optional> bestOption; - std::vector> randomTeams; + std::vector> randomTeams; const std::set completeSources(req.completeSources.begin(), req.completeSources.end()); // Note: this block does not apply any filters from the request @@ -249,9 +250,18 @@ public: (!req.teamMustHaveShards || self->shardsAffectedByTeamFailure->hasShards(ShardsAffectedByTeamFailure::Team( self->teams[currentIndex]->getServerIDs(), self->primary)))) { + + // bestOption doesn't contain wiggling SS while current team does. 
Don't replace bestOption + // in this case + if (bestOption.present() && !wigglingBestOption && + self->teams[currentIndex]->hasWigglePausedServer()) { + continue; + } + bestLoadBytes = loadBytes; bestOption = self->teams[currentIndex]; bestIndex = currentIndex; + wigglingBestOption = self->teams[bestIndex]->hasWigglePausedServer(); } } } @@ -262,7 +272,7 @@ public: while (randomTeams.size() < SERVER_KNOBS->BEST_TEAM_OPTION_COUNT && nTries < SERVER_KNOBS->BEST_TEAM_MAX_TEAM_TRIES) { // If unhealthy team is majority, we may not find an ok dest in this while loop - Reference dest = deterministicRandom()->randomChoice(self->teams); + Reference dest = deterministicRandom()->randomChoice(self->teams); bool ok = dest->isHealthy() && (!req.preferLowerUtilization || dest->hasHealthyAvailableSpace(self->medianAvailableSpace)); @@ -298,8 +308,16 @@ public: int64_t loadBytes = randomTeams[i]->getLoadBytes(true, req.inflightPenalty); if (!bestOption.present() || (req.preferLowerUtilization && loadBytes < bestLoadBytes) || (!req.preferLowerUtilization && loadBytes > bestLoadBytes)) { + + // bestOption doesn't contain wiggling SS while current team does. 
Don't replace bestOption + // in this case + if (bestOption.present() && !wigglingBestOption && randomTeams[i]->hasWigglePausedServer()) { + continue; + } + bestLoadBytes = loadBytes; + bestOption = randomTeams[i]; + wigglingBestOption = randomTeams[i]->hasWigglePausedServer(); } } } @@ -3611,6 +3629,10 @@ void DDTeamCollection::removeLaggingStorageServer(Key zoneId) { disableFailingLaggingServers.set(false); } +bool DDTeamCollection::isWigglePausedServer(const UID& server) const { + return pauseWiggle && pauseWiggle->get() && wigglingId == server; +} + std::vector DDTeamCollection::getRandomHealthyTeam(const UID& excludeServer) { std::vector candidates, backup; for (int i = 0; i < teams.size(); ++i) { diff --git a/fdbserver/DDTeamCollection.h b/fdbserver/DDTeamCollection.h index 86cd92e4cf..c307dd4598 100644 --- a/fdbserver/DDTeamCollection.h +++ b/fdbserver/DDTeamCollection.h @@ -594,6 +594,9 @@ public: void removeLaggingStorageServer(Key zoneId); + // whether server is under wiggling process, but wiggle is paused for some healthy compliance. + bool isWigglePausedServer(const UID& server) const; + // Returns a random healthy team, which does not contain excludeServer.
std::vector getRandomHealthyTeam(const UID& excludeServer); diff --git a/fdbserver/TCInfo.actor.cpp b/fdbserver/TCInfo.actor.cpp index 439a71255e..67a4e0a96c 100644 --- a/fdbserver/TCInfo.actor.cpp +++ b/fdbserver/TCInfo.actor.cpp @@ -154,6 +154,10 @@ bool TCServerInfo::hasHealthyAvailableSpace(double minAvailableSpaceRatio) const return availableSpaceRatio >= minAvailableSpaceRatio; } +bool TCServerInfo::isWigglePausedServer() const { + return collection && collection->isWigglePausedServer(id); +} + Future TCServerInfo::updateServerMetrics() { return TCServerInfoImpl::updateServerMetrics(this); } @@ -431,6 +435,14 @@ bool TCTeamInfo::hasServer(const UID& server) const { return std::find(serverIDs.begin(), serverIDs.end(), server) != serverIDs.end(); } +bool TCTeamInfo::hasWigglePausedServer() const { + for (const auto& server : servers) { + if (server->isWigglePausedServer()) + return true; + } + return false; +} + void TCTeamInfo::addServers(const std::vector& servers) { serverIDs.reserve(servers.size()); for (int i = 0; i < servers.size(); i++) { diff --git a/fdbserver/TCInfo.h b/fdbserver/TCInfo.h index da18d345e7..df08598c4e 100644 --- a/fdbserver/TCInfo.h +++ b/fdbserver/TCInfo.h @@ -97,6 +97,7 @@ public: // If a storage server does not reply its storeType, it will be tracked by failure monitor and removed. 
return (storeType == configStoreType || storeType == KeyValueStoreType::END); } + bool isWigglePausedServer() const; std::pair spaceBytes(bool includeInFlight = true) const; int64_t loadBytes() const; @@ -214,6 +215,7 @@ public: void delref() override { ReferenceCounted::delref(); } bool hasServer(const UID& server) const; + bool hasWigglePausedServer() const; void addServers(const std::vector& servers) override; From baec03090e130445583d3250c351b221cc70a0be Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Mon, 14 Mar 2022 16:33:09 -0700 Subject: [PATCH 074/138] Fix "guarantee" misspelling --- bindings/flow/FDBLoanerTypes.h | 2 +- fdbclient/FDBTypes.h | 2 +- fdbrpc/FlowTransport.actor.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bindings/flow/FDBLoanerTypes.h b/bindings/flow/FDBLoanerTypes.h index d33a72203c..79af2f96f5 100644 --- a/bindings/flow/FDBLoanerTypes.h +++ b/bindings/flow/FDBLoanerTypes.h @@ -161,7 +161,7 @@ struct RangeResultRef : VectorRef { // False implies that no such values remain Optional readThrough; // Only present when 'more' is true. When present, this value represent the end (or // beginning if reverse) of the range - // which was read to produce these results. This is guarenteed to be less than the requested range. + // which was read to produce these results. This is guaranteed to be less than the requested range. bool readToBegin; bool readThroughEnd; diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index f405fa7d13..a10949aadc 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -656,7 +656,7 @@ struct RangeResultRef : VectorRef { // limits requested) False implies that no such values remain Optional readThrough; // Only present when 'more' is true. When present, this value represent the end (or // beginning if reverse) of the range which was read to produce these results. This is - // guarenteed to be less than the requested range. 
+ // guaranteed to be less than the requested range. bool readToBegin; bool readThroughEnd; diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 4cf799957e..2c59be9af2 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -921,7 +921,7 @@ ACTOR static void deliver(TransportData* self, bool inReadSocket) { // We want to run the task at the right priority. If the priority is higher than the current priority (which is // ReadSocket) we can just upgrade. Otherwise we'll context switch so that we don't block other tasks that might run - // with a higher priority. ReplyPromiseStream needs to guarentee that messages are received in the order they were + // with a higher priority. ReplyPromiseStream needs to guarantee that messages are received in the order they were // sent, so we are using orderedDelay. // NOTE: don't skip delay(0) when it's local deliver since it could cause out of order object deconstruction. if (priority < TaskPriority::ReadSocket || !inReadSocket) { From 7855dc70f255fe78b34d06383554d8d6757d68db Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 14 Mar 2022 16:43:02 -0700 Subject: [PATCH 075/138] add unit test --- fdbserver/DDTeamCollection.actor.cpp | 61 ++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index b145d0eb47..d9187fc0a6 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -5651,6 +5651,62 @@ public: return Void(); } + + ACTOR static Future GetTeam_DeprioritizeWigglePausedTeam() { + Reference policy = Reference( + new PolicyAcross(3, "zoneid", Reference(new PolicyOne()))); + state int processSize = 5; + state int teamSize = 3; + state std::unique_ptr collection = testTeamCollection(teamSize, policy, processSize); + GetStorageMetricsReply mid_avail; + mid_avail.capacity.bytes = 1000 * 1024 * 1024; + mid_avail.available.bytes = 400 * 1024 
* 1024; + mid_avail.load.bytes = 100 * 1024 * 1024; + + GetStorageMetricsReply high_avail; + high_avail.capacity.bytes = 1000 * 1024 * 1024; + high_avail.available.bytes = 800 * 1024 * 1024; + high_avail.load.bytes = 90 * 1024 * 1024; + + collection->addTeam(std::set({ UID(1, 0), UID(2, 0), UID(3, 0) }), true); + collection->addTeam(std::set({ UID(2, 0), UID(3, 0), UID(4, 0) }), true); + collection->disableBuildingTeams(); + collection->setCheckTeamDelay(); + + /* + * Among server teams that have healthy space available, pick the team that is + * least utilized, if the caller says they preferLowerUtilization. + */ + + collection->server_info[UID(1, 0)]->setMetrics(mid_avail); + collection->server_info[UID(2, 0)]->setMetrics(high_avail); + collection->server_info[UID(3, 0)]->setMetrics(high_avail); + collection->server_info[UID(4, 0)]->setMetrics(high_avail); + + collection->wigglingId = UID(4, 0); + collection->pauseWiggle = makeReference>(true); + + bool wantsNewServers = true; + bool wantsTrueBest = true; + bool preferLowerUtilization = true; + bool teamMustHaveShards = false; + std::vector completeSources{ UID(1, 0), UID(2, 0), UID(3, 0) }; + + state GetTeamRequest req(wantsNewServers, wantsTrueBest, preferLowerUtilization, teamMustHaveShards); + req.completeSources = completeSources; + + wait(collection->getTeam(req)); + + std::pair>, bool> resTeam = req.reply.getFuture().get(); + + std::set expectedServers{ UID(1, 0), UID(2, 0), UID(3, 0) }; + ASSERT(resTeam.first.present()); + auto servers = resTeam.first.get()->getServerIDs(); + const std::set selectedServers(servers.begin(), servers.end()); + ASSERT(expectedServers == selectedServers); + + return Void(); + } }; TEST_CASE("DataDistribution/AddTeamsBestOf/UseMachineID") { @@ -5712,3 +5768,8 @@ TEST_CASE("/DataDistribution/GetTeam/ServerUtilizationNearCutoff") { wait(DDTeamCollectionUnitTest::GetTeam_ServerUtilizationNearCutoff()); return Void(); } + 
+TEST_CASE("/DataDistribution/GetTeam/DeprioritizeWigglePausedTeam") { + wait(DDTeamCollectionUnitTest::GetTeam_DeprioritizeWigglePausedTeam()); + return Void(); +} From a7f6a9e8f7702ccb6fe52b8ed711d6824a650f65 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Tue, 15 Mar 2022 00:04:22 +0000 Subject: [PATCH 076/138] [doc] Add "Monitored Metrics" page to the docs. This page contains a description of suggested alerts and dashboard graphs to have to support a production FDB cluster. (Credit to AJ for the actual text.) --- documentation/sphinx/source/index.rst | 3 + .../sphinx/source/monitored-metrics.rst | 1105 +++++++++++++++++ 2 files changed, 1108 insertions(+) create mode 100644 documentation/sphinx/source/monitored-metrics.rst diff --git a/documentation/sphinx/source/index.rst b/documentation/sphinx/source/index.rst index b7377bcc8b..40c7a76279 100644 --- a/documentation/sphinx/source/index.rst +++ b/documentation/sphinx/source/index.rst @@ -38,6 +38,8 @@ The latest changes are detailed in :ref:`release-notes`. The documentation has t * :doc:`administration` contains documentation on administering FoundationDB. +* :doc:`monitored-metrics` contains documentation on monitoring and alerting for FoundationDB. + * :doc:`redwood` contains documentation on Redwood Storage Engine. * :doc:`visibility` contains documentation related to Visibility into FoundationDB. @@ -55,6 +57,7 @@ The latest changes are detailed in :ref:`release-notes`. 
The documentation has t api-reference tutorials administration + monitored-metrics redwood visibility earlier-release-notes diff --git a/documentation/sphinx/source/monitored-metrics.rst b/documentation/sphinx/source/monitored-metrics.rst new file mode 100644 index 0000000000..d48d47e64c --- /dev/null +++ b/documentation/sphinx/source/monitored-metrics.rst @@ -0,0 +1,1105 @@ +**Monitored Metrics** +================================ + +**Database Availability** +------------------------- + +*Database Availability Percentage* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - For our purposes, we’ve defined the database to be +unavailable if any operation (transaction start, read, or commit) cannot +be completed within 5 seconds. Currently, we only monitor transaction +start (get read version) and commit for this metric. We report the +percentage of each minute that the database is not unavailable according +to this definition. + +**How to compute** - Because this is a metric where we value precision, +we compute it by running external processes that each start a new +transaction every 0.5 seconds at system immediate priority and then +committing them. Any time we have a delay exceeding 5 seconds, we +measure the duration of that downtime. We exclude the last 5 seconds of +this downtime, as operations performed during this period don’t satisfy +the definition of unavailability above. + +**How we alert** - We do not alert on this metric. + +*Database Available* +~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - Reports as a point in time measurement once per minute +whether the database is available. + +**How to compute** - This value begins reporting 0 anytime the process +described in ‘Database Availability Percentage’ detects an operation +taking longer than 5 seconds and resets to 1 whenever an operation +completes. + +**How we alert** - We alert immediately whenever this value is 0. 
+ +*Max Unavailability Seconds* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - Reports the largest period of unavailability +overlapping a given minute. For example, a 3 minute unavailability that +started half way through a minute will report 30, 90, 150, 180 for the 4 +minutes that overlap the unavailability period. + +**How to compute** - The process described in ‘Database Availability +Percentage’ tracks and reports this data for each minute of the +unavailability period. + +**How we alert** - We do not alert on this metric, though it could +possibly be combined with ‘Database Available’ to create an alert that +fires when unavailability reaches a minimum duration. + +**Fault Tolerance** +------------------- + +*Data Loss Margin* +~~~~~~~~~~~~~~~~~~ + +**Explanation** - Reports the number of fault tolerance domains (e.g. +separate Zone IDs) that can be safely lost without data loss. Fault +tolerance domains are typically assigned to correspond to something like +racks or machines, so for this metric you would be measuring something +like the number of racks that could be lost. + +**How to compute** - From status: + +cluster.fault_tolerance.max_machine_failures_without_losing_data + +**How we alert** - We do not alert on this metric because the +Availability Loss Margin alert captures the same circumstances and more. + +*Availability Loss Margin* +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - Reports the number of fault tolerance domains (e.g. +separate Zone IDs) that can be safely lost without indefinite +availability loss. + +**How to compute** - From status: + +cluster.fault_tolerance.max_machine_failures_without_losing_availability + +**How we alert** - We have 3 different alerts on this metric, some based +on a relative measure with expected fault tolerance (e.g. 2 for triple, +1 for double): + +1. Fault tolerance is 2 less than expected (only relevant with at least + triple redundancy) + +2.
Fault tolerance is 1 less than expected for 3 hours (to allow for + self-healing) + +3. Fault tolerance decreases more than 4 times in 1 hour (may indicate + flapping) + +*Maintenance Mode* +~~~~~~~~~~~~~~~~~~ + +**Explanation** - Whether or not maintenance mode has been activated, +which treats a zone as failed but doesn’t invoke data movement for it. + +**How to compute** - Maintenance mode is on if the following metric is +present in status: + +cluster.maintenance_seconds_remaining + +**How we alert** - We do not alert on this metric. + +**Process and Machine Count** +----------------------------- + +*Process Count* +~~~~~~~~~~~~~~~ + +**Explanation** - The number of processes in the cluster, not counting +excluded processes. + +**How to compute** - Count the number of entries in the +cluster.processes array where excluded is not true. + +**How we alert** - We have 3 different alerts on this metric, some based +on a relative measure with the expected count: + +1. The process count decreases 5 times in 60 minutes (may indicate + flapping) + +2. The process count is less than 70% of expected + +3. The process count does not match expected (low severity notification) + +*Excluded Process Count* +~~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The number of processes in the cluster that are +excluded. + +**How to compute** - Count the number of entries in the +cluster.processes array where excluded is true. + +**How we alert** - We do not alert on this metric. + +*Expected Process Count* +~~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The expected number of non-excluded processes in the +cluster. + +**How to compute** - We determine this number from how we’ve configured +the cluster. + +**How we alert** - We do not alert on this metric. + +*Machine Count* +~~~~~~~~~~~~~~~ + +**Explanation** - The number of machines in the cluster, not counting +excluded machines. This number may not be relevant depending on the +environment.
+ +**How to compute** - Count the number of entries in the cluster.machines +array where excluded is not true. + +**How we alert** - We have 3 different alerts on this metric, some based +on a relative measure with the expected count: + +1. The machine count decreases 5 times in 60 minutes (may indicate + flapping) + +2. The machine count is less than 70% of expected + +3. The machine count does not match expected (low severity notification) + +*Excluded Machine Count* +~~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The number of machines in the cluster that are +excluded. + +**How to compute** - Count the number of entries in the cluster.machines +array where excluded is true. + +**How we alert** - We do not alert on this metric. + +*Expected Machine Count* +~~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The expected number of non-excluded machines in the +cluster. + +**How to compute** - We determine this number from how we’ve configured +the cluster. + +**How we alert** - We do not alert on this metric. + +**Latencies** +------------- + +*GRV Probe Latency* +~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The latency to get a read version as measured by the +cluster controller’s status latency probe. + +**How to compute** - From status: + +cluster.latency_probe.transaction_start_seconds + +**How we alert** - We have multiple alerts at different severities +depending on the magnitude of the latency. The specific magnitudes +depend on the details of the cluster and the guarantees provided. +Usually, we require elevated latencies over multiple minutes (e.g. 2 out +of 3) to trigger an alert. + +*Read Probe Latency* +~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The latency to read a key as measured by the cluster +controller’s status latency probe. Notably, this will only test a read +from a single storage server during any given probe and to only a single +team when measured over multiple probes.
Data distribution could +sometimes change which team is responsible for the probed key. + +**How to compute** - From status: + +cluster.latency_probe.read_seconds + +**How we alert** - We have multiple alerts at different severities +depending on the magnitude of the latency. The specific magnitudes +depend on the details of the cluster and the guarantees provided. +Usually, we require elevated latencies over multiple minutes (e.g. 2 out +of 3) to trigger an alert. + +*Commit Probe Latency* +~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The latency to commit a transaction as measured by the +cluster controller’s status latency probe. + +**How to compute** - From status: + +cluster.latency_probe.commit_seconds + +**How we alert** - We have multiple alerts at different severities +depending on the magnitude of the latency. The specific magnitudes +depend on the details of the cluster and the guarantees provided. +Usually, we require elevated latencies over multiple minutes (e.g. 2 out +of 3) to trigger an alert. + +*Client GRV Latency* +~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - A sampled distribution of get read version latencies +as measured on the clients. + +**How to compute** - The use of this functionality is currently not well +documented. + +**How we alert** - We do not alert on this metric. + +*Client Read Latency* +~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - A sampled distribution of read latencies as measured +on the clients. + +**How to compute** - The use of this functionality is currently not well +documented. + +**How we alert** - We do not alert on this metric. + +*Client Commit Latency* +~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - A sampled distribution of commit latencies as measured +on the clients. + +**How to compute** - The use of this functionality is currently not well +documented. + +**How we alert** - We do not alert on this metric. 
+ +**Workload** +------------ + +*Transaction Starts per Second* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The number of read versions issued per second. + +**How to compute** - From status: + +cluster.workload.transactions.started.hz + +**How we alert** - We do not alert on this metric. + +*Conflicts per Second* +~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The number of transaction conflicts per second. + +**How to compute** - From status: + +cluster.workload.transactions.conflicted.hz + +**How we alert** - We do not alert on this metric. + +*Commits per Second* +~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The number of transactions successfully committed per +second. + +**How to compute** - From status: + +cluster.workload.transactions.committed.hz + +**How we alert** - We do not alert on this metric. + +*Conflict Rate* +~~~~~~~~~~~~~~~ + +**Explanation** - The rate of conflicts relative to the total number of +committed and conflicted transactions. + +**How to compute** - Derived from the conflicts and commits per second +metrics: + +conflicts_per_second / (conflicts_per_second + commits_per_second) + +**How we alert** - We do not alert on this metric. + +*Reads per Second* +~~~~~~~~~~~~~~~~~~ + +**Explanation** - The total number of read operations issued per second +to storage servers. + +**How to compute** - From status: + +cluster.workload.operations.reads.hz + +**How we alert** - We do not alert on this metric. + +*Keys Read per Second* +~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The total number of keys read per second. + +**How to compute** - From status: + +cluster.workload.keys.read.hz + +**How we alert** - We do not alert on this metric. + +*Bytes Read per Second* +~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The total number of bytes read per second. + +**How to compute** - From status: + +cluster.workload.bytes.read.hz + +**How we alert** - We do not alert on this metric. 
+ +*Writes per Second* +~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The total number of mutations committed per second. + +**How to compute** - From status: + +cluster.workload.operations.writes.hz + +**How we alert** - We do not alert on this metric. + +*Bytes Written Per Second* +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The total number of mutation bytes committed per +second. + +**How to compute** - From status: + +cluster.workload.bytes.written.hz + +**How we alert** - We do not alert on this metric. + +**Recoveries** +-------------- + +*Cluster Generation* +~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The cluster generation increases when there is a +cluster recovery (i.e. the write subsystem gets restarted). For a +successful recovery, the generation usually increases by 2. If it only +increases by 1, that could indicate that a recovery is stalled. If it +increases by a lot, that might suggest that multiple recoveries are +taking place. + +**How to compute** - From status: + +cluster.generation + +**How we alert** - We alert if the generation increases in 5 separate +minutes in a 60 minute window. + +**Cluster Load** +---------------- + +*Ratekeeper Limit* +~~~~~~~~~~~~~~~~~~ + +**Explanation** - The number of transactions that the cluster is +allowing to start per second + +**How to compute** - From status: + +cluster.qos.transactions_per_second_limit + +**How we alert** - We do not alert on this metric. + +*Ratekeeper Batch Priority Limit* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The number of transactions that the cluster is +allowing to start per second above which batch priority transactions +will not be allowed to start. + +**How to compute** - From status: + +cluster.qos.batch_transactions_per_second_limit + +**How we alert** - We do not alert on this metric. + +*Ratekeeper Released Transactions* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The number of transactions that the cluster is +releasing per second. 
If this number is near or above the ratekeeper +limit, that would indicate that the cluster is saturated and you may see +an increase in the get read version latencies. + +**How to compute** - From status: + +cluster.qos.released_transactions_per_second + +**How we alert** - We do not alert on this metric. + +*Max Storage Queue* +~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The largest write queue on a storage server, which +represents data being stored in memory that has not been persisted to +disk. With the default knobs, the target queue size is 1.0GB, and +ratekeeper will start trying to reduce the transaction rate when a +storage server’s queue size reaches 900MB. Depending on the replication +mode, the cluster allows all storage servers from one fault domain (i.e. +ZoneID) to exceed this limit without trying to adjust the transaction +rate in order to account for various failure scenarios. Storage servers +with a queue that reaches 1.5GB (the e-brake) will stop fetching +mutations from the transaction logs until they are able to flush some of +their data from memory. As of 6.1, batch priority transactions are +limited when the queue size reaches a smaller threshold (default target +queue size of 500MB). + +**How to compute** - From status: + +cluster.qos.worst_queue_bytes_storage_server + +**How we alert** - We alert when the largest queue exceeds 500MB for 30 +minutes in a 60 minute window. + +*Limiting Storage Queue* +~~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The largest write queue on a storage server that isn’t +being ignored for ratekeeper purposes (see max storage queue for +details). If this number is large, ratekeeper will start limiting the +transaction rate. + +**How to compute** - From status: + +cluster.qos.limiting_queue_bytes_storage_server + +**How we alert** - We alert when the limiting queue exceeds 500MB for 10 +consecutive minutes. 
+ +*Max Log Queue* +~~~~~~~~~~~~~~~ + +**Explanation** - The largest write queue on a transaction log, which +represents data that is being stored in memory on the transaction log +but has not yet been made durable on all applicable storage servers. +With the default knobs, the target queue size is 2.4GB, and ratekeeper +will start trying to reduce the transaction rate when a transaction +log’s queue size reaches 2.0GB. When the queue reaches 1.5GB, the +transaction log will start spilling mutations to a persistent structure +on disk, which allows the mutations to be flushed from memory and +reduces the queue size. During a storage server failure, you will see +the queue size grow to this spilling threshold and ideally hold steady +at that point. As of 6.1, batch priority transactions are limited when +the queue size reaches a smaller threshold (default target queue size of +1.0GB). + +**How to compute** - From status: + +cluster.qos.worst_queue_bytes_log_server + +**How we alert** - We alert if the log queue is notably larger than the +spilling threshold (>1.6GB) for 3 consecutive minutes. + +*Storage Read Queue* +~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The number of in flight read requests on a storage +server. We track the average and maximum of the queue size over all +storage processes in the cluster. + +**How to compute** - From status (storage role only): + +cluster.processes..roles[n].query_queue_max + +**How we alert** - We do not alert on this metric. + +*Storage and Log Input Rates* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The number of bytes being input to each storage server +or transaction log for writes as represented in memory. This includes +various overhead for the data structures required to store the data, and +the magnitude of this overhead is different on storage servers and logs. +This data lives in memory for at least 5 seconds, so if the rate is too +high it can result in large queues. 
We track the average and maximum +input rates over all storage processes in the cluster. + +**How to compute** - From status (storage and log roles only): + +cluster.processes..roles[n].input_bytes.hz + +**How we alert** - We alert if the log input rate is larger than 80MB/s +for 20 out of 60 minutes, which can be an indication that we are using a +sizable fraction of our logs’ capacity. + +*Storage Server Operations and Bytes Per Second* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - We track the number of mutations, mutation bytes, +reads, and read bytes per second on each storage server. We use this +primarily to track whether a single replica contains a hot shard +receiving an outsized number of reads or writes. To do so, we monitor +the maximum, average, and “2nd team” rate. Comparing the maximum and 2nd +team can sometimes indicate a hot shard. + +**How to compute** - From status (storage roles only): + +| cluster.processes..roles[n].mutations.hz +| cluster.processes..roles[n].mutation_bytes.hz +| cluster.processes..roles[n].finished_queries.hz +| cluster.processes..roles[n].bytes_queried.hz + +To estimate the rate for the 2nd team (i.e the team that is the 2nd +busiest in the cluster), we ignore the top replication_factor storage +processes. + +**How we alert** - We do not alert on these metrics. + +*Transaction Log to Storage Server Lag* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - How far behind the latest mutations on the storage +servers are from those on the transaction logs, measured in seconds. In +addition to monitoring the average and maximum lag, we also measure what +we call the “worst replica lag”, which is an estimate of the worst lag +for a whole replica of data. + +During recoveries of the write subsystem, this number can temporarily +increase because the database is advanced by many seconds worth of +versions. 
+
+When a missing storage server rejoins, if its data hasn’t been
+re-replicated yet it will appear with a large lag that should steadily
+decrease as it catches up.
+
+A storage server that ratekeeper allows to exceed the target queue size
+may eventually start lagging if it remains slow.
+
+**How to compute** - From status (storage roles only):
+
+cluster.processes..roles[n].data_lag.seconds
+
+To compute the “worst replica lag”, we ignore the lag for all storage
+servers in the first N-1 fault domains, where N is the minimum number of
+replicas remaining across all data shards as reported by status at:
+
+cluster.data.state.min_replicas_remaining
+
+**How we alert** - We alert when the maximum lag exceeds 4 hours for a
+duration of 2 minutes or if it exceeds 1000 seconds for a duration of 60
+minutes. A more sophisticated alert may only alert if the lag is large
+and not decreasing.
+
+We also alert when the worst replica lag exceeds 15 seconds for 3
+consecutive minutes.
+
+*Storage Server Durability Lag*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**Explanation** - How far behind in seconds the mutations on a
+storage server’s disk are from the latest mutations in that storage
+server’s memory. A large lag can mean that the storage server
+isn’t keeping up with the mutation rate, and the queue size can grow as
+a result. We monitor the average and maximum durability lag for the
+cluster.
+
+**How to compute** - From status (storage roles only):
+
+cluster.processes..roles[n].durability_lag.seconds
+
+**How we alert** - We do not alert on this metric.
+
+**Other Cluster Metrics**
+-------------------------
+
+*Data Movement*
+~~~~~~~~~~~~~~~
+
+**Explanation** - How much data is actively being moved or queued to be
+moved between shards in the cluster. There is often a small amount of
+rebalancing movement happening to keep the cluster well distributed, but
+certain failures and maintenance operations can cause a lot of movement.
+ +**How to compute** - From status: + +| cluster.data.moving_data.in_flight_bytes +| cluster.data.moving_data.in_queue_bytes + +**How we alert** - We do not alert on this metric + +*Coordinators* +~~~~~~~~~~~~~~ + +**Explanation** - The number of coordinators in the cluster, both as +configured and that are reachable from our monitoring agent. + +**How to compute** - This list of coordinators can be found in status: + +cluster.coordinators.coordinators + +Each coordinator in the list also reports if it is reachable: + +cluster.coordinators.coordinators.reachable + +**How we alert** - We alert if there are any unreachable coordinators +for a duration of 3 hours or more. + +*Clients* +~~~~~~~~~ + +**Explanation** - A count of connected clients and incompatible clients. +Currently, a large number of connected clients can be taxing for some +parts of the cluster. Having incompatible clients may indicate a +client-side misconfiguration somewhere. + +**How to compute** - The connected client count can be obtained from +status directly: + +cluster.clients.count  + +To get the incompatible client count, we read the following list from +status and count the number of entries. Note that this is actually a +list of incompatible connections, which could theoretically include +incompatible server processes: + +cluster.incompatible_connections + +**How we alert** - We alert if the number of connected clients exceeds +1500 for 10 minutes. We also have a low priority alert if there are any +incompatible connections for a period longer than 3 hours. + +**Resource Usage** +------------------ + +*CPU Usage* +~~~~~~~~~~~ + +**Explanation** - Percentage of available CPU resources being used. We +track the average and maximum values for each process (as a fraction of +1 core) and each machine (as a fraction of all logical cores). A useful +extension of this would be to track the average and/or max per cluster +role to highlight which parts of the cluster are heavily utilized. 
+ +**How to compute** - All of these metrics can be obtained from status. +For processes: + +cluster.processes..cpu.usage_cores + +For machines: + +cluster.machines..cpu.logical_core_utilization + +To get the roles assigned to each process: + +cluster.processes..roles[n].role + +**How we alert** - We do not alert on this metric. + +*Disk Activity* +~~~~~~~~~~~~~~~ + +**Explanation** - Various metrics for how the disks are being used. We +track averages and maximums for disk reads per second, disk writes per +second, and disk busyness percentage. + +**How to compute** - All of these metrics can be obtained from status. +For reads: + +cluster.processes..disk.reads.hz + +For writes: + +cluster.processes..disk.writes.hz + +For busyness (as a fraction of 1): + +cluster.processes..disk.busy + +**How we alert** - We do not alert on this metric. + +*Memory Usage* +~~~~~~~~~~~~~~ + +**Explanation** - How much memory is being used by each process and on +each machine. We track this in absolute numbers and as a percentage with +both averages and maximums. + +**How to compute** - All of these metrics can be obtained from status. +For process absolute memory: + +cluster.processes..memory.used_bytes + +For process memory used percentage, divide used memory by available +memory: + +cluster.processes..memory.available_bytes + +For machine absolute memory: + +cluster.machines..memory.committed_bytes + +For machine memory used percentage, divide used memory by free memory: + +cluster.machines..memory.free_bytes + +**How we alert** - We do not alert on this metric. + +*Network Activity* +~~~~~~~~~~~~~~~~~~ + +**Explanation** - Input and output network rates for processes and +machines in megabits per second (Mbps). We track averages and maximums +for each. + +**How to compute** - All of these metrics can be obtained from status. 
+For process traffic: + +| cluster.processes..network.megabits_received.hz +| cluster.processes..network.megabits_sent.hz + +For machine traffic: + +| cluster.machines..network.megabits_received.hz +| cluster.machines..network.megabits_sent.hz + +**How we alert** - We do not alert on this metric. + +*Network Connections* +~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - Statistics about open connection and connection +activity. For each process, we track the number of connections, the +number of connections opened per second, the number of connections +closed per second, and the number of connection errors per second. + +**How to compute** - All of these metrics can be obtained from status: + +| cluster.processes..network.current_connections +| cluster.processes..network.connections_established.hz +| cluster.processes..network.connections_closed.hz +| cluster.processes..network.connection_errors.hz + +**How we alert** - We do not alert on this metric. + +*Network Retransmits* +~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The number of TCP segments retransmitted per second +per machine. + +**How to compute** - From status: + +cluster.machines..network.tcp_segments_retransmitted.hz + +**How we alert** - We do not alert on this metric. + +**Space Usage** +--------------- + +*Dataset Size* +~~~~~~~~~~~~~~ + +**Explanation** - The logical size of the database (i.e. the estimated +sum of key and value sizes) and the physical size of the database (bytes +used on disk). We also report an overhead factor, which is the physical +size divided by the logical size. Typically this is marginally larger +than the replication factor. + +**How to compute** - From status: + +| cluster.data.total_kv_size_bytes +| cluster.data.total_disk_used_bytes + +**How we alert** - We do not alert on this metric. + +*Process Space Usage* +~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - Various metrics relating to the space usage on each +process. 
We track the amount of space free on each process, reporting +minimums and averages for absolute bytes and as a percentage. We also +track the amount of space available to each process, which includes +space within data files that is reusable. For available space, we track +the minimum available to storage processes and the minimum available for +the transaction logs’ queues and kv-stores as percentages. + +Running out of disk space can be a difficult situation to resolve, and +it’s important to be proactive about maintaining some buffer space. + +**How to compute** - All of these metrics can be obtained from status. +For process free bytes: + +cluster.processes..disk.free_bytes + +For process free percentage, divide free bytes by total bytes: + +cluster.processes..disk.total_bytes + +For available percentage divide available bytes by total bytes. The +first is for kv-store data structures, present in storage and log roles: + +| cluster.processes..roles[n].kvstore_available_bytes +| cluster.processes..roles[n].kvstore_total_bytes + +The second is for the queue data structure, present only in log roles: + +| cluster.processes..roles[n].queue_disk_available_bytes +| cluster.processes..roles[n].queue_disk_total_bytes + +**How we alert** - We alert when free space on any process falls below +15%. We also alert with low severity when available space falls below +35% and with higher severity when it falls below 25%. + +*Cluster Disk Space* +~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - An accounting of the amount of space on all disks in +the cluster as well as how much of that space is free and available, +counted separately for storage and log processes. Available space has +the same meaning as described in the “Process Space Usage” section +above, as measured on each process’s kv-store. + +**How to compute** - This needs to be aggregated from metrics in status. 
+For storage and log roles, the per-process values can be obtained from: + +| cluster.processes..roles[n].kvstore_total_bytes +| cluster.processes..roles[n].kvstore_free_bytes +| cluster.processes..roles[n].kvstore_available_bytes + +To compute totals for the cluster, these numbers would need to be summed +up across all processes in the cluster for each role. If you have +multiple processes sharing a single disk, then you can use the locality +API to tag each process with an identifier for its disk and then read +them back out with: + +cluster.processes..locality. + +In this case, you would only count the total and free bytes once per +disk. For available bytes, you would add free bytes once per disk and +(available-free) for each process. + +**How we alert** - We do not alert on this metric. + +**Backup and DR** +----------------- + +*Num Backup/DR Agents Running* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - A count of the number of backup and DR agents +currently connected to the cluster. For DR agents, we track the number +of DR agents where the cluster in question is the destination cluster, +but you could also count the number of agents using the cluster as a +source if needed. + +**How to compute** - From status: + +| cluster.layers.backup.instances_running +| cluster.layers.dr_backup.instances_running +| cluster.layers.dr_backup_dest.instances_running + +**How we alert** - We have a low severity alert if this number differs +at all from the expected value. We have high severity alerts if the +number of running agents is less than half of what is expected or if the +count decreases 5 times in one hour. + +*Num Backup/DR Agents Expected* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - The expected numbers of backup and DR agents in the +cluster. + +**How to compute** - We determine this number from how we’ve configured +the cluster. + +**How we alert** - We do not alert on this metric. 
+ +*Backup/DR Running* +~~~~~~~~~~~~~~~~~~~ + +**Explanation** - Tracks whether backup or DR is running on a cluster. +For our purposes, we only report DR is running on the primary cluster. + +**How to compute** - From status: + +| cluster.layers.backup.tags.default.running_backup +| cluster.layers.dr_backup.tags.default.running_backup + +**How we alert** - We alert if backup is not running for 5 consecutive +minutes or DR is not running for 15 consecutive minutes. Because we only +run backup on primary clusters in a DR pair, we don’t have either of +these alerts on secondary clusters. + +*Backup/DR Rate* +~~~~~~~~~~~~~~~~ + +**Explanation** - The rate at which backup and DR are processing data. +We report rates for both ranges (i.e. copying data at rest) and new +mutations. + +**How to compute** - We can get the total number of bytes of each type +in status: + +| cluster.layers.backup.tags.default.range_bytes_written +| cluster.layers.backup.tags.default.mutation_log_bytes_written +| cluster.layers.dr_backup.tags.default.range_bytes_written +| cluster.layers.dr_backup.tags.default.mutation_log_bytes_written + +To compute a rate, it is necessary to query these values multiple times +and divide the number of bytes that each has increased by the time +elapsed between the queries. + +**How we alert** - See Backup/DR Lag section, where we have an alert +that incorporates rate data. + +*Backup/DR Lag* +~~~~~~~~~~~~~~~ + +**Explanation** - How many seconds behind the most recent mutations a +restorable backup or DR is. A backup or DR is restorable if it contains +a consistent snapshot of some version of the database. For a backup or +DR that is not running or restorable, we do not track lag. 
+
+**How to compute** - From status, you can get the lag from:
+
+| cluster.layers.backup.tags.default.last_restorable_seconds_behind
+| cluster.layers.dr_backup.tags.default.seconds_behind
+
+This would then be combined with whether the backup or DR is running as
+described above and whether it is restorable:
+
+| cluster.layers.backup.tags.default.running_backup_is_restorable
+| cluster.layers.dr_backup.tags.default.running_backup_is_restorable
+
+**How we alert** - We have a low severity alert for a backup that is 30
+minutes behind and a DR that is 5 minutes behind. We have high severity
+alerts for a backup or DR that is 60 minutes behind.
+
+We also have a high severity alert if a backup or DR is behind by at
+least 5 minutes and the total backup/DR rate (combined range and
+mutation bytes) is less than 1000 bytes/s. For backup, this alert occurs
+after being in this state for 30 minutes, and for DR it is after 3
+minutes.
+
+*Backup Seconds Since Last Restorable*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**Explanation** - Measures how many seconds of data have not been backed
+up and could not be restored.
+
+**How to compute** - This uses the same source metric as in backup lag,
+except that we also track it in cases where the backup is not running or
+is not restorable:
+
+cluster.layers.backup.tags.default.last_restorable_seconds_behind
+
+**How we alert** - We do not alert on this metric.
+
+*Datacenter Lag Seconds*
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+**Explanation** - When running a multi-DC cluster with async
+replication, this tracks the lag in seconds between datacenters. It is
+conceptually similar to DR lag when replication is done between 2
+distinct clusters.
+
+**How to compute** - This information can be obtained from status. The
+metric used varies depending on the version.
In 6.1 and older, use the +following metric and divide by 1,000,000: + +cluster.datacenter_version_difference + +In 6.2 and later, use: + +cluster.datacenter_lag.seconds + +**How we alert** - We have not yet defined alerts on this metric. + +*Estimated Backup Size* +~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - This is not being tracked correctly. + +*Process Uptime* +~~~~~~~~~~~~~~~~ + +**Explanation** - How long each process has been running. + +**How to compute** - From status: + +cluster.processes..uptime_seconds + +**How we alert** - We do not alert on this metric. + +*Cluster Health* +~~~~~~~~~~~~~~~~ + +**Explanation** - This is a complicated metric reported by status that +is used to indicate that something about the cluster is not in a desired +state. For example, a cluster will not be healthy if it is unavailable, +is missing replicas of some data, has any running processes with errors, +etc. If the metric indicates the cluster isn’t healthy, running status +in fdbcli can help determine what’s wrong. + +**How to compute** - From status: + +cluster.database_status.healthy + +If the metric is missing, its value is presumed to be false. + +**How we alert** - We do not alert on this metric. + +*Layer Status* +~~~~~~~~~~~~~~ + +**Explanation** - Backup and DR report their statistics through a +mechanism called “layer status”. If this layer status is missing or +invalid, the state of backup and DR cannot be determined. This metric +can be used to track whether the layer status mechanism is working. + +**How to compute** - From status: + +cluster.layers._valid + +If the metric is missing, its value is presumed to be false. + +**How we alert** - We alert if the layer status is invalid for 10 +minutes. + +*Process Errors* +~~~~~~~~~~~~~~~~ + +**Explanation** - We track all errors logged by any process running in +the cluster (including the backup and DR agents). 
+ +**How to compute** - From process trace logs, look for events with +Severity=“40” + +**How we alert** - We receive a daily summary of all errors. + +*Process Notable Warnings* +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Explanation** - We track all notable warnings logged by any process +running in the cluster (including the backup and DR agents). Note that +there can be some noise in these events, so we heavily summarize the +results. + +**How to compute** - From process trace logs, look for events with +Severity=“30” + +**How we alert** - We receive a daily summary of all notable warnings. From 09df9d83fdc9cf2734154104f406a4fd241737b1 Mon Sep 17 00:00:00 2001 From: Yao Xiao <87789492+yao-xiao-github@users.noreply.github.com> Date: Mon, 14 Mar 2022 21:11:44 -0700 Subject: [PATCH 077/138] Enable RocksDB with Valgrind. (#6595) --- fdbserver/SimulatedCluster.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 4408fd500b..e9d0404158 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -2244,7 +2244,7 @@ void setupSimulatedSystem(std::vector>* systemActors, using namespace std::literals; -#if defined(SSD_ROCKSDB_EXPERIMENTAL) && !VALGRIND +#if defined(SSD_ROCKSDB_EXPERIMENTAL) bool rocksDBEnabled = true; #else bool rocksDBEnabled = false; From 1a1c1572467ce3c734c6a1000a9f9e9d7bd76e08 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Tue, 15 Mar 2022 14:55:35 +0100 Subject: [PATCH 078/138] ApiTester: address a thread sanitizer issue --- bindings/c/test/apitester/TesterTransactionExecutor.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bindings/c/test/apitester/TesterTransactionExecutor.cpp b/bindings/c/test/apitester/TesterTransactionExecutor.cpp index 505b55251e..36350eea02 100644 --- a/bindings/c/test/apitester/TesterTransactionExecutor.cpp +++ b/bindings/c/test/apitester/TesterTransactionExecutor.cpp 
@@ -270,6 +270,9 @@ protected: void onFutureReady(FDBFuture* f) { injectRandomSleep(); + // Hold a reference to this to avoid it to be + // destroyed before releasing the mutex + auto thisRef = shared_from_this(); std::unique_lock lock(mutex); auto iter = callbackMap.find(f); ASSERT(iter != callbackMap.end()); @@ -287,7 +290,6 @@ protected: scheduler->schedule(cbInfo.cont); return; } - onError(err); } From c635dcd3adca4ff7ea73cfc460968a5e1538d601 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Sat, 19 Feb 2022 15:09:55 -0800 Subject: [PATCH 079/138] Add tenant support in the FDB native client --- bindings/c/fdb_c.cpp | 29 ++ bindings/c/foundationdb/fdb_c.h | 19 + fdbclient/DatabaseContext.h | 19 +- fdbclient/GenericManagementAPI.actor.h | 155 +++++++ fdbclient/IClientApi.h | 17 + fdbclient/ISingleThreadTransaction.cpp | 17 +- fdbclient/ISingleThreadTransaction.h | 9 +- fdbclient/MultiVersionTransaction.actor.cpp | 135 +++++- fdbclient/MultiVersionTransaction.h | 74 ++++ fdbclient/NativeAPI.actor.cpp | 467 ++++++++++++++++---- fdbclient/NativeAPI.actor.h | 18 +- fdbclient/PaxosConfigTransaction.actor.cpp | 2 +- fdbclient/PaxosConfigTransaction.h | 2 +- fdbclient/ReadYourWrites.actor.cpp | 14 +- fdbclient/ReadYourWrites.h | 5 +- fdbclient/SimpleConfigTransaction.actor.cpp | 2 +- fdbclient/SimpleConfigTransaction.h | 2 +- fdbclient/SpecialKeySpace.actor.cpp | 2 + fdbclient/SystemData.cpp | 2 + fdbclient/SystemData.h | 2 + fdbclient/ThreadSafeTransaction.cpp | 44 +- fdbclient/ThreadSafeTransaction.h | 27 +- fdbclient/vexillographer/fdb.options | 6 +- fdbserver/storageserver.actor.cpp | 2 +- 24 files changed, 937 insertions(+), 134 deletions(-) diff --git a/bindings/c/fdb_c.cpp b/bindings/c/fdb_c.cpp index 6bbc360a1f..5de1253627 100644 --- a/bindings/c/fdb_c.cpp +++ b/bindings/c/fdb_c.cpp @@ -37,12 +37,14 @@ int g_api_version = 0; * FDBFuture -> ThreadSingleAssignmentVarBase * FDBResult -> ThreadSingleAssignmentVarBase * FDBDatabase -> IDatabase + * FDBTenant -> 
ITenant * FDBTransaction -> ITransaction */ #define TSAVB(f) ((ThreadSingleAssignmentVarBase*)(f)) #define TSAV(T, f) ((ThreadSingleAssignmentVar*)(f)) #define DB(d) ((IDatabase*)d) +#define TENANT(t) ((ITenant*)t) #define TXN(t) ((ITransaction*)t) // Legacy (pre API version 610) @@ -386,6 +388,14 @@ extern "C" DLLEXPORT void fdb_database_destroy(FDBDatabase* d) { CATCH_AND_DIE(DB(d)->delref();); } +extern "C" DLLEXPORT fdb_error_t fdb_database_open_tenant(FDBDatabase* d, + uint8_t const* tenant_name, + int tenant_name_length, + FDBTenant** out_tenant) { + CATCH_AND_RETURN(*out_tenant = + (FDBTenant*)DB(d)->openTenant(StringRef(tenant_name, tenant_name_length)).extractPtr();); +} + extern "C" DLLEXPORT fdb_error_t fdb_database_create_transaction(FDBDatabase* d, FDBTransaction** out_transaction) { CATCH_AND_RETURN(Reference tr = DB(d)->createTransaction(); if (g_api_version <= 15) tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); @@ -439,6 +449,25 @@ extern "C" DLLEXPORT FDBFuture* fdb_database_get_server_protocol(FDBDatabase* db }).extractPtr()); } +extern "C" DLLEXPORT FDBFuture* fdb_database_allocate_tenant(FDBDatabase* db, uint8_t const* name, int name_length) { + return (FDBFuture*)(DB(db)->createTenant(StringRef(name, name_length)).extractPtr()); +} + +extern "C" DLLEXPORT FDBFuture* fdb_database_remove_tenant(FDBDatabase* db, uint8_t const* name, int name_length) { + return (FDBFuture*)(DB(db)->deleteTenant(StringRef(name, name_length)).extractPtr()); +} + +extern "C" DLLEXPORT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant, FDBTransaction** out_transaction) { + CATCH_AND_RETURN(*out_transaction = (FDBTransaction*)TENANT(tenant)->createTransaction().extractPtr();); +} + +extern "C" DLLEXPORT void fdb_tenant_destroy(FDBTenant* tenant) { + try { + TENANT(tenant)->delref(); + } catch (...) 
{ + } +} + extern "C" DLLEXPORT void fdb_transaction_destroy(FDBTransaction* tr) { try { TXN(tr)->delref(); diff --git a/bindings/c/foundationdb/fdb_c.h b/bindings/c/foundationdb/fdb_c.h index 5fc64eb741..140e3f4ca0 100644 --- a/bindings/c/foundationdb/fdb_c.h +++ b/bindings/c/foundationdb/fdb_c.h @@ -67,6 +67,7 @@ extern "C" { typedef struct FDB_future FDBFuture; typedef struct FDB_result FDBResult; typedef struct FDB_database FDBDatabase; +typedef struct FDB_tenant FDBTenant; typedef struct FDB_transaction FDBTransaction; typedef int fdb_error_t; @@ -271,6 +272,11 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_database_set_option(FDBDatabase* d, uint8_t const* value, int value_length); +DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_database_open_tenant(FDBDatabase* d, + uint8_t const* tenant_name, + int tenant_name_length, + FDBTenant** out_tenant); + DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_database_create_transaction(FDBDatabase* d, FDBTransaction** out_transaction); @@ -294,6 +300,19 @@ DLLEXPORT WARN_UNUSED_RESULT double fdb_database_get_main_thread_busyness(FDBDat DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_get_server_protocol(FDBDatabase* db, uint64_t expected_version); +DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_allocate_tenant(FDBDatabase* db, + uint8_t const* name, + int name_length); + +DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_remove_tenant(FDBDatabase* db, + uint8_t const* name, + int name_length); + +DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant, + FDBTransaction** out_transaction); + +DLLEXPORT void fdb_tenant_destroy(FDBTenant* tenant); + DLLEXPORT void fdb_transaction_destroy(FDBTransaction* tr); DLLEXPORT void fdb_transaction_cancel(FDBTransaction* tr); diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index f8c8fb58b3..c752c63a32 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -133,6 +133,7 @@ public: }; struct 
WatchParameters : public ReferenceCounted { + const Optional tenant; const Key key; const Optional value; @@ -143,7 +144,8 @@ struct WatchParameters : public ReferenceCounted { const Optional debugID; const UseProvisionalProxies useProvisionalProxies; - WatchParameters(Key key, + WatchParameters(Optional tenant, + Key key, Optional value, Version version, TagSet tags, @@ -151,8 +153,8 @@ struct WatchParameters : public ReferenceCounted { TaskPriority taskID, Optional debugID, UseProvisionalProxies useProvisionalProxies) - : key(key), value(value), version(version), tags(tags), spanID(spanID), taskID(taskID), debugID(debugID), - useProvisionalProxies(useProvisionalProxies) {} + : tenant(tenant), key(key), value(value), version(version), tags(tags), spanID(spanID), taskID(taskID), + debugID(debugID), useProvisionalProxies(useProvisionalProxies) {} }; class WatchMetadata : public ReferenceCounted { @@ -287,9 +289,9 @@ public: void removeWatch(); // watch map operations - Reference getWatchMetadata(KeyRef key) const; - Key setWatchMetadata(Reference metadata); - void deleteWatchMetadata(KeyRef key); + Reference getWatchMetadata(Optional tenant, KeyRef key) const; + void setWatchMetadata(Reference metadata); + void deleteWatchMetadata(Optional tenant, KeyRef key); void clearWatchMetadata(); void setOption(FDBDatabaseOptions::Option option, Optional value); @@ -558,7 +560,10 @@ public: EventCacheHolder connectToDatabaseEventCacheHolder; private: - std::unordered_map> watchMap; + std::unordered_map, Key>, + Reference, + boost::hash, Key>>> + watchMap; }; #endif diff --git a/fdbclient/GenericManagementAPI.actor.h b/fdbclient/GenericManagementAPI.actor.h index 623a8f8afa..2850afab49 100644 --- a/fdbclient/GenericManagementAPI.actor.h +++ b/fdbclient/GenericManagementAPI.actor.h @@ -37,6 +37,9 @@ the contents of the system key space. 
#include "fdbclient/ClientBooleanParams.h" #include "fdbclient/DatabaseConfiguration.h" #include "fdbclient/Status.h" +#include "fdbclient/Subspace.h" +#include "fdbclient/DatabaseConfiguration.h" +#include "fdbclient/Status.h" #include "fdbclient/SystemData.h" #include "flow/actorcompiler.h" // has to be last include @@ -626,6 +629,158 @@ Future changeConfig(Reference db, // used by special keys and fdbcli std::string generateErrorMessage(const CoordinatorsResult& res); +ACTOR template +Future> tryGetTenant(Reference db, TenantName name) { + state Reference tr = db->createTransaction(); + state Key tenantMapKey = name.withPrefix(tenantMapPrefix); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + + Optional val = wait(safeThreadFutureToFuture(tr->get(tenantMapKey))); + return val.map([](Optional v) { return decodeTenantEntry(v.get()); }); + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +ACTOR template +Future getTenant(Reference db, TenantName name) { + Optional entry = wait(tryGetTenant(db, name)); + if (!entry.present()) { + throw tenant_not_found(); + } + + return entry.get(); +} + +ACTOR template +Future createTenant(Reference db, TenantName name) { + if (name.startsWith("\xff"_sr)) { + throw invalid_tenant_name(); + } + + state Reference tr = db->createTransaction(); + state Key tenantMapKey = name.withPrefix(tenantMapPrefix); + + state bool tenantCheckCompleted = false; + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + Optional val = wait(safeThreadFutureToFuture(tr->get(tenantMapKey))); + if (val.present()) { + if (!tenantCheckCompleted) { + throw tenant_already_exists(); + } else { + // If the tenant did not exist when we started trying to create it, then we will return success + // even if someone else created it simultaneously. 
This helps us avoid problems if the commit + // result for creating this tenant is unknown. + return Void(); + } + } else { + tenantCheckCompleted = true; + } + + state Future> tenantDataPrefixFuture = + safeThreadFutureToFuture(tr->get(tenantDataPrefixKey)); + + state Optional lastIdVal = wait(safeThreadFutureToFuture(tr->get(tenantLastIdKey))); + Optional tenantDataPrefix = wait(tenantDataPrefixFuture); + + state TenantMapEntry newTenant(lastIdVal.present() ? TenantMapEntry::prefixToId(lastIdVal.get()) + 1 : 0, + tenantDataPrefix.present() ? (KeyRef)tenantDataPrefix.get() : ""_sr); + + RangeResult contents = wait(safeThreadFutureToFuture(tr->getRange(prefixRange(newTenant.prefix), 1))); + if (!contents.empty()) { + throw tenant_prefix_allocator_conflict(); + } + + tr->set(tenantLastIdKey, TenantMapEntry::idToPrefix(newTenant.id)); + tr->set(tenantMapKey, encodeTenantEntry(newTenant)); + + wait(safeThreadFutureToFuture(tr->commit())); + TraceEvent("CreatedTenant") + .detail("Tenant", name) + .detail("ID", newTenant.id) + .detail("Prefix", newTenant.prefix); + + return Void(); + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +ACTOR template +Future deleteTenant(Reference db, TenantName name) { + state Reference tr = db->createTransaction(); + state Key tenantMapKey = name.withPrefix(tenantMapPrefix); + + state bool tenantCheckCompleted = false; + + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + Optional tenantEntry = wait(tryGetTenant(db, name)); + if (!tenantEntry.present()) { + if (!tenantCheckCompleted) { + throw tenant_not_found(); + } else { + // If the tenant existed when we started trying to delete it, then we will return success + // even if someone else deleted it simultaneously. This helps us avoid problems if the commit + // result for deleting this tenant is unknown. 
+ return Void(); + } + } else { + tenantCheckCompleted = true; + } + + RangeResult contents = + wait(safeThreadFutureToFuture(tr->getRange(prefixRange(tenantEntry.get().prefix), 1))); + if (!contents.empty()) { + throw tenant_not_empty(); + } + + tr->clear(tenantMapKey); + wait(safeThreadFutureToFuture(tr->commit())); + TraceEvent("DeletedTenant").detail("Tenant", name); + return Void(); + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + +ACTOR template +Future>> listTenants(Reference db, StringRef begin, StringRef end, int limit) { + state Reference tr = db->createTransaction(); + state KeyRange range = KeyRangeRef(begin, end).withPrefix(tenantMapPrefix); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + RangeResult results = wait(safeThreadFutureToFuture( + tr->getRange(firstGreaterOrEqual(range.begin), firstGreaterOrEqual(range.end), limit))); + + Standalone> tenants; + for (auto kv : results) { + tenants.push_back_deep(tenants.arena(), kv.key.removePrefix(tenantMapPrefix)); + } + + return tenants; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} } // namespace ManagementAPI #include "flow/unactorcompiler.h" diff --git a/fdbclient/IClientApi.h b/fdbclient/IClientApi.h index 34ab01b445..d6a8e7c102 100644 --- a/fdbclient/IClientApi.h +++ b/fdbclient/IClientApi.h @@ -111,11 +111,22 @@ public: virtual bool isValid() { return true; } }; +class ITenant { +public: + virtual ~ITenant() {} + + virtual Reference createTransaction() = 0; + + virtual void addref() = 0; + virtual void delref() = 0; +}; + // An interface that represents a connection to a cluster made by a client class IDatabase { public: virtual ~IDatabase() {} + virtual Reference openTenant(StringRef tenantName) = 0; virtual Reference createTransaction() = 0; virtual void setOption(FDBDatabaseOptions::Option option, Optional value = Optional()) = 0; 
virtual double getMainThreadBusyness() = 0; @@ -126,6 +137,12 @@ public: virtual ThreadFuture getServerProtocol( Optional expectedVersion = Optional()) = 0; + // Registers a tenant with the given name. A prefix is automatically allocated for the tenant. + virtual ThreadFuture createTenant(StringRef const& tenantName) = 0; + + // Deletes the tenant with the given name. The tenant must be empty. + virtual ThreadFuture deleteTenant(StringRef const& tenantName) = 0; + virtual void addref() = 0; virtual void delref() = 0; diff --git a/fdbclient/ISingleThreadTransaction.cpp b/fdbclient/ISingleThreadTransaction.cpp index f25155036d..da387efa6a 100644 --- a/fdbclient/ISingleThreadTransaction.cpp +++ b/fdbclient/ISingleThreadTransaction.cpp @@ -48,6 +48,21 @@ Reference ISingleThreadTransaction::create(Type type, } else { result = makeReference(); } - result->setDatabase(cx); + result->construct(cx); + return result; +} + +Reference ISingleThreadTransaction::create(Type type, + Database const& cx, + TenantName const& tenant) { + Reference result; + if (type == Type::RYW) { + result = makeReference(); + } else if (type == Type::SIMPLE_CONFIG) { + result = makeReference(); + } else { + result = makeReference(); + } + result->construct(cx, tenant); return result; } diff --git a/fdbclient/ISingleThreadTransaction.h b/fdbclient/ISingleThreadTransaction.h index 23448e4579..cc4054535f 100644 --- a/fdbclient/ISingleThreadTransaction.h +++ b/fdbclient/ISingleThreadTransaction.h @@ -45,8 +45,15 @@ public: }; static ISingleThreadTransaction* allocateOnForeignThread(Type); + static Reference create(Type, Database const&); - virtual void setDatabase(Database const&) = 0; + static Reference create(Type, Database const&, TenantName const&); + + virtual void construct(Database const&) = 0; + virtual void construct(Database const&, TenantName const&) { + // By default, a transaction implementation does not support tenants. 
+ ASSERT(false); + } virtual void setVersion(Version v) = 0; virtual Future getReadVersion() = 0; diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index 6719a43d7d..a775b7a3f3 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -18,7 +18,9 @@ * limitations under the License. */ +#include "fdbclient/FDBOptions.g.h" #include "fdbclient/FDBTypes.h" +#include "fdbclient/GenericManagementAPI.actor.h" #include "fdbclient/MultiVersionTransaction.h" #include "fdbclient/MultiVersionAssignmentVars.h" #include "fdbclient/ClientVersion.h" @@ -382,6 +384,15 @@ void DLTransaction::reset() { api->transactionReset(tr); } +// DLTenant +Reference DLTenant::createTransaction() { + ASSERT(api->tenantCreateTransaction != nullptr); + + FdbCApi::FDBTransaction* tr; + api->tenantCreateTransaction(tenant, &tr); + return Reference(new DLTransaction(api, tr)); +} + // DLDatabase DLDatabase::DLDatabase(Reference api, ThreadFuture dbFuture) : api(api), db(nullptr) { addref(); @@ -401,9 +412,19 @@ ThreadFuture DLDatabase::onReady() { return ready; } +Reference DLDatabase::openTenant(StringRef tenantName) { + if (!api->databaseOpenTenant) { + throw unsupported_operation(); + } + + FdbCApi::FDBTenant* tenant; + throwIfError(api->databaseOpenTenant(db, tenantName.begin(), tenantName.size(), &tenant)); + return makeReference(api, tenant); +} + Reference DLDatabase::createTransaction() { FdbCApi::FDBTransaction* tr; - api->databaseCreateTransaction(db, &tr); + throwIfError(api->databaseCreateTransaction(db, &tr)); return Reference(new DLTransaction(api, tr)); } @@ -473,6 +494,26 @@ ThreadFuture DLDatabase::getServerProtocol(Optional DLDatabase::createTenant(StringRef const& tenantName) { + if (api->databaseAllocateTenant == nullptr) { + throw unsupported_operation(); + } + + FdbCApi::FDBFuture* f = api->databaseAllocateTenant(db, tenantName.begin(), tenantName.size()); + return 
toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { return Void(); }); +} + +// Deletes the tenant with the given name. The tenant must be empty. +ThreadFuture DLDatabase::deleteTenant(StringRef const& tenantName) { + if (api->databaseRemoveTenant == nullptr) { + throw unsupported_operation(); + } + + FdbCApi::FDBFuture* f = api->databaseRemoveTenant(db, tenantName.begin(), tenantName.size()); + return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { return Void(); }); +} + // DLApi // Loads the specified function from a dynamic library @@ -522,6 +563,7 @@ void DLApi::init() { loadClientFunction(&api->stopNetwork, lib, fdbCPath, "fdb_stop_network", headerVersion >= 0); loadClientFunction(&api->createDatabase, lib, fdbCPath, "fdb_create_database", headerVersion >= 610); + loadClientFunction(&api->databaseOpenTenant, lib, fdbCPath, "fdb_database_open_tenant", headerVersion >= 710); loadClientFunction( &api->databaseCreateTransaction, lib, fdbCPath, "fdb_database_create_transaction", headerVersion >= 0); loadClientFunction(&api->databaseSetOption, lib, fdbCPath, "fdb_database_set_option", headerVersion >= 0); @@ -532,6 +574,9 @@ void DLApi::init() { headerVersion >= 700); loadClientFunction( &api->databaseGetServerProtocol, lib, fdbCPath, "fdb_database_get_server_protocol", headerVersion >= 700); + loadClientFunction( + &api->databaseAllocateTenant, lib, fdbCPath, "fdb_database_allocate_tenant", headerVersion >= 710); + loadClientFunction(&api->databaseRemoveTenant, lib, fdbCPath, "fdb_database_remove_tenant", headerVersion >= 710); loadClientFunction(&api->databaseDestroy, lib, fdbCPath, "fdb_database_destroy", headerVersion >= 0); loadClientFunction(&api->databaseRebootWorker, lib, fdbCPath, "fdb_database_reboot_worker", headerVersion >= 700); loadClientFunction(&api->databaseForceRecoveryWithDataLoss, @@ -542,6 +587,10 @@ void DLApi::init() { loadClientFunction( &api->databaseCreateSnapshot, lib, fdbCPath, 
"fdb_database_create_snapshot", headerVersion >= 700); + loadClientFunction( + &api->tenantCreateTransaction, lib, fdbCPath, "fdb_tenant_create_transaction", headerVersion >= 710); + loadClientFunction(&api->tenantDestroy, lib, fdbCPath, "fdb_tenant_destroy", headerVersion >= 710); + loadClientFunction(&api->transactionSetOption, lib, fdbCPath, "fdb_transaction_set_option", headerVersion >= 0); loadClientFunction(&api->transactionDestroy, lib, fdbCPath, "fdb_transaction_destroy", headerVersion >= 0); loadClientFunction( @@ -737,6 +786,7 @@ void DLApi::addNetworkThreadCompletionHook(void (*hook)(void*), void* hookParame // MultiVersionTransaction MultiVersionTransaction::MultiVersionTransaction(Reference db, + Optional> tenant, UniqueOrderedOptionList defaultOptions) : db(db), startTime(timer_monotonic()), timeoutTsav(new ThreadSingleAssignmentVar()) { setDefaultOptions(defaultOptions); @@ -749,18 +799,29 @@ void MultiVersionTransaction::setDefaultOptions(UniqueOrderedOptionListdbState->dbVar->get(); - TransactionInfo newTr; - if (currentDb.value) { - newTr.transaction = currentDb.value->createTransaction(); + if (tenant.present()) { + ASSERT(tenant.get()); + auto currentTenant = tenant.get()->tenantVar->get(); + if (currentTenant.value) { + newTr.transaction = currentTenant.value->createTransaction(); + } + + newTr.onChange = currentTenant.onChange; + } else { + auto currentDb = db->dbState->dbVar->get(); + if (currentDb.value) { + newTr.transaction = currentDb.value->createTransaction(); + } + + newTr.onChange = currentDb.onChange; } Optional timeout; for (auto option : persistentOptions) { if (option.first == FDBTransactionOptions::TIMEOUT) { timeout = option.second.castTo(); - } else if (currentDb.value) { + } else if (newTr.transaction) { newTr.transaction->setOption(option.first, option.second.castTo()); } } @@ -770,13 +831,11 @@ void MultiVersionTransaction::updateTransaction() { // that might inadvertently fail the transaction. 
if (timeout.present()) { setTimeout(timeout); - if (currentDb.value) { + if (newTr.transaction) { newTr.transaction->setOption(FDBTransactionOptions::TIMEOUT, timeout); } } - newTr.onChange = currentDb.onChange; - lock.enter(); transaction = newTr; lock.leave(); @@ -1167,6 +1226,39 @@ bool MultiVersionTransaction::isValid() { return tr.transaction.isValid(); } +// MultiVersionTenant +MultiVersionTenant::MultiVersionTenant(Reference db, StringRef tenantName) + : db(db), tenantName(tenantName) { + updateTenant(); +} + +MultiVersionTenant::~MultiVersionTenant() {} + +Reference MultiVersionTenant::createTransaction() { + return Reference(new MultiVersionTransaction( + db, Reference::addRef(this), db->dbState->transactionDefaultOptions)); +} + +// Creates a new underlying tenant object whenever the database connection changes. This change is signaled +// to open transactions via an AsyncVar. +void MultiVersionTenant::updateTenant() { + Reference tenant; + auto currentDb = db->dbState->dbVar->get(); + if (currentDb.value) { + tenant = currentDb.value->openTenant(tenantName); + } else { + tenant = Reference(nullptr); + } + + tenantVar->set(tenant); + + MutexHolder holder(tenantLock); + tenantUpdater = mapThreadFuture(currentDb.onChange, [this](ErrorOr result) { + updateTenant(); + return Void(); + }); +} + // MultiVersionDatabase MultiVersionDatabase::MultiVersionDatabase(MultiVersionApi* api, int threadIdx, @@ -1241,9 +1333,14 @@ Reference MultiVersionDatabase::debugCreateFromExistingDatabase(Refer return Reference(new MultiVersionDatabase(MultiVersionApi::api, 0, "", db, db, false)); } +Reference MultiVersionDatabase::openTenant(StringRef tenantName) { + return makeReference(Reference::addRef(this), tenantName); +} + Reference MultiVersionDatabase::createTransaction() { - return Reference( - new MultiVersionTransaction(Reference::addRef(this), dbState->transactionDefaultOptions)); + return Reference(new MultiVersionTransaction(Reference::addRef(this), + Optional>(), + 
dbState->transactionDefaultOptions)); } void MultiVersionDatabase::setOption(FDBDatabaseOptions::Option option, Optional value) { @@ -1308,6 +1405,22 @@ ThreadFuture MultiVersionDatabase::getServerProtocol(Optional

versionMonitorDb->getServerProtocol(expectedVersion); } +// Registers a tenant with the given name. A prefix is automatically allocated for the tenant. +ThreadFuture MultiVersionDatabase::createTenant(StringRef const& tenantName) { + Standalone tenantNameCopy = tenantName; + Reference self = Reference::addRef(this); + + return onMainThread([self, tenantNameCopy]() { return ManagementAPI::createTenant(self, tenantNameCopy); }); +} + +// Deletes the tenant with the given name. The tenant must be empty. +ThreadFuture MultiVersionDatabase::deleteTenant(StringRef const& tenantName) { + Standalone tenantNameCopy = tenantName; + Reference self = Reference::addRef(this); + + return onMainThread([self, tenantNameCopy]() { return ManagementAPI::deleteTenant(self, tenantNameCopy); }); +} + MultiVersionDatabase::DatabaseState::DatabaseState(std::string clusterFilePath, Reference versionMonitorDb) : dbVar(new ThreadSafeAsyncVar>(Reference(nullptr))), clusterFilePath(clusterFilePath), versionMonitorDb(versionMonitorDb), closed(false) {} diff --git a/fdbclient/MultiVersionTransaction.h b/fdbclient/MultiVersionTransaction.h index 87556b4c67..5beed79444 100644 --- a/fdbclient/MultiVersionTransaction.h +++ b/fdbclient/MultiVersionTransaction.h @@ -36,6 +36,7 @@ struct FdbCApi : public ThreadSafeReferenceCounted { typedef struct FDB_result FDBResult; typedef struct FDB_cluster FDBCluster; typedef struct FDB_database FDBDatabase; + typedef struct FDB_tenant FDBTenant; typedef struct FDB_transaction FDBTransaction; typedef int fdb_error_t; @@ -120,6 +121,10 @@ struct FdbCApi : public ThreadSafeReferenceCounted { fdb_error_t (*createDatabase)(const char* clusterFilePath, FDBDatabase** db); // Database + fdb_error_t (*databaseOpenTenant)(FDBDatabase* database, + uint8_t const* tenantName, + int tenantNameLength, + FDBTenant** outTenant); fdb_error_t (*databaseCreateTransaction)(FDBDatabase* database, FDBTransaction** tr); fdb_error_t (*databaseSetOption)(FDBDatabase* database, 
FDBDatabaseOption option, @@ -139,6 +144,12 @@ struct FdbCApi : public ThreadSafeReferenceCounted { int snapshotCommandLength); double (*databaseGetMainThreadBusyness)(FDBDatabase* database); FDBFuture* (*databaseGetServerProtocol)(FDBDatabase* database, uint64_t expectedVersion); + FDBFuture* (*databaseAllocateTenant)(FDBDatabase* database, uint8_t const* name, int name_length); + FDBFuture* (*databaseRemoveTenant)(FDBDatabase* database, uint8_t const* name, int name_length); + + // Tenant + fdb_error_t (*tenantCreateTransaction)(FDBTenant* tenant, FDBTransaction** outTransaction); + void (*tenantDestroy)(FDBTenant* tenant); // Transaction fdb_error_t (*transactionSetOption)(FDBTransaction* tr, @@ -361,6 +372,25 @@ private: FdbCApi::FDBTransaction* const tr; }; +class DLTenant : public ITenant, ThreadSafeReferenceCounted { +public: + DLTenant(Reference api, FdbCApi::FDBTenant* tenant) : api(api), tenant(tenant) {} + ~DLTenant() override { + if (tenant) { + api->tenantDestroy(tenant); + } + } + + Reference createTransaction() override; + + void addref() override { ThreadSafeReferenceCounted::addref(); } + void delref() override { ThreadSafeReferenceCounted::delref(); } + +private: + const Reference api; + FdbCApi::FDBTenant* tenant; +}; + // An implementation of IDatabase that wraps a database object created on an externally loaded client library. // All API calls to that database are routed through the external library. class DLDatabase : public IDatabase, ThreadSafeReferenceCounted { @@ -375,6 +405,7 @@ public: ThreadFuture onReady(); + Reference openTenant(StringRef tenantName) override; Reference createTransaction() override; void setOption(FDBDatabaseOptions::Option option, Optional value = Optional()) override; double getMainThreadBusyness() override; @@ -385,6 +416,12 @@ public: ThreadFuture getServerProtocol( Optional expectedVersion = Optional()) override; + // Registers a tenant with the given name. A prefix is automatically allocated for the tenant. 
+ ThreadFuture createTenant(StringRef const& tenantName) override; + + // Deletes the tenant with the given name. The tenant must be empty. + ThreadFuture deleteTenant(StringRef const& tenantName) override; + void addref() override { ThreadSafeReferenceCounted::addref(); } void delref() override { ThreadSafeReferenceCounted::delref(); } @@ -432,6 +469,7 @@ private: }; class MultiVersionDatabase; +class MultiVersionTenant; // An implementation of ITransaction that wraps a transaction created either locally or through a dynamically loaded // external client. When needed (e.g on cluster version change), the MultiVersionTransaction can automatically replace @@ -439,6 +477,7 @@ class MultiVersionDatabase; class MultiVersionTransaction : public ITransaction, ThreadSafeReferenceCounted { public: MultiVersionTransaction(Reference db, + Optional> tenant, UniqueOrderedOptionList defaultOptions); ~MultiVersionTransaction() override; @@ -515,6 +554,7 @@ public: private: const Reference db; + const Optional> tenant; ThreadSpinLock lock; struct TransactionInfo { @@ -585,6 +625,33 @@ struct ClientInfo : ClientDesc, ThreadSafeReferenceCounted { class MultiVersionApi; +// An implementation of ITenant that wraps a tenant created either locally or through a dynamically loaded +// external client. The wrapped ITenant is automatically changed when the MultiVersionDatabase used to create +// it connects with a different version. 
+class MultiVersionTenant final : public ITenant, ThreadSafeReferenceCounted { +public: + MultiVersionTenant(Reference db, StringRef tenantName); + ~MultiVersionTenant() override; + + Reference createTransaction() override; + + void addref() override { ThreadSafeReferenceCounted::addref(); } + void delref() override { ThreadSafeReferenceCounted::delref(); } + + Reference>> tenantVar; + +private: + Reference db; + const Standalone tenantName; + + Mutex tenantLock; + ThreadFuture tenantUpdater; + + // Creates a new underlying tenant object whenever the database connection changes. This change is signaled + // to open transactions via an AsyncVar. + void updateTenant(); +}; + // An implementation of IDatabase that wraps a database created either locally or through a dynamically loaded // external client. The MultiVersionDatabase monitors the protocol version of the cluster and automatically // replaces the wrapped database when the protocol version changes. @@ -599,6 +666,7 @@ public: ~MultiVersionDatabase() override; + Reference openTenant(StringRef tenantName) override; Reference createTransaction() override; void setOption(FDBDatabaseOptions::Option option, Optional value = Optional()) override; double getMainThreadBusyness() override; @@ -609,6 +677,12 @@ public: ThreadFuture getServerProtocol( Optional expectedVersion = Optional()) override; + // Registers a tenant with the given name. A prefix is automatically allocated for the tenant. + ThreadFuture createTenant(StringRef const& tenantName) override; + + // Deletes the tenant with the given name. The tenant must be empty. 
+ ThreadFuture deleteTenant(StringRef const& tenantName) override; + void addref() override { ThreadSafeReferenceCounted::addref(); } void delref() override { ThreadSafeReferenceCounted::delref(); } diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 1d4c898925..350cf107ab 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -30,6 +30,7 @@ #include "contrib/fmt-8.1.1/include/fmt/format.h" +#include "fdbclient/FDBOptions.g.h" #include "fdbclient/FDBTypes.h" #include "fdbrpc/FailureMonitor.h" #include "fdbrpc/MultiInterface.h" @@ -128,6 +129,12 @@ Future loadBalance( FDB_BOOLEAN_PARAM(TransactionRecordLogInfo); FDB_DEFINE_BOOLEAN_PARAM(UseProvisionalProxies); +// Whether or not a request should include the tenant name +FDB_BOOLEAN_PARAM(UseTenant); + +// Whether or not a function should implicitly add the tenant prefix to the request and/or remove it from the result +FDB_BOOLEAN_PARAM(ApplyTenantPrefix); + NetworkOptions networkOptions; TLSConfig tlsConfig(TLSEndpointType::CLIENT); @@ -2041,20 +2048,19 @@ Database Database::createDatabase(std::string connFileName, return Database::createDatabase(rccr, apiVersion, internal, clientLocality); } -Reference DatabaseContext::getWatchMetadata(KeyRef key) const { - const auto it = watchMap.find(key); +Reference DatabaseContext::getWatchMetadata(Optional tenant, KeyRef key) const { + const auto it = watchMap.find(std::make_pair(tenant, key)); if (it == watchMap.end()) return Reference(); return it->second; } -Key DatabaseContext::setWatchMetadata(Reference metadata) { - watchMap[metadata->parameters->key] = metadata; - return metadata->parameters->key; +void DatabaseContext::setWatchMetadata(Reference metadata) { + watchMap[std::make_pair(metadata->parameters->tenant, metadata->parameters->key)] = metadata; } -void DatabaseContext::deleteWatchMetadata(KeyRef key) { - watchMap.erase(key); +void DatabaseContext::deleteWatchMetadata(Optional tenant, KeyRef key) { + 
watchMap.erase(std::make_pair(tenant, key)); } void DatabaseContext::clearWatchMetadata() { @@ -2502,19 +2508,23 @@ AddressExclusion AddressExclusion::parse(StringRef const& key) { Future> getValue(Reference const& trState, Key const& key, Future const& version, - TransactionRecordLogInfo const& recordLogInfo); + UseTenant const& useTenant = UseTenant::True, + TransactionRecordLogInfo const& recordLogInfo = TransactionRecordLogInfo::True); Future getRange(Reference const& trState, Future const& fVersion, KeySelector const& begin, KeySelector const& end, GetRangeLimits const& limits, - Reverse const& reverse); + Reverse const& reverse, + UseTenant const& useTenant); ACTOR Future> fetchServerInterface(Reference trState, Future ver, UID id) { - Optional val = wait(getValue(trState, serverListKeyFor(id), ver, TransactionRecordLogInfo::False)); + Optional val = + wait(getValue(trState, serverListKeyFor(id), ver, UseTenant::False, TransactionRecordLogInfo::False)); + if (!val.present()) { // A storage server has been removed from serverList since we read keyServers return Optional(); @@ -2849,11 +2859,19 @@ SpanID generateSpanID(bool transactionTracingSample, SpanID parentContext = Span } } +TransactionState::TransactionState(Database cx, + Optional tenant, + TaskPriority taskID, + SpanID spanID, + Reference trLogInfo) + : cx(cx), tenant(tenant), trLogInfo(trLogInfo), options(cx), taskID(taskID), spanID(spanID) {} + Reference TransactionState::cloneAndReset(Reference newTrLogInfo, bool generateNewSpan) const { SpanID newSpanID = generateNewSpan ? 
generateSpanID(cx->transactionTracingSample) : spanID; - Reference newState = makeReference(cx, cx->taskID, newSpanID, newTrLogInfo); + Reference newState = + makeReference(cx, tenant, cx->taskID, newSpanID, newTrLogInfo); if (!cx->apiVersionAtLeast(16)) { newState->options = options; @@ -2867,6 +2885,15 @@ Reference TransactionState::cloneAndReset(Referenceinternal && !options.rawAccess && cx->clientInfo->get().tenantMode == TenantMode::REQUIRED && + !tenant.present()) { + throw tenant_name_required(); + } + + return TenantInfo(tenant); +} + Future Transaction::warmRange(KeyRange keys) { return warmRange_impl(trState, keys); } @@ -2874,12 +2901,24 @@ Future Transaction::warmRange(KeyRange keys) { ACTOR Future> getValue(Reference trState, Key key, Future version, - TransactionRecordLogInfo recordLogInfo = TransactionRecordLogInfo::True) { + UseTenant useTenant, + TransactionRecordLogInfo recordLogInfo) { state Version ver = wait(version); state Span span("NAPI:getValue"_loc, trState->spanID); + if (useTenant && trState->tenant.present()) { + span.addTag("tenant"_sr, trState->tenant.get()); + } + span.addTag("key"_sr, key); trState->cx->validateVersion(ver); + if (useTenant) { + Key resolvedTenantPrefix = wait(trState->tenantPrefix); + if (resolvedTenantPrefix.size() > 0) { + key = key.withPrefix(resolvedTenantPrefix); + } + } + loop { state std::pair> ssi = wait(getKeyLocation(trState, key, &StorageServerInterface::getValue)); @@ -2918,7 +2957,7 @@ ACTOR Future> getValue(Reference trState, ssi.second, &StorageServerInterface::getValue, GetValueRequest(span.context, - TenantInfo(), + useTenant ? trState->getTenantInfo() : TenantInfo(), key, ver, trState->cx->sampleReadTags() ? 
trState->options.readTags @@ -2985,7 +3024,10 @@ ACTOR Future> getValue(Reference trState, } } -ACTOR Future getKey(Reference trState, KeySelector k, Future version) { +ACTOR Future getKey(Reference trState, + KeySelector k, + Future version, + ApplyTenantPrefix applyTenantPrefix = ApplyTenantPrefix::True) { wait(success(version)); state Optional getKeyID = Optional(); @@ -3001,13 +3043,24 @@ ACTOR Future getKey(Reference trState, KeySelector k, Fut // k.getKey()).detail("Offset",k.offset).detail("OrEqual",k.orEqual); } + state Key resolvedTenantPrefix; + if (applyTenantPrefix) { + Key _resolvedTenantPrefix = wait(trState->tenantPrefix); + resolvedTenantPrefix = _resolvedTenantPrefix; + if (resolvedTenantPrefix.size() > 0) { + k = KeySelectorRef(k.getKey().withPrefix(resolvedTenantPrefix), k.orEqual, k.offset); + } + } + loop { - if (k.getKey() == allKeys.end) { - if (k.offset > 0) - return allKeys.end; + if (k.getKey().removePrefix(resolvedTenantPrefix) == allKeys.end) { + if (k.offset > 0) { + return resolvedTenantPrefix.empty() || applyTenantPrefix ? allKeys.end + : allKeys.end.withPrefix(resolvedTenantPrefix); + } k.orEqual = false; - } else if (k.getKey() == allKeys.begin && k.offset <= 0) { - return Key(); + } else if (k.getKey().removePrefix(resolvedTenantPrefix) == allKeys.begin && k.offset <= 0) { + return applyTenantPrefix ? Key() : resolvedTenantPrefix; } Key locationKey(k.getKey(), k.arena()); @@ -3024,7 +3077,7 @@ ACTOR Future getKey(Reference trState, KeySelector k, Fut ++trState->cx->transactionPhysicalReads; GetKeyRequest req(span.context, - TenantInfo(), + trState->getTenantInfo(), k, version.get(), trState->cx->sampleReadTags() ? 
trState->options.readTags : Optional(), @@ -3058,7 +3111,11 @@ ACTOR Future getKey(Reference trState, KeySelector k, Fut // reply.sel.offset).detail("OrEqual", k.orEqual); k = reply.sel; if (!k.offset && k.orEqual) { - return k.getKey(); + if (applyTenantPrefix) { + return k.getKey().removePrefix(resolvedTenantPrefix); + } else { + return k.getKey(); + } } } catch (Error& e) { if (getKeyID.present()) @@ -3157,7 +3214,7 @@ ACTOR Future watchValue(Database cx, Reference p ssi.second, &StorageServerInterface::watchValue, WatchValueRequest(span.context, - TenantInfo(), + TenantInfo(parameters->tenant), parameters->key, parameters->value, ver, @@ -3206,22 +3263,22 @@ ACTOR Future watchValue(Database cx, Reference p } } -ACTOR Future watchStorageServerResp(Key key, Database cx) { +ACTOR Future watchStorageServerResp(Optional tenant, Key key, Database cx) { loop { try { - state Reference metadata = cx->getWatchMetadata(key); + state Reference metadata = cx->getWatchMetadata(tenant, key); if (!metadata.isValid()) return Void(); Version watchVersion = wait(watchValue(cx, metadata->parameters)); - metadata = cx->getWatchMetadata(key); + metadata = cx->getWatchMetadata(tenant, key); if (!metadata.isValid()) return Void(); // case 1: version_1 (SS) >= version_2 (map) if (watchVersion >= metadata->parameters->version) { - cx->deleteWatchMetadata(key); + cx->deleteWatchMetadata(tenant, key); if (metadata->watchPromise.canBeSet()) metadata->watchPromise.send(watchVersion); } @@ -3231,7 +3288,7 @@ ACTOR Future watchStorageServerResp(Key key, Database cx) { // case 2: version_1 < version_2 and future_count == 1 if (metadata->watchPromise.getFutureReferenceCount() == 1) { - cx->deleteWatchMetadata(key); + cx->deleteWatchMetadata(tenant, key); } } } catch (Error& e) { @@ -3239,16 +3296,16 @@ ACTOR Future watchStorageServerResp(Key key, Database cx) { throw e; } - Reference metadata = cx->getWatchMetadata(key); + Reference metadata = cx->getWatchMetadata(tenant, key); if 
(!metadata.isValid()) { return Void(); } else if (metadata->watchPromise.getFutureReferenceCount() == 1) { - cx->deleteWatchMetadata(key); + cx->deleteWatchMetadata(tenant, key); return Void(); } else if (e.code() == error_code_future_version) { continue; } - cx->deleteWatchMetadata(key); + cx->deleteWatchMetadata(tenant, key); metadata->watchPromise.sendError(e); throw e; } @@ -3261,11 +3318,11 @@ ACTOR Future sameVersionDiffValue(Database cx, Reference try { tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); state Optional valSS = wait(tr.get(parameters->key)); - Reference metadata = cx->getWatchMetadata(parameters->key.contents()); + Reference metadata = cx->getWatchMetadata(parameters->tenant, parameters->key); // val_3 != val_1 (storage server value doesnt match value in map) if (metadata.isValid() && valSS != metadata->parameters->value) { - cx->deleteWatchMetadata(parameters->key.contents()); + cx->deleteWatchMetadata(parameters->tenant, parameters->key); metadata->watchPromise.send(parameters->version); metadata->watchFutureSS.cancel(); @@ -3274,9 +3331,9 @@ ACTOR Future sameVersionDiffValue(Database cx, Reference // val_3 == val_2 (storage server value matches value passed into the function -> new watch) if (valSS == parameters->value) { metadata = makeReference(parameters); - Key key = cx->setWatchMetadata(metadata); + cx->setWatchMetadata(metadata); - metadata->watchFutureSS = watchStorageServerResp(key, cx); + metadata->watchFutureSS = watchStorageServerResp(parameters->tenant, parameters->key, cx); } // if val_3 != val_2 @@ -3294,14 +3351,14 @@ ACTOR Future sameVersionDiffValue(Database cx, Reference } Future getWatchFuture(Database cx, Reference parameters) { - Reference metadata = cx->getWatchMetadata(parameters->key.contents()); + Reference metadata = cx->getWatchMetadata(parameters->tenant, parameters->key); // case 1: key not in map if (!metadata.isValid()) { metadata = makeReference(parameters); - Key key = cx->setWatchMetadata(metadata); + 
cx->setWatchMetadata(metadata); - metadata->watchFutureSS = watchStorageServerResp(key, cx); + metadata->watchFutureSS = watchStorageServerResp(parameters->tenant, parameters->key, cx); return success(metadata->watchPromise.getFuture()); } // case 2: val_1 == val_2 (received watch with same value as key already in the map so just update) @@ -3316,15 +3373,15 @@ Future getWatchFuture(Database cx, Reference parameters) // recreate in SS) else if (parameters->version > metadata->parameters->version) { TEST(true); // Setting a watch that has a different value than the one in the map but a higher version (newer) - cx->deleteWatchMetadata(parameters->key); + cx->deleteWatchMetadata(parameters->tenant, parameters->key); metadata->watchPromise.send(parameters->version); metadata->watchFutureSS.cancel(); metadata = makeReference(parameters); - Key key = cx->setWatchMetadata(metadata); + cx->setWatchMetadata(metadata); - metadata->watchFutureSS = watchStorageServerResp(key, cx); + metadata->watchFutureSS = watchStorageServerResp(parameters->tenant, parameters->key, cx); return success(metadata->watchPromise.getFuture()); } @@ -3340,6 +3397,8 @@ Future getWatchFuture(Database cx, Reference parameters) } ACTOR Future watchValueMap(Future version, + Optional tenant, + Future tenantPrefix, Key key, Optional value, Database cx, @@ -3349,8 +3408,16 @@ ACTOR Future watchValueMap(Future version, Optional debugID, UseProvisionalProxies useProvisionalProxies) { state Version ver = wait(version); + + Key resolvedTenantPrefix = wait(tenantPrefix); + if (resolvedTenantPrefix.size()) { + key = key.withPrefix(resolvedTenantPrefix); + } + wait(getWatchFuture( - cx, makeReference(key, value, ver, tags, spanID, taskID, debugID, useProvisionalProxies))); + cx, + makeReference(tenant, key, value, ver, tags, spanID, taskID, debugID, useProvisionalProxies))); + return Void(); } @@ -3392,10 +3459,15 @@ Future getExactRange(Reference trState, KeyRange keys, Key mapper, GetRangeLimits limits, - 
Reverse reverse) { + Reverse reverse, + UseTenant useTenant) { state RangeResultFamily output; state Span span("NAPI:getExactRange"_loc, trState->spanID); + if (useTenant && trState->tenant.present()) { + span.addTag("tenant"_sr, trState->tenant.get()); + } + // printf("getExactRange( '%s', '%s' )\n", keys.begin.toString().c_str(), keys.end.toString().c_str()); loop { state std::vector>> locations = @@ -3413,6 +3485,7 @@ Future getExactRange(Reference trState, req.mapper = mapper; req.arena.dependsOn(mapper.arena()); + req.tenantInfo = useTenant ? trState->getTenantInfo() : TenantInfo(); req.version = version; req.begin = firstGreaterOrEqual(range.begin); req.end = firstGreaterOrEqual(range.end); @@ -3568,7 +3641,7 @@ Future resolveKey(Reference trState, KeySelector const& k if (key.isFirstGreaterThan()) return Future(keyAfter(key.getKey())); - return getKey(trState, key, version); + return getKey(trState, key, version, ApplyTenantPrefix::False); } ACTOR template @@ -3578,7 +3651,8 @@ Future getRangeFallback(Reference trState, KeySelector end, Key mapper, GetRangeLimits limits, - Reverse reverse) { + Reverse reverse, + UseTenant useTenant) { if (version == latestVersion) { state Transaction transaction(trState->cx); transaction.setOption(FDBTransactionOptions::CAUSAL_READ_RISKY); @@ -3597,12 +3671,12 @@ Future getRangeFallback(Reference trState, return RangeResultFamily(); } - // if e is allKeys.end, we have read through the end of the database - // if b is allKeys.begin, we have either read through the beginning of the database, - // or allKeys.begin exists in the database and will be part of the conflict range anyways + // if e is allKeys.end, we have read through the end of the database/tenant + // if b is allKeys.begin, we have either read through the beginning of the database/tenant, + // or allKeys.begin exists in the database/tenant and will be part of the conflict range anyways RangeResultFamily _r = wait(getExactRange( - trState, version, KeyRangeRef(b, 
e), mapper, limits, reverse)); + trState, version, KeyRangeRef(b, e), mapper, limits, reverse, useTenant)); RangeResultFamily r = _r; if (b == allKeys.begin && ((reverse && !r.more) || !reverse)) @@ -3659,8 +3733,10 @@ void getRangeFinished(Reference trState, Snapshot snapshot, Promise> conflictRange, Reverse reverse, - RangeResultFamily result) { + RangeResultFamily result, + UseTenant useTenant) { int64_t bytes = getRangeResultFamilyBytes(result); + ASSERT(!useTenant || trState->tenantPrefix.isReady()); trState->cx->transactionBytesRead += bytes; trState->cx->transactionKeysRead += result.size(); @@ -3716,13 +3792,15 @@ Future getRange(Reference trState, GetRangeLimits limits, Promise> conflictRange, Snapshot snapshot, - Reverse reverse) { + Reverse reverse, + UseTenant useTenant = UseTenant::True) { // state using RangeResultRefFamily = typename RangeResultFamily::RefType; state GetRangeLimits originalLimits(limits); - state KeySelector originalBegin = begin; - state KeySelector originalEnd = end; state RangeResultFamily output; state Span span("NAPI:getRange"_loc, trState->spanID); + if (useTenant && trState->tenant.present()) { + span.addTag("tenant"_sr, trState->tenant.get()); + } try { state Version version = wait(fVersion); @@ -3733,8 +3811,21 @@ Future getRange(Reference trState, // version that the first one completed // FIXME: Is this really right? Weaken this and see if there is a problem; // if so maybe there is a much subtler problem even with this. 
+ state Key tenantPrefix; + if (useTenant) { + Key resolvedTenantPrefix = wait(trState->tenantPrefix); + tenantPrefix = resolvedTenantPrefix; - if (begin.getKey() == allKeys.begin && begin.offset < 1) { + if (tenantPrefix.size() > 0) { + begin = KeySelectorRef(begin.getKey().withPrefix(tenantPrefix), begin.orEqual, begin.offset); + end = KeySelectorRef(end.getKey().withPrefix(tenantPrefix), end.orEqual, end.offset); + } + } + + state KeySelector originalBegin = begin; + state KeySelector originalEnd = end; + + if (begin.getKey().removePrefix(tenantPrefix) == allKeys.begin && begin.offset < 1) { output.readToBegin = true; begin = KeySelector(firstGreaterOrEqual(begin.getKey()), begin.arena()); } @@ -3743,9 +3834,17 @@ Future getRange(Reference trState, ASSERT((!limits.hasRowLimit() || limits.rows >= limits.minRows) && limits.minRows >= 0); loop { - if (end.getKey() == allKeys.begin && (end.offset < 1 || end.isFirstGreaterOrEqual())) { - getRangeFinished( - trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, output); + if (end.getKey().removePrefix(tenantPrefix) == allKeys.begin && + (end.offset < 1 || end.isFirstGreaterOrEqual())) { + getRangeFinished(trState, + startTime, + originalBegin, + originalEnd, + snapshot, + conflictRange, + reverse, + output, + useTenant); return output; } @@ -3759,6 +3858,7 @@ Future getRange(Reference trState, req.mapper = mapper; req.arena.dependsOn(mapper.arena()); + req.tenantInfo = useTenant ? 
trState->getTenantInfo() : TenantInfo(); req.isFetchKeys = (trState->taskID == TaskPriority::FetchKeys); req.version = readVersion; @@ -3887,8 +3987,15 @@ Future getRange(Reference trState, output = copy; output.more = true; - getRangeFinished( - trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, output); + getRangeFinished(trState, + startTime, + originalBegin, + originalEnd, + snapshot, + conflictRange, + reverse, + output, + useTenant); return output; } @@ -3897,8 +4004,15 @@ Future getRange(Reference trState, output.readThrough = reverse ? shard.begin : shard.end; } - getRangeFinished( - trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, output); + getRangeFinished(trState, + startTime, + originalBegin, + originalEnd, + snapshot, + conflictRange, + reverse, + output, + useTenant); return output; } @@ -3912,8 +4026,15 @@ Future getRange(Reference trState, } output.more = modifiedSelectors || limits.isReached() || rep.more; - getRangeFinished( - trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, output); + getRangeFinished(trState, + startTime, + originalBegin, + originalEnd, + snapshot, + conflictRange, + reverse, + output, + useTenant); return output; } @@ -3926,9 +4047,23 @@ Future getRange(Reference trState, if (!rep.data.size()) { RangeResultFamily result = wait( getRangeFallback( - trState, version, originalBegin, originalEnd, mapper, originalLimits, reverse)); - getRangeFinished( - trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, result); + trState, + version, + originalBegin, + originalEnd, + mapper, + originalLimits, + reverse, + useTenant)); + getRangeFinished(trState, + startTime, + originalBegin, + originalEnd, + snapshot, + conflictRange, + reverse, + result, + useTenant); return result; } @@ -3958,9 +4093,23 @@ Future getRange(Reference trState, if (e.code() == error_code_wrong_shard_server) { RangeResultFamily result = wait( 
getRangeFallback( - trState, version, originalBegin, originalEnd, mapper, originalLimits, reverse)); - getRangeFinished( - trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, result); + trState, + version, + originalBegin, + originalEnd, + mapper, + originalLimits, + reverse, + useTenant)); + getRangeFinished(trState, + startTime, + originalBegin, + originalEnd, + snapshot, + conflictRange, + reverse, + result, + useTenant); return result; } @@ -4174,6 +4323,7 @@ ACTOR Future getRangeStreamFragment(Reference trState, state Optional> tssDuplicateStream; state GetKeyValuesStreamRequest req; + req.tenantInfo = trState->getTenantInfo(); req.version = version; req.begin = firstGreaterOrEqual(range.begin); req.end = firstGreaterOrEqual(range.end); @@ -4417,7 +4567,8 @@ ACTOR Future getRangeStreamFragment(Reference trState, ACTOR Future>> getRangeSplitPoints(Reference trState, KeyRange keys, - int64_t chunkSize); + int64_t chunkSize, + Optional> tenantPrefix); static KeyRange intersect(KeyRangeRef lhs, KeyRangeRef rhs) { return KeyRange(KeyRangeRef(std::max(lhs.begin, rhs.begin), std::min(lhs.end, rhs.end))); @@ -4443,6 +4594,12 @@ ACTOR Future getRangeStream(Reference trState, state Version version = wait(fVersion); trState->cx->validateVersion(version); + Key resolvedTenantPrefix = wait(trState->tenantPrefix); + if (resolvedTenantPrefix.size() > 0) { + begin = KeySelectorRef(begin.getKey().withPrefix(resolvedTenantPrefix), begin.orEqual, begin.offset); + end = KeySelectorRef(end.getKey().withPrefix(resolvedTenantPrefix), end.orEqual, end.offset); + } + Future fb = resolveKey(trState, begin, version); state Future fe = resolveKey(trState, end, version); @@ -4469,8 +4626,8 @@ ACTOR Future getRangeStream(Reference trState, state std::pair> ssi = wait(getKeyLocation(trState, reverse ? 
e : b, &StorageServerInterface::getKeyValuesStream, reverse)); state KeyRange shardIntersection = intersect(ssi.first, KeyRangeRef(b, e)); - state Standalone> splitPoints = - wait(getRangeSplitPoints(trState, shardIntersection, CLIENT_KNOBS->RANGESTREAM_FRAGMENT_SIZE)); + state Standalone> splitPoints = wait(getRangeSplitPoints( + trState, shardIntersection, CLIENT_KNOBS->RANGESTREAM_FRAGMENT_SIZE, Optional>())); state std::vector toSend; // state std::vector::iterator>> outstandingRequests; @@ -4510,9 +4667,18 @@ Future getRange(Reference const& trState, KeySelector const& begin, KeySelector const& end, GetRangeLimits const& limits, - Reverse const& reverse) { - return getRange( - trState, fVersion, begin, end, ""_sr, limits, Promise>(), Snapshot::True, reverse); + Reverse const& reverse, + UseTenant const& useTenant) { + return getRange(trState, + fVersion, + begin, + end, + ""_sr, + limits, + Promise>(), + Snapshot::True, + reverse, + useTenant); } bool DatabaseContext::debugUseTags = false; @@ -4543,16 +4709,43 @@ void debugAddTags(Reference trState) { } } +ACTOR Future getTenantPrefixImpl(Reference trState, Future version) { + // TODO: Support local and/or stateless role caching + // Note: this does not set a conflict range for the tenant read. This is ok, we expect tenants to change + // infrequently and we will have our request rejected at commit time if it does. 
+ Optional val = + wait(getValue(trState, trState->tenant.get().withPrefix(tenantMapPrefix), version, UseTenant::False)); + + if (!val.present()) { + TraceEvent(SevWarn, "ClientTenantNotFound", trState->cx->dbId) + .detail("Tenant", trState->tenant.get()) + .backtrace(); + throw tenant_not_found(); + } + + return decodeTenantEntry(val.get()).prefix; +} + +Future Transaction::getTenantPrefix() { + if (!trState->tenant.present()) { + trState->tenantPrefix = Key(); + } else if (!trState->tenantPrefix.isValid()) { + trState->tenantPrefix = getTenantPrefixImpl(trState, getReadVersion()); + } + + return trState->tenantPrefix; +} + Transaction::Transaction() : trState(makeReference(TaskPriority::DefaultEndpoint, generateSpanID(false))) {} -Transaction::Transaction(Database const& cx) +Transaction::Transaction(Database const& cx, Optional const& tenant) : trState(makeReference(cx, + tenant, cx->taskID, generateSpanID(cx->transactionTracingSample), createTrLogInfoProbabilistically(cx))), - span(trState->spanID, "Transaction"_loc), backoff(CLIENT_KNOBS->DEFAULT_BACKOFF), - tr(TenantInfo(), trState->spanID) { + span(trState->spanID, "Transaction"_loc), backoff(CLIENT_KNOBS->DEFAULT_BACKOFF), tr(tenant, trState->spanID) { if (DatabaseContext::debugUseTags) { debugAddTags(trState); } @@ -4592,6 +4785,7 @@ void Transaction::setVersion(Version v) { if (v <= 0) throw version_invalid(); readVersion = v; + trState->tenantPrefix = getTenantPrefix(); } Future> Transaction::get(const Key& key, Snapshot snapshot) { @@ -4659,6 +4853,8 @@ void Watch::setWatch(Future watchFuture) { // FIXME: This seems pretty horrible. Now a Database can't die until all of its watches do... 
ACTOR Future watch(Reference watch, Database cx, + Optional tenant, + Future tenantPrefix, TagSet tags, SpanID spanID, TaskPriority taskID, @@ -4682,6 +4878,8 @@ ACTOR Future watch(Reference watch, TEST(true); // Recreated a watch after switch cx->clearWatchMetadata(); watch->watchFuture = watchValueMap(cx->minAcceptableReadVersion, + tenant, + tenantPrefix, watch->key, watch->value, cx, @@ -4710,10 +4908,18 @@ Future Transaction::getRawReadVersion() { Future Transaction::watch(Reference watch) { ++trState->cx->transactionWatchRequests; + + if (!trState->cx->internal && !trState->options.rawAccess && + trState->cx->clientInfo->get().tenantMode == TenantMode::REQUIRED && !trState->tenant.present()) { + throw tenant_name_required(); + } + trState->cx->addWatch(); watches.push_back(watch); return ::watch(watch, trState->cx, + trState->tenant, + getTenantPrefix(), trState->options.readTags, trState->spanID, trState->taskID, @@ -4726,6 +4932,14 @@ ACTOR Future>> getAddressesForKeyActor(Referen Key key) { state std::vector ssi; + Key resolvedTenantPrefix = wait(trState->tenantPrefix); + if (resolvedTenantPrefix.size() > 0) { + key = key.withPrefix(resolvedTenantPrefix, key.arena()); + } + + // Check that we specified a tenant if required + trState->getTenantInfo(); + // If key >= allKeys.end, then getRange will return a kv-pair with an empty value. This will result in our // serverInterfaces vector being empty, which will cause us to return an empty addresses list. 
state Key ksKey = keyServersKey(key); @@ -4734,10 +4948,16 @@ ACTOR Future>> getAddressesForKeyActor(Referen lastLessOrEqual(serverTagKeys.begin), firstGreaterThan(serverTagKeys.end), GetRangeLimits(CLIENT_KNOBS->TOO_MANY), - Reverse::False)); + Reverse::False, + UseTenant::False)); ASSERT(!serverTagResult.more && serverTagResult.size() < CLIENT_KNOBS->TOO_MANY); - Future futureServerUids = - getRange(trState, ver, lastLessOrEqual(ksKey), firstGreaterThan(ksKey), GetRangeLimits(1), Reverse::False); + Future futureServerUids = getRange(trState, + ver, + lastLessOrEqual(ksKey), + firstGreaterThan(ksKey), + GetRangeLimits(1), + Reverse::False, + UseTenant::False); RangeResult serverUids = wait(futureServerUids); ASSERT(serverUids.size()); // every shard needs to have a team @@ -5171,6 +5391,7 @@ void TransactionOptions::clear() { expensiveClearCostEstimation = false; useGrvCache = false; skipGrvCache = false; + rawAccess = false; } TransactionOptions::TransactionOptions() { @@ -5188,7 +5409,7 @@ void TransactionOptions::reset(Database const& cx) { void Transaction::resetImpl(bool generateNewSpan) { flushTrLogsIfEnabled(); trState = trState->cloneAndReset(createTrLogInfoProbabilistically(trState->cx), generateNewSpan); - tr = CommitTransactionRequest(TenantInfo(), trState->spanID); + tr = CommitTransactionRequest(trState->tenant, trState->spanID); readVersion = Future(); metadataVersion = Promise>(); extraConflictRanges.clear(); @@ -5265,7 +5486,6 @@ ACTOR void checkWrites(Reference trState, wait(delay(deterministicRandom()->random01())); // delay between 0 and 1 seconds - // Future> version, Database cx, CommitTransactionRequest req ) { state KeyRangeMap expectedValues; auto& mutations = req.transaction.mutations; @@ -5364,6 +5584,8 @@ void Transaction::setupWatches() { for (int i = 0; i < watches.size(); ++i) watches[i]->setWatch(watchValueMap(watchVersion, + trState->tenant, + getTenantPrefix(), watches[i]->key, watches[i]->value, trState->cx, @@ -5451,6 +5673,29 @@ 
ACTOR Future> estimateCommitCosts(Referen return trCommitCosts; } +// TODO: send the prefix as part of the commit request and ship it all the way +// through to the storage servers +void applyTenantPrefix(CommitTransactionRequest& req, Key tenantPrefix) { + for (auto& m : req.transaction.mutations) { + m.param1 = m.param1.withPrefix(tenantPrefix, req.arena); + if (m.type == MutationRef::ClearRange) { + m.param2 = m.param2.withPrefix(tenantPrefix, req.arena); + } else if (m.type == MutationRef::SetVersionstampedKey) { + uint8_t* key = mutateString(m.param1); + int* offset = reinterpret_cast(&key[m.param1.size() - 4]); + *offset += tenantPrefix.size(); + } + } + + for (auto& rc : req.transaction.read_conflict_ranges) { + rc = rc.withPrefix(tenantPrefix, req.arena); + } + + for (auto& wc : req.transaction.write_conflict_ranges) { + wc = wc.withPrefix(tenantPrefix, req.arena); + } +} + ACTOR static Future tryCommit(Reference trState, CommitTransactionRequest req, Future readVersion) { @@ -5474,6 +5719,16 @@ ACTOR static Future tryCommit(Reference trState, wait(store(req.transaction.read_snapshot, readVersion)); } + try { + Key resolvedTenantPrefix = wait(trState->tenantPrefix); + if (!resolvedTenantPrefix.empty()) { + applyTenantPrefix(req, resolvedTenantPrefix); + } + } catch (Error& e) { + // TODO: use a different error here? + throw not_committed(); + } + startTime = now(); state Optional commitID = Optional(); @@ -5766,6 +6021,7 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optionaloptions.causalWriteRisky = true; + trState->tenantPrefix = Key(); break; case FDBTransactionOptions::CAUSAL_READ_RISKY: @@ -5951,6 +6207,14 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optionaloptions.skipGrvCache = true; break; + case FDBTransactionOptions::READ_SYSTEM_KEYS: + case FDBTransactionOptions::ACCESS_SYSTEM_KEYS: + case FDBTransactionOptions::RAW_ACCESS: + // System key access implies raw access. 
Native API handles the raw access, + // system key access is handled in RYW. + validateOptionValueNotPresent(value); + trState->options.rawAccess = true; + break; default: break; @@ -6289,6 +6553,8 @@ Future Transaction::getReadVersion(uint32_t flags) { batcher.stream.send(req); trState->startTime = now(); readVersion = extractReadVersion(trState, location, spanContext, req.reply.getFuture(), metadataVersion); + + trState->tenantPrefix = getTenantPrefix(); } return readVersion; } @@ -6747,8 +7013,19 @@ Future>> DatabaseContext::getReadH ACTOR Future>> getRangeSplitPoints(Reference trState, KeyRange keys, - int64_t chunkSize) { + int64_t chunkSize, + Optional> tenantPrefix) { state Span span("NAPI:GetRangeSplitPoints"_loc, trState->spanID); + + state Key resolvedTenantPrefix; + if (tenantPrefix.present()) { + Key _resolvedTenantPrefix = wait(trState->tenantPrefix); + if (_resolvedTenantPrefix.size() > 0) { + resolvedTenantPrefix = _resolvedTenantPrefix; + keys = keys.withPrefix(resolvedTenantPrefix); + } + } + loop { state std::vector>> locations = wait(getKeyRangeLocations( trState, keys, CLIENT_KNOBS->TOO_MANY, Reverse::False, &StorageServerInterface::getRangeSplitPoints)); @@ -6759,7 +7036,7 @@ ACTOR Future>> getRangeSplitPoints(ReferencegetTenantInfo(), KeyRangeRef(partBegin, partEnd), chunkSize); fReplies[i] = loadBalance(locations[i].second->locations(), &StorageServerInterface::getRangeSplitPoints, req, @@ -6769,19 +7046,28 @@ ACTOR Future>> getRangeSplitPoints(Reference> results; - results.push_back_deep(results.arena(), keys.begin); + results.push_back_deep(results.arena(), keys.begin.removePrefix(resolvedTenantPrefix)); for (int i = 0; i < nLocs; i++) { if (i > 0) { - results.push_back_deep(results.arena(), locations[i].first.begin); // Need this shard boundary + results.push_back_deep( + results.arena(), + locations[i].first.begin.removePrefix(resolvedTenantPrefix)); // Need this shard boundary } if (fReplies[i].get().splitPoints.size() > 0) { - 
results.append( - results.arena(), fReplies[i].get().splitPoints.begin(), fReplies[i].get().splitPoints.size()); + if (resolvedTenantPrefix.size() == 0) { + results.append(results.arena(), + fReplies[i].get().splitPoints.begin(), + fReplies[i].get().splitPoints.size()); + } else { + for (auto sp : fReplies[i].get().splitPoints) { + results.push_back(results.arena(), sp.removePrefix(resolvedTenantPrefix)); + } + } results.arena().dependsOn(fReplies[i].get().splitPoints.arena()); } } if (results.back() != keys.end) { - results.push_back_deep(results.arena(), keys.end); + results.push_back_deep(results.arena(), keys.end.removePrefix(resolvedTenantPrefix)); } return results; @@ -6797,7 +7083,7 @@ ACTOR Future>> getRangeSplitPoints(Reference>> Transaction::getRangeSplitPoints(KeyRange const& keys, int64_t chunkSize) { - return ::getRangeSplitPoints(trState, keys, chunkSize); + return ::getRangeSplitPoints(trState, keys, chunkSize, getTenantPrefix()); } #define BG_REQUEST_DEBUG false @@ -7817,7 +8103,6 @@ ACTOR Future> getOverlappingChangeFeedsA KeyRangeRef range, Version minVersion) { state Database cx(db); - state Transaction tr(cx); state Span span("NAPI:GetOverlappingChangeFeeds"_loc); loop { diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index d2ee31d20b..1429ab25b7 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -159,6 +159,7 @@ struct TransactionOptions { bool expensiveClearCostEstimation : 1; bool useGrvCache : 1; bool skipGrvCache : 1; + bool rawAccess : 1; TransactionPriority priority; @@ -236,6 +237,7 @@ struct Watch : public ReferenceCounted, NonCopyable { struct TransactionState : ReferenceCounted { Database cx; + Optional tenant; Reference trLogInfo; TransactionOptions options; @@ -250,23 +252,29 @@ struct TransactionState : ReferenceCounted { Version committedVersion{ invalidVersion }; + Future tenantPrefix; + // Used to save conflicting keys if FDBTransactionOptions::REPORT_CONFLICTING_KEYS is enabled 
// prefix/ : '1' - any keys equal or larger than this key are (probably) conflicting keys // prefix/ : '0' - any keys equal or larger than this key are (definitely) not conflicting keys std::shared_ptr> conflictingKeys; // Only available so that Transaction can have a default constructor, for use in state variables - TransactionState(TaskPriority taskID, SpanID spanID) : taskID(taskID), spanID(spanID) {} + TransactionState(TaskPriority taskID, SpanID spanID) : taskID(taskID), spanID(spanID), tenantPrefix(Key()) {} - TransactionState(Database cx, TaskPriority taskID, SpanID spanID, Reference trLogInfo) - : cx(cx), trLogInfo(trLogInfo), options(cx), taskID(taskID), spanID(spanID) {} + TransactionState(Database cx, + Optional tenant, + TaskPriority taskID, + SpanID spanID, + Reference trLogInfo); Reference cloneAndReset(Reference newTrLogInfo, bool generateNewSpan) const; + TenantInfo getTenantInfo() const; }; class Transaction : NonCopyable { public: - explicit Transaction(Database const& cx); + explicit Transaction(Database const& cx, Optional const& tenant = Optional()); ~Transaction(); void setVersion(Version v); @@ -440,6 +448,8 @@ public: return Standalone>(tr.transaction.write_conflict_ranges, tr.arena); } + Future getTenantPrefix(); + Reference trState; std::vector> watches; Span span; diff --git a/fdbclient/PaxosConfigTransaction.actor.cpp b/fdbclient/PaxosConfigTransaction.actor.cpp index c3dce23efb..20c68a6d0c 100644 --- a/fdbclient/PaxosConfigTransaction.actor.cpp +++ b/fdbclient/PaxosConfigTransaction.actor.cpp @@ -461,6 +461,6 @@ PaxosConfigTransaction::PaxosConfigTransaction() = default; PaxosConfigTransaction::~PaxosConfigTransaction() = default; -void PaxosConfigTransaction::setDatabase(Database const& cx) { +void PaxosConfigTransaction::construct(Database const& cx) { impl = PImpl::create(cx); } diff --git a/fdbclient/PaxosConfigTransaction.h b/fdbclient/PaxosConfigTransaction.h index 276450c8a4..64192e26de 100644 --- 
a/fdbclient/PaxosConfigTransaction.h +++ b/fdbclient/PaxosConfigTransaction.h @@ -35,7 +35,7 @@ public: PaxosConfigTransaction(std::vector const&); PaxosConfigTransaction(); ~PaxosConfigTransaction(); - void setDatabase(Database const&) override; + void construct(Database const&) override; Future getReadVersion() override; Optional getCachedReadVersion() const override; diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index bc12302472..734e33665d 100644 --- a/fdbclient/ReadYourWrites.actor.cpp +++ b/fdbclient/ReadYourWrites.actor.cpp @@ -1443,17 +1443,21 @@ public: } }; -ReadYourWritesTransaction::ReadYourWritesTransaction(Database const& cx) - : ISingleThreadTransaction(cx->deferredError), tr(cx), cache(&arena), writes(&arena), retries(0), approximateSize(0), - creationTime(now()), commitStarted(false), versionStampFuture(tr.getVersionstamp()), +ReadYourWritesTransaction::ReadYourWritesTransaction(Database const& cx, Optional tenantName) + : ISingleThreadTransaction(cx->deferredError), tr(cx, tenantName), cache(&arena), writes(&arena), retries(0), + approximateSize(0), creationTime(now()), commitStarted(false), versionStampFuture(tr.getVersionstamp()), specialKeySpaceWriteMap(std::make_pair(false, Optional()), specialKeys.end), options(tr) { std::copy( cx.getTransactionDefaults().begin(), cx.getTransactionDefaults().end(), std::back_inserter(persistentOptions)); applyPersistentOptions(); } -void ReadYourWritesTransaction::setDatabase(Database const& cx) { - *this = ReadYourWritesTransaction(cx); +void ReadYourWritesTransaction::construct(Database const& cx) { + *this = ReadYourWritesTransaction(cx, Optional()); +} + +void ReadYourWritesTransaction::construct(Database const& cx, TenantName const& tenantName) { + *this = ReadYourWritesTransaction(cx, tenantName); } ACTOR Future timebomb(double endTime, Promise resetPromise) { diff --git a/fdbclient/ReadYourWrites.h b/fdbclient/ReadYourWrites.h index 84bc05e4ef..db2ab5dac2 100644 
--- a/fdbclient/ReadYourWrites.h +++ b/fdbclient/ReadYourWrites.h @@ -68,10 +68,11 @@ class ReadYourWritesTransaction final : NonCopyable, public ISingleThreadTransaction, public FastAllocated { public: - explicit ReadYourWritesTransaction(Database const& cx); + explicit ReadYourWritesTransaction(Database const& cx, Optional tenant = Optional()); ~ReadYourWritesTransaction(); - void setDatabase(Database const&) override; + void construct(Database const&) override; + void construct(Database const&, TenantName const& tenant) override; void setVersion(Version v) override { tr.setVersion(v); } Future getReadVersion() override; Optional getCachedReadVersion() const override { return tr.getCachedReadVersion(); } diff --git a/fdbclient/SimpleConfigTransaction.actor.cpp b/fdbclient/SimpleConfigTransaction.actor.cpp index f1bc1aebb9..a070b8a53d 100644 --- a/fdbclient/SimpleConfigTransaction.actor.cpp +++ b/fdbclient/SimpleConfigTransaction.actor.cpp @@ -286,7 +286,7 @@ void SimpleConfigTransaction::checkDeferredError() const { impl->checkDeferredError(deferredError); } -void SimpleConfigTransaction::setDatabase(Database const& cx) { +void SimpleConfigTransaction::construct(Database const& cx) { impl = PImpl::create(cx); } diff --git a/fdbclient/SimpleConfigTransaction.h b/fdbclient/SimpleConfigTransaction.h index 871d0efa97..bb013f2c51 100644 --- a/fdbclient/SimpleConfigTransaction.h +++ b/fdbclient/SimpleConfigTransaction.h @@ -43,7 +43,7 @@ public: SimpleConfigTransaction(ConfigTransactionInterface const&); SimpleConfigTransaction(Database const&); SimpleConfigTransaction(); - void setDatabase(Database const&) override; + void construct(Database const&) override; ~SimpleConfigTransaction(); Future getReadVersion() override; Optional getCachedReadVersion() const override; diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index 2516901a09..fad13a1ac5 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ 
b/fdbclient/SpecialKeySpace.actor.cpp @@ -28,6 +28,7 @@ #include "fdbclient/ActorLineageProfiler.h" #include "fdbclient/ClusterConnectionMemoryRecord.h" +#include "fdbclient/FDBOptions.g.h" #include "fdbclient/Knobs.h" #include "fdbclient/ProcessInterface.h" #include "fdbclient/GlobalConfig.actor.h" @@ -1291,6 +1292,7 @@ void ProcessClassRangeImpl::clear(ReadYourWritesTransaction* ryw, const KeyRef& } ACTOR Future getProcessClassSourceActor(ReadYourWritesTransaction* ryw, KeyRef prefix, KeyRangeRef kr) { + ryw->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); std::vector _workers = wait(getWorkers(&ryw->getTransaction())); auto workers = _workers; // strip const // Note : the sort by string is anti intuition, ex. 1.1.1.1:11 < 1.1.1.1:5 diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 82c194f5db..4d1bc23574 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -1336,6 +1336,8 @@ TenantMapEntry decodeTenantEntry(ValueRef const& value) { const KeyRangeRef tenantMapKeys("\xff/tenantMap/"_sr, "\xff/tenantMap0"_sr); const KeyRef tenantMapPrefix = tenantMapKeys.begin; const KeyRef tenantMapPrivatePrefix = "\xff\xff/tenantMap/"_sr; +const KeyRef tenantLastIdKey = "\xff/tenantLastId/"_sr; +const KeyRef tenantDataPrefixKey = "\xff/tenantDataPrefix"_sr; // for tests void testSSISerdes(StorageServerInterface const& ssi, bool useFB) { diff --git a/fdbclient/SystemData.h b/fdbclient/SystemData.h index 993efb1dba..68cfa0fcbe 100644 --- a/fdbclient/SystemData.h +++ b/fdbclient/SystemData.h @@ -598,6 +598,8 @@ BlobWorkerInterface decodeBlobWorkerListValue(ValueRef const& value); extern const KeyRangeRef tenantMapKeys; extern const KeyRef tenantMapPrefix; extern const KeyRef tenantMapPrivatePrefix; +extern const KeyRef tenantLastIdKey; +extern const KeyRef tenantDataPrefixKey; Value encodeTenantEntry(TenantMapEntry const& tenantEntry); TenantMapEntry decodeTenantEntry(ValueRef const& value); diff --git a/fdbclient/ThreadSafeTransaction.cpp 
b/fdbclient/ThreadSafeTransaction.cpp index ce2d6c39c4..9b0626361a 100644 --- a/fdbclient/ThreadSafeTransaction.cpp +++ b/fdbclient/ThreadSafeTransaction.cpp @@ -23,6 +23,7 @@ #include "fdbclient/ThreadSafeTransaction.h" #include "fdbclient/DatabaseContext.h" #include "fdbclient/versions.h" +#include "fdbclient/GenericManagementAPI.actor.h" #include "fdbclient/NativeAPI.actor.h" // Users of ThreadSafeTransaction might share Reference between different threads as long as they don't @@ -46,9 +47,13 @@ ThreadFuture> ThreadSafeDatabase::createFromExistingDatabas }); } +Reference ThreadSafeDatabase::openTenant(StringRef tenantName) { + return makeReference(Reference::addRef(this), tenantName); +} + Reference ThreadSafeDatabase::createTransaction() { auto type = isConfigDB ? ISingleThreadTransaction::Type::SIMPLE_CONFIG : ISingleThreadTransaction::Type::RYW; - return Reference(new ThreadSafeTransaction(db, type)); + return Reference(new ThreadSafeTransaction(db, type, Optional())); } void ThreadSafeDatabase::setOption(FDBDatabaseOptions::Option option, Optional value) { @@ -111,6 +116,24 @@ ThreadFuture ThreadSafeDatabase::getServerProtocol(Optional Future { return db->getClusterProtocol(expectedVersion); }); } +// Registers a tenant with the given name. A prefix is automatically allocated for the tenant. +ThreadFuture ThreadSafeDatabase::createTenant(StringRef const& name) { + DatabaseContext* db = this->db; + TenantName tenantNameCopy = name; + return onMainThread([db, tenantNameCopy]() -> Future { + return ManagementAPI::createTenant(Reference::addRef(db), tenantNameCopy); + }); +} + +// Deletes the tenant with the given name. The tenant must be empty. 
+ThreadFuture ThreadSafeDatabase::deleteTenant(StringRef const& name) { + DatabaseContext* db = this->db; + TenantName tenantNameCopy = name; + return onMainThread([db, tenantNameCopy]() -> Future { + return ManagementAPI::deleteTenant(Reference::addRef(db), tenantNameCopy); + }); +} + ThreadSafeDatabase::ThreadSafeDatabase(std::string connFilename, int apiVersion) { ClusterConnectionFile* connFile = new ClusterConnectionFile(ClusterConnectionFile::lookupClusterFileName(connFilename).first); @@ -139,7 +162,16 @@ ThreadSafeDatabase::~ThreadSafeDatabase() { onMainThreadVoid([db]() { db->delref(); }, nullptr); } -ThreadSafeTransaction::ThreadSafeTransaction(DatabaseContext* cx, ISingleThreadTransaction::Type type) { +Reference ThreadSafeTenant::createTransaction() { + auto type = db->isConfigDB ? ISingleThreadTransaction::Type::SIMPLE_CONFIG : ISingleThreadTransaction::Type::RYW; + return Reference(new ThreadSafeTransaction(db->db, type, name)); +} + +ThreadSafeTenant::~ThreadSafeTenant() {} + +ThreadSafeTransaction::ThreadSafeTransaction(DatabaseContext* cx, + ISingleThreadTransaction::Type type, + Optional tenant) { // Allocate memory for the transaction from this thread (so the pointer is known for subsequent method calls) // but run its constructor on the main thread @@ -150,9 +182,13 @@ ThreadSafeTransaction::ThreadSafeTransaction(DatabaseContext* cx, ISingleThreadT auto tr = this->tr = ISingleThreadTransaction::allocateOnForeignThread(type); // No deferred error -- if the construction of the RYW transaction fails, we have no where to put it onMainThreadVoid( - [tr, cx]() { + [tr, cx, tenant]() { cx->addref(); - tr->setDatabase(Database(cx)); + if (tenant.present()) { + tr->construct(Database(cx), tenant.get()); + } else { + tr->construct(Database(cx)); + } }, nullptr); } diff --git a/fdbclient/ThreadSafeTransaction.h b/fdbclient/ThreadSafeTransaction.h index 6c03262891..938e21b429 100644 --- a/fdbclient/ThreadSafeTransaction.h +++ 
b/fdbclient/ThreadSafeTransaction.h @@ -35,6 +35,7 @@ public: ~ThreadSafeDatabase() override; static ThreadFuture> createFromExistingDatabase(Database cx); + Reference openTenant(StringRef tenantName) override; Reference createTransaction() override; void setOption(FDBDatabaseOptions::Option option, Optional value = Optional()) override; @@ -46,6 +47,12 @@ public: ThreadFuture getServerProtocol( Optional expectedVersion = Optional()) override; + // Registers a tenant with the given name. A prefix is automatically allocated for the tenant. + ThreadFuture createTenant(StringRef const& name) override; + + // Deletes the tenant with the given name. The tenant must be empty. + ThreadFuture deleteTenant(StringRef const& name) override; + // Returns after a majority of coordination servers are available and have reported a leader. The // cluster file therefore is valid, but the database might be unavailable. ThreadFuture onConnected(); @@ -58,6 +65,7 @@ public: ThreadFuture createSnapshot(const StringRef& uid, const StringRef& snapshot_command) override; private: + friend class ThreadSafeTenant; friend class ThreadSafeTransaction; bool isConfigDB{ false }; DatabaseContext* db; @@ -68,11 +76,28 @@ public: // Internal use only DatabaseContext* unsafeGetPtr() const { return db; } }; +class ThreadSafeTenant : public ITenant, ThreadSafeReferenceCounted, NonCopyable { +public: + ThreadSafeTenant(Reference db, StringRef name) : db(db), name(name) {} + ~ThreadSafeTenant() override; + + Reference createTransaction() override; + + void addref() override { ThreadSafeReferenceCounted::addref(); } + void delref() override { ThreadSafeReferenceCounted::delref(); } + +private: + Reference db; + Standalone name; +}; + // An implementation of ITransaction that serializes operations onto the network thread and interacts with the // lower-level client APIs exposed by ISingleThreadTransaction class ThreadSafeTransaction : public ITransaction, ThreadSafeReferenceCounted, NonCopyable { public: 
- explicit ThreadSafeTransaction(DatabaseContext* cx, ISingleThreadTransaction::Type type); + explicit ThreadSafeTransaction(DatabaseContext* cx, + ISingleThreadTransaction::Type type, + Optional tenant); ~ThreadSafeTransaction() override; // Note: used while refactoring fdbcli, need to be removed later diff --git a/fdbclient/vexillographer/fdb.options b/fdbclient/vexillographer/fdb.options index c4845ae8bd..9f2f9e52f9 100644 --- a/fdbclient/vexillographer/fdb.options +++ b/fdbclient/vexillographer/fdb.options @@ -230,9 +230,11 @@ description is not currently required but encouraged.

MultiVersionDatabase::createTenant(StringRef const& tenantName) { +ThreadFuture MultiVersionDatabase::createTenant(TenantNameRef const& tenantName) { Standalone tenantNameCopy = tenantName; Reference self = Reference::addRef(this); @@ -1414,7 +1422,7 @@ ThreadFuture MultiVersionDatabase::createTenant(StringRef const& tenantNam } // Deletes the tenant with the given name. The tenant must be empty. -ThreadFuture MultiVersionDatabase::deleteTenant(StringRef const& tenantName) { +ThreadFuture MultiVersionDatabase::deleteTenant(TenantNameRef const& tenantName) { Standalone tenantNameCopy = tenantName; Reference self = Reference::addRef(this); diff --git a/fdbclient/MultiVersionTransaction.h b/fdbclient/MultiVersionTransaction.h index 5beed79444..38e662bb60 100644 --- a/fdbclient/MultiVersionTransaction.h +++ b/fdbclient/MultiVersionTransaction.h @@ -364,6 +364,11 @@ public: ThreadFuture onError(Error const& e) override; void reset() override; + Optional getTenant() override { + ASSERT(false); + throw internal_error(); + } + void addref() override { ThreadSafeReferenceCounted::addref(); } void delref() override { ThreadSafeReferenceCounted::delref(); } @@ -405,7 +410,7 @@ public: ThreadFuture onReady(); - Reference openTenant(StringRef tenantName) override; + Reference openTenant(TenantNameRef tenantName) override; Reference createTransaction() override; void setOption(FDBDatabaseOptions::Option option, Optional value = Optional()) override; double getMainThreadBusyness() override; @@ -417,10 +422,10 @@ public: Optional expectedVersion = Optional()) override; // Registers a tenant with the given name. A prefix is automatically allocated for the tenant. - ThreadFuture createTenant(StringRef const& tenantName) override; + ThreadFuture createTenant(TenantNameRef const& tenantName) override; // Deletes the tenant with the given name. The tenant must be empty. 
- ThreadFuture deleteTenant(StringRef const& tenantName) override; + ThreadFuture deleteTenant(TenantNameRef const& tenantName) override; void addref() override { ThreadSafeReferenceCounted::addref(); } void delref() override { ThreadSafeReferenceCounted::delref(); } @@ -546,6 +551,8 @@ public: ThreadFuture onError(Error const& e) override; void reset() override; + Optional getTenant() override; + void addref() override { ThreadSafeReferenceCounted::addref(); } void delref() override { ThreadSafeReferenceCounted::delref(); } @@ -595,6 +602,8 @@ private: void setDefaultOptions(UniqueOrderedOptionList options); std::vector>>> persistentOptions; + + const Optional tenantName; }; struct ClientDesc { @@ -639,10 +648,10 @@ public: void delref() override { ThreadSafeReferenceCounted::delref(); } Reference>> tenantVar; + const Standalone tenantName; private: Reference db; - const Standalone tenantName; Mutex tenantLock; ThreadFuture tenantUpdater; @@ -666,7 +675,7 @@ public: ~MultiVersionDatabase() override; - Reference openTenant(StringRef tenantName) override; + Reference openTenant(TenantNameRef tenantName) override; Reference createTransaction() override; void setOption(FDBDatabaseOptions::Option option, Optional value = Optional()) override; double getMainThreadBusyness() override; @@ -678,10 +687,10 @@ public: Optional expectedVersion = Optional()) override; // Registers a tenant with the given name. A prefix is automatically allocated for the tenant. - ThreadFuture createTenant(StringRef const& tenantName) override; + ThreadFuture createTenant(TenantNameRef const& tenantName) override; // Deletes the tenant with the given name. The tenant must be empty. 
- ThreadFuture deleteTenant(StringRef const& tenantName) override; + ThreadFuture deleteTenant(TenantNameRef const& tenantName) override; void addref() override { ThreadSafeReferenceCounted::addref(); } void delref() override { ThreadSafeReferenceCounted::delref(); } diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index 1429ab25b7..c14f180d4f 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -449,6 +449,7 @@ public: } Future getTenantPrefix(); + Optional getTenant() { return trState->tenant; } Reference trState; std::vector> watches; diff --git a/fdbclient/ReadYourWrites.h b/fdbclient/ReadYourWrites.h index db2ab5dac2..887fd89c6c 100644 --- a/fdbclient/ReadYourWrites.h +++ b/fdbclient/ReadYourWrites.h @@ -191,6 +191,8 @@ public: void setSpecialKeySpaceErrorMsg(const std::string& msg) { specialKeySpaceErrorMsg = msg; } Transaction& getTransaction() { return tr; } + Optional getTenant() { return tr.getTenant(); } + // used in template functions as returned Future type template using FutureT = Future; diff --git a/fdbclient/ThreadSafeTransaction.cpp b/fdbclient/ThreadSafeTransaction.cpp index 9b0626361a..1e16071fd5 100644 --- a/fdbclient/ThreadSafeTransaction.cpp +++ b/fdbclient/ThreadSafeTransaction.cpp @@ -47,7 +47,7 @@ ThreadFuture> ThreadSafeDatabase::createFromExistingDatabas }); } -Reference ThreadSafeDatabase::openTenant(StringRef tenantName) { +Reference ThreadSafeDatabase::openTenant(TenantNameRef tenantName) { return makeReference(Reference::addRef(this), tenantName); } @@ -117,7 +117,7 @@ ThreadFuture ThreadSafeDatabase::getServerProtocol(Optional ThreadSafeDatabase::createTenant(StringRef const& name) { +ThreadFuture ThreadSafeDatabase::createTenant(TenantNameRef const& name) { DatabaseContext* db = this->db; TenantName tenantNameCopy = name; return onMainThread([db, tenantNameCopy]() -> Future { @@ -126,7 +126,7 @@ ThreadFuture ThreadSafeDatabase::createTenant(StringRef const& name) { } // Deletes the 
tenant with the given name. The tenant must be empty. -ThreadFuture ThreadSafeDatabase::deleteTenant(StringRef const& name) { +ThreadFuture ThreadSafeDatabase::deleteTenant(TenantNameRef const& name) { DatabaseContext* db = this->db; TenantName tenantNameCopy = name; return onMainThread([db, tenantNameCopy]() -> Future { @@ -171,7 +171,8 @@ ThreadSafeTenant::~ThreadSafeTenant() {} ThreadSafeTransaction::ThreadSafeTransaction(DatabaseContext* cx, ISingleThreadTransaction::Type type, - Optional tenant) { + Optional tenant) + : tenantName(tenant) { // Allocate memory for the transaction from this thread (so the pointer is known for subsequent method calls) // but run its constructor on the main thread @@ -497,6 +498,10 @@ ThreadFuture ThreadSafeTransaction::onError(Error const& e) { return onMainThread([tr, e]() { return tr->onError(e); }); } +Optional ThreadSafeTransaction::getTenant() { + return tenantName; +} + void ThreadSafeTransaction::operator=(ThreadSafeTransaction&& r) noexcept { tr = r.tr; r.tr = nullptr; diff --git a/fdbclient/ThreadSafeTransaction.h b/fdbclient/ThreadSafeTransaction.h index 938e21b429..8a7b0ff68f 100644 --- a/fdbclient/ThreadSafeTransaction.h +++ b/fdbclient/ThreadSafeTransaction.h @@ -35,7 +35,7 @@ public: ~ThreadSafeDatabase() override; static ThreadFuture> createFromExistingDatabase(Database cx); - Reference openTenant(StringRef tenantName) override; + Reference openTenant(TenantNameRef tenantName) override; Reference createTransaction() override; void setOption(FDBDatabaseOptions::Option option, Optional value = Optional()) override; @@ -48,10 +48,10 @@ public: Optional expectedVersion = Optional()) override; // Registers a tenant with the given name. A prefix is automatically allocated for the tenant. - ThreadFuture createTenant(StringRef const& name) override; + ThreadFuture createTenant(TenantNameRef const& name) override; // Deletes the tenant with the given name. The tenant must be empty. 
- ThreadFuture deleteTenant(StringRef const& name) override; + ThreadFuture deleteTenant(TenantNameRef const& name) override; // Returns after a majority of coordination servers are available and have reported a leader. The // cluster file therefore is valid, but the database might be unavailable. @@ -174,6 +174,8 @@ public: ThreadFuture checkDeferredError(); ThreadFuture onError(Error const& e) override; + Optional getTenant() override; + // These are to permit use as state variables in actors: ThreadSafeTransaction() : tr(nullptr) {} void operator=(ThreadSafeTransaction&& r) noexcept; @@ -186,6 +188,7 @@ public: private: ISingleThreadTransaction* tr; + const Optional tenantName; }; // An implementation of IClientApi that serializes operations onto the network thread and interacts with the lower-level From a168faf219f5dbec07e0464fa8598fd8e5c0a177 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 23 Feb 2022 08:00:03 -0800 Subject: [PATCH 081/138] Fix bugs in serialization and tenant prefix management on the client. 
--- fdbclient/NativeAPI.actor.cpp | 65 ++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 350cf107ab..5eb2331c91 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -2889,6 +2889,8 @@ TenantInfo TransactionState::getTenantInfo() const { if (!cx->internal && !options.rawAccess && cx->clientInfo->get().tenantMode == TenantMode::REQUIRED && !tenant.present()) { throw tenant_name_required(); + } else if (options.rawAccess) { + return TenantInfo(); } return TenantInfo(tenant); @@ -3043,13 +3045,9 @@ ACTOR Future getKey(Reference trState, // k.getKey()).detail("Offset",k.offset).detail("OrEqual",k.orEqual); } - state Key resolvedTenantPrefix; - if (applyTenantPrefix) { - Key _resolvedTenantPrefix = wait(trState->tenantPrefix); - resolvedTenantPrefix = _resolvedTenantPrefix; - if (resolvedTenantPrefix.size() > 0) { - k = KeySelectorRef(k.getKey().withPrefix(resolvedTenantPrefix), k.orEqual, k.offset); - } + state Key resolvedTenantPrefix = wait(trState->tenantPrefix); + if (applyTenantPrefix && resolvedTenantPrefix.size() > 0) { + k = KeySelectorRef(k.getKey().withPrefix(resolvedTenantPrefix), k.orEqual, k.offset); } loop { @@ -3679,9 +3677,11 @@ Future getRangeFallback(Reference trState, trState, version, KeyRangeRef(b, e), mapper, limits, reverse, useTenant)); RangeResultFamily r = _r; - if (b == allKeys.begin && ((reverse && !r.more) || !reverse)) + ASSERT(trState->tenantPrefix.isReady()); + + if (b.removePrefix(trState->tenantPrefix.get()) == allKeys.begin && ((reverse && !r.more) || !reverse)) r.readToBegin = true; - if (e == allKeys.end && ((!reverse && !r.more) || reverse)) + if (e.removePrefix(trState->tenantPrefix.get()) == allKeys.end && ((!reverse && !r.more) || reverse)) r.readThroughEnd = true; ASSERT(!limits.hasRowLimit() || r.size() <= limits.rows); @@ -4002,6 +4002,7 @@ Future getRange(Reference trState, if 
(readThrough) { output.arena().dependsOn(shard.arena()); output.readThrough = reverse ? shard.begin : shard.end; + ASSERT(output.readThrough.get().startsWith(tenantPrefix)); } getRangeFinished(trState, @@ -4023,6 +4024,7 @@ Future getRange(Reference trState, if (readThrough) { output.arena().dependsOn(shard.arena()); output.readThrough = reverse ? shard.begin : shard.end; + ASSERT(output.readThrough.get().startsWith(tenantPrefix)); } output.more = modifiedSelectors || limits.isReached() || rep.more; @@ -4438,7 +4440,15 @@ ACTOR Future getRangeStreamFragment(Reference trState, if (trState->debugID.present()) g_traceBatch.addEvent( "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getExactRange.After"); - RangeResult output(RangeResultRef(rep.data, rep.more), rep.arena); + + ASSERT(trState->tenantPrefix.isReady()); + Key tenantPrefix = trState->tenantPrefix.get(); + + RangeResult output; + output.more = rep.more; + for (auto rr : rep.data) { + output.push_back_deep(output.arena(), KeyValueRef(rr.key.removePrefix(tenantPrefix), rr.value)); + } if (tssDuplicateStream.present() && !tssDuplicateStream.get().done()) { // shallow copy the reply with an arena depends, and send it to the duplicate stream for TSS @@ -4446,8 +4456,8 @@ ACTOR Future getRangeStreamFragment(Reference trState, replyCopy.version = rep.version; replyCopy.more = rep.more; replyCopy.cached = rep.cached; - replyCopy.arena.dependsOn(rep.arena); - replyCopy.data.append(replyCopy.arena, rep.data.begin(), rep.data.size()); + replyCopy.arena.dependsOn(output.arena()); + replyCopy.data.append(replyCopy.arena, output.begin(), output.size()); tssDuplicateStream.get().stream.send(replyCopy); } @@ -4461,7 +4471,7 @@ ACTOR Future getRangeStreamFragment(Reference trState, // If the reply says there is more but we know that we finished the shard, then fix rep.more if (reverse && output.more && rep.data.size() > 0 && - output[output.size() - 1].key == locations[shard].first.begin) { + 
rep.data[rep.data.size() - 1].key == locations[shard].first.begin) { output.more = false; } @@ -4481,10 +4491,10 @@ ACTOR Future getRangeStreamFragment(Reference trState, // Make next request to the same shard with a beginning key just after the last key returned if (reverse) locations[shard].first = - KeyRangeRef(locations[shard].first.begin, output[output.size() - 1].key); + KeyRangeRef(locations[shard].first.begin, rep.data[rep.data.size() - 1].key); else locations[shard].first = - KeyRangeRef(keyAfter(output[output.size() - 1].key), locations[shard].first.end); + KeyRangeRef(keyAfter(rep.data[rep.data.size() - 1].key), locations[shard].first.end); } if (locations[shard].first.empty()) { @@ -4525,10 +4535,10 @@ ACTOR Future getRangeStreamFragment(Reference trState, } ASSERT(output.size()); - if (keys.begin == allKeys.begin && !reverse) { + if (keys.begin.removePrefix(tenantPrefix) == allKeys.begin && !reverse) { output.readToBegin = true; } - if (keys.end == allKeys.end && reverse) { + if (keys.end.removePrefix(tenantPrefix) == allKeys.end && reverse) { output.readThroughEnd = true; } results->send(std::move(output)); @@ -5719,14 +5729,9 @@ ACTOR static Future tryCommit(Reference trState, wait(store(req.transaction.read_snapshot, readVersion)); } - try { - Key resolvedTenantPrefix = wait(trState->tenantPrefix); - if (!resolvedTenantPrefix.empty()) { - applyTenantPrefix(req, resolvedTenantPrefix); - } - } catch (Error& e) { - // TODO: use a different error here? 
- throw not_committed(); + state Key resolvedTenantPrefix = wait(trState->tenantPrefix); + if (!resolvedTenantPrefix.empty()) { + applyTenantPrefix(req, resolvedTenantPrefix); } startTime = now(); @@ -5817,8 +5822,9 @@ ACTOR static Future tryCommit(Reference trState, conflictingKRIndices.end()); for (auto const& rCRIndex : mergedIds) { const KeyRangeRef kr = req.transaction.read_conflict_ranges[rCRIndex]; - const KeyRange krWithPrefix = KeyRangeRef(kr.begin.withPrefix(conflictingKeysRange.begin), - kr.end.withPrefix(conflictingKeysRange.begin)); + const KeyRange krWithPrefix = KeyRangeRef( + kr.begin.removePrefix(resolvedTenantPrefix).withPrefix(conflictingKeysRange.begin), + kr.end.removePrefix(resolvedTenantPrefix).withPrefix(conflictingKeysRange.begin)); trState->conflictingKeys->insert(krWithPrefix, conflictingKeysTrue); } } @@ -5861,7 +5867,8 @@ ACTOR static Future tryCommit(Reference trState, } else { if (e.code() != error_code_transaction_too_old && e.code() != error_code_not_committed && e.code() != error_code_database_locked && e.code() != error_code_proxy_memory_limit_exceeded && - e.code() != error_code_batch_transaction_throttled && e.code() != error_code_tag_throttled) { + e.code() != error_code_batch_transaction_throttled && e.code() != error_code_tag_throttled && + e.code() != error_code_process_behind && e.code() != error_code_future_version) { TraceEvent(SevError, "TryCommitError").error(e); } if (trState->trLogInfo) @@ -6214,6 +6221,8 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optionaloptions.rawAccess = true; + trState->tenant = Optional(); + tr.tenantInfo = TenantInfo(); break; default: From 59aa2abb7ed6a847d7631e0001a6a51e88fe7806 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Wed, 23 Feb 2022 08:07:19 -0800 Subject: [PATCH 082/138] Add some additional logging and buggification to the create and delete tenant functions --- fdbclient/GenericManagementAPI.actor.h | 32 ++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/fdbclient/GenericManagementAPI.actor.h b/fdbclient/GenericManagementAPI.actor.h index ae481a0b15..0c80f773e8 100644 --- a/fdbclient/GenericManagementAPI.actor.h +++ b/fdbclient/GenericManagementAPI.actor.h @@ -681,6 +681,8 @@ Future createTenant(Reference db, TenantName name) { // If the tenant did not exist when we started trying to create it, then we will return success // even if someone else created it simultaneously. This helps us avoid problems if the commit // result for creating this tenant is unknown. + Version readVersion = wait(safeThreadFutureToFuture(tr->getReadVersion())); + TraceEvent("CreatedTenantAlready").detail("Tenant", name).detail("ReadVersion", readVersion); return Void(); } } else { @@ -704,11 +706,21 @@ Future createTenant(Reference db, TenantName name) { tr->set(tenantLastIdKey, TenantMapEntry::idToPrefix(newTenant.id)); tr->set(tenantMapKey, encodeTenantEntry(newTenant)); + if (BUGGIFY) { + throw commit_unknown_result(); + } + wait(safeThreadFutureToFuture(tr->commit())); + + if (BUGGIFY) { + throw commit_unknown_result(); + } + TraceEvent("CreatedTenant") .detail("Tenant", name) - .detail("ID", newTenant.id) - .detail("Prefix", newTenant.prefix); + .detail("TenantId", newTenant.id) + .detail("Prefix", newTenant.prefix) + .detail("Version", tr->getCommittedVersion()); return Void(); } catch (Error& e) { @@ -728,7 +740,7 @@ Future deleteTenant(Reference db, TenantName name) { try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); - Optional tenantEntry = wait(tryGetTenant(db, name)); + state Optional tenantEntry = wait(tryGetTenant(db, name)); if (!tenantEntry.present()) { if 
(!tenantCheckCompleted) { throw tenant_not_found(); @@ -736,6 +748,8 @@ Future deleteTenant(Reference db, TenantName name) { // If the tenant existed when we started trying to delete it, then we will return success // even if someone else deleted it simultaneously. This helps us avoid problems if the commit // result for deleting this tenant is unknown. + Version readVersion = wait(safeThreadFutureToFuture(tr->getReadVersion())); + TraceEvent("DeletedTenantAlready").detail("Tenant", name).detail("ReadVersion", readVersion); return Void(); } } else { @@ -749,8 +763,18 @@ Future deleteTenant(Reference db, TenantName name) { } tr->clear(tenantMapKey); + + if (BUGGIFY) { + throw commit_unknown_result(); + } + wait(safeThreadFutureToFuture(tr->commit())); - TraceEvent("DeletedTenant").detail("Tenant", name); + + if (BUGGIFY) { + throw commit_unknown_result(); + } + + TraceEvent("DeletedTenant").detail("Tenant", name).detail("Version", tr->getCommittedVersion()); return Void(); } catch (Error& e) { wait(safeThreadFutureToFuture(tr->onError(e))); From 2a211260285f62cb83b68710e79bdd4ab396db30 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 25 Feb 2022 11:43:20 -0800 Subject: [PATCH 083/138] Don't apply read prefixes on the client. Cache tenant data locally. 
--- fdbclient/ClientKnobs.cpp | 2 + fdbclient/ClientKnobs.h | 2 + fdbclient/DatabaseContext.h | 32 +- fdbclient/NativeAPI.actor.cpp | 809 +++++++++---------- fdbclient/NativeAPI.actor.h | 5 +- fdbserver/QuietDatabase.actor.cpp | 3 + fdbserver/storageserver.actor.cpp | 4 +- fdbserver/workloads/RandomMoveKeys.actor.cpp | 2 + 8 files changed, 419 insertions(+), 440 deletions(-) diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index 7e8ae46cc4..d57441bbda 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -89,6 +89,8 @@ void ClientKnobs::initialize(Randomize randomize) { init( LOCATION_CACHE_EVICTION_SIZE_SIM, 10 ); if( randomize && BUGGIFY ) LOCATION_CACHE_EVICTION_SIZE_SIM = 3; init( LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD, 60 ); init( LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL, 60 ); + init( TENANT_CACHE_EVICTION_SIZE, 100000 ); + init( TENANT_CACHE_EVICTION_SIZE, 10 ); if( randomize && BUGGIFY ) TENANT_CACHE_EVICTION_SIZE_SIM = 3; init( GET_RANGE_SHARD_LIMIT, 2 ); init( WARM_RANGE_SHARD_LIMIT, 100 ); diff --git a/fdbclient/ClientKnobs.h b/fdbclient/ClientKnobs.h index 7b01e1c1c0..c4597a6d8d 100644 --- a/fdbclient/ClientKnobs.h +++ b/fdbclient/ClientKnobs.h @@ -89,6 +89,8 @@ public: int LOCATION_CACHE_EVICTION_SIZE_SIM; double LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD; double LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL; + int TENANT_CACHE_EVICTION_SIZE; + int TENANT_CACHE_EVICTION_SIZE_SIM; int GET_RANGE_SHARD_LIMIT; int WARM_RANGE_SHARD_LIMIT; diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index c752c63a32..ca08273518 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -205,6 +205,16 @@ struct EndpointFailureInfo { double lastRefreshTime = 0; }; +struct KeyRangeLocationInfo { + TenantMapEntry tenantEntry; + KeyRange range; + Reference locations; + + KeyRangeLocationInfo() {} + KeyRangeLocationInfo(TenantMapEntry tenantEntry, KeyRange range, Reference locations) + 
: tenantEntry(tenantEntry), range(range), locations(locations) {} +}; + class DatabaseContext : public ReferenceCounted, public FastAllocated, NonCopyable { public: static DatabaseContext* allocateOnForeignThread() { @@ -239,14 +249,22 @@ public: switchable)); } - std::pair> getCachedLocation(const KeyRef&, Reverse isBackward = Reverse::False); - bool getCachedLocations(const KeyRangeRef&, - std::vector>>&, + Optional getCachedLocation(const Optional& tenant, + const KeyRef&, + Reverse isBackward = Reverse::False); + bool getCachedLocations(const Optional& tenant, + const KeyRangeRef&, + std::vector&, int limit, Reverse reverse); - Reference setCachedLocation(const KeyRangeRef&, const std::vector&); - void invalidateCache(const KeyRef&, Reverse isBackward = Reverse::False); - void invalidateCache(const KeyRangeRef&); + void cacheTenant(const TenantName& tenant, const TenantMapEntry& tenantEntry); + Reference setCachedLocation(const Optional& tenant, + const TenantMapEntry& tenantEntry, + const KeyRangeRef&, + const std::vector&); + void invalidateCachedTenant(const TenantNameRef& tenant); + void invalidateCache(const KeyRef& tenantPrefix, const KeyRef& key, Reverse isBackward = Reverse::False); + void invalidateCache(const KeyRef& tenantPrefix, const KeyRangeRef& keys); // Records that `endpoint` is failed on a healthy server. 
void setFailedEndpointOnHealthyServer(const Endpoint& endpoint); @@ -409,8 +427,10 @@ public: // Cache of location information int locationCacheSize; + int tenantCacheSize; CoalescedKeyRangeMap> locationCache; std::unordered_map failedEndpointsOnHealthyServersInfo; + std::unordered_map tenantCache; std::map server_interf; std::map blobWorker_interf; // blob workers don't change endpoints for the same ID diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 5eb2331c91..c26ddd58c6 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -132,9 +132,6 @@ FDB_DEFINE_BOOLEAN_PARAM(UseProvisionalProxies); // Whether or not a request should include the tenant name FDB_BOOLEAN_PARAM(UseTenant); -// Whether or not a function should implicitly add the tenant prefix to the request and/or remove it from the result -FDB_BOOLEAN_PARAM(ApplyTenantPrefix); - NetworkOptions networkOptions; TLSConfig tlsConfig(TLSEndpointType::CLIENT); @@ -1358,6 +1355,8 @@ DatabaseContext::DatabaseContext(ReferenceisSimulated() ? CLIENT_KNOBS->LOCATION_CACHE_EVICTION_SIZE_SIM : CLIENT_KNOBS->LOCATION_CACHE_EVICTION_SIZE; + tenantCacheSize = g_network->isSimulated() ? 
CLIENT_KNOBS->TENANT_CACHE_EVICTION_SIZE_SIM + : CLIENT_KNOBS->TENANT_CACHE_EVICTION_SIZE; getValueSubmitted.init(LiteralStringRef("NativeAPI.GetValueSubmitted")); getValueCompleted.init(LiteralStringRef("NativeAPI.GetValueCompleted")); @@ -1636,24 +1635,55 @@ DatabaseContext::~DatabaseContext() { locationCache.insert(allKeys, Reference()); } -std::pair> DatabaseContext::getCachedLocation(const KeyRef& key, Reverse isBackward) { - if (isBackward) { - auto range = locationCache.rangeContainingKeyBefore(key); - return std::make_pair(range->range(), range->value()); - } else { - auto range = locationCache.rangeContaining(key); - return std::make_pair(range->range(), range->value()); +Optional DatabaseContext::getCachedLocation(const Optional& tenantName, + const KeyRef& key, + Reverse isBackward) { + TenantMapEntry tenantEntry; + Arena arena; + KeyRef resolvedKey = key; + + if (tenantName.present()) { + auto itr = tenantCache.find(tenantName.get()); + if (itr != tenantCache.end()) { + tenantEntry = itr->second; + resolvedKey = resolvedKey.withPrefix(tenantEntry.prefix, arena); + } else { + return Optional(); + } } + + auto range = + isBackward ? 
locationCache.rangeContainingKeyBefore(resolvedKey) : locationCache.rangeContaining(resolvedKey); + if (range->value()) { + return KeyRangeLocationInfo(tenantEntry, range->range(), range->value()); + } + + return Optional(); } -bool DatabaseContext::getCachedLocations(const KeyRangeRef& range, - std::vector>>& result, +bool DatabaseContext::getCachedLocations(const Optional& tenantName, + const KeyRangeRef& range, + std::vector& result, int limit, Reverse reverse) { result.clear(); - auto begin = locationCache.rangeContaining(range.begin); - auto end = locationCache.rangeContainingKeyBefore(range.end); + TenantMapEntry tenantEntry; + Arena arena; + KeyRangeRef resolvedRange = range; + + if (tenantName.present()) { + auto itr = tenantCache.find(tenantName.get()); + if (itr != tenantCache.end()) { + tenantEntry = itr->second; + resolvedRange = resolvedRange.withPrefix(tenantEntry.prefix, arena); + } else { + return false; + } + } + + auto begin = locationCache.rangeContaining(resolvedRange.begin); + auto end = locationCache.rangeContainingKeyBefore(resolvedRange.end); loop { auto r = reverse ? 
end : begin; @@ -1662,7 +1692,7 @@ bool DatabaseContext::getCachedLocations(const KeyRangeRef& range, result.clear(); return false; } - result.emplace_back(r->range() & range, r->value()); + result.emplace_back(tenantEntry, r->range() & resolvedRange, r->value()); if (result.size() == limit || begin == end) { break; } @@ -1676,8 +1706,21 @@ bool DatabaseContext::getCachedLocations(const KeyRangeRef& range, return true; } -Reference DatabaseContext::setCachedLocation(const KeyRangeRef& keys, +void DatabaseContext::cacheTenant(const TenantName& tenant, const TenantMapEntry& tenantEntry) { + int attempts = 0; + while (tenantCache.size() > tenantCacheSize && attempts++ < 100) { + tenantCache[tenant] = tenantEntry; + } +} + +Reference DatabaseContext::setCachedLocation(const Optional& tenant, + const TenantMapEntry& tenantEntry, + const KeyRangeRef& absoluteKeys, const std::vector& servers) { + if (tenant.present()) { + cacheTenant(tenant.get(), tenantEntry); + } + std::vector>> serverRefs; serverRefs.reserve(servers.size()); for (const auto& interf : servers) { @@ -1693,20 +1736,36 @@ Reference DatabaseContext::setCachedLocation(const KeyRangeRef& ke Key begin = r.begin(), end = r.end(); // insert invalidates r, so can't be passed a mere reference into it locationCache.insert(KeyRangeRef(begin, end), Reference()); } - locationCache.insert(keys, loc); + locationCache.insert(absoluteKeys, loc); return loc; } -void DatabaseContext::invalidateCache(const KeyRef& key, Reverse isBackward) { +void DatabaseContext::invalidateCachedTenant(const TenantNameRef& tenant) { + tenantCache.erase(tenant); +} + +void DatabaseContext::invalidateCache(const KeyRef& tenantPrefix, const KeyRef& key, Reverse isBackward) { + Arena arena; + KeyRef resolvedKey = key; + if (!tenantPrefix.empty()) { + resolvedKey = resolvedKey.withPrefix(tenantPrefix, arena); + } + if (isBackward) { - locationCache.rangeContainingKeyBefore(key)->value() = Reference(); + 
locationCache.rangeContainingKeyBefore(resolvedKey)->value() = Reference(); } else { - locationCache.rangeContaining(key)->value() = Reference(); + locationCache.rangeContaining(resolvedKey)->value() = Reference(); } } -void DatabaseContext::invalidateCache(const KeyRangeRef& keys) { - auto rs = locationCache.intersectingRanges(keys); +void DatabaseContext::invalidateCache(const KeyRef& tenantPrefix, const KeyRangeRef& keys) { + Arena arena; + KeyRangeRef resolvedKeys = keys; + if (!tenantPrefix.empty()) { + resolvedKeys = resolvedKeys.withPrefix(tenantPrefix, arena); + } + + auto rs = locationCache.intersectingRanges(resolvedKeys); Key begin = rs.begin().begin(), end = rs.end().begin(); // insert invalidates rs, so can't be passed a mere reference into it locationCache.insert(KeyRangeRef(begin, end), Reference()); @@ -1865,7 +1924,7 @@ ACTOR static Future switchConnectionRecordImpl(ReferencecommitProxies.clear(); self->grvProxies.clear(); self->minAcceptableReadVersion = std::numeric_limits::max(); - self->invalidateCache(allKeys); + self->invalidateCache(Key(), allKeys); auto clearedClientInfo = self->clientInfo->get(); clearedClientInfo.commitProxies.clear(); @@ -2576,15 +2635,25 @@ void updateTssMappings(Database cx, const GetKeyServerLocationsReply& reply) { } } +KeyRangeRef toRelativeRange(KeyRangeRef range, KeyRef prefix) { + if (prefix.empty()) { + return range; + } else { + KeyRef begin = range.begin.startsWith(prefix) ? range.begin.removePrefix(prefix) : allKeys.begin; + KeyRef end = range.end.startsWith(prefix) ? range.end.removePrefix(prefix) : allKeys.end; + return KeyRangeRef(begin, end); + } +} + // If isBackward == true, returns the shard containing the key before 'key' (an infinitely long, inexpressible key). 
// Otherwise returns the shard containing key -ACTOR Future>> getKeyLocation_internal( - Database cx, - Key key, - SpanID spanID, - Optional debugID, - UseProvisionalProxies useProvisionalProxies, - Reverse isBackward) { +ACTOR Future getKeyLocation_internal(Database cx, + Optional tenant, + Key key, + SpanID spanID, + Optional debugID, + UseProvisionalProxies useProvisionalProxies, + Reverse isBackward) { state Span span("NAPI:getKeyLocation"_loc, spanID); if (isBackward) { @@ -2604,12 +2673,11 @@ ACTOR Future>> getKeyLocation_intern wait(basicLoadBalance(cx->getCommitProxies(useProvisionalProxies), &CommitProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(span.context, - Optional(), + tenant.castTo(), key, Optional(), 100, isBackward, - latestVersion, key.arena()), TaskPriority::DefaultPromiseEndpoint))) { ++cx->transactionKeyServerLocationRequestsCompleted; @@ -2617,9 +2685,14 @@ ACTOR Future>> getKeyLocation_intern g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getKeyLocation.After"); ASSERT(rep.results.size() == 1); - auto locationInfo = cx->setCachedLocation(rep.results[0].first, rep.results[0].second); + auto locationInfo = + cx->setCachedLocation(tenant, rep.tenantEntry, rep.results[0].first, rep.results[0].second); updateTssMappings(cx, rep); - return std::make_pair(KeyRange(rep.results[0].first, rep.arena), locationInfo); + + return KeyRangeLocationInfo( + rep.tenantEntry, + KeyRange(toRelativeRange(rep.results[0].first, rep.tenantEntry.prefix), rep.arena), + locationInfo); } } } @@ -2655,46 +2728,56 @@ bool checkOnlyEndpointFailed(const Database& cx, const Endpoint& endpoint) { } template -Future>> getKeyLocation(Database const& cx, - Key const& key, - F StorageServerInterface::*member, - SpanID spanID, - Optional debugID, - UseProvisionalProxies useProvisionalProxies, - Reverse isBackward = Reverse::False) { +Future getKeyLocation(Database const& cx, + Optional const& tenant, + Key const& key, + F 
StorageServerInterface::*member, + SpanID spanID, + Optional debugID, + UseProvisionalProxies useProvisionalProxies, + Reverse isBackward = Reverse::False) { // we first check whether this range is cached - auto ssi = cx->getCachedLocation(key, isBackward); - if (!ssi.second) { - return getKeyLocation_internal(cx, key, spanID, debugID, useProvisionalProxies, isBackward); + Optional locationInfo = cx->getCachedLocation(tenant, key, isBackward); + if (!locationInfo.present()) { + return getKeyLocation_internal(cx, tenant, key, spanID, debugID, useProvisionalProxies, isBackward); } bool onlyEndpointFailedAndNeedRefresh = false; - for (int i = 0; i < ssi.second->size(); i++) { - if (checkOnlyEndpointFailed(cx, ssi.second->get(i, member).getEndpoint())) { + for (int i = 0; i < locationInfo.get().locations->size(); i++) { + if (checkOnlyEndpointFailed(cx, locationInfo.get().locations->get(i, member).getEndpoint())) { onlyEndpointFailedAndNeedRefresh = true; } } if (onlyEndpointFailedAndNeedRefresh) { - cx->invalidateCache(key); + cx->invalidateCache(locationInfo.get().tenantEntry.prefix, key); + // Refresh the cache with a new getKeyLocations made to proxies. - return getKeyLocation_internal(cx, key, spanID, debugID, useProvisionalProxies, isBackward); + return getKeyLocation_internal(cx, tenant, key, spanID, debugID, useProvisionalProxies, isBackward); } - return ssi; + return locationInfo.get(); } template -Future>> getKeyLocation(Reference trState, - Key const& key, - F StorageServerInterface::*member, - Reverse isBackward = Reverse::False) { - return getKeyLocation( - trState->cx, key, member, trState->spanID, trState->debugID, trState->useProvisionalProxies, isBackward); +Future getKeyLocation(Reference trState, + Key const& key, + F StorageServerInterface::*member, + Reverse isBackward = Reverse::False, + UseTenant useTenant = UseTenant::True) { + return getKeyLocation(trState->cx, + useTenant ? 
trState->tenant : Optional(), + key, + member, + trState->spanID, + trState->debugID, + trState->useProvisionalProxies, + isBackward); } -ACTOR Future>>> getKeyRangeLocations_internal( +ACTOR Future> getKeyRangeLocations_internal( Database cx, + Optional tenant, KeyRange keys, int limit, Reverse reverse, @@ -2713,12 +2796,11 @@ ACTOR Future>>> getKeyRa wait(basicLoadBalance(cx->getCommitProxies(useProvisionalProxies), &CommitProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(span.context, - Optional(), + tenant.castTo(), keys.begin, keys.end, limit, reverse, - latestVersion, keys.arena()), TaskPriority::DefaultPromiseEndpoint))) { ++cx->transactionKeyServerLocationRequestsCompleted; @@ -2727,13 +2809,16 @@ ACTOR Future>>> getKeyRa g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getKeyLocations.After"); ASSERT(rep.results.size()); - state std::vector>> results; + state std::vector results; state int shard = 0; for (; shard < rep.results.size(); shard++) { // FIXME: these shards are being inserted into the map sequentially, it would be much more CPU // efficient to save the map pairs and insert them all at once. - results.emplace_back(rep.results[shard].first & keys, - cx->setCachedLocation(rep.results[shard].first, rep.results[shard].second)); + results.emplace_back( + rep.tenantEntry, + (toRelativeRange(rep.results[shard].first, rep.tenantEntry.prefix) & keys), + cx->setCachedLocation( + tenant, rep.tenantEntry, rep.results[shard].first, rep.results[shard].second)); wait(yield()); } updateTssMappings(cx, rep); @@ -2751,63 +2836,71 @@ ACTOR Future>>> getKeyRa // Example: If query the function with key range (b, d), the returned list of pairs could be something like: // [([a, b1), locationInfo), ([b1, c), locationInfo), ([c, d1), locationInfo)]. 
template -Future>>> getKeyRangeLocations( - Database const& cx, - KeyRange const& keys, - int limit, - Reverse reverse, - F StorageServerInterface::*member, - SpanID const& spanID, - Optional const& debugID, - UseProvisionalProxies useProvisionalProxies) { +Future> getKeyRangeLocations(Database const& cx, + Optional tenant, + KeyRange const& keys, + int limit, + Reverse reverse, + F StorageServerInterface::*member, + SpanID const& spanID, + Optional const& debugID, + UseProvisionalProxies useProvisionalProxies) { ASSERT(!keys.empty()); - std::vector>> locations; - if (!cx->getCachedLocations(keys, locations, limit, reverse)) { - return getKeyRangeLocations_internal(cx, keys, limit, reverse, spanID, debugID, useProvisionalProxies); + std::vector locations; + if (!cx->getCachedLocations(tenant, keys, locations, limit, reverse)) { + return getKeyRangeLocations_internal(cx, tenant, keys, limit, reverse, spanID, debugID, useProvisionalProxies); } bool foundFailed = false; - for (const auto& [range, locInfo] : locations) { + for (const auto& locationInfo : locations) { bool onlyEndpointFailedAndNeedRefresh = false; - for (int i = 0; i < locInfo->size(); i++) { - if (checkOnlyEndpointFailed(cx, locInfo->get(i, member).getEndpoint())) { + for (int i = 0; i < locationInfo.locations->size(); i++) { + if (checkOnlyEndpointFailed(cx, locationInfo.locations->get(i, member).getEndpoint())) { onlyEndpointFailedAndNeedRefresh = true; } } if (onlyEndpointFailedAndNeedRefresh) { - cx->invalidateCache(range.begin); + cx->invalidateCache(locationInfo.tenantEntry.prefix, locationInfo.range.begin); foundFailed = true; } } if (foundFailed) { // Refresh the cache with a new getKeyRangeLocations made to proxies. 
- return getKeyRangeLocations_internal(cx, keys, limit, reverse, spanID, debugID, useProvisionalProxies); + return getKeyRangeLocations_internal(cx, tenant, keys, limit, reverse, spanID, debugID, useProvisionalProxies); } return locations; } template -Future>>> getKeyRangeLocations( - Reference trState, - KeyRange const& keys, - int limit, - Reverse reverse, - F StorageServerInterface::*member) { - return getKeyRangeLocations( - trState->cx, keys, limit, reverse, member, trState->spanID, trState->debugID, trState->useProvisionalProxies); +Future> getKeyRangeLocations(Reference trState, + KeyRange const& keys, + int limit, + Reverse reverse, + F StorageServerInterface::*member, + UseTenant useTenant = UseTenant::True) { + return getKeyRangeLocations(trState->cx, + useTenant ? trState->tenant : Optional(), + keys, + limit, + reverse, + member, + trState->spanID, + trState->debugID, + trState->useProvisionalProxies); } ACTOR Future warmRange_impl(Reference trState, KeyRange keys) { state int totalRanges = 0; state int totalRequests = 0; loop { - std::vector>> locations = + std::vector locations = wait(getKeyRangeLocations_internal(trState->cx, + trState->tenant, keys, CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT, Reverse::False, @@ -2817,14 +2910,14 @@ ACTOR Future warmRange_impl(Reference trState, KeyRange totalRanges += CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT; totalRequests++; if (locations.size() == 0 || totalRanges >= trState->cx->locationCacheSize || - locations[locations.size() - 1].first.end >= keys.end) + locations[locations.size() - 1].range.end >= keys.end) break; - keys = KeyRangeRef(locations[locations.size() - 1].first.end, keys.end); + keys = KeyRangeRef(locations[locations.size() - 1].range.end, keys.end); if (totalRequests % 20 == 0) { // To avoid blocking the proxies from starting other transactions, occasionally get a read version. 
- state Transaction tr(trState->cx); + state Transaction tr(trState->cx, trState->tenant); loop { try { tr.setOption(FDBTransactionOptions::LOCK_AWARE); @@ -2914,16 +3007,10 @@ ACTOR Future> getValue(Reference trState, span.addTag("key"_sr, key); trState->cx->validateVersion(ver); - if (useTenant) { - Key resolvedTenantPrefix = wait(trState->tenantPrefix); - if (resolvedTenantPrefix.size() > 0) { - key = key.withPrefix(resolvedTenantPrefix); - } - } - loop { - state std::pair> ssi = - wait(getKeyLocation(trState, key, &StorageServerInterface::getValue)); + state KeyRangeLocationInfo locationInfo = + wait(getKeyLocation(trState, key, &StorageServerInterface::getValue, Reverse::False, useTenant)); + state Optional getValueID = Optional(); state uint64_t startTime; state double startTimeD; @@ -2956,7 +3043,7 @@ ACTOR Future> getValue(Reference trState, when(wait(trState->cx->connectionFileChanged())) { throw transaction_too_old(); } when(GetValueReply _reply = wait(loadBalance( trState->cx.getPtr(), - ssi.second, + locationInfo.locations, &StorageServerInterface::getValue, GetValueRequest(span.context, useTenant ? trState->getTenantInfo() : TenantInfo(), @@ -3014,7 +3101,7 @@ ACTOR Future> getValue(Reference trState, } if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || (e.code() == error_code_transaction_too_old && ver == latestVersion)) { - trState->cx->invalidateCache(key); + trState->cx->invalidateCache(useTenant ? 
locationInfo.tenantEntry.prefix : Key(), key); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else { if (trState->trLogInfo && recordLogInfo) @@ -3026,10 +3113,7 @@ ACTOR Future> getValue(Reference trState, } } -ACTOR Future getKey(Reference trState, - KeySelector k, - Future version, - ApplyTenantPrefix applyTenantPrefix = ApplyTenantPrefix::True) { +ACTOR Future getKey(Reference trState, KeySelector k, Future version) { wait(success(version)); state Optional getKeyID = Optional(); @@ -3045,24 +3129,18 @@ ACTOR Future getKey(Reference trState, // k.getKey()).detail("Offset",k.offset).detail("OrEqual",k.orEqual); } - state Key resolvedTenantPrefix = wait(trState->tenantPrefix); - if (applyTenantPrefix && resolvedTenantPrefix.size() > 0) { - k = KeySelectorRef(k.getKey().withPrefix(resolvedTenantPrefix), k.orEqual, k.offset); - } - loop { - if (k.getKey().removePrefix(resolvedTenantPrefix) == allKeys.end) { + if (k.getKey() == allKeys.end) { if (k.offset > 0) { - return resolvedTenantPrefix.empty() || applyTenantPrefix ? allKeys.end - : allKeys.end.withPrefix(resolvedTenantPrefix); + return allKeys.end; } k.orEqual = false; - } else if (k.getKey().removePrefix(resolvedTenantPrefix) == allKeys.begin && k.offset <= 0) { - return applyTenantPrefix ? 
Key() : resolvedTenantPrefix; + } else if (k.getKey() == allKeys.begin && k.offset <= 0) { + return Key(); } Key locationKey(k.getKey(), k.arena()); - state std::pair> ssi = + state KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, locationKey, &StorageServerInterface::getKey, Reverse{ k.isBackward() })); try { @@ -3088,7 +3166,7 @@ ACTOR Future getKey(Reference trState, when(wait(trState->cx->connectionFileChanged())) { throw transaction_too_old(); } when(GetKeyReply _reply = wait(loadBalance( trState->cx.getPtr(), - ssi.second, + locationInfo.locations, &StorageServerInterface::getKey, req, TaskPriority::DefaultPromiseEndpoint, @@ -3109,17 +3187,13 @@ ACTOR Future getKey(Reference trState, // reply.sel.offset).detail("OrEqual", k.orEqual); k = reply.sel; if (!k.offset && k.orEqual) { - if (applyTenantPrefix) { - return k.getKey().removePrefix(resolvedTenantPrefix); - } else { - return k.getKey(); - } + return k.getKey(); } } catch (Error& e) { if (getKeyID.present()) g_traceBatch.addEvent("GetKeyDebug", getKeyID.get().first(), "NativeAPI.getKey.Error"); if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { - trState->cx->invalidateCache(k.getKey(), Reverse{ k.isBackward() }); + trState->cx->invalidateCache(locationInfo.tenantEntry.prefix, k.getKey(), Reverse{ k.isBackward() }); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else { @@ -3186,13 +3260,13 @@ ACTOR Future watchValue(Database cx, Reference p ASSERT(parameters->version != latestVersion); loop { - state std::pair> ssi = - wait(getKeyLocation(cx, - parameters->key, - &StorageServerInterface::watchValue, - parameters->spanID, - parameters->debugID, - parameters->useProvisionalProxies)); + state KeyRangeLocationInfo locationInfo = wait(getKeyLocation(cx, + parameters->tenant, + parameters->key, + &StorageServerInterface::watchValue, + parameters->spanID, + parameters->debugID, + parameters->useProvisionalProxies)); try { 
state Optional watchValueID = Optional(); @@ -3209,7 +3283,7 @@ ACTOR Future watchValue(Database cx, Reference p choose { when(WatchValueReply r = wait( loadBalance(cx.getPtr(), - ssi.second, + locationInfo.locations, &StorageServerInterface::watchValue, WatchValueRequest(span.context, TenantInfo(parameters->tenant), @@ -3240,7 +3314,7 @@ ACTOR Future watchValue(Database cx, Reference p ver = v; } catch (Error& e) { if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { - cx->invalidateCache(parameters->key); + cx->invalidateCache(locationInfo.tenantEntry.prefix, parameters->key); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, parameters->taskID)); } else if (e.code() == error_code_watch_cancelled || e.code() == error_code_process_behind) { // clang-format off @@ -3396,7 +3470,6 @@ Future getWatchFuture(Database cx, Reference parameters) ACTOR Future watchValueMap(Future version, Optional tenant, - Future tenantPrefix, Key key, Optional value, Database cx, @@ -3407,11 +3480,6 @@ ACTOR Future watchValueMap(Future version, UseProvisionalProxies useProvisionalProxies) { state Version ver = wait(version); - Key resolvedTenantPrefix = wait(tenantPrefix); - if (resolvedTenantPrefix.size()) { - key = key.withPrefix(resolvedTenantPrefix); - } - wait(getWatchFuture( cx, makeReference(tenant, key, value, ver, tags, spanID, taskID, debugID, useProvisionalProxies))); @@ -3468,16 +3536,17 @@ Future getExactRange(Reference trState, // printf("getExactRange( '%s', '%s' )\n", keys.begin.toString().c_str(), keys.end.toString().c_str()); loop { - state std::vector>> locations = + state std::vector locations = wait(getKeyRangeLocations(trState, keys, CLIENT_KNOBS->GET_RANGE_SHARD_LIMIT, reverse, - getRangeRequestStream())); + getRangeRequestStream(), + useTenant)); ASSERT(locations.size()); state int shard = 0; loop { - const KeyRangeRef& range = locations[shard].first; + const KeyRangeRef& range = locations[shard].range; 
GetKeyValuesFamilyRequest req; req.mapper = mapper; @@ -3490,7 +3559,7 @@ Future getExactRange(Reference trState, req.spanContext = span.context; // keep shard's arena around in case of async tss comparison - req.arena.dependsOn(locations[shard].first.arena()); + req.arena.dependsOn(locations[shard].range.arena()); transformRangeLimits(limits, reverse, req); ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse); @@ -3519,7 +3588,7 @@ Future getExactRange(Reference trState, when(wait(trState->cx->connectionFileChanged())) { throw transaction_too_old(); } when(GetKeyValuesFamilyReply _rep = wait(loadBalance( trState->cx.getPtr(), - locations[shard].second, + locations[shard].locations, getRangeRequestStream(), req, TaskPriority::DefaultPromiseEndpoint, @@ -3555,7 +3624,7 @@ Future getExactRange(Reference trState, bool more = rep.more; // If the reply says there is more but we know that we finished the shard, then fix rep.more if (reverse && more && rep.data.size() > 0 && - output[output.size() - 1].key == locations[shard].first.begin) + output[output.size() - 1].key == locations[shard].range.begin) more = false; if (more) { @@ -3573,17 +3642,17 @@ Future getExactRange(Reference trState, TEST(true); // GetKeyValuesFamilyReply.more in getExactRange // Make next request to the same shard with a beginning key just after the last key returned if (reverse) - locations[shard].first = - KeyRangeRef(locations[shard].first.begin, output[output.size() - 1].key); + locations[shard].range = + KeyRangeRef(locations[shard].range.begin, output[output.size() - 1].key); else - locations[shard].first = - KeyRangeRef(keyAfter(output[output.size() - 1].key), locations[shard].first.end); + locations[shard].range = + KeyRangeRef(keyAfter(output[output.size() - 1].key), locations[shard].range.end); } - if (!more || locations[shard].first.empty()) { + if (!more || locations[shard].range.empty()) { TEST(true); // getExactrange (!more || locations[shard].first.empty()) if 
(shard == locations.size() - 1) { - const KeyRangeRef& range = locations[shard].first; + const KeyRangeRef& range = locations[shard].range; KeyRef begin = reverse ? keys.begin : range.end; KeyRef end = reverse ? range.begin : keys.end; @@ -3610,21 +3679,23 @@ Future getExactRange(Reference trState, } catch (Error& e) { if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { - const KeyRangeRef& range = locations[shard].first; + const KeyRangeRef& range = locations[shard].range; if (reverse) keys = KeyRangeRef(keys.begin, range.end); else keys = KeyRangeRef(range.begin, keys.end); - trState->cx->invalidateCache(keys); + trState->cx->invalidateCache(locations[0].tenantEntry.prefix, keys); + wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); break; } else { TraceEvent(SevInfo, "GetExactRangeError") .error(e) - .detail("ShardBegin", locations[shard].first.begin) - .detail("ShardEnd", locations[shard].first.end); + .detail("Tenant", trState->tenant) + .detail("ShardBegin", locations[shard].range.begin) + .detail("ShardEnd", locations[shard].range.end); throw; } } @@ -3639,7 +3710,7 @@ Future resolveKey(Reference trState, KeySelector const& k if (key.isFirstGreaterThan()) return Future(keyAfter(key.getKey())); - return getKey(trState, key, version, ApplyTenantPrefix::False); + return getKey(trState, key, version); } ACTOR template @@ -3677,11 +3748,9 @@ Future getRangeFallback(Reference trState, trState, version, KeyRangeRef(b, e), mapper, limits, reverse, useTenant)); RangeResultFamily r = _r; - ASSERT(trState->tenantPrefix.isReady()); - - if (b.removePrefix(trState->tenantPrefix.get()) == allKeys.begin && ((reverse && !r.more) || !reverse)) + if (b == allKeys.begin && ((reverse && !r.more) || !reverse)) r.readToBegin = true; - if (e.removePrefix(trState->tenantPrefix.get()) == allKeys.end && ((!reverse && !r.more) || reverse)) + if (e == allKeys.end && ((!reverse && !r.more) || reverse)) r.readThroughEnd = 
true; ASSERT(!limits.hasRowLimit() || r.size() <= limits.rows); @@ -3733,10 +3802,8 @@ void getRangeFinished(Reference trState, Snapshot snapshot, Promise> conflictRange, Reverse reverse, - RangeResultFamily result, - UseTenant useTenant) { + RangeResultFamily result) { int64_t bytes = getRangeResultFamilyBytes(result); - ASSERT(!useTenant || trState->tenantPrefix.isReady()); trState->cx->transactionBytesRead += bytes; trState->cx->transactionKeysRead += result.size(); @@ -3796,6 +3863,8 @@ Future getRange(Reference trState, UseTenant useTenant = UseTenant::True) { // state using RangeResultRefFamily = typename RangeResultFamily::RefType; state GetRangeLimits originalLimits(limits); + state KeySelector originalBegin = begin; + state KeySelector originalEnd = end; state RangeResultFamily output; state Span span("NAPI:getRange"_loc, trState->spanID); if (useTenant && trState->tenant.present()) { @@ -3811,21 +3880,8 @@ Future getRange(Reference trState, // version that the first one completed // FIXME: Is this really right? Weaken this and see if there is a problem; // if so maybe there is a much subtler problem even with this. 
- state Key tenantPrefix; - if (useTenant) { - Key resolvedTenantPrefix = wait(trState->tenantPrefix); - tenantPrefix = resolvedTenantPrefix; - if (tenantPrefix.size() > 0) { - begin = KeySelectorRef(begin.getKey().withPrefix(tenantPrefix), begin.orEqual, begin.offset); - end = KeySelectorRef(end.getKey().withPrefix(tenantPrefix), end.orEqual, end.offset); - } - } - - state KeySelector originalBegin = begin; - state KeySelector originalEnd = end; - - if (begin.getKey().removePrefix(tenantPrefix) == allKeys.begin && begin.offset < 1) { + if (begin.getKey() == allKeys.begin && begin.offset < 1) { output.readToBegin = true; begin = KeySelector(firstGreaterOrEqual(begin.getKey()), begin.arena()); } @@ -3834,25 +3890,17 @@ Future getRange(Reference trState, ASSERT((!limits.hasRowLimit() || limits.rows >= limits.minRows) && limits.minRows >= 0); loop { - if (end.getKey().removePrefix(tenantPrefix) == allKeys.begin && - (end.offset < 1 || end.isFirstGreaterOrEqual())) { - getRangeFinished(trState, - startTime, - originalBegin, - originalEnd, - snapshot, - conflictRange, - reverse, - output, - useTenant); + if (end.getKey() == allKeys.begin && (end.offset < 1 || end.isFirstGreaterOrEqual())) { + getRangeFinished( + trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, output); return output; } Key locationKey = reverse ? Key(end.getKey(), end.arena()) : Key(begin.getKey(), begin.arena()); Reverse locationBackward{ reverse ? 
(end - 1).isBackward() : begin.isBackward() }; - state std::pair> beginServer = wait(getKeyLocation( - trState, locationKey, getRangeRequestStream(), locationBackward)); - state KeyRange shard = beginServer.first; + state KeyRangeLocationInfo beginServer = wait(getKeyLocation( + trState, locationKey, getRangeRequestStream(), locationBackward, useTenant)); + state KeyRange shard = beginServer.range; state bool modifiedSelectors = false; state GetKeyValuesFamilyRequest req; req.mapper = mapper; @@ -3926,7 +3974,7 @@ Future getRange(Reference trState, // state AnnotateActor annotation(currentLineage); GetKeyValuesFamilyReply _rep = wait(loadBalance(trState->cx.getPtr(), - beginServer.second, + beginServer.locations, getRangeRequestStream(), req, TaskPriority::DefaultPromiseEndpoint, @@ -3987,33 +4035,18 @@ Future getRange(Reference trState, output = copy; output.more = true; - getRangeFinished(trState, - startTime, - originalBegin, - originalEnd, - snapshot, - conflictRange, - reverse, - output, - useTenant); + getRangeFinished( + trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, output); return output; } if (readThrough) { output.arena().dependsOn(shard.arena()); output.readThrough = reverse ? shard.begin : shard.end; - ASSERT(output.readThrough.get().startsWith(tenantPrefix)); } - getRangeFinished(trState, - startTime, - originalBegin, - originalEnd, - snapshot, - conflictRange, - reverse, - output, - useTenant); + getRangeFinished( + trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, output); return output; } @@ -4024,19 +4057,11 @@ Future getRange(Reference trState, if (readThrough) { output.arena().dependsOn(shard.arena()); output.readThrough = reverse ? 
shard.begin : shard.end; - ASSERT(output.readThrough.get().startsWith(tenantPrefix)); } output.more = modifiedSelectors || limits.isReached() || rep.more; - getRangeFinished(trState, - startTime, - originalBegin, - originalEnd, - snapshot, - conflictRange, - reverse, - output, - useTenant); + getRangeFinished( + trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, output); return output; } @@ -4057,15 +4082,8 @@ Future getRange(Reference trState, originalLimits, reverse, useTenant)); - getRangeFinished(trState, - startTime, - originalBegin, - originalEnd, - snapshot, - conflictRange, - reverse, - result, - useTenant); + getRangeFinished( + trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, result); return result; } @@ -4089,7 +4107,8 @@ Future getRange(Reference trState, } if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || (e.code() == error_code_transaction_too_old && readVersion == latestVersion)) { - trState->cx->invalidateCache(reverse ? end.getKey() : begin.getKey(), + trState->cx->invalidateCache(beginServer.tenantEntry.prefix, + reverse ? end.getKey() : begin.getKey(), Reverse{ reverse ? 
(end - 1).isBackward() : begin.isBackward() }); if (e.code() == error_code_wrong_shard_server) { @@ -4103,15 +4122,8 @@ Future getRange(Reference trState, originalLimits, reverse, useTenant)); - getRangeFinished(trState, - startTime, - originalBegin, - originalEnd, - snapshot, - conflictRange, - reverse, - result, - useTenant); + getRangeFinished( + trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, result); return result; } @@ -4316,12 +4328,12 @@ ACTOR Future getRangeStreamFragment(Reference trState, Reverse reverse, SpanID spanContext) { loop { - state std::vector>> locations = wait(getKeyRangeLocations( + state std::vector locations = wait(getKeyRangeLocations( trState, keys, CLIENT_KNOBS->GET_RANGE_SHARD_LIMIT, reverse, &StorageServerInterface::getKeyValuesStream)); ASSERT(locations.size()); state int shard = 0; loop { - const KeyRange& range = locations[shard].first; + const KeyRange& range = locations[shard].range; state Optional> tssDuplicateStream; state GetKeyValuesStreamRequest req; @@ -4350,7 +4362,7 @@ ACTOR Future getRangeStreamFragment(Reference trState, ++trState->cx->transactionPhysicalReads; state GetKeyValuesStreamReply rep; - if (locations[shard].second->size() == 0) { + if (locations[shard].locations->size() == 0) { wait(trState->cx->connectionFileChanged()); results->sendError(transaction_too_old()); return Void(); @@ -4362,10 +4374,10 @@ ACTOR Future getRangeStreamFragment(Reference trState, // FIXME: create a load balance function for this code so future users of reply streams do not have // to duplicate this code int count = 0; - for (int i = 0; i < locations[shard].second->size(); i++) { + for (int i = 0; i < locations[shard].locations->size(); i++) { if (!IFailureMonitor::failureMonitor() .getState(locations[shard] - .second->get(i, &StorageServerInterface::getKeyValuesStream) + .locations->get(i, &StorageServerInterface::getKeyValuesStream) .getEndpoint()) .failed) { if (deterministicRandom()->random01() <= 
1.0 / ++count) { @@ -4378,10 +4390,12 @@ ACTOR Future getRangeStreamFragment(Reference trState, break; } - std::vector> ok(locations[shard].second->size()); + std::vector> ok(locations[shard].locations->size()); for (int i = 0; i < ok.size(); i++) { ok[i] = IFailureMonitor::failureMonitor().onStateEqual( - locations[shard].second->get(i, &StorageServerInterface::getKeyValuesStream).getEndpoint(), + locations[shard] + .locations->get(i, &StorageServerInterface::getKeyValuesStream) + .getEndpoint(), FailureStatus(false)); } @@ -4389,7 +4403,7 @@ ACTOR Future getRangeStreamFragment(Reference trState, if (now() - g_network->networkInfo.newestAlternativesFailure > 1 || deterministicRandom()->random01() < 0.01) { TraceEvent("AllAlternativesFailed") - .detail("Alternatives", locations[shard].second->description()); + .detail("Alternatives", locations[shard].locations->description()); } wait(allAlternativesFailedDelay(quorum(ok, 1))); @@ -4397,13 +4411,13 @@ ACTOR Future getRangeStreamFragment(Reference trState, state ReplyPromiseStream replyStream = locations[shard] - .second->get(useIdx, &StorageServerInterface::getKeyValuesStream) + .locations->get(useIdx, &StorageServerInterface::getKeyValuesStream) .getReplyStream(req); tssDuplicateStream = maybeDuplicateTSSStreamFragment( req, trState->cx->enableLocalityLoadBalance ? 
&trState->cx->queueModel : nullptr, - &locations[shard].second->get(useIdx, &StorageServerInterface::getKeyValuesStream)); + &locations[shard].locations->get(useIdx, &StorageServerInterface::getKeyValuesStream)); state bool breakAgain = false; loop { @@ -4440,15 +4454,7 @@ ACTOR Future getRangeStreamFragment(Reference trState, if (trState->debugID.present()) g_traceBatch.addEvent( "TransactionDebug", trState->debugID.get().first(), "NativeAPI.getExactRange.After"); - - ASSERT(trState->tenantPrefix.isReady()); - Key tenantPrefix = trState->tenantPrefix.get(); - - RangeResult output; - output.more = rep.more; - for (auto rr : rep.data) { - output.push_back_deep(output.arena(), KeyValueRef(rr.key.removePrefix(tenantPrefix), rr.value)); - } + RangeResult output(RangeResultRef(rep.data, rep.more), rep.arena); if (tssDuplicateStream.present() && !tssDuplicateStream.get().done()) { // shallow copy the reply with an arena depends, and send it to the duplicate stream for TSS @@ -4456,8 +4462,8 @@ ACTOR Future getRangeStreamFragment(Reference trState, replyCopy.version = rep.version; replyCopy.more = rep.more; replyCopy.cached = rep.cached; - replyCopy.arena.dependsOn(output.arena()); - replyCopy.data.append(replyCopy.arena, output.begin(), output.size()); + replyCopy.arena.dependsOn(rep.arena); + replyCopy.data.append(replyCopy.arena, rep.data.begin(), rep.data.size()); tssDuplicateStream.get().stream.send(replyCopy); } @@ -4471,7 +4477,7 @@ ACTOR Future getRangeStreamFragment(Reference trState, // If the reply says there is more but we know that we finished the shard, then fix rep.more if (reverse && output.more && rep.data.size() > 0 && - rep.data[rep.data.size() - 1].key == locations[shard].first.begin) { + rep.data[rep.data.size() - 1].key == locations[shard].range.begin) { output.more = false; } @@ -4490,19 +4496,19 @@ ACTOR Future getRangeStreamFragment(Reference trState, TEST(true); // GetKeyValuesStreamReply.more in getRangeStream // Make next request to the same 
shard with a beginning key just after the last key returned if (reverse) - locations[shard].first = - KeyRangeRef(locations[shard].first.begin, rep.data[rep.data.size() - 1].key); + locations[shard].range = + KeyRangeRef(locations[shard].range.begin, rep.data[rep.data.size() - 1].key); else - locations[shard].first = - KeyRangeRef(keyAfter(rep.data[rep.data.size() - 1].key), locations[shard].first.end); + locations[shard].range = + KeyRangeRef(keyAfter(rep.data[rep.data.size() - 1].key), locations[shard].range.end); } - if (locations[shard].first.empty()) { + if (locations[shard].range.empty()) { output.more = false; } if (!output.more) { - const KeyRange& range = locations[shard].first; + const KeyRange& range = locations[shard].range; if (shard == locations.size() - 1) { KeyRef begin = reverse ? keys.begin : range.end; KeyRef end = reverse ? range.begin : keys.end; @@ -4535,10 +4541,10 @@ ACTOR Future getRangeStreamFragment(Reference trState, } ASSERT(output.size()); - if (keys.begin.removePrefix(tenantPrefix) == allKeys.begin && !reverse) { + if (keys.begin == allKeys.begin && !reverse) { output.readToBegin = true; } - if (keys.end.removePrefix(tenantPrefix) == allKeys.end && reverse) { + if (keys.end == allKeys.end && reverse) { output.readThroughEnd = true; } results->send(std::move(output)); @@ -4556,14 +4562,15 @@ ACTOR Future getRangeStreamFragment(Reference trState, } if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || e.code() == error_code_connection_failed) { - const KeyRangeRef& range = locations[shard].first; + const KeyRangeRef& range = locations[shard].range; if (reverse) keys = KeyRangeRef(keys.begin, range.end); else keys = KeyRangeRef(range.begin, keys.end); - trState->cx->invalidateCache(keys); + trState->cx->invalidateCache(locations[0].tenantEntry.prefix, keys); + wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); break; } else { @@ -4577,8 +4584,7 @@ ACTOR Future 
getRangeStreamFragment(Reference trState, ACTOR Future>> getRangeSplitPoints(Reference trState, KeyRange keys, - int64_t chunkSize, - Optional> tenantPrefix); + int64_t chunkSize); static KeyRange intersect(KeyRangeRef lhs, KeyRangeRef rhs) { return KeyRange(KeyRangeRef(std::max(lhs.begin, rhs.begin), std::min(lhs.end, rhs.end))); @@ -4604,12 +4610,6 @@ ACTOR Future getRangeStream(Reference trState, state Version version = wait(fVersion); trState->cx->validateVersion(version); - Key resolvedTenantPrefix = wait(trState->tenantPrefix); - if (resolvedTenantPrefix.size() > 0) { - begin = KeySelectorRef(begin.getKey().withPrefix(resolvedTenantPrefix), begin.orEqual, begin.offset); - end = KeySelectorRef(end.getKey().withPrefix(resolvedTenantPrefix), end.orEqual, end.offset); - } - Future fb = resolveKey(trState, begin, version); state Future fe = resolveKey(trState, end, version); @@ -4633,11 +4633,11 @@ ACTOR Future getRangeStream(Reference trState, state std::vector> outstandingRequests; while (b < e) { - state std::pair> ssi = + state KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, reverse ? e : b, &StorageServerInterface::getKeyValuesStream, reverse)); - state KeyRange shardIntersection = intersect(ssi.first, KeyRangeRef(b, e)); - state Standalone> splitPoints = wait(getRangeSplitPoints( - trState, shardIntersection, CLIENT_KNOBS->RANGESTREAM_FRAGMENT_SIZE, Optional>())); + state KeyRange shardIntersection = intersect(locationInfo.range, KeyRangeRef(b, e)); + state Standalone> splitPoints = + wait(getRangeSplitPoints(trState, shardIntersection, CLIENT_KNOBS->RANGESTREAM_FRAGMENT_SIZE)); state std::vector toSend; // state std::vector::iterator>> outstandingRequests; @@ -4719,33 +4719,6 @@ void debugAddTags(Reference trState) { } } -ACTOR Future getTenantPrefixImpl(Reference trState, Future version) { - // TODO: Support local and/or stateless role caching - // Note: this does not set a conflict range for the tenant read. 
This is ok, we expect tenants to change - // infrequently and we will have our request rejected at commit time if it does. - Optional val = - wait(getValue(trState, trState->tenant.get().withPrefix(tenantMapPrefix), version, UseTenant::False)); - - if (!val.present()) { - TraceEvent(SevWarn, "ClientTenantNotFound", trState->cx->dbId) - .detail("Tenant", trState->tenant.get()) - .backtrace(); - throw tenant_not_found(); - } - - return decodeTenantEntry(val.get()).prefix; -} - -Future Transaction::getTenantPrefix() { - if (!trState->tenant.present()) { - trState->tenantPrefix = Key(); - } else if (!trState->tenantPrefix.isValid()) { - trState->tenantPrefix = getTenantPrefixImpl(trState, getReadVersion()); - } - - return trState->tenantPrefix; -} - Transaction::Transaction() : trState(makeReference(TaskPriority::DefaultEndpoint, generateSpanID(false))) {} @@ -4795,7 +4768,6 @@ void Transaction::setVersion(Version v) { if (v <= 0) throw version_invalid(); readVersion = v; - trState->tenantPrefix = getTenantPrefix(); } Future> Transaction::get(const Key& key, Snapshot snapshot) { @@ -4864,7 +4836,6 @@ void Watch::setWatch(Future watchFuture) { ACTOR Future watch(Reference watch, Database cx, Optional tenant, - Future tenantPrefix, TagSet tags, SpanID spanID, TaskPriority taskID, @@ -4889,7 +4860,6 @@ ACTOR Future watch(Reference watch, cx->clearWatchMetadata(); watch->watchFuture = watchValueMap(cx->minAcceptableReadVersion, tenant, - tenantPrefix, watch->key, watch->value, cx, @@ -4929,7 +4899,6 @@ Future Transaction::watch(Reference watch) { return ::watch(watch, trState->cx, trState->tenant, - getTenantPrefix(), trState->options.readTags, trState->spanID, trState->taskID, @@ -4942,17 +4911,15 @@ ACTOR Future>> getAddressesForKeyActor(Referen Key key) { state std::vector ssi; - Key resolvedTenantPrefix = wait(trState->tenantPrefix); - if (resolvedTenantPrefix.size() > 0) { - key = key.withPrefix(resolvedTenantPrefix, key.arena()); + state Key resolvedKey = key; + if 
(trState->tenant.present()) { + KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, ""_sr, &StorageServerInterface::getValue)); + resolvedKey = key.withPrefix(locationInfo.tenantEntry.prefix); } - // Check that we specified a tenant if required - trState->getTenantInfo(); - // If key >= allKeys.end, then getRange will return a kv-pair with an empty value. This will result in our // serverInterfaces vector being empty, which will cause us to return an empty addresses list. - state Key ksKey = keyServersKey(key); + state Key ksKey = keyServersKey(resolvedKey); state RangeResult serverTagResult = wait(getRange(trState, ver, lastLessOrEqual(serverTagKeys.begin), @@ -5595,7 +5562,6 @@ void Transaction::setupWatches() { for (int i = 0; i < watches.size(); ++i) watches[i]->setWatch(watchValueMap(watchVersion, trState->tenant, - getTenantPrefix(), watches[i]->key, watches[i]->value, trState->cx, @@ -5634,7 +5600,7 @@ ACTOR Future> estimateCommitCosts(Referen ++trCommitCosts.expensiveCostEstCount; ++trState->cx->transactionsExpensiveClearCostEstCount; } else { - std::vector>> locations = wait(getKeyRangeLocations( + std::vector locations = wait(getKeyRangeLocations( trState, keyRange, CLIENT_KNOBS->TOO_MANY, Reverse::False, &StorageServerInterface::getShardState)); if (locations.empty()) { continue; @@ -5729,9 +5695,11 @@ ACTOR static Future tryCommit(Reference trState, wait(store(req.transaction.read_snapshot, readVersion)); } - state Key resolvedTenantPrefix = wait(trState->tenantPrefix); - if (!resolvedTenantPrefix.empty()) { - applyTenantPrefix(req, resolvedTenantPrefix); + state Key tenantPrefix; + if (trState->tenant.present()) { + KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, ""_sr, &StorageServerInterface::getValue)); + applyTenantPrefix(req, locationInfo.tenantEntry.prefix); + tenantPrefix = locationInfo.tenantEntry.prefix; } startTime = now(); @@ -5822,9 +5790,9 @@ ACTOR static Future tryCommit(Reference trState, 
conflictingKRIndices.end()); for (auto const& rCRIndex : mergedIds) { const KeyRangeRef kr = req.transaction.read_conflict_ranges[rCRIndex]; - const KeyRange krWithPrefix = KeyRangeRef( - kr.begin.removePrefix(resolvedTenantPrefix).withPrefix(conflictingKeysRange.begin), - kr.end.removePrefix(resolvedTenantPrefix).withPrefix(conflictingKeysRange.begin)); + const KeyRange krWithPrefix = + KeyRangeRef(kr.begin.removePrefix(tenantPrefix).withPrefix(conflictingKeysRange.begin), + kr.end.removePrefix(tenantPrefix).withPrefix(conflictingKeysRange.begin)); trState->conflictingKeys->insert(krWithPrefix, conflictingKeysTrue); } } @@ -5868,7 +5836,8 @@ ACTOR static Future tryCommit(Reference trState, if (e.code() != error_code_transaction_too_old && e.code() != error_code_not_committed && e.code() != error_code_database_locked && e.code() != error_code_proxy_memory_limit_exceeded && e.code() != error_code_batch_transaction_throttled && e.code() != error_code_tag_throttled && - e.code() != error_code_process_behind && e.code() != error_code_future_version) { + e.code() != error_code_process_behind && e.code() != error_code_future_version && + e.code() != error_code_tenant_not_found) { TraceEvent(SevError, "TryCommitError").error(e); } if (trState->trLogInfo) @@ -6028,7 +5997,6 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optionaloptions.causalWriteRisky = true; - trState->tenantPrefix = Key(); break; case FDBTransactionOptions::CAUSAL_READ_RISKY: @@ -6562,8 +6530,6 @@ Future Transaction::getReadVersion(uint32_t flags) { batcher.stream.send(req); trState->startTime = now(); readVersion = extractReadVersion(trState, location, spanContext, req.reply.getFuture(), metadataVersion); - - trState->tenantPrefix = getTenantPrefix(); } return readVersion; } @@ -6736,7 +6702,7 @@ ACTOR Future doGetStorageMetrics(Database cx, KeyRange keys, Ref throw; } wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); - cx->invalidateCache(keys); 
+ cx->invalidateCache(Key(), keys); StorageMetrics m = wait(getStorageMetricsLargeKeyRange(cx, keys)); return m; } @@ -6745,23 +6711,23 @@ ACTOR Future doGetStorageMetrics(Database cx, KeyRange keys, Ref ACTOR Future getStorageMetricsLargeKeyRange(Database cx, KeyRange keys) { state Span span("NAPI:GetStorageMetricsLargeKeyRange"_loc); - std::vector>> locations = - wait(getKeyRangeLocations(cx, - keys, - std::numeric_limits::max(), - Reverse::False, - &StorageServerInterface::waitMetrics, - span.context, - Optional(), - UseProvisionalProxies::False)); + std::vector locations = wait(getKeyRangeLocations(cx, + Optional(), + keys, + std::numeric_limits::max(), + Reverse::False, + &StorageServerInterface::waitMetrics, + span.context, + Optional(), + UseProvisionalProxies::False)); state int nLocs = locations.size(); state std::vector> fx(nLocs); state StorageMetrics total; KeyRef partBegin, partEnd; for (int i = 0; i < nLocs; i++) { - partBegin = (i == 0) ? keys.begin : locations[i].first.begin; - partEnd = (i == nLocs - 1) ? keys.end : locations[i].first.end; - fx[i] = doGetStorageMetrics(cx, KeyRangeRef(partBegin, partEnd), locations[i].second); + partBegin = (i == 0) ? keys.begin : locations[i].range.begin; + partEnd = (i == nLocs - 1) ? 
keys.end : locations[i].range.end; + fx[i] = doGetStorageMetrics(cx, KeyRangeRef(partBegin, partEnd), locations[i].locations); } wait(waitForAll(fx)); for (int i = 0; i < nLocs; i++) { @@ -6788,11 +6754,10 @@ ACTOR Future trackBoundedStorageMetrics(KeyRange keys, } } -ACTOR Future waitStorageMetricsMultipleLocations( - std::vector>> locations, - StorageMetrics min, - StorageMetrics max, - StorageMetrics permittedError) { +ACTOR Future waitStorageMetricsMultipleLocations(std::vector locations, + StorageMetrics min, + StorageMetrics max, + StorageMetrics permittedError) { state int nLocs = locations.size(); state std::vector> fx(nLocs); state StorageMetrics total; @@ -6803,10 +6768,10 @@ ACTOR Future waitStorageMetricsMultipleLocations( state StorageMetrics minMinus = min - halfErrorPerMachine * (nLocs - 1); for (int i = 0; i < nLocs; i++) { - WaitMetricsRequest req(locations[i].first, StorageMetrics(), StorageMetrics()); + WaitMetricsRequest req(locations[i].range, StorageMetrics(), StorageMetrics()); req.min.bytes = 0; req.max.bytes = -1; - fx[i] = loadBalance(locations[i].second->locations(), + fx[i] = loadBalance(locations[i].locations->locations(), &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution); @@ -6824,7 +6789,7 @@ ACTOR Future waitStorageMetricsMultipleLocations( for (int i = 0; i < nLocs; i++) wx[i] = trackBoundedStorageMetrics( - locations[i].first, locations[i].second, fx[i].get(), halfErrorPerMachine, deltas); + locations[i].range, locations[i].locations, fx[i].get(), halfErrorPerMachine, deltas); loop { StorageMetrics delta = waitNext(deltas.getFuture()); @@ -6846,8 +6811,9 @@ ACTOR Future>> getReadHotRanges(Da loop { int64_t shardLimit = 100; // Shard limit here does not really matter since this function is currently only used // to find the read-hot sub ranges within a read-hot shard. 
- std::vector>> locations = + std::vector locations = wait(getKeyRangeLocations(cx, + Optional(), keys, shardLimit, Reverse::False, @@ -6870,10 +6836,10 @@ ACTOR Future>> getReadHotRanges(Da state std::vector> fReplies(nLocs); KeyRef partBegin, partEnd; for (int i = 0; i < nLocs; i++) { - partBegin = (i == 0) ? keys.begin : locations[i].first.begin; - partEnd = (i == nLocs - 1) ? keys.end : locations[i].first.end; + partBegin = (i == 0) ? keys.begin : locations[i].range.begin; + partEnd = (i == nLocs - 1) ? keys.end : locations[i].range.end; ReadHotSubRangeRequest req(KeyRangeRef(partBegin, partEnd)); - fReplies[i] = loadBalance(locations[i].second->locations(), + fReplies[i] = loadBalance(locations[i].locations->locations(), &StorageServerInterface::getReadHotRanges, req, TaskPriority::DataDistribution); @@ -6901,7 +6867,7 @@ ACTOR Future>> getReadHotRanges(Da TraceEvent(SevError, "GetReadHotSubRangesError").error(e); throw; } - cx->invalidateCache(keys); + cx->invalidateCache(Key(), keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } } @@ -6916,15 +6882,15 @@ ACTOR Future, int>> waitStorageMetrics(Databa int expectedShardCount) { state Span span("NAPI:WaitStorageMetrics"_loc, generateSpanID(cx->transactionTracingSample)); loop { - std::vector>> locations = - wait(getKeyRangeLocations(cx, - keys, - shardLimit, - Reverse::False, - &StorageServerInterface::waitMetrics, - span.context, - Optional(), - UseProvisionalProxies::False)); + std::vector locations = wait(getKeyRangeLocations(cx, + Optional(), + keys, + shardLimit, + Reverse::False, + &StorageServerInterface::waitMetrics, + span.context, + Optional(), + UseProvisionalProxies::False)); if (expectedShardCount >= 0 && locations.size() != expectedShardCount) { return std::make_pair(Optional(), locations.size()); } @@ -6938,7 +6904,7 @@ ACTOR Future, int>> waitStorageMetrics(Databa fx = waitStorageMetricsMultipleLocations(locations, min, max, permittedError); } else { 
WaitMetricsRequest req(keys, min, max); - fx = loadBalance(locations[0].second->locations(), + fx = loadBalance(locations[0].locations->locations(), &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution); @@ -6950,7 +6916,7 @@ ACTOR Future, int>> waitStorageMetrics(Databa TraceEvent(SevError, "WaitStorageMetricsError").error(e); throw; } - cx->invalidateCache(keys); + cx->invalidateCache(Key(), keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } } else { @@ -6960,7 +6926,7 @@ ACTOR Future, int>> waitStorageMetrics(Databa .detail("JitteredSecondsOfPenitence", CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY); wait(delayJittered(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution)); // make sure that the next getKeyRangeLocations() call will actually re-fetch the range - cx->invalidateCache(keys); + cx->invalidateCache(Key(), keys); } } } @@ -7022,31 +6988,21 @@ Future>> DatabaseContext::getReadH ACTOR Future>> getRangeSplitPoints(Reference trState, KeyRange keys, - int64_t chunkSize, - Optional> tenantPrefix) { + int64_t chunkSize) { state Span span("NAPI:GetRangeSplitPoints"_loc, trState->spanID); - state Key resolvedTenantPrefix; - if (tenantPrefix.present()) { - Key _resolvedTenantPrefix = wait(trState->tenantPrefix); - if (_resolvedTenantPrefix.size() > 0) { - resolvedTenantPrefix = _resolvedTenantPrefix; - keys = keys.withPrefix(resolvedTenantPrefix); - } - } - loop { - state std::vector>> locations = wait(getKeyRangeLocations( + state std::vector locations = wait(getKeyRangeLocations( trState, keys, CLIENT_KNOBS->TOO_MANY, Reverse::False, &StorageServerInterface::getRangeSplitPoints)); try { state int nLocs = locations.size(); state std::vector> fReplies(nLocs); KeyRef partBegin, partEnd; for (int i = 0; i < nLocs; i++) { - partBegin = (i == 0) ? keys.begin : locations[i].first.begin; - partEnd = (i == nLocs - 1) ? 
keys.end : locations[i].first.end; + partBegin = (i == 0) ? keys.begin : locations[i].range.begin; + partEnd = (i == nLocs - 1) ? keys.end : locations[i].range.end; SplitRangeRequest req(trState->getTenantInfo(), KeyRangeRef(partBegin, partEnd), chunkSize); - fReplies[i] = loadBalance(locations[i].second->locations(), + fReplies[i] = loadBalance(locations[i].locations->locations(), &StorageServerInterface::getRangeSplitPoints, req, TaskPriority::DataDistribution); @@ -7055,28 +7011,20 @@ ACTOR Future>> getRangeSplitPoints(Reference> results; - results.push_back_deep(results.arena(), keys.begin.removePrefix(resolvedTenantPrefix)); + results.push_back_deep(results.arena(), keys.begin); for (int i = 0; i < nLocs; i++) { if (i > 0) { - results.push_back_deep( - results.arena(), - locations[i].first.begin.removePrefix(resolvedTenantPrefix)); // Need this shard boundary + results.push_back_deep(results.arena(), + locations[i].range.begin); // Need this shard boundary } if (fReplies[i].get().splitPoints.size() > 0) { - if (resolvedTenantPrefix.size() == 0) { - results.append(results.arena(), - fReplies[i].get().splitPoints.begin(), - fReplies[i].get().splitPoints.size()); - } else { - for (auto sp : fReplies[i].get().splitPoints) { - results.push_back(results.arena(), sp.removePrefix(resolvedTenantPrefix)); - } - } + results.append( + results.arena(), fReplies[i].get().splitPoints.begin(), fReplies[i].get().splitPoints.size()); results.arena().dependsOn(fReplies[i].get().splitPoints.arena()); } } if (results.back() != keys.end) { - results.push_back_deep(results.arena(), keys.end.removePrefix(resolvedTenantPrefix)); + results.push_back_deep(results.arena(), keys.end); } return results; @@ -7085,14 +7033,14 @@ ACTOR Future>> getRangeSplitPoints(Referencecx->invalidateCache(keys); + trState->cx->invalidateCache(locations[0].tenantEntry.prefix, keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } } } Future>> 
Transaction::getRangeSplitPoints(KeyRange const& keys, int64_t chunkSize) { - return ::getRangeSplitPoints(trState, keys, chunkSize, getTenantPrefix()); + return ::getRangeSplitPoints(trState, keys, chunkSize); } #define BG_REQUEST_DEBUG false @@ -7350,8 +7298,9 @@ ACTOR Future>> splitStorageMetrics(Database cx, StorageMetrics estimated) { state Span span("NAPI:SplitStorageMetrics"_loc); loop { - state std::vector>> locations = + state std::vector locations = wait(getKeyRangeLocations(cx, + Optional(), keys, CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT, Reverse::False, @@ -7366,7 +7315,7 @@ ACTOR Future>> splitStorageMetrics(Database cx, // solution to this. if (locations.size() == CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) { wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution)); - cx->invalidateCache(keys); + cx->invalidateCache(Key(), keys); } else { results.push_back_deep(results.arena(), keys.begin); try { @@ -7374,8 +7323,8 @@ ACTOR Future>> splitStorageMetrics(Database cx, state int i = 0; for (; i < locations.size(); i++) { - SplitMetricsRequest req(locations[i].first, limit, used, estimated, i == locations.size() - 1); - SplitMetricsReply res = wait(loadBalance(locations[i].second->locations(), + SplitMetricsRequest req(locations[i].range, limit, used, estimated, i == locations.size() - 1); + SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(), &StorageServerInterface::splitMetrics, req, TaskPriority::DataDistribution)); @@ -7406,7 +7355,7 @@ ACTOR Future>> splitStorageMetrics(Database cx, TraceEvent(SevError, "SplitStorageMetricsError").error(e); throw; } - cx->invalidateCache(keys); + cx->invalidateCache(Key(), keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } } @@ -7934,8 +7883,9 @@ ACTOR Future getChangeFeedStreamActor(Reference db, try { KeyRange fullRange = wait(getChangeFeedRange(db, cx, rangeID, begin)); keys = fullRange & range; - state 
std::vector>> locations = + state std::vector locations = wait(getKeyRangeLocations(cx, + Optional(), keys, CLIENT_KNOBS->CHANGE_FEED_LOCATION_LIMIT, Reverse::False, @@ -7956,10 +7906,11 @@ ACTOR Future getChangeFeedStreamActor(Reference db, // to duplicate this code int count = 0; int useIdx = -1; - for (int i = 0; i < locations[loc].second->size(); i++) { + for (int i = 0; i < locations[loc].locations->size(); i++) { if (!IFailureMonitor::failureMonitor() - .getState( - locations[loc].second->get(i, &StorageServerInterface::changeFeedStream).getEndpoint()) + .getState(locations[loc] + .locations->get(i, &StorageServerInterface::changeFeedStream) + .getEndpoint()) .failed) { if (deterministicRandom()->random01() <= 1.0 / ++count) { useIdx = i; @@ -7973,17 +7924,17 @@ ACTOR Future getChangeFeedStreamActor(Reference db, continue; } - std::vector> ok(locations[loc].second->size()); + std::vector> ok(locations[loc].locations->size()); for (int i = 0; i < ok.size(); i++) { ok[i] = IFailureMonitor::failureMonitor().onStateEqual( - locations[loc].second->get(i, &StorageServerInterface::changeFeedStream).getEndpoint(), + locations[loc].locations->get(i, &StorageServerInterface::changeFeedStream).getEndpoint(), FailureStatus(false)); } // Making this SevWarn means a lot of clutter if (now() - g_network->networkInfo.newestAlternativesFailure > 1 || deterministicRandom()->random01() < 0.01) { - TraceEvent("AllAlternativesFailed").detail("Alternatives", locations[0].second->description()); + TraceEvent("AllAlternativesFailed").detail("Alternatives", locations[0].locations->description()); } wait(allAlternativesFailedDelay(quorum(ok, 1))); @@ -7993,8 +7944,8 @@ ACTOR Future getChangeFeedStreamActor(Reference db, if (locations.size() > 1) { std::vector> interfs; for (int i = 0; i < locations.size(); i++) { - interfs.emplace_back(locations[i].second->getInterface(chosenLocations[i]), - locations[i].first & range); + 
interfs.emplace_back(locations[i].locations->getInterface(chosenLocations[i]), + locations[i].range & range); } wait(mergeChangeFeedStream(db, interfs, results, rangeID, &begin, end) || cx->connectionFileChanged()); } else { @@ -8003,7 +7954,7 @@ ACTOR Future getChangeFeedStreamActor(Reference db, req.begin = begin; req.end = end; req.range = range; - StorageServerInterface interf = locations[0].second->getInterface(chosenLocations[0]); + StorageServerInterface interf = locations[0].locations->getInterface(chosenLocations[0]); state ReplyPromiseStream replyStream = interf.changeFeedStream.getReplyStream(req); for (auto& it : results->storageData) { @@ -8059,7 +8010,7 @@ ACTOR Future getChangeFeedStreamActor(Reference db, e.code() == error_code_connection_failed || e.code() == error_code_unknown_change_feed || e.code() == error_code_broken_promise) { db->changeFeedCache.erase(rangeID); - cx->invalidateCache(keys); + cx->invalidateCache(Key(), keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY)); } else { results->mutations.sendError(e); @@ -8116,8 +8067,9 @@ ACTOR Future> getOverlappingChangeFeedsA loop { try { - state std::vector>> locations = + state std::vector locations = wait(getKeyRangeLocations(cx, + Optional(), range, CLIENT_KNOBS->CHANGE_FEED_LOCATION_LIMIT, Reverse::False, @@ -8137,7 +8089,7 @@ ACTOR Future> getOverlappingChangeFeedsA state std::vector>> allOverlappingRequests; for (auto& it : locations) { allOverlappingRequests.push_back( - singleLocationOverlappingChangeFeeds(cx, it.second, it.first & range, minVersion)); + singleLocationOverlappingChangeFeeds(cx, it.locations, it.range & range, minVersion)); } wait(waitForAll(allOverlappingRequests)); @@ -8150,7 +8102,7 @@ ACTOR Future> getOverlappingChangeFeedsA return result; } catch (Error& e) { if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { - cx->invalidateCache(range); + cx->invalidateCache(Key(), range); 
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY)); } else { throw e; @@ -8197,8 +8149,9 @@ ACTOR Future popChangeFeedMutationsActor(Reference db, Ke state KeyRange keys = wait(getChangeFeedRange(db, cx, rangeID)); - state std::vector>> locations = + state std::vector locations = wait(getKeyRangeLocations(cx, + Optional(), keys, 3, Reverse::False, @@ -8214,9 +8167,9 @@ ACTOR Future popChangeFeedMutationsActor(Reference db, Ke bool foundFailed = false; for (int i = 0; i < locations.size() && !foundFailed; i++) { - for (int j = 0; j < locations[i].second->size() && !foundFailed; j++) { + for (int j = 0; j < locations[i].locations->size() && !foundFailed; j++) { if (IFailureMonitor::failureMonitor() - .getState(locations[i].second->get(j, &StorageServerInterface::changeFeedPop).getEndpoint()) + .getState(locations[i].locations->get(j, &StorageServerInterface::changeFeedPop).getEndpoint()) .isFailed()) { foundFailed = true; } @@ -8232,9 +8185,9 @@ ACTOR Future popChangeFeedMutationsActor(Reference db, Ke // FIXME: lookup both the src and dest shards as of the pop version to ensure all locations are popped std::vector> popRequests; for (int i = 0; i < locations.size(); i++) { - for (int j = 0; j < locations[i].second->size(); j++) { - popRequests.push_back(locations[i].second->getInterface(j).changeFeedPop.getReply( - ChangeFeedPopRequest(rangeID, version, locations[i].first))); + for (int j = 0; j < locations[i].locations->size(); j++) { + popRequests.push_back(locations[i].locations->getInterface(j).changeFeedPop.getReply( + ChangeFeedPopRequest(rangeID, version, locations[i].range))); } } choose { @@ -8249,7 +8202,7 @@ ACTOR Future popChangeFeedMutationsActor(Reference db, Ke throw; } db->changeFeedCache.erase(rangeID); - cx->invalidateCache(keys); + cx->invalidateCache(Key(), keys); wait(popChangeFeedBackup(cx, rangeID, version)); } return Void(); diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index c14f180d4f..e3fa9bdd8e 100644 --- 
a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -252,15 +252,13 @@ struct TransactionState : ReferenceCounted { Version committedVersion{ invalidVersion }; - Future tenantPrefix; - // Used to save conflicting keys if FDBTransactionOptions::REPORT_CONFLICTING_KEYS is enabled // prefix/ : '1' - any keys equal or larger than this key are (probably) conflicting keys // prefix/ : '0' - any keys equal or larger than this key are (definitely) not conflicting keys std::shared_ptr> conflictingKeys; // Only available so that Transaction can have a default constructor, for use in state variables - TransactionState(TaskPriority taskID, SpanID spanID) : taskID(taskID), spanID(spanID), tenantPrefix(Key()) {} + TransactionState(TaskPriority taskID, SpanID spanID) : taskID(taskID), spanID(spanID) {} TransactionState(Database cx, Optional tenant, @@ -448,7 +446,6 @@ public: return Standalone>(tr.transaction.write_conflict_ranges, tr.arena); } - Future getTenantPrefix(); Optional getTenant() { return trState->tenant; } Reference trState; diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 00a29a1f16..0c8718513d 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -19,6 +19,7 @@ */ #include +#include "fdbclient/FDBOptions.g.h" #include "fdbclient/SystemData.h" #include "flow/ActorCollection.h" #include "fdbrpc/simulator.h" @@ -233,6 +234,7 @@ ACTOR Future> getBlobWorkers(Database cx, bool if (use_system_priority) { tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); } + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::LOCK_AWARE); try { RangeResult blobWorkersList = wait(tr.getRange(blobWorkerListKeys, CLIENT_KNOBS->TOO_MANY)); @@ -256,6 +258,7 @@ ACTOR Future> getStorageServers(Database cx, if (use_system_priority) { tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); } + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); 
tr.setOption(FDBTransactionOptions::LOCK_AWARE); try { RangeResult serverList = wait(tr.getRange(serverListKeys, CLIENT_KNOBS->TOO_MANY)); diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 4880b37529..d7a8dfb0a9 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -4450,7 +4450,7 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { // FIXME: The client cache does not notice when servers are added to a team. To read from a local storage server // we must refresh the cache manually. - data->cx->invalidateCache(keys); + data->cx->invalidateCache(Key(), keys); loop { state Transaction tr(data->cx); @@ -4470,7 +4470,7 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { : tryGetRange(results, &tr, keys); state Key nfk = keys.begin; - try + try { loop { TEST(true); // Fetching keys for transferred shard while (data->fetchKeysBudgetUsed.get()) { diff --git a/fdbserver/workloads/RandomMoveKeys.actor.cpp b/fdbserver/workloads/RandomMoveKeys.actor.cpp index 02a03e17a8..6d17e4b0d9 100644 --- a/fdbserver/workloads/RandomMoveKeys.actor.cpp +++ b/fdbserver/workloads/RandomMoveKeys.actor.cpp @@ -18,6 +18,7 @@ * limitations under the License. */ +#include "fdbclient/FDBOptions.g.h" #include "fdbrpc/simulator.h" #include "fdbclient/StorageServerInterface.h" #include "fdbclient/ManagementAPI.actor.h" @@ -50,6 +51,7 @@ struct MoveKeysWorkload : TestWorkload { // Get the database configuration so as to use proper team size state Transaction tr(cx); loop { + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); try { RangeResult res = wait(tr.getRange(configKeys, 1000)); ASSERT(res.size() < 1000); From fb73e1857a49ae12c641e5e4bd1e41b91c1e2dea Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 1 Mar 2022 08:24:48 -0800 Subject: [PATCH 084/138] Some get key requests should not happen within a tenant. Don't set read_system_keys in watch logic unless necessary. 
Set access_system_keys in a couple movekeys functions. --- fdbclient/NativeAPI.actor.cpp | 31 ++++++++++++++++++++----------- fdbserver/MoveKeys.actor.cpp | 3 +++ 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index c26ddd58c6..c9128145bb 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -3113,7 +3113,10 @@ ACTOR Future> getValue(Reference trState, } } -ACTOR Future getKey(Reference trState, KeySelector k, Future version) { +ACTOR Future getKey(Reference trState, + KeySelector k, + Future version, + UseTenant useTenant = UseTenant::True) { wait(success(version)); state Optional getKeyID = Optional(); @@ -3140,8 +3143,8 @@ ACTOR Future getKey(Reference trState, KeySelector k, Fut } Key locationKey(k.getKey(), k.arena()); - state KeyRangeLocationInfo locationInfo = - wait(getKeyLocation(trState, locationKey, &StorageServerInterface::getKey, Reverse{ k.isBackward() })); + state KeyRangeLocationInfo locationInfo = wait(getKeyLocation( + trState, locationKey, &StorageServerInterface::getKey, Reverse{ k.isBackward() }, useTenant)); try { if (getKeyID.present()) @@ -3153,7 +3156,7 @@ ACTOR Future getKey(Reference trState, KeySelector k, Fut ++trState->cx->transactionPhysicalReads; GetKeyRequest req(span.context, - trState->getTenantInfo(), + useTenant ? trState->getTenantInfo() : TenantInfo(), k, version.get(), trState->cx->sampleReadTags() ? 
trState->options.readTags : Optional(), @@ -3388,7 +3391,10 @@ ACTOR Future sameVersionDiffValue(Database cx, Reference state ReadYourWritesTransaction tr(cx); loop { try { - tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + if (!parameters->tenant.present() && parameters->key.startsWith(systemKeys.begin)) { + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + } + state Optional valSS = wait(tr.get(parameters->key)); Reference metadata = cx->getWatchMetadata(parameters->tenant, parameters->key); @@ -3703,14 +3709,17 @@ Future getExactRange(Reference trState, } } -Future resolveKey(Reference trState, KeySelector const& key, Version const& version) { +Future resolveKey(Reference trState, + KeySelector const& key, + Version const& version, + UseTenant useTenant) { if (key.isFirstGreaterOrEqual()) return Future(key.getKey()); if (key.isFirstGreaterThan()) return Future(keyAfter(key.getKey())); - return getKey(trState, key, version); + return getKey(trState, key, version, useTenant); } ACTOR template @@ -3731,8 +3740,8 @@ Future getRangeFallback(Reference trState, version = ver; } - Future fb = resolveKey(trState, begin, version); - state Future fe = resolveKey(trState, end, version); + Future fb = resolveKey(trState, begin, version, useTenant); + state Future fe = resolveKey(trState, end, version, useTenant); state Key b = wait(fb); state Key e = wait(fe); @@ -4610,8 +4619,8 @@ ACTOR Future getRangeStream(Reference trState, state Version version = wait(fVersion); trState->cx->validateVersion(version); - Future fb = resolveKey(trState, begin, version); - state Future fe = resolveKey(trState, end, version); + Future fb = resolveKey(trState, begin, version, UseTenant::True); + state Future fe = resolveKey(trState, end, version, UseTenant::True); state Key b = wait(fb); state Key e = wait(fe); diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index 0e08328ff4..0ea2c61613 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ 
b/fdbserver/MoveKeys.actor.cpp @@ -20,6 +20,7 @@ #include +#include "fdbclient/FDBOptions.g.h" #include "flow/Util.h" #include "fdbrpc/FailureMonitor.h" #include "fdbclient/KeyBackedTypes.h" @@ -605,6 +606,7 @@ ACTOR Future checkFetchingState(Database cx, tr.trState->taskID = TaskPriority::MoveKeys; tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); std::vector>> serverListEntries; serverListEntries.reserve(dest.size()); @@ -698,6 +700,7 @@ ACTOR static Future finishMoveKeys(Database occ, tr.trState->taskID = TaskPriority::MoveKeys; tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); releaser.release(); wait(finishMoveKeysParallelismLock->take(TaskPriority::DataDistributionLaunch)); From 1d44ef1c8eb76494fd8050187800ddbc03d8ed38 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 2 Mar 2022 10:11:38 -0800 Subject: [PATCH 085/138] Specify system key access for a few move keys transactions --- fdbserver/MoveKeys.actor.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index 0ea2c61613..707fdf952e 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -66,6 +66,7 @@ ACTOR Future takeMoveKeysLock(Database cx, UID ddId) { state MoveKeysLock lock; state UID txnId; tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); if (!g_network->isSimulated()) { txnId = deterministicRandom()->randomUniqueID(); tr.debugTransaction(txnId); @@ -100,6 +101,7 @@ ACTOR static Future checkMoveKeysLock(Transaction* tr, MoveKeysLock lock, const DDEnabledState* ddEnabledState, bool isWrite = true) { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); if (!ddEnabledState->isDDEnabled()) { TraceEvent(SevDebug, "DDDisabledByInMemoryCheck").log(); throw movekeys_conflict(); @@ -1335,6 +1337,7 
@@ ACTOR Future removeKeysFromFailedServer(Database cx, try { tr.trState->taskID = TaskPriority::MoveKeys; tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); wait(checkMoveKeysLock(&tr, lock, ddEnabledState)); TraceEvent("RemoveKeysFromFailedServerLocked") .detail("ServerID", serverID) From ecccfd0868cea448e3c6e457dbf9e7155cacc3cb Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 3 Mar 2022 15:00:05 -0800 Subject: [PATCH 086/138] Add cache invalidation to tenant cache. Send tenant ID along with tenant name in requests to validate that the tenant hasn't changed. Fix a few bugs. --- .../transaction_profiling_analyzer.py | 2 + fdbclient/ClientKnobs.cpp | 2 +- fdbclient/ClientLogEvents.h | 146 +++++--- fdbclient/CommitProxyInterface.h | 5 +- fdbclient/DatabaseContext.h | 12 +- fdbclient/GenericManagementAPI.actor.h | 199 ++++++---- fdbclient/NativeAPI.actor.cpp | 349 +++++++++++------- fdbclient/NativeAPI.actor.h | 1 + flow/ProtocolVersion.h | 1 + 9 files changed, 451 insertions(+), 266 deletions(-) diff --git a/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py b/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py index decfaa05ad..2ffc94065a 100644 --- a/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py +++ b/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py @@ -192,6 +192,8 @@ class BaseInfo(object): self.start_timestamp = bb.get_double() if protocol_version >= PROTOCOL_VERSION_6_3: self.dc_id = bb.get_bytes_with_length() + if protocol_version >= PROTOCOL_VERSION_7_1: + self.tenant = bb.get_bytes_with_length() class GetVersionInfo(BaseInfo): def __init__(self, bb, protocol_version): diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index d57441bbda..8b66d7dad8 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -90,7 +90,7 @@ void ClientKnobs::initialize(Randomize 
randomize) { init( LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD, 60 ); init( LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL, 60 ); init( TENANT_CACHE_EVICTION_SIZE, 100000 ); - init( TENANT_CACHE_EVICTION_SIZE, 10 ); if( randomize && BUGGIFY ) TENANT_CACHE_EVICTION_SIZE_SIM = 3; + init( TENANT_CACHE_EVICTION_SIZE_SIM, 10 ); if( randomize && BUGGIFY ) TENANT_CACHE_EVICTION_SIZE_SIM = 3; init( GET_RANGE_SHARD_LIMIT, 2 ); init( WARM_RANGE_SHARD_LIMIT, 100 ); diff --git a/fdbclient/ClientLogEvents.h b/fdbclient/ClientLogEvents.h index 29c74ed42b..53b338f569 100644 --- a/fdbclient/ClientLogEvents.h +++ b/fdbclient/ClientLogEvents.h @@ -41,7 +41,8 @@ enum class TransactionPriorityType : int { PRIORITY_DEFAULT = 0, PRIORITY_BATCH static_assert(sizeof(TransactionPriorityType) == 4, "transaction_profiling_analyzer.py assumes this field has size 4"); struct Event { - Event(EventType t, double ts, const Optional>& dc) : type(t), startTs(ts) { + Event(EventType t, double ts, const Optional>& dc, const Optional& tenant) + : type(t), startTs(ts), tenant(tenant) { if (dc.present()) dcId = dc.get(); } @@ -49,7 +50,9 @@ struct Event { template Ar& serialize(Ar& ar) { - if (ar.protocolVersion().version() >= (uint64_t)0x0FDB00B063010001LL) { + if (ar.protocolVersion().hasTenants()) { + return serializer(ar, type, startTs, dcId, tenant); + } else if (ar.protocolVersion().version() >= (uint64_t)0x0FDB00B063010001LL) { return serializer(ar, type, startTs, dcId); } else { return serializer(ar, type, startTs); @@ -59,8 +62,10 @@ struct Event { EventType type{ EventType::UNSET }; double startTs{ 0 }; Key dcId{}; + Optional tenant{}; void logEvent(std::string id, int maxFieldLength) const {} + void augmentTraceEvent(TraceEvent& event) const { event.detail("Tenant", tenant); } }; struct EventGetVersion : public Event { @@ -77,7 +82,9 @@ struct EventGetVersion : public Event { double latency; void logEvent(std::string id, int maxFieldLength) const { - 
TraceEvent("TransactionTrace_GetVersion").detail("TransactionID", id).detail("Latency", latency); + TraceEvent event("TransactionTrace_GetVersion"); + event.detail("TransactionID", id).detail("Latency", latency); + augmentTraceEvent(event); } }; @@ -97,10 +104,9 @@ struct EventGetVersion_V2 : public Event { TransactionPriorityType priorityType{ TransactionPriorityType::UNSET }; void logEvent(std::string id, int maxFieldLength) const { - TraceEvent("TransactionTrace_GetVersion") - .detail("TransactionID", id) - .detail("Latency", latency) - .detail("PriorityType", priorityType); + TraceEvent event("TransactionTrace_GetVersion"); + event.detail("TransactionID", id).detail("Latency", latency).detail("PriorityType", priorityType); + augmentTraceEvent(event); } }; @@ -110,8 +116,9 @@ struct EventGetVersion_V3 : public Event { const Optional>& dcId, double lat, TransactionPriority priority, - Version version) - : Event(EventType::GET_VERSION_LATENCY, ts, dcId), latency(lat), readVersion(version) { + Version version, + const Optional& tenant) + : Event(EventType::GET_VERSION_LATENCY, ts, dcId, tenant), latency(lat), readVersion(version) { switch (priority) { // Unfortunately, the enum serialized here disagrees with the enum used elsewhere for the values used by each // priority @@ -143,17 +150,23 @@ struct EventGetVersion_V3 : public Event { Version readVersion; void logEvent(std::string id, int maxFieldLength) const { - TraceEvent("TransactionTrace_GetVersion") - .detail("TransactionID", id) + TraceEvent event("TransactionTrace_GetVersion"); + event.detail("TransactionID", id) .detail("Latency", latency) .detail("PriorityType", priorityType) .detail("ReadVersion", readVersion); + augmentTraceEvent(event); } }; struct EventGet : public Event { - EventGet(double ts, const Optional>& dcId, double lat, int size, const KeyRef& in_key) - : Event(EventType::GET_LATENCY, ts, dcId), latency(lat), valueSize(size), key(in_key) {} + EventGet(double ts, + const Optional>& dcId, + 
double lat, + int size, + const KeyRef& in_key, + const Optional& tenant) + : Event(EventType::GET_LATENCY, ts, dcId, tenant), latency(lat), valueSize(size), key(in_key) {} EventGet() {} template @@ -169,13 +182,14 @@ struct EventGet : public Event { Key key; void logEvent(std::string id, int maxFieldLength) const { - TraceEvent("TransactionTrace_Get") - .setMaxEventLength(-1) + TraceEvent event("TransactionTrace_Get"); + event.setMaxEventLength(-1) .detail("TransactionID", id) .detail("Latency", latency) .detail("ValueSizeBytes", valueSize) .setMaxFieldLength(maxFieldLength) .detail("Key", key); + augmentTraceEvent(event); } }; @@ -185,8 +199,9 @@ struct EventGetRange : public Event { double lat, int size, const KeyRef& start_key, - const KeyRef& end_key) - : Event(EventType::GET_RANGE_LATENCY, ts, dcId), latency(lat), rangeSize(size), startKey(start_key), + const KeyRef& end_key, + const Optional& tenant) + : Event(EventType::GET_RANGE_LATENCY, ts, dcId, tenant), latency(lat), rangeSize(size), startKey(start_key), endKey(end_key) {} EventGetRange() {} @@ -204,14 +219,15 @@ struct EventGetRange : public Event { Key endKey; void logEvent(std::string id, int maxFieldLength) const { - TraceEvent("TransactionTrace_GetRange") - .setMaxEventLength(-1) + TraceEvent event("TransactionTrace_GetRange"); + event.setMaxEventLength(-1) .detail("TransactionID", id) .detail("Latency", latency) .detail("RangeSizeBytes", rangeSize) .setMaxFieldLength(maxFieldLength) .detail("StartKey", startKey) .detail("EndKey", endKey); + augmentTraceEvent(event); } }; @@ -234,36 +250,40 @@ struct EventCommit : public Event { void logEvent(std::string id, int maxFieldLength) const { for (auto& read_range : req.transaction.read_conflict_ranges) { - TraceEvent("TransactionTrace_Commit_ReadConflictRange") - .setMaxEventLength(-1) + TraceEvent ev1("TransactionTrace_Commit_ReadConflictRange"); + ev1.setMaxEventLength(-1) .detail("TransactionID", id) .setMaxFieldLength(maxFieldLength) .detail("Begin", 
read_range.begin) .detail("End", read_range.end); + augmentTraceEvent(ev1); } for (auto& write_range : req.transaction.write_conflict_ranges) { - TraceEvent("TransactionTrace_Commit_WriteConflictRange") - .setMaxEventLength(-1) + TraceEvent ev2("TransactionTrace_Commit_WriteConflictRange"); + ev2.setMaxEventLength(-1) .detail("TransactionID", id) .setMaxFieldLength(maxFieldLength) .detail("Begin", write_range.begin) .detail("End", write_range.end); + augmentTraceEvent(ev2); } for (auto& mutation : req.transaction.mutations) { - TraceEvent("TransactionTrace_Commit_Mutation") - .setMaxEventLength(-1) + TraceEvent ev3("TransactionTrace_Commit_Mutation"); + ev3.setMaxEventLength(-1) .detail("TransactionID", id) .setMaxFieldLength(maxFieldLength) .detail("Mutation", mutation); + augmentTraceEvent(ev3); } - TraceEvent("TransactionTrace_Commit") - .detail("TransactionID", id) + TraceEvent ev4("TransactionTrace_Commit"); + ev4.detail("TransactionID", id) .detail("Latency", latency) .detail("NumMutations", numMutations) .detail("CommitSizeBytes", commitBytes); + augmentTraceEvent(ev4); } }; @@ -275,8 +295,9 @@ struct EventCommit_V2 : public Event { int mut, int bytes, Version version, - const CommitTransactionRequest& commit_req) - : Event(EventType::COMMIT_LATENCY, ts, dcId), latency(lat), numMutations(mut), commitBytes(bytes), + const CommitTransactionRequest& commit_req, + const Optional& tenant) + : Event(EventType::COMMIT_LATENCY, ts, dcId, tenant), latency(lat), numMutations(mut), commitBytes(bytes), commitVersion(version), req(commit_req) {} EventCommit_V2() {} @@ -298,43 +319,51 @@ struct EventCommit_V2 : public Event { void logEvent(std::string id, int maxFieldLength) const { for (auto& read_range : req.transaction.read_conflict_ranges) { - TraceEvent("TransactionTrace_Commit_ReadConflictRange") - .setMaxEventLength(-1) + TraceEvent ev1("TransactionTrace_Commit_ReadConflictRange"); + ev1.setMaxEventLength(-1) .detail("TransactionID", id) 
.setMaxFieldLength(maxFieldLength) .detail("Begin", read_range.begin) .detail("End", read_range.end); + augmentTraceEvent(ev1); } for (auto& write_range : req.transaction.write_conflict_ranges) { - TraceEvent("TransactionTrace_Commit_WriteConflictRange") - .setMaxEventLength(-1) + TraceEvent ev2("TransactionTrace_Commit_WriteConflictRange"); + ev2.setMaxEventLength(-1) .detail("TransactionID", id) .setMaxFieldLength(maxFieldLength) .detail("Begin", write_range.begin) .detail("End", write_range.end); + augmentTraceEvent(ev2); } for (auto& mutation : req.transaction.mutations) { - TraceEvent("TransactionTrace_Commit_Mutation") - .setMaxEventLength(-1) + TraceEvent ev3("TransactionTrace_Commit_Mutation"); + ev3.setMaxEventLength(-1) .detail("TransactionID", id) .setMaxFieldLength(maxFieldLength) .detail("Mutation", mutation); + augmentTraceEvent(ev3); } - TraceEvent("TransactionTrace_Commit") - .detail("TransactionID", id) + TraceEvent ev4("TransactionTrace_Commit"); + ev4.detail("TransactionID", id) .detail("CommitVersion", commitVersion) .detail("Latency", latency) .detail("NumMutations", numMutations) .detail("CommitSizeBytes", commitBytes); + augmentTraceEvent(ev4); } }; struct EventGetError : public Event { - EventGetError(double ts, const Optional>& dcId, int err_code, const KeyRef& in_key) - : Event(EventType::ERROR_GET, ts, dcId), errCode(err_code), key(in_key) {} + EventGetError(double ts, + const Optional>& dcId, + int err_code, + const KeyRef& in_key, + const Optional& tenant) + : Event(EventType::ERROR_GET, ts, dcId, tenant), errCode(err_code), key(in_key) {} EventGetError() {} template @@ -349,12 +378,13 @@ struct EventGetError : public Event { Key key; void logEvent(std::string id, int maxFieldLength) const { - TraceEvent("TransactionTrace_GetError") - .setMaxEventLength(-1) + TraceEvent event("TransactionTrace_GetError"); + event.setMaxEventLength(-1) .detail("TransactionID", id) .detail("ErrCode", errCode) .setMaxFieldLength(maxFieldLength) 
.detail("Key", key); + augmentTraceEvent(event); } }; @@ -363,8 +393,9 @@ struct EventGetRangeError : public Event { const Optional>& dcId, int err_code, const KeyRef& start_key, - const KeyRef& end_key) - : Event(EventType::ERROR_GET_RANGE, ts, dcId), errCode(err_code), startKey(start_key), endKey(end_key) {} + const KeyRef& end_key, + const Optional& tenant) + : Event(EventType::ERROR_GET_RANGE, ts, dcId, tenant), errCode(err_code), startKey(start_key), endKey(end_key) {} EventGetRangeError() {} template @@ -380,13 +411,14 @@ struct EventGetRangeError : public Event { Key endKey; void logEvent(std::string id, int maxFieldLength) const { - TraceEvent("TransactionTrace_GetRangeError") - .setMaxEventLength(-1) + TraceEvent event("TransactionTrace_GetRangeError"); + event.setMaxEventLength(-1) .detail("TransactionID", id) .detail("ErrCode", errCode) .setMaxFieldLength(maxFieldLength) .detail("StartKey", startKey) .detail("EndKey", endKey); + augmentTraceEvent(event); } }; @@ -394,8 +426,9 @@ struct EventCommitError : public Event { EventCommitError(double ts, const Optional>& dcId, int err_code, - const CommitTransactionRequest& commit_req) - : Event(EventType::ERROR_COMMIT, ts, dcId), errCode(err_code), req(commit_req) {} + const CommitTransactionRequest& commit_req, + const Optional& tenant) + : Event(EventType::ERROR_COMMIT, ts, dcId, tenant), errCode(err_code), req(commit_req) {} EventCommitError() {} template @@ -412,32 +445,37 @@ struct EventCommitError : public Event { void logEvent(std::string id, int maxFieldLength) const { for (auto& read_range : req.transaction.read_conflict_ranges) { - TraceEvent("TransactionTrace_CommitError_ReadConflictRange") - .setMaxEventLength(-1) + TraceEvent ev1("TransactionTrace_CommitError_ReadConflictRange"); + ev1.setMaxEventLength(-1) .detail("TransactionID", id) .setMaxFieldLength(maxFieldLength) .detail("Begin", read_range.begin) .detail("End", read_range.end); + augmentTraceEvent(ev1); } for (auto& write_range : 
req.transaction.write_conflict_ranges) { - TraceEvent("TransactionTrace_CommitError_WriteConflictRange") - .setMaxEventLength(-1) + TraceEvent ev2("TransactionTrace_CommitError_WriteConflictRange"); + ev2.setMaxEventLength(-1) .detail("TransactionID", id) .setMaxFieldLength(maxFieldLength) .detail("Begin", write_range.begin) .detail("End", write_range.end); + augmentTraceEvent(ev2); } for (auto& mutation : req.transaction.mutations) { - TraceEvent("TransactionTrace_CommitError_Mutation") - .setMaxEventLength(-1) + TraceEvent ev3("TransactionTrace_CommitError_Mutation"); + ev3.setMaxEventLength(-1) .detail("TransactionID", id) .setMaxFieldLength(maxFieldLength) .detail("Mutation", mutation); + augmentTraceEvent(ev3); } - TraceEvent("TransactionTrace_CommitError").detail("TransactionID", id).detail("ErrCode", errCode); + TraceEvent ev4("TransactionTrace_CommitError"); + ev4.detail("TransactionID", id).detail("ErrCode", errCode); + augmentTraceEvent(ev4); } }; } // namespace FdbClientLogEvents diff --git a/fdbclient/CommitProxyInterface.h b/fdbclient/CommitProxyInterface.h index 4e006284b5..2cb913b7a3 100644 --- a/fdbclient/CommitProxyInterface.h +++ b/fdbclient/CommitProxyInterface.h @@ -171,9 +171,8 @@ struct CommitTransactionRequest : TimedRequest { TenantInfo tenantInfo; - CommitTransactionRequest() : CommitTransactionRequest(TenantInfo(), SpanID()) {} - CommitTransactionRequest(TenantInfo const& tenantInfo, SpanID const& context) - : spanContext(context), flags(0), tenantInfo(tenantInfo) {} + CommitTransactionRequest() : CommitTransactionRequest(SpanID()) {} + CommitTransactionRequest(SpanID const& context) : spanContext(context), flags(0) {} template void serialize(Ar& ar) { diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index ca08273518..71d0c2ffc1 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -133,7 +133,7 @@ public: }; struct WatchParameters : public ReferenceCounted { - const Optional tenant; + const 
TenantInfo tenant; const Key key; const Optional value; @@ -144,7 +144,7 @@ struct WatchParameters : public ReferenceCounted { const Optional debugID; const UseProvisionalProxies useProvisionalProxies; - WatchParameters(Optional tenant, + WatchParameters(TenantInfo tenant, Key key, Optional value, Version version, @@ -307,9 +307,9 @@ public: void removeWatch(); // watch map operations - Reference getWatchMetadata(Optional tenant, KeyRef key) const; + Reference getWatchMetadata(int64_t tenantId, KeyRef key) const; void setWatchMetadata(Reference metadata); - void deleteWatchMetadata(Optional tenant, KeyRef key); + void deleteWatchMetadata(int64_t tenant, KeyRef key); void clearWatchMetadata(); void setOption(FDBDatabaseOptions::Option option, Optional value); @@ -580,9 +580,7 @@ public: EventCacheHolder connectToDatabaseEventCacheHolder; private: - std::unordered_map, Key>, - Reference, - boost::hash, Key>>> + std::unordered_map, Reference, boost::hash>> watchMap; }; diff --git a/fdbclient/GenericManagementAPI.actor.h b/fdbclient/GenericManagementAPI.actor.h index 0c80f773e8..be51c5b17f 100644 --- a/fdbclient/GenericManagementAPI.actor.h +++ b/fdbclient/GenericManagementAPI.actor.h @@ -629,6 +629,17 @@ Future changeConfig(Reference db, // used by special keys and fdbcli std::string generateErrorMessage(const CoordinatorsResult& res); +ACTOR template +Future> tryGetTenantTransaction(Reference tr, TenantName name) { + state Key tenantMapKey = name.withPrefix(tenantMapPrefix); + + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + + Optional val = wait(safeThreadFutureToFuture(tr->get(tenantMapKey))); + return val.map([](Optional v) { return decodeTenantEntry(v.get()); }); +} + ACTOR template Future> tryGetTenant(Reference db, TenantName name) { state Reference tr = db->createTransaction(); @@ -636,17 +647,24 @@ Future> tryGetTenant(Reference db, TenantName name) loop { try { - 
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); - - Optional val = wait(safeThreadFutureToFuture(tr->get(tenantMapKey))); - return val.map([](Optional v) { return decodeTenantEntry(v.get()); }); + Optional entry = wait(tryGetTenantTransaction(tr, name)); + return entry; } catch (Error& e) { wait(safeThreadFutureToFuture(tr->onError(e))); } } } +ACTOR template +Future getTenantTransaction(Reference tr, TenantName name) { + Optional entry = wait(tryGetTenantTransaction(tr, name)); + if (!entry.present()) { + throw tenant_not_found(); + } + + return entry.get(); +} + ACTOR template Future getTenant(Reference db, TenantName name) { Optional entry = wait(tryGetTenant(db, name)); @@ -657,54 +675,64 @@ Future getTenant(Reference db, TenantName name) { return entry.get(); } -ACTOR template -Future createTenant(Reference db, TenantName name) { +ACTOR template +Future> createTenantTransaction(Reference tr, TenantName name) { + state Key tenantMapKey = name.withPrefix(tenantMapPrefix); + if (name.startsWith("\xff"_sr)) { throw invalid_tenant_name(); } + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, name)); + if (tenantEntry.present()) { + return Optional(); + } + + Optional tenantMode = wait(safeThreadFutureToFuture(tr->get(configKeysPrefix.withSuffix("tenant_mode"_sr)))); + + if (!tenantMode.present() || tenantMode.get() == StringRef(format("%d", TenantMode::DISABLED))) { + throw tenants_disabled(); + } + + state Future> tenantDataPrefixFuture = safeThreadFutureToFuture(tr->get(tenantDataPrefixKey)); + + state Optional lastIdVal = wait(safeThreadFutureToFuture(tr->get(tenantLastIdKey))); + Optional tenantDataPrefix = wait(tenantDataPrefixFuture); + + state TenantMapEntry newTenant(lastIdVal.present() ? 
TenantMapEntry::prefixToId(lastIdVal.get()) + 1 : 0, + tenantDataPrefix.present() ? (KeyRef)tenantDataPrefix.get() : ""_sr); + + RangeResult contents = wait(safeThreadFutureToFuture(tr->getRange(prefixRange(newTenant.prefix), 1))); + if (!contents.empty()) { + throw tenant_prefix_allocator_conflict(); + } + + tr->set(tenantLastIdKey, TenantMapEntry::idToPrefix(newTenant.id)); + tr->set(tenantMapKey, encodeTenantEntry(newTenant)); + + return newTenant; +} + +ACTOR template +Future createTenant(Reference db, TenantName name) { state Reference tr = db->createTransaction(); - state Key tenantMapKey = name.withPrefix(tenantMapPrefix); - - state bool tenantCheckCompleted = false; + state bool firstTry = true; loop { try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - - Optional val = wait(safeThreadFutureToFuture(tr->get(tenantMapKey))); - if (val.present()) { - if (!tenantCheckCompleted) { + if (firstTry) { + Optional entry = wait(tryGetTenantTransaction(tr, name)); + if (entry.present()) { throw tenant_already_exists(); - } else { - // If the tenant did not exist when we started trying to create it, then we will return success - // even if someone else created it simultaneously. This helps us avoid problems if the commit - // result for creating this tenant is unknown. - Version readVersion = wait(safeThreadFutureToFuture(tr->getReadVersion())); - TraceEvent("CreatedTenantAlready").detail("Tenant", name).detail("ReadVersion", readVersion); - return Void(); } - } else { - tenantCheckCompleted = true; + + firstTry = false; } - state Future> tenantDataPrefixFuture = - safeThreadFutureToFuture(tr->get(tenantDataPrefixKey)); - - state Optional lastIdVal = wait(safeThreadFutureToFuture(tr->get(tenantLastIdKey))); - Optional tenantDataPrefix = wait(tenantDataPrefixFuture); - - state TenantMapEntry newTenant(lastIdVal.present() ? 
TenantMapEntry::prefixToId(lastIdVal.get()) + 1 : 0, - tenantDataPrefix.present() ? (KeyRef)tenantDataPrefix.get() : ""_sr); - - RangeResult contents = wait(safeThreadFutureToFuture(tr->getRange(prefixRange(newTenant.prefix), 1))); - if (!contents.empty()) { - throw tenant_prefix_allocator_conflict(); - } - - tr->set(tenantLastIdKey, TenantMapEntry::idToPrefix(newTenant.id)); - tr->set(tenantMapKey, encodeTenantEntry(newTenant)); + state Optional newTenant = wait(createTenantTransaction(tr, name)); if (BUGGIFY) { throw commit_unknown_result(); @@ -718,8 +746,8 @@ Future createTenant(Reference db, TenantName name) { TraceEvent("CreatedTenant") .detail("Tenant", name) - .detail("TenantId", newTenant.id) - .detail("Prefix", newTenant.prefix) + .detail("TenantId", newTenant.present() ? newTenant.get().id : -1) + .detail("Prefix", newTenant.present() ? (StringRef)newTenant.get().prefix : "Unknown"_sr) .detail("Version", tr->getCommittedVersion()); return Void(); @@ -729,40 +757,45 @@ Future createTenant(Reference db, TenantName name) { } } +ACTOR template +Future deleteTenantTransaction(Reference tr, TenantName name) { + state Key tenantMapKey = name.withPrefix(tenantMapPrefix); + + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, name)); + if (!tenantEntry.present()) { + return Void(); + } + + RangeResult contents = wait(safeThreadFutureToFuture(tr->getRange(prefixRange(tenantEntry.get().prefix), 1))); + if (!contents.empty()) { + throw tenant_not_empty(); + } + + tr->clear(tenantMapKey); + + return Void(); +} + ACTOR template Future deleteTenant(Reference db, TenantName name) { state Reference tr = db->createTransaction(); - state Key tenantMapKey = name.withPrefix(tenantMapPrefix); - - state bool tenantCheckCompleted = false; + state bool firstTry = true; loop { try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - 
tr->setOption(FDBTransactionOptions::LOCK_AWARE); - state Optional tenantEntry = wait(tryGetTenant(db, name)); - if (!tenantEntry.present()) { - if (!tenantCheckCompleted) { + if (firstTry) { + Optional entry = wait(tryGetTenantTransaction(tr, name)); + if (!entry.present()) { throw tenant_not_found(); - } else { - // If the tenant existed when we started trying to delete it, then we will return success - // even if someone else deleted it simultaneously. This helps us avoid problems if the commit - // result for deleting this tenant is unknown. - Version readVersion = wait(safeThreadFutureToFuture(tr->getReadVersion())); - TraceEvent("DeletedTenantAlready").detail("Tenant", name).detail("ReadVersion", readVersion); - return Void(); } - } else { - tenantCheckCompleted = true; + + firstTry = false; } - RangeResult contents = - wait(safeThreadFutureToFuture(tr->getRange(prefixRange(tenantEntry.get().prefix), 1))); - if (!contents.empty()) { - throw tenant_not_empty(); - } - - tr->clear(tenantMapKey); + wait(deleteTenantTransaction(tr, name)); if (BUGGIFY) { throw commit_unknown_result(); @@ -782,6 +815,27 @@ Future deleteTenant(Reference db, TenantName name) { } } +ACTOR template +Future>> listTenantsTransaction(Reference tr, + StringRef begin, + StringRef end, + int limit) { + state KeyRange range = KeyRangeRef(begin, end).withPrefix(tenantMapPrefix); + + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + + RangeResult results = wait(safeThreadFutureToFuture( + tr->getRange(firstGreaterOrEqual(range.begin), firstGreaterOrEqual(range.end), limit))); + + Standalone> tenants; + for (auto kv : results) { + tenants.push_back_deep(tenants.arena(), kv.key.removePrefix(tenantMapPrefix)); + } + + return tenants; +} + ACTOR template Future>> listTenants(Reference db, StringRef begin, StringRef end, int limit) { state Reference tr = db->createTransaction(); @@ -789,16 +843,7 @@ Future>> listTenants(Reference db, 
Strin loop { try { - tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); - RangeResult results = wait(safeThreadFutureToFuture( - tr->getRange(firstGreaterOrEqual(range.begin), firstGreaterOrEqual(range.end), limit))); - - Standalone> tenants; - for (auto kv : results) { - tenants.push_back_deep(tenants.arena(), kv.key.removePrefix(tenantMapPrefix)); - } - + Standalone> tenants = wait(listTenantsTransaction(tr, begin, end, limit)); return tenants; } catch (Error& e) { wait(safeThreadFutureToFuture(tr->onError(e))); diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index c9128145bb..54bccd4849 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1924,6 +1924,7 @@ ACTOR static Future switchConnectionRecordImpl(ReferencecommitProxies.clear(); self->grvProxies.clear(); self->minAcceptableReadVersion = std::numeric_limits::max(); + self->tenantCache.clear(); self->invalidateCache(Key(), allKeys); auto clearedClientInfo = self->clientInfo->get(); @@ -2107,19 +2108,19 @@ Database Database::createDatabase(std::string connFileName, return Database::createDatabase(rccr, apiVersion, internal, clientLocality); } -Reference DatabaseContext::getWatchMetadata(Optional tenant, KeyRef key) const { - const auto it = watchMap.find(std::make_pair(tenant, key)); +Reference DatabaseContext::getWatchMetadata(int64_t tenantId, KeyRef key) const { + const auto it = watchMap.find(std::make_pair(tenantId, key)); if (it == watchMap.end()) return Reference(); return it->second; } void DatabaseContext::setWatchMetadata(Reference metadata) { - watchMap[std::make_pair(metadata->parameters->tenant, metadata->parameters->key)] = metadata; + watchMap[std::make_pair(metadata->parameters->tenant.tenantId, metadata->parameters->key)] = metadata; } -void DatabaseContext::deleteWatchMetadata(Optional tenant, KeyRef key) { - watchMap.erase(std::make_pair(tenant, key)); +void 
DatabaseContext::deleteWatchMetadata(int64_t tenantId, KeyRef key) { + watchMap.erase(std::make_pair(tenantId, key)); } void DatabaseContext::clearWatchMetadata() { @@ -2665,36 +2666,46 @@ ACTOR Future getKeyLocation_internal(Database cx, if (debugID.present()) g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getKeyLocation.Before"); - loop { - ++cx->transactionKeyServerLocationRequests; - choose { - when(wait(cx->onProxiesChanged())) {} - when(GetKeyServerLocationsReply rep = - wait(basicLoadBalance(cx->getCommitProxies(useProvisionalProxies), - &CommitProxyInterface::getKeyServersLocations, - GetKeyServerLocationsRequest(span.context, - tenant.castTo(), - key, - Optional(), - 100, - isBackward, - key.arena()), - TaskPriority::DefaultPromiseEndpoint))) { - ++cx->transactionKeyServerLocationRequestsCompleted; - if (debugID.present()) - g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getKeyLocation.After"); - ASSERT(rep.results.size() == 1); + try { + loop { + ++cx->transactionKeyServerLocationRequests; + choose { + when(wait(cx->onProxiesChanged())) {} + when(GetKeyServerLocationsReply rep = + wait(basicLoadBalance(cx->getCommitProxies(useProvisionalProxies), + &CommitProxyInterface::getKeyServersLocations, + GetKeyServerLocationsRequest(span.context, + tenant.castTo(), + key, + Optional(), + 100, + isBackward, + key.arena()), + TaskPriority::DefaultPromiseEndpoint))) { + ++cx->transactionKeyServerLocationRequestsCompleted; + if (debugID.present()) + g_traceBatch.addEvent( + "TransactionDebug", debugID.get().first(), "NativeAPI.getKeyLocation.After"); + ASSERT(rep.results.size() == 1); - auto locationInfo = - cx->setCachedLocation(tenant, rep.tenantEntry, rep.results[0].first, rep.results[0].second); - updateTssMappings(cx, rep); + auto locationInfo = + cx->setCachedLocation(tenant, rep.tenantEntry, rep.results[0].first, rep.results[0].second); + updateTssMappings(cx, rep); - return KeyRangeLocationInfo( - 
rep.tenantEntry, - KeyRange(toRelativeRange(rep.results[0].first, rep.tenantEntry.prefix), rep.arena), - locationInfo); + return KeyRangeLocationInfo( + rep.tenantEntry, + KeyRange(toRelativeRange(rep.results[0].first, rep.tenantEntry.prefix), rep.arena), + locationInfo); + } } } + } catch (Error& e) { + if (e.code() == error_code_tenant_not_found) { + ASSERT(tenant.present()); + cx->invalidateCachedTenant(tenant.get()); + } + + throw; } } @@ -2765,14 +2776,23 @@ Future getKeyLocation(Reference trState, F StorageServerInterface::*member, Reverse isBackward = Reverse::False, UseTenant useTenant = UseTenant::True) { - return getKeyLocation(trState->cx, - useTenant ? trState->tenant : Optional(), - key, - member, - trState->spanID, - trState->debugID, - trState->useProvisionalProxies, - isBackward); + auto f = getKeyLocation(trState->cx, + useTenant ? trState->tenant : Optional(), + key, + member, + trState->spanID, + trState->debugID, + trState->useProvisionalProxies, + isBackward); + + if (trState->tenant.present() && useTenant) { + return map(f, [trState](const KeyRangeLocationInfo& locationInfo) { + trState->tenantId = locationInfo.tenantEntry.id; + return locationInfo; + }); + } else { + return f; + } } ACTOR Future> getKeyRangeLocations_internal( @@ -2788,44 +2808,54 @@ ACTOR Future> getKeyRangeLocations_internal( if (debugID.present()) g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getKeyLocations.Before"); - loop { - ++cx->transactionKeyServerLocationRequests; - choose { - when(wait(cx->onProxiesChanged())) {} - when(GetKeyServerLocationsReply _rep = - wait(basicLoadBalance(cx->getCommitProxies(useProvisionalProxies), - &CommitProxyInterface::getKeyServersLocations, - GetKeyServerLocationsRequest(span.context, - tenant.castTo(), - keys.begin, - keys.end, - limit, - reverse, - keys.arena()), - TaskPriority::DefaultPromiseEndpoint))) { - ++cx->transactionKeyServerLocationRequestsCompleted; - state GetKeyServerLocationsReply rep = 
_rep; - if (debugID.present()) - g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getKeyLocations.After"); - ASSERT(rep.results.size()); + try { + loop { + ++cx->transactionKeyServerLocationRequests; + choose { + when(wait(cx->onProxiesChanged())) {} + when(GetKeyServerLocationsReply _rep = + wait(basicLoadBalance(cx->getCommitProxies(useProvisionalProxies), + &CommitProxyInterface::getKeyServersLocations, + GetKeyServerLocationsRequest(span.context, + tenant.castTo(), + keys.begin, + keys.end, + limit, + reverse, + keys.arena()), + TaskPriority::DefaultPromiseEndpoint))) { + ++cx->transactionKeyServerLocationRequestsCompleted; + state GetKeyServerLocationsReply rep = _rep; + if (debugID.present()) + g_traceBatch.addEvent( + "TransactionDebug", debugID.get().first(), "NativeAPI.getKeyLocations.After"); + ASSERT(rep.results.size()); - state std::vector results; - state int shard = 0; - for (; shard < rep.results.size(); shard++) { - // FIXME: these shards are being inserted into the map sequentially, it would be much more CPU - // efficient to save the map pairs and insert them all at once. - results.emplace_back( - rep.tenantEntry, - (toRelativeRange(rep.results[shard].first, rep.tenantEntry.prefix) & keys), - cx->setCachedLocation( - tenant, rep.tenantEntry, rep.results[shard].first, rep.results[shard].second)); - wait(yield()); + state std::vector results; + state int shard = 0; + for (; shard < rep.results.size(); shard++) { + // FIXME: these shards are being inserted into the map sequentially, it would be much more CPU + // efficient to save the map pairs and insert them all at once. 
+ results.emplace_back( + rep.tenantEntry, + (toRelativeRange(rep.results[shard].first, rep.tenantEntry.prefix) & keys), + cx->setCachedLocation( + tenant, rep.tenantEntry, rep.results[shard].first, rep.results[shard].second)); + wait(yield()); + } + updateTssMappings(cx, rep); + + return results; } - updateTssMappings(cx, rep); - - return results; } } + } catch (Error& e) { + if (e.code() == error_code_tenant_not_found) { + ASSERT(tenant.present()); + cx->invalidateCachedTenant(tenant.get()); + } + + throw; } } @@ -2883,15 +2913,25 @@ Future> getKeyRangeLocations(Referencecx, - useTenant ? trState->tenant : Optional(), - keys, - limit, - reverse, - member, - trState->spanID, - trState->debugID, - trState->useProvisionalProxies); + auto f = getKeyRangeLocations(trState->cx, + useTenant ? trState->tenant : Optional(), + keys, + limit, + reverse, + member, + trState->spanID, + trState->debugID, + trState->useProvisionalProxies); + + if (trState->tenant.present() && useTenant) { + return map(f, [trState](const std::vector& locationInfo) { + ASSERT(!locationInfo.empty()); + trState->tenantId = locationInfo[0].tenantEntry.id; + return locationInfo; + }); + } else { + return f; + } } ACTOR Future warmRange_impl(Reference trState, KeyRange keys) { @@ -2982,11 +3022,14 @@ TenantInfo TransactionState::getTenantInfo() const { if (!cx->internal && !options.rawAccess && cx->clientInfo->get().tenantMode == TenantMode::REQUIRED && !tenant.present()) { throw tenant_name_required(); - } else if (options.rawAccess) { + } else if (options.rawAccess || !tenant.present()) { return TenantInfo(); + } else if (cx->clientInfo->get().tenantMode == TenantMode::DISABLED && tenant.present()) { + throw tenants_disabled(); } - return TenantInfo(tenant); + ASSERT(tenantId != TenantInfo::INVALID_TENANT); + return TenantInfo(tenant.get(), tenantId); } Future Transaction::warmRange(KeyRange keys) { @@ -3069,7 +3112,7 @@ ACTOR Future> getValue(Reference trState, if (trState->trLogInfo && 
recordLogInfo) { int valueSize = reply.value.present() ? reply.value.get().size() : 0; trState->trLogInfo->addLog(FdbClientLogEvents::EventGet( - startTimeD, trState->cx->clientLocality.dcId(), latency, valueSize, key)); + startTimeD, trState->cx->clientLocality.dcId(), latency, valueSize, key, trState->tenant)); } trState->cx->getValueCompleted->latency = timer_int() - startTime; trState->cx->getValueCompleted->log(); @@ -3103,10 +3146,16 @@ ACTOR Future> getValue(Reference trState, (e.code() == error_code_transaction_too_old && ver == latestVersion)) { trState->cx->invalidateCache(useTenant ? locationInfo.tenantEntry.prefix : Key(), key); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); + } else if (e.code() == error_code_tenant_not_found) { + ASSERT(useTenant && trState->tenant.present()); + trState->cx->invalidateCachedTenant(trState->tenant.get()); } else { if (trState->trLogInfo && recordLogInfo) - trState->trLogInfo->addLog(FdbClientLogEvents::EventGetError( - startTimeD, trState->cx->clientLocality.dcId(), static_cast(e.code()), key)); + trState->trLogInfo->addLog(FdbClientLogEvents::EventGetError(startTimeD, + trState->cx->clientLocality.dcId(), + static_cast(e.code()), + key, + trState->tenant)); throw e; } } @@ -3199,6 +3248,9 @@ ACTOR Future getKey(Reference trState, trState->cx->invalidateCache(locationInfo.tenantEntry.prefix, k.getKey(), Reverse{ k.isBackward() }); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); + } else if (e.code() == error_code_tenant_not_found) { + ASSERT(useTenant && trState->tenant.present()); + trState->cx->invalidateCachedTenant(trState->tenant.get()); } else { TraceEvent(SevInfo, "GetKeyError").error(e).detail("AtKey", k.getKey()).detail("Offset", k.offset); throw e; @@ -3264,7 +3316,7 @@ ACTOR Future watchValue(Database cx, Reference p loop { state KeyRangeLocationInfo locationInfo = wait(getKeyLocation(cx, - parameters->tenant, + parameters->tenant.name, parameters->key, 
&StorageServerInterface::watchValue, parameters->spanID, @@ -3289,7 +3341,7 @@ ACTOR Future watchValue(Database cx, Reference p locationInfo.locations, &StorageServerInterface::watchValue, WatchValueRequest(span.context, - TenantInfo(parameters->tenant), + parameters->tenant, parameters->key, parameters->value, ver, @@ -3319,6 +3371,9 @@ ACTOR Future watchValue(Database cx, Reference p if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { cx->invalidateCache(locationInfo.tenantEntry.prefix, parameters->key); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, parameters->taskID)); + } else if (e.code() == error_code_tenant_not_found) { + ASSERT(parameters->tenant.name.present()); + cx->invalidateCachedTenant(parameters->tenant.name.get()); } else if (e.code() == error_code_watch_cancelled || e.code() == error_code_process_behind) { // clang-format off TEST(e.code() == error_code_watch_cancelled); // Too many watches on the storage server, poll for changes instead @@ -3338,22 +3393,22 @@ ACTOR Future watchValue(Database cx, Reference p } } -ACTOR Future watchStorageServerResp(Optional tenant, Key key, Database cx) { +ACTOR Future watchStorageServerResp(int64_t tenantId, Key key, Database cx) { loop { try { - state Reference metadata = cx->getWatchMetadata(tenant, key); + state Reference metadata = cx->getWatchMetadata(tenantId, key); if (!metadata.isValid()) return Void(); Version watchVersion = wait(watchValue(cx, metadata->parameters)); - metadata = cx->getWatchMetadata(tenant, key); + metadata = cx->getWatchMetadata(tenantId, key); if (!metadata.isValid()) return Void(); // case 1: version_1 (SS) >= version_2 (map) if (watchVersion >= metadata->parameters->version) { - cx->deleteWatchMetadata(tenant, key); + cx->deleteWatchMetadata(tenantId, key); if (metadata->watchPromise.canBeSet()) metadata->watchPromise.send(watchVersion); } @@ -3363,7 +3418,7 @@ ACTOR Future watchStorageServerResp(Optional tenant, Key key, // case 
2: version_1 < version_2 and future_count == 1 if (metadata->watchPromise.getFutureReferenceCount() == 1) { - cx->deleteWatchMetadata(tenant, key); + cx->deleteWatchMetadata(tenantId, key); } } } catch (Error& e) { @@ -3371,16 +3426,16 @@ ACTOR Future watchStorageServerResp(Optional tenant, Key key, throw e; } - Reference metadata = cx->getWatchMetadata(tenant, key); + Reference metadata = cx->getWatchMetadata(tenantId, key); if (!metadata.isValid()) { return Void(); } else if (metadata->watchPromise.getFutureReferenceCount() == 1) { - cx->deleteWatchMetadata(tenant, key); + cx->deleteWatchMetadata(tenantId, key); return Void(); } else if (e.code() == error_code_future_version) { continue; } - cx->deleteWatchMetadata(tenant, key); + cx->deleteWatchMetadata(tenantId, key); metadata->watchPromise.sendError(e); throw e; } @@ -3388,30 +3443,30 @@ ACTOR Future watchStorageServerResp(Optional tenant, Key key, } ACTOR Future sameVersionDiffValue(Database cx, Reference parameters) { - state ReadYourWritesTransaction tr(cx); + state ReadYourWritesTransaction tr(cx, parameters->tenant.name); loop { try { - if (!parameters->tenant.present() && parameters->key.startsWith(systemKeys.begin)) { + if (!parameters->tenant.name.present()) { tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); } state Optional valSS = wait(tr.get(parameters->key)); - Reference metadata = cx->getWatchMetadata(parameters->tenant, parameters->key); + Reference metadata = cx->getWatchMetadata(parameters->tenant.tenantId, parameters->key); // val_3 != val_1 (storage server value doesnt match value in map) if (metadata.isValid() && valSS != metadata->parameters->value) { - cx->deleteWatchMetadata(parameters->tenant, parameters->key); + cx->deleteWatchMetadata(parameters->tenant.tenantId, parameters->key); metadata->watchPromise.send(parameters->version); metadata->watchFutureSS.cancel(); } // val_3 == val_2 (storage server value matches value passed into the function -> new watch) - if (valSS == 
parameters->value) { + if (valSS == parameters->value && tr.getTransactionState()->tenantId == parameters->tenant.tenantId) { metadata = makeReference(parameters); cx->setWatchMetadata(metadata); - metadata->watchFutureSS = watchStorageServerResp(parameters->tenant, parameters->key, cx); + metadata->watchFutureSS = watchStorageServerResp(parameters->tenant.tenantId, parameters->key, cx); } // if val_3 != val_2 @@ -3429,14 +3484,14 @@ ACTOR Future sameVersionDiffValue(Database cx, Reference } Future getWatchFuture(Database cx, Reference parameters) { - Reference metadata = cx->getWatchMetadata(parameters->tenant, parameters->key); + Reference metadata = cx->getWatchMetadata(parameters->tenant.tenantId, parameters->key); // case 1: key not in map if (!metadata.isValid()) { metadata = makeReference(parameters); cx->setWatchMetadata(metadata); - metadata->watchFutureSS = watchStorageServerResp(parameters->tenant, parameters->key, cx); + metadata->watchFutureSS = watchStorageServerResp(parameters->tenant.tenantId, parameters->key, cx); return success(metadata->watchPromise.getFuture()); } // case 2: val_1 == val_2 (received watch with same value as key already in the map so just update) @@ -3451,7 +3506,7 @@ Future getWatchFuture(Database cx, Reference parameters) // recreate in SS) else if (parameters->version > metadata->parameters->version) { TEST(true); // Setting a watch that has a different value than the one in the map but a higher version (newer) - cx->deleteWatchMetadata(parameters->tenant, parameters->key); + cx->deleteWatchMetadata(parameters->tenant.tenantId, parameters->key); metadata->watchPromise.send(parameters->version); metadata->watchFutureSS.cancel(); @@ -3459,7 +3514,7 @@ Future getWatchFuture(Database cx, Reference parameters) metadata = makeReference(parameters); cx->setWatchMetadata(metadata); - metadata->watchFutureSS = watchStorageServerResp(parameters->tenant, parameters->key, cx); + metadata->watchFutureSS = 
watchStorageServerResp(parameters->tenant.tenantId, parameters->key, cx); return success(metadata->watchPromise.getFuture()); } @@ -3475,7 +3530,7 @@ Future getWatchFuture(Database cx, Reference parameters) } ACTOR Future watchValueMap(Future version, - Optional tenant, + TenantInfo tenant, Key key, Optional value, Database cx, @@ -3696,6 +3751,10 @@ Future getExactRange(Reference trState, wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); break; + } else if (e.code() == error_code_tenant_not_found) { + ASSERT(useTenant && trState->tenant.present()); + trState->cx->invalidateCachedTenant(trState->tenant.get()); + break; } else { TraceEvent(SevInfo, "GetExactRangeError") .error(e) @@ -3818,8 +3877,13 @@ void getRangeFinished(Reference trState, trState->cx->transactionKeysRead += result.size(); if (trState->trLogInfo) { - trState->trLogInfo->addLog(FdbClientLogEvents::EventGetRange( - startTime, trState->cx->clientLocality.dcId(), now() - startTime, bytes, begin.getKey(), end.getKey())); + trState->trLogInfo->addLog(FdbClientLogEvents::EventGetRange(startTime, + trState->cx->clientLocality.dcId(), + now() - startTime, + bytes, + begin.getKey(), + end.getKey(), + trState->tenant)); } if (!snapshot) { @@ -4137,6 +4201,9 @@ Future getRange(Reference trState, } wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); + } else if (e.code() == error_code_tenant_not_found) { + ASSERT(useTenant && trState->tenant.present()); + trState->cx->invalidateCachedTenant(trState->tenant.get()); } else { if (trState->trLogInfo) trState->trLogInfo->addLog( @@ -4144,7 +4211,8 @@ Future getRange(Reference trState, trState->cx->clientLocality.dcId(), static_cast(e.code()), begin.getKey(), - end.getKey())); + end.getKey(), + trState->tenant)); throw e; } @@ -4582,6 +4650,10 @@ ACTOR Future getRangeStreamFragment(Reference trState, wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); break; + } else if (e.code() == 
error_code_tenant_not_found) { + ASSERT(trState->tenant.present()); + trState->cx->invalidateCachedTenant(trState->tenant.get()); + break; } else { results->sendError(e); return Void(); @@ -4737,7 +4809,7 @@ Transaction::Transaction(Database const& cx, Optional const& tenant) cx->taskID, generateSpanID(cx->transactionTracingSample), createTrLogInfoProbabilistically(cx))), - span(trState->spanID, "Transaction"_loc), backoff(CLIENT_KNOBS->DEFAULT_BACKOFF), tr(tenant, trState->spanID) { + span(trState->spanID, "Transaction"_loc), backoff(CLIENT_KNOBS->DEFAULT_BACKOFF), tr(trState->spanID) { if (DatabaseContext::debugUseTags) { debugAddTags(trState); } @@ -4841,10 +4913,25 @@ void Watch::setWatch(Future watchFuture) { onSetWatchTrigger.send(Void()); } +ACTOR Future getTenantMetadata(Reference trState, Key key) { + KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, key, &StorageServerInterface::getValue)); + return trState->getTenantInfo(); +} + +Future populateAndGetTenant(Reference trState, Key const& key) { + if (!trState->tenant.present()) { + return TenantInfo(); + } else if (trState->tenantId != TenantInfo::INVALID_TENANT) { + return trState->getTenantInfo(); + } else { + return getTenantMetadata(trState, key); + } +} + // FIXME: This seems pretty horrible. Now a Database can't die until all of its watches do... 
ACTOR Future watch(Reference watch, Database cx, - Optional tenant, + Future tenant, TagSet tags, SpanID spanID, TaskPriority taskID, @@ -4859,6 +4946,7 @@ ACTOR Future watch(Reference watch, // NativeAPI finished commit and updated watchFuture when(wait(watch->onSetWatchTrigger.getFuture())) { + state TenantInfo tenantInfo = wait(tenant); loop { choose { // NativeAPI watchValue future finishes or errors @@ -4868,7 +4956,7 @@ ACTOR Future watch(Reference watch, TEST(true); // Recreated a watch after switch cx->clearWatchMetadata(); watch->watchFuture = watchValueMap(cx->minAcceptableReadVersion, - tenant, + tenantInfo, watch->key, watch->value, cx, @@ -4901,13 +4989,15 @@ Future Transaction::watch(Reference watch) { if (!trState->cx->internal && !trState->options.rawAccess && trState->cx->clientInfo->get().tenantMode == TenantMode::REQUIRED && !trState->tenant.present()) { throw tenant_name_required(); + } else if (trState->cx->clientInfo->get().tenantMode == TenantMode::DISABLED && trState->tenant.present()) { + throw tenants_disabled(); } trState->cx->addWatch(); watches.push_back(watch); return ::watch(watch, trState->cx, - trState->tenant, + populateAndGetTenant(trState, watch->key), trState->options.readTags, trState->spanID, trState->taskID, @@ -5395,7 +5485,7 @@ void TransactionOptions::reset(Database const& cx) { void Transaction::resetImpl(bool generateNewSpan) { flushTrLogsIfEnabled(); trState = trState->cloneAndReset(createTrLogInfoProbabilistically(trState->cx), generateNewSpan); - tr = CommitTransactionRequest(trState->tenant, trState->spanID); + tr = CommitTransactionRequest(trState->spanID); readVersion = Future(); metadataVersion = Promise>(); extraConflictRanges.clear(); @@ -5570,7 +5660,7 @@ void Transaction::setupWatches() { for (int i = 0; i < watches.size(); ++i) watches[i]->setWatch(watchValueMap(watchVersion, - trState->tenant, + trState->getTenantInfo(), watches[i]->key, watches[i]->value, trState->cx, @@ -5707,6 +5797,7 @@ ACTOR static 
Future tryCommit(Reference trState, state Key tenantPrefix; if (trState->tenant.present()) { KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, ""_sr, &StorageServerInterface::getValue)); + req.tenantInfo = trState->getTenantInfo(); applyTenantPrefix(req, locationInfo.tenantEntry.prefix); tenantPrefix = locationInfo.tenantEntry.prefix; } @@ -5784,7 +5875,8 @@ ACTOR static Future tryCommit(Reference trState, req.transaction.mutations.size(), req.transaction.mutations.expectedSize(), ci.version, - req)); + req, + trState->tenant)); return Void(); } else { // clear the RYW transaction which contains previous conflicting keys @@ -5851,7 +5943,7 @@ ACTOR static Future tryCommit(Reference trState, } if (trState->trLogInfo) trState->trLogInfo->addLog(FdbClientLogEvents::EventCommitError( - startTime, trState->cx->clientLocality.dcId(), static_cast(e.code()), req)); + startTime, trState->cx->clientLocality.dcId(), static_cast(e.code()), req, trState->tenant)); throw; } } @@ -6199,6 +6291,7 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optionaloptions.rawAccess = true; trState->tenant = Optional(); + trState->tenantId = TenantInfo::INVALID_TENANT; tr.tenantInfo = TenantInfo(); break; @@ -6384,8 +6477,12 @@ ACTOR Future extractReadVersion(Reference trState, } trState->cx->GRVLatencies.addSample(latency); if (trState->trLogInfo) - trState->trLogInfo->addLog(FdbClientLogEvents::EventGetVersion_V3( - trState->startTime, trState->cx->clientLocality.dcId(), latency, trState->options.priority, rep.version)); + trState->trLogInfo->addLog(FdbClientLogEvents::EventGetVersion_V3(trState->startTime, + trState->cx->clientLocality.dcId(), + latency, + trState->options.priority, + rep.version, + trState->tenant)); if (rep.locked && !trState->options.lockAware) throw database_locked(); @@ -7038,12 +7135,16 @@ ACTOR Future>> getRangeSplitPoints(Referencecx->invalidateCache(locations[0].tenantEntry.prefix, keys); + 
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); + } else if (e.code() == error_code_tenant_not_found) { + ASSERT(trState->tenant.present()); + trState->cx->invalidateCachedTenant(trState->tenant.get()); + } else { TraceEvent(SevError, "GetRangeSplitPoints").error(e); throw; } - trState->cx->invalidateCache(locations[0].tenantEntry.prefix, keys); - wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } } } diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index e3fa9bdd8e..d96c6c445e 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -238,6 +238,7 @@ struct Watch : public ReferenceCounted, NonCopyable { struct TransactionState : ReferenceCounted { Database cx; Optional tenant; + int64_t tenantId = TenantInfo::INVALID_TENANT; Reference trLogInfo; TransactionOptions options; diff --git a/flow/ProtocolVersion.h b/flow/ProtocolVersion.h index 99900acd7b..81deef5a4e 100644 --- a/flow/ProtocolVersion.h +++ b/flow/ProtocolVersion.h @@ -161,6 +161,7 @@ public: // introduced features PROTOCOL_VERSION_FEATURE(0x0FDB00B071010000LL, NetworkAddressHostnameFlag); PROTOCOL_VERSION_FEATURE(0x0FDB00B071010000LL, StorageMetadata); PROTOCOL_VERSION_FEATURE(0x0FDB00B071010000LL, PerpetualWiggleMetadata); + PROTOCOL_VERSION_FEATURE(0x0FDB00B071010000LL, Tenants); }; template <> From 1e1098ca9ac5bac5e3b348470d51228dd9f7ad9a Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 4 Mar 2022 12:17:09 -0800 Subject: [PATCH 087/138] Don't check whether tenants are enabled when watch is called. Support unknown_tenant error. 
--- fdbclient/ClientKnobs.cpp | 1 + fdbclient/ClientKnobs.h | 1 + fdbclient/NativeAPI.actor.cpp | 39 ++++++++++++++++++++++------------- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index 8b66d7dad8..1eb46cfe1b 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -61,6 +61,7 @@ void ClientKnobs::initialize(Randomize randomize) { init( WRONG_SHARD_SERVER_DELAY, .01 ); if( randomize && BUGGIFY ) WRONG_SHARD_SERVER_DELAY = deterministicRandom()->random01(); // FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is mostly wrong (e.g. dumping the database after a test) init( FUTURE_VERSION_RETRY_DELAY, .01 ); if( randomize && BUGGIFY ) FUTURE_VERSION_RETRY_DELAY = deterministicRandom()->random01();// FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; + init( UNKNOWN_TENANT_RETRY_DELAY, 0.0 ); if( randomize && BUGGIFY ) UNKNOWN_TENANT_RETRY_DELAY = deterministicRandom()->random01(); init( REPLY_BYTE_LIMIT, 80000 ); init( DEFAULT_BACKOFF, .01 ); if( randomize && BUGGIFY ) DEFAULT_BACKOFF = deterministicRandom()->random01(); init( DEFAULT_MAX_BACKOFF, 1.0 ); diff --git a/fdbclient/ClientKnobs.h b/fdbclient/ClientKnobs.h index c4597a6d8d..1c4a9ca874 100644 --- a/fdbclient/ClientKnobs.h +++ b/fdbclient/ClientKnobs.h @@ -60,6 +60,7 @@ public: double WRONG_SHARD_SERVER_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is // mostly wrong (e.g. 
dumping the database after a test) double FUTURE_VERSION_RETRY_DELAY; + double UNKNOWN_TENANT_RETRY_DELAY; int REPLY_BYTE_LIMIT; double DEFAULT_BACKOFF; double DEFAULT_MAX_BACKOFF; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 54bccd4849..bca0421e64 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -3146,9 +3146,10 @@ ACTOR Future> getValue(Reference trState, (e.code() == error_code_transaction_too_old && ver == latestVersion)) { trState->cx->invalidateCache(useTenant ? locationInfo.tenantEntry.prefix : Key(), key); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); - } else if (e.code() == error_code_tenant_not_found) { + } else if (e.code() == error_code_unknown_tenant) { ASSERT(useTenant && trState->tenant.present()); trState->cx->invalidateCachedTenant(trState->tenant.get()); + wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); } else { if (trState->trLogInfo && recordLogInfo) trState->trLogInfo->addLog(FdbClientLogEvents::EventGetError(startTimeD, @@ -3248,9 +3249,10 @@ ACTOR Future getKey(Reference trState, trState->cx->invalidateCache(locationInfo.tenantEntry.prefix, k.getKey(), Reverse{ k.isBackward() }); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); - } else if (e.code() == error_code_tenant_not_found) { + } else if (e.code() == error_code_unknown_tenant) { ASSERT(useTenant && trState->tenant.present()); trState->cx->invalidateCachedTenant(trState->tenant.get()); + wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); } else { TraceEvent(SevInfo, "GetKeyError").error(e).detail("AtKey", k.getKey()).detail("Offset", k.offset); throw e; @@ -3371,9 +3373,10 @@ ACTOR Future watchValue(Database cx, Reference p if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { cx->invalidateCache(locationInfo.tenantEntry.prefix, parameters->key); 
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, parameters->taskID)); - } else if (e.code() == error_code_tenant_not_found) { + } else if (e.code() == error_code_unknown_tenant) { ASSERT(parameters->tenant.name.present()); cx->invalidateCachedTenant(parameters->tenant.name.get()); + wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, parameters->taskID)); } else if (e.code() == error_code_watch_cancelled || e.code() == error_code_process_behind) { // clang-format off TEST(e.code() == error_code_watch_cancelled); // Too many watches on the storage server, poll for changes instead @@ -3751,9 +3754,10 @@ Future getExactRange(Reference trState, wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); break; - } else if (e.code() == error_code_tenant_not_found) { + } else if (e.code() == error_code_unknown_tenant) { ASSERT(useTenant && trState->tenant.present()); trState->cx->invalidateCachedTenant(trState->tenant.get()); + wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); break; } else { TraceEvent(SevInfo, "GetExactRangeError") @@ -4201,9 +4205,10 @@ Future getRange(Reference trState, } wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); - } else if (e.code() == error_code_tenant_not_found) { + } else if (e.code() == error_code_unknown_tenant) { ASSERT(useTenant && trState->tenant.present()); trState->cx->invalidateCachedTenant(trState->tenant.get()); + wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); } else { if (trState->trLogInfo) trState->trLogInfo->addLog( @@ -4650,9 +4655,10 @@ ACTOR Future getRangeStreamFragment(Reference trState, wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); break; - } else if (e.code() == error_code_tenant_not_found) { + } else if (e.code() == error_code_unknown_tenant) { ASSERT(trState->tenant.present()); trState->cx->invalidateCachedTenant(trState->tenant.get()); + wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); 
break; } else { results->sendError(e); @@ -4986,12 +4992,6 @@ Future Transaction::getRawReadVersion() { Future Transaction::watch(Reference watch) { ++trState->cx->transactionWatchRequests; - if (!trState->cx->internal && !trState->options.rawAccess && - trState->cx->clientInfo->get().tenantMode == TenantMode::REQUIRED && !trState->tenant.present()) { - throw tenant_name_required(); - } else if (trState->cx->clientInfo->get().tenantMode == TenantMode::DISABLED && trState->tenant.present()) { - throw tenants_disabled(); - } trState->cx->addWatch(); watches.push_back(watch); @@ -5797,11 +5797,12 @@ ACTOR static Future tryCommit(Reference trState, state Key tenantPrefix; if (trState->tenant.present()) { KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, ""_sr, &StorageServerInterface::getValue)); - req.tenantInfo = trState->getTenantInfo(); applyTenantPrefix(req, locationInfo.tenantEntry.prefix); tenantPrefix = locationInfo.tenantEntry.prefix; } + req.tenantInfo = trState->getTenantInfo(); + startTime = now(); state Optional commitID = Optional(); @@ -5933,6 +5934,10 @@ ACTOR static Future tryCommit(Reference trState, // The user needs to be informed that we aren't sure whether the commit happened. Standard retry loops // retry it anyway (relying on transaction idempotence) but a client might do something else. 
throw commit_unknown_result(); + } else if (e.code() == error_code_unknown_tenant) { + ASSERT(trState->tenant.present()); + trState->cx->invalidateCachedTenant(trState->tenant.get()); + throw; } else { if (e.code() != error_code_transaction_too_old && e.code() != error_code_not_committed && e.code() != error_code_database_locked && e.code() != error_code_proxy_memory_limit_exceeded && @@ -6788,6 +6793,11 @@ Future Transaction::onError(Error const& e) { reset(); return delay(std::min(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, maxBackoff), trState->taskID); } + if (e.code() == error_code_unknown_tenant) { + double maxBackoff = trState->options.maxBackoff; + reset(); + return delay(std::min(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, maxBackoff), trState->taskID); + } return e; } @@ -7138,9 +7148,10 @@ ACTOR Future>> getRangeSplitPoints(Referencecx->invalidateCache(locations[0].tenantEntry.prefix, keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); - } else if (e.code() == error_code_tenant_not_found) { + } else if (e.code() == error_code_unknown_tenant) { ASSERT(trState->tenant.present()); trState->cx->invalidateCachedTenant(trState->tenant.get()); + wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); } else { TraceEvent(SevError, "GetRangeSplitPoints").error(e); throw; From 8bc2b283e137b6f37b1a6ce56e4657ce6afbccce Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Sun, 6 Mar 2022 22:16:42 -0800 Subject: [PATCH 088/138] Create a special keys API to create, delete, and read tenants. Remove the C API to create/delete tenants. 
--- bindings/c/fdb_c.cpp | 8 -- documentation/sphinx/source/special-keys.rst | 1 + fdbclient/GenericManagementAPI.actor.h | 26 +++--- fdbclient/IClientApi.h | 6 -- fdbclient/MultiVersionTransaction.actor.cpp | 36 ------- fdbclient/MultiVersionTransaction.h | 12 --- fdbclient/NativeAPI.actor.cpp | 7 +- fdbclient/SpecialKeySpace.actor.cpp | 98 +++++++++++++++++++- fdbclient/SpecialKeySpace.actor.h | 11 +++ fdbclient/ThreadSafeTransaction.cpp | 18 ---- fdbclient/ThreadSafeTransaction.h | 6 -- 11 files changed, 129 insertions(+), 100 deletions(-) diff --git a/bindings/c/fdb_c.cpp b/bindings/c/fdb_c.cpp index 2b1b6272cf..f5a428384d 100644 --- a/bindings/c/fdb_c.cpp +++ b/bindings/c/fdb_c.cpp @@ -449,14 +449,6 @@ extern "C" DLLEXPORT FDBFuture* fdb_database_get_server_protocol(FDBDatabase* db }).extractPtr()); } -extern "C" DLLEXPORT FDBFuture* fdb_database_allocate_tenant(FDBDatabase* db, uint8_t const* name, int name_length) { - return (FDBFuture*)(DB(db)->createTenant(TenantNameRef(name, name_length)).extractPtr()); -} - -extern "C" DLLEXPORT FDBFuture* fdb_database_remove_tenant(FDBDatabase* db, uint8_t const* name, int name_length) { - return (FDBFuture*)(DB(db)->deleteTenant(TenantNameRef(name, name_length)).extractPtr()); -} - extern "C" DLLEXPORT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant, FDBTransaction** out_transaction) { CATCH_AND_RETURN(*out_transaction = (FDBTransaction*)TENANT(tenant)->createTransaction().extractPtr();); } diff --git a/documentation/sphinx/source/special-keys.rst b/documentation/sphinx/source/special-keys.rst index 0dd5840ab8..96004272a4 100644 --- a/documentation/sphinx/source/special-keys.rst +++ b/documentation/sphinx/source/special-keys.rst @@ -205,6 +205,7 @@ that process, and wait for necessary data to be moved away. #. ``\xff\xff/management/failed_locality/`` Read/write. Indicates that the cluster should consider matching processes as permanently failed. 
This allows the cluster to avoid maintaining extra state and doing extra work in the hope that these processes come back. See :ref:`removing machines from a cluster ` for documentation for the corresponding fdbcli command. #. ``\xff\xff/management/options/excluded_locality/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/excluded_locality/``. Setting this key only has an effect in the current transaction and is not persisted on commit. #. ``\xff\xff/management/options/failed_locality/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/failed_locality/``. Setting this key only has an effect in the current transaction and is not persisted on commit. +#. ``\xff\xff/management/tenant_map/`` Read/write. Setting a key in this range to any value will result in a tenant being created with name ``. Clearing a key in this range will delete the tenant with name ``. Reading all or a portion of this range will return the list of tenants currently present in the cluster, excluding any created in this transaction. Values read in this range will be JSON objects containing the metadata for the associated tenants. Note: the tenants key-space does not support range clears. An exclusion is syntactically either an ip address (e.g. ``127.0.0.1``), or an ip address and port (e.g. 
``127.0.0.1:4500``) or any locality (e.g ``locality_dcid:primary-satellite`` or diff --git a/fdbclient/GenericManagementAPI.actor.h b/fdbclient/GenericManagementAPI.actor.h index be51c5b17f..ebaa924cd9 100644 --- a/fdbclient/GenericManagementAPI.actor.h +++ b/fdbclient/GenericManagementAPI.actor.h @@ -630,7 +630,7 @@ Future changeConfig(Reference db, std::string generateErrorMessage(const CoordinatorsResult& res); ACTOR template -Future> tryGetTenantTransaction(Reference tr, TenantName name) { +Future> tryGetTenantTransaction(Transaction tr, TenantName name) { state Key tenantMapKey = name.withPrefix(tenantMapPrefix); tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); @@ -656,7 +656,7 @@ Future> tryGetTenant(Reference db, TenantName name) } ACTOR template -Future getTenantTransaction(Reference tr, TenantName name) { +Future getTenantTransaction(Transaction tr, TenantName name) { Optional entry = wait(tryGetTenantTransaction(tr, name)); if (!entry.present()) { throw tenant_not_found(); @@ -676,7 +676,7 @@ Future getTenant(Reference db, TenantName name) { } ACTOR template -Future> createTenantTransaction(Reference tr, TenantName name) { +Future> createTenantTransaction(Transaction tr, TenantNameRef name) { state Key tenantMapKey = name.withPrefix(tenantMapPrefix); if (name.startsWith("\xff"_sr)) { @@ -758,7 +758,7 @@ Future createTenant(Reference db, TenantName name) { } ACTOR template -Future deleteTenantTransaction(Reference tr, TenantName name) { +Future deleteTenantTransaction(Transaction tr, TenantNameRef name) { state Key tenantMapKey = name.withPrefix(tenantMapPrefix); tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); @@ -816,9 +816,9 @@ Future deleteTenant(Reference db, TenantName name) { } ACTOR template -Future>> listTenantsTransaction(Reference tr, - StringRef begin, - StringRef end, +Future> listTenantsTransaction(Transaction tr, + TenantNameRef begin, + TenantNameRef end, int limit) { state KeyRange range = KeyRangeRef(begin, 
end).withPrefix(tenantMapPrefix); @@ -828,22 +828,24 @@ Future>> listTenantsTransaction(ReferencegetRange(firstGreaterOrEqual(range.begin), firstGreaterOrEqual(range.end), limit))); - Standalone> tenants; + std::map tenants; for (auto kv : results) { - tenants.push_back_deep(tenants.arena(), kv.key.removePrefix(tenantMapPrefix)); + tenants[kv.key.removePrefix(tenantMapPrefix)] = decodeTenantEntry(kv.value); } return tenants; } ACTOR template -Future>> listTenants(Reference db, StringRef begin, StringRef end, int limit) { +Future> listTenants(Reference db, + TenantName begin, + TenantName end, + int limit) { state Reference tr = db->createTransaction(); - state KeyRange range = KeyRangeRef(begin, end).withPrefix(tenantMapPrefix); loop { try { - Standalone> tenants = wait(listTenantsTransaction(tr, begin, end, limit)); + std::map tenants = wait(listTenantsTransaction(tr, begin, end, limit)); return tenants; } catch (Error& e) { wait(safeThreadFutureToFuture(tr->onError(e))); diff --git a/fdbclient/IClientApi.h b/fdbclient/IClientApi.h index a6aa5f3f00..8d224fe3d8 100644 --- a/fdbclient/IClientApi.h +++ b/fdbclient/IClientApi.h @@ -140,12 +140,6 @@ public: virtual ThreadFuture getServerProtocol( Optional expectedVersion = Optional()) = 0; - // Registers a tenant with the given name. A prefix is automatically allocated for the tenant. - virtual ThreadFuture createTenant(TenantNameRef const& tenantName) = 0; - - // Deletes the tenant with the given name. The tenant must be empty. 
- virtual ThreadFuture deleteTenant(TenantNameRef const& tenantName) = 0; - virtual void addref() = 0; virtual void delref() = 0; diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index 9617917fff..d6a301bba8 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -494,26 +494,6 @@ ThreadFuture DLDatabase::getServerProtocol(Optional DLDatabase::createTenant(TenantNameRef const& tenantName) { - if (api->databaseAllocateTenant == nullptr) { - throw unsupported_operation(); - } - - FdbCApi::FDBFuture* f = api->databaseAllocateTenant(db, tenantName.begin(), tenantName.size()); - return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { return Void(); }); -} - -// Deletes the tenant with the given name. The tenant must be empty. -ThreadFuture DLDatabase::deleteTenant(TenantNameRef const& tenantName) { - if (api->databaseRemoveTenant == nullptr) { - throw unsupported_operation(); - } - - FdbCApi::FDBFuture* f = api->databaseRemoveTenant(db, tenantName.begin(), tenantName.size()); - return toThreadFuture(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) { return Void(); }); -} - // DLApi // Loads the specified function from a dynamic library @@ -1413,22 +1393,6 @@ ThreadFuture MultiVersionDatabase::getServerProtocol(Optional

versionMonitorDb->getServerProtocol(expectedVersion); } -// Registers a tenant with the given name. A prefix is automatically allocated for the tenant. -ThreadFuture MultiVersionDatabase::createTenant(TenantNameRef const& tenantName) { - Standalone tenantNameCopy = tenantName; - Reference self = Reference::addRef(this); - - return onMainThread([self, tenantNameCopy]() { return ManagementAPI::createTenant(self, tenantNameCopy); }); -} - -// Deletes the tenant with the given name. The tenant must be empty. -ThreadFuture MultiVersionDatabase::deleteTenant(TenantNameRef const& tenantName) { - Standalone tenantNameCopy = tenantName; - Reference self = Reference::addRef(this); - - return onMainThread([self, tenantNameCopy]() { return ManagementAPI::deleteTenant(self, tenantNameCopy); }); -} - MultiVersionDatabase::DatabaseState::DatabaseState(std::string clusterFilePath, Reference versionMonitorDb) : dbVar(new ThreadSafeAsyncVar>(Reference(nullptr))), clusterFilePath(clusterFilePath), versionMonitorDb(versionMonitorDb), closed(false) {} diff --git a/fdbclient/MultiVersionTransaction.h b/fdbclient/MultiVersionTransaction.h index 38e662bb60..da1fe3affb 100644 --- a/fdbclient/MultiVersionTransaction.h +++ b/fdbclient/MultiVersionTransaction.h @@ -421,12 +421,6 @@ public: ThreadFuture getServerProtocol( Optional expectedVersion = Optional()) override; - // Registers a tenant with the given name. A prefix is automatically allocated for the tenant. - ThreadFuture createTenant(TenantNameRef const& tenantName) override; - - // Deletes the tenant with the given name. The tenant must be empty. - ThreadFuture deleteTenant(TenantNameRef const& tenantName) override; - void addref() override { ThreadSafeReferenceCounted::addref(); } void delref() override { ThreadSafeReferenceCounted::delref(); } @@ -686,12 +680,6 @@ public: ThreadFuture getServerProtocol( Optional expectedVersion = Optional()) override; - // Registers a tenant with the given name. 
A prefix is automatically allocated for the tenant. - ThreadFuture createTenant(TenantNameRef const& tenantName) override; - - // Deletes the tenant with the given name. The tenant must be empty. - ThreadFuture deleteTenant(TenantNameRef const& tenantName) override; - void addref() override { ThreadSafeReferenceCounted::addref(); } void delref() override { ThreadSafeReferenceCounted::delref(); } diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index bca0421e64..ff4293e06c 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1368,6 +1368,12 @@ DatabaseContext::DatabaseContext(ReferenceINIT_MID_SHARD_BYTES); + if (apiVersionAtLeast(710)) { + registerSpecialKeySpaceModule( + SpecialKeySpace::MODULE::MANAGEMENT, + SpecialKeySpace::IMPLTYPE::READWRITE, + std::make_unique(SpecialKeySpace::getManagementApiCommandRange("tenantmap"))); + } if (apiVersionAtLeast(700)) { registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::ERRORMSG, SpecialKeySpace::IMPLTYPE::READONLY, @@ -4992,7 +4998,6 @@ Future Transaction::getRawReadVersion() { Future Transaction::watch(Reference watch) { ++trState->cx->transactionWatchRequests; - trState->cx->addWatch(); watches.push_back(watch); return ::watch(watch, diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index fad13a1ac5..382206e48c 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -55,6 +55,8 @@ static bool isAlphaNumeric(const std::string& key) { } } // namespace +const KeyRangeRef TenantMapRangeImpl::submoduleRange = KeyRangeRef("tenant_map/"_sr, "tenant_map0"_sr); + std::unordered_map SpecialKeySpace::moduleToBoundary = { { SpecialKeySpace::MODULE::TRANSACTION, KeyRangeRef(LiteralStringRef("\xff\xff/transaction/"), LiteralStringRef("\xff\xff/transaction0")) }, @@ -112,7 +114,8 @@ std::unordered_map SpecialKeySpace::managementApiCommandT .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, { 
"datadistribution", KeyRangeRef(LiteralStringRef("data_distribution/"), LiteralStringRef("data_distribution0")) - .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) } + .withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }, + { "tenantmap", TenantMapRangeImpl::submoduleRange.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) } }; std::unordered_map SpecialKeySpace::actorLineageApiCommandToRange = { @@ -2699,3 +2702,96 @@ Future> FailedLocalitiesRangeImpl::commit(ReadYourWritesTr // exclude locality with failed option as true. return excludeLocalityCommitActor(ryw, true); } + +ACTOR Future getTenantList(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) { + KeyRangeRef tenantRange = + kr.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin) + .removePrefix(TenantMapRangeImpl::submoduleRange.begin); + state KeyRef managementPrefix = + kr.begin.substr(0, + SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin.size() + + TenantMapRangeImpl::submoduleRange.begin.size()); + + std::map tenants = wait(ManagementAPI::listTenantsTransaction( + Reference::addRef(ryw), tenantRange.begin, tenantRange.end, limitsHint.rows)); + + RangeResult results; + for (auto tenant : tenants) { + json_spirit::mObject tenantEntry; + tenantEntry["id"] = tenant.second.id; + tenantEntry["prefix"] = printable(tenant.second.prefix); + std::string tenantEntryString = + json_spirit::write_string(json_spirit::mValue(tenantEntry), json_spirit::Output_options::raw_utf8); + ValueRef tenantEntryBytes(results.arena(), tenantEntryString); + results.push_back(results.arena(), + KeyValueRef(tenant.first.withPrefix(managementPrefix, results.arena()), tenantEntryBytes)); + } + + return results; +} + +TenantMapRangeImpl::TenantMapRangeImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {} + +Future TenantMapRangeImpl::getRange(ReadYourWritesTransaction* ryw, + KeyRangeRef kr, + GetRangeLimits limitsHint) const { + return 
getTenantList(ryw, kr, limitsHint); +} + +ACTOR Future deleteTenantRange(ReadYourWritesTransaction* ryw, TenantName beginTenant, TenantName endTenant) { + std::map tenants = wait( + ManagementAPI::listTenantsTransaction(&ryw->getTransaction(), beginTenant, endTenant, CLIENT_KNOBS->TOO_MANY)); + + if (tenants.size() == CLIENT_KNOBS->TOO_MANY) { + TraceEvent(SevWarn, "DeleteTenantRangeTooLange") + .detail("BeginTenant", beginTenant) + .detail("EndTenant", endTenant); + ryw->setSpecialKeySpaceErrorMsg("too many tenants to range delete"); + throw special_keys_api_failure(); + } + + std::vector> deleteFutures; + for (auto tenant : tenants) { + deleteFutures.push_back(ManagementAPI::deleteTenantTransaction(&ryw->getTransaction(), tenant.first)); + } + + wait(waitForAll(deleteFutures)); + return Void(); +} + +Future> TenantMapRangeImpl::commit(ReadYourWritesTransaction* ryw) { + auto ranges = ryw->getSpecialKeySpaceWriteMap().containedRanges(range); + std::vector> tenantManagementFutures; + for (auto range : ranges) { + if (!range.value().first) { + continue; + } + + TenantNameRef tenantName = + range.begin() + .removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin) + .removePrefix(TenantMapRangeImpl::submoduleRange.begin); + + if (range.value().second.present()) { + tenantManagementFutures.push_back( + success(ManagementAPI::createTenantTransaction(&ryw->getTransaction(), tenantName))); + } else { + // For a single key clear, just issue the delete + if (KeyRangeRef(range.begin(), range.end()).singleKeyRange()) { + tenantManagementFutures.push_back( + ManagementAPI::deleteTenantTransaction(&ryw->getTransaction(), tenantName)); + } else { + TenantNameRef endTenant = range.end().removePrefix( + SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin); + if (endTenant.startsWith(submoduleRange.begin)) { + endTenant = endTenant.removePrefix(submoduleRange.end); + } else { + endTenant = "\xff"_sr; + } + 
tenantManagementFutures.push_back(deleteTenantRange(ryw, tenantName, endTenant)); + } + } + } + + return tag(waitForAll(tenantManagementFutures), Optional()); +} diff --git a/fdbclient/SpecialKeySpace.actor.h b/fdbclient/SpecialKeySpace.actor.h index 857c8c8ab0..4a90e3a796 100644 --- a/fdbclient/SpecialKeySpace.actor.h +++ b/fdbclient/SpecialKeySpace.actor.h @@ -528,5 +528,16 @@ public: Future> commit(ReadYourWritesTransaction* ryw) override; }; +class TenantMapRangeImpl : public SpecialKeyRangeRWImpl { +public: + const static KeyRangeRef submoduleRange; + + explicit TenantMapRangeImpl(KeyRangeRef kr); + Future getRange(ReadYourWritesTransaction* ryw, + KeyRangeRef kr, + GetRangeLimits limitsHint) const override; + Future> commit(ReadYourWritesTransaction* ryw) override; +}; + #include "flow/unactorcompiler.h" #endif diff --git a/fdbclient/ThreadSafeTransaction.cpp b/fdbclient/ThreadSafeTransaction.cpp index 1e16071fd5..fb0cb5b578 100644 --- a/fdbclient/ThreadSafeTransaction.cpp +++ b/fdbclient/ThreadSafeTransaction.cpp @@ -116,24 +116,6 @@ ThreadFuture ThreadSafeDatabase::getServerProtocol(Optional Future { return db->getClusterProtocol(expectedVersion); }); } -// Registers a tenant with the given name. A prefix is automatically allocated for the tenant. -ThreadFuture ThreadSafeDatabase::createTenant(TenantNameRef const& name) { - DatabaseContext* db = this->db; - TenantName tenantNameCopy = name; - return onMainThread([db, tenantNameCopy]() -> Future { - return ManagementAPI::createTenant(Reference::addRef(db), tenantNameCopy); - }); -} - -// Deletes the tenant with the given name. The tenant must be empty. 
-ThreadFuture ThreadSafeDatabase::deleteTenant(TenantNameRef const& name) { - DatabaseContext* db = this->db; - TenantName tenantNameCopy = name; - return onMainThread([db, tenantNameCopy]() -> Future { - return ManagementAPI::deleteTenant(Reference::addRef(db), tenantNameCopy); - }); -} - ThreadSafeDatabase::ThreadSafeDatabase(std::string connFilename, int apiVersion) { ClusterConnectionFile* connFile = new ClusterConnectionFile(ClusterConnectionFile::lookupClusterFileName(connFilename).first); diff --git a/fdbclient/ThreadSafeTransaction.h b/fdbclient/ThreadSafeTransaction.h index 8a7b0ff68f..b3c46ea6d4 100644 --- a/fdbclient/ThreadSafeTransaction.h +++ b/fdbclient/ThreadSafeTransaction.h @@ -47,12 +47,6 @@ public: ThreadFuture getServerProtocol( Optional expectedVersion = Optional()) override; - // Registers a tenant with the given name. A prefix is automatically allocated for the tenant. - ThreadFuture createTenant(TenantNameRef const& name) override; - - // Deletes the tenant with the given name. The tenant must be empty. - ThreadFuture deleteTenant(TenantNameRef const& name) override; - // Returns after a majority of coordination servers are available and have reported a leader. The // cluster file therefore is valid, but the database might be unavailable. ThreadFuture onConnected(); From 68069c978413f2432977ab858fd976f8904e5bb8 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Mon, 7 Mar 2022 11:32:51 -0800 Subject: [PATCH 089/138] Undo some unused changes. Fix tenant cache eviction. 
--- bindings/c/foundationdb/fdb_c.h | 8 -------- documentation/sphinx/source/special-keys.rst | 2 +- fdbclient/DatabaseContext.h | 1 + fdbclient/MultiVersionTransaction.actor.cpp | 3 --- fdbclient/MultiVersionTransaction.h | 2 -- fdbclient/NativeAPI.actor.cpp | 20 ++++++++++++++------ 6 files changed, 16 insertions(+), 20 deletions(-) diff --git a/bindings/c/foundationdb/fdb_c.h b/bindings/c/foundationdb/fdb_c.h index 140e3f4ca0..214a9bd124 100644 --- a/bindings/c/foundationdb/fdb_c.h +++ b/bindings/c/foundationdb/fdb_c.h @@ -300,14 +300,6 @@ DLLEXPORT WARN_UNUSED_RESULT double fdb_database_get_main_thread_busyness(FDBDat DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_get_server_protocol(FDBDatabase* db, uint64_t expected_version); -DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_allocate_tenant(FDBDatabase* db, - uint8_t const* name, - int name_length); - -DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_remove_tenant(FDBDatabase* db, - uint8_t const* name, - int name_length); - DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant, FDBTransaction** out_transaction); diff --git a/documentation/sphinx/source/special-keys.rst b/documentation/sphinx/source/special-keys.rst index 96004272a4..a722e63b33 100644 --- a/documentation/sphinx/source/special-keys.rst +++ b/documentation/sphinx/source/special-keys.rst @@ -205,7 +205,7 @@ that process, and wait for necessary data to be moved away. #. ``\xff\xff/management/failed_locality/`` Read/write. Indicates that the cluster should consider matching processes as permanently failed. This allows the cluster to avoid maintaining extra state and doing extra work in the hope that these processes come back. See :ref:`removing machines from a cluster ` for documentation for the corresponding fdbcli command. #. ``\xff\xff/management/options/excluded_locality/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/excluded_locality/``. 
Setting this key only has an effect in the current transaction and is not persisted on commit. #. ``\xff\xff/management/options/failed_locality/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/failed_locality/``. Setting this key only has an effect in the current transaction and is not persisted on commit. -#. ``\xff\xff/management/tenant_map/`` Read/write. Setting a key in this range to any value will result in a tenant being created with name ``. Clearing a key in this range will delete the tenant with name ``. Reading all or a portion of this range will return the list of tenants currently present in the cluster, excluding any created in this transaction. Values read in this range will be JSON objects containing the metadata for the associated tenants. Note: the tenants key-space does not support range clears. +#. ``\xff\xff/management/tenant_map/`` Read/write. Setting a key in this range to any value will result in a tenant being created with name ``. Clearing a key in this range will delete the tenant with name ``. Reading all or a portion of this range will return the list of tenants currently present in the cluster, excluding any changes in this transaction. Values read in this range will be JSON objects containing the metadata for the associated tenants. Note: the tenants key-space does not support range clears. An exclusion is syntactically either an ip address (e.g. ``127.0.0.1``), or an ip address and port (e.g. 
``127.0.0.1:4500``) or any locality (e.g ``locality_dcid:primary-satellite`` or diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index 71d0c2ffc1..40d9a94f42 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -431,6 +431,7 @@ public: CoalescedKeyRangeMap> locationCache; std::unordered_map failedEndpointsOnHealthyServersInfo; std::unordered_map tenantCache; + std::vector tenantCacheList; std::map server_interf; std::map blobWorker_interf; // blob workers don't change endpoints for the same ID diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index d6a301bba8..141dc49d43 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -554,9 +554,6 @@ void DLApi::init() { headerVersion >= 700); loadClientFunction( &api->databaseGetServerProtocol, lib, fdbCPath, "fdb_database_get_server_protocol", headerVersion >= 700); - loadClientFunction( - &api->databaseAllocateTenant, lib, fdbCPath, "fdb_database_allocate_tenant", headerVersion >= 710); - loadClientFunction(&api->databaseRemoveTenant, lib, fdbCPath, "fdb_database_remove_tenant", headerVersion >= 710); loadClientFunction(&api->databaseDestroy, lib, fdbCPath, "fdb_database_destroy", headerVersion >= 0); loadClientFunction(&api->databaseRebootWorker, lib, fdbCPath, "fdb_database_reboot_worker", headerVersion >= 700); loadClientFunction(&api->databaseForceRecoveryWithDataLoss, diff --git a/fdbclient/MultiVersionTransaction.h b/fdbclient/MultiVersionTransaction.h index da1fe3affb..58bfbfd4c7 100644 --- a/fdbclient/MultiVersionTransaction.h +++ b/fdbclient/MultiVersionTransaction.h @@ -144,8 +144,6 @@ struct FdbCApi : public ThreadSafeReferenceCounted { int snapshotCommandLength); double (*databaseGetMainThreadBusyness)(FDBDatabase* database); FDBFuture* (*databaseGetServerProtocol)(FDBDatabase* database, uint64_t expectedVersion); - FDBFuture* 
(*databaseAllocateTenant)(FDBDatabase* database, uint8_t const* name, int name_length); - FDBFuture* (*databaseRemoveTenant)(FDBDatabase* database, uint8_t const* name, int name_length); // Tenant fdb_error_t (*tenantCreateTransaction)(FDBTenant* tenant, FDBTransaction** outTransaction); diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index ff4293e06c..78933e0c31 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1713,9 +1713,17 @@ bool DatabaseContext::getCachedLocations(const Optional& tenantName, } void DatabaseContext::cacheTenant(const TenantName& tenant, const TenantMapEntry& tenantEntry) { - int attempts = 0; - while (tenantCache.size() > tenantCacheSize && attempts++ < 100) { + if (tenantCacheSize > 0) { + int attempts = 0; + while (tenantCache.size() > tenantCacheSize && attempts++ < 100) { + int randomEntry = deterministicRandom()->randomInt(0, tenantCacheList.size()); + tenantCache.erase(tenantCacheList[randomEntry]); + tenantCacheList[randomEntry] = tenantCacheList.back(); + tenantCacheList.pop_back(); + } + tenantCache[tenant] = tenantEntry; + tenantCacheList.push_back(tenant); } } @@ -3150,7 +3158,7 @@ ACTOR Future> getValue(Reference trState, } if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || (e.code() == error_code_transaction_too_old && ver == latestVersion)) { - trState->cx->invalidateCache(useTenant ? 
locationInfo.tenantEntry.prefix : Key(), key); + trState->cx->invalidateCache(locationInfo.tenantEntry.prefix, key); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else if (e.code() == error_code_unknown_tenant) { ASSERT(useTenant && trState->tenant.present()); @@ -4565,7 +4573,7 @@ ACTOR Future getRangeStreamFragment(Reference trState, // If the reply says there is more but we know that we finished the shard, then fix rep.more if (reverse && output.more && rep.data.size() > 0 && - rep.data[rep.data.size() - 1].key == locations[shard].range.begin) { + output[output.size() - 1].key == locations[shard].range.begin) { output.more = false; } @@ -4585,10 +4593,10 @@ ACTOR Future getRangeStreamFragment(Reference trState, // Make next request to the same shard with a beginning key just after the last key returned if (reverse) locations[shard].range = - KeyRangeRef(locations[shard].range.begin, rep.data[rep.data.size() - 1].key); + KeyRangeRef(locations[shard].range.begin, output[output.size() - 1].key); else locations[shard].range = - KeyRangeRef(keyAfter(rep.data[rep.data.size() - 1].key), locations[shard].range.end); + KeyRangeRef(keyAfter(output[output.size() - 1].key), locations[shard].range.end); } if (locations[shard].range.empty()) { From d6247e32ef8e90f9fc6ed0166a02df9f6c5fbc27 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Mon, 7 Mar 2022 11:35:54 -0800 Subject: [PATCH 090/138] Fix documentation error --- documentation/sphinx/source/special-keys.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/sphinx/source/special-keys.rst b/documentation/sphinx/source/special-keys.rst index a722e63b33..7726d8239d 100644 --- a/documentation/sphinx/source/special-keys.rst +++ b/documentation/sphinx/source/special-keys.rst @@ -205,7 +205,7 @@ that process, and wait for necessary data to be moved away. #. ``\xff\xff/management/failed_locality/`` Read/write. 
Indicates that the cluster should consider matching processes as permanently failed. This allows the cluster to avoid maintaining extra state and doing extra work in the hope that these processes come back. See :ref:`removing machines from a cluster ` for documentation for the corresponding fdbcli command. #. ``\xff\xff/management/options/excluded_locality/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/excluded_locality/``. Setting this key only has an effect in the current transaction and is not persisted on commit. #. ``\xff\xff/management/options/failed_locality/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/failed_locality/``. Setting this key only has an effect in the current transaction and is not persisted on commit. -#. ``\xff\xff/management/tenant_map/`` Read/write. Setting a key in this range to any value will result in a tenant being created with name ``. Clearing a key in this range will delete the tenant with name ``. Reading all or a portion of this range will return the list of tenants currently present in the cluster, excluding any changes in this transaction. Values read in this range will be JSON objects containing the metadata for the associated tenants. Note: the tenants key-space does not support range clears. +#. ``\xff\xff/management/tenant_map/`` Read/write. Setting a key in this range to any value will result in a tenant being created with name ````. Clearing a key in this range will delete the tenant with name ````. Reading all or a portion of this range will return the list of tenants currently present in the cluster, excluding any changes in this transaction. Values read in this range will be JSON objects containing the metadata for the associated tenants. Note: the tenants key-space does not support range clears. An exclusion is syntactically either an ip address (e.g. ``127.0.0.1``), or an ip address and port (e.g. 
``127.0.0.1:4500``) or any locality (e.g ``locality_dcid:primary-satellite`` or From 502209229c5460fabd2dc402a34bc122c93f5678 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Mon, 7 Mar 2022 12:02:05 -0800 Subject: [PATCH 091/138] Update generated.go --- bindings/go/src/fdb/generated.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/bindings/go/src/fdb/generated.go b/bindings/go/src/fdb/generated.go index 8be49f5c79..b636484408 100644 --- a/bindings/go/src/fdb/generated.go +++ b/bindings/go/src/fdb/generated.go @@ -448,16 +448,21 @@ func (o TransactionOptions) SetInitializeNewDatabase() error { return o.setOpt(300, nil) } -// Allows this transaction to read and modify system keys (those that start with the byte 0xFF) +// Allows this transaction to read and modify system keys (those that start with the byte 0xFF). Implies raw_access. func (o TransactionOptions) SetAccessSystemKeys() error { return o.setOpt(301, nil) } -// Allows this transaction to read system keys (those that start with the byte 0xFF) +// Allows this transaction to read system keys (those that start with the byte 0xFF). Implies raw_access. func (o TransactionOptions) SetReadSystemKeys() error { return o.setOpt(302, nil) } +// Allows this transaction to access the raw key-space when tenant mode is on. +func (o TransactionOptions) SetRawAccess() error { + return o.setOpt(303, nil) +} + // Not yet implemented. func (o TransactionOptions) SetDebugRetryLogging(param string) error { return o.setOpt(401, []byte(param)) From 06f088e0887fdd21cfbd62e4bdb6a8dcfb7e2ee8 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 8 Mar 2022 15:41:22 -0800 Subject: [PATCH 092/138] Pass along versions with GetKeyServerLocationsRequests, if we have them. 
--- fdbclient/NativeAPI.actor.cpp | 178 +++++++++++++++++++++++----------- 1 file changed, 122 insertions(+), 56 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 78933e0c31..13fba89f01 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -2668,7 +2668,8 @@ ACTOR Future getKeyLocation_internal(Database cx, SpanID spanID, Optional debugID, UseProvisionalProxies useProvisionalProxies, - Reverse isBackward) { + Reverse isBackward, + Version version) { state Span span("NAPI:getKeyLocation"_loc, spanID); if (isBackward) { @@ -2694,6 +2695,7 @@ ACTOR Future getKeyLocation_internal(Database cx, Optional(), 100, isBackward, + version, key.arena()), TaskPriority::DefaultPromiseEndpoint))) { ++cx->transactionKeyServerLocationRequestsCompleted; @@ -2760,11 +2762,12 @@ Future getKeyLocation(Database const& cx, SpanID spanID, Optional debugID, UseProvisionalProxies useProvisionalProxies, - Reverse isBackward = Reverse::False) { + Reverse isBackward, + Version version) { // we first check whether this range is cached Optional locationInfo = cx->getCachedLocation(tenant, key, isBackward); if (!locationInfo.present()) { - return getKeyLocation_internal(cx, tenant, key, spanID, debugID, useProvisionalProxies, isBackward); + return getKeyLocation_internal(cx, tenant, key, spanID, debugID, useProvisionalProxies, isBackward, version); } bool onlyEndpointFailedAndNeedRefresh = false; @@ -2778,7 +2781,7 @@ Future getKeyLocation(Database const& cx, cx->invalidateCache(locationInfo.get().tenantEntry.prefix, key); // Refresh the cache with a new getKeyLocations made to proxies. 
- return getKeyLocation_internal(cx, tenant, key, spanID, debugID, useProvisionalProxies, isBackward); + return getKeyLocation_internal(cx, tenant, key, spanID, debugID, useProvisionalProxies, isBackward, version); } return locationInfo.get(); @@ -2788,8 +2791,9 @@ template Future getKeyLocation(Reference trState, Key const& key, F StorageServerInterface::*member, - Reverse isBackward = Reverse::False, - UseTenant useTenant = UseTenant::True) { + Reverse isBackward, + UseTenant useTenant, + Version version) { auto f = getKeyLocation(trState->cx, useTenant ? trState->tenant : Optional(), key, @@ -2797,7 +2801,8 @@ Future getKeyLocation(Reference trState, trState->spanID, trState->debugID, trState->useProvisionalProxies, - isBackward); + isBackward, + version); if (trState->tenant.present() && useTenant) { return map(f, [trState](const KeyRangeLocationInfo& locationInfo) { @@ -2817,7 +2822,8 @@ ACTOR Future> getKeyRangeLocations_internal( Reverse reverse, SpanID spanID, Optional debugID, - UseProvisionalProxies useProvisionalProxies) { + UseProvisionalProxies useProvisionalProxies, + Version version) { state Span span("NAPI:getKeyRangeLocations"_loc, spanID); if (debugID.present()) g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getKeyLocations.Before"); @@ -2836,6 +2842,7 @@ ACTOR Future> getKeyRangeLocations_internal( keys.end, limit, reverse, + version, keys.arena()), TaskPriority::DefaultPromiseEndpoint))) { ++cx->transactionKeyServerLocationRequestsCompleted; @@ -2888,13 +2895,15 @@ Future> getKeyRangeLocations(Database const& c F StorageServerInterface::*member, SpanID const& spanID, Optional const& debugID, - UseProvisionalProxies useProvisionalProxies) { + UseProvisionalProxies useProvisionalProxies, + Version version) { ASSERT(!keys.empty()); std::vector locations; if (!cx->getCachedLocations(tenant, keys, locations, limit, reverse)) { - return getKeyRangeLocations_internal(cx, tenant, keys, limit, reverse, spanID, debugID, 
useProvisionalProxies); + return getKeyRangeLocations_internal( + cx, tenant, keys, limit, reverse, spanID, debugID, useProvisionalProxies, version); } bool foundFailed = false; @@ -2914,7 +2923,8 @@ Future> getKeyRangeLocations(Database const& c if (foundFailed) { // Refresh the cache with a new getKeyRangeLocations made to proxies. - return getKeyRangeLocations_internal(cx, tenant, keys, limit, reverse, spanID, debugID, useProvisionalProxies); + return getKeyRangeLocations_internal( + cx, tenant, keys, limit, reverse, spanID, debugID, useProvisionalProxies, version); } return locations; @@ -2926,7 +2936,8 @@ Future> getKeyRangeLocations(Referencecx, useTenant ? trState->tenant : Optional(), keys, @@ -2935,7 +2946,8 @@ Future> getKeyRangeLocations(ReferencespanID, trState->debugID, - trState->useProvisionalProxies); + trState->useProvisionalProxies, + version); if (trState->tenant.present() && useTenant) { return map(f, [trState](const std::vector& locationInfo) { @@ -2948,9 +2960,12 @@ Future> getKeyRangeLocations(Reference warmRange_impl(Reference trState, KeyRange keys) { +ACTOR Future warmRange_impl(Reference trState, KeyRange keys, Future fVersion) { state int totalRanges = 0; state int totalRequests = 0; + + state Version version = wait(fVersion); + loop { std::vector locations = wait(getKeyRangeLocations_internal(trState->cx, @@ -2960,7 +2975,8 @@ ACTOR Future warmRange_impl(Reference trState, KeyRange Reverse::False, trState->spanID, trState->debugID, - trState->useProvisionalProxies)); + trState->useProvisionalProxies, + version)); totalRanges += CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT; totalRequests++; if (locations.size() == 0 || totalRanges >= trState->cx->locationCacheSize || @@ -3047,7 +3063,7 @@ TenantInfo TransactionState::getTenantInfo() const { } Future Transaction::warmRange(KeyRange keys) { - return warmRange_impl(trState, keys); + return warmRange_impl(trState, keys, getReadVersion()); } ACTOR Future> getValue(Reference trState, @@ -3066,7 
+3082,7 @@ ACTOR Future> getValue(Reference trState, loop { state KeyRangeLocationInfo locationInfo = - wait(getKeyLocation(trState, key, &StorageServerInterface::getValue, Reverse::False, useTenant)); + wait(getKeyLocation(trState, key, &StorageServerInterface::getValue, Reverse::False, useTenant, ver)); state Optional getValueID = Optional(); state uint64_t startTime; @@ -3207,8 +3223,12 @@ ACTOR Future getKey(Reference trState, } Key locationKey(k.getKey(), k.arena()); - state KeyRangeLocationInfo locationInfo = wait(getKeyLocation( - trState, locationKey, &StorageServerInterface::getKey, Reverse{ k.isBackward() }, useTenant)); + state KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, + locationKey, + &StorageServerInterface::getKey, + Reverse{ k.isBackward() }, + useTenant, + version.get())); try { if (getKeyID.present()) @@ -3337,7 +3357,9 @@ ACTOR Future watchValue(Database cx, Reference p &StorageServerInterface::watchValue, parameters->spanID, parameters->debugID, - parameters->useProvisionalProxies)); + parameters->useProvisionalProxies, + Reverse::False, + parameters->version)); try { state Optional watchValueID = Optional(); @@ -3620,7 +3642,8 @@ Future getExactRange(Reference trState, CLIENT_KNOBS->GET_RANGE_SHARD_LIMIT, reverse, getRangeRequestStream(), - useTenant)); + useTenant, + version)); ASSERT(locations.size()); state int shard = 0; loop { @@ -3989,8 +4012,13 @@ Future getRange(Reference trState, Key locationKey = reverse ? Key(end.getKey(), end.arena()) : Key(begin.getKey(), begin.arena()); Reverse locationBackward{ reverse ? 
(end - 1).isBackward() : begin.isBackward() }; - state KeyRangeLocationInfo beginServer = wait(getKeyLocation( - trState, locationKey, getRangeRequestStream(), locationBackward, useTenant)); + state KeyRangeLocationInfo beginServer = + wait(getKeyLocation(trState, + locationKey, + getRangeRequestStream(), + locationBackward, + useTenant, + version)); state KeyRange shard = beginServer.range; state bool modifiedSelectors = false; state GetKeyValuesFamilyRequest req; @@ -4424,8 +4452,14 @@ ACTOR Future getRangeStreamFragment(Reference trState, Reverse reverse, SpanID spanContext) { loop { - state std::vector locations = wait(getKeyRangeLocations( - trState, keys, CLIENT_KNOBS->GET_RANGE_SHARD_LIMIT, reverse, &StorageServerInterface::getKeyValuesStream)); + state std::vector locations = + wait(getKeyRangeLocations(trState, + keys, + CLIENT_KNOBS->GET_RANGE_SHARD_LIMIT, + reverse, + &StorageServerInterface::getKeyValuesStream, + UseTenant::True, + version)); ASSERT(locations.size()); state int shard = 0; loop { @@ -4685,7 +4719,8 @@ ACTOR Future getRangeStreamFragment(Reference trState, ACTOR Future>> getRangeSplitPoints(Reference trState, KeyRange keys, - int64_t chunkSize); + int64_t chunkSize, + Version version); static KeyRange intersect(KeyRangeRef lhs, KeyRangeRef rhs) { return KeyRange(KeyRangeRef(std::max(lhs.begin, rhs.begin), std::min(lhs.end, rhs.end))); @@ -4734,11 +4769,11 @@ ACTOR Future getRangeStream(Reference trState, state std::vector> outstandingRequests; while (b < e) { - state KeyRangeLocationInfo locationInfo = - wait(getKeyLocation(trState, reverse ? e : b, &StorageServerInterface::getKeyValuesStream, reverse)); + state KeyRangeLocationInfo locationInfo = wait(getKeyLocation( + trState, reverse ? 
e : b, &StorageServerInterface::getKeyValuesStream, reverse, UseTenant::True, version)); state KeyRange shardIntersection = intersect(locationInfo.range, KeyRangeRef(b, e)); state Standalone> splitPoints = - wait(getRangeSplitPoints(trState, shardIntersection, CLIENT_KNOBS->RANGESTREAM_FRAGMENT_SIZE)); + wait(getRangeSplitPoints(trState, shardIntersection, CLIENT_KNOBS->RANGESTREAM_FRAGMENT_SIZE, version)); state std::vector toSend; // state std::vector::iterator>> outstandingRequests; @@ -4933,18 +4968,20 @@ void Watch::setWatch(Future watchFuture) { onSetWatchTrigger.send(Void()); } -ACTOR Future getTenantMetadata(Reference trState, Key key) { - KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, key, &StorageServerInterface::getValue)); +ACTOR Future getTenantMetadata(Reference trState, Key key, Future fVersion) { + state Version version = wait(fVersion); + KeyRangeLocationInfo locationInfo = + wait(getKeyLocation(trState, key, &StorageServerInterface::getValue, Reverse::False, UseTenant::True, version)); return trState->getTenantInfo(); } -Future populateAndGetTenant(Reference trState, Key const& key) { +Future populateAndGetTenant(Reference trState, Key const& key, Future version) { if (!trState->tenant.present()) { return TenantInfo(); } else if (trState->tenantId != TenantInfo::INVALID_TENANT) { return trState->getTenantInfo(); } else { - return getTenantMetadata(trState, key); + return getTenantMetadata(trState, key, version); } } @@ -5008,14 +5045,15 @@ Future Transaction::watch(Reference watch) { trState->cx->addWatch(); watches.push_back(watch); - return ::watch(watch, - trState->cx, - populateAndGetTenant(trState, watch->key), - trState->options.readTags, - trState->spanID, - trState->taskID, - trState->debugID, - trState->useProvisionalProxies); + return ::watch( + watch, + trState->cx, + populateAndGetTenant(trState, watch->key, readVersion.isValid() ? 
readVersion : Future(latestVersion)), + trState->options.readTags, + trState->spanID, + trState->taskID, + trState->debugID, + trState->useProvisionalProxies); } ACTOR Future>> getAddressesForKeyActor(Reference trState, @@ -5025,7 +5063,9 @@ ACTOR Future>> getAddressesForKeyActor(Referen state Key resolvedKey = key; if (trState->tenant.present()) { - KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, ""_sr, &StorageServerInterface::getValue)); + state Version version = wait(ver); + KeyRangeLocationInfo locationInfo = wait(getKeyLocation( + trState, ""_sr, &StorageServerInterface::getValue, Reverse::False, UseTenant::True, version)); resolvedKey = key.withPrefix(locationInfo.tenantEntry.prefix); } @@ -5712,8 +5752,14 @@ ACTOR Future> estimateCommitCosts(Referen ++trCommitCosts.expensiveCostEstCount; ++trState->cx->transactionsExpensiveClearCostEstCount; } else { - std::vector locations = wait(getKeyRangeLocations( - trState, keyRange, CLIENT_KNOBS->TOO_MANY, Reverse::False, &StorageServerInterface::getShardState)); + std::vector locations = + wait(getKeyRangeLocations(trState, + keyRange, + CLIENT_KNOBS->TOO_MANY, + Reverse::False, + &StorageServerInterface::getShardState, + UseTenant::True, + latestVersion)); if (locations.empty()) { continue; } @@ -5809,7 +5855,12 @@ ACTOR static Future tryCommit(Reference trState, state Key tenantPrefix; if (trState->tenant.present()) { - KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, ""_sr, &StorageServerInterface::getValue)); + KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, + ""_sr, + &StorageServerInterface::getValue, + Reverse::False, + UseTenant::True, + req.transaction.read_snapshot)); applyTenantPrefix(req, locationInfo.tenantEntry.prefix); tenantPrefix = locationInfo.tenantEntry.prefix; } @@ -6848,7 +6899,8 @@ ACTOR Future getStorageMetricsLargeKeyRange(Database cx, KeyRang &StorageServerInterface::waitMetrics, span.context, Optional(), - UseProvisionalProxies::False)); + 
UseProvisionalProxies::False, + latestVersion)); state int nLocs = locations.size(); state std::vector> fx(nLocs); state StorageMetrics total; @@ -6949,7 +7001,8 @@ ACTOR Future>> getReadHotRanges(Da &StorageServerInterface::getReadHotRanges, span.context, Optional(), - UseProvisionalProxies::False)); + UseProvisionalProxies::False, + latestVersion)); try { // TODO: how to handle this? // This function is called whenever a shard becomes read-hot. But somehow the shard was splitted across more @@ -7019,7 +7072,8 @@ ACTOR Future, int>> waitStorageMetrics(Databa &StorageServerInterface::waitMetrics, span.context, Optional(), - UseProvisionalProxies::False)); + UseProvisionalProxies::False, + latestVersion)); if (expectedShardCount >= 0 && locations.size() != expectedShardCount) { return std::make_pair(Optional(), locations.size()); } @@ -7117,12 +7171,19 @@ Future>> DatabaseContext::getReadH ACTOR Future>> getRangeSplitPoints(Reference trState, KeyRange keys, - int64_t chunkSize) { + int64_t chunkSize, + Version version) { state Span span("NAPI:GetRangeSplitPoints"_loc, trState->spanID); loop { - state std::vector locations = wait(getKeyRangeLocations( - trState, keys, CLIENT_KNOBS->TOO_MANY, Reverse::False, &StorageServerInterface::getRangeSplitPoints)); + state std::vector locations = + wait(getKeyRangeLocations(trState, + keys, + CLIENT_KNOBS->TOO_MANY, + Reverse::False, + &StorageServerInterface::getRangeSplitPoints, + UseTenant::True, + version)); try { state int nLocs = locations.size(); state std::vector> fReplies(nLocs); @@ -7174,7 +7235,8 @@ ACTOR Future>> getRangeSplitPoints(Reference>> Transaction::getRangeSplitPoints(KeyRange const& keys, int64_t chunkSize) { - return ::getRangeSplitPoints(trState, keys, chunkSize); + return ::getRangeSplitPoints( + trState, keys, chunkSize, readVersion.isValid() && readVersion.isReady() ? 
readVersion.get() : latestVersion); } #define BG_REQUEST_DEBUG false @@ -7441,7 +7503,8 @@ ACTOR Future>> splitStorageMetrics(Database cx, &StorageServerInterface::splitMetrics, span.context, Optional(), - UseProvisionalProxies::False)); + UseProvisionalProxies::False, + latestVersion)); state StorageMetrics used; state Standalone> results; @@ -8026,7 +8089,8 @@ ACTOR Future getChangeFeedStreamActor(Reference db, &StorageServerInterface::changeFeedStream, span.context, Optional(), - UseProvisionalProxies::False)); + UseProvisionalProxies::False, + latestVersion)); if (locations.size() >= CLIENT_KNOBS->CHANGE_FEED_LOCATION_LIMIT) { ASSERT_WE_THINK(false); @@ -8210,7 +8274,8 @@ ACTOR Future> getOverlappingChangeFeedsA &StorageServerInterface::overlappingChangeFeeds, span.context, Optional(), - UseProvisionalProxies::False)); + UseProvisionalProxies::False, + latestVersion)); if (locations.size() >= CLIENT_KNOBS->CHANGE_FEED_LOCATION_LIMIT) { TraceEvent(SevError, "OverlappingRangeTooLarge") @@ -8292,7 +8357,8 @@ ACTOR Future popChangeFeedMutationsActor(Reference db, Ke &StorageServerInterface::changeFeedPop, span.context, Optional(), - UseProvisionalProxies::False)); + UseProvisionalProxies::False, + latestVersion)); if (locations.size() > 2) { wait(popChangeFeedBackup(cx, rangeID, version)); From 0536e56725649f39c830f81a0284402c5a486890 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 10 Mar 2022 13:14:31 -0800 Subject: [PATCH 093/138] The tenant cache uses a simpler eviction policy. Some various improvement around the use of futures. Add documentation for C API and fix inaccurate statement in special keys documentation. 
--- documentation/sphinx/source/api-c.rst | 34 ++++++++++++++++++- .../sphinx/source/api-common.rst.inc | 5 ++- documentation/sphinx/source/special-keys.rst | 2 +- fdbclient/DatabaseContext.h | 1 - fdbclient/GenericManagementAPI.actor.h | 16 +++++---- fdbclient/NativeAPI.actor.cpp | 25 +++++++------- 6 files changed, 60 insertions(+), 23 deletions(-) diff --git a/documentation/sphinx/source/api-c.rst b/documentation/sphinx/source/api-c.rst index 5d746eff7a..8e121fb3e1 100644 --- a/documentation/sphinx/source/api-c.rst +++ b/documentation/sphinx/source/api-c.rst @@ -6,6 +6,7 @@ .. |database-type| replace:: ``FDBDatabase`` .. |database-class| replace:: :type:`FDBDatabase` .. |database-auto| replace:: FIXME +.. |tenant-type| replace:: `FDBTenant`` .. |transaction-class| replace:: FIXME .. |get-key-func| replace:: :func:`fdb_transaction_get_key()` .. |get-range-func| replace:: :func:`fdb_transaction_get_range()` @@ -419,9 +420,20 @@ An |database-blurb1| Modifications to a database are performed via transactions. |option-doc| +.. function:: fdb_error_t fdb_database_open_tenant(FDBDatabase* database, uint8_t const* tenant_name, int tenant_name_length, FDBTenant** out_tenant) + + Opens a tenant on the given database. All transactions created by this tenant will operate on the tenant's key-space. The caller assumes ownership of the :type:`FDBTenant` object and must destroy it with :func:`fdb_tenant_destroy()`. + + ``tenant_name`` + The name of the tenant being accessed, as a byte string. + ``tenant_name_length`` + The length of the tenant name byte string. + ``*out_tenant`` + Set to point to the newly created :type:`FDBTenant`. + .. function:: fdb_error_t fdb_database_create_transaction(FDBDatabase* database, FDBTransaction** out_transaction) - Creates a new transaction on the given database. The caller assumes ownership of the :type:`FDBTransaction` object and must destroy it with :func:`fdb_transaction_destroy()`. 
+ Creates a new transaction on the given database without using a tenant, meaning that it will operate on the entire database key-space. The caller assumes ownership of the :type:`FDBTransaction` object and must destroy it with :func:`fdb_transaction_destroy()`. ``*out_transaction`` Set to point to the newly created :type:`FDBTransaction`. @@ -486,6 +498,26 @@ An |database-blurb1| Modifications to a database are performed via transactions. Returns a value where 0 indicates that the client is idle and 1 (or larger) indicates that the client is saturated. By default, this value is updated every second. +Tenant +====== + +|tenant-blurb1| + +.. type:: FDBTenant + + An opaque type that represents a tenant in the FoundationDB C API. + +.. function:: void fdb_tenant_destroy(FDBTenant* tenant) + + Destroys an :type:`FDBTenant` object. It must be called exactly once for each successful call to :func:`fdb_database_create_tenant()`. This function only destroys a handle to the tenant -- the tenant and its data will be fine! + +.. function:: fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant, FDBTronsaction **out_transaction) + + Creates a new transaction on the given tenant. This transaction will operate within the tenant's key-space and cannot access data outside the tenant. The caller assumes ownership of the :type:`FDBTransaction` object and must destroy it with :func:`fdb_transaction_destroy()`. + + ``*out_transaction`` + Set to point to the newly created :type:`FDBTransaction`. + Transaction =========== diff --git a/documentation/sphinx/source/api-common.rst.inc b/documentation/sphinx/source/api-common.rst.inc index 704e224496..93878983eb 100644 --- a/documentation/sphinx/source/api-common.rst.inc +++ b/documentation/sphinx/source/api-common.rst.inc @@ -74,6 +74,9 @@ .. |database-sync| replace:: The convenience methods provided by |database-type| have the same signature as the corresponding methods of ``Transaction``. 
However, most of the |database-type| methods are fully synchronous. (An exception is the methods for watches.) As a result, the |database-type| methods do not support the use of :ref:`implicit parallelism with futures `. +.. |tenant-blurb1| replace:: + |tenant-type| represents a FoundationDB tenant. Tenants are optional named transaction domains that can be used to provide multiple disjoint key-spaces to client applications. A transaction created in a tenant will be limited to the keys contained within that tenant, and transactions operating on different tenants can use the same key names without interfering with each other. + .. |keysel-blurb1| replace:: FoundationDB's lexicographically ordered data model permits finding keys based on their order (for example, finding the first key in the database greater than a given key). Key selectors represent a description of a key in the database that could be resolved to an actual key by |get-key-func| or used directly as the beginning or end of a range in |get-range-func|. @@ -627,4 +630,4 @@ .. |option-set-distributed-client-tracer| replace:: - Sets a tracer to run on the client. Should be set to the same value as the tracer set on the server. \ No newline at end of file + Sets a tracer to run on the client. Should be set to the same value as the tracer set on the server. diff --git a/documentation/sphinx/source/special-keys.rst b/documentation/sphinx/source/special-keys.rst index 7726d8239d..2e1b161692 100644 --- a/documentation/sphinx/source/special-keys.rst +++ b/documentation/sphinx/source/special-keys.rst @@ -205,7 +205,7 @@ that process, and wait for necessary data to be moved away. #. ``\xff\xff/management/failed_locality/`` Read/write. Indicates that the cluster should consider matching processes as permanently failed. This allows the cluster to avoid maintaining extra state and doing extra work in the hope that these processes come back. 
See :ref:`removing machines from a cluster ` for documentation for the corresponding fdbcli command. #. ``\xff\xff/management/options/excluded_locality/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/excluded_locality/``. Setting this key only has an effect in the current transaction and is not persisted on commit. #. ``\xff\xff/management/options/failed_locality/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/failed_locality/``. Setting this key only has an effect in the current transaction and is not persisted on commit. -#. ``\xff\xff/management/tenant_map/`` Read/write. Setting a key in this range to any value will result in a tenant being created with name ````. Clearing a key in this range will delete the tenant with name ````. Reading all or a portion of this range will return the list of tenants currently present in the cluster, excluding any changes in this transaction. Values read in this range will be JSON objects containing the metadata for the associated tenants. Note: the tenants key-space does not support range clears. +#. ``\xff\xff/management/tenant_map/`` Read/write. Setting a key in this range to any value will result in a tenant being created with name ````. Clearing a key in this range will delete the tenant with name ````. Reading all or a portion of this range will return the list of tenants currently present in the cluster, excluding any changes in this transaction. Values read in this range will be JSON objects containing the metadata for the associated tenants. An exclusion is syntactically either an ip address (e.g. ``127.0.0.1``), or an ip address and port (e.g. 
``127.0.0.1:4500``) or any locality (e.g ``locality_dcid:primary-satellite`` or diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index 40d9a94f42..71d0c2ffc1 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -431,7 +431,6 @@ public: CoalescedKeyRangeMap> locationCache; std::unordered_map failedEndpointsOnHealthyServersInfo; std::unordered_map tenantCache; - std::vector tenantCacheList; std::map server_interf; std::map blobWorker_interf; // blob workers don't change endpoints for the same ID diff --git a/fdbclient/GenericManagementAPI.actor.h b/fdbclient/GenericManagementAPI.actor.h index ebaa924cd9..4eb04a4466 100644 --- a/fdbclient/GenericManagementAPI.actor.h +++ b/fdbclient/GenericManagementAPI.actor.h @@ -643,7 +643,6 @@ Future> tryGetTenantTransaction(Transaction tr, TenantN ACTOR template Future> tryGetTenant(Reference db, TenantName name) { state Reference tr = db->createTransaction(); - state Key tenantMapKey = name.withPrefix(tenantMapPrefix); loop { try { @@ -675,6 +674,7 @@ Future getTenant(Reference db, TenantName name) { return entry.get(); } +// Creates a tenant with the given name. If the tenant already exists, an empty optional will be returned. 
ACTOR template Future> createTenantTransaction(Transaction tr, TenantNameRef name) { state Key tenantMapKey = name.withPrefix(tenantMapPrefix); @@ -686,10 +686,9 @@ Future> createTenantTransaction(Transaction tr, TenantN tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); - state Optional tenantEntry = wait(tryGetTenantTransaction(tr, name)); - if (tenantEntry.present()) { - return Optional(); - } + state Future> tenantEntryFuture = tryGetTenantTransaction(tr, name); + state Future> tenantDataPrefixFuture = safeThreadFutureToFuture(tr->get(tenantDataPrefixKey)); + state Future> lastIdFuture = safeThreadFutureToFuture(tr->get(tenantLastIdKey)); Optional tenantMode = wait(safeThreadFutureToFuture(tr->get(configKeysPrefix.withSuffix("tenant_mode"_sr)))); @@ -697,9 +696,12 @@ Future> createTenantTransaction(Transaction tr, TenantN throw tenants_disabled(); } - state Future> tenantDataPrefixFuture = safeThreadFutureToFuture(tr->get(tenantDataPrefixKey)); + Optional tenantEntry = wait(tenantEntryFuture); + if (tenantEntry.present()) { + return Optional(); + } - state Optional lastIdVal = wait(safeThreadFutureToFuture(tr->get(tenantLastIdKey))); + state Optional lastIdVal = wait(lastIdFuture); Optional tenantDataPrefix = wait(tenantDataPrefixFuture); state TenantMapEntry newTenant(lastIdVal.present() ? 
TenantMapEntry::prefixToId(lastIdVal.get()) + 1 : 0, diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 13fba89f01..550dbbfb18 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1714,16 +1714,13 @@ bool DatabaseContext::getCachedLocations(const Optional& tenantName, void DatabaseContext::cacheTenant(const TenantName& tenant, const TenantMapEntry& tenantEntry) { if (tenantCacheSize > 0) { - int attempts = 0; - while (tenantCache.size() > tenantCacheSize && attempts++ < 100) { - int randomEntry = deterministicRandom()->randomInt(0, tenantCacheList.size()); - tenantCache.erase(tenantCacheList[randomEntry]); - tenantCacheList[randomEntry] = tenantCacheList.back(); - tenantCacheList.pop_back(); + // Naive cache eviction just erases the entire cache when it gets full. + // We don't expect a single client to fill the tenant cache typically, so this should work reasonably well. + if (tenantCache.size() > tenantCacheSize) { + tenantCache.clear(); } tenantCache[tenant] = tenantEntry; - tenantCacheList.push_back(tenant); } } @@ -4924,7 +4921,11 @@ Future> Transaction::get(const Key& key, Snapshot snapshot) { if (!snapshot) tr.transaction.read_conflict_ranges.push_back(tr.arena, singleKeyRange(key, tr.arena)); + UseTenant useTenant = UseTenant::True; if (key == metadataVersionKey) { + // It is legal to read the metadata version key inside of a tenant. + // This will return the global metadata version key. 
+ useTenant = UseTenant::False; ++trState->cx->transactionMetadataVersionReads; if (!ver.isReady() || metadataVersion.isSet()) { return metadataVersion.getFuture(); @@ -4958,7 +4959,7 @@ Future> Transaction::get(const Key& key, Snapshot snapshot) { } } - return getValue(trState, key, ver); + return getValue(trState, key, ver, useTenant); } void Watch::setWatch(Future watchFuture) { @@ -4968,14 +4969,13 @@ void Watch::setWatch(Future watchFuture) { onSetWatchTrigger.send(Void()); } -ACTOR Future getTenantMetadata(Reference trState, Key key, Future fVersion) { - state Version version = wait(fVersion); +ACTOR Future getTenantMetadata(Reference trState, Key key, Version version) { KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, key, &StorageServerInterface::getValue, Reverse::False, UseTenant::True, version)); return trState->getTenantInfo(); } -Future populateAndGetTenant(Reference trState, Key const& key, Future version) { +Future populateAndGetTenant(Reference trState, Key const& key, Version version) { if (!trState->tenant.present()) { return TenantInfo(); } else if (trState->tenantId != TenantInfo::INVALID_TENANT) { @@ -5048,7 +5048,8 @@ Future Transaction::watch(Reference watch) { return ::watch( watch, trState->cx, - populateAndGetTenant(trState, watch->key, readVersion.isValid() ? readVersion : Future(latestVersion)), + populateAndGetTenant( + trState, watch->key, readVersion.isValid() && readVersion.isReady() ? readVersion.get() : latestVersion), trState->options.readTags, trState->spanID, trState->taskID, From 445754d7cbccd468d2d8299e56e9f31af4a7ee73 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Fri, 11 Mar 2022 12:09:52 -0800 Subject: [PATCH 094/138] Fix documentation substitutions --- documentation/sphinx/source/api-c.rst | 2 +- documentation/sphinx/source/api-python.rst | 1 + documentation/sphinx/source/api-ruby.rst | 1 + documentation/sphinx/source/data-modeling.rst | 1 + documentation/sphinx/source/developer-guide.rst | 1 + 5 files changed, 5 insertions(+), 1 deletion(-) diff --git a/documentation/sphinx/source/api-c.rst b/documentation/sphinx/source/api-c.rst index 8e121fb3e1..730cf7c4c5 100644 --- a/documentation/sphinx/source/api-c.rst +++ b/documentation/sphinx/source/api-c.rst @@ -6,7 +6,7 @@ .. |database-type| replace:: ``FDBDatabase`` .. |database-class| replace:: :type:`FDBDatabase` .. |database-auto| replace:: FIXME -.. |tenant-type| replace:: `FDBTenant`` +.. |tenant-type| replace:: ``FDBTenant`` .. |transaction-class| replace:: FIXME .. |get-key-func| replace:: :func:`fdb_transaction_get_key()` .. |get-range-func| replace:: :func:`fdb_transaction_get_range()` diff --git a/documentation/sphinx/source/api-python.rst b/documentation/sphinx/source/api-python.rst index 18fbd01adb..f3af667e0c 100644 --- a/documentation/sphinx/source/api-python.rst +++ b/documentation/sphinx/source/api-python.rst @@ -7,6 +7,7 @@ .. |database-type| replace:: ``Database`` .. |database-class| replace:: :class:`Database` .. |database-auto| replace:: the :func:`@fdb.transactional ` decorator +.. |tenant-type| replace:: FIXME .. |transaction-class| replace:: :class:`Transaction` .. |get-key-func| replace:: :func:`Transaction.get_key` .. |get-range-func| replace:: :func:`Transaction.get_range` diff --git a/documentation/sphinx/source/api-ruby.rst b/documentation/sphinx/source/api-ruby.rst index 4d92b5ea0e..14a26d6d1c 100644 --- a/documentation/sphinx/source/api-ruby.rst +++ b/documentation/sphinx/source/api-ruby.rst @@ -5,6 +5,7 @@ .. |database-type| replace:: ``Database`` .. |database-class| replace:: :class:`Database` .. 
|database-auto| replace:: :meth:`Database.transact` +.. |tenant-type| replace:: FIXME .. |transaction-class| replace:: :class:`Transaction` .. |get-key-func| replace:: :meth:`Transaction.get_key` .. |get-range-func| replace:: :meth:`Transaction.get_range` diff --git a/documentation/sphinx/source/data-modeling.rst b/documentation/sphinx/source/data-modeling.rst index e039250f68..146b006809 100644 --- a/documentation/sphinx/source/data-modeling.rst +++ b/documentation/sphinx/source/data-modeling.rst @@ -8,6 +8,7 @@ .. |database-type| replace:: ``Database`` .. |database-class| replace:: ``Database`` .. |database-auto| replace:: FIXME +.. |tenant-type| replace:: FIXME .. |transaction-class| replace:: ``Transaction`` .. |get-key-func| replace:: get_key() .. |get-range-func| replace:: get_range() diff --git a/documentation/sphinx/source/developer-guide.rst b/documentation/sphinx/source/developer-guide.rst index af2ff736f8..ecd466834e 100644 --- a/documentation/sphinx/source/developer-guide.rst +++ b/documentation/sphinx/source/developer-guide.rst @@ -8,6 +8,7 @@ .. |database-type| replace:: ``Database`` .. |database-class| replace:: ``Database`` .. |database-auto| replace:: FIXME +.. |tenant-type| replace:: FIXME .. |transaction-class| replace:: ``Transaction`` .. |get-key-func| replace:: get_key() .. |get-range-func| replace:: get_range() From 149ca44b9b06ba10a3d2119834964054d5fdbbb9 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Fri, 11 Mar 2022 12:10:23 -0800 Subject: [PATCH 095/138] Disallow setting raw access on a transaction that has specified a tenant --- fdbclient/NativeAPI.actor.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 550dbbfb18..945a62fa95 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -6359,10 +6359,12 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optionaltenant.present()) { + Error e = invalid_option(); + TraceEvent(SevWarn, "TenantTransactionRawAccess").error(e).detail("Tenant", trState->tenant); + throw e; + } trState->options.rawAccess = true; - trState->tenant = Optional(); - trState->tenantId = TenantInfo::INVALID_TENANT; - tr.tenantInfo = TenantInfo(); break; default: From a70adf9fcd62787cae162e3f441dbd285fd6230e Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Mon, 14 Mar 2022 09:29:57 -0700 Subject: [PATCH 096/138] Convert cached ranges to relative ranges when using tenants. --- fdbclient/NativeAPI.actor.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 945a62fa95..4efd15acab 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1291,6 +1291,16 @@ Future HealthMetricsRangeImpl::getRange(ReadYourWritesTransaction* return healthMetricsGetRangeActor(ryw, kr); } +KeyRangeRef toRelativeRange(KeyRangeRef range, KeyRef prefix) { + if (prefix.empty()) { + return range; + } else { + KeyRef begin = range.begin.startsWith(prefix) ? range.begin.removePrefix(prefix) : allKeys.begin; + KeyRef end = range.end.startsWith(prefix) ? 
range.end.removePrefix(prefix) : allKeys.end; + return KeyRangeRef(begin, end); + } +} + DatabaseContext::DatabaseContext(Reference>> connectionRecord, Reference> clientInfo, Reference> const> coordinator, @@ -1661,7 +1671,7 @@ Optional DatabaseContext::getCachedLocation(const Optional auto range = isBackward ? locationCache.rangeContainingKeyBefore(resolvedKey) : locationCache.rangeContaining(resolvedKey); if (range->value()) { - return KeyRangeLocationInfo(tenantEntry, range->range(), range->value()); + return KeyRangeLocationInfo(tenantEntry, toRelativeRange(range->range(), tenantEntry.prefix), range->value()); } return Optional(); @@ -1698,7 +1708,7 @@ bool DatabaseContext::getCachedLocations(const Optional& tenantName, result.clear(); return false; } - result.emplace_back(tenantEntry, r->range() & resolvedRange, r->value()); + result.emplace_back(tenantEntry, toRelativeRange(r->range() & resolvedRange, tenantEntry.prefix), r->value()); if (result.size() == limit || begin == end) { break; } @@ -2647,16 +2657,6 @@ void updateTssMappings(Database cx, const GetKeyServerLocationsReply& reply) { } } -KeyRangeRef toRelativeRange(KeyRangeRef range, KeyRef prefix) { - if (prefix.empty()) { - return range; - } else { - KeyRef begin = range.begin.startsWith(prefix) ? range.begin.removePrefix(prefix) : allKeys.begin; - KeyRef end = range.end.startsWith(prefix) ? range.end.removePrefix(prefix) : allKeys.end; - return KeyRangeRef(begin, end); - } -} - // If isBackward == true, returns the shard containing the key before 'key' (an infinitely long, inexpressible key). // Otherwise returns the shard containing key ACTOR Future getKeyLocation_internal(Database cx, From 8a5107af0305ec2daebda632ad6304972409de51 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Mon, 14 Mar 2022 16:01:01 -0700 Subject: [PATCH 097/138] Fix a couple variable initialization issues in the tenant MVC implementation. Encode tenant prefixes with unicode escape sequences for the list command. 
--- fdbclient/MultiVersionTransaction.actor.cpp | 4 ++-- fdbclient/SpecialKeySpace.actor.cpp | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index 141dc49d43..8986d0817e 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -765,7 +765,7 @@ void DLApi::addNetworkThreadCompletionHook(void (*hook)(void*), void* hookParame MultiVersionTransaction::MultiVersionTransaction(Reference db, Optional> tenant, UniqueOrderedOptionList defaultOptions) - : db(db), startTime(timer_monotonic()), timeoutTsav(new ThreadSingleAssignmentVar()) { + : db(db), tenant(tenant), startTime(timer_monotonic()), timeoutTsav(new ThreadSingleAssignmentVar()) { setDefaultOptions(defaultOptions); updateTransaction(); } @@ -1213,7 +1213,7 @@ bool MultiVersionTransaction::isValid() { // MultiVersionTenant MultiVersionTenant::MultiVersionTenant(Reference db, StringRef tenantName) - : tenantName(tenantName), db(db) { + : tenantVar(new ThreadSafeAsyncVar>(Reference(nullptr))), tenantName(tenantName), db(db) { updateTenant(); } diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index 382206e48c..c78ad6b998 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -2719,9 +2719,8 @@ ACTOR Future getTenantList(ReadYourWritesTransaction* ryw, KeyRange for (auto tenant : tenants) { json_spirit::mObject tenantEntry; tenantEntry["id"] = tenant.second.id; - tenantEntry["prefix"] = printable(tenant.second.prefix); - std::string tenantEntryString = - json_spirit::write_string(json_spirit::mValue(tenantEntry), json_spirit::Output_options::raw_utf8); + tenantEntry["prefix"] = tenant.second.prefix.toString(); + std::string tenantEntryString = json_spirit::write_string(json_spirit::mValue(tenantEntry)); ValueRef tenantEntryBytes(results.arena(), tenantEntryString); 
results.push_back(results.arena(), KeyValueRef(tenant.first.withPrefix(managementPrefix, results.arena()), tenantEntryBytes)); From 22faf8e5b3c8bb1eb76d91d2958dc1fe7030be83 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 15 Mar 2022 10:46:31 -0700 Subject: [PATCH 098/138] Ignore tenants when setting watches on the metadataVersionKey or sending mutations or conflict ranges that use the metadataVersionKey. --- fdbclient/NativeAPI.actor.cpp | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 4efd15acab..c054cf1d92 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -4976,7 +4976,7 @@ ACTOR Future getTenantMetadata(Reference trState, } Future populateAndGetTenant(Reference trState, Key const& key, Version version) { - if (!trState->tenant.present()) { + if (!trState->tenant.present() || key == metadataVersionKey) { return TenantInfo(); } else if (trState->tenantId != TenantInfo::INVALID_TENANT) { return trState->getTenantInfo(); @@ -5812,22 +5812,28 @@ ACTOR Future> estimateCommitCosts(Referen // through to the storage servers void applyTenantPrefix(CommitTransactionRequest& req, Key tenantPrefix) { for (auto& m : req.transaction.mutations) { - m.param1 = m.param1.withPrefix(tenantPrefix, req.arena); - if (m.type == MutationRef::ClearRange) { - m.param2 = m.param2.withPrefix(tenantPrefix, req.arena); - } else if (m.type == MutationRef::SetVersionstampedKey) { - uint8_t* key = mutateString(m.param1); - int* offset = reinterpret_cast(&key[m.param1.size() - 4]); - *offset += tenantPrefix.size(); + if (m.param1 != metadataVersionKey) { + m.param1 = m.param1.withPrefix(tenantPrefix, req.arena); + if (m.type == MutationRef::ClearRange) { + m.param2 = m.param2.withPrefix(tenantPrefix, req.arena); + } else if (m.type == MutationRef::SetVersionstampedKey) { + uint8_t* key = mutateString(m.param1); + int* offset = 
reinterpret_cast(&key[m.param1.size() - 4]); + *offset += tenantPrefix.size(); + } } } for (auto& rc : req.transaction.read_conflict_ranges) { - rc = rc.withPrefix(tenantPrefix, req.arena); + if (rc.begin != metadataVersionKey) { + rc = rc.withPrefix(tenantPrefix, req.arena); + } } for (auto& wc : req.transaction.write_conflict_ranges) { - wc = wc.withPrefix(tenantPrefix, req.arena); + if (wc.begin != metadataVersionKey) { + wc = wc.withPrefix(tenantPrefix, req.arena); + } } } From 582ba5d5197daecab4bfd6dfe747fc8507b4538d Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Tue, 15 Mar 2022 11:29:43 -0700 Subject: [PATCH 099/138] Fix issue with stuck config nodes In rare circumstances where the cluster controller dies / moves to a new machine, sometimes only a minority of `ConfigNode`s received messages telling them they were registered. When the `ConfigNode`s attempt to register with the new broadcaster (on the new cluster controller), the knob system would get stuck because only a minority would be registered. Part of this change allows registration of unregistered `ConfigNode`s if there is no path to a majority of registered nodes. 
--- fdbserver/ConfigBroadcaster.actor.cpp | 18 ++++- fdbserver/ConfigFollowerInterface.h | 6 +- fdbserver/ConfigNode.actor.cpp | 4 +- fdbserver/PaxosConfigConsumer.actor.cpp | 93 +++++++++++-------------- 4 files changed, 64 insertions(+), 57 deletions(-) diff --git a/fdbserver/ConfigBroadcaster.actor.cpp b/fdbserver/ConfigBroadcaster.actor.cpp index c7b46a1f4d..0ba9e4d6cf 100644 --- a/fdbserver/ConfigBroadcaster.actor.cpp +++ b/fdbserver/ConfigBroadcaster.actor.cpp @@ -94,6 +94,7 @@ class ConfigBroadcasterImpl { int coordinators = 0; std::unordered_set activeConfigNodes; + std::unordered_set registrationResponses; bool disallowUnregistered = false; Promise newConfigNodesAllowed; @@ -217,6 +218,7 @@ class ConfigBroadcasterImpl { self->clients.erase(clientUID); self->clientFailures.erase(clientUID); self->activeConfigNodes.erase(clientAddress); + self->registrationResponses.erase(clientAddress); // See comment where this promise is reset below. if (self->newConfigNodesAllowed.isSet()) { self->newConfigNodesAllowed.reset(); @@ -258,6 +260,7 @@ class ConfigBroadcasterImpl { self->newConfigNodesAllowed.reset(); } } + self->registrationResponses.insert(address); if (registered) { if (!self->disallowUnregistered) { @@ -265,9 +268,18 @@ class ConfigBroadcasterImpl { } self->activeConfigNodes.insert(address); self->disallowUnregistered = true; - } else if (self->activeConfigNodes.size() < self->coordinators / 2 + 1 && !self->disallowUnregistered) { - // Need to allow registration of previously unregistered nodes when - // the cluster first starts up. + } else if ((self->activeConfigNodes.size() < self->coordinators / 2 + 1 && !self->disallowUnregistered) || + self->coordinators - self->registrationResponses.size() <= + self->coordinators / 2 + 1 - self->activeConfigNodes.size()) { + // Received a registration request from an unregistered node. 
There + // are two cases where we want to allow unregistered nodes to + // register: + // * the cluster is just starting and no nodes are registered + // * a minority of nodes are registered and a majority are + // unregistered. This situation should only occur in rare + // circumstances where the cluster controller dies with only a + // minority of config nodes having received a + // ConfigBroadcastReadyRequest self->activeConfigNodes.insert(address); if (self->activeConfigNodes.size() >= self->coordinators / 2 + 1 && self->newConfigNodesAllowed.canBeSet()) { diff --git a/fdbserver/ConfigFollowerInterface.h b/fdbserver/ConfigFollowerInterface.h index da841ecbb3..5b0d264c56 100644 --- a/fdbserver/ConfigFollowerInterface.h +++ b/fdbserver/ConfigFollowerInterface.h @@ -176,14 +176,16 @@ struct ConfigFollowerRollforwardRequest { struct ConfigFollowerGetCommittedVersionReply { static constexpr FileIdentifier file_identifier = 9214735; + Version lastCompacted; Version lastCommitted; ConfigFollowerGetCommittedVersionReply() = default; - explicit ConfigFollowerGetCommittedVersionReply(Version lastCommitted) : lastCommitted(lastCommitted) {} + explicit ConfigFollowerGetCommittedVersionReply(Version lastCompacted, Version lastCommitted) + : lastCompacted(lastCompacted), lastCommitted(lastCommitted) {} template void serialize(Ar& ar) { - serializer(ar, lastCommitted); + serializer(ar, lastCompacted, lastCommitted); } }; diff --git a/fdbserver/ConfigNode.actor.cpp b/fdbserver/ConfigNode.actor.cpp index a99fda55dc..7152c8ce9b 100644 --- a/fdbserver/ConfigNode.actor.cpp +++ b/fdbserver/ConfigNode.actor.cpp @@ -540,13 +540,15 @@ class ConfigNodeImpl { // committed version and rollforward version. 
ASSERT_GT(req.mutations[0].version, currentGeneration.committedVersion); wait(commitMutations(self, req.mutations, req.annotations, req.target)); + req.reply.send(Void()); return Void(); } ACTOR static Future getCommittedVersion(ConfigNodeImpl* self, ConfigFollowerGetCommittedVersionRequest req) { + state Version lastCompacted = wait(getLastCompactedVersion(self)); ConfigGeneration generation = wait(getGeneration(self)); - req.reply.send(ConfigFollowerGetCommittedVersionReply{ generation.committedVersion }); + req.reply.send(ConfigFollowerGetCommittedVersionReply{ lastCompacted, generation.committedVersion }); return Void(); } diff --git a/fdbserver/PaxosConfigConsumer.actor.cpp b/fdbserver/PaxosConfigConsumer.actor.cpp index 875b34cae4..cdc21f14cb 100644 --- a/fdbserver/PaxosConfigConsumer.actor.cpp +++ b/fdbserver/PaxosConfigConsumer.actor.cpp @@ -45,6 +45,9 @@ class GetCommittedVersionQuorum { std::map> replies; std::map priorVersions; std::map committed; + // Need to know the largest compacted version on any node to avoid asking + // for changes that have already been compacted. + Version largestCompactedResponse{ 0 }; // Last durably committed version. Version lastSeenVersion; size_t totalRepliesReceived{ 0 }; @@ -61,6 +64,7 @@ class GetCommittedVersionQuorum { ACTOR static Future updateNode(GetCommittedVersionQuorum* self, CommittedVersions nodeVersion, CommittedVersions quorumVersion, + Version lastCompacted, ConfigFollowerInterface cfi) { state Version target = quorumVersion.lastCommitted; if (nodeVersion.lastCommitted == target) { @@ -82,11 +86,23 @@ class GetCommittedVersionQuorum { rollback = std::max(nodeVersion.lastCommitted - 1, Version{ 0 }); } + if (rollback.present()) { + // When a new ConfigBroadcaster is created, it may not know + // about the last committed version on the ConfigNodes. 
If + // compaction has occurred, this can cause change requests to + // be sent to nodes asking for version 0 when the node has + // already compacted that version, causing an error. Make sure + // the rollback version is at least set to the last compacted + // version to prevent this issue. + rollback = std::max(rollback.get(), lastCompacted); + } + // Now roll node forward to match the largest committed version of // the replies. state Reference quorumCfi(new ConfigFollowerInfo(self->replies[target], false)); try { - state Version lastSeenVersion = rollback.present() ? rollback.get() : nodeVersion.lastCommitted; + state Version lastSeenVersion = std::max( + rollback.present() ? rollback.get() : nodeVersion.lastCommitted, self->largestCompactedResponse); ConfigFollowerGetChangesReply reply = wait(timeoutError(basicLoadBalance(quorumCfi, &ConfigFollowerInterface::getChanges, @@ -96,40 +112,12 @@ class GetCommittedVersionQuorum { rollback, nodeVersion.lastCommitted, target, reply.changes, reply.annotations }), SERVER_KNOBS->GET_COMMITTED_VERSION_TIMEOUT)); } catch (Error& e) { - if (e.code() == error_code_version_already_compacted || e.code() == error_code_process_behind) { - // In the case of an already_compacted or process_behind - // error, fetch the latest snapshot and attempt to roll the - // node forward. 
- TEST(true); // PaxosConfigConsumer rollforward compacted or behind ConfigNode - - try { - ConfigFollowerGetSnapshotAndChangesReply reply = - wait(timeoutError(basicLoadBalance(quorumCfi, - &ConfigFollowerInterface::getSnapshotAndChanges, - ConfigFollowerGetSnapshotAndChangesRequest{ target }), - SERVER_KNOBS->GET_SNAPSHOT_AND_CHANGES_TIMEOUT)); - if (reply.changes.size() == 0 || reply.changes[reply.changes.size() - 1].version < target) { - return Void(); - } - - int64_t rollbackTo = reply.changes[0].version - 1; - if (rollback.present()) { - rollbackTo = std::min(rollbackTo, rollback.get()); - } - wait(timeoutError( - cfi.rollforward.getReply(ConfigFollowerRollforwardRequest{ - rollbackTo, nodeVersion.lastCommitted, target, reply.changes, reply.annotations }), - SERVER_KNOBS->GET_COMMITTED_VERSION_TIMEOUT)); - } catch (Error& e2) { - if (e2.code() != error_code_transaction_too_old) { - throw; - } - } - } else if (e.code() == error_code_transaction_too_old) { + if (e.code() == error_code_transaction_too_old) { // Seeing this trace is not necessarily a problem. There // are legitimate scenarios where a ConfigNode could return - // transaction_too_old in response to a rollforward - // request. + // one of these errors in response to a get changes or + // rollforward request. The retry loop should handle this + // case. 
TraceEvent(SevInfo, "ConfigNodeRollforwardError").error(e); } else { throw; @@ -144,9 +132,10 @@ class GetCommittedVersionQuorum { ConfigFollowerGetCommittedVersionReply reply = wait(timeoutError(cfi.getCommittedVersion.getReply(ConfigFollowerGetCommittedVersionRequest{}), SERVER_KNOBS->GET_COMMITTED_VERSION_TIMEOUT)); - self->committed[cfi.address()] = reply.lastCommitted; ++self->totalRepliesReceived; + self->largestCompactedResponse = std::max(self->largestCompactedResponse, reply.lastCompacted); + state Version lastCompacted = reply.lastCompacted; self->largestCommitted = std::max(self->largestCommitted, reply.lastCommitted); state CommittedVersions committedVersions = CommittedVersions{ self->lastSeenVersion, reply.lastCommitted }; if (self->priorVersions.find(committedVersions.lastCommitted) == self->priorVersions.end()) { @@ -160,14 +149,15 @@ class GetCommittedVersionQuorum { if (self->quorumVersion.canBeSet()) { self->quorumVersion.send(QuorumVersion{ committedVersions, true }); } - wait(self->updateNode(self, committedVersions, self->quorumVersion.getFuture().get().versions, cfi)); + wait(self->updateNode( + self, committedVersions, self->quorumVersion.getFuture().get().versions, lastCompacted, cfi)); } else if (self->maxAgreement >= self->cfis.size() / 2 + 1) { // A quorum of ConfigNodes agree on the latest committed version, // but the node we just got a reply from is not one of them. We may // need to roll it forward or back. 
QuorumVersion quorumVersion = wait(self->quorumVersion.getFuture()); ASSERT(committedVersions.lastCommitted != quorumVersion.versions.lastCommitted); - wait(self->updateNode(self, committedVersions, quorumVersion.versions, cfi)); + wait(self->updateNode(self, committedVersions, quorumVersion.versions, lastCompacted, cfi)); } else if (self->maxAgreement + (self->cfis.size() - self->totalRepliesReceived) < (self->cfis.size() / 2 + 1)) { // It is impossible to reach a quorum of ConfigNodes that agree @@ -182,12 +172,13 @@ class GetCommittedVersionQuorum { self->quorumVersion.send( QuorumVersion{ CommittedVersions{ largestCommittedPrior, largestCommitted }, false }); } - wait(self->updateNode(self, committedVersions, self->quorumVersion.getFuture().get().versions, cfi)); + wait(self->updateNode( + self, committedVersions, self->quorumVersion.getFuture().get().versions, lastCompacted, cfi)); } else { // Still building up responses; don't have enough data to act on // yet, so wait until we do. QuorumVersion quorumVersion = wait(self->quorumVersion.getFuture()); - wait(self->updateNode(self, committedVersions, quorumVersion.versions, cfi)); + wait(self->updateNode(self, committedVersions, quorumVersion.versions, lastCompacted, cfi)); } } catch (Error& e) { // Count a timeout as a reply. 
@@ -196,8 +187,10 @@ class GetCommittedVersionQuorum { if (self->quorumVersion.canBeSet()) { self->quorumVersion.sendError(e); } - } else if (e.code() != error_code_timed_out) { - throw; + } else if (e.code() != error_code_timed_out && e.code() != error_code_broken_promise) { + if (self->quorumVersion.canBeSet()) { + self->quorumVersion.sendError(e); + } } else if (self->totalRepliesReceived == self->cfis.size() && self->quorumVersion.canBeSet() && !self->quorumVersion.isError()) { size_t nonTimeoutReplies = @@ -206,14 +199,10 @@ class GetCommittedVersionQuorum { }); if (nonTimeoutReplies >= self->cfis.size() / 2 + 1) { // Make sure to trigger the quorumVersion if a timeout - // occurred, a quorum disagree on the committed version, and - // there are no more incoming responses. Note that this means - // that it is impossible to reach a quorum, so send back the - // largest committed version seen. We also need to store the - // interface for the timed out server for future communication - // attempts. - auto& nodes = self->replies[self->largestCommitted]; - nodes.push_back(cfi); + // occurred, a quorum disagree on the committed version, + // and there are no more incoming responses. Note that this + // means that it is impossible to reach a quorum, so send + // back the largest committed version seen. 
self->quorumVersion.send( QuorumVersion{ CommittedVersions{ self->lastSeenVersion, self->largestCommitted }, false }); } else if (!self->quorumVersion.isSet()) { @@ -333,7 +322,8 @@ class PaxosConfigConsumerImpl { } catch (Error& e) { if (e.code() == error_code_failed_to_reach_quorum) { wait(self->getCommittedVersionQuorum.complete()); - } else if (e.code() != error_code_timed_out && e.code() != error_code_broken_promise) { + } else if (e.code() != error_code_timed_out && e.code() != error_code_broken_promise && + e.code() != error_code_version_already_compacted && e.code() != error_code_process_behind) { throw; } wait(delayJittered(0.1)); @@ -391,7 +381,8 @@ class PaxosConfigConsumerImpl { wait(delayJittered(self->pollingInterval)); } catch (Error& e) { if (e.code() == error_code_version_already_compacted || e.code() == error_code_timed_out || - e.code() == error_code_failed_to_reach_quorum) { + e.code() == error_code_failed_to_reach_quorum || e.code() == error_code_version_already_compacted || + e.code() == error_code_process_behind) { TEST(true); // PaxosConfigConsumer get version_already_compacted error if (e.code() == error_code_failed_to_reach_quorum) { try { @@ -411,7 +402,7 @@ class PaxosConfigConsumerImpl { self->resetCommittedVersionQuorum(); continue; } else { - throw e; + throw; } } try { From 67eba5ec7c41155dfc241cfcdfa9d3db64698467 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Mon, 13 Dec 2021 12:13:34 -0600 Subject: [PATCH 100/138] Limiting DD Moves by destination SS. 
--- fdbclient/ServerKnobs.cpp | 2 + fdbclient/ServerKnobs.h | 2 + fdbserver/DataDistributionQueue.actor.cpp | 168 ++++++++++++++++++---- 3 files changed, 143 insertions(+), 29 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 38b8fa5f55..bbfdcece5d 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -113,10 +113,12 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi // Data distribution queue init( HEALTH_POLL_TIME, 1.0 ); init( BEST_TEAM_STUCK_DELAY, 1.0 ); + init( DEST_OVERLOADED_DELAY, 0.2 ); init( BG_REBALANCE_POLLING_INTERVAL, 10.0 ); init( BG_REBALANCE_SWITCH_CHECK_INTERVAL, 5.0 ); if (randomize && BUGGIFY) BG_REBALANCE_SWITCH_CHECK_INTERVAL = 1.0; init( DD_QUEUE_LOGGING_INTERVAL, 5.0 ); init( RELOCATION_PARALLELISM_PER_SOURCE_SERVER, 2 ); if( randomize && BUGGIFY ) RELOCATION_PARALLELISM_PER_SOURCE_SERVER = 1; + init( RELOCATION_PARALLELISM_PER_DEST_SERVER, 10 ); if( randomize && BUGGIFY ) RELOCATION_PARALLELISM_PER_DEST_SERVER = 1; // Note: if this is smaller than FETCH_KEYS_PARALLELISM, this will artificially reduce performance. The current default of 10 is probably too high but is set conservatively for now. 
init( DD_QUEUE_MAX_KEY_SERVERS, 100 ); if( randomize && BUGGIFY ) DD_QUEUE_MAX_KEY_SERVERS = 1; init( DD_REBALANCE_PARALLELISM, 50 ); init( DD_REBALANCE_RESET_AMOUNT, 30 ); diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index bf12da046a..532f70c19a 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -112,10 +112,12 @@ public: // Data distribution queue double HEALTH_POLL_TIME; double BEST_TEAM_STUCK_DELAY; + double DEST_OVERLOADED_DELAY; double BG_REBALANCE_POLLING_INTERVAL; double BG_REBALANCE_SWITCH_CHECK_INTERVAL; double DD_QUEUE_LOGGING_INTERVAL; double RELOCATION_PARALLELISM_PER_SOURCE_SERVER; + double RELOCATION_PARALLELISM_PER_DEST_SERVER; int DD_QUEUE_MAX_KEY_SERVERS; int DD_REBALANCE_PARALLELISM; int DD_REBALANCE_RESET_AMOUNT; diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 55d86742fa..f89244c5b7 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -48,6 +48,7 @@ struct RelocateData { int workFactor; std::vector src; std::vector completeSources; + std::vector completeDests; bool wantsNewServers; TraceInterval interval; @@ -87,7 +88,7 @@ struct RelocateData { return priority == rhs.priority && boundaryPriority == rhs.boundaryPriority && healthPriority == rhs.healthPriority && keys == rhs.keys && startTime == rhs.startTime && workFactor == rhs.workFactor && src == rhs.src && completeSources == rhs.completeSources && - wantsNewServers == rhs.wantsNewServers && randomId == rhs.randomId; + completeDests == rhs.completeDests && wantsNewServers == rhs.wantsNewServers && randomId == rhs.randomId; } bool operator!=(const RelocateData& rhs) const { return !(*this == rhs); } }; @@ -262,7 +263,7 @@ struct Busyness { Busyness() : ledger(10, 0) {} - bool canLaunch(int prio, int work) { + bool canLaunch(int prio, int work) const { ASSERT(prio > 0 && prio < 1000); return ledger[prio / 100] <= WORK_FULL_UTILIZATION - work; // allow 
for rounding errors in double division } @@ -281,7 +282,8 @@ struct Busyness { if (i != 1) result += ", "; result += i + 1 == j ? format("%03d", i * 100) : format("%03d/%03d", i * 100, (j - 1) * 100); - result += format("=%1.02f", (float)ledger[i] / WORK_FULL_UTILIZATION); + result += + format("=%1.02f (%d/%d)", (float)ledger[i] / WORK_FULL_UTILIZATION, ledger[i], WORK_FULL_UTILIZATION); i = j; } return result; @@ -289,7 +291,7 @@ struct Busyness { }; // find the "workFactor" for this, were it launched now -int getWorkFactor(RelocateData const& relocation, int singleRegionTeamSize) { +int getSrcWorkFactor(RelocateData const& relocation, int singleRegionTeamSize) { if (relocation.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || relocation.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT) return WORK_FULL_UTILIZATION / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER; @@ -299,21 +301,26 @@ int getWorkFactor(RelocateData const& relocation, int singleRegionTeamSize) { return WORK_FULL_UTILIZATION / singleRegionTeamSize / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER; } -// Data movement's resource control: Do not overload source servers used for the RelocateData +int getDestWorkFactor() { + // Work of moving a shard is even across destination servers + return WORK_FULL_UTILIZATION / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_DEST_SERVER; +} + +// Data movement's resource control: Do not overload servers used for the RelocateData // return true if servers are not too busy to launch the relocation // This ensure source servers will not be overloaded. 
-bool canLaunch(RelocateData& relocation, - int teamSize, - int singleRegionTeamSize, - std::map& busymap, - std::vector cancellableRelocations) { +bool canLaunchSrc(RelocateData& relocation, + int teamSize, + int singleRegionTeamSize, + std::map& busymap, + std::vector cancellableRelocations) { // assert this has not already been launched ASSERT(relocation.workFactor == 0); ASSERT(relocation.src.size() != 0); ASSERT(teamSize >= singleRegionTeamSize); // find the "workFactor" for this, were it launched now - int workFactor = getWorkFactor(relocation, singleRegionTeamSize); + int workFactor = getSrcWorkFactor(relocation, singleRegionTeamSize); int neededServers = std::min(relocation.src.size(), teamSize - singleRegionTeamSize + 1); if (SERVER_KNOBS->USE_OLD_NEEDED_SERVERS) { neededServers = std::max(1, (int)relocation.src.size() - teamSize + 1); @@ -338,18 +345,55 @@ bool canLaunch(RelocateData& relocation, return false; } +// candidateTeams is a vector containing one team per datacenter, the team(s) DD is planning on moving the shard to. 
+bool canLaunchDest(const std::vector, bool>>& candidateTeams, + int priority, + std::map& busymapDest) { + // fail switch if this is causing issues + if (SERVER_KNOBS->RELOCATION_PARALLELISM_PER_DEST_SERVER <= 0) { + return true; + } + int workFactor = getDestWorkFactor(); + for (auto& team : candidateTeams) { + for (UID id : team.first->getServerIDs()) { + if (!busymapDest[id].canLaunch(priority, workFactor)) { + return false; + } + } + } + return true; +} + // update busyness for each server void launch(RelocateData& relocation, std::map& busymap, int singleRegionTeamSize) { // if we are here this means that we can launch and should adjust all the work the servers can do - relocation.workFactor = getWorkFactor(relocation, singleRegionTeamSize); + relocation.workFactor = getSrcWorkFactor(relocation, singleRegionTeamSize); for (int i = 0; i < relocation.src.size(); i++) busymap[relocation.src[i]].addWork(relocation.priority, relocation.workFactor); } -void complete(RelocateData const& relocation, std::map& busymap) { +void launchDest(RelocateData& relocation, + const std::vector, bool>>& candidateTeams, + std::map& destBusymap) { + ASSERT(relocation.completeDests.empty()); + int destWorkFactor = getDestWorkFactor(); + for (auto& team : candidateTeams) { + for (UID id : team.first->getServerIDs()) { + relocation.completeDests.push_back(id); + destBusymap[id].addWork(relocation.priority, destWorkFactor); + } + } +} + +void complete(RelocateData const& relocation, std::map& busymap, std::map& destBusymap) { ASSERT(relocation.workFactor > 0); for (int i = 0; i < relocation.src.size(); i++) busymap[relocation.src[i]].removeWork(relocation.priority, relocation.workFactor); + + int destWorkFactor = getDestWorkFactor(); + for (UID id : relocation.completeDests) { + destBusymap[id].removeWork(relocation.priority, destWorkFactor); + } } ACTOR Future dataDistributionRelocator(struct DDQueueData* self, @@ -376,6 +420,7 @@ struct DDQueueData { int singleRegionTeamSize; 
std::map busymap; // UID is serverID + std::map destBusymap; // UID is serverID KeyRangeMap queueMap; std::set> fetchingSourcesQueue; @@ -546,15 +591,22 @@ struct DDQueueData { .detail("Problem", "relocate data that is inFlight is not also in the queue"); } + for (int i = 0; i < it->value().completeDests.size(); i++) { + // each server in the inFlight map is in the dest busymap + if (!destBusymap.count(it->value().completeDests[i])) + TraceEvent(SevError, "DDQueueValidateError10") + .detail("Problem", "each server in the inFlight map is in the destBusymap"); + } + // in flight relocates have source servers if (it->value().startTime != -1 && !it->value().src.size()) - TraceEvent(SevError, "DDQueueValidateError10") + TraceEvent(SevError, "DDQueueValidateError11") .detail("Problem", "in flight relocates have source servers"); if (inFlightActors.liveActorAt(it->range().begin)) { // the key range in the inFlight map matches the key range in the RelocateData message if (it->value().keys != it->range()) - TraceEvent(SevError, "DDQueueValidateError11") + TraceEvent(SevError, "DDQueueValidateError12") .detail( "Problem", "the key range in the inFlight map matches the key range in the RelocateData message"); @@ -564,13 +616,29 @@ struct DDQueueData { for (auto it = busymap.begin(); it != busymap.end(); ++it) { for (int i = 0; i < it->second.ledger.size() - 1; i++) { if (it->second.ledger[i] < it->second.ledger[i + 1]) - TraceEvent(SevError, "DDQueueValidateError12") + TraceEvent(SevError, "DDQueueValidateError13") .detail("Problem", "ascending ledger problem") .detail("LedgerLevel", i) .detail("LedgerValueA", it->second.ledger[i]) .detail("LedgerValueB", it->second.ledger[i + 1]); if (it->second.ledger[i] < 0.0) - TraceEvent(SevError, "DDQueueValidateError13") + TraceEvent(SevError, "DDQueueValidateError14") + .detail("Problem", "negative ascending problem") + .detail("LedgerLevel", i) + .detail("LedgerValue", it->second.ledger[i]); + } + } + + for (auto it = 
destBusymap.begin(); it != destBusymap.end(); ++it) { + for (int i = 0; i < it->second.ledger.size() - 1; i++) { + if (it->second.ledger[i] < it->second.ledger[i + 1]) + TraceEvent(SevError, "DDQueueValidateError15") + .detail("Problem", "ascending ledger problem") + .detail("LedgerLevel", i) + .detail("LedgerValueA", it->second.ledger[i]) + .detail("LedgerValueB", it->second.ledger[i + 1]); + if (it->second.ledger[i] < 0.0) + TraceEvent(SevError, "DDQueueValidateError16") .detail("Problem", "negative ascending problem") .detail("LedgerLevel", i) .detail("LedgerValue", it->second.ledger[i]); @@ -895,7 +963,7 @@ struct DDQueueData { // SOMEDAY: the list of source servers may be outdated since they were fetched when the work was put in the // queue // FIXME: we need spare capacity even when we're just going to be cancelling work via TEAM_HEALTHY - if (!canLaunch(rd, teamSize, singleRegionTeamSize, busymap, cancellableRelocations)) { + if (!canLaunchSrc(rd, teamSize, singleRegionTeamSize, busymap, cancellableRelocations)) { // logRelocation( rd, "SkippingQueuedRelocation" ); continue; } @@ -956,6 +1024,18 @@ struct DDQueueData { } }; +static std::string destServersString(std::vector, bool>> const& bestTeams) { + std::stringstream ss; + + for (auto& tc : bestTeams) { + for (const auto& id : tc.first->getServerIDs()) { + ss << id.toString() << " "; + } + } + + return std::move(ss).str(); +} + // This actor relocates the specified keys to a good place. 
// The inFlightActor key range map stores the actor for each RelocateData ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, const DDEnabledState* ddEnabledState) { @@ -970,6 +1050,9 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, state bool anyHealthy = false; state bool allHealthy = true; state bool anyWithSource = false; + state bool anyDestOverloaded = false; + state int destOverloadedCount = 0; + state int stuckCount = 0; state std::vector, bool>> bestTeams; state double startTime = now(); state std::vector destIds; @@ -997,7 +1080,8 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, ASSERT(rd.src.size()); loop { - state int stuckCount = 0; + destOverloadedCount = 0; + stuckCount = 0; // state int bestTeamStuckThreshold = 50; loop { state int tciIndex = 0; @@ -1005,6 +1089,7 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, anyHealthy = false; allHealthy = true; anyWithSource = false; + anyDestOverloaded = false; bestTeams.clear(); // Get team from teamCollections in different DCs and find the best one while (tciIndex < self->teamCollections.size()) { @@ -1058,18 +1143,41 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, bestTeams.emplace_back(bestTeam.first.get(), bestTeam.second); tciIndex++; } - if (foundTeams && anyHealthy) { + // once we've found healthy candidate teams, make sure they're not overloaded with outstanding moves + // already + anyDestOverloaded = !canLaunchDest(bestTeams, rd.priority, self->destBusymap); + + if (foundTeams && anyHealthy && !anyDestOverloaded) { + ASSERT(rd.completeDests.empty()); break; } - TEST(true); // did not find a healthy destination team on the first attempt - stuckCount++; - TraceEvent(stuckCount > 50 ? 
SevWarnAlways : SevWarn, "BestTeamStuck", distributorId) - .suppressFor(1.0) - .detail("Count", stuckCount) - .detail("TeamCollectionId", tciIndex) - .detail("NumOfTeamCollections", self->teamCollections.size()); - wait(delay(SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskPriority::DataDistributionLaunch)); + if (anyDestOverloaded) { + TEST(true); // Destination overloaded throttled move + destOverloadedCount++; + TraceEvent(destOverloadedCount > 50 ? SevInfo : SevDebug, "DestSSBusy", distributorId) + .suppressFor(1.0) + .detail("StuckCount", stuckCount) + .detail("DestOverloadedCount", destOverloadedCount) + .detail("TeamCollectionId", tciIndex) + .detail("AnyDestOverloaded", anyDestOverloaded) + .detail("NumOfTeamCollections", self->teamCollections.size()) + .detail("Servers", destServersString(bestTeams)); + wait(delay(SERVER_KNOBS->DEST_OVERLOADED_DELAY, TaskPriority::DataDistributionLaunch)); + } else { + TEST(true); // did not find a healthy destination team on the first attempt + stuckCount++; + TraceEvent(stuckCount > 50 ? SevWarnAlways : SevWarn, "BestTeamStuck", distributorId) + .suppressFor(1.0) + .detail("StuckCount", stuckCount) + .detail("DestOverloadedCount", destOverloadedCount) + .detail("TeamCollectionId", tciIndex) + .detail("AnyDestOverloaded", anyDestOverloaded) + .detail("NumOfTeamCollections", self->teamCollections.size()); + wait(delay(SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskPriority::DataDistributionLaunch)); + } + + // TODO different trace event + knob for overloaded? Could wait on an async var for done moves } destIds.clear(); @@ -1123,6 +1231,8 @@ ACTOR Future dataDistributionRelocator(DDQueueData* self, RelocateData rd, // FIXME: do not add data in flight to servers that were already in the src. 
healthyDestinations.addDataInFlightToTeam(+metrics.bytes); + launchDest(rd, bestTeams, self->destBusymap); + if (SERVER_KNOBS->DD_ENABLE_VERBOSE_TRACING) { // StorageMetrics is the rd shard's metrics, e.g., bytes and write bandwidth TraceEvent(SevInfo, "RelocateShardDecision", distributorId) @@ -1646,7 +1756,7 @@ ACTOR Future dataDistributionQueue(Database cx, launchData = results; } when(RelocateData done = waitNext(self.dataTransferComplete.getFuture())) { - complete(done, self.busymap); + complete(done, self.busymap, self.destBusymap); if (serversToLaunchFrom.empty() && !done.src.empty()) launchQueuedWorkTimeout = delay(0, TaskPriority::DataDistributionLaunch); serversToLaunchFrom.insert(done.src.begin(), done.src.end()); From a04934465c46ac4f7b0aa4cf9b4ddbe6fcb89a43 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 15 Mar 2022 10:41:16 -0700 Subject: [PATCH 101/138] Add tenant support back to mapped range requests. Fix ACTOR warning. --- fdbclient/NativeAPI.actor.cpp | 2 +- fdbserver/storageserver.actor.cpp | 64 +++++++++---------------------- 2 files changed, 19 insertions(+), 47 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 8f5c9efb44..c6f1fa4baf 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -3900,7 +3900,7 @@ int64_t inline getRangeResultFamilyBytes(MappedRangeResultRef result) { } // TODO: Client should add mapped keys to conflict ranges. -ACTOR template // RangeResult or MappedRangeResult +template // RangeResult or MappedRangeResult void getRangeFinished(Reference trState, double startTime, KeySelector begin, diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index d7a8dfb0a9..135fac8352 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -2224,8 +2224,12 @@ ACTOR Future quickGetValue(StorageServer* data, if (data->shards[key]->isReadable()) { try { // TODO: Use a lower level API may be better? 
Or tweak priorities? - GetValueRequest req( - pOriginalReq->spanContext, TenantInfo(), key, version, pOriginalReq->tags, pOriginalReq->debugID); + GetValueRequest req(pOriginalReq->spanContext, + pOriginalReq->tenantInfo, + key, + version, + pOriginalReq->tags, + pOriginalReq->debugID); // Note that it does not use readGuard to avoid server being overloaded here. Throttling is enforced at the // original request level, rather than individual underlying lookups. The reason is that throttle any // individual underlying lookup will fail the original request, which is not productive. @@ -2245,7 +2249,7 @@ ACTOR Future quickGetValue(StorageServer* data, ++data->counters.quickGetValueMiss; if (SERVER_KNOBS->QUICK_GET_VALUE_FALLBACK) { - state Transaction tr(data->cx); + state Transaction tr(data->cx, pOriginalReq->tenantInfo.name); tr.setVersion(version); // TODO: is DefaultPromiseEndpoint the best priority for this? tr.trState->taskID = TaskPriority::DefaultPromiseEndpoint; @@ -2816,6 +2820,7 @@ ACTOR Future quickGetKeyValues( req.begin = getRange.begin; req.end = getRange.end; req.version = version; + req.tenantInfo = pOriginalReq->tenantInfo; // TODO: Validate when the underlying range query exceeds the limit. // TODO: Use remainingLimit, remainingLimitBytes rather than separate knobs. req.limit = SERVER_KNOBS->QUICK_GET_KEY_VALUES_LIMIT; @@ -2843,7 +2848,7 @@ ACTOR Future quickGetKeyValues( ++data->counters.quickGetKeyValuesMiss; if (SERVER_KNOBS->QUICK_GET_KEY_VALUES_FALLBACK) { - state Transaction tr(data->cx); + state Transaction tr(data->cx, pOriginalReq->tenantInfo.name); tr.setVersion(version); // TODO: is DefaultPromiseEndpoint the best priority for this? 
tr.trState->taskID = TaskPriority::DefaultPromiseEndpoint; @@ -2858,10 +2863,7 @@ ACTOR Future quickGetKeyValues( } }; -Key constructMappedKey(KeyValueRef* keyValue, - Tuple& mappedKeyFormatTuple, - bool& isRangeQuery, - Optional tenantPrefix) { +Key constructMappedKey(KeyValueRef* keyValue, Tuple& mappedKeyFormatTuple, bool& isRangeQuery) { // Lazily parse key and/or value to tuple because they may not need to be a tuple if not used. Optional keyTuple; Optional valueTuple; @@ -2951,13 +2953,7 @@ Key constructMappedKey(KeyValueRef* keyValue, } } - KeyRef mappedKey = mappedKeyTuple.pack(); - - if (tenantPrefix.present()) { - return mappedKey.withPrefix(tenantPrefix.get()); - } - - return mappedKey; + return mappedKeyTuple.pack(); } TEST_CASE("/fdbserver/storageserver/constructMappedKey") { @@ -2973,7 +2969,7 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") { .append("{...}"_sr); bool isRangeQuery = false; - Key mappedKey = constructMappedKey(&kvr, mapperTuple, isRangeQuery, Optional()); + Key mappedKey = constructMappedKey(&kvr, mapperTuple, isRangeQuery); Key expectedMappedKey = Tuple() .append("normal"_sr) @@ -2985,33 +2981,11 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") { ASSERT(mappedKey.compare(expectedMappedKey) == 0); ASSERT(isRangeQuery == true); } - { - Tuple mapperTuple = Tuple() - .append("normal"_sr) - .append("{{escaped}}"_sr) - .append("{K[2]}"_sr) - .append("{V[0]}"_sr) - .append("{...}"_sr); - - bool isRangeQuery = false; - Key mappedKey = constructMappedKey(&kvr, mapperTuple, isRangeQuery, TenantMapEntry(1, "foo"_sr).prefix); - - Key expectedMappedKey = Tuple() - .append("normal"_sr) - .append("{escaped}"_sr) - .append("key-2"_sr) - .append("value-0"_sr) - .getDataAsStandalone() - .withPrefix("foo\x00\x00\x00\x00\x00\x00\x00\x01"_sr); - // std::cout << printable(mappedKey) << " == " << printable(expectedMappedKey) << std::endl; - ASSERT(mappedKey.compare(expectedMappedKey) == 0); - ASSERT(isRangeQuery == true); - } { Tuple 
mapperTuple = Tuple().append("{{{{}}"_sr).append("}}"_sr); bool isRangeQuery = false; - Key mappedKey = constructMappedKey(&kvr, mapperTuple, isRangeQuery, Optional()); + Key mappedKey = constructMappedKey(&kvr, mapperTuple, isRangeQuery); Key expectedMappedKey = Tuple().append("{{}"_sr).append("}"_sr).getDataAsStandalone(); // std::cout << printable(mappedKey) << " == " << printable(expectedMappedKey) << std::endl; @@ -3022,7 +2996,7 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") { Tuple mapperTuple = Tuple().append("{{{{}}"_sr).append("}}"_sr); bool isRangeQuery = false; - Key mappedKey = constructMappedKey(&kvr, mapperTuple, isRangeQuery, Optional()); + Key mappedKey = constructMappedKey(&kvr, mapperTuple, isRangeQuery); Key expectedMappedKey = Tuple().append("{{}"_sr).append("}"_sr).getDataAsStandalone(); // std::cout << printable(mappedKey) << " == " << printable(expectedMappedKey) << std::endl; @@ -3034,7 +3008,7 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") { bool isRangeQuery = false; state bool throwException = false; try { - Key mappedKey = constructMappedKey(&kvr, mapperTuple, isRangeQuery, Optional()); + Key mappedKey = constructMappedKey(&kvr, mapperTuple, isRangeQuery); } catch (Error& e) { ASSERT(e.code() == error_code_mapper_bad_index); throwException = true; @@ -3046,7 +3020,7 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") { bool isRangeQuery = false; state bool throwException2 = false; try { - Key mappedKey = constructMappedKey(&kvr, mapperTuple, isRangeQuery, Optional()); + Key mappedKey = constructMappedKey(&kvr, mapperTuple, isRangeQuery); } catch (Error& e) { ASSERT(e.code() == error_code_mapper_bad_range_decriptor); throwException2 = true; @@ -3058,7 +3032,7 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") { bool isRangeQuery = false; state bool throwException3 = false; try { - Key mappedKey = constructMappedKey(&kvr, mapperTuple, isRangeQuery, Optional()); + Key mappedKey = 
constructMappedKey(&kvr, mapperTuple, isRangeQuery); } catch (Error& e) { ASSERT(e.code() == error_code_mapper_bad_index); throwException3 = true; @@ -3090,7 +3064,7 @@ ACTOR Future mapKeyValues(StorageServer* data, kvm.value = it->value; state bool isRangeQuery = false; - state Key mappedKey = constructMappedKey(it, mappedKeyFormatTuple, isRangeQuery, tenantPrefix); + state Key mappedKey = constructMappedKey(it, mappedKeyFormatTuple, isRangeQuery); // Make sure the mappedKey is always available, so that it's good even we want to get key asynchronously. result.arena.dependsOn(mappedKey.arena()); @@ -3101,12 +3075,10 @@ ACTOR Future mapKeyValues(StorageServer* data, // Use the mappedKey as the prefix of the range query. GetRangeReqAndResultRef getRange = wait(quickGetKeyValues(data, mappedKey, input.version, &(result.arena), pOriginalReq)); - // TODO: Remove tenant prefixes in the keys if they haven't been removed? kvm.reqAndResult = getRange; } else { GetValueReqAndResultRef getValue = wait(quickGetValue(data, mappedKey, input.version, &(result.arena), pOriginalReq)); - // TODO: Remove tenant prefixes in the keys if they haven't been removed? kvm.reqAndResult = getValue; } result.data.push_back(result.arena, kvm); From c3a68d661e37b0e13539fc72ad40b785d3a49203 Mon Sep 17 00:00:00 2001 From: He Liu <86634338+liquid-helium@users.noreply.github.com> Date: Tue, 15 Mar 2022 13:03:23 -0700 Subject: [PATCH 102/138] Physical Shard Move (#6264) Physical Shard Move part I: Checkpoint creation, transfer and restore. 
--- fdbclient/CMakeLists.txt | 1 + fdbclient/NativeAPI.actor.cpp | 161 ++++++ fdbclient/NativeAPI.actor.h | 19 + fdbclient/ServerKnobs.cpp | 3 + fdbclient/ServerKnobs.h | 2 + fdbclient/StorageCheckpoint.h | 88 ++++ fdbclient/StorageServerInterface.h | 62 +++ fdbclient/SystemData.cpp | 27 + fdbclient/SystemData.h | 7 + fdbserver/ApplyMetadataMutation.cpp | 24 + fdbserver/CMakeLists.txt | 5 + fdbserver/IKeyValueStore.h | 25 + fdbserver/KeyValueStoreRocksDB.actor.cpp | 471 ++++++++++++++++-- fdbserver/RocksDBCheckpointUtils.actor.cpp | 283 +++++++++++ fdbserver/RocksDBCheckpointUtils.actor.h | 209 ++++++++ fdbserver/ServerCheckpoint.actor.cpp | 67 +++ fdbserver/ServerCheckpoint.actor.h | 66 +++ fdbserver/storageserver.actor.cpp | 443 +++++++++++++--- .../workloads/PhysicalShardMove.actor.cpp | 234 +++++++++ flow/error_definitions.h | 2 + tests/CMakeLists.txt | 1 + tests/fast/PhysicalShardMove.toml | 13 + 22 files changed, 2086 insertions(+), 127 deletions(-) create mode 100644 fdbclient/StorageCheckpoint.h create mode 100644 fdbserver/RocksDBCheckpointUtils.actor.cpp create mode 100644 fdbserver/RocksDBCheckpointUtils.actor.h create mode 100644 fdbserver/ServerCheckpoint.actor.cpp create mode 100644 fdbserver/ServerCheckpoint.actor.h create mode 100644 fdbserver/workloads/PhysicalShardMove.actor.cpp create mode 100644 tests/fast/PhysicalShardMove.toml diff --git a/fdbclient/CMakeLists.txt b/fdbclient/CMakeLists.txt index 1fbb8d532d..3d3a0d4ecd 100644 --- a/fdbclient/CMakeLists.txt +++ b/fdbclient/CMakeLists.txt @@ -128,6 +128,7 @@ set(FDBCLIENT_SRCS StatusClient.h StorageServerInterface.cpp StorageServerInterface.h + StorageCheckpoint.h Subspace.cpp Subspace.h StackLineage.h diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 8f5c9efb44..68b32c84bc 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -7630,6 +7630,167 @@ ACTOR Future snapCreate(Database cx, Standalone snapCmd, UID sn } } +ACTOR template +static 
Future createCheckpointImpl(T tr, KeyRangeRef range, CheckpointFormat format) { + TraceEvent("CreateCheckpointTransactionBegin").detail("Range", range.toString()); + + state RangeResult keyServers = wait(krmGetRanges(tr, keyServersPrefix, range)); + ASSERT(!keyServers.more); + + state RangeResult UIDtoTagMap = wait(tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY)); + ASSERT(!UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY); + + for (int i = 0; i < keyServers.size() - 1; ++i) { + KeyRangeRef shard(keyServers[i].key, keyServers[i + 1].key); + std::vector src; + std::vector dest; + decodeKeyServersValue(UIDtoTagMap, keyServers[i].value, src, dest); + + // The checkpoint request is sent to all replicas, in case any of them is unhealthy. + // An alternative is to choose a healthy replica. + const UID checkpointID = deterministicRandom()->randomUniqueID(); + for (int idx = 0; idx < src.size(); ++idx) { + CheckpointMetaData checkpoint(shard & range, format, src[idx], checkpointID); + checkpoint.setState(CheckpointMetaData::Pending); + tr->set(checkpointKeyFor(checkpointID), checkpointValue(checkpoint)); + } + + TraceEvent("CreateCheckpointTransactionShard") + .detail("Shard", shard.toString()) + .detail("SrcServers", describe(src)) + .detail("ServerSelected", describe(src)) + .detail("CheckpointKey", checkpointKeyFor(checkpointID)) + .detail("ReadVersion", tr->getReadVersion().get()); + } + + return Void(); +} + +Future createCheckpoint(Reference tr, KeyRangeRef range, CheckpointFormat format) { + return holdWhile(tr, createCheckpointImpl(tr, range, format)); +} + +Future createCheckpoint(Transaction* tr, KeyRangeRef range, CheckpointFormat format) { + return createCheckpointImpl(tr, range, format); +} + +// Gets CheckpointMetaData of the specific keyrange, version and format from one of the storage servers, if none of the +// servers have the checkpoint, a checkpoint_not_found error is returned. 
+ACTOR static Future getCheckpointMetaDataInternal(GetCheckpointRequest req, + Reference alternatives, + double timeout) { + TraceEvent("GetCheckpointMetaDataInternalBegin") + .detail("Range", req.range.toString()) + .detail("Version", req.version) + .detail("Format", static_cast(req.format)) + .detail("Locations", alternatives->description()); + + state std::vector>> fs; + state int i = 0; + for (i = 0; i < alternatives->size(); ++i) { + // For each shard, all storage servers are checked, only one is required. + fs.push_back(errorOr(timeoutError(alternatives->getInterface(i).checkpoint.getReply(req), timeout))); + } + + state Optional error; + wait(waitForAll(fs)); + TraceEvent("GetCheckpointMetaDataInternalWaitEnd") + .detail("Range", req.range.toString()) + .detail("Version", req.version); + + for (i = 0; i < fs.size(); ++i) { + if (!fs[i].isReady()) { + error = timed_out(); + TraceEvent("GetCheckpointMetaDataInternalSSTimeout") + .detail("Range", req.range.toString()) + .detail("Version", req.version) + .detail("StorageServer", alternatives->getInterface(i).uniqueID); + continue; + } + + if (fs[i].get().isError()) { + const Error& e = fs[i].get().getError(); + TraceEvent("GetCheckpointMetaDataInternalError") + .errorUnsuppressed(e) + .detail("Range", req.range.toString()) + .detail("Version", req.version) + .detail("StorageServer", alternatives->getInterface(i).uniqueID); + if (e.code() != error_code_checkpoint_not_found || !error.present()) { + error = e; + } + } else { + return fs[i].get().get(); + } + } + + ASSERT(error.present()); + throw error.get(); +} + +ACTOR Future> getCheckpointMetaData(Database cx, + KeyRange keys, + Version version, + CheckpointFormat format, + double timeout) { + state Span span("NAPI:GetCheckpoint"_loc); + + loop { + TraceEvent("GetCheckpointBegin") + .detail("Range", keys.toString()) + .detail("Version", version) + .detail("Format", static_cast(format)); + + state std::vector> fs; + state int i = 0; + + try { + state 
std::vector>> locations = + wait(getKeyRangeLocations(cx, + keys, + CLIENT_KNOBS->TOO_MANY, + Reverse::False, + &StorageServerInterface::checkpoint, + span.context, + Optional(), + UseProvisionalProxies::False)); + + fs.clear(); + for (i = 0; i < locations.size(); ++i) { + fs.push_back(getCheckpointMetaDataInternal( + GetCheckpointRequest(version, keys, format), locations[i].second, timeout)); + TraceEvent("GetCheckpointShardBegin") + .detail("Range", locations[i].first.toString()) + .detail("Version", version) + .detail("StorageServers", locations[i].second->description()); + } + + choose { + when(wait(cx->connectionFileChanged())) { cx->invalidateCache(keys); } + when(wait(waitForAll(fs))) { break; } + when(wait(delay(timeout))) { + TraceEvent("GetCheckpointTimeout").detail("Range", keys.toString()).detail("Version", version); + } + } + } catch (Error& e) { + TraceEvent("GetCheckpointError").errorUnsuppressed(e).detail("Range", keys.toString()); + if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || + e.code() == error_code_connection_failed || e.code() == error_code_broken_promise) { + cx->invalidateCache(keys); + wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY)); + } else { + throw; + } + } + } + + std::vector res; + for (i = 0; i < fs.size(); ++i) { + TraceEvent("GetCheckpointShardEnd").detail("Checkpoint", fs[i].get().toString()); + res.push_back(fs[i].get()); + } + return res; +} + ACTOR Future checkSafeExclusions(Database cx, std::vector exclusions) { TraceEvent("ExclusionSafetyCheckBegin") .detail("NumExclusion", exclusions.size()) diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index fd3052d638..9bbe1073a7 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -490,6 +490,25 @@ int64_t extractIntOption(Optional value, // states: coordinator, TLog and storage state ACTOR Future snapCreate(Database cx, Standalone snapCmd, UID snapUID); +// Adds necessary 
mutation(s) to the transaction, so that *one* checkpoint will be created for +// each and every shards overlapping with `range`. Each checkpoint will be created at a random +// storage server for each shard. +// All checkpoint(s) will be created at the transaction's commit version. +Future createCheckpoint(Transaction* tr, KeyRangeRef range, CheckpointFormat format); + +// Same as above. +Future createCheckpoint(Reference tr, KeyRangeRef range, CheckpointFormat format); + +// Gets checkpoint metadata for `keys` at the specific version, with the particular format. +// One CheckpointMetaData will be returned for each distinctive shard. +// The collective keyrange of the returned checkpoint(s) is a super-set of `keys`. +// checkpoint_not_found() error will be returned if the specific checkpoint(s) cannot be found. +ACTOR Future> getCheckpointMetaData(Database cx, + KeyRange keys, + Version version, + CheckpointFormat format, + double timeout = 5.0); + // Checks with Data Distributor that it is safe to mark all servers in exclusions as failed ACTOR Future checkSafeExclusions(Database cx, std::vector exclusions); diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index bbfdcece5d..c87d9f5ce6 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -367,6 +367,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC, 0 ); // If true, enables dynamic adjustment of ROCKSDB_WRITE_RATE_LIMITER_BYTES according to the recent demand of background IO. init( ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE, true ); + init( DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY, "fdb"); + init( ROCKSDB_PERFCONTEXT_ENABLE, false ); if( randomize && BUGGIFY ) ROCKSDB_PERFCONTEXT_ENABLE = deterministicRandom()->coinflip() ? 
false : true; init( ROCKSDB_PERFCONTEXT_SAMPLE_RATE, 0.0001 ); init( ROCKSDB_MAX_SUBCOMPACTIONS, 2 ); @@ -678,6 +680,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( MAX_STORAGE_COMMIT_TIME, 120.0 ); //The max fsync stall time on the storage server and tlog before marking a disk as failed init( RANGESTREAM_LIMIT_BYTES, 2e6 ); if( randomize && BUGGIFY ) RANGESTREAM_LIMIT_BYTES = 1; init( ENABLE_CLEAR_RANGE_EAGER_READS, true ); + init( CHECKPOINT_TRANSFER_BLOCK_BYTES, 40e6 ); init( QUICK_GET_VALUE_FALLBACK, true ); init( QUICK_GET_KEY_VALUES_FALLBACK, true ); init( QUICK_GET_KEY_VALUES_LIMIT, 2000 ); diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index 532f70c19a..257746a0c5 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -298,6 +298,7 @@ public: bool ROCKSDB_READ_RANGE_REUSE_ITERATORS; int64_t ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC; bool ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE; + std::string DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY; bool ROCKSDB_PERFCONTEXT_ENABLE; // Enable rocks perf context metrics. May cause performance overhead double ROCKSDB_PERFCONTEXT_SAMPLE_RATE; int ROCKSDB_MAX_SUBCOMPACTIONS; @@ -617,6 +618,7 @@ public: bool ENABLE_CLEAR_RANGE_EAGER_READS; bool QUICK_GET_VALUE_FALLBACK; bool QUICK_GET_KEY_VALUES_FALLBACK; + int CHECKPOINT_TRANSFER_BLOCK_BYTES; int QUICK_GET_KEY_VALUES_LIMIT; int QUICK_GET_KEY_VALUES_LIMIT_BYTES; diff --git a/fdbclient/StorageCheckpoint.h b/fdbclient/StorageCheckpoint.h new file mode 100644 index 0000000000..7c83d71a3f --- /dev/null +++ b/fdbclient/StorageCheckpoint.h @@ -0,0 +1,88 @@ +/* + * StorageCheckpoint.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FDBCLIENT_STORAGCHECKPOINT_H +#define FDBCLIENT_STORAGCHECKPOINT_H +#pragma once + +#include "fdbclient/FDBTypes.h" + +// FDB storage checkpoint format. +enum CheckpointFormat { + InvalidFormat = 0, + // For RocksDB, checkpoint generated via rocksdb::Checkpoint::ExportColumnFamily(). + RocksDBColumnFamily = 1, + // For RocksDB, checkpoint generated via rocksdb::Checkpoint::CreateCheckpoint(). + RocksDB = 2, +}; + +// Metadata of a FDB checkpoint. +struct CheckpointMetaData { + enum CheckpointState { + InvalidState = 0, + Pending = 1, // Checkpoint creation pending. + Complete = 2, // Checkpoint is created and ready to be read. + Deleting = 3, // Checkpoint deletion requested. + Fail = 4, + }; + + constexpr static FileIdentifier file_identifier = 13804342; + Version version; + KeyRange range; + int16_t format; // CheckpointFormat. + UID ssID; // Storage server ID on which this checkpoint is created. + UID checkpointID; // A unique id for this checkpoint. + int16_t state; // CheckpointState. + int referenceCount; // A reference count on the checkpoint, it can only be deleted when this is 0. + int64_t gcTime; // Time to delete this checkpoint, a Unix timestamp in seconds. + + // A serialized metadata associated with format, this data can be understood by the corresponding KVS. 
+ Standalone serializedCheckpoint; + + CheckpointMetaData() : format(InvalidFormat), state(InvalidState), referenceCount(0) {} + CheckpointMetaData(KeyRange const& range, CheckpointFormat format, UID const& ssID, UID const& checkpointID) + : version(invalidVersion), range(range), format(format), ssID(ssID), checkpointID(checkpointID), state(Pending), + referenceCount(0) {} + CheckpointMetaData(Version version, KeyRange const& range, CheckpointFormat format, UID checkpointID) + : version(version), range(range), format(format), checkpointID(checkpointID), referenceCount(0) {} + + CheckpointState getState() const { return static_cast(state); } + + void setState(CheckpointState state) { this->state = static_cast(state); } + + CheckpointFormat getFormat() const { return static_cast(format); } + + void setFormat(CheckpointFormat format) { this->format = static_cast(format); } + + std::string toString() const { + std::string res = "Checkpoint MetaData:\nRange: " + range.toString() + "\nVersion: " + std::to_string(version) + + "\nFormat: " + std::to_string(format) + "\nServer: " + ssID.toString() + + "\nID: " + checkpointID.toString() + "\nState: " + std::to_string(static_cast(state)) + + "\n"; + return res; + } + + template + void serialize(Ar& ar) { + serializer(ar, version, range, format, state, checkpointID, ssID, gcTime, serializedCheckpoint); + } +}; + +#endif diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h index 75c9411f18..592d2dd167 100644 --- a/fdbclient/StorageServerInterface.h +++ b/fdbclient/StorageServerInterface.h @@ -24,6 +24,7 @@ #include #include "fdbclient/FDBTypes.h" +#include "fdbclient/StorageCheckpoint.h" #include "fdbrpc/Locality.h" #include "fdbrpc/QueueModel.h" #include "fdbrpc/fdbrpc.h" @@ -85,6 +86,8 @@ struct StorageServerInterface { RequestStream overlappingChangeFeeds; RequestStream changeFeedPop; RequestStream changeFeedVersionUpdate; + RequestStream checkpoint; + RequestStream fetchCheckpoint; explicit 
StorageServerInterface(UID uid) : uniqueID(uid) {} StorageServerInterface() : uniqueID(deterministicRandom()->randomUniqueID()) {} @@ -137,6 +140,9 @@ struct StorageServerInterface { RequestStream(getValue.getEndpoint().getAdjustedEndpoint(17)); changeFeedVersionUpdate = RequestStream( getValue.getEndpoint().getAdjustedEndpoint(18)); + checkpoint = RequestStream(getValue.getEndpoint().getAdjustedEndpoint(19)); + fetchCheckpoint = + RequestStream(getValue.getEndpoint().getAdjustedEndpoint(20)); } } else { ASSERT(Ar::isDeserializing); @@ -184,6 +190,8 @@ struct StorageServerInterface { streams.push_back(overlappingChangeFeeds.getReceiver()); streams.push_back(changeFeedPop.getReceiver()); streams.push_back(changeFeedVersionUpdate.getReceiver()); + streams.push_back(checkpoint.getReceiver()); + streams.push_back(fetchCheckpoint.getReceiver()); FlowTransport::transport().addEndpoints(streams); } }; @@ -816,6 +824,60 @@ struct ChangeFeedPopRequest { } }; +// Request to search for a checkpoint for a minimum keyrange: `range`, at the specific version, +// in the specific format. +// A CheckpointMetaData will be returned if the specific checkpoint is found. +struct GetCheckpointRequest { + constexpr static FileIdentifier file_identifier = 13804343; + Version version; // The FDB version at which the checkpoint is created. + KeyRange range; + int16_t format; // CheckpointFormat. + Optional checkpointID; // When present, look for the checkpoint with the exact UID. + ReplyPromise reply; + + GetCheckpointRequest() {} + GetCheckpointRequest(Version version, KeyRange const& range, CheckpointFormat format) + : version(version), range(range), format(format) {} + + template + void serialize(Ar& ar) { + serializer(ar, version, range, format, checkpointID, reply); + } +}; + +// Reply to FetchCheckpointRequest, transfers checkpoint back to client. 
+struct FetchCheckpointReply : public ReplyPromiseStreamReply { + constexpr static FileIdentifier file_identifier = 13804345; + Standalone token; // Serialized data specific to a particular checkpoint format. + Standalone data; + + FetchCheckpointReply() {} + FetchCheckpointReply(StringRef token) : token(token) {} + + int expectedSize() const { return data.expectedSize(); } + + template + void serialize(Ar& ar) { + serializer(ar, ReplyPromiseStreamReply::acknowledgeToken, ReplyPromiseStreamReply::sequence, token, data); + } +}; + +// Request to fetch checkpoint from a storage server. +struct FetchCheckpointRequest { + constexpr static FileIdentifier file_identifier = 13804344; + UID checkpointID; + Standalone token; // Serialized data specific to a particular checkpoint format. + ReplyPromiseStream reply; + + FetchCheckpointRequest() = default; + FetchCheckpointRequest(UID checkpointID, StringRef token) : checkpointID(checkpointID), token(token) {} + + template + void serialize(Ar& ar) { + serializer(ar, checkpointID, token, reply); + } +}; + struct OverlappingChangeFeedEntry { Key rangeId; KeyRange range; diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 4d1bc23574..9d1329f98b 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -215,6 +215,33 @@ const KeyRangeRef writeConflictRangeKeysRange = const KeyRef clusterIdKey = LiteralStringRef("\xff/clusterId"); +const KeyRef checkpointPrefix = "\xff/checkpoint/"_sr; + +const Key checkpointKeyFor(UID checkpointID) { + BinaryWriter wr(Unversioned()); + wr.serializeBytes(checkpointPrefix); + wr << checkpointID; + return wr.toValue(); +} + +const Value checkpointValue(const CheckpointMetaData& checkpoint) { + return ObjectWriter::toValue(checkpoint, IncludeVersion()); +} + +UID decodeCheckpointKey(const KeyRef& key) { + UID checkpointID; + BinaryReader rd(key.removePrefix(checkpointPrefix), Unversioned()); + rd >> checkpointID; + return checkpointID; +} + +CheckpointMetaData 
decodeCheckpointValue(const ValueRef& value) { + CheckpointMetaData checkpoint; + ObjectReader reader(value.begin(), IncludeVersion()); + reader.deserialize(checkpoint); + return checkpoint; +} + // "\xff/cacheServer/[[UID]] := StorageServerInterface" const KeyRangeRef storageCacheServerKeys(LiteralStringRef("\xff/cacheServer/"), LiteralStringRef("\xff/cacheServer0")); const KeyRef storageCacheServersPrefix = storageCacheServerKeys.begin; diff --git a/fdbclient/SystemData.h b/fdbclient/SystemData.h index a7aa733f65..228c058d77 100644 --- a/fdbclient/SystemData.h +++ b/fdbclient/SystemData.h @@ -70,6 +70,13 @@ void decodeKeyServersValue(std::map const& tag_uid, extern const KeyRef clusterIdKey; +// "\xff/checkpoint/[[UID]] := [[CheckpointMetaData]]" +extern const KeyRef checkpointPrefix; +const Key checkpointKeyFor(UID checkpointID); +const Value checkpointValue(const CheckpointMetaData& checkpoint); +UID decodeCheckpointKey(const KeyRef& key); +CheckpointMetaData decodeCheckpointValue(const ValueRef& value); + // "\xff/storageCacheServer/[[UID]] := StorageServerInterface" // This will be added by the cache server on initialization and removed by DD // TODO[mpilman]: We will need a way to map uint16_t ids to UIDs in a future diff --git a/fdbserver/ApplyMetadataMutation.cpp b/fdbserver/ApplyMetadataMutation.cpp index 471652dcda..5afc862b92 100644 --- a/fdbserver/ApplyMetadataMutation.cpp +++ b/fdbserver/ApplyMetadataMutation.cpp @@ -541,6 +541,29 @@ private: toCommit->writeTypedMessage(privatized); } + // Generates private mutations for the target storage server, instructing it to create a checkpoint. 
+ void checkSetCheckpointKeys(MutationRef m) { + if (!m.param1.startsWith(checkpointPrefix)) { + return; + } + if (toCommit) { + CheckpointMetaData checkpoint = decodeCheckpointValue(m.param2); + Tag tag = decodeServerTagValue(txnStateStore->readValue(serverTagKeyFor(checkpoint.ssID)).get().get()); + MutationRef privatized = m; + privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena); + TraceEvent("SendingPrivateMutationCheckpoint", dbgid) + .detail("Original", m) + .detail("Privatized", privatized) + .detail("Server", checkpoint.ssID) + .detail("TagKey", serverTagKeyFor(checkpoint.ssID)) + .detail("Tag", tag.toString()) + .detail("Checkpoint", checkpoint.toString()); + + toCommit->addTag(tag); + toCommit->writeTypedMessage(privatized); + } + } + void checkSetOtherKeys(MutationRef m) { if (initialCommit) return; @@ -1081,6 +1104,7 @@ public: if (m.type == MutationRef::SetValue && isSystemKey(m.param1)) { checkSetKeyServersPrefix(m); checkSetServerKeysPrefix(m); + checkSetCheckpointKeys(m); checkSetServerTagsPrefix(m); checkSetStorageCachePrefix(m); checkSetCacheKeysPrefix(m); diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index 728b4f9ab1..f548fba0b5 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -50,6 +50,10 @@ set(FDBSERVER_SRCS KeyValueStoreMemory.actor.cpp KeyValueStoreRocksDB.actor.cpp KeyValueStoreSQLite.actor.cpp + ServerCheckpoint.actor.cpp + ServerCheckpoint.actor.h + RocksDBCheckpointUtils.actor.cpp + RocksDBCheckpointUtils.actor.h Knobs.h LatencyBandConfig.cpp LatencyBandConfig.h @@ -191,6 +195,7 @@ set(FDBSERVER_SRCS workloads/ChangeFeeds.actor.cpp workloads/DataDistributionMetrics.actor.cpp workloads/DataLossRecovery.actor.cpp + workloads/PhysicalShardMove.actor.cpp workloads/DDBalance.actor.cpp workloads/DDMetrics.actor.cpp workloads/DDMetricsExclude.actor.cpp diff --git a/fdbserver/IKeyValueStore.h b/fdbserver/IKeyValueStore.h index 6217cada9c..a295a55004 100644 --- a/fdbserver/IKeyValueStore.h +++ 
b/fdbserver/IKeyValueStore.h @@ -24,6 +24,22 @@ #include "fdbclient/FDBTypes.h" #include "fdbserver/Knobs.h" +#include "fdbclient/StorageCheckpoint.h" + +struct CheckpointRequest { + const Version version; // The FDB version at which the checkpoint is created. + const KeyRange range; // Keyrange this checkpoint must contain. + const CheckpointFormat format; + const UID checkpointID; + const std::string checkpointDir; // The local directory where the checkpoint file will be created. + + CheckpointRequest(const Version version, + const KeyRange& range, + const CheckpointFormat format, + const UID& id, + const std::string& checkpointDir) + : version(version), range(range), format(format), checkpointID(id), checkpointDir(checkpointDir) {} +}; class IClosable { public: @@ -87,6 +103,15 @@ public: virtual void enableSnapshot() {} + // Create a checkpoint. + virtual Future checkpoint(const CheckpointRequest& request) { throw not_implemented(); } + + // Restore from a checkpoint. + virtual Future restore(const std::vector& checkpoints) { throw not_implemented(); } + + // Delete a checkpoint. 
+ virtual Future deleteCheckpoint(const CheckpointMetaData& checkpoint) { throw not_implemented(); } + /* Concurrency contract Causal consistency: diff --git a/fdbserver/KeyValueStoreRocksDB.actor.cpp b/fdbserver/KeyValueStoreRocksDB.actor.cpp index 39ca3787ec..c157458d81 100644 --- a/fdbserver/KeyValueStoreRocksDB.actor.cpp +++ b/fdbserver/KeyValueStoreRocksDB.actor.cpp @@ -5,11 +5,21 @@ #include #include #include +#include #include +#include +#include +#include +#include +#include #include #include #include +#include +#include #include +#include + #include #include #include @@ -32,6 +42,8 @@ #endif // SSD_ROCKSDB_EXPERIMENTAL #include "fdbserver/IKeyValueStore.h" +#include "fdbserver/RocksDBCheckpointUtils.actor.h" + #include "flow/actorcompiler.h" // has to be last include #ifdef SSD_ROCKSDB_EXPERIMENTAL @@ -114,7 +126,10 @@ private: std::mutex mutex; }; using DB = rocksdb::DB*; +using CF = rocksdb::ColumnFamilyHandle*; +#define PERSIST_PREFIX "\xff\xff" +const KeyRef persistVersion = LiteralStringRef(PERSIST_PREFIX "Version"); const StringRef ROCKSDBSTORAGE_HISTOGRAM_GROUP = LiteralStringRef("RocksDBStorage"); const StringRef ROCKSDB_COMMIT_LATENCY_HISTOGRAM = LiteralStringRef("RocksDBCommitLatency"); const StringRef ROCKSDB_COMMIT_ACTION_HISTOGRAM = LiteralStringRef("RocksDBCommitAction"); @@ -134,6 +149,74 @@ const StringRef ROCKSDB_READRANGE_NEWITERATOR_HISTOGRAM = LiteralStringRef("Rock const StringRef ROCKSDB_READVALUE_GET_HISTOGRAM = LiteralStringRef("RocksDBReadValueGet"); const StringRef ROCKSDB_READPREFIX_GET_HISTOGRAM = LiteralStringRef("RocksDBReadPrefixGet"); +rocksdb::ExportImportFilesMetaData getMetaData(const CheckpointMetaData& checkpoint) { + rocksdb::ExportImportFilesMetaData metaData; + if (checkpoint.getFormat() != RocksDBColumnFamily) { + return metaData; + } + + RocksDBColumnFamilyCheckpoint rocksCF = getRocksCF(checkpoint); + metaData.db_comparator_name = rocksCF.dbComparatorName; + + for (const LiveFileMetaData& fileMetaData : 
rocksCF.sstFiles) { + rocksdb::LiveFileMetaData liveFileMetaData; + liveFileMetaData.size = fileMetaData.size; + liveFileMetaData.name = fileMetaData.name; + liveFileMetaData.file_number = fileMetaData.file_number; + liveFileMetaData.db_path = fileMetaData.db_path; + liveFileMetaData.smallest_seqno = fileMetaData.smallest_seqno; + liveFileMetaData.largest_seqno = fileMetaData.largest_seqno; + liveFileMetaData.smallestkey = fileMetaData.smallestkey; + liveFileMetaData.largestkey = fileMetaData.largestkey; + liveFileMetaData.num_reads_sampled = fileMetaData.num_reads_sampled; + liveFileMetaData.being_compacted = fileMetaData.being_compacted; + liveFileMetaData.num_entries = fileMetaData.num_entries; + liveFileMetaData.num_deletions = fileMetaData.num_deletions; + liveFileMetaData.temperature = static_cast(fileMetaData.temperature); + liveFileMetaData.oldest_blob_file_number = fileMetaData.oldest_blob_file_number; + liveFileMetaData.oldest_ancester_time = fileMetaData.oldest_ancester_time; + liveFileMetaData.file_creation_time = fileMetaData.file_creation_time; + liveFileMetaData.file_checksum = fileMetaData.file_checksum; + liveFileMetaData.file_checksum_func_name = fileMetaData.file_checksum_func_name; + liveFileMetaData.column_family_name = fileMetaData.column_family_name; + liveFileMetaData.level = fileMetaData.level; + metaData.files.push_back(liveFileMetaData); + } + + return metaData; +} + +void populateMetaData(CheckpointMetaData* checkpoint, const rocksdb::ExportImportFilesMetaData& metaData) { + RocksDBColumnFamilyCheckpoint rocksCF; + rocksCF.dbComparatorName = metaData.db_comparator_name; + for (const rocksdb::LiveFileMetaData& fileMetaData : metaData.files) { + LiveFileMetaData liveFileMetaData; + liveFileMetaData.size = fileMetaData.size; + liveFileMetaData.name = fileMetaData.name; + liveFileMetaData.file_number = fileMetaData.file_number; + liveFileMetaData.db_path = fileMetaData.db_path; + liveFileMetaData.smallest_seqno = fileMetaData.smallest_seqno; 
+ liveFileMetaData.largest_seqno = fileMetaData.largest_seqno; + liveFileMetaData.smallestkey = fileMetaData.smallestkey; + liveFileMetaData.largestkey = fileMetaData.largestkey; + liveFileMetaData.num_reads_sampled = fileMetaData.num_reads_sampled; + liveFileMetaData.being_compacted = fileMetaData.being_compacted; + liveFileMetaData.num_entries = fileMetaData.num_entries; + liveFileMetaData.num_deletions = fileMetaData.num_deletions; + liveFileMetaData.temperature = static_cast(fileMetaData.temperature); + liveFileMetaData.oldest_blob_file_number = fileMetaData.oldest_blob_file_number; + liveFileMetaData.oldest_ancester_time = fileMetaData.oldest_ancester_time; + liveFileMetaData.file_creation_time = fileMetaData.file_creation_time; + liveFileMetaData.file_checksum = fileMetaData.file_checksum; + liveFileMetaData.file_checksum_func_name = fileMetaData.file_checksum_func_name; + liveFileMetaData.column_family_name = fileMetaData.column_family_name; + liveFileMetaData.level = fileMetaData.level; + rocksCF.sstFiles.push_back(liveFileMetaData); + } + checkpoint->setFormat(RocksDBColumnFamily); + checkpoint->serializedCheckpoint = ObjectWriter::toValue(rocksCF, IncludeVersion()); +} + rocksdb::Slice toSlice(StringRef s) { return rocksdb::Slice(reinterpret_cast(s.begin()), s.size()); } @@ -219,12 +302,13 @@ rocksdb::ReadOptions getReadOptions() { } struct ReadIterator { + CF& cf; uint64_t index; // incrementing counter to uniquely identify read iterator. bool inUse; std::shared_ptr iter; double creationTime; - ReadIterator(uint64_t index, DB& db, rocksdb::ReadOptions& options) - : index(index), inUse(true), creationTime(now()), iter(db->NewIterator(options)) {} + ReadIterator(CF& cf, uint64_t index, DB& db, rocksdb::ReadOptions& options) + : cf(cf), index(index), inUse(true), creationTime(now()), iter(db->NewIterator(options, cf)) {} }; /* @@ -241,8 +325,8 @@ gets deleted as the ref count becomes 0. 
*/ class ReadIteratorPool { public: - ReadIteratorPool(DB& db, const std::string& path) - : db(db), index(0), iteratorsReuseCount(0), readRangeOptions(getReadOptions()) { + ReadIteratorPool(DB& db, CF& cf, const std::string& path) + : db(db), cf(cf), index(0), iteratorsReuseCount(0), readRangeOptions(getReadOptions()) { readRangeOptions.background_purge_on_iterator_cleanup = true; readRangeOptions.auto_prefix_mode = (SERVER_KNOBS->ROCKSDB_PREFIX_LEN > 0); TraceEvent("ReadIteratorPool") @@ -271,12 +355,12 @@ public: } } index++; - ReadIterator iter(index, db, readRangeOptions); + ReadIterator iter(cf, index, db, readRangeOptions); iteratorsMap.insert({ index, iter }); return iter; } else { index++; - ReadIterator iter(index, db, readRangeOptions); + ReadIterator iter(cf, index, db, readRangeOptions); return iter; } } @@ -316,6 +400,7 @@ private: std::unordered_map iteratorsMap; std::unordered_map::iterator it; DB& db; + CF& cf; rocksdb::ReadOptions readRangeOptions; std::mutex mutex; // incrementing counter for every new iterator creation, to uniquely identify the iterator in returnIterator(). @@ -735,10 +820,9 @@ Error statusToError(const rocksdb::Status& s) { } struct RocksDBKeyValueStore : IKeyValueStore { - using CF = rocksdb::ColumnFamilyHandle*; - struct Writer : IThreadPoolReceiver { DB& db; + CF& cf; UID id; std::shared_ptr rateLimiter; @@ -752,11 +836,12 @@ struct RocksDBKeyValueStore : IKeyValueStore { int threadIndex; explicit Writer(DB& db, + CF& cf, UID id, std::shared_ptr readIterPool, std::shared_ptr perfContextMetrics, int threadIndex) - : db(db), id(id), readIterPool(readIterPool), perfContextMetrics(perfContextMetrics), + : db(db), cf(cf), id(id), readIterPool(readIterPool), perfContextMetrics(perfContextMetrics), threadIndex(threadIndex), rateLimiter(SERVER_KNOBS->ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC > 0 ? 
rocksdb::NewGenericRateLimiter( @@ -814,40 +899,71 @@ struct RocksDBKeyValueStore : IKeyValueStore { double getTimeEstimate() const override { return SERVER_KNOBS->COMMIT_TIME_ESTIMATE; } }; void action(OpenAction& a) { - std::vector defaultCF = { rocksdb::ColumnFamilyDescriptor{ - "default", getCFOptions() } }; - std::vector handle; - auto options = getOptions(); + ASSERT(cf == nullptr); + + std::vector columnFamilies; + rocksdb::Options options = getOptions(); + rocksdb::Status status = rocksdb::DB::ListColumnFamilies(options, a.path, &columnFamilies); + if (std::find(columnFamilies.begin(), columnFamilies.end(), "default") == columnFamilies.end()) { + columnFamilies.push_back("default"); + } + + rocksdb::ColumnFamilyOptions cfOptions = getCFOptions(); + std::vector descriptors; + for (const std::string& name : columnFamilies) { + descriptors.push_back(rocksdb::ColumnFamilyDescriptor{ name, cfOptions }); + } + options.listeners.push_back(a.errorListener); if (SERVER_KNOBS->ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC > 0) { options.rate_limiter = rateLimiter; } - auto status = rocksdb::DB::Open(options, a.path, defaultCF, &handle, &db); + + std::vector handles; + status = rocksdb::DB::Open(options, a.path, descriptors, &handles, &db); + if (!status.ok()) { logRocksDBError(status, "Open"); a.done.sendError(statusToError(status)); + return; + } + + for (rocksdb::ColumnFamilyHandle* handle : handles) { + if (handle->GetName() == SERVER_KNOBS->DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY) { + cf = handle; + break; + } + } + + if (cf == nullptr) { + status = db->CreateColumnFamily(cfOptions, SERVER_KNOBS->DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY, &cf); + if (!status.ok()) { + logRocksDBError(status, "Open"); + a.done.sendError(statusToError(status)); + } + } + + TraceEvent(SevInfo, "RocksDB") + .detail("Path", a.path) + .detail("Method", "Open") + .detail("KnobRocksDBWriteRateLimiterBytesPerSec", + SERVER_KNOBS->ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC) + 
.detail("KnobRocksDBWriteRateLimiterAutoTune", SERVER_KNOBS->ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE) + .detail("ColumnFamily", cf->GetName()); + if (g_network->isSimulated()) { + // The current thread and main thread are same when the code runs in simulation. + // blockUntilReady() is getting the thread into deadlock state, so directly calling + // the metricsLogger. + a.metrics = rocksDBMetricLogger(options.statistics, perfContextMetrics, db, readIterPool) && + flowLockLogger(a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool); } else { - TraceEvent(SevInfo, "RocksDB") - .detail("Path", a.path) - .detail("Method", "Open") - .detail("KnobRocksDBWriteRateLimiterBytesPerSec", - SERVER_KNOBS->ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC) - .detail("KnobRocksDBWriteRateLimiterAutoTune", SERVER_KNOBS->ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE); - if (g_network->isSimulated()) { - // The current thread and main thread are same when the code runs in simulation. - // blockUntilReady() is getting the thread into deadlock state, so directly calling - // the metricsLogger. 
+ onMainThread([&] { a.metrics = rocksDBMetricLogger(options.statistics, perfContextMetrics, db, readIterPool) && flowLockLogger(a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool); - } else { - onMainThread([&] { - a.metrics = rocksDBMetricLogger(options.statistics, perfContextMetrics, db, readIterPool) && - flowLockLogger(a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool); - return Future(true); - }).blockUntilReady(); - } - a.done.send(Void()); + return Future(true); + }).blockUntilReady(); } + a.done.send(Void()); } struct DeleteVisitor : public rocksdb::WriteBatch::Handler { @@ -863,6 +979,26 @@ struct RocksDBKeyValueStore : IKeyValueStore { deletes.push_back_deep(arena, kr); return rocksdb::Status::OK(); } + + rocksdb::Status PutCF(uint32_t column_family_id, + const rocksdb::Slice& key, + const rocksdb::Slice& value) override { + return rocksdb::Status::OK(); + } + + rocksdb::Status DeleteCF(uint32_t column_family_id, const rocksdb::Slice& key) override { + return rocksdb::Status::OK(); + } + + rocksdb::Status SingleDeleteCF(uint32_t column_family_id, const rocksdb::Slice& key) override { + return rocksdb::Status::OK(); + } + + rocksdb::Status MergeCF(uint32_t column_family_id, + const rocksdb::Slice& key, + const rocksdb::Slice& value) override { + return rocksdb::Status::OK(); + } }; struct CommitAction : TypedAction { @@ -894,7 +1030,12 @@ struct RocksDBKeyValueStore : IKeyValueStore { } Standalone> deletes; DeleteVisitor dv(deletes, deletes.arena()); - ASSERT(a.batchToCommit->Iterate(&dv).ok()); + rocksdb::Status s = a.batchToCommit->Iterate(&dv); + if (!s.ok()) { + logRocksDBError(s, "CommitDeleteVisitor"); + a.done.sendError(statusToError(s)); + return; + } // If there are any range deletes, we should have added them to be deleted. 
ASSERT(!deletes.empty() || !a.batchToCommit->HasDeleteRange()); rocksdb::WriteOptions options; @@ -906,7 +1047,7 @@ struct RocksDBKeyValueStore : IKeyValueStore { // Request for batchToCommit bytes. If this request cannot be satisfied, the call is blocked. rateLimiter->Request(a.batchToCommit->GetDataSize() /* bytes */, rocksdb::Env::IO_HIGH); } - auto s = db->Write(options, a.batchToCommit.get()); + s = db->Write(options, a.batchToCommit.get()); readIterPool->update(); if (a.getHistograms) { writeHistogram->sampleSeconds(timer_monotonic() - writeBeginTime); @@ -922,7 +1063,7 @@ struct RocksDBKeyValueStore : IKeyValueStore { for (const auto& keyRange : deletes) { auto begin = toSlice(keyRange.begin); auto end = toSlice(keyRange.end); - ASSERT(db->SuggestCompactRange(db->DefaultColumnFamily(), &begin, &end).ok()); + ASSERT(db->SuggestCompactRange(cf, &begin, &end).ok()); } if (a.getHistograms) { deleteCompactRangeHistogram->sampleSeconds(timer_monotonic() - compactRangeBeginTime); @@ -956,9 +1097,13 @@ struct RocksDBKeyValueStore : IKeyValueStore { logRocksDBError(s, "Close"); } if (a.deleteOnClose) { - std::vector defaultCF = { rocksdb::ColumnFamilyDescriptor{ - "default", getCFOptions() } }; - s = rocksdb::DestroyDB(a.path, getOptions(), defaultCF); + std::set columnFamilies{ "default" }; + columnFamilies.insert(SERVER_KNOBS->DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY); + std::vector descriptors; + for (const std::string name : columnFamilies) { + descriptors.push_back(rocksdb::ColumnFamilyDescriptor{ name, getCFOptions() }); + } + s = rocksdb::DestroyDB(a.path, getOptions(), descriptors); if (!s.ok()) { logRocksDBError(s, "Destroy"); } else { @@ -968,10 +1113,133 @@ struct RocksDBKeyValueStore : IKeyValueStore { TraceEvent("RocksDB").detail("Path", a.path).detail("Method", "Close"); a.done.send(Void()); } + + struct CheckpointAction : TypedAction { + CheckpointAction(const CheckpointRequest& request) : request(request) {} + + double getTimeEstimate() const override { 
return SERVER_KNOBS->COMMIT_TIME_ESTIMATE; } + + const CheckpointRequest request; + ThreadReturnPromise reply; + }; + + void action(CheckpointAction& a) { + TraceEvent("RocksDBServeCheckpointBegin", id) + .detail("MinVersion", a.request.version) + .detail("Range", a.request.range.toString()) + .detail("Format", static_cast(a.request.format)) + .detail("CheckpointDir", a.request.checkpointDir); + + rocksdb::Checkpoint* checkpoint; + rocksdb::Status s = rocksdb::Checkpoint::Create(db, &checkpoint); + if (!s.ok()) { + logRocksDBError(s, "Checkpoint"); + a.reply.sendError(statusToError(s)); + return; + } + + rocksdb::PinnableSlice value; + rocksdb::ReadOptions readOptions = getReadOptions(); + s = db->Get(readOptions, cf, toSlice(persistVersion), &value); + + if (!s.ok() && !s.IsNotFound()) { + logRocksDBError(s, "Checkpoint"); + a.reply.sendError(statusToError(s)); + return; + } + + const Version version = s.IsNotFound() + ? latestVersion + : BinaryReader::fromStringRef(toStringRef(value), Unversioned()); + + TraceEvent("RocksDBServeCheckpointVersion", id) + .detail("CheckpointVersion", a.request.version) + .detail("PersistVersion", version); + + // TODO: set the range as the actual shard range. 
+ CheckpointMetaData res(version, a.request.range, a.request.format, a.request.checkpointID); + const std::string& checkpointDir = a.request.checkpointDir; + + if (a.request.format == RocksDBColumnFamily) { + rocksdb::ExportImportFilesMetaData* pMetadata; + platform::eraseDirectoryRecursive(checkpointDir); + const std::string cwd = platform::getWorkingDirectory() + "/"; + s = checkpoint->ExportColumnFamily(cf, checkpointDir, &pMetadata); + + if (!s.ok()) { + logRocksDBError(s, "Checkpoint"); + a.reply.sendError(statusToError(s)); + return; + } + + populateMetaData(&res, *pMetadata); + delete pMetadata; + TraceEvent("RocksDBServeCheckpointSuccess", id) + .detail("CheckpointMetaData", res.toString()) + .detail("RocksDBCF", getRocksCF(res).toString()); + } else { + throw not_implemented(); + } + + res.setState(CheckpointMetaData::Complete); + a.reply.send(res); + } + + struct RestoreAction : TypedAction { + RestoreAction(const std::string& path, const std::vector& checkpoints) + : path(path), checkpoints(checkpoints) {} + + double getTimeEstimate() const override { return SERVER_KNOBS->COMMIT_TIME_ESTIMATE; } + + const std::string path; + const std::vector checkpoints; + ThreadReturnPromise done; + }; + + void action(RestoreAction& a) { + TraceEvent("RocksDBServeRestoreBegin", id).detail("Path", a.path); + + // TODO: Fail gracefully. 
+ ASSERT(!a.checkpoints.empty()); + + if (a.checkpoints[0].format == RocksDBColumnFamily) { + ASSERT_EQ(a.checkpoints.size(), 1); + TraceEvent("RocksDBServeRestoreCF", id) + .detail("Path", a.path) + .detail("Checkpoint", a.checkpoints[0].toString()) + .detail("RocksDBCF", getRocksCF(a.checkpoints[0]).toString()); + + auto options = getOptions(); + rocksdb::Status status = rocksdb::DB::Open(options, a.path, &db); + + if (!status.ok()) { + logRocksDBError(status, "Restore"); + a.done.sendError(statusToError(status)); + return; + } + + rocksdb::ExportImportFilesMetaData metaData = getMetaData(a.checkpoints[0]); + rocksdb::ImportColumnFamilyOptions importOptions; + importOptions.move_files = true; + status = db->CreateColumnFamilyWithImport( + getCFOptions(), SERVER_KNOBS->DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY, importOptions, metaData, &cf); + + if (!status.ok()) { + logRocksDBError(status, "Restore"); + a.done.sendError(statusToError(status)); + } else { + TraceEvent(SevInfo, "RocksDB").detail("Path", a.path).detail("Method", "Restore"); + a.done.send(Void()); + } + } else { + throw not_implemented(); + } + } }; struct Reader : IThreadPoolReceiver { DB& db; + CF& cf; double readValueTimeout; double readValuePrefixTimeout; double readRangeTimeout; @@ -992,10 +1260,12 @@ struct RocksDBKeyValueStore : IKeyValueStore { int threadIndex; explicit Reader(DB& db, + CF& cf, std::shared_ptr readIterPool, std::shared_ptr perfContextMetrics, int threadIndex) - : db(db), readIterPool(readIterPool), perfContextMetrics(perfContextMetrics), threadIndex(threadIndex), + : db(db), cf(cf), readIterPool(readIterPool), perfContextMetrics(perfContextMetrics), + threadIndex(threadIndex), readRangeLatencyHistogram(Histogram::getHistogram(ROCKSDBSTORAGE_HISTOGRAM_GROUP, ROCKSDB_READRANGE_LATENCY_HISTOGRAM, Histogram::Unit::microseconds)), @@ -1066,6 +1336,7 @@ struct RocksDBKeyValueStore : IKeyValueStore { double getTimeEstimate() const override { return SERVER_KNOBS->READ_VALUE_TIME_ESTIMATE; } 
}; void action(ReadValueAction& a) { + ASSERT(cf != nullptr); bool doPerfContextMetrics = SERVER_KNOBS->ROCKSDB_PERFCONTEXT_ENABLE && (deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_PERFCONTEXT_SAMPLE_RATE); @@ -1098,7 +1369,13 @@ struct RocksDBKeyValueStore : IKeyValueStore { options.deadline = std::chrono::duration_cast(deadlineSeconds); double dbGetBeginTime = a.getHistograms ? timer_monotonic() : 0; - auto s = db->Get(options, db->DefaultColumnFamily(), toSlice(a.key), &value); + auto s = db->Get(options, cf, toSlice(a.key), &value); + if (!s.ok() && !s.IsNotFound()) { + logRocksDBError(s, "ReadValue"); + a.result.sendError(statusToError(s)); + return; + } + if (a.getHistograms) { readValueGetHistogram->sampleSeconds(timer_monotonic() - dbGetBeginTime); } @@ -1175,7 +1452,7 @@ struct RocksDBKeyValueStore : IKeyValueStore { options.deadline = std::chrono::duration_cast(deadlineSeconds); double dbGetBeginTime = a.getHistograms ? timer_monotonic() : 0; - auto s = db->Get(options, db->DefaultColumnFamily(), toSlice(a.key), &value); + auto s = db->Get(options, cf, toSlice(a.key), &value); if (a.getHistograms) { readPrefixGetHistogram->sampleSeconds(timer_monotonic() - dbGetBeginTime); } @@ -1330,6 +1607,7 @@ struct RocksDBKeyValueStore : IKeyValueStore { DB db = nullptr; std::shared_ptr perfContextMetrics; std::string path; + rocksdb::ColumnFamilyHandle* defaultFdbCF = nullptr; UID id; Reference writeThread; Reference readThreads; @@ -1357,7 +1635,8 @@ struct RocksDBKeyValueStore : IKeyValueStore { Counters counters; explicit RocksDBKeyValueStore(const std::string& path, UID id) - : path(path), id(id), perfContextMetrics(new PerfContextMetrics()), readIterPool(new ReadIteratorPool(db, path)), + : path(path), id(id), perfContextMetrics(new PerfContextMetrics()), + readIterPool(new ReadIteratorPool(db, defaultFdbCF, path)), readSemaphore(SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX), fetchSemaphore(SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_SOFT_MAX), 
numReadWaiters(SERVER_KNOBS->ROCKSDB_READ_QUEUE_HARD_MAX - SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX), @@ -1381,11 +1660,11 @@ struct RocksDBKeyValueStore : IKeyValueStore { readThreads = createGenericThreadPool(); } writeThread->addThread( - new Writer(db, id, readIterPool, perfContextMetrics, SERVER_KNOBS->ROCKSDB_READ_PARALLELISM), + new Writer(db, defaultFdbCF, id, readIterPool, perfContextMetrics, SERVER_KNOBS->ROCKSDB_READ_PARALLELISM), "fdb-rocksdb-wr"); TraceEvent("RocksDBReadThreads").detail("KnobRocksDBReadParallelism", SERVER_KNOBS->ROCKSDB_READ_PARALLELISM); for (unsigned i = 0; i < SERVER_KNOBS->ROCKSDB_READ_PARALLELISM; ++i) { - readThreads->addThread(new Reader(db, readIterPool, perfContextMetrics, i), "fdb-rocksdb-re"); + readThreads->addThread(new Reader(db, defaultFdbCF, readIterPool, perfContextMetrics, i), "fdb-rocksdb-re"); } } @@ -1429,7 +1708,8 @@ struct RocksDBKeyValueStore : IKeyValueStore { if (writeBatch == nullptr) { writeBatch.reset(new rocksdb::WriteBatch()); } - writeBatch->Put(toSlice(kv.key), toSlice(kv.value)); + ASSERT(defaultFdbCF != nullptr); + writeBatch->Put(defaultFdbCF, toSlice(kv.key), toSlice(kv.value)); } void clear(KeyRangeRef keyRange, const Arena*) override { @@ -1437,10 +1717,12 @@ struct RocksDBKeyValueStore : IKeyValueStore { writeBatch.reset(new rocksdb::WriteBatch()); } + ASSERT(defaultFdbCF != nullptr); + if (keyRange.singleKeyRange()) { - writeBatch->Delete(toSlice(keyRange.begin)); + writeBatch->Delete(defaultFdbCF, toSlice(keyRange.begin)); } else { - writeBatch->DeleteRange(toSlice(keyRange.begin), toSlice(keyRange.end)); + writeBatch->DeleteRange(defaultFdbCF, toSlice(keyRange.begin), toSlice(keyRange.end)); } } @@ -1587,6 +1869,46 @@ struct RocksDBKeyValueStore : IKeyValueStore { return StorageBytes(free, total, live, free); } + + Future checkpoint(const CheckpointRequest& request) override { + auto a = new Writer::CheckpointAction(request); + + auto res = a->reply.getFuture(); + writeThread->post(a); + 
return res; + } + + Future restore(const std::vector& checkpoints) override { + auto a = new Writer::RestoreAction(path, checkpoints); + auto res = a->done.getFuture(); + writeThread->post(a); + return res; + } + + // Delete a checkpoint. + Future deleteCheckpoint(const CheckpointMetaData& checkpoint) override { + if (checkpoint.format == RocksDBColumnFamily) { + RocksDBColumnFamilyCheckpoint rocksCF; + ObjectReader reader(checkpoint.serializedCheckpoint.begin(), IncludeVersion()); + reader.deserialize(rocksCF); + + std::unordered_set dirs; + for (const LiveFileMetaData& file : rocksCF.sstFiles) { + dirs.insert(file.db_path); + } + for (const std::string dir : dirs) { + platform::eraseDirectoryRecursive(dir); + TraceEvent("DeleteCheckpointRemovedDir", id) + .detail("CheckpointID", checkpoint.checkpointID) + .detail("Dir", dir); + } + } else if (checkpoint.format == RocksDB) { + throw not_implemented(); + } else { + throw internal_error(); + } + return Void(); + } }; } // namespace @@ -1701,6 +2023,61 @@ TEST_CASE("noSim/fdbserver/KeyValueStoreRocksDB/RocksDBReopen") { return Void(); } +TEST_CASE("noSim/fdbserver/KeyValueStoreRocksDB/CheckpointRestore") { + state std::string cwd = platform::getWorkingDirectory() + "/"; + state std::string rocksDBTestDir = "rocksdb-kvstore-br-test-db"; + platform::eraseDirectoryRecursive(rocksDBTestDir); + + state IKeyValueStore* kvStore = new RocksDBKeyValueStore(rocksDBTestDir, deterministicRandom()->randomUniqueID()); + wait(kvStore->init()); + + kvStore->set({ LiteralStringRef("foo"), LiteralStringRef("bar") }); + wait(kvStore->commit(false)); + + Optional val = wait(kvStore->readValue(LiteralStringRef("foo"))); + ASSERT(Optional(LiteralStringRef("bar")) == val); + + platform::eraseDirectoryRecursive("checkpoint"); + state std::string checkpointDir = cwd + "checkpoint"; + + CheckpointRequest request( + latestVersion, allKeys, RocksDBColumnFamily, deterministicRandom()->randomUniqueID(), checkpointDir); + CheckpointMetaData 
metaData = wait(kvStore->checkpoint(request)); + + state std::string rocksDBRestoreDir = "rocksdb-kvstore-br-restore-db"; + platform::eraseDirectoryRecursive(rocksDBRestoreDir); + + state IKeyValueStore* kvStoreCopy = + new RocksDBKeyValueStore(rocksDBRestoreDir, deterministicRandom()->randomUniqueID()); + + std::vector checkpoints; + checkpoints.push_back(metaData); + wait(kvStoreCopy->restore(checkpoints)); + + Optional val = wait(kvStoreCopy->readValue(LiteralStringRef("foo"))); + ASSERT(Optional(LiteralStringRef("bar")) == val); + + std::vector> closes; + closes.push_back(kvStore->onClosed()); + closes.push_back(kvStoreCopy->onClosed()); + kvStore->close(); + kvStoreCopy->close(); + wait(waitForAll(closes)); + + platform::eraseDirectoryRecursive(rocksDBTestDir); + platform::eraseDirectoryRecursive(rocksDBRestoreDir); + + return Void(); +} + +TEST_CASE("noSim/fdbserver/KeyValueStoreRocksDB/RocksDBTypes") { + // If the following assertion fails, update SstFileMetaData and LiveFileMetaData in RocksDBCheckpointUtils.actor.h + // to be the same as rocksdb::SstFileMetaData and rocksdb::LiveFileMetaData. + ASSERT_EQ(sizeof(rocksdb::LiveFileMetaData), 184); + ASSERT_EQ(sizeof(rocksdb::ExportImportFilesMetaData), 32); + return Void(); +} + } // namespace #endif // SSD_ROCKSDB_EXPERIMENTAL diff --git a/fdbserver/RocksDBCheckpointUtils.actor.cpp b/fdbserver/RocksDBCheckpointUtils.actor.cpp new file mode 100644 index 0000000000..612f8b1f20 --- /dev/null +++ b/fdbserver/RocksDBCheckpointUtils.actor.cpp @@ -0,0 +1,283 @@ +/* + *RocksDBCheckpointUtils.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbserver/RocksDBCheckpointUtils.actor.h" + +#include "fdbclient/FDBTypes.h" +#include "fdbclient/NativeAPI.actor.h" +#include "fdbclient/StorageCheckpoint.h" +#include "flow/Trace.h" +#include "flow/flow.h" + +#include "flow/actorcompiler.h" // has to be last include + +namespace { + +class RocksDBCheckpointReader : public ICheckpointReader { +public: + RocksDBCheckpointReader(const CheckpointMetaData& checkpoint, UID logID) + : checkpoint_(checkpoint), id_(logID), file_(Reference()), offset_(0) {} + + Future init(StringRef token) override; + + Future nextKeyValues(const int rowLimit, const int byteLimit) override { throw not_implemented(); } + + // Returns the next chunk of serialized checkpoint. + Future> nextChunk(const int byteLimit) override; + + Future close() override; + +private: + ACTOR static Future doInit(RocksDBCheckpointReader* self) { + ASSERT(self != nullptr); + try { + state Reference _file = wait(IAsyncFileSystem::filesystem()->open( + self->path_, IAsyncFile::OPEN_READONLY | IAsyncFile::OPEN_UNCACHED | IAsyncFile::OPEN_NO_AIO, 0)); + self->file_ = _file; + TraceEvent("RocksDBCheckpointReaderOpenFile").detail("File", self->path_); + } catch (Error& e) { + TraceEvent(SevWarnAlways, "ServerGetCheckpointFileFailure") + .errorUnsuppressed(e) + .detail("File", self->path_); + throw e; + } + + return Void(); + } + + ACTOR static Future> getNextChunk(RocksDBCheckpointReader* self, int byteLimit) { + int blockSize = std::min(64 * 1024, byteLimit); // Block size read from disk. 
+ state Standalone buf = makeAlignedString(_PAGE_SIZE, blockSize); + int bytesRead = wait(self->file_->read(mutateString(buf), blockSize, self->offset_)); + if (bytesRead == 0) { + throw end_of_stream(); + } + + self->offset_ += bytesRead; + return buf.substr(0, bytesRead); + } + + ACTOR static Future doClose(RocksDBCheckpointReader* self) { + wait(delay(0, TaskPriority::FetchKeys)); + delete self; + return Void(); + } + + CheckpointMetaData checkpoint_; + UID id_; + Reference file_; + int offset_; + std::string path_; +}; + +Future RocksDBCheckpointReader::init(StringRef token) { + ASSERT_EQ(this->checkpoint_.getFormat(), RocksDBColumnFamily); + const std::string name = token.toString(); + this->offset_ = 0; + this->path_.clear(); + const RocksDBColumnFamilyCheckpoint rocksCF = getRocksCF(this->checkpoint_); + for (const auto& sstFile : rocksCF.sstFiles) { + if (sstFile.name == name) { + this->path_ = sstFile.db_path + sstFile.name; + break; + } + } + + if (this->path_.empty()) { + TraceEvent("RocksDBCheckpointReaderInitFileNotFound").detail("File", this->path_); + return checkpoint_not_found(); + } + + return doInit(this); +} + +Future> RocksDBCheckpointReader::nextChunk(const int byteLimit) { + return getNextChunk(this, byteLimit); +} + +Future RocksDBCheckpointReader::close() { + return doClose(this); +} + +// Fetch a single sst file from storage server. If the file is fetched successfully, it will be recorded via cFun. +ACTOR Future fetchCheckpointFile(Database cx, + std::shared_ptr metaData, + int idx, + std::string dir, + std::function(const CheckpointMetaData&)> cFun, + int maxRetries = 3) { + state RocksDBColumnFamilyCheckpoint rocksCF; + ObjectReader reader(metaData->serializedCheckpoint.begin(), IncludeVersion()); + reader.deserialize(rocksCF); + + // Skip fetched file. 
+ if (rocksCF.sstFiles[idx].fetched && rocksCF.sstFiles[idx].db_path == dir) { + return Void(); + } + + state std::string remoteFile = rocksCF.sstFiles[idx].name; + state std::string localFile = dir + rocksCF.sstFiles[idx].name; + state UID ssID = metaData->ssID; + + state Transaction tr(cx); + state StorageServerInterface ssi; + loop { + try { + Optional ss = wait(tr.get(serverListKeyFor(ssID))); + if (!ss.present()) { + throw checkpoint_not_found(); + } + ssi = decodeServerListValue(ss.get()); + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + + state int attempt = 0; + loop { + try { + ++attempt; + TraceEvent("FetchCheckpointFileBegin") + .detail("RemoteFile", remoteFile) + .detail("TargetUID", ssID.toString()) + .detail("StorageServer", ssi.id().toString()) + .detail("LocalFile", localFile) + .detail("Attempt", attempt); + + wait(IAsyncFileSystem::filesystem()->deleteFile(localFile, true)); + const int64_t flags = IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE | IAsyncFile::OPEN_READWRITE | + IAsyncFile::OPEN_CREATE | IAsyncFile::OPEN_UNCACHED | IAsyncFile::OPEN_NO_AIO; + state int64_t offset = 0; + state Reference asyncFile = wait(IAsyncFileSystem::filesystem()->open(localFile, flags, 0666)); + + state ReplyPromiseStream stream = + ssi.fetchCheckpoint.getReplyStream(FetchCheckpointRequest(metaData->checkpointID, remoteFile)); + TraceEvent("FetchCheckpointFileReceivingData") + .detail("RemoteFile", remoteFile) + .detail("TargetUID", ssID.toString()) + .detail("StorageServer", ssi.id().toString()) + .detail("LocalFile", localFile) + .detail("Attempt", attempt); + loop { + state FetchCheckpointReply rep = waitNext(stream.getFuture()); + wait(asyncFile->write(rep.data.begin(), rep.data.size(), offset)); + wait(asyncFile->flush()); + offset += rep.data.size(); + } + } catch (Error& e) { + if (e.code() != error_code_end_of_stream) { + TraceEvent("FetchCheckpointFileError") + .errorUnsuppressed(e) + .detail("RemoteFile", remoteFile) + .detail("StorageServer", 
ssi.toString()) + .detail("LocalFile", localFile) + .detail("Attempt", attempt); + if (attempt >= maxRetries) { + throw e; + } + } else { + wait(asyncFile->sync()); + int64_t fileSize = wait(asyncFile->size()); + TraceEvent("FetchCheckpointFileEnd") + .detail("RemoteFile", remoteFile) + .detail("StorageServer", ssi.toString()) + .detail("LocalFile", localFile) + .detail("Attempt", attempt) + .detail("DataSize", offset) + .detail("FileSize", fileSize); + rocksCF.sstFiles[idx].db_path = dir; + rocksCF.sstFiles[idx].fetched = true; + metaData->serializedCheckpoint = ObjectWriter::toValue(rocksCF, IncludeVersion()); + if (cFun) { + wait(cFun(*metaData)); + } + return Void(); + } + } + } +} + +} // namespace + +ACTOR Future fetchRocksDBCheckpoint(Database cx, + CheckpointMetaData initialState, + std::string dir, + std::function(const CheckpointMetaData&)> cFun) { + TraceEvent("FetchRocksCheckpointBegin") + .detail("InitialState", initialState.toString()) + .detail("CheckpointDir", dir); + + state std::shared_ptr metaData = std::make_shared(initialState); + + if (metaData->format == RocksDBColumnFamily) { + state RocksDBColumnFamilyCheckpoint rocksCF = getRocksCF(initialState); + TraceEvent("RocksDBCheckpointMetaData").detail("RocksCF", rocksCF.toString()); + + state int i = 0; + state std::vector> fs; + for (; i < rocksCF.sstFiles.size(); ++i) { + fs.push_back(fetchCheckpointFile(cx, metaData, i, dir, cFun)); + TraceEvent("GetCheckpointFetchingFile") + .detail("FileName", rocksCF.sstFiles[i].name) + .detail("Server", metaData->ssID.toString()); + } + wait(waitForAll(fs)); + } else { + throw not_implemented(); + } + + return *metaData; +} + +ACTOR Future deleteRocksCFCheckpoint(CheckpointMetaData checkpoint) { + ASSERT_EQ(checkpoint.getFormat(), RocksDBColumnFamily); + RocksDBColumnFamilyCheckpoint rocksCF = getRocksCF(checkpoint); + TraceEvent("DeleteRocksColumnFamilyCheckpoint", checkpoint.checkpointID) + .detail("CheckpointID", checkpoint.checkpointID) + 
.detail("RocksCF", rocksCF.toString()); + + state std::unordered_set dirs; + for (const LiveFileMetaData& file : rocksCF.sstFiles) { + dirs.insert(file.db_path); + } + + state std::unordered_set::iterator it = dirs.begin(); + for (; it != dirs.end(); ++it) { + const std::string dir = *it; + platform::eraseDirectoryRecursive(dir); + TraceEvent("DeleteCheckpointRemovedDir", checkpoint.checkpointID) + .detail("CheckpointID", checkpoint.checkpointID) + .detail("Dir", dir); + wait(delay(0, TaskPriority::FetchKeys)); + } + return Void(); +} + +ICheckpointReader* newRocksDBCheckpointReader(const CheckpointMetaData& checkpoint, UID logID) { + return new RocksDBCheckpointReader(checkpoint, logID); +} + +RocksDBColumnFamilyCheckpoint getRocksCF(const CheckpointMetaData& checkpoint) { + RocksDBColumnFamilyCheckpoint rocksCF; + ObjectReader reader(checkpoint.serializedCheckpoint.begin(), IncludeVersion()); + reader.deserialize(rocksCF); + return rocksCF; +} \ No newline at end of file diff --git a/fdbserver/RocksDBCheckpointUtils.actor.h b/fdbserver/RocksDBCheckpointUtils.actor.h new file mode 100644 index 0000000000..3d0c157bf3 --- /dev/null +++ b/fdbserver/RocksDBCheckpointUtils.actor.h @@ -0,0 +1,209 @@ +/* + *RocksDBCheckpointUtils.actor.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_ROCKSDB_CHECKPOINT_UTILS_ACTOR_G_H) +#define FDBSERVER_ROCKSDB_CHECKPOINT_UTILS_ACTOR_G_H +#include "fdbserver/RocksDBCheckpointUtils.actor.g.h" +#elif !defined(FDBSERVER_ROCKSDB_CHECKPOINT_UTILS_ACTOR_H) +#define FDBSERVER_ROCKSDB_CHECKPOINT_UTILS_ACTOR_H + +#include "fdbclient/NativeAPI.actor.h" +#include "fdbserver/ServerCheckpoint.actor.h" +#include "flow/flow.h" + +#include "flow/actorcompiler.h" // has to be last include + +// Copied from rocksdb/metadata.h, so that we can add serializer. +struct SstFileMetaData { + constexpr static FileIdentifier file_identifier = 3804347; + SstFileMetaData() + : size(0), file_number(0), smallest_seqno(0), largest_seqno(0), num_reads_sampled(0), being_compacted(false), + num_entries(0), num_deletions(0), temperature(0), oldest_blob_file_number(0), oldest_ancester_time(0), + file_creation_time(0) {} + + SstFileMetaData(const std::string& _file_name, + uint64_t _file_number, + const std::string& _path, + size_t _size, + uint64_t _smallest_seqno, + uint64_t _largest_seqno, + const std::string& _smallestkey, + const std::string& _largestkey, + uint64_t _num_reads_sampled, + bool _being_compacted, + int _temperature, + uint64_t _oldest_blob_file_number, + uint64_t _oldest_ancester_time, + uint64_t _file_creation_time, + std::string& _file_checksum, + std::string& _file_checksum_func_name) + : size(_size), name(_file_name), file_number(_file_number), db_path(_path), smallest_seqno(_smallest_seqno), + largest_seqno(_largest_seqno), smallestkey(_smallestkey), largestkey(_largestkey), + num_reads_sampled(_num_reads_sampled), being_compacted(_being_compacted), num_entries(0), num_deletions(0), + temperature(_temperature), oldest_blob_file_number(_oldest_blob_file_number), + oldest_ancester_time(_oldest_ancester_time), file_creation_time(_file_creation_time), + file_checksum(_file_checksum), file_checksum_func_name(_file_checksum_func_name) {} + + // File size in 
bytes. + size_t size; + // The name of the file. + std::string name; + // The id of the file. + uint64_t file_number; + // The full path where the file locates. + std::string db_path; + + uint64_t smallest_seqno; // Smallest sequence number in file. + uint64_t largest_seqno; // Largest sequence number in file. + std::string smallestkey; // Smallest user defined key in the file. + std::string largestkey; // Largest user defined key in the file. + uint64_t num_reads_sampled; // How many times the file is read. + bool being_compacted; // true if the file is currently being compacted. + + uint64_t num_entries; + uint64_t num_deletions; + + // This feature is experimental and subject to change. + int temperature; + + uint64_t oldest_blob_file_number; // The id of the oldest blob file + // referenced by the file. + // An SST file may be generated by compactions whose input files may + // in turn be generated by earlier compactions. The creation time of the + // oldest SST file that is the compaction ancestor of this file. + // The timestamp is provided SystemClock::GetCurrentTime(). + // 0 if the information is not available. + // + // Note: for TTL blob files, it contains the start of the expiration range. + uint64_t oldest_ancester_time; + // Timestamp when the SST file is created, provided by + // SystemClock::GetCurrentTime(). 0 if the information is not available. + uint64_t file_creation_time; + + // The checksum of a SST file, the value is decided by the file content and + // the checksum algorithm used for this SST file. The checksum function is + // identified by the file_checksum_func_name. If the checksum function is + // not specified, file_checksum is "0" by default. + std::string file_checksum; + + // The name of the checksum function used to generate the file checksum + // value. If file checksum is not enabled (e.g., sst_file_checksum_func is + // null), file_checksum_func_name is UnknownFileChecksumFuncName, which is + // "Unknown". 
+ std::string file_checksum_func_name; + + template + void serialize(Ar& ar) { + serializer(ar, + size, + name, + file_number, + db_path, + smallest_seqno, + largest_seqno, + smallestkey, + largestkey, + num_reads_sampled, + being_compacted, + num_entries, + num_deletions, + temperature, + oldest_blob_file_number, + oldest_ancester_time, + file_creation_time, + file_checksum, + file_checksum_func_name); + } +}; + +// Copied from rocksdb::LiveFileMetaData. +struct LiveFileMetaData : public SstFileMetaData { + constexpr static FileIdentifier file_identifier = 3804346; + std::string column_family_name; // Name of the column family + int level; // Level at which this file resides. + bool fetched; + LiveFileMetaData() : column_family_name(), level(0), fetched(false) {} + + template + void serialize(Ar& ar) { + serializer(ar, + SstFileMetaData::size, + SstFileMetaData::name, + SstFileMetaData::file_number, + SstFileMetaData::db_path, + SstFileMetaData::smallest_seqno, + SstFileMetaData::largest_seqno, + SstFileMetaData::smallestkey, + SstFileMetaData::largestkey, + SstFileMetaData::num_reads_sampled, + SstFileMetaData::being_compacted, + SstFileMetaData::num_entries, + SstFileMetaData::num_deletions, + SstFileMetaData::temperature, + SstFileMetaData::oldest_blob_file_number, + SstFileMetaData::oldest_ancester_time, + SstFileMetaData::file_creation_time, + SstFileMetaData::file_checksum, + SstFileMetaData::file_checksum_func_name, + column_family_name, + level, + fetched); + } +}; + +// Checkpoint metadata associated with RockDBColumnFamily format. +// Based on rocksdb::ExportImportFilesMetaData. 
+struct RocksDBColumnFamilyCheckpoint { + constexpr static FileIdentifier file_identifier = 13804346; + std::string dbComparatorName; + + std::vector sstFiles; + + CheckpointFormat format() const { return RocksDBColumnFamily; } + + std::string toString() const { + std::string res = "RocksDBColumnFamilyCheckpoint:\nSST Files:\n"; + for (const auto& file : sstFiles) { + res += file.db_path + file.name + "\n"; + } + return res; + } + + template + void serialize(Ar& ar) { + serializer(ar, dbComparatorName, sstFiles); + } +}; + +// Fetch the checkpoint file(s) to local dir, the checkpoint is specified by initialState. +// If cFun is provided, the fetch progress can be checkpointed, so that next time, the fetch process +// can be continued, in case of crash. +ACTOR Future fetchRocksDBCheckpoint(Database cx, + CheckpointMetaData initialState, + std::string dir, + std::function(const CheckpointMetaData&)> cFun); + +ACTOR Future deleteRocksCFCheckpoint(CheckpointMetaData checkpoint); + +ICheckpointReader* newRocksDBCheckpointReader(const CheckpointMetaData& checkpoint, UID logID); + +RocksDBColumnFamilyCheckpoint getRocksCF(const CheckpointMetaData& checkpoint); +#endif \ No newline at end of file diff --git a/fdbserver/ServerCheckpoint.actor.cpp b/fdbserver/ServerCheckpoint.actor.cpp new file mode 100644 index 0000000000..15047b62eb --- /dev/null +++ b/fdbserver/ServerCheckpoint.actor.cpp @@ -0,0 +1,67 @@ +/* + *ServerCheckpoint.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbserver/ServerCheckpoint.actor.h" +#include "fdbserver/RocksDBCheckpointUtils.actor.h" + +#include "flow/actorcompiler.h" // has to be last include + +ICheckpointReader* newCheckpointReader(const CheckpointMetaData& checkpoint, UID logID) { + if (checkpoint.getFormat() == RocksDBColumnFamily) { + return newRocksDBCheckpointReader(checkpoint, logID); + } else if (checkpoint.getFormat() == RocksDB) { + throw not_implemented(); + } else { + ASSERT(false); + } + + return nullptr; +} + +ACTOR Future deleteCheckpoint(CheckpointMetaData checkpoint) { + wait(delay(0, TaskPriority::FetchKeys)); + + if (checkpoint.getFormat() == RocksDBColumnFamily) { + wait(deleteRocksCFCheckpoint(checkpoint)); + } else if (checkpoint.getFormat() == RocksDB) { + throw not_implemented(); + } else { + ASSERT(false); + } + + return Void(); +} + +ACTOR Future fetchCheckpoint(Database cx, + CheckpointMetaData initialState, + std::string dir, + std::function(const CheckpointMetaData&)> cFun) { + state CheckpointMetaData result; + if (initialState.getFormat() == RocksDBColumnFamily) { + CheckpointMetaData _result = wait(fetchRocksDBCheckpoint(cx, initialState, dir, cFun)); + result = _result; + } else if (initialState.getFormat() == RocksDB) { + throw not_implemented(); + } else { + ASSERT(false); + } + + return result; +} \ No newline at end of file diff --git a/fdbserver/ServerCheckpoint.actor.h b/fdbserver/ServerCheckpoint.actor.h new file mode 100644 index 0000000000..b7a5a2d50c --- /dev/null +++ b/fdbserver/ServerCheckpoint.actor.h @@ -0,0 +1,66 @@ +/* + 
*ServerCheckpoint.actor.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_SERVER_CHECKPOINT_ACTOR_G_H) +#define FDBSERVER_SERVER_CHECKPOINT_ACTOR_G_H +#include "fdbserver/ServerCheckpoint.actor.g.h" +#elif !defined(FDBSERVER_SERVER_CHECKPOINT_ACTOR_H) +#define FDBSERVER_SERVER_CHECKPOINT_ACTOR_H + +#include "fdbclient/NativeAPI.actor.h" +#include "fdbclient/StorageCheckpoint.h" +#include "flow/flow.h" + +#include "flow/actorcompiler.h" // has to be last include + +// An ICheckpointReader can read the contents of a checkpoint created from a KV store, +// i.e., by IKeyValueStore::checkpoint(). +class ICheckpointReader { +public: + // `token` is a serialized object defined by each derived ICheckpointReader class, to specify the + // starting point for the underlying checkpoint. + virtual Future init(StringRef token) = 0; + + // Scans the checkpoint, and returns the key-value pairs. + virtual Future nextKeyValues(const int rowLimit, const int ByteLimit) = 0; + + // Returns the next chunk of the serialized checkpoint. 
+ virtual Future> nextChunk(const int ByteLimit) = 0; + + virtual Future close() = 0; + +protected: + virtual ~ICheckpointReader() {} +}; + +ICheckpointReader* newCheckpointReader(const CheckpointMetaData& checkpoint, UID logID); + +// Delete a checkpoint. +ACTOR Future deleteCheckpoint(CheckpointMetaData checkpoint); + +// Fetches a checkpoint to a local `dir`; `initialState` provides the checkpoint formats, location, restart point, etc. +// If cFun is provided, the progress can be checkpointed. +// Returns a CheckpointMetaData, which could contain KVS-specific results, e.g., the list of fetched checkpoint files. +ACTOR Future fetchCheckpoint(Database cx, + CheckpointMetaData initialState, + std::string dir, + std::function(const CheckpointMetaData&)> cFun = nullptr); +#endif \ No newline at end of file diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index d7a8dfb0a9..77a5b904b5 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -58,6 +58,7 @@ #include "fdbserver/MutationTracking.h" #include "fdbserver/RecoveryState.h" #include "fdbserver/StorageMetrics.h" +#include "fdbserver/ServerCheckpoint.actor.h" #include "fdbserver/ServerDBInfo.h" #include "fdbserver/TLogInterface.h" #include "fdbserver/WaitFailure.h" @@ -104,6 +105,46 @@ bool canReplyWith(Error e) { } } // namespace +#define PERSIST_PREFIX "\xff\xff" + +// Immutable +static const KeyValueRef persistFormat(LiteralStringRef(PERSIST_PREFIX "Format"), + LiteralStringRef("FoundationDB/StorageServer/1/4")); +static const KeyRangeRef persistFormatReadableRange(LiteralStringRef("FoundationDB/StorageServer/1/2"), + LiteralStringRef("FoundationDB/StorageServer/1/5")); +static const KeyRef persistID = LiteralStringRef(PERSIST_PREFIX "ID"); +static const KeyRef persistTssPairID = LiteralStringRef(PERSIST_PREFIX "tssPairID"); +static const KeyRef persistSSPairID = LiteralStringRef(PERSIST_PREFIX "ssWithTSSPairID"); +static const KeyRef 
persistTssQuarantine = LiteralStringRef(PERSIST_PREFIX "tssQ"); +static const KeyRef persistClusterIdKey = LiteralStringRef(PERSIST_PREFIX "clusterId"); + +// (Potentially) change with the durable version or when fetchKeys completes +static const KeyRef persistVersion = LiteralStringRef(PERSIST_PREFIX "Version"); +static const KeyRangeRef persistShardAssignedKeys = + KeyRangeRef(LiteralStringRef(PERSIST_PREFIX "ShardAssigned/"), LiteralStringRef(PERSIST_PREFIX "ShardAssigned0")); +static const KeyRangeRef persistShardAvailableKeys = + KeyRangeRef(LiteralStringRef(PERSIST_PREFIX "ShardAvailable/"), LiteralStringRef(PERSIST_PREFIX "ShardAvailable0")); +static const KeyRangeRef persistByteSampleKeys = + KeyRangeRef(LiteralStringRef(PERSIST_PREFIX "BS/"), LiteralStringRef(PERSIST_PREFIX "BS0")); +static const KeyRangeRef persistByteSampleSampleKeys = + KeyRangeRef(LiteralStringRef(PERSIST_PREFIX "BS/" PERSIST_PREFIX "BS/"), + LiteralStringRef(PERSIST_PREFIX "BS/" PERSIST_PREFIX "BS0")); +static const KeyRef persistLogProtocol = LiteralStringRef(PERSIST_PREFIX "LogProtocol"); +static const KeyRef persistPrimaryLocality = LiteralStringRef(PERSIST_PREFIX "PrimaryLocality"); +static const KeyRangeRef persistChangeFeedKeys = + KeyRangeRef(LiteralStringRef(PERSIST_PREFIX "RF/"), LiteralStringRef(PERSIST_PREFIX "RF0")); +static const KeyRangeRef persistTenantMapKeys = + KeyRangeRef(LiteralStringRef(PERSIST_PREFIX "TM/"), LiteralStringRef(PERSIST_PREFIX "TM0")); +// data keys are unmangled (but never start with PERSIST_PREFIX because they are always in allKeys) + +// Checkpoint related prefixes. 
+static const KeyRangeRef persistCheckpointKeys = + KeyRangeRef(LiteralStringRef(PERSIST_PREFIX "Checkpoint/"), LiteralStringRef(PERSIST_PREFIX "Checkpoint0")); +static const KeyRangeRef persistPendingCheckpointKeys = + KeyRangeRef(LiteralStringRef(PERSIST_PREFIX "PendingCheckpoint/"), + LiteralStringRef(PERSIST_PREFIX "PendingCheckpoint0")); +static const std::string rocksdbCheckpointDirPrefix = "/rockscheckpoints_"; + struct AddingShard : NonCopyable { KeyRange keys; Future fetchClient; // holds FetchKeys() actor @@ -241,6 +282,14 @@ struct StorageServerDisk { return storage->readRange(keys, rowLimit, byteLimit, type); } + Future checkpoint(const CheckpointRequest& request) { return storage->checkpoint(request); } + + Future restore(const std::vector& checkpoints) { return storage->restore(checkpoints); } + + Future deleteCheckpoint(const CheckpointMetaData& checkpoint) { + return storage->deleteCheckpoint(checkpoint); + } + KeyValueStoreType getKeyValueStoreType() const { return storage->getType(); } StorageBytes getStorageBytes() const { return storage->getStorageBytes(); } std::tuple getSize() const { return storage->getSize(); } @@ -418,6 +467,8 @@ private: std::unordered_map> watchMap; // keep track of server watches public: + std::map> pendingCheckpoints; // Pending checkpoint requests + std::unordered_map checkpoints; // Existing and deleting checkpoints TenantMap tenantMap; TenantPrefixIndex tenantPrefixIndex; @@ -1727,6 +1778,116 @@ ACTOR Future changeFeedPopQ(StorageServer* self, ChangeFeedPopRequest req) return Void(); } +// Finds a checkpoint. +ACTOR Future getCheckpointQ(StorageServer* self, GetCheckpointRequest req) { + // Wait until the desired version is durable. 
+ wait(self->durableVersion.whenAtLeast(req.version + 1)); + + TraceEvent(SevDebug, "ServeGetCheckpointVersionSatisfied", self->thisServerID) + .detail("Version", req.version) + .detail("Range", req.range.toString()) + .detail("Format", static_cast(req.format)); + + try { + std::unordered_map::iterator it = self->checkpoints.begin(); + for (; it != self->checkpoints.end(); ++it) { + const CheckpointMetaData& md = it->second; + if (md.version == req.version && md.format == req.format && md.range.contains(req.range) && + md.getState() == CheckpointMetaData::Complete) { + req.reply.send(md); + TraceEvent(SevDebug, "ServeGetCheckpointEnd", self->thisServerID).detail("Checkpoint", md.toString()); + break; + } + } + + if (it == self->checkpoints.end()) { + req.reply.sendError(checkpoint_not_found()); + } + } catch (Error& e) { + if (!canReplyWith(e)) { + throw; + } + req.reply.sendError(e); + } + return Void(); +} + +// Delete the checkpoint from disk, as well as all related persisted meta data. +ACTOR Future deleteCheckpointQ(StorageServer* self, Version version, CheckpointMetaData checkpoint) { + wait(self->durableVersion.whenAtLeast(version)); + + TraceEvent("DeleteCheckpointBegin", self->thisServerID).detail("Checkpoint", checkpoint.toString()); + + self->checkpoints.erase(checkpoint.checkpointID); + + try { + wait(deleteCheckpoint(checkpoint)); + } catch (Error& e) { + // TODO: Handle errors more gracefully. 
+ throw; + } + + state Key persistCheckpointKey(persistCheckpointKeys.begin.toString() + checkpoint.checkpointID.toString()); + state Key pendingCheckpointKey(persistPendingCheckpointKeys.begin.toString() + checkpoint.checkpointID.toString()); + Version version = self->data().getLatestVersion(); + auto& mLV = self->addVersionToMutationLog(version); + self->addMutationToMutationLog( + mLV, MutationRef(MutationRef::ClearRange, pendingCheckpointKey, keyAfter(pendingCheckpointKey))); + self->addMutationToMutationLog( + mLV, MutationRef(MutationRef::ClearRange, persistCheckpointKey, keyAfter(persistCheckpointKey))); + + return Void(); +} + +// Serves FetchCheckpointRequests. +ACTOR Future fetchCheckpointQ(StorageServer* self, FetchCheckpointRequest req) { + TraceEvent("ServeFetchCheckpointBegin", self->thisServerID) + .detail("CheckpointID", req.checkpointID) + .detail("Token", req.token); + + req.reply.setByteLimit(SERVER_KNOBS->CHECKPOINT_TRANSFER_BLOCK_BYTES); + + // Returns an error if the checkpoint cannot be found. 
+ const auto it = self->checkpoints.find(req.checkpointID); + if (it == self->checkpoints.end()) { + req.reply.sendError(checkpoint_not_found()); + TraceEvent("ServeFetchCheckpointNotFound", self->thisServerID).detail("CheckpointID", req.checkpointID); + return Void(); + } + + try { + state ICheckpointReader* reader = newCheckpointReader(it->second, deterministicRandom()->randomUniqueID()); + wait(reader->init(req.token)); + + loop { + state Standalone data = wait(reader->nextChunk(CLIENT_KNOBS->REPLY_BYTE_LIMIT)); + wait(req.reply.onReady()); + FetchCheckpointReply reply(req.token); + reply.data = data; + req.reply.send(reply); + } + } catch (Error& e) { + if (e.code() == error_code_end_of_stream) { + req.reply.sendError(end_of_stream()); + TraceEvent("ServeFetchCheckpointEnd", self->thisServerID) + .detail("CheckpointID", req.checkpointID) + .detail("Token", req.token); + } else { + TraceEvent(SevWarnAlways, "ServerFetchCheckpointFailure") + .errorUnsuppressed(e) + .detail("CheckpointID", req.checkpointID) + .detail("Token", req.token); + if (!canReplyWith(e)) { + throw e; + } + req.reply.sendError(e); + } + } + + wait(reader->close()); + return Void(); +} + ACTOR Future overlappingChangeFeedsQ(StorageServer* data, OverlappingChangeFeedsRequest req) { wait(delay(0)); wait(data->version.whenAtLeast(req.minVersion)); @@ -4165,38 +4326,6 @@ ACTOR Future tryGetRange(PromiseStream results, Transaction* } } -#define PERSIST_PREFIX "\xff\xff" - -// Immutable -static const KeyValueRef persistFormat(LiteralStringRef(PERSIST_PREFIX "Format"), - LiteralStringRef("FoundationDB/StorageServer/1/4")); -static const KeyRangeRef persistFormatReadableRange(LiteralStringRef("FoundationDB/StorageServer/1/2"), - LiteralStringRef("FoundationDB/StorageServer/1/5")); -static const KeyRef persistID = LiteralStringRef(PERSIST_PREFIX "ID"); -static const KeyRef persistTssPairID = LiteralStringRef(PERSIST_PREFIX "tssPairID"); -static const KeyRef persistSSPairID = 
LiteralStringRef(PERSIST_PREFIX "ssWithTSSPairID"); -static const KeyRef persistTssQuarantine = LiteralStringRef(PERSIST_PREFIX "tssQ"); -static const KeyRef persistClusterIdKey = LiteralStringRef(PERSIST_PREFIX "clusterId"); - -// (Potentially) change with the durable version or when fetchKeys completes -static const KeyRef persistVersion = LiteralStringRef(PERSIST_PREFIX "Version"); -static const KeyRangeRef persistShardAssignedKeys = - KeyRangeRef(LiteralStringRef(PERSIST_PREFIX "ShardAssigned/"), LiteralStringRef(PERSIST_PREFIX "ShardAssigned0")); -static const KeyRangeRef persistShardAvailableKeys = - KeyRangeRef(LiteralStringRef(PERSIST_PREFIX "ShardAvailable/"), LiteralStringRef(PERSIST_PREFIX "ShardAvailable0")); -static const KeyRangeRef persistByteSampleKeys = - KeyRangeRef(LiteralStringRef(PERSIST_PREFIX "BS/"), LiteralStringRef(PERSIST_PREFIX "BS0")); -static const KeyRangeRef persistByteSampleSampleKeys = - KeyRangeRef(LiteralStringRef(PERSIST_PREFIX "BS/" PERSIST_PREFIX "BS/"), - LiteralStringRef(PERSIST_PREFIX "BS/" PERSIST_PREFIX "BS0")); -static const KeyRef persistLogProtocol = LiteralStringRef(PERSIST_PREFIX "LogProtocol"); -static const KeyRef persistPrimaryLocality = LiteralStringRef(PERSIST_PREFIX "PrimaryLocality"); -static const KeyRangeRef persistChangeFeedKeys = - KeyRangeRef(LiteralStringRef(PERSIST_PREFIX "RF/"), LiteralStringRef(PERSIST_PREFIX "RF0")); -static const KeyRangeRef persistTenantMapKeys = - KeyRangeRef(LiteralStringRef(PERSIST_PREFIX "TM/"), LiteralStringRef(PERSIST_PREFIX "TM0")); -// data keys are unmangled (but never start with PERSIST_PREFIX because they are always in allKeys) - ACTOR Future fetchChangeFeedApplier(StorageServer* data, Reference changeFeedInfo, Key rangeId, @@ -5064,9 +5193,11 @@ public: } if (m.param1.startsWith(systemKeys.end)) { - if ((m.type == MutationRef::SetValue) && m.param1.substr(1).startsWith(storageCachePrefix)) + if ((m.type == MutationRef::SetValue) && 
m.param1.substr(1).startsWith(storageCachePrefix)) { applyPrivateCacheData(data, m); - else { + } else if ((m.type == MutationRef::SetValue) && m.param1.substr(1).startsWith(checkpointPrefix)) { + registerPendingCheckpoint(data, m, ver); + } else { applyPrivateData(data, m); } } else { @@ -5353,6 +5484,24 @@ private: ASSERT(false); // Unknown private mutation } } + + // Registers a pending checkpoint request; it will be fulfilled when the desired version is durable. + void registerPendingCheckpoint(StorageServer* data, const MutationRef& m, Version ver) { + CheckpointMetaData checkpoint = decodeCheckpointValue(m.param2); + ASSERT(checkpoint.getState() == CheckpointMetaData::Pending); + const UID checkpointID = decodeCheckpointKey(m.param1.substr(1)); + checkpoint.version = ver; + data->pendingCheckpoints[ver].push_back(checkpoint); + + auto& mLV = data->addVersionToMutationLog(ver); + const Key pendingCheckpointKey(persistPendingCheckpointKeys.begin.toString() + checkpointID.toString()); + data->addMutationToMutationLog( + mLV, MutationRef(MutationRef::SetValue, pendingCheckpointKey, checkpointValue(checkpoint))); + + TraceEvent("RegisterPendingCheckpoint", data->thisServerID) + .detail("Key", pendingCheckpointKey) + .detail("Checkpoint", checkpoint.toString()); + } }; void StorageServer::insertTenant(TenantNameRef tenantName, @@ -5413,8 +5562,8 @@ ACTOR Future tssDelayForever() { ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { state double start; try { - // If we are disk bound and durableVersion is very old, we need to block updates or we could run out of memory - // This is often referred to as the storage server e-brake (emergency brake) + // If we are disk bound and durableVersion is very old, we need to block updates or we could run out of + // memory. 
This is often referred to as the storage server e-brake (emergency brake) // We allow the storage server to make some progress between e-brake periods, referreed to as "overage", in // order to ensure that it advances desiredOldestVersion enough for updateStorage to make enough progress on @@ -5452,8 +5601,8 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { data->tssFaultInjectTime.get() < now()) { if (deterministicRandom()->random01() < 0.01) { TraceEvent(SevWarnAlways, "TSSInjectDelayForever", data->thisServerID).log(); - // small random chance to just completely get stuck here, each tss should eventually hit this in this - // mode + // small random chance to just completely get stuck here, each tss should eventually hit this in + // this mode wait(tssDelayForever()); } else { // otherwise pause for part of a second @@ -5550,14 +5699,14 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { } // Any fetchKeys which are ready to transition their shards to the adding,transferred state do so now. - // If there is an epoch end we skip this step, to increase testability and to prevent inserting a version in - // the middle of a rolled back version range. + // If there is an epoch end we skip this step, to increase testability and to prevent inserting a + // version in the middle of a rolled back version range. while (!hasPrivateData && !epochEnd && !data->readyFetchKeys.empty()) { auto fk = data->readyFetchKeys.back(); data->readyFetchKeys.pop_back(); fk.send(&fii); - // fetchKeys() would put the data it fetched into the fii. The thread will not return back to this actor - // until it was completed. + // fetchKeys() would put the data it fetched into the fii. The thread will not return back to this + // actor until it was completed. 
} for (auto& c : fii.changes) @@ -5566,9 +5715,10 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { wait(doEagerReads(data, &eager)); if (data->shardChangeCounter == changeCounter) break; - TEST(true); // A fetchKeys completed while we were doing this, so eager might be outdated. Read it again. - // SOMEDAY: Theoretically we could check the change counters of individual shards and retry the reads only - // selectively + TEST(true); // A fetchKeys completed while we were doing this, so eager might be outdated. Read it + // again. + // SOMEDAY: Theoretically we could check the change counters of individual shards and retry the reads + // only selectively eager = UpdateEagerReadInfo(); } data->eagerReadsLatencyHistogram->sampleSeconds(now() - start); @@ -5598,8 +5748,8 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { for (; mutationNum < pUpdate->mutations.size(); mutationNum++) { updater.applyMutation(data, pUpdate->mutations[mutationNum], pUpdate->version, true); mutationBytes += pUpdate->mutations[mutationNum].totalSize(); - // data->counters.mutationBytes or data->counters.mutations should not be updated because they should - // have counted when the mutations arrive from cursor initially. + // data->counters.mutationBytes or data->counters.mutations should not be updated because they + // should have counted when the mutations arrive from cursor initially. 
injectedChanges = true; if (mutationBytes > SERVER_KNOBS->DESIRED_UPDATE_BYTES) { mutationBytes = 0; @@ -5830,6 +5980,49 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { } } +ACTOR Future createCheckpoint(StorageServer* data, CheckpointMetaData metaData) { + ASSERT(metaData.ssID == data->thisServerID); + const CheckpointRequest req(metaData.version, + metaData.range, + static_cast(metaData.format), + metaData.checkpointID, + data->folder + rocksdbCheckpointDirPrefix + metaData.checkpointID.toString()); + state CheckpointMetaData checkpointResult; + try { + state CheckpointMetaData res = wait(data->storage.checkpoint(req)); + checkpointResult = res; + checkpointResult.ssID = data->thisServerID; + ASSERT(checkpointResult.getState() == CheckpointMetaData::Complete); + data->checkpoints[checkpointResult.checkpointID] = checkpointResult; + TraceEvent("StorageCreatedCheckpoint", data->thisServerID).detail("Checkpoint", checkpointResult.toString()); + } catch (Error& e) { + // If checkpoint creation fails, the failure is persisted. + checkpointResult = metaData; + checkpointResult.setState(CheckpointMetaData::Fail); + TraceEvent("StorageCreatedCheckpointFailure", data->thisServerID) + .detail("PendingCheckpoint", checkpointResult.toString()); + } + + // Persist the checkpoint meta data. + try { + Key pendingCheckpointKey(persistPendingCheckpointKeys.begin.toString() + + checkpointResult.checkpointID.toString()); + Key persistCheckpointKey(persistCheckpointKeys.begin.toString() + checkpointResult.checkpointID.toString()); + data->storage.clearRange(singleKeyRange(pendingCheckpointKey)); + data->storage.writeKeyValue(KeyValueRef(persistCheckpointKey, checkpointValue(checkpointResult))); + wait(data->storage.commit()); + } catch (Error& e) { + // If the checkpoint meta data is not persisted successfully, remove the checkpoint. 
+ TraceEvent("StorageCreateCheckpointPersistFailure", data->thisServerID) + .errorUnsuppressed(e) + .detail("Checkpoint", checkpointResult.toString()); + data->checkpoints.erase(checkpointResult.checkpointID); + data->actors.add(deleteCheckpointQ(data, metaData.version, checkpointResult)); + } + + return Void(); +} + ACTOR Future updateStorage(StorageServer* data) { loop { ASSERT(data->durableVersion.get() == data->storageVersion()); @@ -5851,6 +6044,33 @@ ACTOR Future updateStorage(StorageServer* data) { state Version desiredVersion = data->desiredOldestVersion.get(); state int64_t bytesLeft = SERVER_KNOBS->STORAGE_COMMIT_BYTES; + // Clean up stale checkpoint requests, this is not supposed to happen, since checkpoints are cleaned up on + // failures. This is kept as a safeguard. + while (!data->pendingCheckpoints.empty() && data->pendingCheckpoints.begin()->first <= startOldestVersion) { + for (int idx = 0; idx < data->pendingCheckpoints.begin()->second.size(); ++idx) { + auto& metaData = data->pendingCheckpoints.begin()->second[idx]; + data->actors.add(deleteCheckpointQ(data, startOldestVersion, metaData)); + TraceEvent(SevWarnAlways, "StorageStaleCheckpointRequest", data->thisServerID) + .detail("PendingCheckpoint", metaData.toString()) + .detail("DurableVersion", startOldestVersion); + } + data->pendingCheckpoints.erase(data->pendingCheckpoints.begin()); + } + + // Create checkpoint if the pending request version is within (startOldestVersion, desiredVersion]. + // Versions newer than the checkpoint version won't be committed before the checkpoint is created. 
+ state bool requireCheckpoint = false; + if (!data->pendingCheckpoints.empty()) { + const Version cVer = data->pendingCheckpoints.begin()->first; + if (cVer <= desiredVersion) { + TraceEvent("CheckpointVersionSatisfied", data->thisServerID) + .detail("DesiredVersion", desiredVersion) + .detail("CheckPointVersion", cVer); + desiredVersion = cVer; + requireCheckpoint = true; + } + } + // Write mutations to storage until we reach the desiredVersion or have written too much (bytesleft) state double beforeStorageUpdates = now(); loop { @@ -5920,13 +6140,24 @@ ACTOR Future updateStorage(StorageServer* data) { debug_advanceMinCommittedVersion(data->thisServerID, newOldestVersion); + if (requireCheckpoint) { + ASSERT(newOldestVersion == data->pendingCheckpoints.begin()->first); + std::vector> createCheckpoints; + for (int idx = 0; idx < data->pendingCheckpoints.begin()->second.size(); ++idx) { + createCheckpoints.push_back(createCheckpoint(data, data->pendingCheckpoints.begin()->second[idx])); + } + wait(waitForAll(createCheckpoints)); + data->pendingCheckpoints.erase(data->pendingCheckpoints.begin()); + requireCheckpoint = false; + } + if (newOldestVersion > data->rebootAfterDurableVersion) { TraceEvent("RebootWhenDurableTriggered", data->thisServerID) .detail("NewOldestVersion", newOldestVersion) .detail("RebootAfterDurableVersion", data->rebootAfterDurableVersion); - // To avoid brokenPromise error, which is caused by the sender of the durableInProgress (i.e., this process) - // never sets durableInProgress, we should set durableInProgress before send the please_reboot() error. - // Otherwise, in the race situation when storage server receives both reboot and + // To avoid brokenPromise error, which is caused by the sender of the durableInProgress (i.e., this + // process) never sets durableInProgress, we should set durableInProgress before send the + // please_reboot() error. 
Otherwise, in the race situation when storage server receives both reboot and // brokenPromise of durableInProgress, the worker of the storage server will die. // We will eventually end up with no worker for storage server role. // The data distributor's buildTeam() will get stuck in building a team @@ -5948,12 +6179,13 @@ ACTOR Future updateStorage(StorageServer* data) { } durableInProgress.send(Void()); - wait(delay(0, TaskPriority::UpdateStorage)); // Setting durableInProgess could cause the storage server to shut - // down, so delay to check for cancellation + wait(delay(0, TaskPriority::UpdateStorage)); // Setting durableInProgess could cause the storage server to + // shut down, so delay to check for cancellation // Taking and releasing the durableVersionLock ensures that no eager reads both begin before the commit was - // effective and are applied after we change the durable version. Also ensure that we have to lock while calling - // changeDurableVersion, because otherwise the latest version of mutableData might be partially loaded. + // effective and are applied after we change the durable version. Also ensure that we have to lock while + // calling changeDurableVersion, because otherwise the latest version of mutableData might be partially + // loaded. state double beforeSSDurableVersionUpdate = now(); wait(data->durableVersionLock.take()); data->popVersion(data->durableVersion.get() + 1); @@ -6029,6 +6261,23 @@ void setAvailableStatus(StorageServer* self, KeyRangeRef keys, bool available) { availableKeys.end, endAvailable ? LiteralStringRef("1") : LiteralStringRef("0"))); } + + // When a shard is moved out, delete all related checkpoints created for data move. 
+ if (!available) { + for (auto& [id, checkpoint] : self->checkpoints) { + if (checkpoint.range.intersects(keys)) { + Key persistCheckpointKey(persistCheckpointKeys.begin.toString() + checkpoint.checkpointID.toString()); + checkpoint.setState(CheckpointMetaData::Deleting); + self->addMutationToMutationLog( + mLV, MutationRef(MutationRef::SetValue, persistCheckpointKey, checkpointValue(checkpoint))); + } + self->actors.add(deleteCheckpointQ(self, mLV.version + 1, checkpoint)); + TraceEvent("SSDeleteCheckpointScheduled", self->thisServerID) + .detail("MovedOutRange", keys.toString()) + .detail("Checkpoint", checkpoint.toString()) + .detail("DeleteVersion", mLV.version + 1); + } + } } void setAssignedStatus(StorageServer* self, KeyRangeRef keys, bool nowAssigned) { @@ -6297,6 +6546,8 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor state Future fShardAssigned = storage->readRange(persistShardAssignedKeys); state Future fShardAvailable = storage->readRange(persistShardAvailableKeys); state Future fChangeFeeds = storage->readRange(persistChangeFeedKeys); + state Future fPendingCheckpoints = storage->readRange(persistPendingCheckpointKeys); + state Future fCheckpoints = storage->readRange(persistCheckpointKeys); state Future fTenantMap = storage->readRange(persistTenantMapKeys); state Promise byteSampleSampleRecovered; @@ -6307,7 +6558,8 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor TraceEvent("ReadingDurableState", data->thisServerID).log(); wait(waitForAll(std::vector{ fFormat, fID, fClusterID, ftssPairID, fssPairID, fTssQuarantine, fVersion, fLogProtocol, fPrimaryLocality })); - wait(waitForAll(std::vector{ fShardAssigned, fShardAvailable, fChangeFeeds, fTenantMap })); + wait(waitForAll( + std::vector{ fShardAssigned, fShardAvailable, fChangeFeeds, fPendingCheckpoints, fCheckpoints, fTenantMap })); wait(byteSampleSampleRecovered.getFuture()); TraceEvent("RestoringDurableState", data->thisServerID).log(); @@ 
-6347,8 +6599,8 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor } // It's a bit sketchy to rely on an untrusted storage engine to persist its quarantine state when the quarantine - // state means the storage engine already had a durability or correctness error, but it should get re-quarantined - // very quickly because of a mismatch if it starts trying to do things again + // state means the storage engine already had a durability or correctness error, but it should get + // re-quarantined very quickly because of a mismatch if it starts trying to do things again if (fTssQuarantine.get().present()) { TEST(true); // TSS restarted while quarantined data->tssInQuarantine = true; @@ -6373,6 +6625,25 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor data->setInitialVersion(version); data->bytesRestored += fVersion.get().expectedSize(); + state RangeResult pendingCheckpoints = fPendingCheckpoints.get(); + state int pCLoc; + for (pCLoc = 0; pCLoc < pendingCheckpoints.size(); ++pCLoc) { + CheckpointMetaData metaData = decodeCheckpointValue(pendingCheckpoints[pCLoc].value); + data->pendingCheckpoints[metaData.version].push_back(metaData); + wait(yield()); + } + + state RangeResult checkpoints = fCheckpoints.get(); + state int cLoc; + for (cLoc = 0; cLoc < checkpoints.size(); ++cLoc) { + CheckpointMetaData metaData = decodeCheckpointValue(checkpoints[cLoc].value); + data->checkpoints[metaData.checkpointID] = metaData; + if (metaData.getState() == CheckpointMetaData::Deleting) { + data->actors.add(deleteCheckpointQ(data, version, metaData)); + } + wait(yield()); + } + state RangeResult available = fShardAvailable.get(); data->bytesRestored += available.logicalSize(); state int availableLoc; @@ -6624,15 +6895,15 @@ ACTOR Future waitMetrics(StorageServerMetrics* self, WaitMetricsRequest re // SOMEDAY: validation! 
The changes here are possibly partial changes (we receive multiple // messages per - // update to our requested range). This means that the validation would have to occur after all - // the messages for one clear or set have been dispatched. + // update to our requested range). This means that the validation would have to occur after + // all the messages for one clear or set have been dispatched. /*StorageMetrics m = getMetrics( data, req.keys ); bool b = ( m.bytes != metrics.bytes || m.bytesPerKSecond != metrics.bytesPerKSecond || m.iosPerKSecond != metrics.iosPerKSecond ); if (b) { printf("keys: '%s' - '%s' @%p\n", printable(req.keys.begin).c_str(), printable(req.keys.end).c_str(), this); - printf("waitMetrics: desync %d (%lld %lld %lld) != (%lld %lld %lld); +(%lld %lld %lld)\n", b, - m.bytes, m.bytesPerKSecond, m.iosPerKSecond, metrics.bytes, metrics.bytesPerKSecond, + printf("waitMetrics: desync %d (%lld %lld %lld) != (%lld %lld %lld); +(%lld %lld %lld)\n", + b, m.bytes, m.bytesPerKSecond, m.iosPerKSecond, metrics.bytes, metrics.bytesPerKSecond, metrics.iosPerKSecond, c.bytes, c.bytesPerKSecond, c.iosPerKSecond); }*/ @@ -6641,8 +6912,8 @@ ACTOR Future waitMetrics(StorageServerMetrics* self, WaitMetricsRequest re } } catch (Error& e) { if (e.code() == error_code_actor_cancelled) - throw; // This is only cancelled when the main loop had exited...no need in this case to clean up - // self + throw; // This is only cancelled when the main loop had exited...no need in this case to clean + // up self error = e; break; } @@ -6822,8 +7093,8 @@ ACTOR Future serveGetValueRequests(StorageServer* self, FutureStreammodify(&TransactionLineage::operation) = TransactionLineage::Operation::GetValue; loop { GetValueRequest req = waitNext(getValue); - // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade - // before doing real work + // Warning: This code is executed at extremely high priority 
(TaskPriority::LoadBalancedEndpoint), so + // downgrade before doing real work if (req.debugID.present()) g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), @@ -6841,8 +7112,8 @@ ACTOR Future serveGetKeyValuesRequests(StorageServer* self, FutureStreamactors.add(self->readGuard(req, getKeyValuesQ)); } } @@ -6864,8 +7135,8 @@ ACTOR Future serveGetKeyValuesStreamRequests(StorageServer* self, FutureStream getKeyValuesStream) { loop { GetKeyValuesStreamRequest req = waitNext(getKeyValuesStream); - // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade - // before doing real work + // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so + // downgrade before doing real work // FIXME: add readGuard again self->actors.add(getKeyValuesStreamQ(self, req)); } @@ -6875,8 +7146,8 @@ ACTOR Future serveGetKeyRequests(StorageServer* self, FutureStreammodify(&TransactionLineage::operation) = TransactionLineage::Operation::GetKey; loop { GetKeyRequest req = waitNext(getKey); - // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade - // before doing real work + // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so + // downgrade before doing real work self->actors.add(self->readGuard(req, getKeyQ)); } } @@ -7067,8 +7338,8 @@ ACTOR Future reportStorageServerState(StorageServer* self) { ACTOR Future storageServerCore(StorageServer* self, StorageServerInterface ssi) { state Future doUpdate = Void(); - state bool updateReceived = - false; // true iff the current update() actor assigned to doUpdate has already received an update from the tlog + state bool updateReceived = false; // true iff the current update() actor assigned to doUpdate has already + // received an update from the tlog state double lastLoopTopTime = now(); state Future dbInfoChange = Void(); state 
Future checkLastUpdate = Void(); @@ -7134,8 +7405,8 @@ ACTOR Future storageServerCore(StorageServer* self, StorageServerInterface self->popVersion(self->durableVersion.get() + 1, true); } // If update() is waiting for results from the tlog, it might never get them, so needs to be - // cancelled. But if it is waiting later, cancelling it could cause problems (e.g. fetchKeys that - // already committed to transitioning to waiting state) + // cancelled. But if it is waiting later, cancelling it could cause problems (e.g. fetchKeys + // that already committed to transitioning to waiting state) if (!updateReceived) { doUpdate = Void(); } @@ -7178,6 +7449,17 @@ ACTOR Future storageServerCore(StorageServer* self, StorageServerInterface else doUpdate = update(self, &updateReceived); } + when(GetCheckpointRequest req = waitNext(ssi.checkpoint.getFuture())) { + if (!self->isReadable(req.range)) { + req.reply.sendError(wrong_shard_server()); + continue; + } else { + self->actors.add(getCheckpointQ(self, req)); + } + } + when(FetchCheckpointRequest req = waitNext(ssi.fetchCheckpoint.getFuture())) { + self->actors.add(fetchCheckpointQ(self, req)); + } when(wait(updateProcessStatsTimer)) { updateProcessStats(self); updateProcessStatsTimer = delay(SERVER_KNOBS->FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL); @@ -7190,16 +7472,17 @@ ACTOR Future storageServerCore(StorageServer* self, StorageServerInterface bool storageServerTerminated(StorageServer& self, IKeyValueStore* persistentData, Error const& e) { self.shuttingDown = true; - // Clearing shards shuts down any fetchKeys actors; these may do things on cancellation that are best done with self - // still valid + // Clearing shards shuts down any fetchKeys actors; these may do things on cancellation that are best done with + // self still valid self.shards.insert(allKeys, Reference()); - // Dispose the IKVS (destroying its data permanently) only if this shutdown is definitely permanent. Otherwise just - // close it. 
+ // Dispose the IKVS (destroying its data permanently) only if this shutdown is definitely permanent. Otherwise + // just close it. if (e.code() == error_code_please_reboot) { // do nothing. } else if (e.code() == error_code_worker_removed || e.code() == error_code_recruitment_failed) { - // SOMEDAY: could close instead of dispose if tss in quarantine gets removed so it could still be investigated? + // SOMEDAY: could close instead of dispose if tss in quarantine gets removed so it could still be + // investigated? persistentData->dispose(); } else { persistentData->close(); diff --git a/fdbserver/workloads/PhysicalShardMove.actor.cpp b/fdbserver/workloads/PhysicalShardMove.actor.cpp new file mode 100644 index 0000000000..333b7bfc79 --- /dev/null +++ b/fdbserver/workloads/PhysicalShardMove.actor.cpp @@ -0,0 +1,234 @@ +/* + *PhysicalShardMove.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "fdbclient/ManagementAPI.actor.h" +#include "fdbclient/NativeAPI.actor.h" +#include "fdbrpc/simulator.h" +#include "fdbserver/IKeyValueStore.h" +#include "fdbserver/ServerCheckpoint.actor.h" +#include "fdbserver/MoveKeys.actor.h" +#include "fdbserver/QuietDatabase.h" +#include "fdbserver/workloads/workloads.actor.h" +#include "flow/Error.h" +#include "flow/IRandom.h" +#include "flow/flow.h" +#include +#include + +#include "flow/actorcompiler.h" // This must be the last #include. + +namespace { +std::string printValue(const ErrorOr>& value) { + if (value.isError()) { + return value.getError().name(); + } + return value.get().present() ? value.get().get().toString() : "Value Not Found."; +} +} // namespace + +struct SSCheckpointWorkload : TestWorkload { + const bool enabled; + bool pass; + + SSCheckpointWorkload(WorkloadContext const& wcx) : TestWorkload(wcx), enabled(!clientId), pass(true) {} + + void validationFailed(ErrorOr> expectedValue, ErrorOr> actualValue) { + TraceEvent(SevError, "TestFailed") + .detail("ExpectedValue", printValue(expectedValue)) + .detail("ActualValue", printValue(actualValue)); + pass = false; + } + + std::string description() const override { return "SSCheckpoint"; } + + Future setup(Database const& cx) override { return Void(); } + + Future start(Database const& cx) override { + if (!enabled) { + return Void(); + } + return _start(this, cx); + } + + ACTOR Future _start(SSCheckpointWorkload* self, Database cx) { + state Key key = "TestKey"_sr; + state Key endKey = "TestKey0"_sr; + state Value oldValue = "TestValue"_sr; + + int ignore = wait(setDDMode(cx, 0)); + state Version version = wait(self->writeAndVerify(self, cx, key, oldValue)); + + // Create checkpoint. 
+ state Transaction tr(cx); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + state CheckpointFormat format = RocksDBColumnFamily; + loop { + try { + wait(createCheckpoint(&tr, KeyRangeRef(key, endKey), format)); + wait(tr.commit()); + version = tr.getCommittedVersion(); + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + + TraceEvent("TestCheckpointCreated") + .detail("Range", KeyRangeRef(key, endKey).toString()) + .detail("Version", version); + + // Fetch checkpoint meta data. + loop { + try { + state std::vector records = + wait(getCheckpointMetaData(cx, KeyRangeRef(key, endKey), version, format)); + break; + } catch (Error& e) { + TraceEvent("TestFetchCheckpointMetadataError") + .errorUnsuppressed(e) + .detail("Range", KeyRangeRef(key, endKey).toString()) + .detail("Version", version); + + // The checkpoint was just created, we don't expect this error. + ASSERT(e.code() != error_code_checkpoint_not_found); + } + } + + TraceEvent("TestCheckpointFetched") + .detail("Range", KeyRangeRef(key, endKey).toString()) + .detail("Version", version) + .detail("Shards", records.size()); + + state std::string pwd = platform::getWorkingDirectory(); + state std::string folder = pwd + "/checkpoints"; + platform::eraseDirectoryRecursive(folder); + ASSERT(platform::createDirectory(folder)); + + // Fetch checkpoint. 
+ state int i = 0; + for (; i < records.size(); ++i) { + loop { + TraceEvent("TestFetchingCheckpoint").detail("Checkpoint", records[i].toString()); + try { + state CheckpointMetaData record = wait(fetchCheckpoint(cx, records[0], folder)); + TraceEvent("TestCheckpointFetched").detail("Checkpoint", records[i].toString()); + break; + } catch (Error& e) { + TraceEvent("TestFetchCheckpointError") + .errorUnsuppressed(e) + .detail("Checkpoint", records[i].toString()); + wait(delay(1)); + } + } + } + + state std::string rocksDBTestDir = "rocksdb-kvstore-test-db"; + platform::eraseDirectoryRecursive(rocksDBTestDir); + + // Restore KVS. + state IKeyValueStore* kvStore = keyValueStoreRocksDB( + rocksDBTestDir, deterministicRandom()->randomUniqueID(), KeyValueStoreType::SSD_ROCKSDB_V1); + try { + wait(kvStore->restore(records)); + } catch (Error& e) { + TraceEvent(SevError, "TestRestoreCheckpointError") + .errorUnsuppressed(e) + .detail("Checkpoint", describe(records)); + } + + // Compare the keyrange between the original database and the one restored from checkpoint. + // For now, it should have been a single key. 
+ tr.reset(); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + loop { + try { + state RangeResult res = wait(tr.getRange(KeyRangeRef(key, endKey), CLIENT_KNOBS->TOO_MANY)); + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + + for (i = 0; i < res.size(); ++i) { + Optional value = wait(kvStore->readValue(res[i].key)); + ASSERT(value.present()); + ASSERT(value.get() == res[i].value); + } + + int ignore = wait(setDDMode(cx, 1)); + return Void(); + } + + ACTOR Future readAndVerify(SSCheckpointWorkload* self, + Database cx, + Key key, + ErrorOr> expectedValue) { + state Transaction tr(cx); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + loop { + try { + state Optional res = wait(timeoutError(tr.get(key), 30.0)); + const bool equal = !expectedValue.isError() && res == expectedValue.get(); + if (!equal) { + self->validationFailed(expectedValue, ErrorOr>(res)); + } + break; + } catch (Error& e) { + if (expectedValue.isError() && expectedValue.getError().code() == e.code()) { + break; + } + wait(tr.onError(e)); + } + } + + return Void(); + } + + ACTOR Future writeAndVerify(SSCheckpointWorkload* self, Database cx, Key key, Optional value) { + state Transaction tr(cx); + state Version version; + loop { + try { + if (value.present()) { + tr.set(key, value.get()); + } else { + tr.clear(key); + } + wait(timeoutError(tr.commit(), 30.0)); + version = tr.getCommittedVersion(); + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + + wait(self->readAndVerify(self, cx, key, value)); + + return version; + } + + Future check(Database const& cx) override { return pass; } + + void getMetrics(std::vector& m) override {} +}; + +WorkloadFactory SSCheckpointWorkloadFactory("SSCheckpointWorkload"); \ No newline at end of file diff --git a/flow/error_definitions.h b/flow/error_definitions.h index 5864a31563..b0a5a25a57 100755 --- a/flow/error_definitions.h +++ b/flow/error_definitions.h @@ -174,6 +174,7 @@ ERROR( blob_granule_no_ryw, 2036, "Blob 
Granule Read Transactions must be specif ERROR( blob_granule_not_materialized, 2037, "Blob Granule Read Transactions must be specified as ryw-disabled" ) ERROR( get_mapped_key_values_has_more, 2038, "getMappedRange does not support continuation for now" ) ERROR( get_mapped_range_reads_your_writes, 2039, "getMappedRange tries to read data that were previously written in the transaction" ) +ERROR( checkpoint_not_found, 2040, "Checkpoint not found" ) ERROR( incompatible_protocol_version, 2100, "Incompatible protocol version" ) ERROR( transaction_too_large, 2101, "Transaction exceeds byte limit" ) @@ -283,6 +284,7 @@ ERROR( snap_invalid_uid_string, 2509, "The given uid string is not a 32-length h // 4xxx Internal errors (those that should be generated only by bugs) are decimal 4xxx ERROR( unknown_error, 4000, "An unknown error occurred" ) // C++ exception not of type Error ERROR( internal_error, 4100, "An internal error occurred" ) +ERROR( not_implemented, 4200, "Not implemented yet" ) // clang-format on #undef ERROR diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1c6d535706..198ed4fb2d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -138,6 +138,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/CycleAndLock.toml) add_fdb_test(TEST_FILES fast/CycleTest.toml) add_fdb_test(TEST_FILES fast/ChangeFeeds.toml) + add_fdb_test(TEST_FILES fast/PhysicalShardMove.toml) add_fdb_test(TEST_FILES fast/DataLossRecovery.toml) add_fdb_test(TEST_FILES fast/EncryptionOps.toml) add_fdb_test(TEST_FILES fast/FuzzApiCorrectness.toml) diff --git a/tests/fast/PhysicalShardMove.toml b/tests/fast/PhysicalShardMove.toml new file mode 100644 index 0000000000..72d1f0331c --- /dev/null +++ b/tests/fast/PhysicalShardMove.toml @@ -0,0 +1,13 @@ +[configuration] +config = 'triple' +storageEngineType = 4 +processesPerMachine = 1 +coordinators = 3 +machineCount = 15 + +[[test]] +testTitle = 'PhysicalShardMove' +useDB = true + + [[test.workload]] + testName = 
'SSCheckpointWorkload' From a1dfd92a4356d1b67c32b78bbf47ff9df3b8b46a Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 15 Mar 2022 14:18:06 -0700 Subject: [PATCH 103/138] Fix merge conflict between tenants and checkpointing API. Avoid using .toString() in trace event details and rename (and move) a couple variables. --- fdbclient/NativeAPI.actor.cpp | 76 +++++++++++++++++------------------ 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index eb87aa26bd..9d1c60301c 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -7632,7 +7632,8 @@ ACTOR Future snapCreate(Database cx, Standalone snapCmd, UID sn ACTOR template static Future createCheckpointImpl(T tr, KeyRangeRef range, CheckpointFormat format) { - TraceEvent("CreateCheckpointTransactionBegin").detail("Range", range.toString()); + ASSERT(!tr->getTenant().present()); + TraceEvent("CreateCheckpointTransactionBegin").detail("Range", range); state RangeResult keyServers = wait(krmGetRanges(tr, keyServersPrefix, range)); ASSERT(!keyServers.more); @@ -7656,7 +7657,7 @@ static Future createCheckpointImpl(T tr, KeyRangeRef range, CheckpointForm } TraceEvent("CreateCheckpointTransactionShard") - .detail("Shard", shard.toString()) + .detail("Shard", shard) .detail("SrcServers", describe(src)) .detail("ServerSelected", describe(src)) .detail("CheckpointKey", checkpointKeyFor(checkpointID)) @@ -7680,46 +7681,44 @@ ACTOR static Future getCheckpointMetaDataInternal(GetCheckpo Reference alternatives, double timeout) { TraceEvent("GetCheckpointMetaDataInternalBegin") - .detail("Range", req.range.toString()) + .detail("Range", req.range) .detail("Version", req.version) .detail("Format", static_cast(req.format)) .detail("Locations", alternatives->description()); - state std::vector>> fs; - state int i = 0; - for (i = 0; i < alternatives->size(); ++i) { + state std::vector>> futures; + state int index = 0; + for (index = 
0; index < alternatives->size(); ++index) { // For each shard, all storage servers are checked, only one is required. - fs.push_back(errorOr(timeoutError(alternatives->getInterface(i).checkpoint.getReply(req), timeout))); + futures.push_back(errorOr(timeoutError(alternatives->getInterface(index).checkpoint.getReply(req), timeout))); } state Optional error; - wait(waitForAll(fs)); - TraceEvent("GetCheckpointMetaDataInternalWaitEnd") - .detail("Range", req.range.toString()) - .detail("Version", req.version); + wait(waitForAll(futures)); + TraceEvent("GetCheckpointMetaDataInternalWaitEnd").detail("Range", req.range).detail("Version", req.version); - for (i = 0; i < fs.size(); ++i) { - if (!fs[i].isReady()) { + for (index = 0; index < futures.size(); ++index) { + if (!futures[index].isReady()) { error = timed_out(); TraceEvent("GetCheckpointMetaDataInternalSSTimeout") - .detail("Range", req.range.toString()) + .detail("Range", req.range) .detail("Version", req.version) - .detail("StorageServer", alternatives->getInterface(i).uniqueID); + .detail("StorageServer", alternatives->getInterface(index).uniqueID); continue; } - if (fs[i].get().isError()) { - const Error& e = fs[i].get().getError(); + if (futures[index].get().isError()) { + const Error& e = futures[index].get().getError(); TraceEvent("GetCheckpointMetaDataInternalError") .errorUnsuppressed(e) - .detail("Range", req.range.toString()) + .detail("Range", req.range) .detail("Version", req.version) - .detail("StorageServer", alternatives->getInterface(i).uniqueID); + .detail("StorageServer", alternatives->getInterface(index).uniqueID); if (e.code() != error_code_checkpoint_not_found || !error.present()) { error = e; } } else { - return fs[i].get().get(); + return futures[index].get().get(); } } @@ -7733,6 +7732,8 @@ ACTOR Future> getCheckpointMetaData(Database cx, CheckpointFormat format, double timeout) { state Span span("NAPI:GetCheckpoint"_loc); + state int index = 0; + state std::vector> futures; loop { 
TraceEvent("GetCheckpointBegin") @@ -7740,42 +7741,41 @@ ACTOR Future> getCheckpointMetaData(Database cx, .detail("Version", version) .detail("Format", static_cast(format)); - state std::vector> fs; - state int i = 0; - try { - state std::vector>> locations = + state std::vector locations = wait(getKeyRangeLocations(cx, + Optional(), keys, CLIENT_KNOBS->TOO_MANY, Reverse::False, &StorageServerInterface::checkpoint, span.context, Optional(), - UseProvisionalProxies::False)); + UseProvisionalProxies::False, + latestVersion)); - fs.clear(); - for (i = 0; i < locations.size(); ++i) { - fs.push_back(getCheckpointMetaDataInternal( - GetCheckpointRequest(version, keys, format), locations[i].second, timeout)); + futures.clear(); + for (index = 0; index < locations.size(); ++index) { + futures.push_back(getCheckpointMetaDataInternal( + GetCheckpointRequest(version, keys, format), locations[index].locations, timeout)); TraceEvent("GetCheckpointShardBegin") - .detail("Range", locations[i].first.toString()) + .detail("Range", locations[index].range) .detail("Version", version) - .detail("StorageServers", locations[i].second->description()); + .detail("StorageServers", locations[index].locations->description()); } choose { - when(wait(cx->connectionFileChanged())) { cx->invalidateCache(keys); } - when(wait(waitForAll(fs))) { break; } + when(wait(cx->connectionFileChanged())) { cx->invalidateCache(KeyRef(), keys); } + when(wait(waitForAll(futures))) { break; } when(wait(delay(timeout))) { - TraceEvent("GetCheckpointTimeout").detail("Range", keys.toString()).detail("Version", version); + TraceEvent("GetCheckpointTimeout").detail("Range", keys).detail("Version", version); } } } catch (Error& e) { TraceEvent("GetCheckpointError").errorUnsuppressed(e).detail("Range", keys.toString()); if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || e.code() == error_code_connection_failed || e.code() == error_code_broken_promise) { - 
cx->invalidateCache(keys); + cx->invalidateCache(KeyRef(), keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY)); } else { throw; @@ -7784,9 +7784,9 @@ ACTOR Future> getCheckpointMetaData(Database cx, } std::vector res; - for (i = 0; i < fs.size(); ++i) { - TraceEvent("GetCheckpointShardEnd").detail("Checkpoint", fs[i].get().toString()); - res.push_back(fs[i].get()); + for (index = 0; index < futures.size(); ++index) { + TraceEvent("GetCheckpointShardEnd").detail("Checkpoint", futures[index].get().toString()); + res.push_back(futures[index].get()); } return res; } From db521f95955fc20792caa77d0d012b0d81c17146 Mon Sep 17 00:00:00 2001 From: He Liu Date: Tue, 15 Mar 2022 16:29:25 -0700 Subject: [PATCH 104/138] Skip PhysicalShardMoveTest if RocksDB is not enabled. --- tests/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 198ed4fb2d..6600c8eb01 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -138,7 +138,6 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/CycleAndLock.toml) add_fdb_test(TEST_FILES fast/CycleTest.toml) add_fdb_test(TEST_FILES fast/ChangeFeeds.toml) - add_fdb_test(TEST_FILES fast/PhysicalShardMove.toml) add_fdb_test(TEST_FILES fast/DataLossRecovery.toml) add_fdb_test(TEST_FILES fast/EncryptionOps.toml) add_fdb_test(TEST_FILES fast/FuzzApiCorrectness.toml) @@ -182,7 +181,8 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/WriteDuringReadClean.toml) add_fdb_test(TEST_FILES noSim/RandomUnitTests.toml UNIT) if (SSD_ROCKSDB_EXPERIMENTAL) - add_fdb_test(TEST_FILES noSim/KeyValueStoreRocksDBTest.toml IGNORE) # re-enable as needed for RocksDB. Breaks correctness tests if RocksDB is disabled. 
+ add_fdb_test(TEST_FILES noSim/KeyValueStoreRocksDBTest.toml) + add_fdb_test(TEST_FILES fast/PhysicalShardMove.toml) endif() add_fdb_test(TEST_FILES rare/CheckRelocation.toml) add_fdb_test(TEST_FILES rare/ClogUnclog.toml) From cbd381778e47f5da0a05cb36a29134d3e66a980d Mon Sep 17 00:00:00 2001 From: Xiaoge Su Date: Tue, 15 Mar 2022 16:28:59 -0700 Subject: [PATCH 105/138] Fix the includes in DataDistribution.actor.cpp Update the comment to re-trigger failed checks --- fdbserver/DataDistribution.actor.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index a012445d52..65f472bae7 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -19,17 +19,18 @@ */ #include -#include + +#include "fdbclient/DatabaseContext.h" #include "fdbclient/FDBOptions.g.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/Knobs.h" -#include "fdbclient/StorageServerInterface.h" -#include "fdbclient/SystemData.h" -#include "fdbclient/DatabaseContext.h" #include "fdbclient/ManagementAPI.actor.h" #include "fdbclient/RunTransaction.actor.h" +#include "fdbclient/StorageServerInterface.h" +#include "fdbclient/SystemData.h" #include "fdbrpc/Replication.h" #include "fdbserver/DataDistribution.actor.h" +#include "fdbserver/DDTeamCollection.h" #include "fdbserver/FDBExecHelper.actor.h" #include "fdbserver/IKeyValueStore.h" #include "fdbserver/Knobs.h" @@ -38,14 +39,14 @@ #include "fdbserver/ServerDBInfo.h" #include "fdbserver/TLogInterface.h" #include "fdbserver/WaitFailure.h" -#include "fdbserver/DDTeamCollection.h" #include "flow/ActorCollection.h" #include "flow/Arena.h" #include "flow/BooleanParam.h" +#include "flow/serialize.h" #include "flow/Trace.h" #include "flow/UnitTest.h" + #include "flow/actorcompiler.h" // This must be the last #include. 
-#include "flow/serialize.h" // Read keyservers, return unique set of teams ACTOR Future> getInitialDataDistribution(Database cx, From 66d71e107df5d9cd3759ab923f87deb7791582bc Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Wed, 16 Mar 2022 00:08:59 -0700 Subject: [PATCH 106/138] Move actorcompiler.h include to the end of includes --- fdbclient/DatabaseBackupAgent.actor.cpp | 3 ++- fdbclient/ManagementAPI.actor.cpp | 2 +- fdbclient/StatusClient.actor.cpp | 2 +- fdbclient/StorageServerInterface.h | 1 + fdbserver/Coordination.actor.cpp | 5 +++-- fdbserver/DiskQueue.actor.cpp | 2 +- fdbserver/FDBExecHelper.actor.cpp | 20 +++++++++++++++++ fdbserver/FDBExecHelper.actor.h | 22 ++++++++++++++++++- fdbserver/RestoreApplier.actor.cpp | 2 +- fdbserver/VersionedBTree.actor.cpp | 2 +- fdbserver/networktest.actor.cpp | 2 +- fdbserver/workloads/ChangeFeeds.actor.cpp | 2 +- .../workloads/ConsistencyCheck.actor.cpp | 2 +- fdbserver/workloads/Cycle.actor.cpp | 5 +++-- fdbserver/workloads/MiniCycle.actor.cpp | 2 +- .../workloads/StreamingRangeRead.actor.cpp | 2 +- fdbserver/workloads/UDPWorkload.actor.cpp | 3 ++- 17 files changed, 62 insertions(+), 17 deletions(-) diff --git a/fdbclient/DatabaseBackupAgent.actor.cpp b/fdbclient/DatabaseBackupAgent.actor.cpp index 7e41b58eb6..808707f91f 100644 --- a/fdbclient/DatabaseBackupAgent.actor.cpp +++ b/fdbclient/DatabaseBackupAgent.actor.cpp @@ -32,10 +32,11 @@ #include #include "fdbclient/ManagementAPI.actor.h" #include "fdbclient/KeyBackedTypes.h" -#include "flow/actorcompiler.h" // has to be last include #include #include +#include "flow/actorcompiler.h" // has to be last include + const Key DatabaseBackupAgent::keyAddPrefix = LiteralStringRef("add_prefix"); const Key DatabaseBackupAgent::keyRemovePrefix = LiteralStringRef("remove_prefix"); const Key DatabaseBackupAgent::keyRangeVersions = LiteralStringRef("range_versions"); diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 
058b5ae0a8..9f9f932d24 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -41,8 +41,8 @@ #include "flow/UnitTest.h" #include "fdbrpc/ReplicationPolicy.h" #include "fdbrpc/Replication.h" -#include "flow/actorcompiler.h" // This must be the last #include. #include "fdbclient/Schemas.h" +#include "flow/actorcompiler.h" // This must be the last #include. bool isInteger(const std::string& s) { if (s.empty()) diff --git a/fdbclient/StatusClient.actor.cpp b/fdbclient/StatusClient.actor.cpp index fc7a77b17a..a160892544 100644 --- a/fdbclient/StatusClient.actor.cpp +++ b/fdbclient/StatusClient.actor.cpp @@ -27,8 +27,8 @@ #include "fdbclient/json_spirit/json_spirit_writer_template.h" #include "fdbclient/json_spirit/json_spirit_reader_template.h" #include "fdbrpc/genericactors.actor.h" -#include "flow/actorcompiler.h" // has to be last include #include +#include "flow/actorcompiler.h" // has to be last include json_spirit::mValue readJSONStrictly(const std::string& s) { json_spirit::mValue val; diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h index 592d2dd167..0d10f3e8ac 100644 --- a/fdbclient/StorageServerInterface.h +++ b/fdbclient/StorageServerInterface.h @@ -94,6 +94,7 @@ struct StorageServerInterface { NetworkAddress address() const { return getValue.getEndpoint().getPrimaryAddress(); } NetworkAddress stableAddress() const { return getValue.getEndpoint().getStableAddress(); } Optional secondaryAddress() const { return getValue.getEndpoint().addresses.secondaryAddress; } + bool readyForTraffic; UID id() const { return uniqueID; } bool isTss() const { return tssPairID.present(); } std::string toString() const { return id().shortString(); } diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index b2814c106b..d974d78b7a 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -18,6 +18,8 @@ * limitations under the License. 
*/ +#include + #include "fdbclient/ConfigTransactionInterface.h" #include "fdbserver/CoordinationInterface.h" #include "fdbserver/ConfigNode.h" @@ -31,9 +33,8 @@ #include "flow/UnitTest.h" #include "flow/IndexedSet.h" #include "fdbclient/MonitorLeader.h" -#include "flow/actorcompiler.h" // This must be the last #include. #include "flow/network.h" -#include +#include "flow/actorcompiler.h" // This must be the last #include. // This module implements coordinationServer() and the interfaces in CoordinationInterface.h diff --git a/fdbserver/DiskQueue.actor.cpp b/fdbserver/DiskQueue.actor.cpp index a955bafa4f..6ab8012ce5 100644 --- a/fdbserver/DiskQueue.actor.cpp +++ b/fdbserver/DiskQueue.actor.cpp @@ -24,8 +24,8 @@ #include "fdbrpc/simulator.h" #include "flow/crc32c.h" #include "flow/genericactors.actor.h" -#include "flow/actorcompiler.h" // This must be the last #include. #include "flow/xxhash.h" +#include "flow/actorcompiler.h" // This must be the last #include. typedef bool (*compare_pages)(void*, void*); typedef int64_t loc_t; diff --git a/fdbserver/FDBExecHelper.actor.cpp b/fdbserver/FDBExecHelper.actor.cpp index 6dc359b2df..9b690d4d35 100644 --- a/fdbserver/FDBExecHelper.actor.cpp +++ b/fdbserver/FDBExecHelper.actor.cpp @@ -1,3 +1,23 @@ +/* + * FDBExecHelper.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + #if !defined(_WIN32) && !defined(__APPLE__) && !defined(__INTEL_COMPILER) #define BOOST_SYSTEM_NO_LIB #define BOOST_DATE_TIME_NO_LIB diff --git a/fdbserver/FDBExecHelper.actor.h b/fdbserver/FDBExecHelper.actor.h index d285b376e8..41d9fd80db 100644 --- a/fdbserver/FDBExecHelper.actor.h +++ b/fdbserver/FDBExecHelper.actor.h @@ -1,3 +1,23 @@ +/* + * FDBExecHelper.actor.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #pragma once #if defined(NO_INTELLISENSE) && !defined(FDBSERVER_EXEC_HELPER_ACTOR_G_H) #define FDBSERVER_EXEC_HELPER_ACTOR_G_H @@ -10,8 +30,8 @@ #include #include "flow/Arena.h" #include "flow/flow.h" -#include "flow/actorcompiler.h" #include "fdbclient/FDBTypes.h" +#include "flow/actorcompiler.h" // This must be the last #include. 
// execute/snapshot command takes two arguments: // param1 - represents the command type/name diff --git a/fdbserver/RestoreApplier.actor.cpp b/fdbserver/RestoreApplier.actor.cpp index 4dc3ea4b09..0a7791532a 100644 --- a/fdbserver/RestoreApplier.actor.cpp +++ b/fdbserver/RestoreApplier.actor.cpp @@ -33,8 +33,8 @@ #include "fdbserver/RestoreRoleCommon.actor.h" #include "fdbserver/RestoreApplier.actor.h" -#include "flow/actorcompiler.h" // This must be the last #include. #include "flow/network.h" +#include "flow/actorcompiler.h" // This must be the last #include. ACTOR static Future handleSendMutationVectorRequest(RestoreSendVersionedMutationsRequest req, Reference self); diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 5f571c7988..1c911445bf 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -45,9 +45,9 @@ #include "fdbserver/IKeyValueStore.h" #include "fdbserver/DeltaTree.h" #include -#include "flow/actorcompiler.h" #include #include +#include "flow/actorcompiler.h" // must be last include #define REDWOOD_DEBUG 0 diff --git a/fdbserver/networktest.actor.cpp b/fdbserver/networktest.actor.cpp index e9df6b709d..51c5ed5a68 100644 --- a/fdbserver/networktest.actor.cpp +++ b/fdbserver/networktest.actor.cpp @@ -21,10 +21,10 @@ #include "contrib/fmt-8.1.1/include/fmt/format.h" #include "fdbserver/NetworkTest.h" #include "flow/Knobs.h" -#include "flow/actorcompiler.h" // This must be the last #include. #include "flow/ActorCollection.h" #include "flow/UnitTest.h" #include +#include "flow/actorcompiler.h" // This must be the last #include. 
constexpr int WLTOKEN_NETWORKTEST = WLTOKEN_FIRST_AVAILABLE; diff --git a/fdbserver/workloads/ChangeFeeds.actor.cpp b/fdbserver/workloads/ChangeFeeds.actor.cpp index fcf272beb0..c2a3eb4107 100644 --- a/fdbserver/workloads/ChangeFeeds.actor.cpp +++ b/fdbserver/workloads/ChangeFeeds.actor.cpp @@ -28,10 +28,10 @@ #include "flow/Arena.h" #include "flow/IRandom.h" #include "flow/Trace.h" -#include "flow/actorcompiler.h" // This must be the last #include. #include "flow/serialize.h" #include #include +#include "flow/actorcompiler.h" // This must be the last #include. ACTOR Future>, Version>> readDatabase(Database cx) { state Transaction tr(cx); diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index b64850f544..2adbd0fad4 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -36,8 +36,8 @@ #include "flow/DeterministicRandom.h" #include "fdbclient/ManagementAPI.actor.h" #include "fdbclient/StorageServerInterface.h" -#include "flow/actorcompiler.h" // This must be the last #include. #include "flow/network.h" +#include "flow/actorcompiler.h" // This must be the last #include. //#define SevCCheckInfo SevVerbose #define SevCCheckInfo SevInfo diff --git a/fdbserver/workloads/Cycle.actor.cpp b/fdbserver/workloads/Cycle.actor.cpp index 87c477eab4..187dbb324f 100644 --- a/fdbserver/workloads/Cycle.actor.cpp +++ b/fdbserver/workloads/Cycle.actor.cpp @@ -18,6 +18,8 @@ * limitations under the License. */ +#include + #include "fdbclient/FDBOptions.g.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" @@ -26,9 +28,8 @@ #include "flow/Arena.h" #include "flow/IRandom.h" #include "flow/Trace.h" -#include "flow/actorcompiler.h" // This must be the last #include. #include "flow/serialize.h" -#include +#include "flow/actorcompiler.h" // This must be the last #include. 
struct CycleWorkload : TestWorkload { int actorCount, nodeCount; diff --git a/fdbserver/workloads/MiniCycle.actor.cpp b/fdbserver/workloads/MiniCycle.actor.cpp index 4140cd3e07..36a287abc5 100644 --- a/fdbserver/workloads/MiniCycle.actor.cpp +++ b/fdbserver/workloads/MiniCycle.actor.cpp @@ -26,9 +26,9 @@ #include "flow/Arena.h" #include "flow/IRandom.h" #include "flow/Trace.h" -#include "flow/actorcompiler.h" // This must be the last #include. #include "flow/serialize.h" #include +#include "flow/actorcompiler.h" // This must be the last #include. struct MiniCycleWorkload : TestWorkload { int actorCount, nodeCount; diff --git a/fdbserver/workloads/StreamingRangeRead.actor.cpp b/fdbserver/workloads/StreamingRangeRead.actor.cpp index 7b9846a2b8..ec8112da4e 100644 --- a/fdbserver/workloads/StreamingRangeRead.actor.cpp +++ b/fdbserver/workloads/StreamingRangeRead.actor.cpp @@ -27,9 +27,9 @@ #include "flow/Error.h" #include "flow/IRandom.h" #include "flow/Trace.h" -#include "flow/actorcompiler.h" // This must be the last #include. #include "flow/serialize.h" #include +#include "flow/actorcompiler.h" // This must be the last #include. 
ACTOR Future streamUsingGetRange(PromiseStream results, Transaction* tr, KeyRange keys) { state KeySelectorRef begin = firstGreaterOrEqual(keys.begin); diff --git a/fdbserver/workloads/UDPWorkload.actor.cpp b/fdbserver/workloads/UDPWorkload.actor.cpp index 35f4ff3e64..cd707c61d8 100644 --- a/fdbserver/workloads/UDPWorkload.actor.cpp +++ b/fdbserver/workloads/UDPWorkload.actor.cpp @@ -27,7 +27,6 @@ #include "flow/IRandom.h" #include "flow/flow.h" #include "flow/network.h" -#include "flow/actorcompiler.h" // has to be last include #include "flow/serialize.h" #include #include @@ -36,6 +35,8 @@ #include #include +#include "flow/actorcompiler.h" // has to be last include + namespace { struct UDPWorkload : TestWorkload { From c3e48fff9f0341de359574da6520636ba2d5893e Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Wed, 16 Mar 2022 08:59:12 -0700 Subject: [PATCH 107/138] Update fdbserver/PaxosConfigConsumer.actor.cpp Co-authored-by: Trevor Clinkenbeard --- fdbserver/PaxosConfigConsumer.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/PaxosConfigConsumer.actor.cpp b/fdbserver/PaxosConfigConsumer.actor.cpp index cdc21f14cb..1ebb73be95 100644 --- a/fdbserver/PaxosConfigConsumer.actor.cpp +++ b/fdbserver/PaxosConfigConsumer.actor.cpp @@ -244,7 +244,7 @@ public: } return smallest; } - return invalidVersion; + return ::invalidVersion; } Future complete() const { return waitForAll(actors); } }; From 6eca71832a636d65fb4292430abfa526c233d6b6 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Wed, 16 Mar 2022 12:07:03 -0700 Subject: [PATCH 108/138] Fix disablement of rocksdb tests in simulation --- tests/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6600c8eb01..2c5f93c4f3 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -180,9 +180,12 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/WriteDuringRead.toml) add_fdb_test(TEST_FILES fast/WriteDuringReadClean.toml) add_fdb_test(TEST_FILES noSim/RandomUnitTests.toml UNIT) - if (SSD_ROCKSDB_EXPERIMENTAL) + if (WITH_ROCKSDB_EXPERIMENTAL) add_fdb_test(TEST_FILES noSim/KeyValueStoreRocksDBTest.toml) add_fdb_test(TEST_FILES fast/PhysicalShardMove.toml) + else() + add_fdb_test(TEST_FILES noSim/KeyValueStoreRocksDBTest.toml IGNORE) + add_fdb_test(TEST_FILES fast/PhysicalShardMove.toml IGNORE) endif() add_fdb_test(TEST_FILES rare/CheckRelocation.toml) add_fdb_test(TEST_FILES rare/ClogUnclog.toml) From ccc11cbdb2d26bdad30368239ca0080c48dd06eb Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Wed, 16 Mar 2022 13:19:28 -0700 Subject: [PATCH 109/138] Apply jemalloc fix, and always build jemalloc (#6610) * Apply jemalloc fix, and always build jemalloc See https://github.com/jemalloc/jemalloc/pull/1924 * Move USE_JEMALLOC to flow/config.h * Add unit test to validate fix --- cmake/Jemalloc.cmake | 58 +++++++++++++++----------------------------- cmake/jemalloc.patch | 38 +++++++++++++++++++++++++++++ flow/FastAlloc.cpp | 18 ++++++++++++++ flow/config.h.cmake | 1 + 4 files changed, 77 insertions(+), 38 deletions(-) create mode 100644 cmake/jemalloc.patch diff --git a/cmake/Jemalloc.cmake b/cmake/Jemalloc.cmake index c59a290d77..b2ce6ae1b9 100644 --- a/cmake/Jemalloc.cmake +++ b/cmake/Jemalloc.cmake @@ -11,43 +11,25 @@ if(NOT USE_JEMALLOC) return() endif() -add_definitions(-DUSE_JEMALLOC) -find_path(JEMALLOC_INCLUDE_DIR - NAMES - jemalloc/jemalloc.h - PATH_SUFFIXES - include - ) -find_library(JEMALLOC NAMES 
libjemalloc.a) -find_library(JEMALLOC_PIC NAMES libjemalloc_pic.a) add_library(im_jemalloc_pic STATIC IMPORTED) add_library(im_jemalloc STATIC IMPORTED) -if(JEMALLOC_INCLUDE_DIR AND JEMALLOC AND JEMALLOC_PIC) - set_target_properties(im_jemalloc_pic PROPERTIES IMPORTED_LOCATION "${JEMALLOC_PIC}") - set_target_properties(im_jemalloc PROPERTIES IMPORTED_LOCATION "${JEMALLOC}") - target_include_directories(jemalloc INTERFACE "${JEMALLOC_INCLUDE_DIR}") - # the ordering here is important: for dynamic libraries we have to use all - # symbols that are in the library which was compiled with PIC (for executables - # we could omit the pic-library) - target_link_libraries(jemalloc INTERFACE im_jemalloc_pic im_jemalloc) -else() - include(ExternalProject) - set(JEMALLOC_DIR "${CMAKE_BINARY_DIR}/jemalloc") - ExternalProject_add(Jemalloc_project - URL "https://github.com/jemalloc/jemalloc/releases/download/5.2.1/jemalloc-5.2.1.tar.bz2" - URL_HASH SHA256=34330e5ce276099e2e8950d9335db5a875689a4c6a56751ef3b1d8c537f887f6 - BUILD_BYPRODUCTS "${JEMALLOC_DIR}/include/jemalloc/jemalloc.h" - "${JEMALLOC_DIR}/lib/libjemalloc.a" - "${JEMALLOC_DIR}/lib/libjemalloc_pic.a" - CONFIGURE_COMMAND ./configure --prefix=${JEMALLOC_DIR} --enable-static --disable-cxx --enable-prof - BUILD_IN_SOURCE ON - BUILD_COMMAND make - INSTALL_DIR "${JEMALLOC_DIR}" - INSTALL_COMMAND make install) - add_dependencies(im_jemalloc Jemalloc_project) - add_dependencies(im_jemalloc_pic Jemalloc_project) - set_target_properties(im_jemalloc_pic PROPERTIES IMPORTED_LOCATION "${JEMALLOC_DIR}/lib/libjemalloc_pic.a") - set_target_properties(im_jemalloc PROPERTIES IMPORTED_LOCATION "${JEMALLOC_DIR}/lib/libjemalloc.a") - target_include_directories(jemalloc INTERFACE "${JEMALLOC_DIR}/include") - target_link_libraries(jemalloc INTERFACE im_jemalloc_pic im_jemalloc) -endif() +include(ExternalProject) +set(JEMALLOC_DIR "${CMAKE_BINARY_DIR}/jemalloc") +ExternalProject_add(Jemalloc_project + URL 
"https://github.com/jemalloc/jemalloc/releases/download/5.2.1/jemalloc-5.2.1.tar.bz2" + URL_HASH SHA256=34330e5ce276099e2e8950d9335db5a875689a4c6a56751ef3b1d8c537f887f6 + BUILD_BYPRODUCTS "${JEMALLOC_DIR}/include/jemalloc/jemalloc.h" + "${JEMALLOC_DIR}/lib/libjemalloc.a" + "${JEMALLOC_DIR}/lib/libjemalloc_pic.a" + PATCH_COMMAND patch -p1 < ${CMAKE_SOURCE_DIR}/cmake/jemalloc.patch + CONFIGURE_COMMAND ./configure --prefix=${JEMALLOC_DIR} --enable-static --disable-cxx --enable-prof + BUILD_IN_SOURCE ON + BUILD_COMMAND make + INSTALL_DIR "${JEMALLOC_DIR}" + INSTALL_COMMAND make install) +add_dependencies(im_jemalloc Jemalloc_project) +add_dependencies(im_jemalloc_pic Jemalloc_project) +set_target_properties(im_jemalloc_pic PROPERTIES IMPORTED_LOCATION "${JEMALLOC_DIR}/lib/libjemalloc_pic.a") +set_target_properties(im_jemalloc PROPERTIES IMPORTED_LOCATION "${JEMALLOC_DIR}/lib/libjemalloc.a") +target_include_directories(jemalloc INTERFACE "${JEMALLOC_DIR}/include") +target_link_libraries(jemalloc INTERFACE im_jemalloc_pic im_jemalloc) \ No newline at end of file diff --git a/cmake/jemalloc.patch b/cmake/jemalloc.patch new file mode 100644 index 0000000000..660849041c --- /dev/null +++ b/cmake/jemalloc.patch @@ -0,0 +1,38 @@ +diff --git a/include/jemalloc/internal/sz.h b/include/jemalloc/internal/sz.h +index 68e558ab..87bb2280 100644 +--- a/include/jemalloc/internal/sz.h ++++ b/include/jemalloc/internal/sz.h +@@ -266,7 +266,7 @@ sz_sa2u(size_t size, size_t alignment) { + assert(alignment != 0 && ((alignment - 1) & alignment) == 0); + + /* Try for a small size class. */ +- if (size <= SC_SMALL_MAXCLASS && alignment < PAGE) { ++ if (size <= SC_SMALL_MAXCLASS && alignment <= PAGE) { + /* + * Round size up to the nearest multiple of alignment. 
+ * +diff --git a/src/arena.c b/src/arena.c +index ba50e410..dc7646e6 100644 +--- a/src/arena.c ++++ b/src/arena.c +@@ -1533,10 +1533,17 @@ arena_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, + bool zero, tcache_t *tcache) { + void *ret; + +- if (usize <= SC_SMALL_MAXCLASS +- && (alignment < PAGE +- || (alignment == PAGE && (usize & PAGE_MASK) == 0))) { ++ if (usize <= SC_SMALL_MAXCLASS) { + /* Small; alignment doesn't require special slab placement. */ ++ ++ /* usize should be a result of sz_sa2u() */ ++ assert((usize & (alignment - 1)) == 0); ++ ++ /* ++ * Small usize can't come from an alignment larger than a page. ++ */ ++ assert(alignment <= PAGE); ++ + ret = arena_malloc(tsdn, arena, usize, sz_size2index(usize), + zero, tcache, true); + } else { diff --git a/flow/FastAlloc.cpp b/flow/FastAlloc.cpp index 3fe8d6af31..d344c4cd21 100644 --- a/flow/FastAlloc.cpp +++ b/flow/FastAlloc.cpp @@ -24,6 +24,7 @@ #include "flow/Trace.h" #include "flow/Error.h" #include "flow/Knobs.h" +#include "flow/UnitTest.h" #include "flow/crc32c.h" #include "flow/flow.h" @@ -588,3 +589,20 @@ template class FastAllocator<2048>; template class FastAllocator<4096>; template class FastAllocator<8192>; template class FastAllocator<16384>; + +#ifdef USE_JEMALLOC +#include +TEST_CASE("/jemalloc/4k_aligned_usable_size") { + for (int i = 1; i < 4; ++i) { + auto* ptr = aligned_alloc(4096, i * 4096); + try { + ASSERT_EQ(malloc_usable_size(ptr), i * 4096); + } catch (...) 
{ + aligned_free(ptr); + throw; + } + aligned_free(ptr); + } + return Void(); +} +#endif \ No newline at end of file diff --git a/flow/config.h.cmake b/flow/config.h.cmake index 35a3c147fa..864d7c68fb 100644 --- a/flow/config.h.cmake +++ b/flow/config.h.cmake @@ -28,4 +28,5 @@ # endif # cmakedefine DTRACE_PROBES # cmakedefine HAS_ALIGNED_ALLOC +# cmakedefine USE_JEMALLOC #endif // WIN32 From 73fd5c01ae6bc28ae9ea2814c66f90ca8f1f7eae Mon Sep 17 00:00:00 2001 From: Zhe Wang Date: Wed, 2 Mar 2022 15:37:12 -0500 Subject: [PATCH 110/138] quick-fix-fetch-key --- fdbserver/storageserver.actor.cpp | 42 ++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 9d5f43207e..f83b4ff285 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -118,6 +118,7 @@ struct AddingShard : NonCopyable { struct StorageServer* server; Version transferredVersion; + Version fetchVersion; // To learn more details of the phase transitions, see function fetchKeys(). The phases below are sorted in // chronological order and do not go back. @@ -138,7 +139,7 @@ struct AddingShard : NonCopyable { // When fetchKeys "partially completes" (splits an adding shard in two), this is used to construct the left half AddingShard(AddingShard* prev, KeyRange const& keys) : keys(keys), fetchClient(prev->fetchClient), server(prev->server), transferredVersion(prev->transferredVersion), - phase(prev->phase) {} + fetchVersion(prev->fetchVersion), phase(prev->phase) {} ~AddingShard() { if (!fetchComplete.isSet()) fetchComplete.send(Void()); @@ -4447,6 +4448,7 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { // Get the history state int debug_getRangeRetries = 0; state int debug_nextRetryToLog = 1; + state Error lastError(); // FIXME: The client cache does not notice when servers are added to a team. 
To read from a local storage server // we must refresh the cache manually. @@ -4454,8 +4456,27 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { loop { state Transaction tr(data->cx); - fetchVersion = data->version.get(); - + // fetchVersion = data->version.get(); + // A quick fix: + // By default, we use data->version as the fetchVersion. + // In the case where dest SS falls far behind src SS, we use GRV as the fetchVersion instead of + // data->version, and then the dest SS waits for catching up the fetchVersion outside the + // fetchKeysParallelismLock. + // For example, consider dest SS falls far behind src SS. + // At iteration 0, dest SS selects its version as fetchVersion, + // but cannot read src SS and result in error_code_transaction_too_old. + // Due to error_code_transaction_too_old, dest SS starts iteration 1. + // At iteration 1, dest SS selects GRV as fetchVersion and (suppose) can read the data from src SS. + // Then dest SS waits its version catch up with this GRV version and write the data to disk. + // Note that dest SS waits outside the fetchKeysParallelismLock. 
+ if (lastError.code() == error_code_transaction_too_old) { + Version grvVersion = wait(tr.getRawReadVersion()); + fetchVersion = std::max(grvVersion, data->version.get()); + } else { + fetchVersion = std::max(shard->fetchVersion, data->version.get()); + } + ASSERT(fetchVersion >= shard->fetchVersion); // at this point, shard->fetchVersion is the last fetchVersion + shard->fetchVersion = fetchVersion; TraceEvent(SevDebug, "FetchKeysUnblocked", data->thisServerID) .detail("FKID", interval.pairID) .detail("Version", fetchVersion); @@ -4532,6 +4553,7 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { e.code() != error_code_process_behind && e.code() != error_code_server_overloaded) { throw; } + lastError = e; if (nfk == keys.begin) { TraceEvent("FKBlockFail", data->thisServerID) .errorUnsuppressed(e) @@ -4604,10 +4626,11 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { // we have written) state Future fetchDurable = data->durableVersion.whenAtLeast(data->storageVersion() + 1); + state Future dataArrive = data->version.whenAtLeast(fetchVersion); wait(dispatchChangeFeeds(data, fetchKeysID, keys, fetchVersion)); holdingFKPL.release(); - wait(fetchDurable); + wait(dataArrive && fetchDurable); TraceEvent(SevDebug, "FKAfterFinalCommit", data->thisServerID) .detail("FKID", interval.pairID) @@ -4625,7 +4648,7 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { TraceEvent(SevDebug, "FKUpdateBatch", data->thisServerID).detail("FKID", interval.pairID); shard->phase = AddingShard::Waiting; - + ASSERT(data->version.get() >= fetchVersion); // Choose a transferredVersion. 
This choice and timing ensure that // * The transferredVersion can be mutated in versionedData // * The transferredVersion isn't yet committed to storage (so we can write the availability status change) @@ -4645,6 +4668,9 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { .detail("StorageVersion", data->storageVersion()); validate(data); + // the minimal version in updates must be larger than fetchVersion + ASSERT(shard->updates.empty() || shard->updates[0].version > fetchVersion); + // Put the updates that were collected during the FinalCommit phase into the batch at the transferredVersion. // Eager reads will be done for them by update(), and the mutations will come back through // AddingShard::addMutations and be applied to versionedMap and mutationLog as normal. The lie about their @@ -4734,11 +4760,15 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { }; AddingShard::AddingShard(StorageServer* server, KeyRangeRef const& keys) - : keys(keys), server(server), transferredVersion(invalidVersion), phase(WaitPrevious) { + : keys(keys), server(server), transferredVersion(invalidVersion), fetchVersion(invalidVersion), phase(WaitPrevious) { fetchClient = fetchKeys(server, this); } void AddingShard::addMutation(Version version, bool fromFetch, MutationRef const& mutation) { + if (version <= fetchVersion) { + return; + } + server->counters.logicalBytesMoveInOverhead += mutation.expectedSize(); if (mutation.type == mutation.ClearRange) { ASSERT(keys.begin <= mutation.param1 && mutation.param2 <= keys.end); From 62f547ff6e9c460fe08505698933ee7ebf84a16e Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Wed, 16 Mar 2022 14:15:48 -0700 Subject: [PATCH 111/138] Add line before actorcompiler.h include (to prevent IDE from reordering includes) --- fdbclient/ManagementAPI.actor.cpp | 1 + fdbclient/StatusClient.actor.cpp | 1 + fdbclient/StorageServerInterface.h | 1 - fdbserver/Coordination.actor.cpp | 1 + fdbserver/DiskQueue.actor.cpp | 
1 + fdbserver/FDBExecHelper.actor.h | 1 + fdbserver/RestoreApplier.actor.cpp | 1 + fdbserver/networktest.actor.cpp | 1 + fdbserver/workloads/ChangeFeeds.actor.cpp | 1 + fdbserver/workloads/ConsistencyCheck.actor.cpp | 1 + fdbserver/workloads/Cycle.actor.cpp | 1 + fdbserver/workloads/MiniCycle.actor.cpp | 1 + fdbserver/workloads/StreamingRangeRead.actor.cpp | 1 + 13 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 9f9f932d24..044ac79395 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -42,6 +42,7 @@ #include "fdbrpc/ReplicationPolicy.h" #include "fdbrpc/Replication.h" #include "fdbclient/Schemas.h" + #include "flow/actorcompiler.h" // This must be the last #include. bool isInteger(const std::string& s) { diff --git a/fdbclient/StatusClient.actor.cpp b/fdbclient/StatusClient.actor.cpp index a160892544..aaa3970f47 100644 --- a/fdbclient/StatusClient.actor.cpp +++ b/fdbclient/StatusClient.actor.cpp @@ -28,6 +28,7 @@ #include "fdbclient/json_spirit/json_spirit_reader_template.h" #include "fdbrpc/genericactors.actor.h" #include + #include "flow/actorcompiler.h" // has to be last include json_spirit::mValue readJSONStrictly(const std::string& s) { diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h index 0d10f3e8ac..592d2dd167 100644 --- a/fdbclient/StorageServerInterface.h +++ b/fdbclient/StorageServerInterface.h @@ -94,7 +94,6 @@ struct StorageServerInterface { NetworkAddress address() const { return getValue.getEndpoint().getPrimaryAddress(); } NetworkAddress stableAddress() const { return getValue.getEndpoint().getStableAddress(); } Optional secondaryAddress() const { return getValue.getEndpoint().addresses.secondaryAddress; } - bool readyForTraffic; UID id() const { return uniqueID; } bool isTss() const { return tssPairID.present(); } std::string toString() const { return id().shortString(); } diff --git 
a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index d974d78b7a..eac75e9f8b 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -34,6 +34,7 @@ #include "flow/IndexedSet.h" #include "fdbclient/MonitorLeader.h" #include "flow/network.h" + #include "flow/actorcompiler.h" // This must be the last #include. // This module implements coordinationServer() and the interfaces in CoordinationInterface.h diff --git a/fdbserver/DiskQueue.actor.cpp b/fdbserver/DiskQueue.actor.cpp index 6ab8012ce5..adfd682c1a 100644 --- a/fdbserver/DiskQueue.actor.cpp +++ b/fdbserver/DiskQueue.actor.cpp @@ -25,6 +25,7 @@ #include "flow/crc32c.h" #include "flow/genericactors.actor.h" #include "flow/xxhash.h" + #include "flow/actorcompiler.h" // This must be the last #include. typedef bool (*compare_pages)(void*, void*); diff --git a/fdbserver/FDBExecHelper.actor.h b/fdbserver/FDBExecHelper.actor.h index 41d9fd80db..f5f07a000d 100644 --- a/fdbserver/FDBExecHelper.actor.h +++ b/fdbserver/FDBExecHelper.actor.h @@ -31,6 +31,7 @@ #include "flow/Arena.h" #include "flow/flow.h" #include "fdbclient/FDBTypes.h" + #include "flow/actorcompiler.h" // This must be the last #include. // execute/snapshot command takes two arguments: diff --git a/fdbserver/RestoreApplier.actor.cpp b/fdbserver/RestoreApplier.actor.cpp index 0a7791532a..b8ff926358 100644 --- a/fdbserver/RestoreApplier.actor.cpp +++ b/fdbserver/RestoreApplier.actor.cpp @@ -34,6 +34,7 @@ #include "fdbserver/RestoreApplier.actor.h" #include "flow/network.h" + #include "flow/actorcompiler.h" // This must be the last #include. 
ACTOR static Future handleSendMutationVectorRequest(RestoreSendVersionedMutationsRequest req, diff --git a/fdbserver/networktest.actor.cpp b/fdbserver/networktest.actor.cpp index 51c5ed5a68..371f61dd25 100644 --- a/fdbserver/networktest.actor.cpp +++ b/fdbserver/networktest.actor.cpp @@ -24,6 +24,7 @@ #include "flow/ActorCollection.h" #include "flow/UnitTest.h" #include + #include "flow/actorcompiler.h" // This must be the last #include. constexpr int WLTOKEN_NETWORKTEST = WLTOKEN_FIRST_AVAILABLE; diff --git a/fdbserver/workloads/ChangeFeeds.actor.cpp b/fdbserver/workloads/ChangeFeeds.actor.cpp index c2a3eb4107..edc3444340 100644 --- a/fdbserver/workloads/ChangeFeeds.actor.cpp +++ b/fdbserver/workloads/ChangeFeeds.actor.cpp @@ -31,6 +31,7 @@ #include "flow/serialize.h" #include #include + #include "flow/actorcompiler.h" // This must be the last #include. ACTOR Future>, Version>> readDatabase(Database cx) { diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 2adbd0fad4..4176740f52 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -37,6 +37,7 @@ #include "fdbclient/ManagementAPI.actor.h" #include "fdbclient/StorageServerInterface.h" #include "flow/network.h" + #include "flow/actorcompiler.h" // This must be the last #include. //#define SevCCheckInfo SevVerbose diff --git a/fdbserver/workloads/Cycle.actor.cpp b/fdbserver/workloads/Cycle.actor.cpp index 187dbb324f..4fb5bd3590 100644 --- a/fdbserver/workloads/Cycle.actor.cpp +++ b/fdbserver/workloads/Cycle.actor.cpp @@ -29,6 +29,7 @@ #include "flow/IRandom.h" #include "flow/Trace.h" #include "flow/serialize.h" + #include "flow/actorcompiler.h" // This must be the last #include. 
struct CycleWorkload : TestWorkload { diff --git a/fdbserver/workloads/MiniCycle.actor.cpp b/fdbserver/workloads/MiniCycle.actor.cpp index 36a287abc5..84d6adeda2 100644 --- a/fdbserver/workloads/MiniCycle.actor.cpp +++ b/fdbserver/workloads/MiniCycle.actor.cpp @@ -28,6 +28,7 @@ #include "flow/Trace.h" #include "flow/serialize.h" #include + #include "flow/actorcompiler.h" // This must be the last #include. struct MiniCycleWorkload : TestWorkload { diff --git a/fdbserver/workloads/StreamingRangeRead.actor.cpp b/fdbserver/workloads/StreamingRangeRead.actor.cpp index ec8112da4e..76400f72c8 100644 --- a/fdbserver/workloads/StreamingRangeRead.actor.cpp +++ b/fdbserver/workloads/StreamingRangeRead.actor.cpp @@ -29,6 +29,7 @@ #include "flow/Trace.h" #include "flow/serialize.h" #include + #include "flow/actorcompiler.h" // This must be the last #include. ACTOR Future streamUsingGetRange(PromiseStream results, Transaction* tr, KeyRange keys) { From 58de6e22cc06ee99507fa6e1b60162344021b142 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Wed, 16 Mar 2022 11:24:02 -0700 Subject: [PATCH 112/138] Add BalanceOnRequests boolean parameter for ModelInterface --- fdbclient/NativeAPI.actor.cpp | 15 +++++++++------ fdbrpc/MultiInterface.h | 4 +++- fdbserver/DataDistribution.actor.cpp | 2 +- fdbserver/storageserver.actor.cpp | 2 +- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 9d1c60301c..38b9fdf128 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -129,6 +129,9 @@ Future loadBalance( FDB_BOOLEAN_PARAM(TransactionRecordLogInfo); FDB_DEFINE_BOOLEAN_PARAM(UseProvisionalProxies); +// Used to determine whether or not client will load balance based on the number of GRVs released by each proxy +FDB_DEFINE_BOOLEAN_PARAM(BalanceOnRequests); + // Whether or not a request should include the tenant name FDB_BOOLEAN_PARAM(UseTenant); @@ -1875,9 +1878,9 @@ void 
DatabaseContext::setOption(FDBDatabaseOptions::Option option, Optionalget().commitProxies.size()) - commitProxies = makeReference(clientInfo->get().commitProxies, false); + commitProxies = makeReference(clientInfo->get().commitProxies); if (clientInfo->get().grvProxies.size()) - grvProxies = makeReference(clientInfo->get().grvProxies, true); + grvProxies = makeReference(clientInfo->get().grvProxies, BalanceOnRequests::True); server_interf.clear(); locationCache.insert(allKeys, Reference()); break; @@ -1891,9 +1894,9 @@ void DatabaseContext::setOption(FDBDatabaseOptions::Option option, Optional(value.get()) : Optional>()); if (clientInfo->get().commitProxies.size()) - commitProxies = makeReference(clientInfo->get().commitProxies, false); + commitProxies = makeReference(clientInfo->get().commitProxies); if (clientInfo->get().grvProxies.size()) - grvProxies = makeReference(clientInfo->get().grvProxies, true); + grvProxies = makeReference(clientInfo->get().grvProxies, BalanceOnRequests::True); server_interf.clear(); locationCache.insert(allKeys, Reference()); break; @@ -2448,11 +2451,11 @@ void DatabaseContext::updateProxies() { grvProxies.clear(); bool commitProxyProvisional = false, grvProxyProvisional = false; if (clientInfo->get().commitProxies.size()) { - commitProxies = makeReference(clientInfo->get().commitProxies, false); + commitProxies = makeReference(clientInfo->get().commitProxies); commitProxyProvisional = clientInfo->get().commitProxies[0].provisional; } if (clientInfo->get().grvProxies.size()) { - grvProxies = makeReference(clientInfo->get().grvProxies, true); + grvProxies = makeReference(clientInfo->get().grvProxies, BalanceOnRequests::True); grvProxyProvisional = clientInfo->get().grvProxies[0].provisional; } if (clientInfo->get().commitProxies.size() && clientInfo->get().grvProxies.size()) { diff --git a/fdbrpc/MultiInterface.h b/fdbrpc/MultiInterface.h index a4628ad903..ad59772bf7 100644 --- a/fdbrpc/MultiInterface.h +++ b/fdbrpc/MultiInterface.h @@ 
-91,13 +91,15 @@ struct AlternativeInfo { bool operator==(double const& r) const { return cumulativeProbability == r; } }; +FDB_DECLARE_BOOLEAN_PARAM(BalanceOnRequests); + template class ModelInterface : public ReferenceCounted> { public: // If balanceOnRequests is true, the client will load balance based on the number of GRVs released by each proxy // If balanceOnRequests is false, the client will load balance based on the CPU usage of each proxy // Only requests which take from the GRV budget on the proxy should set balanceOnRequests to true - ModelInterface(const std::vector& v, bool balanceOnRequests) : balanceOnRequests(balanceOnRequests) { + explicit ModelInterface(const std::vector& v, BalanceOnRequests balanceOnRequests = BalanceOnRequests::False) : balanceOnRequests(balanceOnRequests) { for (int i = 0; i < v.size(); i++) { alternatives.push_back(AlternativeInfo(v[i], 1.0 / v.size(), (i + 1.0) / v.size())); } diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 65f472bae7..36036e34ac 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -495,7 +495,7 @@ ACTOR Future monitorBatchLimitedTime(Reference cons loop { wait(delay(SERVER_KNOBS->METRIC_UPDATE_RATE)); - state Reference grvProxies(new GrvProxyInfo(db->get().client.grvProxies, false)); + state Reference grvProxies(new GrvProxyInfo(db->get().client.grvProxies)); choose { when(wait(db->onChange())) {} diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 87e56c0c71..692c99ee9b 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -7620,7 +7620,7 @@ ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface loop { state Future infoChanged = self->db->onChange(); state Reference commitProxies( - new CommitProxyInfo(self->db->get().client.commitProxies, false)); + new CommitProxyInfo(self->db->get().client.commitProxies)); choose { 
when(GetStorageServerRejoinInfoReply _rep = wait(commitProxies->size() From 0e7dc83f251d213b7e28fcd54c4eee87a012c1a3 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Wed, 16 Mar 2022 12:02:04 -0700 Subject: [PATCH 113/138] Fix compilation issues with ModelInterface construction in configuration database code --- fdbclient/PaxosConfigTransaction.actor.cpp | 6 +++--- fdbserver/PaxosConfigConsumer.actor.cpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fdbclient/PaxosConfigTransaction.actor.cpp b/fdbclient/PaxosConfigTransaction.actor.cpp index ee5b0829ad..dd715ee9e7 100644 --- a/fdbclient/PaxosConfigTransaction.actor.cpp +++ b/fdbclient/PaxosConfigTransaction.actor.cpp @@ -227,7 +227,7 @@ class PaxosConfigTransactionImpl { try { ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration()); state Reference configNodes( - new ConfigTransactionInfo(self->getGenerationQuorum.getReadReplicas(), false)); + new ConfigTransactionInfo(self->getGenerationQuorum.getReadReplicas())); ConfigTransactionGetReply reply = wait(timeoutError(basicLoadBalance(configNodes, &ConfigTransactionInterface::get, @@ -250,7 +250,7 @@ class PaxosConfigTransactionImpl { ACTOR static Future getConfigClasses(PaxosConfigTransactionImpl* self) { ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration()); state Reference configNodes( - new ConfigTransactionInfo(self->getGenerationQuorum.getReadReplicas(), false)); + new ConfigTransactionInfo(self->getGenerationQuorum.getReadReplicas())); ConfigTransactionGetConfigClassesReply reply = wait(basicLoadBalance(configNodes, &ConfigTransactionInterface::getClasses, @@ -266,7 +266,7 @@ class PaxosConfigTransactionImpl { ACTOR static Future getKnobs(PaxosConfigTransactionImpl* self, Optional configClass) { ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration()); state Reference configNodes( - new ConfigTransactionInfo(self->getGenerationQuorum.getReadReplicas(), 
false)); + new ConfigTransactionInfo(self->getGenerationQuorum.getReadReplicas())); ConfigTransactionGetKnobsReply reply = wait(basicLoadBalance(configNodes, &ConfigTransactionInterface::getKnobs, diff --git a/fdbserver/PaxosConfigConsumer.actor.cpp b/fdbserver/PaxosConfigConsumer.actor.cpp index 1ebb73be95..437a0db090 100644 --- a/fdbserver/PaxosConfigConsumer.actor.cpp +++ b/fdbserver/PaxosConfigConsumer.actor.cpp @@ -99,7 +99,7 @@ class GetCommittedVersionQuorum { // Now roll node forward to match the largest committed version of // the replies. - state Reference quorumCfi(new ConfigFollowerInfo(self->replies[target], false)); + state Reference quorumCfi(new ConfigFollowerInfo(self->replies[target])); try { state Version lastSeenVersion = std::max( rollback.present() ? rollback.get() : nodeVersion.lastCommitted, self->largestCompactedResponse); @@ -295,7 +295,7 @@ class PaxosConfigConsumerImpl { try { state Version committedVersion = wait(getCommittedVersion(self)); state Reference configNodes( - new ConfigFollowerInfo(self->getCommittedVersionQuorum.getReadReplicas(), false)); + new ConfigFollowerInfo(self->getCommittedVersionQuorum.getReadReplicas())); ConfigFollowerGetSnapshotAndChangesReply reply = wait(timeoutError(basicLoadBalance(configNodes, &ConfigFollowerInterface::getSnapshotAndChanges, @@ -350,7 +350,7 @@ class PaxosConfigConsumerImpl { if (committedVersion > self->lastSeenVersion) { ASSERT(self->getCommittedVersionQuorum.getReadReplicas().size() >= self->cfis.size() / 2 + 1); state Reference configNodes( - new ConfigFollowerInfo(self->getCommittedVersionQuorum.getReadReplicas(), false)); + new ConfigFollowerInfo(self->getCommittedVersionQuorum.getReadReplicas())); ConfigFollowerGetChangesReply reply = wait(timeoutError( basicLoadBalance(configNodes, &ConfigFollowerInterface::getChanges, From 320c115c71887eb25e1996328e2848d71470e351 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Wed, 16 Mar 2022 13:04:11 -0700 Subject: [PATCH 114/138] 
Apply clang-format to mis-formatted files --- fdbrpc/MultiInterface.h | 3 ++- fdbserver/storageserver.actor.cpp | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbrpc/MultiInterface.h b/fdbrpc/MultiInterface.h index ad59772bf7..739b7285fa 100644 --- a/fdbrpc/MultiInterface.h +++ b/fdbrpc/MultiInterface.h @@ -99,7 +99,8 @@ public: // If balanceOnRequests is true, the client will load balance based on the number of GRVs released by each proxy // If balanceOnRequests is false, the client will load balance based on the CPU usage of each proxy // Only requests which take from the GRV budget on the proxy should set balanceOnRequests to true - explicit ModelInterface(const std::vector& v, BalanceOnRequests balanceOnRequests = BalanceOnRequests::False) : balanceOnRequests(balanceOnRequests) { + explicit ModelInterface(const std::vector& v, BalanceOnRequests balanceOnRequests = BalanceOnRequests::False) + : balanceOnRequests(balanceOnRequests) { for (int i = 0; i < v.size(); i++) { alternatives.push_back(AlternativeInfo(v[i], 1.0 / v.size(), (i + 1.0) / v.size())); } diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 692c99ee9b..6ee72c396c 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -7619,8 +7619,7 @@ ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface loop { state Future infoChanged = self->db->onChange(); - state Reference commitProxies( - new CommitProxyInfo(self->db->get().client.commitProxies)); + state Reference commitProxies(new CommitProxyInfo(self->db->get().client.commitProxies)); choose { when(GetStorageServerRejoinInfoReply _rep = wait(commitProxies->size() From 052220f8abb076f47a1c8adedc7d6e812c98fb4c Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 16 Mar 2022 14:57:53 -0700 Subject: [PATCH 115/138] Update the storage cache to handle all requests types. 
Most assert false, one is updated to send a broken_promise as a workaround to the fact that we do expect some requests to go there. --- fdbserver/StorageCache.actor.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/fdbserver/StorageCache.actor.cpp b/fdbserver/StorageCache.actor.cpp index ebf3d06d5e..ba6f78e81d 100644 --- a/fdbserver/StorageCache.actor.cpp +++ b/fdbserver/StorageCache.actor.cpp @@ -2223,6 +2223,27 @@ ACTOR Future storageCacheServer(StorageServerInterface ssi, when(ReplyPromise reply = waitNext(ssi.getKeyValueStoreType.getFuture())) { ASSERT(false); } + + when(GetKeyValuesAndFlatMapRequest req = waitNext(ssi.getKeyValuesAndFlatMap.getFuture())) { + ASSERT(false); + } + when(WaitMetricsRequest req = waitNext(ssi.waitMetrics.getFuture())) { ASSERT(false); } + when(SplitMetricsRequest req = waitNext(ssi.splitMetrics.getFuture())) { ASSERT(false); } + when(GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) { ASSERT(false); } + when(ReadHotSubRangeRequest req = waitNext(ssi.getReadHotRanges.getFuture())) { ASSERT(false); } + when(SplitRangeRequest req = waitNext(ssi.getRangeSplitPoints.getFuture())) { ASSERT(false); } + when(GetKeyValuesStreamRequest req = waitNext(ssi.getKeyValuesStream.getFuture())) { ASSERT(false); } + when(ChangeFeedStreamRequest req = waitNext(ssi.changeFeedStream.getFuture())) { ASSERT(false); } + when(OverlappingChangeFeedsRequest req = waitNext(ssi.overlappingChangeFeeds.getFuture())) { + // Simulate endpoint not found so that the requester will try another endpoint + // This is a workaround to the fact that storage servers do not have an easy way to enforce this + // request goes only to other storage servers, and in simulation we manage to trigger this behavior + req.reply.sendError(broken_promise()); + } + when(ChangeFeedPopRequest req = waitNext(ssi.changeFeedPop.getFuture())) { ASSERT(false); } + when(ChangeFeedVersionUpdateRequest req = 
waitNext(ssi.changeFeedVersionUpdate.getFuture())) { + ASSERT(false); + } when(wait(actors.getResult())) {} } } From 31bd9c8b3efb52cf11e5c5125a81eb333ff83c95 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 16 Mar 2022 15:05:06 -0700 Subject: [PATCH 116/138] Use the new name for mapped key values requests --- fdbserver/StorageCache.actor.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fdbserver/StorageCache.actor.cpp b/fdbserver/StorageCache.actor.cpp index ba6f78e81d..7ffe845445 100644 --- a/fdbserver/StorageCache.actor.cpp +++ b/fdbserver/StorageCache.actor.cpp @@ -2224,9 +2224,7 @@ ACTOR Future storageCacheServer(StorageServerInterface ssi, ASSERT(false); } - when(GetKeyValuesAndFlatMapRequest req = waitNext(ssi.getKeyValuesAndFlatMap.getFuture())) { - ASSERT(false); - } + when(GetMappedKeyValuesRequest req = waitNext(ssi.getMappedKeyValues.getFuture())) { ASSERT(false); } when(WaitMetricsRequest req = waitNext(ssi.waitMetrics.getFuture())) { ASSERT(false); } when(SplitMetricsRequest req = waitNext(ssi.splitMetrics.getFuture())) { ASSERT(false); } when(GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) { ASSERT(false); } From 61468dab6dd14d0e83ecce24c766410aab32f304 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Wed, 16 Mar 2022 15:26:43 -0700 Subject: [PATCH 117/138] Significantly lower the expected rate in the cache test --- tests/fast/CacheTest.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fast/CacheTest.toml b/tests/fast/CacheTest.toml index b3c2c448ab..a63731617b 100644 --- a/tests/fast/CacheTest.toml +++ b/tests/fast/CacheTest.toml @@ -12,5 +12,5 @@ testTitle = 'Cycle' testName = 'Cycle' transactionsPerSecond = 2500.0 testDuration = 10.0 - expectedRate = 0.80 + expectedRate = 0.01 keyPrefix = 'foo/' From 6cef85e2689d4daceee9864d4e0c5757498f71c7 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Wed, 16 Mar 2022 18:59:32 -0700 Subject: [PATCH 118/138] Change assertions to not rely on the first argument being evaluated before the second. --- flow/Arena.cpp | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/flow/Arena.cpp b/flow/Arena.cpp index 11f8795690..f1537c3f5e 100644 --- a/flow/Arena.cpp +++ b/flow/Arena.cpp @@ -676,6 +676,7 @@ TEST_CASE("/flow/Arena/DefaultBoostHash") { TEST_CASE("/flow/Arena/Size") { Arena a; + int fastSize, slowSize; // Size estimates are accurate unless dependencies are added to an Arena via another Arena // handle which points to a non-root node. 
@@ -683,10 +684,14 @@ TEST_CASE("/flow/Arena/Size") { // Note that the ASSERT argument order matters, the estimate must be calculated first as // the full accurate calculation will update the estimate makeString(40, a); - ASSERT_EQ(a.getSize(true), a.getSize()); + fastSize = a.getSize(true); + slowSize = a.getSize(); + ASSERT_EQ(fastSize, slowSize); makeString(700, a); - ASSERT_EQ(a.getSize(true), a.getSize()); + fastSize = a.getSize(true); + slowSize = a.getSize(); + ASSERT_EQ(fastSize, slowSize); // Copy a at a point where it points to a large block with room for block references Arena b = a; @@ -697,35 +702,51 @@ TEST_CASE("/flow/Arena/Size") { makeString(1000, a); makeString(1000, a); - ASSERT_EQ(a.getSize(true), a.getSize()); + fastSize = a.getSize(true); + slowSize = a.getSize(); + ASSERT_EQ(fastSize, slowSize); Standalone s = makeString(500); a.dependsOn(s.arena()); - ASSERT_EQ(a.getSize(true), a.getSize()); + fastSize = a.getSize(true); + slowSize = a.getSize(); + ASSERT_EQ(fastSize, slowSize); Standalone s2 = makeString(500); a.dependsOn(s2.arena()); - ASSERT_EQ(a.getSize(true), a.getSize()); + fastSize = a.getSize(true); + slowSize = a.getSize(); + ASSERT_EQ(fastSize, slowSize); // Add a dependency to b, which will fit in b's root and update b's size estimate Standalone s3 = makeString(100); b.dependsOn(s3.arena()); - ASSERT_EQ(b.getSize(true), b.getSize()); + fastSize = b.getSize(true); + slowSize = b.getSize(); + ASSERT_EQ(fastSize, slowSize); // But now a's size estimate is out of date because the new reference in b's root is still // in a's tree - ASSERT_LT(a.getSize(true), a.getSize()); + fastSize = a.getSize(true); + slowSize = a.getSize(); + ASSERT_LT(fastSize, slowSize); // Now that a full size calc has been done on a, the estimate is up to date. 
- ASSERT_EQ(a.getSize(true), a.getSize()); + fastSize = a.getSize(true); + slowSize = a.getSize(); + ASSERT_EQ(fastSize, slowSize); // Add a dependency to c, which will NOT fit in c's root, so it will be added to a new // root for c and that root will not be in a's tree so a's size and estimate remain // unchanged and the same. The size and estimate of c will also match. Standalone s4 = makeString(100); c.dependsOn(s4.arena()); - ASSERT_EQ(c.getSize(true), c.getSize()); - ASSERT_EQ(a.getSize(true), a.getSize()); + fastSize = c.getSize(true); + slowSize = c.getSize(); + ASSERT_EQ(fastSize, slowSize); + fastSize = a.getSize(true); + slowSize = a.getSize(); + ASSERT_EQ(fastSize, slowSize); return Void(); } From d39b8810457b145f47590b3659329e4e18500dba Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Thu, 17 Mar 2022 09:03:20 -0700 Subject: [PATCH 119/138] Avoid a call to VersionedMap::ViewAtVersion::lower_bound (#6606) * Avoid a call to VersionedMap::ViewAtVersion::lower_bound in some cases * Avoid lower_bound in both cases --- fdbserver/storageserver.actor.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 90fc288bf5..421304c716 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -2469,7 +2469,17 @@ ACTOR Future readRange(StorageServer* data, else readBegin = range.begin; - vCurrent = view.lower_bound(readBegin); + if (vCurrent) { + // We can get first greater or equal from the result of lastLessOrEqual + if (vCurrent.key() != readBegin) { + ++vCurrent; + } + } else { + // There's nothing less than or equal to readBegin in view, so + // begin() is the first thing greater than readBegin, or end(). + // Either way that's the correct result for lower_bound. 
+ vCurrent = view.begin(); + } while (limit > 0 && *pLimitBytes > 0 && readBegin < range.end) { ASSERT(!vCurrent || vCurrent.key() >= readBegin); From 00277140ec820c728696595eafd071cf22005cb6 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 17 Mar 2022 11:09:14 -0700 Subject: [PATCH 120/138] Non-simulated test harness runs don't check the unseed. --- contrib/TestHarness/Program.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/TestHarness/Program.cs b/contrib/TestHarness/Program.cs index 2955072fce..9327bd6d68 100644 --- a/contrib/TestHarness/Program.cs +++ b/contrib/TestHarness/Program.cs @@ -359,7 +359,7 @@ namespace SummarizeTest } int result = 0; - bool unseedCheck = random.NextDouble() < unseedRatio; + bool unseedCheck = !noSim && random.NextDouble() < unseedRatio; for (int i = 0; i < maxTries; ++i) { bool logOnRetryableError = i == maxTries - 1; From 05495908b8be4029b03041f9e7254be09d0b42b6 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 23 Feb 2022 08:06:39 -0800 Subject: [PATCH 121/138] Implement some tenant tests --- fdbclient/CMakeLists.txt | 1 + fdbclient/DatabaseContext.h | 10 +- fdbclient/NativeAPI.actor.cpp | 9 +- fdbclient/Tenant.cpp | 57 +++ fdbserver/CMakeLists.txt | 1 + fdbserver/SimulatedCluster.actor.cpp | 35 +- fdbserver/TesterInterface.actor.h | 5 +- fdbserver/tester.actor.cpp | 41 ++- .../workloads/FuzzApiCorrectness.actor.cpp | 226 ++++++++---- fdbserver/workloads/Performance.actor.cpp | 4 +- .../workloads/TenantManagement.actor.cpp | 347 ++++++++++++++++++ fdbserver/workloads/workloads.actor.h | 5 +- tests/CMakeLists.txt | 2 + tests/slow/SwizzledTenantManagement.toml | 43 +++ tests/slow/TenantManagement.toml | 10 + 15 files changed, 694 insertions(+), 102 deletions(-) create mode 100644 fdbclient/Tenant.cpp create mode 100644 fdbserver/workloads/TenantManagement.actor.cpp create mode 100644 tests/slow/SwizzledTenantManagement.toml create mode 100644 tests/slow/TenantManagement.toml diff --git 
a/fdbclient/CMakeLists.txt b/fdbclient/CMakeLists.txt index 3d3a0d4ecd..81b41314d1 100644 --- a/fdbclient/CMakeLists.txt +++ b/fdbclient/CMakeLists.txt @@ -139,6 +139,7 @@ set(FDBCLIENT_SRCS TagThrottle.actor.h TaskBucket.actor.cpp TaskBucket.h + Tenant.cpp Tenant.h TestKnobCollection.cpp TestKnobCollection.h diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index 30c346cecc..2fd7bfffb5 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -246,7 +246,8 @@ public: lockAware, internal, apiVersion, - switchable)); + switchable, + defaultTenant)); } Optional getCachedLocation(const Optional& tenant, @@ -370,7 +371,8 @@ public: LockAware, IsInternal = IsInternal::True, int apiVersion = Database::API_VERSION_LATEST, - IsSwitchable = IsSwitchable::False); + IsSwitchable = IsSwitchable::False, + Optional defaultTenant = Optional()); explicit DatabaseContext(const Error& err); @@ -392,6 +394,10 @@ public: QueueModel queueModel; EnableLocalityLoadBalance enableLocalityLoadBalance{ EnableLocalityLoadBalance::False }; + // The tenant used when none is specified for a transaction. Ordinarily this is unspecified, in which case the raw + // key-space is used. 
+ Optional defaultTenant; + struct VersionRequest { SpanID spanContext; Promise reply; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 38b9fdf128..630d444ee5 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1314,10 +1314,11 @@ DatabaseContext::DatabaseContext(Reference defaultTenant) : lockAware(lockAware), switchable(switchable), connectionRecord(connectionRecord), proxyProvisional(false), - clientLocality(clientLocality), enableLocalityLoadBalance(enableLocalityLoadBalance), internal(internal), - cc("TransactionMetrics"), transactionReadVersions("ReadVersions", cc), + clientLocality(clientLocality), enableLocalityLoadBalance(enableLocalityLoadBalance), defaultTenant(defaultTenant), + internal(internal), cc("TransactionMetrics"), transactionReadVersions("ReadVersions", cc), transactionReadVersionsThrottled("ReadVersionsThrottled", cc), transactionReadVersionsCompleted("ReadVersionsCompleted", cc), transactionReadVersionBatches("ReadVersionBatches", cc), @@ -4860,7 +4861,7 @@ Transaction::Transaction() Transaction::Transaction(Database const& cx, Optional const& tenant) : trState(makeReference(cx, - tenant, + tenant.present() ? tenant : cx->defaultTenant, cx->taskID, generateSpanID(cx->transactionTracingSample), createTrLogInfoProbabilistically(cx))), diff --git a/fdbclient/Tenant.cpp b/fdbclient/Tenant.cpp new file mode 100644 index 0000000000..5a651b4134 --- /dev/null +++ b/fdbclient/Tenant.cpp @@ -0,0 +1,57 @@ +/* + * Tenant.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/SystemData.h" +#include "fdbclient/Tenant.h" +#include "flow/UnitTest.h" + +TEST_CASE("/fdbclient/TenantMapEntry/Serialization") { + TenantMapEntry entry1(1, ""_sr); + ASSERT(entry1.prefix == "\x00\x00\x00\x00\x00\x00\x00\x01"_sr); + TenantMapEntry entry2 = decodeTenantEntry(encodeTenantEntry(entry1)); + ASSERT(entry1.id == entry2.id && entry1.prefix == entry2.prefix); + + TenantMapEntry entry3(std::numeric_limits::max(), "foo"_sr); + ASSERT(entry3.prefix == "foo\xfe\xff\xff\xff\xff\xff\xff\xff"_sr); + TenantMapEntry entry4 = decodeTenantEntry(encodeTenantEntry(entry3)); + ASSERT(entry3.id == entry4.id && entry3.prefix == entry4.prefix); + + for (int i = 0; i < 100; ++i) { + int bits = deterministicRandom()->randomInt(1, 64); + int64_t min = bits == 1 ? 
0 : (1 << (bits - 1)); + int64_t maxPlusOne = std::min(1 << bits, std::numeric_limits::max()); + int64_t id = deterministicRandom()->randomInt64(min, maxPlusOne); + + int subspaceLength = deterministicRandom()->randomInt(0, 20); + Standalone subspace = makeString(subspaceLength); + generateRandomData(mutateString(subspace), subspaceLength); + + TenantMapEntry entry(id, subspace); + int64_t bigEndianId = bigEndian64(id); + ASSERT(entry.id == id && entry.prefix.startsWith(subspace) && + entry.prefix.endsWith(StringRef(reinterpret_cast(&bigEndianId), 8)) && + entry.prefix.size() == subspaceLength + 8); + + TenantMapEntry decodedEntry = decodeTenantEntry(encodeTenantEntry(entry)); + ASSERT(decodedEntry.id = entry.id && decodedEntry.prefix == entry.prefix); + } + + return Void(); +} diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index f548fba0b5..47a17ab10c 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -270,6 +270,7 @@ set(FDBSERVER_SRCS workloads/TagThrottleApi.actor.cpp workloads/TargetedKill.actor.cpp workloads/TaskBucketCorrectness.actor.cpp + workloads/TenantManagement.actor.cpp workloads/ThreadSafety.actor.cpp workloads/Throttling.actor.cpp workloads/Throughput.actor.cpp diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 025ff1ae2d..95894502fe 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -1786,7 +1786,8 @@ void setupSimulatedSystem(std::vector>* systemActors, Standalone* pStartingConfiguration, std::string whitelistBinPaths, TestConfig testConfig, - ProtocolVersion protocolVersion) { + ProtocolVersion protocolVersion, + bool tenantModeRequired) { // SOMEDAY: this does not test multi-interface configurations SimulationConfig simconfig(testConfig); if (testConfig.logAntiQuorum != -1) { @@ -1839,6 +1840,10 @@ void setupSimulatedSystem(std::vector>* systemActors, startingConfigString += " " + g_simulator.originalRegions; } + if 
(tenantModeRequired) { + startingConfigString += " tenant_mode=required"; + } + g_simulator.storagePolicy = simconfig.db.storagePolicy; g_simulator.tLogPolicy = simconfig.db.tLogPolicy; g_simulator.tLogWriteAntiQuorum = simconfig.db.tLogWriteAntiQuorum; @@ -1884,7 +1889,7 @@ void setupSimulatedSystem(std::vector>* systemActors, ASSERT(g_simulator.storagePolicy && g_simulator.tLogPolicy); ASSERT(!g_simulator.hasSatelliteReplication || g_simulator.satelliteTLogPolicy); - TraceEvent("SimulatorConfig").detail("ConfigString", StringRef(startingConfigString)); + TraceEvent("SimulatorConfig").setMaxFieldLength(10000).detail("ConfigString", StringRef(startingConfigString)); const int dataCenters = simconfig.datacenters; const int machineCount = simconfig.machine_count; @@ -2267,6 +2272,7 @@ ACTOR void setupAndRun(std::string dataFolder, state Standalone startingConfiguration; state int testerCount = 1; state TestConfig testConfig; + state bool allowDefaultTenant = true; testConfig.readFromConfig(testFile); g_simulator.hasDiffProtocolProcess = testConfig.startIncompatibleProcess; g_simulator.setDiffProtocol = false; @@ -2276,6 +2282,10 @@ ACTOR void setupAndRun(std::string dataFolder, // https://github.com/apple/foundationdb/issues/5155 if (std::string_view(testFile).find("restarting") != std::string_view::npos) { testConfig.storageEngineExcludeTypes.push_back(4); + + // Disable the default tenant in restarting tests for now + // TODO: persist the chosen default tenant in the restartInfo.ini file for the second test + allowDefaultTenant = false; } // TODO: Currently backup and restore related simulation tests are failing when run with rocksDB storage engine @@ -2319,6 +2329,19 @@ ACTOR void setupAndRun(std::string dataFolder, FlowTransport::createInstance(true, 1, WLTOKEN_RESERVED_COUNT); TEST(true); // Simulation start + state Optional defaultTenant; + state bool requireTenants = false; + if (allowDefaultTenant && deterministicRandom()->random01() < 1.0) { + defaultTenant 
= "SimulatedDefaultTenant"_sr; + if (deterministicRandom()->random01() < 0.9) { + requireTenants = true; + } + } + + TraceEvent("SimulatedClusterTenantMode") + .detail("UsingTenant", defaultTenant) + .detail("TenantRequired", requireTenants); + try { // systemActors.push_back( startSystemMonitor(dataFolder) ); if (rebooting) { @@ -2344,7 +2367,8 @@ ACTOR void setupAndRun(std::string dataFolder, &startingConfiguration, whitelistBinPaths, testConfig, - protocolVersion); + protocolVersion, + requireTenants); wait(delay(1.0)); // FIXME: WHY!!! //wait for machines to boot } std::string clusterFileDir = joinPath(dataFolder, deterministicRandom()->randomUniqueID().toString()); @@ -2355,7 +2379,10 @@ ACTOR void setupAndRun(std::string dataFolder, TEST_ON_TESTERS, testerCount, testFile, - startingConfiguration), + startingConfiguration, + LocalityData(), + UnitTestParameters(), + defaultTenant), isBuggifyEnabled(BuggifyType::General) ? 36000.0 : 5400.0)); } catch (Error& e) { TraceEvent(SevError, "SetupAndRunError").error(e); diff --git a/fdbserver/TesterInterface.actor.h b/fdbserver/TesterInterface.actor.h index 0e73217a10..b50a2223fe 100644 --- a/fdbserver/TesterInterface.actor.h +++ b/fdbserver/TesterInterface.actor.h @@ -66,6 +66,7 @@ struct WorkloadRequest { double databasePingDelay; int64_t sharedRandomNumber; bool useDatabase; + Optional defaultTenant; // The vector of option lists are to construct compound workloads. If there // is only one workload to be run...pass just one list of options! 
@@ -96,6 +97,7 @@ struct WorkloadRequest { clientId, clientCount, reply, + defaultTenant, arena); } }; @@ -127,7 +129,8 @@ ACTOR Future runTests(Reference connRecord, std::string fileName = std::string(), StringRef startingConfiguration = StringRef(), LocalityData locality = LocalityData(), - UnitTestParameters testOptions = UnitTestParameters()); + UnitTestParameters testOptions = UnitTestParameters(), + Optional defaultTenant = Optional()); #include "flow/unactorcompiler.h" #endif diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index b7af32cf1b..dff58e03eb 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -642,6 +642,7 @@ ACTOR Future testerServerWorkload(WorkloadRequest work, if (work.useDatabase) { cx = Database::createDatabase(ccr, -1, IsInternal::True, locality); + cx->defaultTenant = work.defaultTenant.castTo(); wait(delay(1.0)); } @@ -779,7 +780,10 @@ void throwIfError(const std::vector>>& futures, std::string er } } -ACTOR Future runWorkload(Database cx, std::vector testers, TestSpec spec) { +ACTOR Future runWorkload(Database cx, + std::vector testers, + TestSpec spec, + Optional defaultTenant) { TraceEvent("TestRunning") .detail("WorkloadTitle", spec.title) .detail("TesterCount", testers.size()) @@ -803,6 +807,7 @@ ACTOR Future runWorkload(Database cx, std::vector(); workRequests.push_back(testers[i].recruitments.getReply(req)); } @@ -894,7 +899,7 @@ ACTOR Future changeConfiguration(Database cx, std::vector options.push_back_deep(options.arena(), KeyValueRef(LiteralStringRef("configMode"), configMode)); spec.options.push_back_deep(spec.options.arena(), options); - DistributedTestResults testResults = wait(runWorkload(cx, testers, spec)); + DistributedTestResults testResults = wait(runWorkload(cx, testers, spec, Optional())); return Void(); } @@ -949,7 +954,7 @@ ACTOR Future checkConsistency(Database cx, state double start = now(); state bool lastRun = false; loop { - DistributedTestResults testResults = 
wait(runWorkload(cx, testers, spec)); + DistributedTestResults testResults = wait(runWorkload(cx, testers, spec, Optional())); if (testResults.ok() || lastRun) { if (g_network->isSimulated()) { g_simulator.connectionFailuresDisableDuration = connectionFailures; @@ -969,11 +974,12 @@ ACTOR Future checkConsistency(Database cx, ACTOR Future runTest(Database cx, std::vector testers, TestSpec spec, - Reference> dbInfo) { + Reference> dbInfo, + Optional defaultTenant) { state DistributedTestResults testResults; try { - Future fTestResults = runWorkload(cx, testers, spec); + Future fTestResults = runWorkload(cx, testers, spec, defaultTenant); if (spec.timeout > 0) { fTestResults = timeoutError(fTestResults, spec.timeout); } @@ -1418,7 +1424,8 @@ ACTOR Future runTests(Reference testers, std::vector tests, StringRef startingConfiguration, - LocalityData locality) { + LocalityData locality, + Optional defaultTenant) { state Database cx; state Reference> dbInfo(new AsyncVar); state Future ccMonitor = monitorServerDBInfo(cc, LocalityData(), dbInfo); // FIXME: locality @@ -1466,6 +1473,7 @@ ACTOR Future runTests(ReferencedefaultTenant = defaultTenant; } state Future disabler = disableConnectionFailuresAfter(FLOW_KNOBS->SIM_SPEEDUP_AFTER_SECONDS, "Tester"); @@ -1493,6 +1501,10 @@ ACTOR Future runTests(Reference runTests(Reference runTests(Reference defaultTenant) { state int flags = (at == TEST_ON_SERVERS ? 
0 : GetWorkersRequest::TESTER_CLASS_ONLY) | GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY; state Future testerTimeout = delay(600.0); // wait 600 sec for testers to show up @@ -1599,7 +1612,7 @@ ACTOR Future runTests(Reference runTests(Reference connRecord, std::string fileName, StringRef startingConfiguration, LocalityData locality, - UnitTestParameters testOptions) { + UnitTestParameters testOptions, + Optional defaultTenant) { state std::vector testSpecs; auto cc = makeReference>>(); auto ci = makeReference>>(); @@ -1718,10 +1732,11 @@ ACTOR Future runTests(Reference connRecord, actors.push_back( reportErrors(monitorServerDBInfo(cc, LocalityData(), db), "MonitorServerDBInfo")); // FIXME: Locality actors.push_back(reportErrors(testerServerCore(iTesters[0], connRecord, db, locality), "TesterServerCore")); - tests = runTests(cc, ci, iTesters, testSpecs, startingConfiguration, locality); + tests = runTests(cc, ci, iTesters, testSpecs, startingConfiguration, locality, defaultTenant); } else { - tests = reportErrors(runTests(cc, ci, testSpecs, at, minTestersExpected, startingConfiguration, locality), - "RunTests"); + tests = reportErrors( + runTests(cc, ci, testSpecs, at, minTestersExpected, startingConfiguration, locality, defaultTenant), + "RunTests"); } choose { diff --git a/fdbserver/workloads/FuzzApiCorrectness.actor.cpp b/fdbserver/workloads/FuzzApiCorrectness.actor.cpp index 8866eacb20..dc002e48dd 100644 --- a/fdbserver/workloads/FuzzApiCorrectness.actor.cpp +++ b/fdbserver/workloads/FuzzApiCorrectness.actor.cpp @@ -24,6 +24,7 @@ #include #include "fdbserver/TesterInterface.actor.h" +#include "fdbclient/GenericManagementAPI.actor.h" #include "fdbclient/ThreadSafeTransaction.h" #include "flow/ActorCollection.h" #include "fdbserver/workloads/workloads.actor.h" @@ -122,6 +123,12 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { bool success; + Reference db; + + std::vector> tenants; + std::set createdTenants; + int numTenants; + 
FuzzApiCorrectnessWorkload(WorkloadContext const& wcx) : TestWorkload(wcx), operationId(0), success(true) { std::call_once(onceFlag, [&]() { addTestCases(); }); @@ -138,6 +145,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { // Only enable special keys writes when allowed to access system keys specialKeysWritesEnabled = useSystemKeys && deterministicRandom()->coinflip(); + int maxTenants = getOption(options, "numTenants"_sr, 4); + numTenants = deterministicRandom()->randomInt(0, maxTenants + 1); + // See https://github.com/apple/foundationdb/issues/2424 if (BUGGIFY) { enableBuggify(true, BuggifyType::Client); @@ -187,9 +197,39 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { std::string description() const override { return "FuzzApiCorrectness"; } + static TenantName getTenant(int num) { return TenantNameRef(format("tenant_%d", num)); } + bool hasTenant(Optional tenant) { return !tenant.present() || createdTenants.count(tenant.get()); } + + Future setup(Database const& cx) override { + if (clientId == 0) { + return _setup(cx, this); + } + return Void(); + } + + ACTOR Future _setup(Database cx, FuzzApiCorrectnessWorkload* self) { + Reference db = wait(unsafeThreadFutureToFuture(ThreadSafeDatabase::createFromExistingDatabase(cx))); + self->db = db; + + std::vector> tenantFutures; + for (int i = 0; i < self->numTenants + 1; ++i) { + TenantName tenantName = getTenant(i); + self->tenants.push_back(self->db->openTenant(tenantName)); + + // The last tenant will not be created + if (i < self->numTenants) { + tenantFutures.push_back(ManagementAPI::createTenant(cx.getReference(), tenantName)); + self->createdTenants.insert(tenantName); + } + } + + wait(waitForAll(tenantFutures)); + return Void(); + } + Future start(Database const& cx) override { if (clientId == 0) { - return loadAndRun(cx, this); + return loadAndRun(this); } return Void(); } @@ -217,65 +257,70 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { // m.push_back( retries.getMetric() ); } - ACTOR 
Future loadAndRun(Database db, FuzzApiCorrectnessWorkload* self) { + ACTOR Future loadAndRun(FuzzApiCorrectnessWorkload* self) { state double startTime = now(); - state Reference cx = - wait(unsafeThreadFutureToFuture(ThreadSafeDatabase::createFromExistingDatabase(db))); + state int nodesPerTenant = self->nodes / (self->numTenants + 1); + state int keysPerBatch = + std::min(1000, + 1 + CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT / 2 / + (self->getKeyForIndex(nodesPerTenant).size() + self->valueSizeRange.second)); try { loop { - state int i = 0; - state int keysPerBatch = - std::min(1000, - 1 + CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT / 2 / - (self->getKeyForIndex(self->nodes).size() + self->valueSizeRange.second)); - for (; i < self->nodes; i += keysPerBatch) { - state Reference tr = cx->createTransaction(); - loop { - if (now() - startTime > self->testDuration) - return Void(); - try { - if (i == 0) { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->addWriteConflictRange( - allKeys); // To prevent a write only transaction whose commit was previously - // cancelled from being reordered after this transaction - tr->clear(normalKeys); - } - if (self->useSystemKeys) - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - if (self->specialKeysRelaxed) - tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_RELAXED); - if (self->specialKeysWritesEnabled) - tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + state int tenantNum = -1; + for (; tenantNum < self->numTenants; ++tenantNum) { + state int i = 0; + for (; i < nodesPerTenant; i += keysPerBatch) { + state Reference tr = + tenantNum < 0 ? 
self->db->createTransaction() : self->tenants[i]->createTransaction(); + loop { + if (now() - startTime > self->testDuration) + return Void(); + try { + if (i == 0) { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - int end = std::min(self->nodes, i + keysPerBatch); - tr->clear(KeyRangeRef(self->getKeyForIndex(i), self->getKeyForIndex(end))); + // To prevent a write only transaction whose commit was previously + // cancelled from being reordered after this transaction + tr->addWriteConflictRange(allKeys); - for (int j = i; j < end; j++) { - if (deterministicRandom()->random01() < self->initialKeyDensity) { - Key key = self->getKeyForIndex(j); - if (key.size() <= (key.startsWith(systemKeys.begin) - ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT - : CLIENT_KNOBS->KEY_SIZE_LIMIT)) { - Value value = self->getRandomValue(); - value = value.substr( - 0, std::min(value.size(), CLIENT_KNOBS->VALUE_SIZE_LIMIT)); - tr->set(key, value); + tr->clear(normalKeys); + } + if (self->useSystemKeys) + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + if (self->specialKeysRelaxed) + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_RELAXED); + if (self->specialKeysWritesEnabled) + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + + int end = std::min(nodesPerTenant, i + keysPerBatch); + tr->clear(KeyRangeRef(self->getKeyForIndex(i), self->getKeyForIndex(end))); + + for (int j = i; j < end; j++) { + if (deterministicRandom()->random01() < self->initialKeyDensity) { + Key key = self->getKeyForIndex(j); + if (key.size() <= (key.startsWith(systemKeys.begin) + ? 
CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT + : CLIENT_KNOBS->KEY_SIZE_LIMIT)) { + Value value = self->getRandomValue(); + value = value.substr( + 0, std::min(value.size(), CLIENT_KNOBS->VALUE_SIZE_LIMIT)); + tr->set(key, value); + } } } + wait(unsafeThreadFutureToFuture(tr->commit())); + //TraceEvent("WDRInitBatch").detail("I", i).detail("CommittedVersion", tr->getCommittedVersion()); + break; + } catch (Error& e) { + wait(unsafeThreadFutureToFuture(tr->onError(e))); } - wait(unsafeThreadFutureToFuture(tr->commit())); - //TraceEvent("WDRInitBatch").detail("I", i).detail("CommittedVersion", tr->getCommittedVersion()); - break; - } catch (Error& e) { - wait(unsafeThreadFutureToFuture(tr->onError(e))); } } } loop { try { - wait(self->randomTransaction(cx, self) && delay(self->numOps * .001)); + wait(self->randomTransaction(self) && delay(self->numOps * .001)); } catch (Error& e) { if (e.code() != error_code_not_committed) throw e; @@ -291,13 +336,20 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { } } - ACTOR Future randomTransaction(Reference cx, FuzzApiCorrectnessWorkload* self) { - state Reference tr = cx->createTransaction(); + ACTOR Future randomTransaction(FuzzApiCorrectnessWorkload* self) { + state Reference tr; state bool readYourWritesDisabled = deterministicRandom()->coinflip(); state bool readAheadDisabled = deterministicRandom()->coinflip(); state std::vector> operations; state int waitLocation = 0; + int tenantNum = deterministicRandom()->randomInt(-1, self->tenants.size()); + if (tenantNum == -1) { + tr = self->db->createTransaction(); + } else { + tr = self->tenants[tenantNum]->createTransaction(); + } + loop { state bool cancelled = false; if (readYourWritesDisabled) @@ -380,7 +432,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { typedef T value_type; ACTOR static Future runTest(unsigned int id, FuzzApiCorrectnessWorkload* wl, Reference tr) { - state Subclass self(id, wl); + state Subclass self(id, wl, tr); try { value_type result = 
wait(timeoutError(BaseTest::runTest2(tr, &self), 1000)); @@ -588,7 +640,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { typedef BaseTest base_type; Version v; - TestSetVersion(unsigned int id, FuzzApiCorrectnessWorkload* workload) + TestSetVersion(unsigned int id, FuzzApiCorrectnessWorkload* workload, Reference tr) : BaseTest(id, workload, "TestSetVersion") { if (deterministicRandom()->coinflip()) v = deterministicRandom()->randomInt64(INT64_MIN, 0); @@ -620,7 +672,8 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { typedef BaseTest> base_type; Key key; - TestGet(unsigned int id, FuzzApiCorrectnessWorkload* workload) : BaseTest(id, workload, "TestGet") { + TestGet(unsigned int id, FuzzApiCorrectnessWorkload* workload, Reference tr) + : BaseTest(id, workload, "TestGet") { key = makeKey(); contract = { std::make_pair(error_code_key_outside_legal_range, @@ -641,7 +694,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { ExceptionContract::possibleIf( key == LiteralStringRef("auto_coordinators") - .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))) + .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))), + std::make_pair(error_code_tenant_not_found, + ExceptionContract::requiredIf(!workload->hasTenant(tr->getTenant()))) }; } @@ -659,14 +714,17 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { typedef BaseTest base_type; KeySelector keysel; - TestGetKey(unsigned int id, FuzzApiCorrectnessWorkload* workload) : BaseTest(id, workload, "TestGetKey") { + TestGetKey(unsigned int id, FuzzApiCorrectnessWorkload* workload, Reference tr) + : BaseTest(id, workload, "TestGetKey") { keysel = makeKeySel(); contract = { std::make_pair( error_code_key_outside_legal_range, ExceptionContract::requiredIf( (keysel.getKey() > (workload->useSystemKeys ? 
systemKeys.end : normalKeys.end)))), std::make_pair(error_code_client_invalid_operation, ExceptionContract::Possible), - std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible) }; + std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), + std::make_pair(error_code_tenant_not_found, + ExceptionContract::requiredIf(!workload->hasTenant(tr->getTenant()))) }; } ThreadFuture createFuture(Reference tr) override { @@ -684,7 +742,8 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { KeySelector keysel1, keysel2; int limit; - TestGetRange0(unsigned int id, FuzzApiCorrectnessWorkload* workload) : BaseTest(id, workload, "TestGetRange0") { + TestGetRange0(unsigned int id, FuzzApiCorrectnessWorkload* workload, Reference tr) + : BaseTest(id, workload, "TestGetRange0") { keysel1 = makeKeySel(); keysel2 = makeKeySel(); limit = 0; @@ -715,7 +774,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { // Read some special keys, e.g. status/json, can throw timed_out std::make_pair(error_code_timed_out, ExceptionContract::possibleIf(isSpecialKeyRange)), std::make_pair(error_code_special_keys_api_failure, ExceptionContract::possibleIf(isSpecialKeyRange)), - std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible) + std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), + std::make_pair(error_code_tenant_not_found, + ExceptionContract::requiredIf(!workload->hasTenant(tr->getTenant()))) }; } @@ -735,7 +796,8 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { KeySelector keysel1, keysel2; GetRangeLimits limits; - TestGetRange1(unsigned int id, FuzzApiCorrectnessWorkload* workload) : BaseTest(id, workload, "TestGetRange1") { + TestGetRange1(unsigned int id, FuzzApiCorrectnessWorkload* workload, Reference tr) + : BaseTest(id, workload, "TestGetRange1") { keysel1 = makeKeySel(); keysel2 = makeKeySel(); limits = makeRangeLimits(); @@ -758,7 +820,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { 
ExceptionContract::possibleIf(isSpecialKeyRange && !workload->specialKeysRelaxed)), std::make_pair(error_code_timed_out, ExceptionContract::possibleIf(isSpecialKeyRange)), std::make_pair(error_code_special_keys_api_failure, ExceptionContract::possibleIf(isSpecialKeyRange)), - std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible) + std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), + std::make_pair(error_code_tenant_not_found, + ExceptionContract::requiredIf(!workload->hasTenant(tr->getTenant()))) }; } @@ -781,7 +845,8 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { Key key1, key2; int limit; - TestGetRange2(unsigned int id, FuzzApiCorrectnessWorkload* workload) : BaseTest(id, workload, "TestGetRange2") { + TestGetRange2(unsigned int id, FuzzApiCorrectnessWorkload* workload, Reference tr) + : BaseTest(id, workload, "TestGetRange2") { key1 = makeKey(); key2 = makeKey(); limit = 0; @@ -821,7 +886,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { std::make_pair(error_code_special_keys_api_failure, ExceptionContract::possibleIf(key1 <= autoCoordinatorSpecialKey && autoCoordinatorSpecialKey < key2)), - std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible) + std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), + std::make_pair(error_code_tenant_not_found, + ExceptionContract::requiredIf(!workload->hasTenant(tr->getTenant()))) }; } @@ -841,7 +908,8 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { Key key1, key2; GetRangeLimits limits; - TestGetRange3(unsigned int id, FuzzApiCorrectnessWorkload* workload) : BaseTest(id, workload, "TestGetRange3") { + TestGetRange3(unsigned int id, FuzzApiCorrectnessWorkload* workload, Reference tr) + : BaseTest(id, workload, "TestGetRange3") { key1 = makeKey(); key2 = makeKey(); limits = makeRangeLimits(); @@ -872,7 +940,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { std::make_pair(error_code_special_keys_api_failure, 
ExceptionContract::possibleIf((key1 <= autoCoordinatorSpecialKey) && (autoCoordinatorSpecialKey < key2))), - std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible) + std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), + std::make_pair(error_code_tenant_not_found, + ExceptionContract::requiredIf(!workload->hasTenant(tr->getTenant()))) }; } @@ -894,10 +964,12 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { typedef BaseTest>> base_type; Key key; - TestGetAddressesForKey(unsigned int id, FuzzApiCorrectnessWorkload* workload) + TestGetAddressesForKey(unsigned int id, FuzzApiCorrectnessWorkload* workload, Reference tr) : BaseTest(id, workload, "TestGetAddressesForKey") { key = makeKey(); - contract = { std::make_pair(error_code_client_invalid_operation, ExceptionContract::Possible) }; + contract = { std::make_pair(error_code_client_invalid_operation, ExceptionContract::Possible), + std::make_pair(error_code_tenant_not_found, + ExceptionContract::requiredIf(!workload->hasTenant(tr->getTenant()))) }; } ThreadFuture createFuture(Reference tr) override { @@ -914,7 +986,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { typedef BaseTest base_type; Key key1, key2; - TestAddReadConflictRange(unsigned int id, FuzzApiCorrectnessWorkload* workload) + TestAddReadConflictRange(unsigned int id, FuzzApiCorrectnessWorkload* workload, Reference tr) : BaseTestCallback(id, workload, "TestAddReadConflictRange") { key1 = makeKey(); key2 = makeKey(); @@ -940,7 +1012,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { uint8_t op; int32_t pos; - TestAtomicOp(unsigned int id, FuzzApiCorrectnessWorkload* workload) + TestAtomicOp(unsigned int id, FuzzApiCorrectnessWorkload* workload, Reference tr) : BaseTestCallback(id, workload, "TestAtomicOp") { key = makeKey(); while (isProtectedKey(key)) { @@ -1013,7 +1085,8 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { Key key; Value value; - TestSet(unsigned int id, 
FuzzApiCorrectnessWorkload* workload) : BaseTestCallback(id, workload, "TestSet") { + TestSet(unsigned int id, FuzzApiCorrectnessWorkload* workload, Reference tr) + : BaseTestCallback(id, workload, "TestSet") { key = makeKey(); while (isProtectedKey(key)) { key = makeKey(); @@ -1050,7 +1123,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { typedef BaseTest base_type; Key key1, key2; - TestClear0(unsigned int id, FuzzApiCorrectnessWorkload* workload) + TestClear0(unsigned int id, FuzzApiCorrectnessWorkload* workload, Reference tr) : BaseTestCallback(id, workload, "TestClear0") { key1 = makeKey(); key2 = makeKey(); @@ -1089,7 +1162,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { typedef BaseTest base_type; Key key1, key2; - TestClear1(unsigned int id, FuzzApiCorrectnessWorkload* workload) + TestClear1(unsigned int id, FuzzApiCorrectnessWorkload* workload, Reference tr) : BaseTestCallback(id, workload, "TestClear1") { key1 = makeKey(); key2 = makeKey(); @@ -1128,7 +1201,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { typedef BaseTest base_type; Key key; - TestClear2(unsigned int id, FuzzApiCorrectnessWorkload* workload) + TestClear2(unsigned int id, FuzzApiCorrectnessWorkload* workload, Reference tr) : BaseTestCallback(id, workload, "TestClear2") { key = makeKey(); while (isProtectedKey(key)) { @@ -1157,7 +1230,8 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { typedef BaseTest base_type; Key key; - TestWatch(unsigned int id, FuzzApiCorrectnessWorkload* workload) : BaseTest(id, workload, "TestWatch") { + TestWatch(unsigned int id, FuzzApiCorrectnessWorkload* workload, Reference tr) + : BaseTest(id, workload, "TestWatch") { key = makeKey(); contract = { std::make_pair( error_code_key_too_large, @@ -1170,7 +1244,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { (key >= (workload->useSystemKeys ? 
systemKeys.end : normalKeys.end)))), std::make_pair(error_code_client_invalid_operation, ExceptionContract::Possible), std::make_pair(error_code_timed_out, ExceptionContract::Possible), - std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible) }; + std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), + std::make_pair(error_code_tenant_not_found, + ExceptionContract::requiredIf(!workload->hasTenant(tr->getTenant()))) }; } ThreadFuture createFuture(Reference tr) override { return tr->watch(key); } @@ -1185,7 +1261,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { typedef BaseTest base_type; Key key1, key2; - TestAddWriteConflictRange(unsigned int id, FuzzApiCorrectnessWorkload* workload) + TestAddWriteConflictRange(unsigned int id, FuzzApiCorrectnessWorkload* workload, Reference tr) : BaseTestCallback(id, workload, "TestAddWriteConflictRange") { key1 = makeKey(); key2 = makeKey(); @@ -1209,7 +1285,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { int op; Optional> val; - TestSetOption(unsigned int id, FuzzApiCorrectnessWorkload* workload) + TestSetOption(unsigned int id, FuzzApiCorrectnessWorkload* workload, Reference tr) : BaseTestCallback(id, workload, "TestSetOption") { double arv = deterministicRandom()->random01(); if (arv < 0.25) { @@ -1293,7 +1369,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { typedef BaseTest base_type; int errorcode; - TestOnError(unsigned int id, FuzzApiCorrectnessWorkload* workload) + TestOnError(unsigned int id, FuzzApiCorrectnessWorkload* workload, Reference tr) : BaseTestCallback(id, workload, "TestOnError") { errorcode = 0; double erv = deterministicRandom()->random01(); diff --git a/fdbserver/workloads/Performance.actor.cpp b/fdbserver/workloads/Performance.actor.cpp index 77d7a45f43..ff11124cef 100644 --- a/fdbserver/workloads/Performance.actor.cpp +++ b/fdbserver/workloads/Performance.actor.cpp @@ -137,7 +137,7 @@ struct PerformanceWorkload : TestWorkload { TestSpec 
spec(LiteralStringRef("PerformanceSetup"), false, false); spec.options = options; spec.phases = TestWorkload::SETUP; - DistributedTestResults results = wait(runWorkload(cx, testers, spec)); + DistributedTestResults results = wait(runWorkload(cx, testers, spec, Optional())); return Void(); } @@ -172,7 +172,7 @@ struct PerformanceWorkload : TestWorkload { TestSpec spec(LiteralStringRef("PerformanceRun"), false, false); spec.phases = TestWorkload::EXECUTION | TestWorkload::METRICS; spec.options = options; - DistributedTestResults r = wait(runWorkload(cx, self->testers, spec)); + DistributedTestResults r = wait(runWorkload(cx, self->testers, spec, Optional())); results = r; } catch (Error& e) { TraceEvent("PerformanceRunError") diff --git a/fdbserver/workloads/TenantManagement.actor.cpp b/fdbserver/workloads/TenantManagement.actor.cpp new file mode 100644 index 0000000000..59e78dea85 --- /dev/null +++ b/fdbserver/workloads/TenantManagement.actor.cpp @@ -0,0 +1,347 @@ +/* + * TenantManagement.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/GenericManagementAPI.actor.h" +#include "fdbrpc/simulator.h" +#include "fdbserver/workloads/workloads.actor.h" +#include "fdbserver/Knobs.h" +#include "flow/Error.h" +#include "flow/IRandom.h" +#include "flow/flow.h" +#include "flow/actorcompiler.h" // This must be the last #include. + +struct TenantManagementWorkload : TestWorkload { + struct TenantState { + int64_t id; + bool empty; + + TenantState() : id(-1), empty(true) {} + TenantState(int64_t id, bool empty) : id(id), empty(empty) {} + }; + + std::map createdTenants; + int64_t maxId = -1; + Key tenantSubspace; + + const Key keyName = "key"_sr; + const Key tenantSubspaceKey = "tenant_subspace"_sr; + const Value noTenantValue = "no_tenant"_sr; + + int maxTenants; + double testDuration; + + TenantManagementWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { + maxTenants = getOption(options, "maxTenants"_sr, 1000); + testDuration = getOption(options, "testDuration"_sr, 60.0); + } + + std::string description() const override { return "TenantManagement"; } + + Future setup(Database const& cx) override { return _setup(cx, this); } + ACTOR Future _setup(Database cx, TenantManagementWorkload* self) { + state Transaction tr(cx); + if (self->clientId == 0) { + self->tenantSubspace = makeString(deterministicRandom()->randomInt(0, 10)); + generateRandomData(mutateString(self->tenantSubspace), self->tenantSubspace.size()); + loop { + try { + tr.setOption(FDBTransactionOptions::RAW_ACCESS); + tr.set(self->keyName, self->noTenantValue); + tr.set(self->tenantSubspaceKey, self->tenantSubspace); + wait(tr.commit()); + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + } else { + loop { + try { + tr.setOption(FDBTransactionOptions::RAW_ACCESS); + Optional val = wait(tr.get(self->tenantSubspaceKey)); + if (val.present()) { + self->tenantSubspace = val.get(); + break; + } + + wait(delay(1.0)); + } catch (Error& e) { + 
wait(tr.onError(e)); + } + } + } + + return Void(); + } + + TenantName chooseTenantName() { + TenantName tenant(format("tenant_%d_%d", clientId, deterministicRandom()->randomInt(0, maxTenants))); + if (deterministicRandom()->random01() < 0.02) { + tenant = tenant.withPrefix("\xff"_sr); + } + + return tenant; + } + + ACTOR Future createTenant(Database cx, TenantManagementWorkload* self) { + state TenantName tenant = self->chooseTenantName(); + state bool alreadyExists = self->createdTenants.count(tenant); + try { + wait(ManagementAPI::createTenant(cx.getReference(), tenant)); + ASSERT(!alreadyExists); + ASSERT(!tenant.startsWith("\xff"_sr)); + + state Optional entry = wait(ManagementAPI::tryGetTenant(cx.getReference(), tenant)); + ASSERT(entry.present()); + ASSERT(entry.get().id > self->maxId); + ASSERT(entry.get().prefix.startsWith(self->tenantSubspace)); + + self->maxId = entry.get().id; + + state bool insertData = deterministicRandom()->random01() < 0.5; + if (insertData) { + state Transaction tr(cx, tenant); + loop { + try { + tr.set(self->keyName, tenant); + wait(tr.commit()); + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + + tr = Transaction(cx); + loop { + try { + tr.setOption(FDBTransactionOptions::RAW_ACCESS); + Optional val = wait(tr.get(self->keyName.withPrefix(entry.get().prefix))); + ASSERT(val.present()); + ASSERT(val.get() == tenant); + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + } + + self->createdTenants[tenant] = TenantState(entry.get().id, insertData); + } catch (Error& e) { + if (e.code() == error_code_tenant_already_exists) { + ASSERT(alreadyExists); + } else if (e.code() == error_code_invalid_tenant_name) { + ASSERT(tenant.startsWith("\xff"_sr)); + } else { + TraceEvent(SevError, "CreateTenantFailure").detail("TenantName", tenant).error(e); + } + } + + return Void(); + } + + ACTOR Future deleteTenant(Database cx, TenantManagementWorkload* self) { + state TenantName tenant = self->chooseTenantName(); + + auto 
itr = self->createdTenants.find(tenant); + state bool alreadyExists = itr != self->createdTenants.end(); + state bool isEmpty = (itr == self->createdTenants.end() || itr->second.empty); + + try { + if (alreadyExists && deterministicRandom()->random01() < 0.5) { + state Transaction tr(cx, tenant); + loop { + try { + tr.clear(self->keyName); + wait(tr.commit()); + isEmpty = true; + auto itr = self->createdTenants.find(tenant); + ASSERT(itr != self->createdTenants.end()); + itr->second.empty = true; + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + } + + wait(ManagementAPI::deleteTenant(cx.getReference(), tenant)); + ASSERT(alreadyExists); + ASSERT(isEmpty); + self->createdTenants.erase(tenant); + } catch (Error& e) { + if (e.code() == error_code_tenant_not_found) { + ASSERT(!alreadyExists); + } else if (e.code() == error_code_tenant_not_empty) { + ASSERT(!isEmpty); + } else { + TraceEvent(SevError, "DeleteTenantFailure").detail("TenantName", tenant).error(e); + } + } + + return Void(); + } + + ACTOR Future checkTenant(Database cx, + TenantManagementWorkload* self, + TenantName tenant, + TenantState tenantState) { + state Transaction tr(cx, tenant); + loop { + try { + RangeResult result = wait(tr.getRange(KeyRangeRef(""_sr, "\xff"_sr), 2)); + if (tenantState.empty) { + ASSERT(result.size() == 0); + } else { + ASSERT(result.size() == 1); + ASSERT(result[0].key == self->keyName); + ASSERT(result[0].value == tenant); + } + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + + return Void(); + } + + ACTOR Future getTenant(Database cx, TenantManagementWorkload* self) { + state TenantName tenant = self->chooseTenantName(); + auto itr = self->createdTenants.find(tenant); + state bool alreadyExists = itr != self->createdTenants.end(); + state TenantState tenantState = itr->second; + + try { + TenantMapEntry entry = wait(ManagementAPI::getTenant(cx.getReference(), tenant)); + ASSERT(alreadyExists); + ASSERT(entry.id == tenantState.id); + 
wait(self->checkTenant(cx, self, tenant, tenantState)); + } catch (Error& e) { + if (e.code() == error_code_tenant_not_found) { + ASSERT(!alreadyExists); + } else { + TraceEvent(SevError, "GetTenantFailure").detail("TenantName", tenant).error(e); + } + } + + return Void(); + } + + ACTOR Future listTenants(Database cx, TenantManagementWorkload* self) { + state TenantName beginTenant = self->chooseTenantName(); + state TenantName endTenant = self->chooseTenantName(); + state int limit = std::min(CLIENT_KNOBS->TOO_MANY, deterministicRandom()->randomInt(0, self->maxTenants * 2)); + + try { + Standalone> tenants = + wait(ManagementAPI::listTenants(cx.getReference(), beginTenant, endTenant, limit)); + + ASSERT(tenants.size() <= limit); + + int index = 0; + auto itr = self->createdTenants.begin(); + for (; index < tenants.size(); ++itr) { + ASSERT(itr != self->createdTenants.end()); + ASSERT(itr->first == tenants[index++]); + } + + ASSERT(tenants.size() == limit || itr == self->createdTenants.end()); + if (tenants.size() == limit) { + ASSERT(itr == self->createdTenants.end() || itr->first >= endTenant); + } + } catch (Error& e) { + TraceEvent(SevError, "ListTenantFailure") + .detail("BeginTenant", beginTenant) + .detail("EndTenant", endTenant) + .error(e); + } + + return Void(); + } + + Future start(Database const& cx) override { return _start(cx, this); } + ACTOR Future _start(Database cx, TenantManagementWorkload* self) { + state double start = now(); + while (now() < start + self->testDuration) { + state int operation = deterministicRandom()->randomInt(0, 4); + if (operation == 0) { + wait(self->createTenant(cx, self)); + } else if (operation == 1) { + wait(self->deleteTenant(cx, self)); + } else if (operation == 2) { + wait(self->getTenant(cx, self)); + } else { + wait(self->listTenants(cx, self)); + } + } + + return Void(); + } + + Future check(Database const& cx) override { return _check(cx, this); } + ACTOR Future _check(Database cx, TenantManagementWorkload* self) 
{ + state Transaction tr(cx); + + loop { + try { + tr.setOption(FDBTransactionOptions::RAW_ACCESS); + Optional val = wait(tr.get(self->keyName)); + ASSERT(val.present() && val.get() == self->noTenantValue); + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + + state std::map::iterator itr = self->createdTenants.begin(); + state std::vector> checkTenants; + state TenantName beginTenant = ""_sr; + + loop { + Standalone> tenants = + wait(ManagementAPI::listTenants(cx.getReference(), beginTenant, "\xff\xff"_sr, 1000)); + + for (auto tenant : tenants) { + ASSERT(!tenant.startsWith("\xff"_sr)); + ASSERT(tenant == itr->first); + checkTenants.push_back(self->checkTenant(cx, self, tenant, itr->second)); + ++itr; + } + + if (tenants.size() < 1000) { + break; + } else { + beginTenant = keyAfter(tenants[tenants.size() - 1]); + } + } + + ASSERT(itr == self->createdTenants.end()); + wait(waitForAll(checkTenants)); + + return true; + } + + void getMetrics(std::vector& m) override {} +}; + +WorkloadFactory TenantManagementWorkload("TenantManagement"); diff --git a/fdbserver/workloads/workloads.actor.h b/fdbserver/workloads/workloads.actor.h index 1770c7eb52..34e9111f68 100644 --- a/fdbserver/workloads/workloads.actor.h +++ b/fdbserver/workloads/workloads.actor.h @@ -207,7 +207,10 @@ public: ISimulator::BackupAgentType simDrAgents; }; -ACTOR Future runWorkload(Database cx, std::vector testers, TestSpec spec); +ACTOR Future runWorkload(Database cx, + std::vector testers, + TestSpec spec, + Optional defaultTenant); void logMetrics(std::vector metrics); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2c5f93c4f3..775f46c294 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -300,6 +300,8 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES slow/SwizzledDdBalance.toml) add_fdb_test(TEST_FILES slow/SwizzledRollbackTimeLapse.toml) add_fdb_test(TEST_FILES slow/SwizzledRollbackTimeLapseIncrement.toml) + add_fdb_test(TEST_FILES 
slow/SwizzledTenantManagement.toml) + add_fdb_test(TEST_FILES slow/TenantManagement.toml) add_fdb_test(TEST_FILES slow/VersionStampBackupToDB.toml) add_fdb_test(TEST_FILES slow/VersionStampSwitchover.toml) add_fdb_test(TEST_FILES slow/WriteDuringReadAtomicRestore.toml) diff --git a/tests/slow/SwizzledTenantManagement.toml b/tests/slow/SwizzledTenantManagement.toml new file mode 100644 index 0000000000..7c06d21208 --- /dev/null +++ b/tests/slow/SwizzledTenantManagement.toml @@ -0,0 +1,43 @@ +[[test]] +testTitle = 'TenantManagementTest' +clearAfterTest = true +timeout = 2100 +runSetup = true + + [[test.workload]] + testName = 'TenantManagement' + maxTenants = 1000 + testDuration = 60 + + [[test.workload]] + testName = 'RandomClogging' + testDuration = 120.0 + swizzle = 1 + + [[test.workload]] + testName = 'Rollback' + testDuration = 120.0 + meanDelay = 10.0 + + [[test.workload]] + testName = 'Attrition' + testDuration = 120.0 + + [[test.workload]] + testName = 'Attrition' + machinesToKill = 10 + machinesToLeave = 3 + reboot = true + testDuration = 120.0 + + [[test.workload]] + testName = 'Attrition' + machinesToKill = 10 + machinesToLeave = 3 + reboot = true + testDuration = 120.0 + + [[test.workload]] + testName = 'ChangeConfig' + maxDelayBeforeChange = 120.0 + coordinators = 'auto' diff --git a/tests/slow/TenantManagement.toml b/tests/slow/TenantManagement.toml new file mode 100644 index 0000000000..9dce2e2fba --- /dev/null +++ b/tests/slow/TenantManagement.toml @@ -0,0 +1,10 @@ +[[test]] +testTitle = 'TenantManagementTest' +clearAfterTest = true +timeout = 2100 +runSetup = true + + [[test.workload]] + testName = 'TenantManagement' + maxTenants = 1000 + testDuration = 60 From 592f31755e97dd896296548bd3e21322958ac992 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Fri, 25 Feb 2022 10:54:43 -0800 Subject: [PATCH 122/138] Fixes to the new tenant tests --- fdbserver/tester.actor.cpp | 1 + .../workloads/FuzzApiCorrectness.actor.cpp | 71 ++++++++++--------- .../workloads/TenantManagement.actor.cpp | 60 ++++++++++------ 3 files changed, 78 insertions(+), 54 deletions(-) diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index dff58e03eb..911eb14405 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -1502,6 +1502,7 @@ ACTOR Future runTests(Reference tr) const { // We should always ignore these. if (e.code() == error_code_used_during_commit || e.code() == error_code_transaction_too_old || e.code() == error_code_future_version || e.code() == error_code_transaction_cancelled || @@ -71,6 +71,7 @@ struct ExceptionContract { evt.error(e) .detail("Thrown", true) .detail("Expected", i->second == Possible ? "possible" : "always") + .detail("Tenant", tr->getTenant()) .backtrace(); if (augment) augment(evt); @@ -78,20 +79,21 @@ struct ExceptionContract { } TraceEvent evt(SevError, func.c_str()); - evt.error(e).detail("Thrown", true).detail("Expected", "never").backtrace(); + evt.error(e).detail("Thrown", true).detail("Expected", "never").detail("Tenant", tr->getTenant()).backtrace(); if (augment) augment(evt); throw e; } // Return true if we should have thrown, but didn't. 
- void handleNotThrown() const { + void handleNotThrown(Reference tr) const { for (auto i : expected) { if (i.second == Always) { TraceEvent evt(SevError, func.c_str()); evt.error(Error::fromUnvalidatedCode(i.first)) .detail("Thrown", false) .detail("Expected", "always") + .detail("Tenant", tr->getTenant()) .backtrace(); if (augment) augment(evt); @@ -172,6 +174,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { conflictRange = KeyRangeRef(LiteralStringRef("\xfe"), LiteralStringRef("\xfe\x00")); TraceEvent("FuzzApiCorrectnessConfiguration") .detail("Nodes", nodes) + .detail("NumTenants", numTenants) .detail("InitialKeyDensity", initialKeyDensity) .detail("AdjacentKeys", adjacentKeys) .detail("ValueSizeMin", valueSizeRange.first) @@ -198,7 +201,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { std::string description() const override { return "FuzzApiCorrectness"; } static TenantName getTenant(int num) { return TenantNameRef(format("tenant_%d", num)); } - bool hasTenant(Optional tenant) { return !tenant.present() || createdTenants.count(tenant.get()); } + bool canUseTenant(Optional tenant) { + return !tenant.present() || createdTenants.count(tenant.get()) || useSystemKeys; + } Future setup(Database const& cx) override { if (clientId == 0) { @@ -259,7 +264,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { ACTOR Future loadAndRun(FuzzApiCorrectnessWorkload* self) { state double startTime = now(); - state int nodesPerTenant = self->nodes / (self->numTenants + 1); + state int nodesPerTenant = std::max(1, self->nodes / (self->numTenants + 1)); state int keysPerBatch = std::min(1000, 1 + CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT / 2 / @@ -270,8 +275,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { for (; tenantNum < self->numTenants; ++tenantNum) { state int i = 0; for (; i < nodesPerTenant; i += keysPerBatch) { - state Reference tr = - tenantNum < 0 ? 
self->db->createTransaction() : self->tenants[i]->createTransaction(); + state Reference tr = tenantNum < 0 + ? self->db->createTransaction() + : self->tenants[tenantNum]->createTransaction(); loop { if (now() - startTime > self->testDuration) return Void(); @@ -343,7 +349,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { state std::vector> operations; state int waitLocation = 0; - int tenantNum = deterministicRandom()->randomInt(-1, self->tenants.size()); + state int tenantNum = deterministicRandom()->randomInt(-1, self->tenants.size()); if (tenantNum == -1) { tr = self->db->createTransaction(); } else { @@ -436,10 +442,10 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { try { value_type result = wait(timeoutError(BaseTest::runTest2(tr, &self), 1000)); - self.contract.handleNotThrown(); + self.contract.handleNotThrown(tr); return self.errorCheck(tr, result); } catch (Error& e) { - self.contract.handleException(e); + self.contract.handleException(e, tr); } return Void(); } @@ -456,7 +462,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { value_type result = wait(future); if (future.isError()) { - self->contract.handleException(future.getError()); + self->contract.handleException(future.getError(), tr); } else { ASSERT(future.isValid()); } @@ -696,7 +702,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { LiteralStringRef("auto_coordinators") .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))), std::make_pair(error_code_tenant_not_found, - ExceptionContract::requiredIf(!workload->hasTenant(tr->getTenant()))) + ExceptionContract::possibleIf(!workload->canUseTenant(tr->getTenant()))) }; } @@ -706,7 +712,8 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { void augmentTrace(TraceEvent& e) const override { base_type::augmentTrace(e); - e.detail("Key", printable(key)); + e.detail("Key", key); + e.detail("Size", key.size()); } }; @@ -724,7 +731,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { 
std::make_pair(error_code_client_invalid_operation, ExceptionContract::Possible), std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), std::make_pair(error_code_tenant_not_found, - ExceptionContract::requiredIf(!workload->hasTenant(tr->getTenant()))) }; + ExceptionContract::possibleIf(!workload->canUseTenant(tr->getTenant()))) }; } ThreadFuture createFuture(Reference tr) override { @@ -776,7 +783,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { std::make_pair(error_code_special_keys_api_failure, ExceptionContract::possibleIf(isSpecialKeyRange)), std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), std::make_pair(error_code_tenant_not_found, - ExceptionContract::requiredIf(!workload->hasTenant(tr->getTenant()))) + ExceptionContract::possibleIf(!workload->canUseTenant(tr->getTenant()))) }; } @@ -822,7 +829,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { std::make_pair(error_code_special_keys_api_failure, ExceptionContract::possibleIf(isSpecialKeyRange)), std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), std::make_pair(error_code_tenant_not_found, - ExceptionContract::requiredIf(!workload->hasTenant(tr->getTenant()))) + ExceptionContract::possibleIf(!workload->canUseTenant(tr->getTenant()))) }; } @@ -888,7 +895,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { autoCoordinatorSpecialKey < key2)), std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), std::make_pair(error_code_tenant_not_found, - ExceptionContract::requiredIf(!workload->hasTenant(tr->getTenant()))) + ExceptionContract::possibleIf(!workload->canUseTenant(tr->getTenant()))) }; } @@ -899,7 +906,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { void augmentTrace(TraceEvent& e) const override { base_type::augmentTrace(e); - e.detail("Key1", printable(key1)).detail("Key2", printable(key2)).detail("Limit", limit); + e.detail("Key1", key1).detail("Key2", key2).detail("Limit", limit); } }; 
@@ -942,7 +949,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { (autoCoordinatorSpecialKey < key2))), std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), std::make_pair(error_code_tenant_not_found, - ExceptionContract::requiredIf(!workload->hasTenant(tr->getTenant()))) + ExceptionContract::possibleIf(!workload->canUseTenant(tr->getTenant()))) }; } @@ -953,7 +960,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { void augmentTrace(TraceEvent& e) const override { base_type::augmentTrace(e); - e.detail("Key1", printable(key1)).detail("Key2", printable(key2)); + e.detail("Key1", key1).detail("Key2", key2); std::stringstream ss; ss << "(" << limits.rows << ", " << limits.minRows << ", " << limits.bytes << ")"; e.detail("Limits", ss.str()); @@ -969,7 +976,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { key = makeKey(); contract = { std::make_pair(error_code_client_invalid_operation, ExceptionContract::Possible), std::make_pair(error_code_tenant_not_found, - ExceptionContract::requiredIf(!workload->hasTenant(tr->getTenant()))) }; + ExceptionContract::requiredIf(!workload->canUseTenant(tr->getTenant()))) }; } ThreadFuture createFuture(Reference tr) override { @@ -978,7 +985,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { void augmentTrace(TraceEvent& e) const override { base_type::augmentTrace(e); - e.detail("Key", printable(key)); + e.detail("Key", key); } }; @@ -1001,7 +1008,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { void augmentTrace(TraceEvent& e) const override { base_type::augmentTrace(e); - e.detail("Key1", printable(key1)).detail("Key2", printable(key2)); + e.detail("Key1", key1).detail("Key2", key2); } }; @@ -1076,7 +1083,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { void augmentTrace(TraceEvent& e) const override { base_type::augmentTrace(e); - e.detail("Key", printable(key)).detail("Value", printable(value)).detail("Op", op).detail("Pos", pos); + e.detail("Key", key).detail("Value", 
value).detail("Op", op).detail("Pos", pos); } }; @@ -1115,7 +1122,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { void augmentTrace(TraceEvent& e) const override { base_type::augmentTrace(e); - e.detail("Key", printable(key)).detail("Value", printable(value)); + e.detail("Key", key).detail("Value", value); } }; @@ -1154,7 +1161,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { void augmentTrace(TraceEvent& e) const override { base_type::augmentTrace(e); - e.detail("Key1", printable(key1)).detail("Key2", printable(key2)); + e.detail("Key1", key1).detail("Key2", key2); } }; @@ -1193,7 +1200,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { void augmentTrace(TraceEvent& e) const override { base_type::augmentTrace(e); - e.detail("Key1", printable(key1)).detail("Key2", printable(key2)); + e.detail("Key1", key1).detail("Key2", key2); } }; @@ -1222,7 +1229,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { void augmentTrace(TraceEvent& e) const override { base_type::augmentTrace(e); - e.detail("Key", printable(key)); + e.detail("Key", key); } }; @@ -1246,14 +1253,14 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { std::make_pair(error_code_timed_out, ExceptionContract::Possible), std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), std::make_pair(error_code_tenant_not_found, - ExceptionContract::requiredIf(!workload->hasTenant(tr->getTenant()))) }; + ExceptionContract::possibleIf(!workload->canUseTenant(tr->getTenant()))) }; } ThreadFuture createFuture(Reference tr) override { return tr->watch(key); } void augmentTrace(TraceEvent& e) const override { base_type::augmentTrace(e); - e.detail("Key", printable(key)); + e.detail("Key", key); } }; @@ -1276,7 +1283,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { void augmentTrace(TraceEvent& e) const override { base_type::augmentTrace(e); - e.detail("Key1", printable(key1)).detail("Key2", printable(key2)); + e.detail("Key1", key1).detail("Key2", key2); } }; @@ -1361,7 
+1368,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { void augmentTrace(TraceEvent& e) const override { base_type::augmentTrace(e); - e.detail("Op", op).detail("Val", printable(val)); + e.detail("Op", op).detail("Val", val); } }; diff --git a/fdbserver/workloads/TenantManagement.actor.cpp b/fdbserver/workloads/TenantManagement.actor.cpp index 59e78dea85..d9d0b3e865 100644 --- a/fdbserver/workloads/TenantManagement.actor.cpp +++ b/fdbserver/workloads/TenantManagement.actor.cpp @@ -46,6 +46,8 @@ struct TenantManagementWorkload : TestWorkload { const Key keyName = "key"_sr; const Key tenantSubspaceKey = "tenant_subspace"_sr; const Value noTenantValue = "no_tenant"_sr; + const TenantName tenantNamePrefix = "tenant_management_workload_"_sr; + TenantName localTenantNamePrefix; int maxTenants; double testDuration; @@ -53,6 +55,8 @@ struct TenantManagementWorkload : TestWorkload { TenantManagementWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { maxTenants = getOption(options, "maxTenants"_sr, 1000); testDuration = getOption(options, "testDuration"_sr, 60.0); + + localTenantNamePrefix = format("%stenant_%d_", tenantNamePrefix.toString().c_str(), clientId); } std::string description() const override { return "TenantManagement"; } @@ -68,6 +72,7 @@ struct TenantManagementWorkload : TestWorkload { tr.setOption(FDBTransactionOptions::RAW_ACCESS); tr.set(self->keyName, self->noTenantValue); tr.set(self->tenantSubspaceKey, self->tenantSubspace); + tr.set(tenantDataPrefixKey, self->tenantSubspace); wait(tr.commit()); break; } catch (Error& e) { @@ -94,9 +99,10 @@ struct TenantManagementWorkload : TestWorkload { return Void(); } - TenantName chooseTenantName() { - TenantName tenant(format("tenant_%d_%d", clientId, deterministicRandom()->randomInt(0, maxTenants))); - if (deterministicRandom()->random01() < 0.02) { + TenantName chooseTenantName(bool allowSystemTenant) { + TenantName tenant( + format("%s%d", localTenantNamePrefix.toString().c_str(), 
deterministicRandom()->randomInt(0, maxTenants))); + if (allowSystemTenant && deterministicRandom()->random01() < 0.02) { tenant = tenant.withPrefix("\xff"_sr); } @@ -104,7 +110,7 @@ struct TenantManagementWorkload : TestWorkload { } ACTOR Future createTenant(Database cx, TenantManagementWorkload* self) { - state TenantName tenant = self->chooseTenantName(); + state TenantName tenant = self->chooseTenantName(true); state bool alreadyExists = self->createdTenants.count(tenant); try { wait(ManagementAPI::createTenant(cx.getReference(), tenant)); @@ -117,6 +123,7 @@ struct TenantManagementWorkload : TestWorkload { ASSERT(entry.get().prefix.startsWith(self->tenantSubspace)); self->maxId = entry.get().id; + self->createdTenants[tenant] = TenantState(entry.get().id, true); state bool insertData = deterministicRandom()->random01() < 0.5; if (insertData) { @@ -131,6 +138,8 @@ struct TenantManagementWorkload : TestWorkload { } } + self->createdTenants[tenant].empty = false; + tr = Transaction(cx); loop { try { @@ -145,14 +154,14 @@ struct TenantManagementWorkload : TestWorkload { } } - self->createdTenants[tenant] = TenantState(entry.get().id, insertData); + wait(self->checkTenant(cx, self, tenant, self->createdTenants[tenant])); } catch (Error& e) { if (e.code() == error_code_tenant_already_exists) { ASSERT(alreadyExists); } else if (e.code() == error_code_invalid_tenant_name) { ASSERT(tenant.startsWith("\xff"_sr)); } else { - TraceEvent(SevError, "CreateTenantFailure").detail("TenantName", tenant).error(e); + TraceEvent(SevError, "CreateTenantFailure").error(e).detail("TenantName", tenant); } } @@ -160,7 +169,7 @@ struct TenantManagementWorkload : TestWorkload { } ACTOR Future deleteTenant(Database cx, TenantManagementWorkload* self) { - state TenantName tenant = self->chooseTenantName(); + state TenantName tenant = self->chooseTenantName(true); auto itr = self->createdTenants.find(tenant); state bool alreadyExists = itr != self->createdTenants.end(); @@ -194,7 +203,7 @@ 
struct TenantManagementWorkload : TestWorkload { } else if (e.code() == error_code_tenant_not_empty) { ASSERT(!isEmpty); } else { - TraceEvent(SevError, "DeleteTenantFailure").detail("TenantName", tenant).error(e); + TraceEvent(SevError, "DeleteTenantFailure").error(e).detail("TenantName", tenant); } } @@ -208,7 +217,7 @@ struct TenantManagementWorkload : TestWorkload { state Transaction tr(cx, tenant); loop { try { - RangeResult result = wait(tr.getRange(KeyRangeRef(""_sr, "\xff"_sr), 2)); + state RangeResult result = wait(tr.getRange(KeyRangeRef(""_sr, "\xff"_sr), 2)); if (tenantState.empty) { ASSERT(result.size() == 0); } else { @@ -226,7 +235,7 @@ struct TenantManagementWorkload : TestWorkload { } ACTOR Future getTenant(Database cx, TenantManagementWorkload* self) { - state TenantName tenant = self->chooseTenantName(); + state TenantName tenant = self->chooseTenantName(true); auto itr = self->createdTenants.find(tenant); state bool alreadyExists = itr != self->createdTenants.end(); state TenantState tenantState = itr->second; @@ -240,7 +249,7 @@ struct TenantManagementWorkload : TestWorkload { if (e.code() == error_code_tenant_not_found) { ASSERT(!alreadyExists); } else { - TraceEvent(SevError, "GetTenantFailure").detail("TenantName", tenant).error(e); + TraceEvent(SevError, "GetTenantFailure").error(e).detail("TenantName", tenant); } } @@ -248,10 +257,14 @@ struct TenantManagementWorkload : TestWorkload { } ACTOR Future listTenants(Database cx, TenantManagementWorkload* self) { - state TenantName beginTenant = self->chooseTenantName(); - state TenantName endTenant = self->chooseTenantName(); + state TenantName beginTenant = self->chooseTenantName(false); + state TenantName endTenant = self->chooseTenantName(false); state int limit = std::min(CLIENT_KNOBS->TOO_MANY, deterministicRandom()->randomInt(0, self->maxTenants * 2)); + if (beginTenant > endTenant) { + std::swap(beginTenant, endTenant); + } + try { Standalone> tenants = 
wait(ManagementAPI::listTenants(cx.getReference(), beginTenant, endTenant, limit)); @@ -259,21 +272,23 @@ struct TenantManagementWorkload : TestWorkload { ASSERT(tenants.size() <= limit); int index = 0; - auto itr = self->createdTenants.begin(); + auto itr = self->createdTenants.lower_bound(beginTenant); for (; index < tenants.size(); ++itr) { ASSERT(itr != self->createdTenants.end()); ASSERT(itr->first == tenants[index++]); } - ASSERT(tenants.size() == limit || itr == self->createdTenants.end()); - if (tenants.size() == limit) { - ASSERT(itr == self->createdTenants.end() || itr->first >= endTenant); + if (!(tenants.size() == limit || itr == self->createdTenants.end())) { + for (auto tenant : self->createdTenants) { + TraceEvent("ExistingTenant").detail("Tenant", tenant.first); + } } + ASSERT(tenants.size() == limit || itr == self->createdTenants.end() || itr->first >= endTenant); } catch (Error& e) { TraceEvent(SevError, "ListTenantFailure") + .error(e) .detail("BeginTenant", beginTenant) - .detail("EndTenant", endTenant) - .error(e); + .detail("EndTenant", endTenant); } return Void(); @@ -315,14 +330,15 @@ struct TenantManagementWorkload : TestWorkload { state std::map::iterator itr = self->createdTenants.begin(); state std::vector> checkTenants; - state TenantName beginTenant = ""_sr; + state TenantName beginTenant = ""_sr.withPrefix(self->localTenantNamePrefix); + state TenantName endTenant = "\xff\xff"_sr.withPrefix(self->localTenantNamePrefix); loop { Standalone> tenants = - wait(ManagementAPI::listTenants(cx.getReference(), beginTenant, "\xff\xff"_sr, 1000)); + wait(ManagementAPI::listTenants(cx.getReference(), beginTenant, endTenant, 1000)); for (auto tenant : tenants) { - ASSERT(!tenant.startsWith("\xff"_sr)); + ASSERT(itr != self->createdTenants.end()); ASSERT(tenant == itr->first); checkTenants.push_back(self->checkTenant(cx, self, tenant, itr->second)); ++itr; From 4b521b38cb6bc0c78a2c64d5d4a829c7acbccd62 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Tue, 1 Mar 2022 08:21:43 -0800 Subject: [PATCH 123/138] Update various workloads to specify that they are expecting raw access. A couple other raw access related fixes. Disable tenant tests in backup and DR tests for now. --- fdbserver/SimulatedCluster.actor.cpp | 8 +++++ fdbserver/workloads/ChangeFeeds.actor.cpp | 2 ++ .../workloads/DataLossRecovery.actor.cpp | 5 +++ .../workloads/FuzzApiCorrectness.actor.cpp | 26 ++++++++++---- fdbserver/workloads/LockDatabase.actor.cpp | 3 ++ .../SpecialKeySpaceCorrectness.actor.cpp | 35 +++++++++++++++++++ fdbserver/workloads/WriteDuringRead.actor.cpp | 35 ++++++++++++++----- 7 files changed, 99 insertions(+), 15 deletions(-) diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 95894502fe..79610e379c 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -2295,6 +2295,14 @@ ACTOR void setupAndRun(std::string dataFolder, testConfig.storageEngineExcludeTypes.push_back(4); } + // Disable the default tenant in backup and DR tests for now. This is because backup does not currently duplicate + // the tenant map and related state. + // TODO: reenable when backup/DR supports tenants. + if (std::string_view(testFile).find("Backup") != std::string_view::npos || + std::string_view(testFile).find("Switchover") != std::string_view::npos) { + allowDefaultTenant = false; + } + // The RocksDB engine is not always built with the rest of fdbserver. Don't try to use it if it is not included // in the build. 
if (!rocksDBEnabled) { diff --git a/fdbserver/workloads/ChangeFeeds.actor.cpp b/fdbserver/workloads/ChangeFeeds.actor.cpp index edc3444340..d7227c3269 100644 --- a/fdbserver/workloads/ChangeFeeds.actor.cpp +++ b/fdbserver/workloads/ChangeFeeds.actor.cpp @@ -37,6 +37,8 @@ ACTOR Future>, Version>> readDatabase(Database cx) { state Transaction tr(cx); loop { + // Change feeds do not currently support tenant based access + tr.setOption(FDBTransactionOptions::RAW_ACCESS); state Standalone> output; state Version readVersion; try { diff --git a/fdbserver/workloads/DataLossRecovery.actor.cpp b/fdbserver/workloads/DataLossRecovery.actor.cpp index 8169b6ecf0..38e6ede9ce 100644 --- a/fdbserver/workloads/DataLossRecovery.actor.cpp +++ b/fdbserver/workloads/DataLossRecovery.actor.cpp @@ -20,6 +20,7 @@ #include #include +#include "fdbclient/FDBOptions.g.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/ManagementAPI.actor.h" #include "fdbserver/MoveKeys.actor.h" @@ -105,6 +106,7 @@ struct DataLossRecoveryWorkload : TestWorkload { loop { try { + tr.setOption(FDBTransactionOptions::RAW_ACCESS); state Optional res = wait(timeoutError(tr.get(key), 30.0)); const bool equal = !expectedValue.isError() && res == expectedValue.get(); if (!equal) { @@ -126,6 +128,7 @@ struct DataLossRecoveryWorkload : TestWorkload { state Transaction tr(cx); loop { try { + tr.setOption(FDBTransactionOptions::RAW_ACCESS); if (value.present()) { tr.set(key, value.get()); } else { @@ -193,6 +196,7 @@ struct DataLossRecoveryWorkload : TestWorkload { loop { try { + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); BinaryWriter wrMyOwner(Unversioned()); wrMyOwner << owner; tr.set(moveKeysLockOwnerKey, wrMyOwner.toValue()); @@ -228,6 +232,7 @@ struct DataLossRecoveryWorkload : TestWorkload { state Transaction validateTr(cx); loop { try { + validateTr.setOption(FDBTransactionOptions::RAW_ACCESS); Standalone> addresses = wait(validateTr.getAddressesForKey(keys.begin)); // The move function is 
not what we are testing here, crash the test if the move fails. ASSERT(addresses.size() == 1); diff --git a/fdbserver/workloads/FuzzApiCorrectness.actor.cpp b/fdbserver/workloads/FuzzApiCorrectness.actor.cpp index 39037999c5..b79844c63c 100644 --- a/fdbserver/workloads/FuzzApiCorrectness.actor.cpp +++ b/fdbserver/workloads/FuzzApiCorrectness.actor.cpp @@ -262,6 +262,25 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { // m.push_back( retries.getMetric() ); } + // Prevent a write only transaction whose commit was previously cancelled from being reordered after this + // transaction + ACTOR Future writeBarrier(Reference db) { + state Reference tr = db->createTransaction(); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + // Write-only transactions have a self-conflict in the system keys + tr->addWriteConflictRange(allKeys); + tr->clear(normalKeys); + wait(unsafeThreadFutureToFuture(tr->commit())); + return Void(); + } catch (Error& e) { + wait(unsafeThreadFutureToFuture(tr->onError(e))); + } + } + } + ACTOR Future loadAndRun(FuzzApiCorrectnessWorkload* self) { state double startTime = now(); state int nodesPerTenant = std::max(1, self->nodes / (self->numTenants + 1)); @@ -274,6 +293,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { state int tenantNum = -1; for (; tenantNum < self->numTenants; ++tenantNum) { state int i = 0; + wait(self->writeBarrier(self->db)); for (; i < nodesPerTenant; i += keysPerBatch) { state Reference tr = tenantNum < 0 ? 
self->db->createTransaction() @@ -283,12 +303,6 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { return Void(); try { if (i == 0) { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - - // To prevent a write only transaction whose commit was previously - // cancelled from being reordered after this transaction - tr->addWriteConflictRange(allKeys); - tr->clear(normalKeys); } if (self->useSystemKeys) diff --git a/fdbserver/workloads/LockDatabase.actor.cpp b/fdbserver/workloads/LockDatabase.actor.cpp index 2b52932608..a5ba9f07fe 100644 --- a/fdbserver/workloads/LockDatabase.actor.cpp +++ b/fdbserver/workloads/LockDatabase.actor.cpp @@ -18,6 +18,7 @@ * limitations under the License. */ +#include "fdbclient/FDBOptions.g.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbserver/workloads/workloads.actor.h" @@ -54,6 +55,7 @@ struct LockDatabaseWorkload : TestWorkload { state Transaction tr(cx); loop { try { + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); wait(lockDatabase(&tr, lockID)); state RangeResult data = wait(tr.getRange(normalKeys, 50000)); ASSERT(!data.more); @@ -70,6 +72,7 @@ struct LockDatabaseWorkload : TestWorkload { loop { try { tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); Optional val = wait(tr.get(databaseLockedKey)); if (!val.present()) return Void(); diff --git a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp index 98c75c26aa..2c142569d2 100644 --- a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp +++ b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp @@ -70,6 +70,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { Future _setup(Database cx, SpecialKeySpaceCorrectnessWorkload* self) { cx->specialKeySpace = std::make_unique(); self->ryw = makeReference(cx); + self->ryw->setOption(FDBTransactionOptions::RAW_ACCESS); 
self->ryw->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_RELAXED); self->ryw->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); self->ryw->setVersion(100); @@ -291,6 +292,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { state Reference tx = makeReference(cx); // begin key outside module range try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); wait(success(tx->getRange( KeyRangeRef(LiteralStringRef("\xff\xff/transactio"), LiteralStringRef("\xff\xff/transaction0")), CLIENT_KNOBS->TOO_MANY))); @@ -303,6 +305,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { } // end key outside module range try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); wait(success(tx->getRange( KeyRangeRef(LiteralStringRef("\xff\xff/transaction/"), LiteralStringRef("\xff\xff/transaction1")), CLIENT_KNOBS->TOO_MANY))); @@ -315,6 +318,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { } // both begin and end outside module range try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); wait(success(tx->getRange( KeyRangeRef(LiteralStringRef("\xff\xff/transaction"), LiteralStringRef("\xff\xff/transaction1")), CLIENT_KNOBS->TOO_MANY))); @@ -327,6 +331,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { } // legal range read using the module range try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); wait(success(tx->getRange( KeyRangeRef(LiteralStringRef("\xff\xff/transaction/"), LiteralStringRef("\xff\xff/transaction0")), CLIENT_KNOBS->TOO_MANY))); @@ -337,6 +342,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { } // cross module read with option turned on try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_RELAXED); const KeyRef startKey = LiteralStringRef("\xff\xff/transactio"); const KeyRef endKey = LiteralStringRef("\xff\xff/transaction1"); @@ -350,6 +356,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { } // end 
keySelector inside module range, *** a tricky corner case *** try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->addReadConflictRange(singleKeyRange(LiteralStringRef("testKey"))); KeySelector begin = KeySelectorRef(readConflictRangeKeysRange.begin, false, 1); KeySelector end = KeySelectorRef(LiteralStringRef("\xff\xff/transaction0"), false, 0); @@ -361,6 +368,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { } // No module found error case with keys try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); wait(success(tx->getRange(KeyRangeRef(LiteralStringRef("\xff\xff/A_no_module_related_prefix"), LiteralStringRef("\xff\xff/I_am_also_not_in_any_module")), CLIENT_KNOBS->TOO_MANY))); @@ -373,6 +381,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { } // No module found error with KeySelectors, *** a tricky corner case *** try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); KeySelector begin = KeySelectorRef(LiteralStringRef("\xff\xff/zzz_i_am_not_a_module"), false, 1); KeySelector end = KeySelectorRef(LiteralStringRef("\xff\xff/zzz_to_be_the_final_one"), false, 2); wait(success(tx->getRange(begin, end, CLIENT_KNOBS->TOO_MANY))); @@ -385,6 +394,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { } // begin and end keySelectors clamp up to the boundary of the module try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); const KeyRef key = LiteralStringRef("\xff\xff/cluster_file_path"); KeySelector begin = KeySelectorRef(key, false, 0); KeySelector end = KeySelectorRef(keyAfter(key), false, 2); @@ -395,6 +405,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { throw; } try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->addReadConflictRange(singleKeyRange(LiteralStringRef("readKey"))); const KeyRef key = LiteralStringRef("\xff\xff/transaction/a_to_be_the_first"); KeySelector begin = KeySelectorRef(key, false, 0); @@ -408,6 +419,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload 
{ // Errors introduced by SpecialKeyRangeRWImpl // Writes are disabled by default try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->set(LiteralStringRef("\xff\xff/I_am_not_a_range_can_be_written"), ValueRef()); } catch (Error& e) { if (e.code() == error_code_actor_cancelled) @@ -417,6 +429,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { } // The special key is not in a range that can be called with set try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); tx->set(LiteralStringRef("\xff\xff/I_am_not_a_range_can_be_written"), ValueRef()); ASSERT(false); @@ -428,6 +441,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { } // A clear cross two ranges are forbidden try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); tx->clear(KeyRangeRef(SpecialKeySpace::getManagementApiCommandRange("exclude").begin, SpecialKeySpace::getManagementApiCommandRange("failed").end)); @@ -440,6 +454,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { } // base key of the end key selector not in (\xff\xff, \xff\xff\xff), throw key_outside_legal_range() try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); const KeySelector startKeySelector = KeySelectorRef(LiteralStringRef("\xff\xff/test"), true, -200); const KeySelector endKeySelector = KeySelectorRef(LiteralStringRef("test"), true, -10); RangeResult result = @@ -453,6 +468,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { } // test case when registered range is the same as the underlying module try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); state RangeResult result = wait(tx->getRange(KeyRangeRef(LiteralStringRef("\xff\xff/worker_interfaces/"), LiteralStringRef("\xff\xff/worker_interfaces0")), CLIENT_KNOBS->TOO_MANY)); @@ -480,6 +496,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { state Reference 
tx = makeReference(cx); state Reference referenceTx = makeReference(cx); state bool ryw = deterministicRandom()->coinflip(); + tx->setOption(FDBTransactionOptions::RAW_ACCESS); if (!ryw) { tx->setOption(FDBTransactionOptions::READ_YOUR_WRITES_DISABLE); } @@ -630,6 +647,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { state Reference tx = makeReference(cx); // test ordered option keys { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); for (const std::string& option : SpecialKeySpace::getManagementApiOptionsSet()) { tx->set( @@ -648,6 +666,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { } // "exclude" error message shema check try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); tx->set(LiteralStringRef("Invalid_Network_Address") .withPrefix(SpecialKeySpace::getManagementApiCommandPrefix("exclude")), @@ -676,6 +695,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { // "setclass" { try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); // test getRange state RangeResult result = wait(tx->getRange( @@ -747,6 +767,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { { try { // test getRange + tx->setOption(FDBTransactionOptions::RAW_ACCESS); state RangeResult class_source_result = wait(tx->getRange( KeyRangeRef(LiteralStringRef("process/class_source/"), LiteralStringRef("process/class_source0")) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::CONFIGURATION).begin), @@ -807,6 +828,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { // maske sure we lock the database loop { try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); // lock the database UID uid = 
deterministicRandom()->randomUniqueID(); @@ -853,6 +875,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { loop { try { tx->reset(); + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); // unlock the database tx->clear(SpecialKeySpace::getManagementApiCommandPrefix("lock")); @@ -904,6 +927,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { { loop { try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); tx->clear(SpecialKeySpace::getManagementApiCommandPrefix("consistencycheck")); wait(tx->commit()); @@ -1001,6 +1025,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { loop { try { std::string new_processes_key(new_coordinator_process); + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); for (const auto& address : old_coordinators_processes) { new_processes_key += "," + address; @@ -1071,6 +1096,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { loop { try { std::string new_processes_key; + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); for (const auto& address : old_coordinators_processes) { new_processes_key += new_processes_key.size() ? 
"," : ""; @@ -1127,6 +1153,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { TraceEvent(SevDebug, "AdvanceVersionSuccess").detail("Version", v3); break; } + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); // force the cluster to recover at v2 tx->set(SpecialKeySpace::getManagementApiCommandPrefix("advanceversion"), std::to_string(v2)); @@ -1192,6 +1219,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { // update the sample rate and size limit loop { try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); tx->set(LiteralStringRef("client_txn_sample_rate") .withPrefix(SpecialKeySpace::getManagementApiCommandPrefix("profile")), @@ -1225,6 +1253,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { // Change back to default loop { try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); tx->set(LiteralStringRef("client_txn_sample_rate") .withPrefix(SpecialKeySpace::getManagementApiCommandPrefix("profile")), @@ -1242,6 +1271,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { // Test invalid values loop { try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); tx->set((deterministicRandom()->coinflip() ? 
LiteralStringRef("client_txn_sample_rate") : LiteralStringRef("client_txn_size_limit")) @@ -1297,6 +1327,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { // Make sure setting more than one zone as maintenance will fail loop { try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); tx->set(Key(deterministicRandom()->randomAlphaNumeric(8)) .withPrefix(SpecialKeySpace::getManagementApiCommandPrefix("maintenance")), @@ -1333,6 +1364,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { state int ignoreSSFailuresRetry = 0; loop { try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); tx->set(ignoreSSFailuresZoneString.withPrefix( SpecialKeySpace::getManagementApiCommandPrefix("maintenance")), @@ -1371,6 +1403,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { // set dd mode to 0 and disable DD for rebalance loop { try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); KeyRef ddPrefix = SpecialKeySpace::getManagementApiCommandPrefix("datadistribution"); tx->set(LiteralStringRef("mode").withPrefix(ddPrefix), LiteralStringRef("0")); @@ -1410,6 +1443,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { // then, clear all changes loop { try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); tx->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); tx->clear(ignoreSSFailuresZoneString.withPrefix( SpecialKeySpace::getManagementApiCommandPrefix("maintenance"))); @@ -1452,6 +1486,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { try { Version readVersion = wait(tr1->getReadVersion()); tr2->setVersion(readVersion); + tr1->setOption(FDBTransactionOptions::RAW_ACCESS); tr1->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); 
tr2->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); KeyRef ddPrefix = SpecialKeySpace::getManagementApiCommandPrefix("datadistribution"); diff --git a/fdbserver/workloads/WriteDuringRead.actor.cpp b/fdbserver/workloads/WriteDuringRead.actor.cpp index 151b66a9c6..66f65b38cd 100644 --- a/fdbserver/workloads/WriteDuringRead.actor.cpp +++ b/fdbserver/workloads/WriteDuringRead.actor.cpp @@ -583,9 +583,29 @@ struct WriteDuringReadWorkload : TestWorkload { std::string(deterministicRandom()->randomInt(valueSizeRange.first, valueSizeRange.second + 1), 'x')); } + // Prevent a write only transaction whose commit was previously cancelled from being reordered after this + // transaction + ACTOR Future writeBarrier(Database cx) { + state Transaction tr(cx); + loop { + try { + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + // Write-only transactions have a self-conflict in the system keys + tr.addWriteConflictRange(allKeys); + wait(tr.commit()); + return Void(); + } catch (Error& e) { + wait(tr.onError(e)); + } + } + } + ACTOR Future loadAndRun(Database cx, WriteDuringReadWorkload* self) { state double startTime = now(); loop { + wait(self->writeBarrier(cx)); + state int i = 0; state int keysPerBatch = std::min(1000, @@ -595,19 +615,16 @@ struct WriteDuringReadWorkload : TestWorkload { for (; i < self->nodes; i += keysPerBatch) { state Transaction tr(cx); loop { - if (now() - startTime > self->testDuration) - return Void(); try { - if (i == 0) { - tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr.addWriteConflictRange( - allKeys); // To prevent a write only transaction whose commit was previously cancelled - // from being reordered after this transaction - tr.clear(normalKeys); - } + if (now() - startTime > self->testDuration) + return Void(); if (self->useSystemKeys) tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + if (i == 0) { + tr.clear(normalKeys); + } + int end = std::min(self->nodes, i + keysPerBatch); 
tr.clear(KeyRangeRef(self->getKeyForIndex(i), self->getKeyForIndex(end))); self->memoryDatabase.erase(self->memoryDatabase.lower_bound(self->getKeyForIndex(i)), From 81e8c7c36218f1d8afa42d2f5e6086c6e2bc8d31 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 2 Mar 2022 10:10:37 -0800 Subject: [PATCH 124/138] Various test fixes to work with tenants. --- fdbserver/SimulatedCluster.actor.cpp | 11 +++++++---- fdbserver/workloads/ConflictRange.actor.cpp | 6 +++--- fdbserver/workloads/MachineAttrition.actor.cpp | 2 ++ fdbserver/workloads/SelectorCorrectness.actor.cpp | 2 -- fdbserver/workloads/VersionStamp.actor.cpp | 12 +++++++++--- tests/slow/WriteDuringReadAtomicRestore.toml | 1 + 6 files changed, 22 insertions(+), 12 deletions(-) diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 79610e379c..549dcb245c 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -300,6 +300,8 @@ public: stderrSeverity, machineCount, processesPerMachine, coordinators; Optional config; + bool allowDefaultTenant = true; + ConfigDBType getConfigDBType() const { return configDBType; } bool tomlKeyPresent(const toml::value& data, std::string key) { @@ -350,7 +352,8 @@ public: .add("processesPerMachine", &processesPerMachine) .add("coordinators", &coordinators) .add("configDB", &configDBType) - .add("extraMachineCountDC", &extraMachineCountDC); + .add("extraMachineCountDC", &extraMachineCountDC) + .add("allowDefaultTenant", &allowDefaultTenant); try { auto file = toml::parse(testFile); if (file.contains("configuration") && toml::find(file, "configuration").is_table()) { @@ -2272,11 +2275,12 @@ ACTOR void setupAndRun(std::string dataFolder, state Standalone startingConfiguration; state int testerCount = 1; state TestConfig testConfig; - state bool allowDefaultTenant = true; testConfig.readFromConfig(testFile); g_simulator.hasDiffProtocolProcess = testConfig.startIncompatibleProcess; g_simulator.setDiffProtocol 
= false; + state bool allowDefaultTenant = testConfig.allowDefaultTenant; + // The RocksDB storage engine does not support the restarting tests because you cannot consistently get a clean // snapshot of the storage engine without a snapshotting file system. // https://github.com/apple/foundationdb/issues/5155 @@ -2298,8 +2302,7 @@ ACTOR void setupAndRun(std::string dataFolder, // Disable the default tenant in backup and DR tests for now. This is because backup does not currently duplicate // the tenant map and related state. // TODO: reenable when backup/DR supports tenants. - if (std::string_view(testFile).find("Backup") != std::string_view::npos || - std::string_view(testFile).find("Switchover") != std::string_view::npos) { + if (std::string_view(testFile).find("Backup") != std::string_view::npos || testConfig.extraDB != 0) { allowDefaultTenant = false; } diff --git a/fdbserver/workloads/ConflictRange.actor.cpp b/fdbserver/workloads/ConflictRange.actor.cpp index 29ce033743..8d5302bbeb 100644 --- a/fdbserver/workloads/ConflictRange.actor.cpp +++ b/fdbserver/workloads/ConflictRange.actor.cpp @@ -177,7 +177,6 @@ struct ConflictRangeWorkload : TestWorkload { if (self->testReadYourWrites) { trRYOW.setVersion(readVersion); - trRYOW.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); } else tr3.setVersion(readVersion); @@ -262,7 +261,8 @@ struct ConflictRangeWorkload : TestWorkload { throw not_committed(); } - if (originalResults[originalResults.size() - 1].key >= LiteralStringRef("\xff")) { + if (originalResults[originalResults.size() - 1].key >= LiteralStringRef("\xff") || + originalResults.readThroughEnd) { // Results go into server keyspace, so if a key selector does not fully resolve offset, a // change won't effect results throw not_committed(); @@ -316,7 +316,7 @@ struct ConflictRangeWorkload : TestWorkload { allKeyEntries += printable(res[i].key) + " "; } - TraceEvent("ConflictRangeDump").detail("Keys", allKeyEntries); + 
TraceEvent("ConflictRangeDump").setMaxFieldLength(10000).detail("Keys", allKeyEntries); } throw not_committed(); } else { diff --git a/fdbserver/workloads/MachineAttrition.actor.cpp b/fdbserver/workloads/MachineAttrition.actor.cpp index 151ad22f1f..9411eb33eb 100644 --- a/fdbserver/workloads/MachineAttrition.actor.cpp +++ b/fdbserver/workloads/MachineAttrition.actor.cpp @@ -18,6 +18,7 @@ * limitations under the License. */ +#include "fdbclient/FDBOptions.g.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/CoordinationInterface.h" #include "fdbserver/TesterInterface.actor.h" @@ -48,6 +49,7 @@ ACTOR Future ignoreSSFailuresForDuration(Database cx, double duration) { loop { try { tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr.clear(healthyZoneKey); wait(tr.commit()); TraceEvent("IgnoreSSFailureComplete").log(); diff --git a/fdbserver/workloads/SelectorCorrectness.actor.cpp b/fdbserver/workloads/SelectorCorrectness.actor.cpp index 69a639ab3a..c6e0a5c719 100644 --- a/fdbserver/workloads/SelectorCorrectness.actor.cpp +++ b/fdbserver/workloads/SelectorCorrectness.actor.cpp @@ -118,8 +118,6 @@ struct SelectorCorrectnessWorkload : TestWorkload { state Transaction tr(cx); state ReadYourWritesTransaction trRYOW(cx); - trRYOW.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); - if (self->testReadYourWrites) { myValue = StringRef(format("%010d", deterministicRandom()->randomInt(0, 10000000))); for (int i = 2; i < self->maxKeySpace; i += 4) diff --git a/fdbserver/workloads/VersionStamp.actor.cpp b/fdbserver/workloads/VersionStamp.actor.cpp index a5055d2e4d..69dc14a865 100644 --- a/fdbserver/workloads/VersionStamp.actor.cpp +++ b/fdbserver/workloads/VersionStamp.actor.cpp @@ -42,6 +42,7 @@ struct VersionStampWorkload : TestWorkload { std::map>>> versionStampKey_commit; int apiVersion; bool soleOwnerOfMetadataVersionKey; + bool allowMetadataVersionKey; VersionStampWorkload(WorkloadContext const& wcx) : 
TestWorkload(wcx) { testDuration = getOption(options, LiteralStringRef("testDuration"), 60.0); @@ -74,6 +75,12 @@ struct VersionStampWorkload : TestWorkload { apiVersion = Database::API_VERSION_LATEST; } TraceEvent("VersionStampApiVersion").detail("ApiVersion", apiVersion); + + allowMetadataVersionKey = apiVersion >= 610 || apiVersion == Database::API_VERSION_LATEST; + + // TODO: change this once metadata versions are supported for tenants + allowMetadataVersionKey = allowMetadataVersionKey && !cx->defaultTenant.present(); + cx->apiVersion = apiVersion; if (clientId == 0) return _start(cx, this, 1 / transactionsPerSecond); @@ -81,7 +88,7 @@ struct VersionStampWorkload : TestWorkload { } Key keyForIndex(uint64_t index) { - if ((apiVersion >= 610 || apiVersion == Database::API_VERSION_LATEST) && index == 0) { + if (allowMetadataVersionKey && index == 0) { return metadataVersionKey; } @@ -191,8 +198,7 @@ struct VersionStampWorkload : TestWorkload { RangeResult result_ = wait(tr.getRange( KeyRangeRef(self->vsValuePrefix, endOfRange(self->vsValuePrefix)), self->nodeCount + 1)); result = result_; - if ((self->apiVersion >= 610 || self->apiVersion == Database::API_VERSION_LATEST) && - self->key_commit.count(metadataVersionKey)) { + if (self->allowMetadataVersionKey && self->key_commit.count(metadataVersionKey)) { Optional mVal = wait(tr.get(metadataVersionKey)); if (mVal.present()) { result.push_back_deep(result.arena(), KeyValueRef(metadataVersionKey, mVal.get())); diff --git a/tests/slow/WriteDuringReadAtomicRestore.toml b/tests/slow/WriteDuringReadAtomicRestore.toml index a148f0a1c9..79afd3911d 100644 --- a/tests/slow/WriteDuringReadAtomicRestore.toml +++ b/tests/slow/WriteDuringReadAtomicRestore.toml @@ -1,5 +1,6 @@ [configuration] StderrSeverity = 30 +allowDefaultTenant = false [[test]] testTitle = 'WriteDuringReadTest' From d0dc756c6de97b7d2933fcd1e55c41dca5312cf9 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Thu, 3 Mar 2022 14:43:35 -0800 Subject: [PATCH 125/138] Allow disabling tenant mode in simulation. Fix a few bugs. --- fdbclient/NativeAPI.actor.cpp | 6 +++++ fdbserver/SimulatedCluster.actor.cpp | 26 +++++++++++++-------- fdbserver/workloads/ConflictRange.actor.cpp | 9 +++++++ tests/fast/FuzzApiCorrectness.toml | 1 + tests/fast/FuzzApiCorrectnessClean.toml | 1 + tests/slow/SwizzledTenantManagement.toml | 3 +++ tests/slow/TenantManagement.toml | 3 +++ 7 files changed, 39 insertions(+), 10 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 630d444ee5..f000111557 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -3860,6 +3860,12 @@ Future getRangeFallback(Reference trState, if (b == allKeys.begin && ((reverse && !r.more) || !reverse)) r.readToBegin = true; + + // TODO: this currently causes us to have a conflict range that is too large if our end key resolves to the + // key after the last key in the database. In that case, we don't need a conflict between the last key and + // the end of the database. + // + // If fixed, the ConflictRange test can be updated to stop checking for this condition. 
if (e == allKeys.end && ((!reverse && !r.more) || reverse)) r.readThroughEnd = true; diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 549dcb245c..0f23955fba 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -301,6 +301,7 @@ public: Optional config; bool allowDefaultTenant = true; + bool allowDisablingTenants = true; ConfigDBType getConfigDBType() const { return configDBType; } @@ -353,7 +354,8 @@ public: .add("coordinators", &coordinators) .add("configDB", &configDBType) .add("extraMachineCountDC", &extraMachineCountDC) - .add("allowDefaultTenant", &allowDefaultTenant); + .add("allowDefaultTenant", &allowDefaultTenant) + .add("allowDisablingTenants", &allowDisablingTenants); try { auto file = toml::parse(testFile); if (file.contains("configuration") && toml::find(file, "configuration").is_table()) { @@ -1790,12 +1792,15 @@ void setupSimulatedSystem(std::vector>* systemActors, std::string whitelistBinPaths, TestConfig testConfig, ProtocolVersion protocolVersion, - bool tenantModeRequired) { + TenantMode tenantMode) { // SOMEDAY: this does not test multi-interface configurations SimulationConfig simconfig(testConfig); if (testConfig.logAntiQuorum != -1) { simconfig.db.tLogWriteAntiQuorum = testConfig.logAntiQuorum; } + + simconfig.db.tenantMode = tenantMode; + StatusObject startingConfigJSON = simconfig.db.toJSON(true); std::string startingConfigString = "new"; if (testConfig.configureLocked) { @@ -1843,10 +1848,6 @@ void setupSimulatedSystem(std::vector>* systemActors, startingConfigString += " " + g_simulator.originalRegions; } - if (tenantModeRequired) { - startingConfigString += " tenant_mode=required"; - } - g_simulator.storagePolicy = simconfig.db.storagePolicy; g_simulator.tLogPolicy = simconfig.db.tLogPolicy; g_simulator.tLogWriteAntiQuorum = simconfig.db.tLogWriteAntiQuorum; @@ -2280,6 +2281,7 @@ ACTOR void setupAndRun(std::string dataFolder, 
g_simulator.setDiffProtocol = false; state bool allowDefaultTenant = testConfig.allowDefaultTenant; + state bool allowDisablingTenants = testConfig.allowDisablingTenants; // The RocksDB storage engine does not support the restarting tests because you cannot consistently get a clean // snapshot of the storage engine without a snapshotting file system. @@ -2341,17 +2343,21 @@ ACTOR void setupAndRun(std::string dataFolder, TEST(true); // Simulation start state Optional defaultTenant; - state bool requireTenants = false; + state TenantMode tenantMode = TenantMode::DISABLED; if (allowDefaultTenant && deterministicRandom()->random01() < 1.0) { defaultTenant = "SimulatedDefaultTenant"_sr; if (deterministicRandom()->random01() < 0.9) { - requireTenants = true; + tenantMode = TenantMode::REQUIRED; + } else { + tenantMode = TenantMode::OPTIONAL; } + } else if (!allowDisablingTenants || deterministicRandom()->random01() < 0.5) { + tenantMode = TenantMode::OPTIONAL; } TraceEvent("SimulatedClusterTenantMode") .detail("UsingTenant", defaultTenant) - .detail("TenantRequired", requireTenants); + .detail("TenantRequired", tenantMode.toString()); try { // systemActors.push_back( startSystemMonitor(dataFolder) ); @@ -2379,7 +2385,7 @@ ACTOR void setupAndRun(std::string dataFolder, whitelistBinPaths, testConfig, protocolVersion, - requireTenants); + tenantMode); wait(delay(1.0)); // FIXME: WHY!!! //wait for machines to boot } std::string clusterFileDir = joinPath(dataFolder, deterministicRandom()->randomUniqueID().toString()); diff --git a/fdbserver/workloads/ConflictRange.actor.cpp b/fdbserver/workloads/ConflictRange.actor.cpp index 8d5302bbeb..c5b0b1938d 100644 --- a/fdbserver/workloads/ConflictRange.actor.cpp +++ b/fdbserver/workloads/ConflictRange.actor.cpp @@ -212,6 +212,7 @@ struct ConflictRangeWorkload : TestWorkload { wait(tr2.commit()); state bool foundConflict = false; + state bool readToEnd = false; try { // Do the generated getRange in the other transaction and commit. 
if (self->testReadYourWrites) { @@ -220,6 +221,7 @@ struct ConflictRangeWorkload : TestWorkload { RangeResult res = wait(trRYOW.getRange(KeySelectorRef(StringRef(myKeyA), onEqualA, offsetA), KeySelectorRef(StringRef(myKeyB), onEqualB, offsetB), randomLimit)); + readToEnd = res.readThroughEnd; wait(trRYOW.commit()); } else { tr3.clear(StringRef(format("%010d", self->maxKeySpace + 1))); @@ -268,6 +270,13 @@ struct ConflictRangeWorkload : TestWorkload { throw not_committed(); } + // GetRangeFallback has a conflict range that is too large if the end selector resolves to the + // key after the last key. In that case, we may get a spurious conflict. + // This check can be removed if GetRangeFallback is fixed. + if (readToEnd) { + throw not_committed(); + } + if ((originalResults[0].key == firstElement || originalResults[0].key == StringRef(format("%010d", *(insertedSet.begin())))) && offsetA < 0) { diff --git a/tests/fast/FuzzApiCorrectness.toml b/tests/fast/FuzzApiCorrectness.toml index 20d4e215b5..51e884c090 100644 --- a/tests/fast/FuzzApiCorrectness.toml +++ b/tests/fast/FuzzApiCorrectness.toml @@ -1,5 +1,6 @@ [configuration] StderrSeverity = 30 +allowDisablingTenants = false [[test]] testTitle = 'FuzzApiCorrectness' diff --git a/tests/fast/FuzzApiCorrectnessClean.toml b/tests/fast/FuzzApiCorrectnessClean.toml index 7165deda42..85aee61d12 100644 --- a/tests/fast/FuzzApiCorrectnessClean.toml +++ b/tests/fast/FuzzApiCorrectnessClean.toml @@ -1,5 +1,6 @@ [configuration] StderrSeverity = 30 +allowDisablingTenants = false [[test]] testTitle = 'FuzzApiCorrectness' diff --git a/tests/slow/SwizzledTenantManagement.toml b/tests/slow/SwizzledTenantManagement.toml index 7c06d21208..0c7c8a6b69 100644 --- a/tests/slow/SwizzledTenantManagement.toml +++ b/tests/slow/SwizzledTenantManagement.toml @@ -1,3 +1,6 @@ +[[configuration]] +allowDisablingTenants = false + [[test]] testTitle = 'TenantManagementTest' clearAfterTest = true diff --git a/tests/slow/TenantManagement.toml 
b/tests/slow/TenantManagement.toml index 9dce2e2fba..9bdef4f7f6 100644 --- a/tests/slow/TenantManagement.toml +++ b/tests/slow/TenantManagement.toml @@ -1,3 +1,6 @@ +[[configuration]] +allowDisablingTenants = false + [[test]] testTitle = 'TenantManagementTest' clearAfterTest = true From c81c68af4f10b232d15c70079eb0eb5b90fba0fd Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 4 Mar 2022 12:12:24 -0800 Subject: [PATCH 126/138] Add unknown_tenant error to FuzzApiCorrectness --- fdbserver/workloads/FuzzApiCorrectness.actor.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdbserver/workloads/FuzzApiCorrectness.actor.cpp b/fdbserver/workloads/FuzzApiCorrectness.actor.cpp index b79844c63c..de74351395 100644 --- a/fdbserver/workloads/FuzzApiCorrectness.actor.cpp +++ b/fdbserver/workloads/FuzzApiCorrectness.actor.cpp @@ -422,7 +422,8 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { wait(timeoutError(unsafeThreadFutureToFuture(tr->commit()), 30)); } catch (Error& e) { if (e.code() == error_code_client_invalid_operation || - e.code() == error_code_transaction_too_large) { + e.code() == error_code_transaction_too_large || + e.code() == error_code_unknown_tenant) { throw not_committed(); } } From 0a2135039ca133ee360be79eb4f17a0f5a5ccbbb Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Sun, 6 Mar 2022 21:33:58 -0800 Subject: [PATCH 127/138] Update the tenant management workload to test special keys and transaction-based management functions --- .../workloads/TenantManagement.actor.cpp | 444 +++++++++++++----- 1 file changed, 330 insertions(+), 114 deletions(-) diff --git a/fdbserver/workloads/TenantManagement.actor.cpp b/fdbserver/workloads/TenantManagement.actor.cpp index d9d0b3e865..2cf4f48a0c 100644 --- a/fdbserver/workloads/TenantManagement.actor.cpp +++ b/fdbserver/workloads/TenantManagement.actor.cpp @@ -49,11 +49,27 @@ struct TenantManagementWorkload : TestWorkload { const TenantName tenantNamePrefix = "tenant_management_workload_"_sr; TenantName localTenantNamePrefix; + const Key specialKeysTenantMapPrefix = TenantMapRangeImpl::submoduleRange.begin.withPrefix( + SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin); + int maxTenants; double testDuration; + enum class OperationType { SPECIAL_KEYS, MANAGEMENT_DATABASE, MANAGEMENT_TRANSACTION }; + + static OperationType randomOperationType() { + int randomNum = deterministicRandom()->randomInt(0, 3); + if (randomNum == 0) { + return OperationType::SPECIAL_KEYS; + } else if (randomNum == 1) { + return OperationType::MANAGEMENT_DATABASE; + } else { + return OperationType::MANAGEMENT_TRANSACTION; + } + } + TenantManagementWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { - maxTenants = getOption(options, "maxTenants"_sr, 1000); + maxTenants = std::min(1e8 - 1, getOption(options, "maxTenants"_sr, 1000)); testDuration = getOption(options, "testDuration"_sr, 60.0); localTenantNamePrefix = format("%stenant_%d_", tenantNamePrefix.toString().c_str(), clientId); @@ -100,8 +116,8 @@ struct TenantManagementWorkload : TestWorkload { } TenantName chooseTenantName(bool allowSystemTenant) { - TenantName tenant( - format("%s%d", localTenantNamePrefix.toString().c_str(), deterministicRandom()->randomInt(0, maxTenants))); + TenantName tenant(format( + "%s%08d", 
localTenantNamePrefix.toString().c_str(), deterministicRandom()->randomInt(0, maxTenants))); if (allowSystemTenant && deterministicRandom()->random01() < 0.02) { tenant = tenant.withPrefix("\xff"_sr); } @@ -112,102 +128,218 @@ struct TenantManagementWorkload : TestWorkload { ACTOR Future createTenant(Database cx, TenantManagementWorkload* self) { state TenantName tenant = self->chooseTenantName(true); state bool alreadyExists = self->createdTenants.count(tenant); - try { - wait(ManagementAPI::createTenant(cx.getReference(), tenant)); - ASSERT(!alreadyExists); - ASSERT(!tenant.startsWith("\xff"_sr)); + state OperationType operationType = TenantManagementWorkload::randomOperationType(); + state Reference tr = makeReference(cx); - state Optional entry = wait(ManagementAPI::tryGetTenant(cx.getReference(), tenant)); - ASSERT(entry.present()); - ASSERT(entry.get().id > self->maxId); - ASSERT(entry.get().prefix.startsWith(self->tenantSubspace)); + loop { + try { + if (operationType == OperationType::SPECIAL_KEYS) { + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + Key key = self->specialKeysTenantMapPrefix.withSuffix(tenant); + tr->set(key, ""_sr); + wait(tr->commit()); + } else if (operationType == OperationType::MANAGEMENT_DATABASE) { + wait(ManagementAPI::createTenant(cx.getReference(), tenant)); + } else { + Optional _ = wait(ManagementAPI::createTenantTransaction(tr, tenant)); + wait(tr->commit()); + } - self->maxId = entry.get().id; - self->createdTenants[tenant] = TenantState(entry.get().id, true); + if (operationType != OperationType::MANAGEMENT_DATABASE && alreadyExists) { + return Void(); + } - state bool insertData = deterministicRandom()->random01() < 0.5; - if (insertData) { - state Transaction tr(cx, tenant); - loop { - try { - tr.set(self->keyName, tenant); - wait(tr.commit()); - break; - } catch (Error& e) { - wait(tr.onError(e)); + ASSERT(!alreadyExists); + ASSERT(!tenant.startsWith("\xff"_sr)); + + state Optional entry = 
wait(ManagementAPI::tryGetTenant(cx.getReference(), tenant)); + ASSERT(entry.present()); + ASSERT(entry.get().id > self->maxId); + ASSERT(entry.get().prefix.startsWith(self->tenantSubspace)); + + self->maxId = entry.get().id; + self->createdTenants[tenant] = TenantState(entry.get().id, true); + + state bool insertData = deterministicRandom()->random01() < 0.5; + if (insertData) { + state Transaction insertTr(cx, tenant); + loop { + try { + insertTr.set(self->keyName, tenant); + wait(insertTr.commit()); + break; + } catch (Error& e) { + wait(insertTr.onError(e)); + } + } + + self->createdTenants[tenant].empty = false; + + state Transaction checkTr(cx); + loop { + try { + checkTr.setOption(FDBTransactionOptions::RAW_ACCESS); + Optional val = wait(checkTr.get(self->keyName.withPrefix(entry.get().prefix))); + ASSERT(val.present()); + ASSERT(val.get() == tenant); + break; + } catch (Error& e) { + wait(checkTr.onError(e)); + } } } - self->createdTenants[tenant].empty = false; - - tr = Transaction(cx); - loop { + wait(self->checkTenant(cx, self, tenant, self->createdTenants[tenant])); + return Void(); + } catch (Error& e) { + if (e.code() == error_code_invalid_tenant_name) { + ASSERT(tenant.startsWith("\xff"_sr)); + return Void(); + } else if (operationType == OperationType::MANAGEMENT_DATABASE) { + if (e.code() == error_code_tenant_already_exists) { + ASSERT(alreadyExists && operationType == OperationType::MANAGEMENT_DATABASE); + } else { + TraceEvent(SevError, "CreateTenantFailure").error(e).detail("TenantName", tenant); + } + return Void(); + } else { try { - tr.setOption(FDBTransactionOptions::RAW_ACCESS); - Optional val = wait(tr.get(self->keyName.withPrefix(entry.get().prefix))); - ASSERT(val.present()); - ASSERT(val.get() == tenant); - break; + wait(tr->onError(e)); } catch (Error& e) { - wait(tr.onError(e)); + TraceEvent(SevError, "CreateTenantFailure").error(e).detail("TenantName", tenant); + return Void(); } } } - - wait(self->checkTenant(cx, self, tenant, 
self->createdTenants[tenant])); - } catch (Error& e) { - if (e.code() == error_code_tenant_already_exists) { - ASSERT(alreadyExists); - } else if (e.code() == error_code_invalid_tenant_name) { - ASSERT(tenant.startsWith("\xff"_sr)); - } else { - TraceEvent(SevError, "CreateTenantFailure").error(e).detail("TenantName", tenant); - } } - - return Void(); } ACTOR Future deleteTenant(Database cx, TenantManagementWorkload* self) { state TenantName tenant = self->chooseTenantName(true); + state OperationType operationType = TenantManagementWorkload::randomOperationType(); + state Reference tr = makeReference(cx); + + state Optional endTenant = operationType != OperationType::MANAGEMENT_DATABASE && + !tenant.startsWith("\xff"_sr) && + deterministicRandom()->random01() < 0.2 + ? Optional(self->chooseTenantName(false)) + : Optional(); + + if (endTenant.present() && endTenant < tenant) { + TenantName temp = tenant; + tenant = endTenant.get(); + endTenant = temp; + } auto itr = self->createdTenants.find(tenant); state bool alreadyExists = itr != self->createdTenants.end(); - state bool isEmpty = (itr == self->createdTenants.end() || itr->second.empty); + state bool isEmpty = true; - try { - if (alreadyExists && deterministicRandom()->random01() < 0.5) { - state Transaction tr(cx, tenant); - loop { - try { - tr.clear(self->keyName); - wait(tr.commit()); - isEmpty = true; - auto itr = self->createdTenants.find(tenant); - ASSERT(itr != self->createdTenants.end()); - itr->second.empty = true; - break; - } catch (Error& e) { - wait(tr.onError(e)); - } - } - } - - wait(ManagementAPI::deleteTenant(cx.getReference(), tenant)); - ASSERT(alreadyExists); - ASSERT(isEmpty); - self->createdTenants.erase(tenant); - } catch (Error& e) { - if (e.code() == error_code_tenant_not_found) { - ASSERT(!alreadyExists); - } else if (e.code() == error_code_tenant_not_empty) { - ASSERT(!isEmpty); - } else { - TraceEvent(SevError, "DeleteTenantFailure").error(e).detail("TenantName", tenant); + state 
std::vector tenants; + if (!endTenant.present()) { + tenants.push_back(tenant); + } else if (endTenant.present()) { + for (auto itr = self->createdTenants.lower_bound(tenant); + itr != self->createdTenants.end() && itr->first < endTenant.get(); + ++itr) { + tenants.push_back(itr->first); } } - return Void(); + try { + if (alreadyExists || endTenant.present()) { + state int tenantIndex = 0; + for (; tenantIndex < tenants.size(); ++tenantIndex) { + if (deterministicRandom()->random01() < 0.9) { + state Transaction clearTr(cx, tenants[tenantIndex]); + loop { + try { + clearTr.clear(self->keyName); + wait(clearTr.commit()); + auto itr = self->createdTenants.find(tenants[tenantIndex]); + ASSERT(itr != self->createdTenants.end()); + itr->second.empty = true; + break; + } catch (Error& e) { + wait(clearTr.onError(e)); + } + } + } else { + auto itr = self->createdTenants.find(tenants[tenantIndex]); + ASSERT(itr != self->createdTenants.end()); + isEmpty = isEmpty && itr->second.empty; + } + } + } + } catch (Error& e) { + TraceEvent(SevError, "DeleteTenantFailure") + .error(e) + .detail("TenantName", tenant) + .detail("EndTenant", endTenant); + return Void(); + } + + loop { + try { + if (operationType == OperationType::SPECIAL_KEYS) { + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + Key key = self->specialKeysTenantMapPrefix.withSuffix(tenant); + if (endTenant.present()) { + tr->clear(KeyRangeRef(key, self->specialKeysTenantMapPrefix.withSuffix(endTenant.get()))); + } else { + tr->clear(key); + } + wait(tr->commit()); + } else if (operationType == OperationType::MANAGEMENT_DATABASE) { + ASSERT(tenants.size() == 1); + for (auto tenant : tenants) { + wait(ManagementAPI::deleteTenant(cx.getReference(), tenant)); + } + } else { + for (auto tenant : tenants) { + wait(ManagementAPI::deleteTenantTransaction(tr, tenant)); + } + + wait(tr->commit()); + } + + if (!alreadyExists && !endTenant.present() && operationType != OperationType::MANAGEMENT_DATABASE) { 
+ return Void(); + } + + ASSERT(alreadyExists || endTenant.present()); + ASSERT(isEmpty); + for (auto tenant : tenants) { + self->createdTenants.erase(tenant); + } + return Void(); + } catch (Error& e) { + if (e.code() == error_code_tenant_not_empty) { + ASSERT(!isEmpty); + return Void(); + } else if (operationType == OperationType::MANAGEMENT_DATABASE) { + if (e.code() == error_code_tenant_not_found) { + ASSERT(!alreadyExists && !endTenant.present()); + } else { + TraceEvent(SevError, "DeleteTenantFailure") + .error(e) + .detail("TenantName", tenant) + .detail("EndTenant", endTenant); + } + return Void(); + } else { + try { + wait(tr->onError(e)); + } catch (Error& e) { + TraceEvent(SevError, "DeleteTenantFailure") + .error(e) + .detail("TenantName", tenant) + .detail("EndTenant", endTenant); + return Void(); + } + } + } + } } ACTOR Future checkTenant(Database cx, @@ -234,64 +366,146 @@ struct TenantManagementWorkload : TestWorkload { return Void(); } + static TenantMapEntry jsonToTenantMapEntry(ValueRef tenantJson) { + json_spirit::mValue jsonObject; + json_spirit::read_string(tenantJson.toString(), jsonObject); + JSONDoc jsonDoc(jsonObject); + + int64_t id; + std::string prefix; + jsonDoc.get("id", id); + jsonDoc.get("prefix", prefix); + + Key prefixKey = KeyRef(unprintable(prefix)); + TenantMapEntry entry(id, prefixKey.substr(0, prefixKey.size() - 8)); + + ASSERT(entry.prefix == prefixKey); + return entry; + } + ACTOR Future getTenant(Database cx, TenantManagementWorkload* self) { state TenantName tenant = self->chooseTenantName(true); auto itr = self->createdTenants.find(tenant); state bool alreadyExists = itr != self->createdTenants.end(); state TenantState tenantState = itr->second; + state OperationType operationType = TenantManagementWorkload::randomOperationType(); + state Reference tr = makeReference(cx); - try { - TenantMapEntry entry = wait(ManagementAPI::getTenant(cx.getReference(), tenant)); - ASSERT(alreadyExists); - ASSERT(entry.id == 
tenantState.id); - wait(self->checkTenant(cx, self, tenant, tenantState)); - } catch (Error& e) { - if (e.code() == error_code_tenant_not_found) { - ASSERT(!alreadyExists); - } else { - TraceEvent(SevError, "GetTenantFailure").error(e).detail("TenantName", tenant); + loop { + try { + state TenantMapEntry entry; + if (operationType == OperationType::SPECIAL_KEYS) { + Key key = self->specialKeysTenantMapPrefix.withSuffix(tenant); + Optional value = wait(tr->get(key)); + if (!value.present()) { + throw tenant_not_found(); + } + entry = TenantManagementWorkload::jsonToTenantMapEntry(value.get()); + } else if (operationType == OperationType::MANAGEMENT_DATABASE) { + TenantMapEntry _entry = wait(ManagementAPI::getTenant(cx.getReference(), tenant)); + entry = _entry; + } else { + TenantMapEntry _entry = wait(ManagementAPI::getTenantTransaction(tr, tenant)); + entry = _entry; + } + ASSERT(alreadyExists); + ASSERT(entry.id == tenantState.id); + wait(self->checkTenant(cx, self, tenant, tenantState)); + return Void(); + } catch (Error& e) { + state bool retry = true; + state Error error = e; + + if (e.code() == error_code_tenant_not_found) { + ASSERT(!alreadyExists); + return Void(); + } else if (operationType != OperationType::MANAGEMENT_DATABASE) { + try { + wait(tr->onError(e)); + } catch (Error& e) { + error = e; + retry = false; + } + } + + if (!retry) { + TraceEvent(SevError, "GetTenantFailure").error(error).detail("TenantName", tenant); + return Void(); + } } } - - return Void(); } ACTOR Future listTenants(Database cx, TenantManagementWorkload* self) { state TenantName beginTenant = self->chooseTenantName(false); state TenantName endTenant = self->chooseTenantName(false); - state int limit = std::min(CLIENT_KNOBS->TOO_MANY, deterministicRandom()->randomInt(0, self->maxTenants * 2)); + state int limit = std::min(CLIENT_KNOBS->TOO_MANY, deterministicRandom()->randomInt(1, self->maxTenants * 2)); + state OperationType operationType = 
TenantManagementWorkload::randomOperationType(); + state Reference tr = makeReference(cx); if (beginTenant > endTenant) { std::swap(beginTenant, endTenant); } - try { - Standalone> tenants = - wait(ManagementAPI::listTenants(cx.getReference(), beginTenant, endTenant, limit)); + loop { + try { + state std::map tenants; + if (operationType == OperationType::SPECIAL_KEYS) { + KeyRange range = KeyRangeRef(beginTenant, endTenant).withPrefix(self->specialKeysTenantMapPrefix); + RangeResult results = wait(tr->getRange(range, limit)); + for (auto result : results) { + tenants[result.key.removePrefix(self->specialKeysTenantMapPrefix)] = + TenantManagementWorkload::jsonToTenantMapEntry(result.value); + } + } else if (operationType == OperationType::MANAGEMENT_DATABASE) { + std::map _tenants = + wait(ManagementAPI::listTenants(cx.getReference(), beginTenant, endTenant, limit)); + tenants = _tenants; + } else { + std::map _tenants = + wait(ManagementAPI::listTenantsTransaction(tr, beginTenant, endTenant, limit)); + tenants = _tenants; + } - ASSERT(tenants.size() <= limit); + ASSERT(tenants.size() <= limit); - int index = 0; - auto itr = self->createdTenants.lower_bound(beginTenant); - for (; index < tenants.size(); ++itr) { - ASSERT(itr != self->createdTenants.end()); - ASSERT(itr->first == tenants[index++]); - } + auto localItr = self->createdTenants.lower_bound(beginTenant); + auto tenantMapItr = tenants.begin(); + for (; tenantMapItr != tenants.end(); ++tenantMapItr, ++localItr) { + ASSERT(localItr != self->createdTenants.end()); + ASSERT(localItr->first == tenantMapItr->first); + } - if (!(tenants.size() == limit || itr == self->createdTenants.end())) { - for (auto tenant : self->createdTenants) { - TraceEvent("ExistingTenant").detail("Tenant", tenant.first); + if (!(tenants.size() == limit || localItr == self->createdTenants.end())) { + for (auto tenant : self->createdTenants) { + TraceEvent("ExistingTenant").detail("Tenant", tenant.first); + } + } + ASSERT(tenants.size() 
== limit || localItr == self->createdTenants.end() || + localItr->first >= endTenant); + return Void(); + } catch (Error& e) { + state bool retry = true; + state Error error = e; + if (operationType != OperationType::MANAGEMENT_DATABASE) { + try { + wait(tr->onError(e)); + } catch (Error& e) { + error = e; + retry = false; + } + } + + if (!retry) { + TraceEvent(SevError, "ListTenantFailure") + .error(error) + .detail("BeginTenant", beginTenant) + .detail("EndTenant", endTenant); + + return Void(); } } - ASSERT(tenants.size() == limit || itr == self->createdTenants.end() || itr->first >= endTenant); - } catch (Error& e) { - TraceEvent(SevError, "ListTenantFailure") - .error(e) - .detail("BeginTenant", beginTenant) - .detail("EndTenant", endTenant); } - - return Void(); } Future start(Database const& cx) override { return _start(cx, this); } @@ -334,20 +548,22 @@ struct TenantManagementWorkload : TestWorkload { state TenantName endTenant = "\xff\xff"_sr.withPrefix(self->localTenantNamePrefix); loop { - Standalone> tenants = + std::map tenants = wait(ManagementAPI::listTenants(cx.getReference(), beginTenant, endTenant, 1000)); + TenantNameRef lastTenant; for (auto tenant : tenants) { ASSERT(itr != self->createdTenants.end()); - ASSERT(tenant == itr->first); - checkTenants.push_back(self->checkTenant(cx, self, tenant, itr->second)); + ASSERT(tenant.first == itr->first); + checkTenants.push_back(self->checkTenant(cx, self, tenant.first, itr->second)); + lastTenant = tenant.first; ++itr; } if (tenants.size() < 1000) { break; } else { - beginTenant = keyAfter(tenants[tenants.size() - 1]); + beginTenant = keyAfter(lastTenant); } } From e339c8f8f25371833c9f67aed84b718c05e74d07 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Mon, 7 Mar 2022 11:49:25 -0800 Subject: [PATCH 128/138] Fix formatting --- fdbserver/workloads/FuzzApiCorrectness.actor.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fdbserver/workloads/FuzzApiCorrectness.actor.cpp b/fdbserver/workloads/FuzzApiCorrectness.actor.cpp index de74351395..d919d6c140 100644 --- a/fdbserver/workloads/FuzzApiCorrectness.actor.cpp +++ b/fdbserver/workloads/FuzzApiCorrectness.actor.cpp @@ -422,8 +422,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { wait(timeoutError(unsafeThreadFutureToFuture(tr->commit()), 30)); } catch (Error& e) { if (e.code() == error_code_client_invalid_operation || - e.code() == error_code_transaction_too_large || - e.code() == error_code_unknown_tenant) { + e.code() == error_code_transaction_too_large || e.code() == error_code_unknown_tenant) { throw not_committed(); } } From 49064054f66b7ebccb8c9d05fa759eac53c5a40e Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Mon, 7 Mar 2022 16:00:45 -0800 Subject: [PATCH 129/138] Use a sentinel key to test if a key selector potentially resolves into the system keyspace. --- fdbserver/workloads/ConflictRange.actor.cpp | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/fdbserver/workloads/ConflictRange.actor.cpp b/fdbserver/workloads/ConflictRange.actor.cpp index c5b0b1938d..9b53b503cd 100644 --- a/fdbserver/workloads/ConflictRange.actor.cpp +++ b/fdbserver/workloads/ConflictRange.actor.cpp @@ -93,6 +93,10 @@ struct ConflictRangeWorkload : TestWorkload { wait(timeKeeperSetDisable(cx)); } + // Set one key after the end of the tested range. If this key is included in the result, then + // we may have drifted into the system key-space and cannot evaluate the result. 
+ state Key sentinelKey = StringRef(format("%010d", self->maxKeySpace)); + loop { randomSets = !randomSets; @@ -127,6 +131,8 @@ struct ConflictRangeWorkload : TestWorkload { } } + tr0.set(sentinelKey, deterministicRandom()->randomUniqueID().toString()); + wait(tr0.commit()); break; } catch (Error& e) { @@ -212,7 +218,6 @@ struct ConflictRangeWorkload : TestWorkload { wait(tr2.commit()); state bool foundConflict = false; - state bool readToEnd = false; try { // Do the generated getRange in the other transaction and commit. if (self->testReadYourWrites) { @@ -221,7 +226,6 @@ struct ConflictRangeWorkload : TestWorkload { RangeResult res = wait(trRYOW.getRange(KeySelectorRef(StringRef(myKeyA), onEqualA, offsetA), KeySelectorRef(StringRef(myKeyB), onEqualB, offsetB), randomLimit)); - readToEnd = res.readThroughEnd; wait(trRYOW.commit()); } else { tr3.clear(StringRef(format("%010d", self->maxKeySpace + 1))); @@ -263,20 +267,12 @@ struct ConflictRangeWorkload : TestWorkload { throw not_committed(); } - if (originalResults[originalResults.size() - 1].key >= LiteralStringRef("\xff") || - originalResults.readThroughEnd) { + if (originalResults[originalResults.size() - 1].key >= sentinelKey) { // Results go into server keyspace, so if a key selector does not fully resolve offset, a // change won't effect results throw not_committed(); } - // GetRangeFallback has a conflict range that is too large if the end selector resolves to the - // key after the last key. In that case, we may get a spurious conflict. - // This check can be removed if GetRangeFallback is fixed. - if (readToEnd) { - throw not_committed(); - } - if ((originalResults[0].key == firstElement || originalResults[0].key == StringRef(format("%010d", *(insertedSet.begin())))) && offsetA < 0) { From 9e39ebd069e4567d91d47bf3004738da23ba53a0 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Tue, 8 Mar 2022 15:53:05 -0800 Subject: [PATCH 130/138] Use the new enum name OPTIONAL_TENANT --- fdbserver/SimulatedCluster.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 0f23955fba..a17574ccdd 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -2349,10 +2349,10 @@ ACTOR void setupAndRun(std::string dataFolder, if (deterministicRandom()->random01() < 0.9) { tenantMode = TenantMode::REQUIRED; } else { - tenantMode = TenantMode::OPTIONAL; + tenantMode = TenantMode::OPTIONAL_TENANT; } } else if (!allowDisablingTenants || deterministicRandom()->random01() < 0.5) { - tenantMode = TenantMode::OPTIONAL; + tenantMode = TenantMode::OPTIONAL_TENANT; } TraceEvent("SimulatedClusterTenantMode") From 4079b9f23d95f5aa82c0d7209d786fba2b7cb138 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Mon, 14 Mar 2022 09:21:22 -0700 Subject: [PATCH 131/138] Lazily assign the default tenant in simulated tests so that we can enforce that raw access isn't used on tenant transactions. Update various tests to avoid using raw access or system key access after they have performed other operations on the transaction. --- fdbclient/NativeAPI.actor.cpp | 118 +++++++----- fdbclient/NativeAPI.actor.h | 14 +- .../workloads/FuzzApiCorrectness.actor.cpp | 169 ++++++++++-------- .../SpecialKeySpaceCorrectness.actor.cpp | 10 +- tests/fast/FuzzApiCorrectness.toml | 1 + tests/fast/FuzzApiCorrectnessClean.toml | 1 + 6 files changed, 179 insertions(+), 134 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index f000111557..22ed708f0d 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -2796,7 +2796,7 @@ Future getKeyLocation(Reference trState, UseTenant useTenant, Version version) { auto f = getKeyLocation(trState->cx, - useTenant ? 
trState->tenant : Optional(), + useTenant ? trState->tenant() : Optional(), key, member, trState->spanID, @@ -2805,7 +2805,7 @@ Future getKeyLocation(Reference trState, isBackward, version); - if (trState->tenant.present() && useTenant) { + if (trState->tenant().present() && useTenant) { return map(f, [trState](const KeyRangeLocationInfo& locationInfo) { trState->tenantId = locationInfo.tenantEntry.id; return locationInfo; @@ -2940,7 +2940,7 @@ Future> getKeyRangeLocations(Referencecx, - useTenant ? trState->tenant : Optional(), + useTenant ? trState->tenant() : Optional(), keys, limit, reverse, @@ -2950,7 +2950,7 @@ Future> getKeyRangeLocations(ReferenceuseProvisionalProxies, version); - if (trState->tenant.present() && useTenant) { + if (trState->tenant().present() && useTenant) { return map(f, [trState](const std::vector& locationInfo) { ASSERT(!locationInfo.empty()); trState->tenantId = locationInfo[0].tenantEntry.id; @@ -2970,7 +2970,7 @@ ACTOR Future warmRange_impl(Reference trState, KeyRange loop { std::vector locations = wait(getKeyRangeLocations_internal(trState->cx, - trState->tenant, + trState->tenant(), keys, CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT, Reverse::False, @@ -2988,7 +2988,7 @@ ACTOR Future warmRange_impl(Reference trState, KeyRange if (totalRequests % 20 == 0) { // To avoid blocking the proxies from starting other transactions, occasionally get a read version. 
- state Transaction tr(trState->cx, trState->tenant); + state Transaction tr(trState->cx, trState->tenant()); loop { try { tr.setOption(FDBTransactionOptions::LOCK_AWARE); @@ -3028,14 +3028,15 @@ TransactionState::TransactionState(Database cx, TaskPriority taskID, SpanID spanID, Reference trLogInfo) - : cx(cx), tenant(tenant), trLogInfo(trLogInfo), options(cx), taskID(taskID), spanID(spanID) {} + : cx(cx), trLogInfo(trLogInfo), options(cx), taskID(taskID), spanID(spanID), tenant_(tenant), + tenantSet(tenant.present()) {} Reference TransactionState::cloneAndReset(Reference newTrLogInfo, bool generateNewSpan) const { SpanID newSpanID = generateNewSpan ? generateSpanID(cx->transactionTracingSample) : spanID; Reference newState = - makeReference(cx, tenant, cx->taskID, newSpanID, newTrLogInfo); + makeReference(cx, tenant_, cx->taskID, newSpanID, newTrLogInfo); if (!cx->apiVersionAtLeast(16)) { newState->options = options; @@ -3049,18 +3050,37 @@ Reference TransactionState::cloneAndReset(Referenceinternal && !options.rawAccess && cx->clientInfo->get().tenantMode == TenantMode::REQUIRED && - !tenant.present()) { - throw tenant_name_required(); - } else if (options.rawAccess || !tenant.present()) { +TenantInfo TransactionState::getTenantInfo() { + Optional const& t = tenant(); + + if (options.rawAccess) { return TenantInfo(); - } else if (cx->clientInfo->get().tenantMode == TenantMode::DISABLED && tenant.present()) { + } else if (!cx->internal && cx->clientInfo->get().tenantMode == TenantMode::REQUIRED && !t.present()) { + throw tenant_name_required(); + } else if (!t.present()) { + return TenantInfo(); + } else if (cx->clientInfo->get().tenantMode == TenantMode::DISABLED && t.present()) { throw tenants_disabled(); } ASSERT(tenantId != TenantInfo::INVALID_TENANT); - return TenantInfo(tenant.get(), tenantId); + return TenantInfo(t.get(), tenantId); +} + +Optional const& TransactionState::tenant() { + if (tenantSet) { + return tenant_; + } else { + if (!tenant_.present() && 
!options.rawAccess) { + tenant_ = cx->defaultTenant; + } + tenantSet = true; + return tenant_; + } +} + +bool TransactionState::hasTenant() const { + return tenantSet && tenant_.present(); } Future Transaction::warmRange(KeyRange keys) { @@ -3074,8 +3094,8 @@ ACTOR Future> getValue(Reference trState, TransactionRecordLogInfo recordLogInfo) { state Version ver = wait(version); state Span span("NAPI:getValue"_loc, trState->spanID); - if (useTenant && trState->tenant.present()) { - span.addTag("tenant"_sr, trState->tenant.get()); + if (useTenant && trState->tenant().present()) { + span.addTag("tenant"_sr, trState->tenant().get()); } span.addTag("key"_sr, key); @@ -3143,7 +3163,7 @@ ACTOR Future> getValue(Reference trState, if (trState->trLogInfo && recordLogInfo) { int valueSize = reply.value.present() ? reply.value.get().size() : 0; trState->trLogInfo->addLog(FdbClientLogEvents::EventGet( - startTimeD, trState->cx->clientLocality.dcId(), latency, valueSize, key, trState->tenant)); + startTimeD, trState->cx->clientLocality.dcId(), latency, valueSize, key, trState->tenant())); } trState->cx->getValueCompleted->latency = timer_int() - startTime; trState->cx->getValueCompleted->log(); @@ -3178,8 +3198,8 @@ ACTOR Future> getValue(Reference trState, trState->cx->invalidateCache(locationInfo.tenantEntry.prefix, key); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else if (e.code() == error_code_unknown_tenant) { - ASSERT(useTenant && trState->tenant.present()); - trState->cx->invalidateCachedTenant(trState->tenant.get()); + ASSERT(useTenant && trState->tenant().present()); + trState->cx->invalidateCachedTenant(trState->tenant().get()); wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); } else { if (trState->trLogInfo && recordLogInfo) @@ -3187,7 +3207,7 @@ ACTOR Future> getValue(Reference trState, trState->cx->clientLocality.dcId(), static_cast(e.code()), key, - trState->tenant)); + trState->tenant())); throw e; } } @@ -3285,8 
+3305,8 @@ ACTOR Future getKey(Reference trState, wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else if (e.code() == error_code_unknown_tenant) { - ASSERT(useTenant && trState->tenant.present()); - trState->cx->invalidateCachedTenant(trState->tenant.get()); + ASSERT(useTenant && trState->tenant().present()); + trState->cx->invalidateCachedTenant(trState->tenant().get()); wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); } else { TraceEvent(SevInfo, "GetKeyError").error(e).detail("AtKey", k.getKey()).detail("Offset", k.offset); @@ -3631,8 +3651,8 @@ Future getExactRange(Reference trState, state RangeResultFamily output; state Span span("NAPI:getExactRange"_loc, trState->spanID); - if (useTenant && trState->tenant.present()) { - span.addTag("tenant"_sr, trState->tenant.get()); + if (useTenant && trState->tenant().present()) { + span.addTag("tenant"_sr, trState->tenant().get()); } // printf("getExactRange( '%s', '%s' )\n", keys.begin.toString().c_str(), keys.end.toString().c_str()); @@ -3793,14 +3813,14 @@ Future getExactRange(Reference trState, wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); break; } else if (e.code() == error_code_unknown_tenant) { - ASSERT(useTenant && trState->tenant.present()); - trState->cx->invalidateCachedTenant(trState->tenant.get()); + ASSERT(useTenant && trState->tenant().present()); + trState->cx->invalidateCachedTenant(trState->tenant().get()); wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); break; } else { TraceEvent(SevInfo, "GetExactRangeError") .error(e) - .detail("Tenant", trState->tenant) + .detail("Tenant", trState->tenant()) .detail("ShardBegin", locations[shard].range.begin) .detail("ShardEnd", locations[shard].range.end); throw; @@ -3931,7 +3951,7 @@ void getRangeFinished(Reference trState, bytes, begin.getKey(), end.getKey(), - trState->tenant)); + trState->tenant())); } if (!snapshot) { @@ -3988,8 +4008,8 @@ Future getRange(Reference 
trState, state KeySelector originalEnd = end; state RangeResultFamily output; state Span span("NAPI:getRange"_loc, trState->spanID); - if (useTenant && trState->tenant.present()) { - span.addTag("tenant"_sr, trState->tenant.get()); + if (useTenant && trState->tenant().present()) { + span.addTag("tenant"_sr, trState->tenant().get()); } try { @@ -4255,8 +4275,8 @@ Future getRange(Reference trState, wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else if (e.code() == error_code_unknown_tenant) { - ASSERT(useTenant && trState->tenant.present()); - trState->cx->invalidateCachedTenant(trState->tenant.get()); + ASSERT(useTenant && trState->tenant().present()); + trState->cx->invalidateCachedTenant(trState->tenant().get()); wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); } else { if (trState->trLogInfo) @@ -4266,7 +4286,7 @@ Future getRange(Reference trState, static_cast(e.code()), begin.getKey(), end.getKey(), - trState->tenant)); + trState->tenant())); throw e; } @@ -4711,8 +4731,8 @@ ACTOR Future getRangeStreamFragment(Reference trState, wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); break; } else if (e.code() == error_code_unknown_tenant) { - ASSERT(trState->tenant.present()); - trState->cx->invalidateCachedTenant(trState->tenant.get()); + ASSERT(trState->tenant().present()); + trState->cx->invalidateCachedTenant(trState->tenant().get()); wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); break; } else { @@ -4867,7 +4887,7 @@ Transaction::Transaction() Transaction::Transaction(Database const& cx, Optional const& tenant) : trState(makeReference(cx, - tenant.present() ? 
tenant : cx->defaultTenant, + tenant, cx->taskID, generateSpanID(cx->transactionTracingSample), createTrLogInfoProbabilistically(cx))), @@ -4986,7 +5006,7 @@ ACTOR Future getTenantMetadata(Reference trState, } Future populateAndGetTenant(Reference trState, Key const& key, Version version) { - if (!trState->tenant.present() || key == metadataVersionKey) { + if (!trState->tenant().present() || key == metadataVersionKey) { return TenantInfo(); } else if (trState->tenantId != TenantInfo::INVALID_TENANT) { return trState->getTenantInfo(); @@ -5073,7 +5093,7 @@ ACTOR Future>> getAddressesForKeyActor(Referen state std::vector ssi; state Key resolvedKey = key; - if (trState->tenant.present()) { + if (trState->tenant().present()) { state Version version = wait(ver); KeyRangeLocationInfo locationInfo = wait(getKeyLocation( trState, ""_sr, &StorageServerInterface::getValue, Reverse::False, UseTenant::True, version)); @@ -5871,7 +5891,7 @@ ACTOR static Future tryCommit(Reference trState, } state Key tenantPrefix; - if (trState->tenant.present()) { + if (trState->tenant().present()) { KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, ""_sr, &StorageServerInterface::getValue, @@ -5958,7 +5978,7 @@ ACTOR static Future tryCommit(Reference trState, req.transaction.mutations.expectedSize(), ci.version, req, - trState->tenant)); + trState->tenant())); return Void(); } else { // clear the RYW transaction which contains previous conflicting keys @@ -6016,8 +6036,8 @@ ACTOR static Future tryCommit(Reference trState, // retry it anyway (relying on transaction idempotence) but a client might do something else. 
throw commit_unknown_result(); } else if (e.code() == error_code_unknown_tenant) { - ASSERT(trState->tenant.present()); - trState->cx->invalidateCachedTenant(trState->tenant.get()); + ASSERT(trState->tenant().present()); + trState->cx->invalidateCachedTenant(trState->tenant().get()); throw; } else { if (e.code() != error_code_transaction_too_old && e.code() != error_code_not_committed && @@ -6029,7 +6049,7 @@ ACTOR static Future tryCommit(Reference trState, } if (trState->trLogInfo) trState->trLogInfo->addLog(FdbClientLogEvents::EventCommitError( - startTime, trState->cx->clientLocality.dcId(), static_cast(e.code()), req, trState->tenant)); + startTime, trState->cx->clientLocality.dcId(), static_cast(e.code()), req, trState->tenant())); throw; } } @@ -6375,9 +6395,9 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optionaltenant.present()) { + if (trState->hasTenant()) { Error e = invalid_option(); - TraceEvent(SevWarn, "TenantTransactionRawAccess").error(e).detail("Tenant", trState->tenant); + TraceEvent(SevWarn, "TenantTransactionRawAccess").error(e).detail("Tenant", trState->tenant()); throw e; } trState->options.rawAccess = true; @@ -6570,7 +6590,7 @@ ACTOR Future extractReadVersion(Reference trState, latency, trState->options.priority, rep.version, - trState->tenant)); + trState->tenant())); if (rep.locked && !trState->options.lockAware) throw database_locked(); @@ -7242,8 +7262,8 @@ ACTOR Future>> getRangeSplitPoints(Referencecx->invalidateCache(locations[0].tenantEntry.prefix, keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } else if (e.code() == error_code_unknown_tenant) { - ASSERT(trState->tenant.present()); - trState->cx->invalidateCachedTenant(trState->tenant.get()); + ASSERT(trState->tenant().present()); + trState->cx->invalidateCachedTenant(trState->tenant().get()); wait(delay(CLIENT_KNOBS->UNKNOWN_TENANT_RETRY_DELAY, trState->taskID)); } else { TraceEvent(SevError, 
"GetRangeSplitPoints").error(e); diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index 9bbe1073a7..a3c8c9e856 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -237,7 +237,6 @@ struct Watch : public ReferenceCounted, NonCopyable { struct TransactionState : ReferenceCounted { Database cx; - Optional tenant; int64_t tenantId = TenantInfo::INVALID_TENANT; Reference trLogInfo; TransactionOptions options; @@ -259,7 +258,7 @@ struct TransactionState : ReferenceCounted { std::shared_ptr> conflictingKeys; // Only available so that Transaction can have a default constructor, for use in state variables - TransactionState(TaskPriority taskID, SpanID spanID) : taskID(taskID), spanID(spanID) {} + TransactionState(TaskPriority taskID, SpanID spanID) : taskID(taskID), spanID(spanID), tenantSet(false) {} TransactionState(Database cx, Optional tenant, @@ -268,7 +267,14 @@ struct TransactionState : ReferenceCounted { Reference trLogInfo); Reference cloneAndReset(Reference newTrLogInfo, bool generateNewSpan) const; - TenantInfo getTenantInfo() const; + TenantInfo getTenantInfo(); + + Optional const& tenant(); + bool hasTenant() const; + +private: + Optional tenant_; + bool tenantSet; }; class Transaction : NonCopyable { @@ -447,7 +453,7 @@ public: return Standalone>(tr.transaction.write_conflict_ranges, tr.arena); } - Optional getTenant() { return trState->tenant; } + Optional getTenant() { return trState->tenant(); } Reference trState; std::vector> watches; diff --git a/fdbserver/workloads/FuzzApiCorrectness.actor.cpp b/fdbserver/workloads/FuzzApiCorrectness.actor.cpp index d919d6c140..5e1f39e5b5 100644 --- a/fdbserver/workloads/FuzzApiCorrectness.actor.cpp +++ b/fdbserver/workloads/FuzzApiCorrectness.actor.cpp @@ -23,6 +23,7 @@ #include #include +#include "fdbclient/FDBOptions.g.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbclient/GenericManagementAPI.actor.h" #include "fdbclient/ThreadSafeTransaction.h" @@ -116,7 
+117,6 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { int maxClearSize; double initialKeyDensity; bool useSystemKeys; - std::string keyPrefix; KeyRange conflictRange; unsigned int operationId; int64_t maximumTotalData; @@ -131,6 +131,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { std::set createdTenants; int numTenants; + // Map from tenant number to key prefix + std::map keyPrefixes; + FuzzApiCorrectnessWorkload(WorkloadContext const& wcx) : TestWorkload(wcx), operationId(0), success(true) { std::call_once(onceFlag, [&]() { addTestCases(); }); @@ -162,12 +165,13 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { nodes = deterministicRandom()->randomInt(1, 4 << deterministicRandom()->randomInt(0, 20)); } - int newNodes = std::min(nodes, maximumTotalData / (getKeyForIndex(nodes).size() + valueSizeRange.second)); + int newNodes = + std::min(nodes, maximumTotalData / (getKeyForIndex(-1, nodes).size() + valueSizeRange.second)); minNode = std::max(minNode, nodes - newNodes); nodes = newNodes; if (useSystemKeys && deterministicRandom()->coinflip()) { - keyPrefix = "\xff\x01"; + keyPrefixes[-1] = "\xff\x01"; } maxClearSize = 1 << deterministicRandom()->randomInt(0, 20); @@ -201,9 +205,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { std::string description() const override { return "FuzzApiCorrectness"; } static TenantName getTenant(int num) { return TenantNameRef(format("tenant_%d", num)); } - bool canUseTenant(Optional tenant) { - return !tenant.present() || createdTenants.count(tenant.get()) || useSystemKeys; - } + bool canUseTenant(Optional tenant) { return !tenant.present() || createdTenants.count(tenant.get()); } Future setup(Database const& cx) override { if (clientId == 0) { @@ -241,14 +243,20 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { Future check(Database const& cx) override { return success; } - Key getRandomKey() const { return getKeyForIndex(deterministicRandom()->randomInt(0, nodes)); } - - Key getKeyForIndex(int idx) 
const { + Key getKeyForIndex(int tenantNum, int idx) { idx += minNode; if (adjacentKeys) { - return Key(keyPrefix + std::string(idx, '\x00')); + return Key(keyPrefixes[tenantNum] + std::string(idx, '\x00')); } else { - return Key(keyPrefix + format("%010d", idx)); + return Key(keyPrefixes[tenantNum] + format("%010d", idx)); + } + } + + KeyRef getMaxKey(Reference tr) const { + if (useSystemKeys && !tr->getTenant().present()) { + return systemKeys.end; + } else { + return normalKeys.end; } } @@ -287,7 +295,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { state int keysPerBatch = std::min(1000, 1 + CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT / 2 / - (self->getKeyForIndex(nodesPerTenant).size() + self->valueSizeRange.second)); + (self->getKeyForIndex(-1, nodesPerTenant).size() + self->valueSizeRange.second)); try { loop { state int tenantNum = -1; @@ -302,22 +310,25 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { if (now() - startTime > self->testDuration) return Void(); try { - if (i == 0) { - tr->clear(normalKeys); - } - if (self->useSystemKeys) + if (self->useSystemKeys && tenantNum == -1) { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + } if (self->specialKeysRelaxed) tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_RELAXED); if (self->specialKeysWritesEnabled) tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + if (i == 0) { + tr->clear(normalKeys); + } + int end = std::min(nodesPerTenant, i + keysPerBatch); - tr->clear(KeyRangeRef(self->getKeyForIndex(i), self->getKeyForIndex(end))); + tr->clear(KeyRangeRef(self->getKeyForIndex(tenantNum, i), + self->getKeyForIndex(tenantNum, end))); for (int j = i; j < end; j++) { if (deterministicRandom()->random01() < self->initialKeyDensity) { - Key key = self->getKeyForIndex(j); + Key key = self->getKeyForIndex(tenantNum, j); if (key.size() <= (key.startsWith(systemKeys.begin) ? 
CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT : CLIENT_KNOBS->KEY_SIZE_LIMIT)) { @@ -370,13 +381,15 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { tr = self->tenants[tenantNum]->createTransaction(); } + state bool rawAccess = tenantNum == -1 && deterministicRandom()->coinflip(); + loop { state bool cancelled = false; if (readYourWritesDisabled) tr->setOption(FDBTransactionOptions::READ_YOUR_WRITES_DISABLE); if (readAheadDisabled) tr->setOption(FDBTransactionOptions::READ_AHEAD_DISABLE); - if (self->useSystemKeys) { + if (self->useSystemKeys && tenantNum == -1) { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); } if (self->specialKeysRelaxed) { @@ -385,6 +398,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { if (self->specialKeysWritesEnabled) { tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); } + if (rawAccess) { + tr->setOption(FDBTransactionOptions::RAW_ACCESS); + } tr->addWriteConflictRange(self->conflictRange); try { @@ -422,7 +438,8 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { wait(timeoutError(unsafeThreadFutureToFuture(tr->commit()), 30)); } catch (Error& e) { if (e.code() == error_code_client_invalid_operation || - e.code() == error_code_transaction_too_large || e.code() == error_code_unknown_tenant) { + e.code() == error_code_transaction_too_large || e.code() == error_code_unknown_tenant || + e.code() == error_code_invalid_option) { throw not_committed(); } } @@ -696,10 +713,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { : BaseTest(id, workload, "TestGet") { key = makeKey(); contract = { - std::make_pair(error_code_key_outside_legal_range, - ExceptionContract::requiredIf( - (key >= (workload->useSystemKeys ? 
systemKeys.end : normalKeys.end)) && - !specialKeys.contains(key))), + std::make_pair( + error_code_key_outside_legal_range, + ExceptionContract::requiredIf((key >= workload->getMaxKey(tr)) && !specialKeys.contains(key))), std::make_pair(error_code_client_invalid_operation, ExceptionContract::Possible), std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), std::make_pair( @@ -716,7 +732,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { LiteralStringRef("auto_coordinators") .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))), std::make_pair(error_code_tenant_not_found, - ExceptionContract::possibleIf(!workload->canUseTenant(tr->getTenant()))) + ExceptionContract::possibleIf(!workload->canUseTenant(tr->getTenant()))), + std::make_pair(error_code_invalid_option, + ExceptionContract::possibleIf(tr->getTenant().present() && specialKeys.contains(key))) }; } @@ -738,10 +756,8 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { TestGetKey(unsigned int id, FuzzApiCorrectnessWorkload* workload, Reference tr) : BaseTest(id, workload, "TestGetKey") { keysel = makeKeySel(); - contract = { std::make_pair( - error_code_key_outside_legal_range, - ExceptionContract::requiredIf( - (keysel.getKey() > (workload->useSystemKeys ? 
systemKeys.end : normalKeys.end)))), + contract = { std::make_pair(error_code_key_outside_legal_range, + ExceptionContract::requiredIf((keysel.getKey() > workload->getMaxKey(tr)))), std::make_pair(error_code_client_invalid_operation, ExceptionContract::Possible), std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), std::make_pair(error_code_tenant_not_found, @@ -784,10 +800,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { std::make_pair(error_code_range_limits_invalid, ExceptionContract::possibleButRequiredIf(limit < 0)), std::make_pair(error_code_client_invalid_operation, ExceptionContract::Possible), std::make_pair(error_code_key_outside_legal_range, - ExceptionContract::requiredIf( - ((keysel1.getKey() > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) || - (keysel2.getKey() > (workload->useSystemKeys ? systemKeys.end : normalKeys.end))) && - !isSpecialKeyRange)), + ExceptionContract::requiredIf(((keysel1.getKey() > workload->getMaxKey(tr)) || + (keysel2.getKey() > workload->getMaxKey(tr))) && + !isSpecialKeyRange)), std::make_pair(error_code_special_keys_cross_module_read, ExceptionContract::possibleIf(isSpecialKeyRange && !workload->specialKeysRelaxed)), std::make_pair(error_code_special_keys_no_module_found, @@ -797,7 +812,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { std::make_pair(error_code_special_keys_api_failure, ExceptionContract::possibleIf(isSpecialKeyRange)), std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), std::make_pair(error_code_tenant_not_found, - ExceptionContract::possibleIf(!workload->canUseTenant(tr->getTenant()))) + ExceptionContract::possibleIf(!workload->canUseTenant(tr->getTenant()))), + std::make_pair(error_code_invalid_option, + ExceptionContract::possibleIf(tr->getTenant().present() && isSpecialKeyRange)) }; } @@ -831,10 +848,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { ExceptionContract::possibleButRequiredIf(!limits.isReached() && 
!limits.isValid())), std::make_pair(error_code_client_invalid_operation, ExceptionContract::Possible), std::make_pair(error_code_key_outside_legal_range, - ExceptionContract::requiredIf( - ((keysel1.getKey() > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) || - (keysel2.getKey() > (workload->useSystemKeys ? systemKeys.end : normalKeys.end))) && - !isSpecialKeyRange)), + ExceptionContract::requiredIf(((keysel1.getKey() > workload->getMaxKey(tr)) || + (keysel2.getKey() > workload->getMaxKey(tr))) && + !isSpecialKeyRange)), std::make_pair(error_code_special_keys_cross_module_read, ExceptionContract::possibleIf(isSpecialKeyRange && !workload->specialKeysRelaxed)), std::make_pair(error_code_special_keys_no_module_found, @@ -843,7 +859,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { std::make_pair(error_code_special_keys_api_failure, ExceptionContract::possibleIf(isSpecialKeyRange)), std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), std::make_pair(error_code_tenant_not_found, - ExceptionContract::possibleIf(!workload->canUseTenant(tr->getTenant()))) + ExceptionContract::possibleIf(!workload->canUseTenant(tr->getTenant()))), + std::make_pair(error_code_invalid_option, + ExceptionContract::possibleIf(tr->getTenant().present() && isSpecialKeyRange)) }; } @@ -892,11 +910,10 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { std::make_pair(error_code_inverted_range, ExceptionContract::requiredIf(key1 > key2)), std::make_pair(error_code_range_limits_invalid, ExceptionContract::possibleButRequiredIf(limit < 0)), std::make_pair(error_code_client_invalid_operation, ExceptionContract::Possible), - std::make_pair(error_code_key_outside_legal_range, - ExceptionContract::requiredIf( - ((key1 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) || - (key2 > (workload->useSystemKeys ? 
systemKeys.end : normalKeys.end))) && - !isSpecialKeyRange)), + std::make_pair( + error_code_key_outside_legal_range, + ExceptionContract::requiredIf( + ((key1 > workload->getMaxKey(tr)) || (key2 > workload->getMaxKey(tr))) && !isSpecialKeyRange)), std::make_pair(error_code_special_keys_cross_module_read, ExceptionContract::possibleIf(isSpecialKeyRange && !workload->specialKeysRelaxed)), std::make_pair(error_code_special_keys_no_module_found, @@ -909,7 +926,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { autoCoordinatorSpecialKey < key2)), std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), std::make_pair(error_code_tenant_not_found, - ExceptionContract::possibleIf(!workload->canUseTenant(tr->getTenant()))) + ExceptionContract::possibleIf(!workload->canUseTenant(tr->getTenant()))), + std::make_pair(error_code_invalid_option, + ExceptionContract::possibleIf(tr->getTenant().present() && isSpecialKeyRange)) }; } @@ -946,11 +965,10 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { std::make_pair(error_code_range_limits_invalid, ExceptionContract::possibleButRequiredIf(!limits.isReached() && !limits.isValid())), std::make_pair(error_code_client_invalid_operation, ExceptionContract::Possible), - std::make_pair(error_code_key_outside_legal_range, - ExceptionContract::requiredIf( - ((key1 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) || - (key2 > (workload->useSystemKeys ? 
systemKeys.end : normalKeys.end))) && - !isSpecialKeyRange)), + std::make_pair( + error_code_key_outside_legal_range, + ExceptionContract::requiredIf( + ((key1 > workload->getMaxKey(tr)) || (key2 > workload->getMaxKey(tr))) && !isSpecialKeyRange)), std::make_pair(error_code_special_keys_cross_module_read, ExceptionContract::possibleIf(isSpecialKeyRange && !workload->specialKeysRelaxed)), std::make_pair(error_code_special_keys_no_module_found, @@ -963,7 +981,9 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { (autoCoordinatorSpecialKey < key2))), std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), std::make_pair(error_code_tenant_not_found, - ExceptionContract::possibleIf(!workload->canUseTenant(tr->getTenant()))) + ExceptionContract::possibleIf(!workload->canUseTenant(tr->getTenant()))), + std::make_pair(error_code_invalid_option, + ExceptionContract::possibleIf(tr->getTenant().present() && isSpecialKeyRange)) }; } @@ -1013,9 +1033,8 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { key2 = makeKey(); contract = { std::make_pair(error_code_inverted_range, ExceptionContract::requiredIf(key1 > key2)), std::make_pair(error_code_key_outside_legal_range, - ExceptionContract::requiredIf( - (key1 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) || - (key2 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)))) }; + ExceptionContract::requiredIf((key1 > workload->getMaxKey(tr)) || + (key2 > workload->getMaxKey(tr)))) }; } void callback(Reference tr) override { tr->addReadConflictRange(KeyRangeRef(key1, key2)); } @@ -1083,8 +1102,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { error_code_invalid_mutation_type, ExceptionContract::requiredIf(!isValidMutationType(op) || !isAtomicOp((MutationRef::Type)op))), std::make_pair(error_code_key_outside_legal_range, - ExceptionContract::requiredIf( - (key >= (workload->useSystemKeys ? 
systemKeys.end : normalKeys.end)))), + ExceptionContract::requiredIf((key >= workload->getMaxKey(tr)))), std::make_pair( error_code_client_invalid_operation, ExceptionContract::requiredIf( @@ -1121,9 +1139,8 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { std::make_pair(error_code_value_too_large, ExceptionContract::requiredIf(value.size() > CLIENT_KNOBS->VALUE_SIZE_LIMIT)), std::make_pair(error_code_key_outside_legal_range, - ExceptionContract::requiredIf( - (key >= (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) && - !specialKeys.contains(key))), + ExceptionContract::requiredIf((key >= workload->getMaxKey(tr)) && + !specialKeys.contains(key))), std::make_pair(error_code_special_keys_write_disabled, ExceptionContract::requiredIf(specialKeys.contains(key) && !workload->specialKeysWritesEnabled)), @@ -1157,11 +1174,10 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { contract = { std::make_pair(error_code_inverted_range, ExceptionContract::requiredIf(key1 > key2)), - std::make_pair(error_code_key_outside_legal_range, - ExceptionContract::requiredIf( - ((key1 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)) || - (key2 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end))) && - !isSpecialKeyRange)), + std::make_pair( + error_code_key_outside_legal_range, + ExceptionContract::requiredIf( + ((key1 > workload->getMaxKey(tr)) || (key2 > workload->getMaxKey(tr))) && !isSpecialKeyRange)), std::make_pair(error_code_special_keys_write_disabled, ExceptionContract::requiredIf(isSpecialKeyRange && !workload->specialKeysWritesEnabled)), std::make_pair(error_code_special_keys_cross_module_clear, @@ -1196,11 +1212,10 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { contract = { std::make_pair(error_code_inverted_range, ExceptionContract::requiredIf(key1 > key2)), - std::make_pair(error_code_key_outside_legal_range, - ExceptionContract::requiredIf( - ((key1 > (workload->useSystemKeys ? 
systemKeys.end : normalKeys.end)) || - (key2 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end))) && - !isSpecialKeyRange)), + std::make_pair( + error_code_key_outside_legal_range, + ExceptionContract::requiredIf( + ((key1 > workload->getMaxKey(tr)) || (key2 > workload->getMaxKey(tr))) && !isSpecialKeyRange)), std::make_pair(error_code_special_keys_write_disabled, ExceptionContract::requiredIf(isSpecialKeyRange && !workload->specialKeysWritesEnabled)), std::make_pair(error_code_special_keys_cross_module_clear, @@ -1229,8 +1244,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { key = makeKey(); } contract = { std::make_pair(error_code_key_outside_legal_range, - ExceptionContract::requiredIf( - key >= (workload->useSystemKeys ? systemKeys.end : normalKeys.end))), + ExceptionContract::requiredIf(key >= workload->getMaxKey(tr))), std::make_pair(error_code_special_keys_write_disabled, ExceptionContract::requiredIf(specialKeys.contains(key) && !workload->specialKeysWritesEnabled)), @@ -1261,8 +1275,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { : CLIENT_KNOBS->KEY_SIZE_LIMIT))), std::make_pair(error_code_watches_disabled, ExceptionContract::Possible), std::make_pair(error_code_key_outside_legal_range, - ExceptionContract::requiredIf( - (key >= (workload->useSystemKeys ? systemKeys.end : normalKeys.end)))), + ExceptionContract::requiredIf((key >= workload->getMaxKey(tr)))), std::make_pair(error_code_client_invalid_operation, ExceptionContract::Possible), std::make_pair(error_code_timed_out, ExceptionContract::Possible), std::make_pair(error_code_accessed_unreadable, ExceptionContract::Possible), @@ -1288,9 +1301,8 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { key2 = makeKey(); contract = { std::make_pair(error_code_inverted_range, ExceptionContract::requiredIf(key1 > key2)), std::make_pair(error_code_key_outside_legal_range, - ExceptionContract::requiredIf( - (key1 > (workload->useSystemKeys ? 
systemKeys.end : normalKeys.end)) || - (key2 > (workload->useSystemKeys ? systemKeys.end : normalKeys.end)))) }; + ExceptionContract::requiredIf((key1 > workload->getMaxKey(tr)) || + (key2 > workload->getMaxKey(tr)))) }; } void callback(Reference tr) override { tr->addWriteConflictRange(KeyRangeRef(key1, key2)); } @@ -1337,7 +1349,8 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { // do not test the following options since they are actually used by the workload if (op == FDBTransactionOptions::ACCESS_SYSTEM_KEYS || op == FDBTransactionOptions::READ_SYSTEM_KEYS || - op == FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES) { + op == FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES || + op == FDBTransactionOptions::RAW_ACCESS) { op = -1; } @@ -1406,7 +1419,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { tr->onError(Error::fromUnvalidatedCode(errorcode)); // This is necessary here, as onError will have reset this // value, we will be looking at the wrong thing. - if (workload->useSystemKeys) + if (workload->useSystemKeys && !tr->getTenant().present()) tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); } diff --git a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp index 2c142569d2..a390c3bc48 100644 --- a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp +++ b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp @@ -500,9 +500,9 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { if (!ryw) { tx->setOption(FDBTransactionOptions::READ_YOUR_WRITES_DISABLE); } + referenceTx->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); referenceTx->setVersion(100); // Prevent this from doing a GRV or committing referenceTx->clear(normalKeys); - referenceTx->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); int numKeys = deterministicRandom()->randomInt(1, self->conflictRangeSizeFactor) * 4; state std::vector keys; // Must all be distinct keys.resize(numKeys); 
@@ -805,6 +805,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { // class source will be changed wait(tx->commit()); tx->reset(); + tx->setOption(FDBTransactionOptions::RAW_ACCESS); Optional class_source = wait(tx->get( Key("process/class_source/" + address) .withPrefix( @@ -864,6 +865,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { // if database locked, fdb read should get database_locked error try { tx->reset(); + tx->setOption(FDBTransactionOptions::RAW_ACCESS); RangeResult res = wait(tx->getRange(normalKeys, 1)); } catch (Error& e) { if (e.code() == error_code_actor_cancelled) @@ -883,6 +885,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { TraceEvent(SevDebug, "DatabaseUnlocked").log(); tx->reset(); // read should be successful + tx->setOption(FDBTransactionOptions::RAW_ACCESS); RangeResult res = wait(tx->getRange(normalKeys, 1)); tx->reset(); break; @@ -1484,11 +1487,12 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { state Reference tr2(new ReadYourWritesTransaction(cx)); loop { try { - Version readVersion = wait(tr1->getReadVersion()); - tr2->setVersion(readVersion); tr1->setOption(FDBTransactionOptions::RAW_ACCESS); tr1->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); tr2->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + + Version readVersion = wait(tr1->getReadVersion()); + tr2->setVersion(readVersion); KeyRef ddPrefix = SpecialKeySpace::getManagementApiCommandPrefix("datadistribution"); tr1->set(LiteralStringRef("mode").withPrefix(ddPrefix), LiteralStringRef("1")); wait(tr1->commit()); diff --git a/tests/fast/FuzzApiCorrectness.toml b/tests/fast/FuzzApiCorrectness.toml index 51e884c090..36b8e6812f 100644 --- a/tests/fast/FuzzApiCorrectness.toml +++ b/tests/fast/FuzzApiCorrectness.toml @@ -1,6 +1,7 @@ [configuration] StderrSeverity = 30 allowDisablingTenants = false +allowDefaultTenant = false [[test]] testTitle = 'FuzzApiCorrectness' diff --git 
a/tests/fast/FuzzApiCorrectnessClean.toml b/tests/fast/FuzzApiCorrectnessClean.toml index 85aee61d12..deddfacce3 100644 --- a/tests/fast/FuzzApiCorrectnessClean.toml +++ b/tests/fast/FuzzApiCorrectnessClean.toml @@ -1,6 +1,7 @@ [configuration] StderrSeverity = 30 allowDisablingTenants = false +allowDefaultTenant = false [[test]] testTitle = 'FuzzApiCorrectness' From 513e8887ddd0d8dfc41cc73453f2290fb6239788 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Mon, 14 Mar 2022 16:08:32 -0700 Subject: [PATCH 132/138] Add a new C unit test that uses tenants. Fix simulation issue where resetting a transaction could cause in-flight operations to pick up the simulated default tenant for the next retry. Use the new list tenant JSON format in the tenant management test. Fix missing raw_access option in special key space correctness test. --- bindings/c/test/unit/fdb_api.cpp | 16 ++- bindings/c/test/unit/fdb_api.hpp | 10 ++ bindings/c/test/unit/unit_tests.cpp | 106 ++++++++++++++++++ fdbclient/NativeAPI.actor.cpp | 1 + .../SpecialKeySpaceCorrectness.actor.cpp | 1 + .../workloads/TenantManagement.actor.cpp | 2 +- tests/TestRunner/local_cluster.py | 2 +- 7 files changed, 135 insertions(+), 3 deletions(-) diff --git a/bindings/c/test/unit/fdb_api.cpp b/bindings/c/test/unit/fdb_api.cpp index 301cc5832b..9ce9dd5d9e 100644 --- a/bindings/c/test/unit/fdb_api.cpp +++ b/bindings/c/test/unit/fdb_api.cpp @@ -130,8 +130,15 @@ EmptyFuture Database::create_snapshot(FDBDatabase* db, return EmptyFuture(fdb_database_create_snapshot(db, uid, uid_length, snap_command, snap_command_length)); } -// Transaction +// Tenant +Tenant::Tenant(FDBDatabase* db, const uint8_t* name, int name_length) { + if (fdb_error_t err = fdb_database_open_tenant(db, name, name_length, &tenant)) { + std::cerr << fdb_get_error(err) << std::endl; + std::abort(); + } +} +// Transaction Transaction::Transaction(FDBDatabase* db) { if (fdb_error_t err = fdb_database_create_transaction(db, &tr_)) { std::cerr << 
fdb_get_error(err) << std::endl; @@ -139,6 +146,13 @@ Transaction::Transaction(FDBDatabase* db) { } } +Transaction::Transaction(Tenant tenant) { + if (fdb_error_t err = fdb_tenant_create_transaction(tenant.tenant, &tr_)) { + std::cerr << fdb_get_error(err) << std::endl; + std::abort(); + } +} + Transaction::~Transaction() { fdb_transaction_destroy(tr_); } diff --git a/bindings/c/test/unit/fdb_api.hpp b/bindings/c/test/unit/fdb_api.hpp index 63ee9573c8..5653d6e7cb 100644 --- a/bindings/c/test/unit/fdb_api.hpp +++ b/bindings/c/test/unit/fdb_api.hpp @@ -203,6 +203,15 @@ public: int snap_command_length); }; +class Tenant final { +public: + Tenant(FDBDatabase* db, const uint8_t* name, int name_length); + +private: + friend class Transaction; + FDBTenant* tenant; +}; + // Wrapper around FDBTransaction, providing the same set of calls as the C API. // Handles cleanup of memory, removing the need to call // fdb_transaction_destroy. @@ -210,6 +219,7 @@ class Transaction final { public: // Given an FDBDatabase, initializes a new transaction. Transaction(FDBDatabase* db); + Transaction(Tenant tenant); ~Transaction(); // Wrapper around fdb_transaction_reset. 
diff --git a/bindings/c/test/unit/unit_tests.cpp b/bindings/c/test/unit/unit_tests.cpp index d96f5ccdfc..5befee28a4 100644 --- a/bindings/c/test/unit/unit_tests.cpp +++ b/bindings/c/test/unit/unit_tests.cpp @@ -2410,6 +2410,112 @@ TEST_CASE("Fast alloc thread cleanup") { } } +TEST_CASE("Tenant create, access, and delete") { + std::string tenantName = "tenant"; + std::string testKey = "foo"; + std::string testValue = "bar"; + + fdb::Transaction tr(db); + while (1) { + fdb_check(tr.set_option(FDB_TR_OPTION_SPECIAL_KEY_SPACE_ENABLE_WRITES, nullptr, 0)); + fprintf(stderr, "Create tenant\n"); + tr.set("\xff\xff/management/tenant_map/" + tenantName, ""); + fdb::EmptyFuture commitFuture = tr.commit(); + fdb_error_t err = wait_future(commitFuture); + fprintf(stderr, "Got error: %d\n", err); + if (err) { + fdb::EmptyFuture f = tr.on_error(err); + fdb_check(wait_future(f)); + continue; + } + tr.reset(); + break; + } + + fdb::Tenant tenant(db, reinterpret_cast(tenantName.c_str()), tenantName.size()); + fdb::Transaction tr2(tenant); + + while (1) { + fprintf(stderr, "Set tenant key\n"); + tr2.set(testKey, testValue); + fdb::EmptyFuture commitFuture = tr2.commit(); + fdb_error_t err = wait_future(commitFuture); + fprintf(stderr, "Got error: %d\n", err); + if (err) { + fdb::EmptyFuture f = tr2.on_error(err); + fdb_check(wait_future(f)); + continue; + } + tr2.reset(); + break; + } + + while (1) { + fprintf(stderr, "Get tenant key\n"); + fdb::ValueFuture f1 = tr2.get(testKey, false); + fdb_error_t err = wait_future(f1); + fprintf(stderr, "Got error: %d\n", err); + if (err) { + fdb::EmptyFuture f2 = tr.on_error(err); + fdb_check(wait_future(f2)); + continue; + } + + int out_present; + char* val; + int vallen; + fdb_check(f1.get(&out_present, (const uint8_t**)&val, &vallen)); + CHECK(out_present == 1); + CHECK(vallen == testValue.size()); + CHECK(testValue == val); + + tr2.clear(testKey); + fdb::EmptyFuture commitFuture = tr2.commit(); + err = wait_future(commitFuture); + 
fprintf(stderr, "Got error: %d\n", err); + if (err) { + fdb::EmptyFuture f = tr2.on_error(err); + fdb_check(wait_future(f)); + continue; + } + + tr2.reset(); + break; + } + + while (1) { + fdb_check(tr.set_option(FDB_TR_OPTION_SPECIAL_KEY_SPACE_ENABLE_WRITES, nullptr, 0)); + fprintf(stderr, "Delete tenant\n"); + tr.clear("\xff\xff/management/tenant_map/" + tenantName); + fdb::EmptyFuture commitFuture = tr.commit(); + fdb_error_t err = wait_future(commitFuture); + fprintf(stderr, "Got error: %d\n", err); + if (err) { + fdb::EmptyFuture f = tr.on_error(err); + fdb_check(wait_future(f)); + continue; + } + tr.reset(); + break; + } + + while (1) { + fprintf(stderr, "Get tenant after delete\n"); + fdb::ValueFuture f1 = tr2.get(testKey, false); + fdb_error_t err = wait_future(f1); + fprintf(stderr, "Got error: %d\n", err); + if (err == error_code_tenant_not_found) { + tr2.reset(); + break; + } + if (err) { + fdb::EmptyFuture f2 = tr.on_error(err); + fdb_check(wait_future(f2)); + continue; + } + } +} + int main(int argc, char** argv) { if (argc < 3) { std::cout << "Unit tests for the FoundationDB C API.\n" diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 22ed708f0d..aa0045fe35 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -3046,6 +3046,7 @@ Reference TransactionState::cloneAndReset(ReferencestartTime = startTime; newState->committedVersion = committedVersion; newState->conflictingKeys = conflictingKeys; + newState->tenantSet = tenantSet; return newState; } diff --git a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp index a390c3bc48..b4e9e3c3a6 100644 --- a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp +++ b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp @@ -1145,6 +1145,7 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { } // advanceversion try { + tx->setOption(FDBTransactionOptions::RAW_ACCESS); Version 
v1 = wait(tx->getReadVersion()); TraceEvent(SevDebug, "InitialReadVersion").detail("Version", v1); state Version v2 = 2 * v1; diff --git a/fdbserver/workloads/TenantManagement.actor.cpp b/fdbserver/workloads/TenantManagement.actor.cpp index 2cf4f48a0c..ab2864d3b9 100644 --- a/fdbserver/workloads/TenantManagement.actor.cpp +++ b/fdbserver/workloads/TenantManagement.actor.cpp @@ -376,7 +376,7 @@ struct TenantManagementWorkload : TestWorkload { jsonDoc.get("id", id); jsonDoc.get("prefix", prefix); - Key prefixKey = KeyRef(unprintable(prefix)); + Key prefixKey = KeyRef(prefix); TenantMapEntry entry(id, prefixKey.substr(0, prefixKey.size() - 8)); ASSERT(entry.prefix == prefixKey); diff --git a/tests/TestRunner/local_cluster.py b/tests/TestRunner/local_cluster.py index 1f733aa4d6..30162147fe 100644 --- a/tests/TestRunner/local_cluster.py +++ b/tests/TestRunner/local_cluster.py @@ -136,5 +136,5 @@ logdir = {logdir} def create_database(self, storage='ssd'): args = [self.fdbcli_binary, '-C', self.etc.joinpath('fdb.cluster'), '--exec', - 'configure new single {}'.format(storage)] + 'configure new single {} tenant_mode=optional_experimental'.format(storage)] subprocess.run(args) From a691481629d74395576a732580164bbaf0f70a3b Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 15 Mar 2022 10:47:56 -0700 Subject: [PATCH 133/138] Allow tenants to test the metadata version key. Add a watch on the metadata version key to the versionstamp test. 
--- fdbserver/workloads/VersionStamp.actor.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fdbserver/workloads/VersionStamp.actor.cpp b/fdbserver/workloads/VersionStamp.actor.cpp index 69dc14a865..4b0403b0a2 100644 --- a/fdbserver/workloads/VersionStamp.actor.cpp +++ b/fdbserver/workloads/VersionStamp.actor.cpp @@ -78,9 +78,6 @@ struct VersionStampWorkload : TestWorkload { allowMetadataVersionKey = apiVersion >= 610 || apiVersion == Database::API_VERSION_LATEST; - // TODO: change this once metadata versions are supported for tenants - allowMetadataVersionKey = allowMetadataVersionKey && !cx->defaultTenant.present(); - cx->apiVersion = apiVersion; if (clientId == 0) return _start(cx, this, 1 / transactionsPerSecond); @@ -320,6 +317,7 @@ struct VersionStampWorkload : TestWorkload { extraDB = Database::createDatabase(extraFile, -1); } + state Future metadataWatch = Void(); loop { wait(poisson(&lastTime, delay)); bool oldVSFormat = !cx->apiVersionAtLeast(520); @@ -356,6 +354,7 @@ struct VersionStampWorkload : TestWorkload { state Error err; //TraceEvent("VST_CommitBegin").detail("Key", printable(key)).detail("VsKey", printable(versionStampKey)).detail("Clear", printable(range)); state Key testKey; + state Future nextMetadataWatch; try { tr.atomicOp(key, versionStampValue, MutationRef::SetVersionstampedValue); if (key == metadataVersionKey) { @@ -364,12 +363,21 @@ struct VersionStampWorkload : TestWorkload { } tr.clear(range); tr.atomicOp(versionStampKey, value, MutationRef::SetVersionstampedKey); + if (key == metadataVersionKey) { + nextMetadataWatch = tr.watch(versionStampKey); + } state Future> fTrVs = tr.getVersionstamp(); wait(tr.commit()); committedVersion = tr.getCommittedVersion(); Standalone committedVersionStamp_ = wait(fTrVs); committedVersionStamp = committedVersionStamp_; + + if (key == metadataVersionKey) { + wait(timeoutError(metadataWatch, 30)); + nextMetadataWatch = metadataWatch; + } + } catch (Error& e) { err = e; 
if (err.code() == error_code_database_locked && g_simulator.extraDB != nullptr) { From 92c1044a3d5f9255ae8dc8586e06c591c82b4c1e Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 15 Mar 2022 10:59:28 -0700 Subject: [PATCH 134/138] Fix formatting --- bindings/c/test/unit/unit_tests.cpp | 188 ++++++++++++++-------------- 1 file changed, 94 insertions(+), 94 deletions(-) diff --git a/bindings/c/test/unit/unit_tests.cpp b/bindings/c/test/unit/unit_tests.cpp index 5befee28a4..07d44efbb9 100644 --- a/bindings/c/test/unit/unit_tests.cpp +++ b/bindings/c/test/unit/unit_tests.cpp @@ -2411,109 +2411,109 @@ TEST_CASE("Fast alloc thread cleanup") { } TEST_CASE("Tenant create, access, and delete") { - std::string tenantName = "tenant"; - std::string testKey = "foo"; - std::string testValue = "bar"; + std::string tenantName = "tenant"; + std::string testKey = "foo"; + std::string testValue = "bar"; - fdb::Transaction tr(db); - while (1) { - fdb_check(tr.set_option(FDB_TR_OPTION_SPECIAL_KEY_SPACE_ENABLE_WRITES, nullptr, 0)); - fprintf(stderr, "Create tenant\n"); - tr.set("\xff\xff/management/tenant_map/" + tenantName, ""); - fdb::EmptyFuture commitFuture = tr.commit(); - fdb_error_t err = wait_future(commitFuture); - fprintf(stderr, "Got error: %d\n", err); - if (err) { - fdb::EmptyFuture f = tr.on_error(err); - fdb_check(wait_future(f)); - continue; - } - tr.reset(); - break; - } + fdb::Transaction tr(db); + while (1) { + fdb_check(tr.set_option(FDB_TR_OPTION_SPECIAL_KEY_SPACE_ENABLE_WRITES, nullptr, 0)); + fprintf(stderr, "Create tenant\n"); + tr.set("\xff\xff/management/tenant_map/" + tenantName, ""); + fdb::EmptyFuture commitFuture = tr.commit(); + fdb_error_t err = wait_future(commitFuture); + fprintf(stderr, "Got error: %d\n", err); + if (err) { + fdb::EmptyFuture f = tr.on_error(err); + fdb_check(wait_future(f)); + continue; + } + tr.reset(); + break; + } - fdb::Tenant tenant(db, reinterpret_cast(tenantName.c_str()), tenantName.size()); - fdb::Transaction tr2(tenant); 
+ fdb::Tenant tenant(db, reinterpret_cast(tenantName.c_str()), tenantName.size()); + fdb::Transaction tr2(tenant); - while (1) { - fprintf(stderr, "Set tenant key\n"); - tr2.set(testKey, testValue); - fdb::EmptyFuture commitFuture = tr2.commit(); - fdb_error_t err = wait_future(commitFuture); - fprintf(stderr, "Got error: %d\n", err); - if (err) { - fdb::EmptyFuture f = tr2.on_error(err); - fdb_check(wait_future(f)); - continue; - } - tr2.reset(); - break; - } + while (1) { + fprintf(stderr, "Set tenant key\n"); + tr2.set(testKey, testValue); + fdb::EmptyFuture commitFuture = tr2.commit(); + fdb_error_t err = wait_future(commitFuture); + fprintf(stderr, "Got error: %d\n", err); + if (err) { + fdb::EmptyFuture f = tr2.on_error(err); + fdb_check(wait_future(f)); + continue; + } + tr2.reset(); + break; + } - while (1) { - fprintf(stderr, "Get tenant key\n"); - fdb::ValueFuture f1 = tr2.get(testKey, false); - fdb_error_t err = wait_future(f1); - fprintf(stderr, "Got error: %d\n", err); - if (err) { - fdb::EmptyFuture f2 = tr.on_error(err); - fdb_check(wait_future(f2)); - continue; - } + while (1) { + fprintf(stderr, "Get tenant key\n"); + fdb::ValueFuture f1 = tr2.get(testKey, false); + fdb_error_t err = wait_future(f1); + fprintf(stderr, "Got error: %d\n", err); + if (err) { + fdb::EmptyFuture f2 = tr.on_error(err); + fdb_check(wait_future(f2)); + continue; + } - int out_present; - char* val; - int vallen; - fdb_check(f1.get(&out_present, (const uint8_t**)&val, &vallen)); - CHECK(out_present == 1); - CHECK(vallen == testValue.size()); - CHECK(testValue == val); + int out_present; + char* val; + int vallen; + fdb_check(f1.get(&out_present, (const uint8_t**)&val, &vallen)); + CHECK(out_present == 1); + CHECK(vallen == testValue.size()); + CHECK(testValue == val); - tr2.clear(testKey); - fdb::EmptyFuture commitFuture = tr2.commit(); - err = wait_future(commitFuture); - fprintf(stderr, "Got error: %d\n", err); - if (err) { - fdb::EmptyFuture f = tr2.on_error(err); - 
fdb_check(wait_future(f)); - continue; - } + tr2.clear(testKey); + fdb::EmptyFuture commitFuture = tr2.commit(); + err = wait_future(commitFuture); + fprintf(stderr, "Got error: %d\n", err); + if (err) { + fdb::EmptyFuture f = tr2.on_error(err); + fdb_check(wait_future(f)); + continue; + } - tr2.reset(); - break; - } + tr2.reset(); + break; + } - while (1) { - fdb_check(tr.set_option(FDB_TR_OPTION_SPECIAL_KEY_SPACE_ENABLE_WRITES, nullptr, 0)); - fprintf(stderr, "Delete tenant\n"); - tr.clear("\xff\xff/management/tenant_map/" + tenantName); - fdb::EmptyFuture commitFuture = tr.commit(); - fdb_error_t err = wait_future(commitFuture); - fprintf(stderr, "Got error: %d\n", err); - if (err) { - fdb::EmptyFuture f = tr.on_error(err); - fdb_check(wait_future(f)); - continue; - } - tr.reset(); - break; - } + while (1) { + fdb_check(tr.set_option(FDB_TR_OPTION_SPECIAL_KEY_SPACE_ENABLE_WRITES, nullptr, 0)); + fprintf(stderr, "Delete tenant\n"); + tr.clear("\xff\xff/management/tenant_map/" + tenantName); + fdb::EmptyFuture commitFuture = tr.commit(); + fdb_error_t err = wait_future(commitFuture); + fprintf(stderr, "Got error: %d\n", err); + if (err) { + fdb::EmptyFuture f = tr.on_error(err); + fdb_check(wait_future(f)); + continue; + } + tr.reset(); + break; + } - while (1) { - fprintf(stderr, "Get tenant after delete\n"); - fdb::ValueFuture f1 = tr2.get(testKey, false); - fdb_error_t err = wait_future(f1); - fprintf(stderr, "Got error: %d\n", err); - if (err == error_code_tenant_not_found) { - tr2.reset(); - break; - } - if (err) { - fdb::EmptyFuture f2 = tr.on_error(err); - fdb_check(wait_future(f2)); - continue; - } - } + while (1) { + fprintf(stderr, "Get tenant after delete\n"); + fdb::ValueFuture f1 = tr2.get(testKey, false); + fdb_error_t err = wait_future(f1); + fprintf(stderr, "Got error: %d\n", err); + if (err == error_code_tenant_not_found) { + tr2.reset(); + break; + } + if (err) { + fdb::EmptyFuture f2 = tr.on_error(err); + fdb_check(wait_future(f2)); + continue; 
+ } + } } int main(int argc, char** argv) { From f5bf4f8465f5c7a938c59d852cda5808b557ed00 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 15 Mar 2022 15:15:34 -0700 Subject: [PATCH 135/138] Request system key access when running management API transaction functions. --- fdbserver/workloads/TenantManagement.actor.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fdbserver/workloads/TenantManagement.actor.cpp b/fdbserver/workloads/TenantManagement.actor.cpp index ab2864d3b9..891b1e9482 100644 --- a/fdbserver/workloads/TenantManagement.actor.cpp +++ b/fdbserver/workloads/TenantManagement.actor.cpp @@ -141,6 +141,7 @@ struct TenantManagementWorkload : TestWorkload { } else if (operationType == OperationType::MANAGEMENT_DATABASE) { wait(ManagementAPI::createTenant(cx.getReference(), tenant)); } else { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); Optional _ = wait(ManagementAPI::createTenantTransaction(tr, tenant)); wait(tr->commit()); } @@ -296,6 +297,7 @@ struct TenantManagementWorkload : TestWorkload { wait(ManagementAPI::deleteTenant(cx.getReference(), tenant)); } } else { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); for (auto tenant : tenants) { wait(ManagementAPI::deleteTenantTransaction(tr, tenant)); } @@ -405,6 +407,7 @@ struct TenantManagementWorkload : TestWorkload { TenantMapEntry _entry = wait(ManagementAPI::getTenant(cx.getReference(), tenant)); entry = _entry; } else { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); TenantMapEntry _entry = wait(ManagementAPI::getTenantTransaction(tr, tenant)); entry = _entry; } @@ -462,6 +465,7 @@ struct TenantManagementWorkload : TestWorkload { wait(ManagementAPI::listTenants(cx.getReference(), beginTenant, endTenant, limit)); tenants = _tenants; } else { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); std::map _tenants = wait(ManagementAPI::listTenantsTransaction(tr, beginTenant, endTenant, limit)); tenants = _tenants; From 
ee708379703ea21f792d0d408a4c7ff4ee5e27cd Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 16 Mar 2022 10:58:23 -0700 Subject: [PATCH 136/138] Remove some debugging lines, change simulated default cluster frequency, and use a different mechanism to disable tenants for the change feed test. --- bindings/c/test/unit/unit_tests.cpp | 11 ----------- fdbserver/SimulatedCluster.actor.cpp | 2 +- fdbserver/workloads/ChangeFeeds.actor.cpp | 2 -- tests/fast/ChangeFeeds.toml | 1 + 4 files changed, 2 insertions(+), 14 deletions(-) diff --git a/bindings/c/test/unit/unit_tests.cpp b/bindings/c/test/unit/unit_tests.cpp index 07d44efbb9..b4bc11f9d9 100644 --- a/bindings/c/test/unit/unit_tests.cpp +++ b/bindings/c/test/unit/unit_tests.cpp @@ -2418,11 +2418,9 @@ TEST_CASE("Tenant create, access, and delete") { fdb::Transaction tr(db); while (1) { fdb_check(tr.set_option(FDB_TR_OPTION_SPECIAL_KEY_SPACE_ENABLE_WRITES, nullptr, 0)); - fprintf(stderr, "Create tenant\n"); tr.set("\xff\xff/management/tenant_map/" + tenantName, ""); fdb::EmptyFuture commitFuture = tr.commit(); fdb_error_t err = wait_future(commitFuture); - fprintf(stderr, "Got error: %d\n", err); if (err) { fdb::EmptyFuture f = tr.on_error(err); fdb_check(wait_future(f)); @@ -2436,11 +2434,9 @@ TEST_CASE("Tenant create, access, and delete") { fdb::Transaction tr2(tenant); while (1) { - fprintf(stderr, "Set tenant key\n"); tr2.set(testKey, testValue); fdb::EmptyFuture commitFuture = tr2.commit(); fdb_error_t err = wait_future(commitFuture); - fprintf(stderr, "Got error: %d\n", err); if (err) { fdb::EmptyFuture f = tr2.on_error(err); fdb_check(wait_future(f)); @@ -2451,10 +2447,8 @@ TEST_CASE("Tenant create, access, and delete") { } while (1) { - fprintf(stderr, "Get tenant key\n"); fdb::ValueFuture f1 = tr2.get(testKey, false); fdb_error_t err = wait_future(f1); - fprintf(stderr, "Got error: %d\n", err); if (err) { fdb::EmptyFuture f2 = tr.on_error(err); fdb_check(wait_future(f2)); @@ -2472,7 +2466,6 @@ 
TEST_CASE("Tenant create, access, and delete") { tr2.clear(testKey); fdb::EmptyFuture commitFuture = tr2.commit(); err = wait_future(commitFuture); - fprintf(stderr, "Got error: %d\n", err); if (err) { fdb::EmptyFuture f = tr2.on_error(err); fdb_check(wait_future(f)); @@ -2485,11 +2478,9 @@ TEST_CASE("Tenant create, access, and delete") { while (1) { fdb_check(tr.set_option(FDB_TR_OPTION_SPECIAL_KEY_SPACE_ENABLE_WRITES, nullptr, 0)); - fprintf(stderr, "Delete tenant\n"); tr.clear("\xff\xff/management/tenant_map/" + tenantName); fdb::EmptyFuture commitFuture = tr.commit(); fdb_error_t err = wait_future(commitFuture); - fprintf(stderr, "Got error: %d\n", err); if (err) { fdb::EmptyFuture f = tr.on_error(err); fdb_check(wait_future(f)); @@ -2500,10 +2491,8 @@ TEST_CASE("Tenant create, access, and delete") { } while (1) { - fprintf(stderr, "Get tenant after delete\n"); fdb::ValueFuture f1 = tr2.get(testKey, false); fdb_error_t err = wait_future(f1); - fprintf(stderr, "Got error: %d\n", err); if (err == error_code_tenant_not_found) { tr2.reset(); break; diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index a17574ccdd..599173ee67 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -2344,7 +2344,7 @@ ACTOR void setupAndRun(std::string dataFolder, state Optional defaultTenant; state TenantMode tenantMode = TenantMode::DISABLED; - if (allowDefaultTenant && deterministicRandom()->random01() < 1.0) { + if (allowDefaultTenant && deterministicRandom()->random01() < 0.5) { defaultTenant = "SimulatedDefaultTenant"_sr; if (deterministicRandom()->random01() < 0.9) { tenantMode = TenantMode::REQUIRED; diff --git a/fdbserver/workloads/ChangeFeeds.actor.cpp b/fdbserver/workloads/ChangeFeeds.actor.cpp index d7227c3269..edc3444340 100644 --- a/fdbserver/workloads/ChangeFeeds.actor.cpp +++ b/fdbserver/workloads/ChangeFeeds.actor.cpp @@ -37,8 +37,6 @@ ACTOR Future>, Version>> readDatabase(Database cx) { 
state Transaction tr(cx); loop { - // Change feeds do not currently support tenant based access - tr.setOption(FDBTransactionOptions::RAW_ACCESS); state Standalone> output; state Version readVersion; try { diff --git a/tests/fast/ChangeFeeds.toml b/tests/fast/ChangeFeeds.toml index d88341be6c..edebe5dc5f 100644 --- a/tests/fast/ChangeFeeds.toml +++ b/tests/fast/ChangeFeeds.toml @@ -1,5 +1,6 @@ [[test]] testTitle = 'ChangeFeed' +allowDefaultTenant = false [[test.workload]] testName = 'Cycle' From 74487310fa27fe76952edb03cc2f3a883f5bc115 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 16 Mar 2022 11:40:01 -0700 Subject: [PATCH 137/138] Fix a couple test specification errors --- tests/fast/ChangeFeeds.toml | 4 +++- tests/slow/SwizzledTenantManagement.toml | 2 +- tests/slow/TenantManagement.toml | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/fast/ChangeFeeds.toml b/tests/fast/ChangeFeeds.toml index edebe5dc5f..8f2d348dd3 100644 --- a/tests/fast/ChangeFeeds.toml +++ b/tests/fast/ChangeFeeds.toml @@ -1,6 +1,8 @@ +[configuration] +allowDefaultTenant = false + [[test]] testTitle = 'ChangeFeed' -allowDefaultTenant = false [[test.workload]] testName = 'Cycle' diff --git a/tests/slow/SwizzledTenantManagement.toml b/tests/slow/SwizzledTenantManagement.toml index 0c7c8a6b69..504d1640bb 100644 --- a/tests/slow/SwizzledTenantManagement.toml +++ b/tests/slow/SwizzledTenantManagement.toml @@ -1,4 +1,4 @@ -[[configuration]] +[configuration] allowDisablingTenants = false [[test]] diff --git a/tests/slow/TenantManagement.toml b/tests/slow/TenantManagement.toml index 9bdef4f7f6..33585353e5 100644 --- a/tests/slow/TenantManagement.toml +++ b/tests/slow/TenantManagement.toml @@ -1,4 +1,4 @@ -[[configuration]] +[configuration] allowDisablingTenants = false [[test]] From 6bccaa3e2df5e8a0c96c9abcee19035579df4946 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Thu, 17 Mar 2022 11:06:34 -0700 Subject: [PATCH 138/138] Fix option use in PhysicalShardMove and add support for tenants in the test. --- fdbserver/RocksDBCheckpointUtils.actor.cpp | 1 + fdbserver/workloads/PhysicalShardMove.actor.cpp | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/fdbserver/RocksDBCheckpointUtils.actor.cpp b/fdbserver/RocksDBCheckpointUtils.actor.cpp index 612f8b1f20..abaa235abe 100644 --- a/fdbserver/RocksDBCheckpointUtils.actor.cpp +++ b/fdbserver/RocksDBCheckpointUtils.actor.cpp @@ -140,6 +140,7 @@ ACTOR Future fetchCheckpointFile(Database cx, state StorageServerInterface ssi; loop { try { + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); Optional ss = wait(tr.get(serverListKeyFor(ssID))); if (!ss.present()) { throw checkpoint_not_found(); diff --git a/fdbserver/workloads/PhysicalShardMove.actor.cpp b/fdbserver/workloads/PhysicalShardMove.actor.cpp index 333b7bfc79..feb2ebd8e8 100644 --- a/fdbserver/workloads/PhysicalShardMove.actor.cpp +++ b/fdbserver/workloads/PhysicalShardMove.actor.cpp @@ -77,11 +77,11 @@ struct SSCheckpointWorkload : TestWorkload { // Create checkpoint. state Transaction tr(cx); - tr.setOption(FDBTransactionOptions::LOCK_AWARE); - tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); state CheckpointFormat format = RocksDBColumnFamily; loop { try { + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); wait(createCheckpoint(&tr, KeyRangeRef(key, endKey), format)); wait(tr.commit()); version = tr.getCommittedVersion(); @@ -157,9 +157,10 @@ struct SSCheckpointWorkload : TestWorkload { // Compare the keyrange between the original database and the one restored from checkpoint. // For now, it should have been a single key. 
tr.reset(); - tr.setOption(FDBTransactionOptions::LOCK_AWARE); loop { try { + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::RAW_ACCESS); state RangeResult res = wait(tr.getRange(KeyRangeRef(key, endKey), CLIENT_KNOBS->TOO_MANY)); break; } catch (Error& e) { @@ -182,10 +183,10 @@ struct SSCheckpointWorkload : TestWorkload { Key key, ErrorOr> expectedValue) { state Transaction tr(cx); - tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); loop { try { + tr.setOption(FDBTransactionOptions::RAW_ACCESS); state Optional res = wait(timeoutError(tr.get(key), 30.0)); const bool equal = !expectedValue.isError() && res == expectedValue.get(); if (!equal) { @@ -208,6 +209,7 @@ struct SSCheckpointWorkload : TestWorkload { state Version version; loop { try { + tr.setOption(FDBTransactionOptions::RAW_ACCESS); if (value.present()) { tr.set(key, value.get()); } else {