diff --git a/bindings/c/CMakeLists.txt b/bindings/c/CMakeLists.txt
index 7f71a0f44b..59a86e7261 100644
--- a/bindings/c/CMakeLists.txt
+++ b/bindings/c/CMakeLists.txt
@@ -119,6 +119,7 @@ if(NOT WIN32)
   set(API_TESTER_SRCS
     test/apitester/fdb_c_api_tester.cpp
+    test/apitester/TesterAtomicOpsCorrectnessWorkload.cpp
     test/apitester/TesterApiWorkload.cpp
     test/apitester/TesterApiWorkload.h
     test/apitester/TesterTestSpec.cpp
diff --git a/bindings/c/test/apitester/TesterAtomicOpsCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterAtomicOpsCorrectnessWorkload.cpp
new file mode 100644
index 0000000000..ae0d600422
--- /dev/null
+++ b/bindings/c/test/apitester/TesterAtomicOpsCorrectnessWorkload.cpp
@@ -0,0 +1,330 @@
+/*
+ * TesterAtomicOpsCorrectnessWorkload.cpp
+ *
+ * This source file is part of the FoundationDB open source project
+ *
+ * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "TesterApiWorkload.h"
+#include "TesterUtil.h"
+#include "fdb_c_options.g.h"
+#include "fmt/core.h"
+#include "test/fdb_api.hpp"
+#include <algorithm>
+#include <functional>
+#include <memory>
+
+namespace FdbApiTester {
+
+using fdb::Key;
+using fdb::Value;
+using fdb::ValueRef;
+
+class AtomicOpsCorrectnessWorkload : public ApiWorkload {
+public:
+    AtomicOpsCorrectnessWorkload(const WorkloadConfig& config) : ApiWorkload(config) {}
+
+private:
+    typedef std::function<uint64_t(uint64_t, uint64_t)> IntAtomicOpFunction;
+    typedef std::function<Value(ValueRef, ValueRef)> AtomicOpFunction;
+
+    enum OpType {
+        OP_ATOMIC_ADD,
+        OP_ATOMIC_BIT_AND,
+        OP_ATOMIC_BIT_OR,
+        OP_ATOMIC_BIT_XOR,
+        OP_ATOMIC_APPEND_IF_FITS,
+        OP_ATOMIC_MAX,
+        OP_ATOMIC_MIN,
+        OP_ATOMIC_VERSIONSTAMPED_KEY,
+        OP_ATOMIC_VERSIONSTAMPED_VALUE,
+        OP_ATOMIC_BYTE_MIN,
+        OP_ATOMIC_BYTE_MAX,
+        OP_ATOMIC_COMPARE_AND_CLEAR,
+        OP_LAST = OP_ATOMIC_COMPARE_AND_CLEAR
+    };
+
+    void randomOperation(TTaskFct cont) override {
+        OpType txType = (OpType)Random::get().randomInt(0, OP_LAST);
+
+        switch (txType) {
+        case OP_ATOMIC_ADD:
+            testIntAtomicOp(
+                FDBMutationType::FDB_MUTATION_TYPE_ADD, [](uint64_t val1, uint64_t val2) { return val1 + val2; }, cont);
+            break;
+        case OP_ATOMIC_BIT_AND:
+            testIntAtomicOp(
+                FDBMutationType::FDB_MUTATION_TYPE_BIT_AND,
+                [](uint64_t val1, uint64_t val2) { return val1 & val2; },
+                cont);
+            break;
+        case OP_ATOMIC_BIT_OR:
+            testIntAtomicOp(
+                FDBMutationType::FDB_MUTATION_TYPE_BIT_OR,
+                [](uint64_t val1, uint64_t val2) { return val1 | val2; },
+                cont);
+            break;
+        case OP_ATOMIC_BIT_XOR:
+            testIntAtomicOp(
+                FDBMutationType::FDB_MUTATION_TYPE_BIT_XOR,
+                [](uint64_t val1, uint64_t val2) { return val1 ^ val2; },
+                cont);
+            break;
+        case OP_ATOMIC_APPEND_IF_FITS: {
+            Value val1 = randomValue();
+            Value val2 = randomValue();
+            testAtomicOp(
+                FDBMutationType::FDB_MUTATION_TYPE_APPEND_IF_FITS,
+                val1,
+                val2,
+                [](ValueRef val1, ValueRef val2) { return Value(val1) + Value(val2); },
+                cont);
+            break;
+        }
+        case OP_ATOMIC_MAX:
+            testIntAtomicOp(
+                FDBMutationType::FDB_MUTATION_TYPE_MAX,
+                [](uint64_t val1, uint64_t val2) { return std::max(val1, val2); },
+                cont);
+            break;
+        case OP_ATOMIC_MIN:
+            testIntAtomicOp(
+                FDBMutationType::FDB_MUTATION_TYPE_MIN,
+                [](uint64_t val1, uint64_t val2) { return std::min(val1, val2); },
+                cont);
+            break;
+        case OP_ATOMIC_VERSIONSTAMPED_KEY:
+            testAtomicVersionstampedKeyOp(cont);
+            break;
+        case OP_ATOMIC_VERSIONSTAMPED_VALUE:
+            testAtomicVersionstampedValueOp(cont);
+            break;
+        case OP_ATOMIC_BYTE_MIN: {
+            Value val1 = randomValue();
+            Value val2 = randomValue();
+            testAtomicOp(
+                FDBMutationType::FDB_MUTATION_TYPE_BYTE_MIN,
+                val1,
+                val2,
+                [](ValueRef val1, ValueRef val2) { return Value(std::min(val1, val2)); },
+                cont);
+            break;
+        }
+        case OP_ATOMIC_BYTE_MAX: {
+            Value val1 = randomValue();
+            Value val2 = randomValue();
+            testAtomicOp(
+                FDBMutationType::FDB_MUTATION_TYPE_BYTE_MAX,
+                val1,
+                val2,
+                [](ValueRef val1, ValueRef val2) { return Value(std::max(val1, val2)); },
+                cont);
+            break;
+        }
+        case OP_ATOMIC_COMPARE_AND_CLEAR:
+            testAtomicCompareAndClearOp(cont);
+            break;
+        }
+    }
+
+    void testIntAtomicOp(FDBMutationType opType, IntAtomicOpFunction opFunc, TTaskFct cont) {
+        uint64_t intValue1 = Random::get().randomInt(0, 10000000);
+        uint64_t intValue2 = Random::get().randomInt(0, 10000000);
+
+        Value val1 = toByteString(intValue1);
+        Value val2 = toByteString(intValue2);
+        testAtomicOp(
+            opType,
+            val1,
+            val2,
+            [opFunc](ValueRef val1, ValueRef val2) {
+                return toByteString(opFunc(toInteger<uint64_t>(val1), toInteger<uint64_t>(val2)));
+            },
+            cont);
+    }
+
+    void testAtomicOp(FDBMutationType opType, Value val1, Value val2, AtomicOpFunction opFunc, TTaskFct cont) {
+        Key key(randomKeyName());
+        execTransaction(
+            // 1. Set the key to val1
+            [key, val1](auto ctx) {
+                ctx->tx().set(key, val1);
+                ctx->commit();
+            },
+            [this, opType, opFunc, key, val1, val2, cont]() {
+                execTransaction(
+                    // 2. Perform the given atomic operation to val2, but only if it hasn't been applied yet, otherwise
+                    // retries of commit_unknown_result would cause the operation to be applied multiple times, see
+                    // https://github.com/apple/foundationdb/issues/1321.
+                    [key, opType, val1, val2](auto ctx) {
+                        auto f = ctx->tx().get(key, false);
+                        ctx->continueAfter(f, [ctx, f, opType, key, val1, val2]() {
+                            auto outputVal = f.get();
+                            ASSERT(outputVal.has_value());
+                            if (outputVal.value() == val1) {
+                                ctx->tx().atomicOp(key, val2, opType);
+                                ctx->commit();
+                            } else {
+                                ctx->done();
+                            }
+                        });
+                    },
+                    [this, opFunc, key, val1, val2, cont]() {
+                        auto result = std::make_shared<Value>();
+
+                        execTransaction(
+                            // 3. Fetch the final value.
+                            [key, result](auto ctx) {
+                                auto f = ctx->tx().get(key, false);
+                                ctx->continueAfter(
+                                    f,
+                                    [ctx, f, result]() {
+                                        auto outputVal = f.get();
+                                        ASSERT(outputVal.has_value());
+                                        *result = outputVal.value();
+                                        ctx->done();
+                                    },
+                                    true);
+                            },
+                            [this, opFunc, key, val1, val2, result, cont]() {
+                                // 4. Assert expectation.
+                                auto expected = opFunc(val1, val2);
+                                if (*result != expected) {
+                                    error(fmt::format("testAtomicOp expected: {} actual: {}",
+                                                      fdb::toCharsRef(expected),
+                                                      fdb::toCharsRef(*result)));
+                                    ASSERT(false);
+                                }
+                                schedule(cont);
+                            });
+                    });
+            });
+    }
+
+    void testAtomicVersionstampedKeyOp(TTaskFct cont) {
+        Key keyPrefix(randomKeyName());
+        Key key = keyPrefix + fdb::ByteString(10, '\0') + toByteString((uint32_t)keyPrefix.size());
+        Value val = randomValue();
+
+        auto versionstamp_f = std::make_shared<fdb::TypedFuture<fdb::future_var::KeyRef>>();
+        execTransaction(
+            // 1. Perform SetVersionstampedKey operation.
+            [key, val, versionstamp_f](auto ctx) {
+                ctx->tx().atomicOp(key, val, FDBMutationType::FDB_MUTATION_TYPE_SET_VERSIONSTAMPED_KEY);
+                *versionstamp_f = ctx->tx().getVersionstamp();
+                ctx->commit();
+            },
+            [this, keyPrefix, val, versionstamp_f, cont]() {
+                ASSERT(versionstamp_f->ready());
+                auto resultKey = keyPrefix + Key(versionstamp_f->get());
+                auto resultVal = std::make_shared<Value>();
+                execTransaction(
+                    // 2. Fetch the resulting versionstamped key and value.
+                    [keyPrefix, resultKey, resultVal](auto ctx) {
+                        auto fv = ctx->tx().get(resultKey, false);
+                        ctx->continueAfter(fv, [ctx, fv, resultVal]() {
+                            auto outputVal = fv.get();
+                            ASSERT(outputVal.has_value());
+                            *resultVal = outputVal.value();
+
+                            ctx->done();
+                        });
+                    },
+                    [this, keyPrefix, val, resultVal, cont]() {
+                        // 3. Assert expectation.
+                        ASSERT(*resultVal == val);
+                        schedule(cont);
+                    });
+            });
+    }
+
+    void testAtomicVersionstampedValueOp(TTaskFct cont) {
+        Key key(randomKeyName());
+        Value valPrefix = randomValue();
+        Value val = valPrefix + fdb::ByteString(10, '\0') + toByteString((uint32_t)valPrefix.size());
+        auto versionstamp_f = std::make_shared<fdb::TypedFuture<fdb::future_var::KeyRef>>();
+        execTransaction(
+            // 1. Perform SetVersionstampedValue operation.
+            [key, val, versionstamp_f](auto ctx) {
+                ctx->tx().atomicOp(key, val, FDBMutationType::FDB_MUTATION_TYPE_SET_VERSIONSTAMPED_VALUE);
+                *versionstamp_f = ctx->tx().getVersionstamp();
+                ctx->commit();
+            },
+            [this, key, valPrefix, versionstamp_f, cont]() {
+                versionstamp_f->blockUntilReady();
+                auto versionstamp = Key(versionstamp_f->get());
+                auto result = std::make_shared<Value>();
+
+                execTransaction(
+                    // 2. Fetch the resulting versionstamped value.
+                    [key, result](auto ctx) {
+                        auto f = ctx->tx().get(key, false);
+                        ctx->continueAfter(
+                            f,
+                            [ctx, f, result]() {
+                                auto outputVal = f.get();
+                                ASSERT(outputVal.has_value());
+                                *result = outputVal.value();
+                                ctx->done();
+                            },
+                            true);
+                    },
+                    [this, key, valPrefix, result, versionstamp, cont]() {
+                        // 3. Assert expectation.
+                        ASSERT(*result == valPrefix + versionstamp);
+                        schedule(cont);
+                    });
+            });
+    }
+
+    void testAtomicCompareAndClearOp(TTaskFct cont) {
+        Key key(randomKeyName());
+        Value val = randomValue();
+        execTransaction(
+            // 1. Set the key to initial value
+            [key, val](auto ctx) {
+                ctx->tx().set(key, val);
+                ctx->commit();
+            },
+            [this, key, val, cont]() {
+                execTransaction(
+                    // 2. Perform CompareAndClear operation.
+                    [key, val](auto ctx) {
+                        ctx->tx().atomicOp(key, val, FDBMutationType::FDB_MUTATION_TYPE_COMPARE_AND_CLEAR);
+                        ctx->commit();
+                    },
+                    [this, key, cont]() {
+                        execTransaction(
+                            // 3. Verify that the key was cleared.
+                            [key](auto ctx) {
+                                auto f = ctx->tx().get(key, false);
+                                ctx->continueAfter(
+                                    f,
+                                    [ctx, f]() {
+                                        auto outputVal = f.get();
+                                        ASSERT(!outputVal.has_value());
+                                        ctx->done();
+                                    },
+                                    true);
+                            },
+                            [this, cont]() { schedule(cont); });
+                    });
+            });
+    }
+};
+
+WorkloadFactory<AtomicOpsCorrectnessWorkload> AtomicOpsCorrectnessWorkloadFactory("AtomicOpsCorrectness");
+
+} // namespace FdbApiTester
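For reference, a minimal sketch of the versionstamped-key operand layout the workload above builds (`keyPrefix + 10 zero bytes + little-endian offset`). The `ByteString` alias and the helper name are hypothetical stand-ins, not part of the apitester sources; a little-endian host is assumed, which the tester's `static_assert` enforces anyway:

```cpp
#include <cstdint>
#include <string>

using ByteString = std::basic_string<uint8_t>; // mirrors fdb::ByteString

// Build a SET_VERSIONSTAMPED_KEY operand: the trailing 4-byte little-endian
// offset tells the server where in the key to splice the 10-byte commit
// versionstamp; the 10 zero bytes are the placeholder it overwrites.
ByteString makeVersionstampedKey(const ByteString& prefix) {
    ByteString key = prefix;
    key.append(10, '\0'); // placeholder, replaced at commit time
    uint32_t offset = (uint32_t)prefix.size(); // placeholder position
    key.append(reinterpret_cast<const uint8_t*>(&offset), sizeof(offset));
    return key;
}
```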
diff --git a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp
index 5ce643a99f..ca1bef9c41 100644
--- a/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp
+++ b/bindings/c/test/apitester/TesterCorrectnessWorkload.cpp
@@ -30,7 +30,7 @@ public:
     ApiCorrectnessWorkload(const WorkloadConfig& config) : ApiWorkload(config) {}
 
 private:
-    enum OpType { OP_INSERT, OP_GET, OP_CLEAR, OP_CLEAR_RANGE, OP_COMMIT_READ, OP_LAST = OP_COMMIT_READ };
+    enum OpType { OP_INSERT, OP_GET, OP_CLEAR, OP_GET_RANGE, OP_CLEAR_RANGE, OP_COMMIT_READ, OP_LAST = OP_COMMIT_READ };
 
     void randomCommitReadOp(TTaskFct cont) {
         int numKeys = Random::get().randomInt(1, maxKeysPerTransaction);
@@ -125,6 +125,71 @@ private:
         });
     }
 
+    void getRangeLoop(std::shared_ptr<ITransactionContext> ctx,
+                      fdb::KeySelector begin,
+                      fdb::KeySelector end,
+                      std::shared_ptr<std::vector<fdb::KeyValue>> results) {
+        auto f = ctx->tx().getRange(begin,
+                                    end,
+                                    0 /*limit*/,
+                                    0 /*target_bytes*/,
+                                    FDB_STREAMING_MODE_WANT_ALL,
+                                    0 /*iteration*/,
+                                    false /*snapshot*/,
+                                    false /*reverse*/);
+        ctx->continueAfter(f, [this, ctx, f, end, results]() {
+            auto out = copyKeyValueArray(f.get());
+            results->insert(results->end(), out.first.begin(), out.first.end());
+            const bool more = out.second;
+            if (more) {
+                // Fetch the remaining results.
+                getRangeLoop(ctx, fdb::key_select::firstGreaterThan(results->back().key), end, results);
+            } else {
+                ctx->done();
+            }
+        });
+    }
+
+    void randomGetRangeOp(TTaskFct cont) {
+        auto begin = randomKey(readExistingKeysRatio);
+        auto end = randomKey(readExistingKeysRatio);
+        auto results = std::make_shared<std::vector<fdb::KeyValue>>();
+
+        execTransaction(
+            [this, begin, end, results](auto ctx) {
+                // Clear the results vector, in case the transaction is retried.
+                results->clear();
+
+                getRangeLoop(ctx,
+                             fdb::key_select::firstGreaterOrEqual(begin),
+                             fdb::key_select::firstGreaterOrEqual(end),
+                             results);
+            },
+            [this, begin, end, results, cont]() {
+                auto expected = store.getRange(begin, end, results->size() + 10, false);
+                if (results->size() != expected.size()) {
+                    error(fmt::format("randomGetRangeOp mismatch. expected {} keys, actual {} keys",
+                                      expected.size(),
+                                      results->size()));
+                } else {
+                    auto expected_kv = expected.begin();
+                    for (auto actual_kv : *results) {
+                        if (actual_kv.key != expected_kv->key || actual_kv.value != expected_kv->value) {
+                            error(fmt::format(
+                                "randomGetRangeOp mismatch. expected key: {} actual key: {} expected value: "
+                                "{:.80} actual value: {:.80}",
+                                fdb::toCharsRef(expected_kv->key),
+                                fdb::toCharsRef(actual_kv.key),
+                                fdb::toCharsRef(expected_kv->value),
+                                fdb::toCharsRef(actual_kv.value)));
+                        }
+                        expected_kv++;
+                    }
+                }
+                schedule(cont);
+            });
+    }
+
     void randomOperation(TTaskFct cont) {
         OpType txType = (store.size() == 0) ? OP_INSERT : (OpType)Random::get().randomInt(0, OP_LAST);
         switch (txType) {
@@ -137,6 +202,9 @@ private:
         case OP_CLEAR:
             randomClearOp(cont);
             break;
+        case OP_GET_RANGE:
+            randomGetRangeOp(cont);
+            break;
         case OP_CLEAR_RANGE:
             randomClearRangeOp(cont);
             break;
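The continuation pattern in `getRangeLoop` above (re-issue the read from just past the last key returned until `more` is false) is the standard way to page a large range. A hedged, self-contained sketch of the same pattern with hypothetical stand-ins (`Page`, `fetch`) in place of the fdb futures; resuming at `firstGreaterThan(lastKey)` is modeled as appending a zero byte to a plain inclusive begin key:

```cpp
#include <string>
#include <utility>
#include <vector>

struct Page {
    std::vector<std::pair<std::string, std::string>> kvs;
    bool more = false; // server says the range was not exhausted
};

// FetchFn: (beginInclusive, endExclusive) -> Page
template <class FetchFn>
std::vector<std::pair<std::string, std::string>> readWholeRange(std::string begin,
                                                                const std::string& end,
                                                                FetchFn fetch) {
    std::vector<std::pair<std::string, std::string>> out;
    for (;;) {
        Page page = fetch(begin, end);
        out.insert(out.end(), page.kvs.begin(), page.kvs.end());
        if (!page.more || page.kvs.empty())
            return out;
        // Equivalent of key_select::firstGreaterThan(lastKey):
        begin = page.kvs.back().first + '\x00';
    }
}
```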
diff --git a/bindings/c/test/apitester/TesterUtil.h b/bindings/c/test/apitester/TesterUtil.h
index 092e0d7f1f..de5e5c8990 100644
--- a/bindings/c/test/apitester/TesterUtil.h
+++ b/bindings/c/test/apitester/TesterUtil.h
@@ -120,6 +120,25 @@ KeyValueArray copyKeyValueArray(fdb::future_var::KeyValueRefArray::Type array);
 using KeyRangeArray = std::vector<fdb::KeyRange>;
 KeyRangeArray copyKeyRangeArray(fdb::future_var::KeyRangeRefArray::Type array);
 
+static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Do not support non-little-endian systems");
+
+// Converts a little-endian encoded number into an integral type.
+template <class T, typename = std::enable_if_t<std::is_integral<T>::value>>
+static T toInteger(fdb::BytesRef value) {
+    ASSERT(value.size() == sizeof(T));
+    T output;
+    memcpy(&output, value.data(), value.size());
+    return output;
+}
+
+// Converts an integral type to a little-endian encoded byte string.
+template <class T, typename = std::enable_if_t<std::is_integral<T>::value>>
+static fdb::ByteString toByteString(T value) {
+    fdb::ByteString output(sizeof(T), 0);
+    memcpy(output.data(), (const uint8_t*)&value, sizeof(value));
+    return output;
+}
+
 } // namespace FdbApiTester
 
 #endif
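A round-trip usage sketch for the two helpers above, with a local `ByteString` alias standing in for `fdb::ByteString` so it compiles on its own (a little-endian host is assumed, exactly what the `static_assert` guarantees for the tester):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>

using ByteString = std::basic_string<uint8_t>; // stand-in for fdb::ByteString

template <class T>
ByteString toByteString(T value) {
    ByteString output(sizeof(T), 0);
    memcpy(&output[0], &value, sizeof(value));
    return output;
}

template <class T>
T toInteger(const ByteString& value) {
    assert(value.size() == sizeof(T));
    T output;
    memcpy(&output, value.data(), value.size());
    return output;
}

int main() {
    uint64_t x = 0x0102030405060708ull;
    ByteString s = toByteString(x);
    // Little-endian: least significant byte comes first, and decoding is lossless.
    assert(s.size() == 8 && s[0] == 0x08 && toInteger<uint64_t>(s) == x);
}
```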
diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessBlocking.toml b/bindings/c/test/apitester/tests/CApiCorrectnessBlocking.toml
index a55d484616..54585c2f01 100644
--- a/bindings/c/test/apitester/tests/CApiCorrectnessBlocking.toml
+++ b/bindings/c/test/apitester/tests/CApiCorrectnessBlocking.toml
@@ -12,14 +12,18 @@ maxClientThreads = 8
 minClients = 2
 maxClients = 8
-
 [[test.workload]]
 name = 'ApiCorrectness'
 minKeyLength = 1
-maxKeyLength = 64
-minValueLength = 1
-maxValueLength = 1000
-maxKeysPerTransaction = 50
-initialSize = 100
-numRandomOperations = 100
-readExistingKeysRatio = 0.9
\ No newline at end of file
+maxKeyLength = 64
+minValueLength = 1
+maxValueLength = 1000
+maxKeysPerTransaction = 50
+initialSize = 100
+numRandomOperations = 100
+readExistingKeysRatio = 0.9
+
+[[test.workload]]
+name = 'AtomicOpsCorrectness'
+initialSize = 0
+numRandomOperations = 100
diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessCallbacksOnExtThr.toml b/bindings/c/test/apitester/tests/CApiCorrectnessCallbacksOnExtThr.toml
index 3c609624ce..d5703e1222 100644
--- a/bindings/c/test/apitester/tests/CApiCorrectnessCallbacksOnExtThr.toml
+++ b/bindings/c/test/apitester/tests/CApiCorrectnessCallbacksOnExtThr.toml
@@ -15,10 +15,15 @@ maxClients = 8
 [[test.workload]]
 name = 'ApiCorrectness'
 minKeyLength = 1
-maxKeyLength = 64
-minValueLength = 1
-maxValueLength = 1000
-maxKeysPerTransaction = 50
-initialSize = 100
-numRandomOperations = 100
-readExistingKeysRatio = 0.9
\ No newline at end of file
+maxKeyLength = 64
+minValueLength = 1
+maxValueLength = 1000
+maxKeysPerTransaction = 50
+initialSize = 100
+numRandomOperations = 100
+readExistingKeysRatio = 0.9
+
+[[test.workload]]
+name = 'AtomicOpsCorrectness'
+initialSize = 0
+numRandomOperations = 100
diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessDBPerTX.toml b/bindings/c/test/apitester/tests/CApiCorrectnessDBPerTX.toml
index 0f7f25e494..769c0ad2c3 100644
--- a/bindings/c/test/apitester/tests/CApiCorrectnessDBPerTX.toml
+++ b/bindings/c/test/apitester/tests/CApiCorrectnessDBPerTX.toml
@@ -15,10 +15,15 @@ maxClients = 8
 [[test.workload]]
 name = 'ApiCorrectness'
 minKeyLength = 1
-maxKeyLength = 64
-minValueLength = 1
-maxValueLength = 1000
-maxKeysPerTransaction = 50
-initialSize = 100
-numRandomOperations = 100
-readExistingKeysRatio = 0.9
\ No newline at end of file
+maxKeyLength = 64
+minValueLength = 1
+maxValueLength = 1000
+maxKeysPerTransaction = 50
+initialSize = 100
+numRandomOperations = 100
+readExistingKeysRatio = 0.9
+
+[[test.workload]]
+name = 'AtomicOpsCorrectness'
+initialSize = 0
+numRandomOperations = 100
diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml b/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml
index e0b07b09e1..81081ec976 100644
--- a/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml
+++ b/bindings/c/test/apitester/tests/CApiCorrectnessMultiThr.toml
@@ -14,10 +14,15 @@ maxClients = 8
 [[test.workload]]
 name = 'ApiCorrectness'
 minKeyLength = 1
-maxKeyLength = 64
-minValueLength = 1
-maxValueLength = 1000
-maxKeysPerTransaction = 50
-initialSize = 100
-numRandomOperations = 100
-readExistingKeysRatio = 0.9
\ No newline at end of file
+maxKeyLength = 64
+minValueLength = 1
+maxValueLength = 1000
+maxKeysPerTransaction = 50
+initialSize = 100
+numRandomOperations = 100
+readExistingKeysRatio = 0.9
+
+[[test.workload]]
+name = 'AtomicOpsCorrectness'
+initialSize = 0
+numRandomOperations = 100
diff --git a/bindings/c/test/apitester/tests/CApiCorrectnessSingleThr.toml b/bindings/c/test/apitester/tests/CApiCorrectnessSingleThr.toml
index 9e27c27353..a6b9e05a72 100644
--- a/bindings/c/test/apitester/tests/CApiCorrectnessSingleThr.toml
+++ b/bindings/c/test/apitester/tests/CApiCorrectnessSingleThr.toml
@@ -7,10 +7,15 @@ multiThreaded = false
 [[test.workload]]
 name = 'ApiCorrectness'
 minKeyLength = 1
-maxKeyLength = 64
-minValueLength = 1
-maxValueLength = 1000
-maxKeysPerTransaction = 50
-initialSize = 100
-numRandomOperations = 100
-readExistingKeysRatio = 0.9
\ No newline at end of file
+maxKeyLength = 64
+minValueLength = 1
+maxValueLength = 1000
+maxKeysPerTransaction = 50
+initialSize = 100
+numRandomOperations = 100
+readExistingKeysRatio = 0.9
+
+[[test.workload]]
+name = 'AtomicOpsCorrectness'
+initialSize = 0
+numRandomOperations = 100
diff --git a/bindings/c/test/fdb_api.hpp b/bindings/c/test/fdb_api.hpp
index 24ef97fad0..7aba3ae950 100644
--- a/bindings/c/test/fdb_api.hpp
+++ b/bindings/c/test/fdb_api.hpp
@@ -516,6 +516,8 @@ public:
         return out;
     }
 
+    TypedFuture<future_var::KeyRef> getVersionstamp() { return native::fdb_transaction_get_versionstamp(tr.get()); }
+
     TypedFuture<future_var::KeyRef> getKey(KeySelector sel, bool snapshot) {
         return native::fdb_transaction_get_key(tr.get(), sel.key, sel.keyLength, sel.orEqual, sel.offset, snapshot);
     }
@@ -577,6 +579,11 @@ public:
         native::fdb_transaction_set(tr.get(), key.data(), intSize(key), value.data(), intSize(value));
     }
 
+    void atomicOp(KeyRef key, ValueRef param, FDBMutationType operationType) {
+        native::fdb_transaction_atomic_op(
+            tr.get(), key.data(), intSize(key), param.data(), intSize(param), operationType);
+    }
+
     void clear(KeyRef key) { return native::fdb_transaction_clear(tr.get(), key.data(), intSize(key)); }
 
     void clearRange(KeyRef begin, KeyRef end) {
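A hedged sketch of the new `atomicOp` wrapper in use. Type names come from the test's `fdb_api.hpp`; the helper itself is hypothetical. `ADD` interprets both the stored value and the parameter as unsigned little-endian integers of the same width:

```cpp
void incrementCounter(fdb::Transaction& tx, fdb::KeyRef key) {
    uint64_t delta = 1;
    // 8-byte little-endian operand, matching the width of the stored counter.
    fdb::ByteString param(reinterpret_cast<const uint8_t*>(&delta), sizeof(delta));
    tx.atomicOp(key, param, FDBMutationType::FDB_MUTATION_TYPE_ADD);
    // Commit as usual. Note the caveat tested above: a commit_unknown_result
    // retry can apply the operation twice unless the caller guards against it.
}
```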
diff --git a/cmake/FDBComponents.cmake b/cmake/FDBComponents.cmake
index c21f504cf7..13e3f790a8 100644
--- a/cmake/FDBComponents.cmake
+++ b/cmake/FDBComponents.cmake
@@ -193,10 +193,9 @@ endif()
 find_package(toml11 QUIET)
 if(toml11_FOUND)
   add_library(toml11_target INTERFACE)
-  add_dependencies(toml11_target INTERFACE toml11::toml11)
+  target_link_libraries(toml11_target INTERFACE toml11::toml11)
 else()
-  include(ExternalProject)
-
+  include(ExternalProject)
   ExternalProject_add(toml11Project
     URL "https://github.com/ToruNiina/toml11/archive/v3.4.0.tar.gz"
     URL_HASH SHA256=bc6d733efd9216af8c119d8ac64a805578c79cc82b813e4d1d880ca128bd154d
diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt
index b66780d09a..cce49cf76c 100644
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@@ -4,6 +4,7 @@ target_include_directories(rapidjson INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/rapidjson)
 add_subdirectory(crc32)
 add_subdirectory(stacktrace)
 add_subdirectory(folly_memcpy)
+add_subdirectory(rapidxml)
 add_subdirectory(sqlite)
 add_subdirectory(SimpleOpt)
 add_subdirectory(fmt-8.1.1)
diff --git a/contrib/crc32/crc32c.cpp b/contrib/crc32/crc32c.cpp
index 79951280ad..9c3ae6d354 100644
--- a/contrib/crc32/crc32c.cpp
+++ b/contrib/crc32/crc32c.cpp
@@ -35,6 +35,10 @@
 #endif
 #endif
 
+#ifdef _WIN32
+#include <intrin.h>
+#endif
+
 #include "crc32/crc32c.h"
 
 #if !defined(__aarch64__) && !defined(__powerpc64__)
diff --git a/contrib/rapidxml/CMakeLists.txt b/contrib/rapidxml/CMakeLists.txt
new file mode 100644
index 0000000000..1da80ecbc0
--- /dev/null
+++ b/contrib/rapidxml/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_library(rapidxml INTERFACE)
+target_include_directories(rapidxml INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/include")
diff --git a/fdbclient/include/fdbclient/rapidxml/rapidxml.hpp b/contrib/rapidxml/include/rapidxml/rapidxml.hpp
similarity index 100%
rename from fdbclient/include/fdbclient/rapidxml/rapidxml.hpp
rename to contrib/rapidxml/include/rapidxml/rapidxml.hpp
diff --git a/fdbclient/include/fdbclient/rapidxml/rapidxml_iterators.hpp b/contrib/rapidxml/include/rapidxml/rapidxml_iterators.hpp
similarity index 100%
rename from fdbclient/include/fdbclient/rapidxml/rapidxml_iterators.hpp
rename to contrib/rapidxml/include/rapidxml/rapidxml_iterators.hpp
diff --git a/fdbclient/include/fdbclient/rapidxml/rapidxml_print.hpp b/contrib/rapidxml/include/rapidxml/rapidxml_print.hpp
similarity index 100%
rename from fdbclient/include/fdbclient/rapidxml/rapidxml_print.hpp
rename to contrib/rapidxml/include/rapidxml/rapidxml_print.hpp
diff --git a/fdbclient/include/fdbclient/rapidxml/rapidxml_utils.hpp b/contrib/rapidxml/include/rapidxml/rapidxml_utils.hpp
similarity index 100%
rename from fdbclient/include/fdbclient/rapidxml/rapidxml_utils.hpp
rename to contrib/rapidxml/include/rapidxml/rapidxml_utils.hpp
diff --git a/fdbclient/rapidxml/license.txt b/contrib/rapidxml/license.txt
similarity index 100%
rename from fdbclient/rapidxml/license.txt
rename to contrib/rapidxml/license.txt
diff --git a/fdbclient/rapidxml/manual.html b/contrib/rapidxml/manual.html
similarity index 100%
rename from fdbclient/rapidxml/manual.html
rename to contrib/rapidxml/manual.html
diff --git a/fdbcli/QuotaCommand.actor.cpp b/fdbcli/QuotaCommand.actor.cpp
new file mode 100644
index 0000000000..ba8546fa15
--- /dev/null
+++ b/fdbcli/QuotaCommand.actor.cpp
@@ -0,0 +1,178 @@
+/*
+ * QuotaCommand.actor.cpp
+ *
+ * This source file is part of the FoundationDB open source project
+ *
+ * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "fdbcli/fdbcli.actor.h"
+#include "flow/actorcompiler.h" // This must be the last include
+
+namespace {
+
+enum class LimitType { RESERVED, TOTAL };
+
+enum class OpType { READ, WRITE };
+
+Optional<TransactionTag> parseTag(StringRef token) {
+    if (token.size() > CLIENT_KNOBS->MAX_TRANSACTION_TAG_LENGTH) {
+        return {};
+    } else {
+        return token;
+    }
+}
+
+Optional<LimitType> parseLimitType(StringRef token) {
+    if (token == "reserved"_sr) {
+        return LimitType::RESERVED;
+    } else if (token == "total"_sr) {
+        return LimitType::TOTAL;
+    } else {
+        return {};
+    }
+}
+
+Optional<OpType> parseOpType(StringRef token) {
+    if (token == "read"_sr) {
+        return OpType::READ;
+    } else if (token == "write"_sr) {
+        return OpType::WRITE;
+    } else {
+        return {};
+    }
+}
+
+Optional<double> parseLimitValue(StringRef token) {
+    try {
+        return std::stod(token.toString());
+    } catch (...) {
+        return {};
+    }
+}
+
+ACTOR Future<Void> getQuota(Reference<IDatabase> db, TransactionTag tag, LimitType limitType, OpType opType) {
+    state Reference<ITransaction> tr = db->createTransaction();
+    loop {
+        tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
+        try {
+            state ThreadFuture<Optional<Value>> resultFuture = tr->get(tag.withPrefix(tagQuotaPrefix));
+            Optional<Value> v = wait(safeThreadFutureToFuture(resultFuture));
+            if (!v.present()) {
+                fmt::print("<empty>\n");
+            } else {
+                auto const quota = ThrottleApi::TagQuotaValue::fromValue(v.get());
+                if (limitType == LimitType::TOTAL && opType == OpType::READ) {
+                    fmt::print("{}\n", quota.totalReadQuota);
+                } else if (limitType == LimitType::TOTAL && opType == OpType::WRITE) {
+                    fmt::print("{}\n", quota.totalWriteQuota);
+                } else if (limitType == LimitType::RESERVED && opType == OpType::READ) {
+                    fmt::print("{}\n", quota.reservedReadQuota);
+                } else if (limitType == LimitType::RESERVED && opType == OpType::WRITE) {
+                    fmt::print("{}\n", quota.reservedWriteQuota);
+                }
+            }
+            return Void();
+        } catch (Error& e) {
+            wait(safeThreadFutureToFuture(tr->onError(e)));
+        }
+    }
+}
+
+ACTOR Future<Void> setQuota(Reference<IDatabase> db,
+                            TransactionTag tag,
+                            LimitType limitType,
+                            OpType opType,
+                            double value) {
+    state Reference<ITransaction> tr = db->createTransaction();
+    state Key key = tag.withPrefix(tagQuotaPrefix);
+    loop {
+        tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
+        try {
+            state ThreadFuture<Optional<Value>> resultFuture = tr->get(key);
+            Optional<Value> v = wait(safeThreadFutureToFuture(resultFuture));
+            ThrottleApi::TagQuotaValue quota;
+            if (v.present()) {
+                quota = ThrottleApi::TagQuotaValue::fromValue(v.get());
+            }
+            if (limitType == LimitType::TOTAL && opType == OpType::READ) {
+                quota.totalReadQuota = value;
+            } else if (limitType == LimitType::TOTAL && opType == OpType::WRITE) {
+                quota.totalWriteQuota = value;
+            } else if (limitType == LimitType::RESERVED && opType == OpType::READ) {
+                quota.reservedReadQuota = value;
+            } else if (limitType == LimitType::RESERVED && opType == OpType::WRITE) {
+                quota.reservedWriteQuota = value;
+            }
+            ThrottleApi::setTagQuota(tr,
+                                     tag,
+                                     quota.reservedReadQuota,
+                                     quota.totalReadQuota,
+                                     quota.reservedWriteQuota,
+                                     quota.totalWriteQuota);
+            wait(safeThreadFutureToFuture(tr->commit()));
+            return Void();
+        } catch (Error& e) {
+            wait(safeThreadFutureToFuture(tr->onError(e)));
+        }
+    }
+}
+
+constexpr auto usage =
+    "quota [get <tag> [reserved|total] [read|write]|set <tag> [reserved|total] [read|write] <value>]";
+
+bool exitFailure() {
+    fmt::print(usage);
+    return false;
+}
+
+} // namespace
+
+namespace fdb_cli {
+
+ACTOR Future<bool> quotaCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens) {
+    state bool result = true;
+    if (tokens.size() != 5 && tokens.size() != 6) {
+        return exitFailure();
+    } else {
+        auto tag = parseTag(tokens[2]);
+        auto limitType = parseLimitType(tokens[3]);
+        auto opType = parseOpType(tokens[4]);
+        if (!tag.present() || !limitType.present() || !opType.present()) {
+            return exitFailure();
+        }
+        if (tokens[1] == "get"_sr) {
+            if (tokens.size() != 5) {
+                return exitFailure();
+            }
+            wait(getQuota(db, tag.get(), limitType.get(), opType.get()));
+            return true;
+        } else if (tokens[1] == "set"_sr) {
+            if (tokens.size() != 6) {
+                return exitFailure();
+            }
+            auto const limitValue = parseLimitValue(tokens[5]);
+            if (!limitValue.present()) {
+                return exitFailure();
+            }
+            wait(setQuota(db, tag.get(), limitType.get(), opType.get(), limitValue.get()));
+            return true;
+        } else {
+            return exitFailure();
+        }
+    }
+}
+
+} // namespace fdb_cli
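A hypothetical fdbcli session exercising the command above (the tag name and values are invented for illustration; output formatting follows the `fmt::print` calls in `getQuota`):

```
fdb> quota set web_frontend total read 1000
fdb> quota get web_frontend total read
1000
fdb> quota get web_frontend reserved read
0
```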
diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp
index 580f5151dc..8caba70ec9 100644
--- a/fdbcli/fdbcli.actor.cpp
+++ b/fdbcli/fdbcli.actor.cpp
@@ -509,6 +509,10 @@ void initHelp() {
         CommandHelp("getversion",
                     "Fetch the current read version",
                     "Displays the current read version of the database or currently running transaction.");
+    helpMap["quota"] =
+        CommandHelp("quota",
+                    "quota [get <tag> [reserved|total] [read|write]|set <tag> [reserved|total] [read|write] <value>]",
+                    "Get or modify the throughput quota for the specified tag.");
     helpMap["reset"] =
         CommandHelp("reset",
                     "reset the current transaction",
@@ -1468,6 +1472,14 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
                     continue;
                 }
 
+                if (tokencmp(tokens[0], "quota")) {
+                    bool _result = wait(makeInterruptable(quotaCommandActor(db, tokens)));
+                    if (!_result) {
+                        is_error = true;
+                    }
+                    continue;
+                }
+
                 if (tokencmp(tokens[0], "reset")) {
                     if (tokens.size() != 1) {
                         printUsage(tokens[0]);
diff --git a/fdbcli/include/fdbcli/fdbcli.actor.h b/fdbcli/include/fdbcli/fdbcli.actor.h
index fbc6c04ec1..e27500a170 100644
--- a/fdbcli/include/fdbcli/fdbcli.actor.h
+++ b/fdbcli/include/fdbcli/fdbcli.actor.h
@@ -222,6 +222,8 @@ ACTOR Future<bool> profileCommandActor(Database db,
                                        Reference<ITransaction> tr,
                                        std::vector<StringRef> tokens,
                                        bool intrans);
+// quota command
+ACTOR Future<bool> quotaCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
 // setclass command
 ACTOR Future<bool> setClassCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
 // snapshot command
diff --git a/fdbclient/CMakeLists.txt b/fdbclient/CMakeLists.txt
index 800b1e319b..473a9875d7 100644
--- a/fdbclient/CMakeLists.txt
+++ b/fdbclient/CMakeLists.txt
@@ -78,7 +78,7 @@ add_flow_target(STATIC_LIBRARY NAME fdbclient SRCS ${FDBCLIENT_SRCS} ADDL_SRCS $
 target_include_directories(fdbclient PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_BINARY_DIR}/include")
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/versions.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/include/fdbclient/versions.h)
 add_dependencies(fdbclient fdboptions)
-target_link_libraries(fdbclient PUBLIC fdbrpc msgpack)
+target_link_libraries(fdbclient PUBLIC fdbrpc msgpack PRIVATE rapidxml)
 
 # Create a separate fdbclient library with sampling enabled.
# This lets
# fdbserver retain sampling functionality in client code while disabling
@@ -86,7 +86,7 @@ target_link_libraries(fdbclient PUBLIC fdbrpc msgpack)
 add_flow_target(STATIC_LIBRARY NAME fdbclient_sampling SRCS ${FDBCLIENT_SRCS} ADDL_SRCS ${options_srcs})
 target_include_directories(fdbclient_sampling PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_BINARY_DIR}/include")
 add_dependencies(fdbclient_sampling fdboptions)
-target_link_libraries(fdbclient_sampling PUBLIC fdbrpc_sampling msgpack)
+target_link_libraries(fdbclient_sampling PUBLIC fdbrpc_sampling msgpack PRIVATE rapidxml)
 target_compile_definitions(fdbclient_sampling PRIVATE -DENABLE_SAMPLING)
 
 if(WIN32)
diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp
index 1994cad6f4..ed1f1b5e77 100644
--- a/fdbclient/ManagementAPI.actor.cpp
+++ b/fdbclient/ManagementAPI.actor.cpp
@@ -450,16 +450,21 @@ bool isCompleteConfiguration(std::map<std::string, std::string> const& options)
            options.count(p + "storage_engine") == 1;
 }
 
+ACTOR Future<DatabaseConfiguration> getDatabaseConfiguration(Transaction* tr) {
+    tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
+    tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
+    RangeResult res = wait(tr->getRange(configKeys, CLIENT_KNOBS->TOO_MANY));
+    ASSERT(res.size() < CLIENT_KNOBS->TOO_MANY);
+    DatabaseConfiguration config;
+    config.fromKeyValues((VectorRef<KeyValueRef>)res);
+    return config;
+}
+
 ACTOR Future<DatabaseConfiguration> getDatabaseConfiguration(Database cx) {
     state Transaction tr(cx);
     loop {
         try {
-            tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
-            tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
-            RangeResult res = wait(tr.getRange(configKeys, CLIENT_KNOBS->TOO_MANY));
-            ASSERT(res.size() < CLIENT_KNOBS->TOO_MANY);
-            DatabaseConfiguration config;
-            config.fromKeyValues((VectorRef<KeyValueRef>)res);
+            DatabaseConfiguration config = wait(getDatabaseConfiguration(&tr));
             return config;
         } catch (Error& e) {
             wait(tr.onError(e));
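The point of factoring out the transaction-level overload is that callers already inside their own retry loop can reuse it without constructing a second `Transaction`. A hedged sketch of such a caller (the actor name is hypothetical; it mirrors the loop in the `Database` overload above):

```cpp
ACTOR Future<DatabaseConfiguration> fetchConfigWithRetry(Database cx) {
    state Transaction tr(cx);
    loop {
        try {
            // Reuses the caller's transaction; options are set inside.
            DatabaseConfiguration config = wait(getDatabaseConfiguration(&tr));
            return config;
        } catch (Error& e) {
            wait(tr.onError(e)); // standard backoff-and-retry
        }
    }
}
```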
diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp
index 9633c5dcef..64d65dd22b 100644
--- a/fdbclient/NativeAPI.actor.cpp
+++ b/fdbclient/NativeAPI.actor.cpp
@@ -5158,8 +5158,9 @@ Future<Optional<Value>> Transaction::get(const Key& key, Snapshot snapshot) {
         if (!ver.isReady() || metadataVersion.isSet()) {
             return metadataVersion.getFuture();
         } else {
-            if (ver.isError())
+            if (ver.isError()) {
                 return ver.getError();
+            }
             if (ver.get() == trState->cx->metadataVersionCache[trState->cx->mvCacheInsertLocation].first) {
                 return trState->cx->metadataVersionCache[trState->cx->mvCacheInsertLocation].second;
             }
@@ -5763,6 +5764,10 @@ void Transaction::resetImpl(bool generateNewSpan) {
     cancelWatches();
 }
 
+TagSet const& Transaction::getTags() const {
+    return trState->options.tags;
+}
+
 void Transaction::reset() {
     resetImpl(false);
 }
@@ -7067,6 +7072,25 @@ Future<ProtocolVersion> DatabaseContext::getClusterProtocol(Optional<ProtocolVersion> expectedVersion) {
 
+double ClientTagThrottleData::throttleDuration() const {
+    if (expiration <= now()) {
+        return 0.0;
+    }
+
+    double capacity =
+        (smoothRate.smoothTotal() - smoothReleased.smoothRate()) * CLIENT_KNOBS->TAG_THROTTLE_SMOOTHING_WINDOW;
+
+    if (capacity >= 1) {
+        return 0.0;
+    }
+
+    if (tpsRate == 0) {
+        return std::max(0.0, expiration - now());
+    }
+
+    return std::min(expiration - now(), capacity / tpsRate);
+}
+
 uint32_t Transaction::getSize() {
     auto s = tr.transaction.mutations.expectedSize() + tr.transaction.read_conflict_ranges.expectedSize() +
              tr.transaction.write_conflict_ranges.expectedSize();
@@ -7892,7 +7916,8 @@ ACTOR Future<Void> splitStorageMetricsStream(PromiseStream<Key> resultStream,
                                              Database cx,
                                              KeyRange keys,
                                              StorageMetrics limit,
-                                             StorageMetrics estimated) {
+                                             StorageMetrics estimated,
+                                             Optional<int> minSplitBytes) {
     state Span span("NAPI:SplitStorageMetricsStream"_loc);
     state Key beginKey = keys.begin;
     state Key globalLastKey = beginKey;
@@ -7923,7 +7948,8 @@ ACTOR Future<Void> splitStorageMetricsStream(PromiseStream<Key> resultStream,
                                       limit,
                                       localUsed,
                                       estimated,
-                                      i == locations.size() - 1 && keys.end <= locations.back().range.end);
+                                      i == locations.size() - 1 && keys.end <= locations.back().range.end,
+                                      minSplitBytes);
                 SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(),
                                                          &StorageServerInterface::splitMetrics,
                                                          req,
@@ -7986,15 +8012,17 @@ ACTOR Future<Void> splitStorageMetricsStream(PromiseStream<Key> resultStream,
 Future<Void> DatabaseContext::splitStorageMetricsStream(const PromiseStream<Key>& resultStream,
                                                         KeyRange const& keys,
                                                         StorageMetrics const& limit,
-                                                        StorageMetrics const& estimated) {
+                                                        StorageMetrics const& estimated,
+                                                        Optional<int> const& minSplitBytes) {
     return ::splitStorageMetricsStream(
-        resultStream, Database(Reference<DatabaseContext>::addRef(this)), keys, limit, estimated);
+        resultStream, Database(Reference<DatabaseContext>::addRef(this)), keys, limit, estimated, minSplitBytes);
 }
 
 ACTOR Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(Database cx,
                                                                 KeyRange keys,
                                                                 StorageMetrics limit,
-                                                                StorageMetrics estimated) {
+                                                                StorageMetrics estimated,
+                                                                Optional<int> minSplitBytes) {
     state Span span("NAPI:SplitStorageMetrics"_loc);
     loop {
         state std::vector<KeyRangeLocationInfo> locations =
@@ -8023,7 +8051,8 @@ ACTOR Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(Database cx,
 
             state int i = 0;
             for (; i < locations.size(); i++) {
-                SplitMetricsRequest req(locations[i].range, limit, used, estimated, i == locations.size() - 1);
+                SplitMetricsRequest req(
+                    locations[i].range, limit, used, estimated, i == locations.size() - 1, minSplitBytes);
                 SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(),
                                                          &StorageServerInterface::splitMetrics,
                                                          req,
@@ -8067,8 +8096,10 @@ ACTOR Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(Database cx,
 
 Future<Standalone<VectorRef<KeyRef>>> DatabaseContext::splitStorageMetrics(KeyRange const& keys,
                                                                            StorageMetrics const& limit,
-                                                                           StorageMetrics const& estimated) {
-    return ::splitStorageMetrics(Database(Reference<DatabaseContext>::addRef(this)), keys, limit, estimated);
+                                                                           StorageMetrics const& estimated,
+                                                                           Optional<int> const& minSplitBytes) {
+    return ::splitStorageMetrics(
+        Database(Reference<DatabaseContext>::addRef(this)), keys, limit, estimated, minSplitBytes);
 }
 
 void Transaction::checkDeferredError() const {
diff --git a/fdbclient/S3BlobStore.actor.cpp b/fdbclient/S3BlobStore.actor.cpp
index efdd644853..8c87774e6c 100644
--- a/fdbclient/S3BlobStore.actor.cpp
+++ b/fdbclient/S3BlobStore.actor.cpp
@@ -40,7 +40,7 @@
 #include "flow/IAsyncFile.h"
 #include "flow/Hostname.h"
 #include "flow/UnitTest.h"
-#include "fdbclient/rapidxml/rapidxml.hpp"
+#include "rapidxml/rapidxml.hpp"
 #ifdef BUILD_AWS_BACKUP
 #include "fdbclient/FDBAWSCredentialsProvider.h"
 #endif
diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp
index c9b0912e5b..bdcfcc3afd 100644
--- a/fdbclient/Schemas.cpp
+++ b/fdbclient/Schemas.cpp
@@ -575,7 +575,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
 					"duplicate_mutation_streams",
 					"duplicate_mutation_fetch_timeout",
 					"primary_dc_missing",
-					"fetch_primary_dc_timeout"
+					"fetch_primary_dc_timeout",
+					"fetch_storage_wiggler_stats_timeout"
 				]
 			},
 			"issues":[
diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp
index 5a3609935c..c18984f058 100644
--- a/fdbclient/ServerKnobs.cpp
+++ b/fdbclient/ServerKnobs.cpp
@@ -20,6 +20,7 @@
 
 #include "fdbclient/ServerKnobs.h"
 #include "flow/IRandom.h"
+#include "flow/flow.h"
 
 #define init(...) KNOB_FN(__VA_ARGS__, INIT_ATOMIC_KNOB, INIT_KNOB)(__VA_ARGS__)
@@ -35,12 +36,13 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSimulated isSimulated) {
 	init( MAX_VERSIONS_IN_FLIGHT, 100 * VERSIONS_PER_SECOND );
 	init( MAX_VERSIONS_IN_FLIGHT_FORCED, 6e5 * VERSIONS_PER_SECOND ); //one week of versions
 	init( ENABLE_VERSION_VECTOR, false );
+	init( ENABLE_VERSION_VECTOR_TLOG_UNICAST, false );
+	bool buggifyShortReadWindow = randomize && BUGGIFY && !ENABLE_VERSION_VECTOR;
 	init( MAX_READ_TRANSACTION_LIFE_VERSIONS, 5 * VERSIONS_PER_SECOND ); if (randomize && BUGGIFY) MAX_READ_TRANSACTION_LIFE_VERSIONS = VERSIONS_PER_SECOND; else if (buggifyShortReadWindow) MAX_READ_TRANSACTION_LIFE_VERSIONS = std::max<int>(1, 0.1 * VERSIONS_PER_SECOND); else if( randomize && BUGGIFY ) MAX_READ_TRANSACTION_LIFE_VERSIONS = 10 * VERSIONS_PER_SECOND;
 	init( MAX_WRITE_TRANSACTION_LIFE_VERSIONS, 5 * VERSIONS_PER_SECOND ); if (randomize && BUGGIFY) MAX_WRITE_TRANSACTION_LIFE_VERSIONS=std::max<int>(1, 1 * VERSIONS_PER_SECOND);
 	init( MAX_COMMIT_BATCH_INTERVAL, 2.0 ); if( randomize && BUGGIFY ) MAX_COMMIT_BATCH_INTERVAL = 0.5; // Each commit proxy generates a CommitTransactionBatchRequest at least this often, so that versions always advance smoothly
 	MAX_COMMIT_BATCH_INTERVAL = std::min(MAX_COMMIT_BATCH_INTERVAL, MAX_READ_TRANSACTION_LIFE_VERSIONS/double(2*VERSIONS_PER_SECOND)); // Ensure that the proxy commits 2 times every MAX_READ_TRANSACTION_LIFE_VERSIONS, otherwise the master will not give out versions fast enough
-	init( ENABLE_VERSION_VECTOR_TLOG_UNICAST, false );
 	init( MAX_VERSION_RATE_MODIFIER, 0.1 );
 	init( MAX_VERSION_RATE_OFFSET, VERSIONS_PER_SECOND ); // If the calculated version is more than this amount away from the expected version, it will be clamped to this value. This prevents huge version jumps.
 	init( ENABLE_VERSION_VECTOR_HA_OPTIMIZATION, false );
@@ -117,7 +119,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSimulated isSimulated) {
 
 	// disk snapshot max timeout, to be put in TLog, storage and coordinator nodes
 	init( MAX_FORKED_PROCESS_OUTPUT, 1024 );
-	init( SNAP_CREATE_MAX_TIMEOUT, 300.0 );
+	init( SNAP_CREATE_MAX_TIMEOUT, isSimulated ? 70.0 : 300.0 );
+	init( SNAP_MINIMUM_TIME_GAP, 5.0 );
+	init( SNAP_NETWORK_FAILURE_RETRY_LIMIT, 10 );
 	init( MAX_STORAGE_SNAPSHOT_FAULT_TOLERANCE, 1 );
 	init( MAX_COORDINATOR_SNAPSHOT_FAULT_TOLERANCE, 1 );
@@ -181,7 +185,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSimulated isSimulated) {
 	/* The bytesRead/byteSize ratio. Will be declared as read hot when larger than this. 8.0 was chosen to avoid
 	   reporting table scan as read hot. */
-	init ( SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS, 1666667 * 1000);
+	init ( SHARD_READ_HOT_BANDWIDTH_MIN_PER_KSECONDS, 1666667 * 1000);
 	/* The read bandwidth of a given shard needs to be larger than this value in order to be evaluated if it's read hot.
The roughly 1.67MB per second is calculated as following: - Heuristic data suggests that each storage process can do max 500K read operations per second @@ -662,6 +666,11 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( AUTO_TAG_THROTTLE_UPDATE_FREQUENCY, 10.0 ); if(randomize && BUGGIFY) AUTO_TAG_THROTTLE_UPDATE_FREQUENCY = 0.5; init( TAG_THROTTLE_EXPIRED_CLEANUP_INTERVAL, 30.0 ); if(randomize && BUGGIFY) TAG_THROTTLE_EXPIRED_CLEANUP_INTERVAL = 1.0; init( AUTO_TAG_THROTTLING_ENABLED, true ); if(randomize && BUGGIFY) AUTO_TAG_THROTTLING_ENABLED = false; + init( SS_THROTTLE_TAGS_TRACKED, 1 ); if(randomize && BUGGIFY) SS_THROTTLE_TAGS_TRACKED = deterministicRandom()->randomInt(1, 10); + init( GLOBAL_TAG_THROTTLING, false ); + init( GLOBAL_TAG_THROTTLING_MIN_RATE, 1.0 ); + init( GLOBAL_TAG_THROTTLING_FOLDING_TIME, 10.0 ); + init( GLOBAL_TAG_THROTTLING_TRACE_INTERVAL, 5.0 ); //Storage Metrics init( STORAGE_METRICS_AVERAGE_INTERVAL, 120.0 ); @@ -725,6 +734,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( CHECKPOINT_TRANSFER_BLOCK_BYTES, 40e6 ); init( QUICK_GET_VALUE_FALLBACK, true ); init( QUICK_GET_KEY_VALUES_FALLBACK, true ); + init( MAX_PARALLEL_QUICK_GET_VALUE, 50 ); if ( randomize && BUGGIFY ) MAX_PARALLEL_QUICK_GET_VALUE = deterministicRandom()->randomInt(1, 100); init( QUICK_GET_KEY_VALUES_LIMIT, 2000 ); init( QUICK_GET_KEY_VALUES_LIMIT_BYTES, 1e7 ); diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index cb05d6b4c3..ec21297f46 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -1404,17 +1404,6 @@ BlobWorkerInterface decodeBlobWorkerListValue(ValueRef const& value) { return interf; } -Value encodeTenantEntry(TenantMapEntry const& tenantEntry) { - return ObjectWriter::toValue(tenantEntry, IncludeVersion()); -} - -TenantMapEntry decodeTenantEntry(ValueRef const& value) { - TenantMapEntry entry; - ObjectReader reader(value.begin(), IncludeVersion()); - reader.deserialize(entry); - return entry; -} - const KeyRangeRef tenantMapKeys("\xff/tenantMap/"_sr, "\xff/tenantMap0"_sr); const KeyRef tenantMapPrefix = tenantMapKeys.begin; const KeyRef tenantMapPrivatePrefix = "\xff\xff/tenantMap/"_sr; diff --git a/fdbclient/TagThrottle.actor.cpp b/fdbclient/TagThrottle.actor.cpp index 16c0b0489c..b5205fd153 100644 --- a/fdbclient/TagThrottle.actor.cpp +++ b/fdbclient/TagThrottle.actor.cpp @@ -18,12 +18,16 @@ * limitations under the License. 
 */
 
-#include "fdbclient/TagThrottle.actor.h"
 #include "fdbclient/CommitProxyInterface.h"
 #include "fdbclient/DatabaseContext.h"
+#include "fdbclient/SystemData.h"
+#include "fdbclient/TagThrottle.actor.h"
+#include "fdbclient/Tuple.h"
 
 #include "flow/actorcompiler.h" // has to be last include
 
+double const ClientTagThrottleLimits::NO_EXPIRATION = std::numeric_limits<double>::max();
+
 void TagSet::addTag(TransactionTagRef tag) {
 	ASSERT(CLIENT_KNOBS->MAX_TRANSACTION_TAG_LENGTH < 256); // Tag length is encoded with a single byte
 	ASSERT(CLIENT_KNOBS->MAX_TAGS_PER_TRANSACTION < 256); // Number of tags is encoded with a single byte
@@ -124,6 +128,53 @@ TagThrottleValue TagThrottleValue::fromValue(const ValueRef& value) {
 	return throttleValue;
 }
 
+KeyRangeRef const tagQuotaKeys = KeyRangeRef("\xff/tagQuota/"_sr, "\xff/tagQuota0"_sr);
+KeyRef const tagQuotaPrefix = tagQuotaKeys.begin;
+
+Key ThrottleApi::getTagQuotaKey(TransactionTagRef tag) {
+	return tag.withPrefix(tagQuotaPrefix);
+}
+
+bool ThrottleApi::TagQuotaValue::isValid() const {
+	return reservedReadQuota <= totalReadQuota && reservedWriteQuota <= totalWriteQuota && reservedReadQuota >= 0 &&
+	       reservedWriteQuota >= 0;
+}
+
+Value ThrottleApi::TagQuotaValue::toValue() const {
+	Tuple tuple;
+	tuple.appendDouble(reservedReadQuota);
+	tuple.appendDouble(totalReadQuota);
+	tuple.appendDouble(reservedWriteQuota);
+	tuple.appendDouble(totalWriteQuota);
+	return tuple.pack();
+}
+
+ThrottleApi::TagQuotaValue ThrottleApi::TagQuotaValue::fromValue(ValueRef value) {
+	auto tuple = Tuple::unpack(value);
+	if (tuple.size() != 4) {
+		throw invalid_throttle_quota_value();
+	}
+	TagQuotaValue result;
+	try {
+		result.reservedReadQuota = tuple.getDouble(0);
+		result.totalReadQuota = tuple.getDouble(1);
+		result.reservedWriteQuota = tuple.getDouble(2);
+		result.totalWriteQuota = tuple.getDouble(3);
+	} catch (Error& e) {
+		TraceEvent(SevWarnAlways, "TagQuotaValueFailedToDeserialize").error(e);
+		throw invalid_throttle_quota_value();
+	}
+	if (!result.isValid()) {
+		TraceEvent(SevWarnAlways, "TagQuotaValueInvalidQuotas")
+		    .detail("ReservedReadQuota", result.reservedReadQuota)
+		    .detail("TotalReadQuota", result.totalReadQuota)
+		    .detail("ReservedWriteQuota", result.reservedWriteQuota)
+		    .detail("TotalWriteQuota", result.totalWriteQuota);
+		throw invalid_throttle_quota_value();
+	}
+	return result;
+}
+
 FDB_DEFINE_BOOLEAN_PARAM(ContainsRecommended);
 FDB_DEFINE_BOOLEAN_PARAM(Capitalize);
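The quota record is stored as a 4-element tuple of doubles, in the field order `fromValue` reads back. A hedged sketch using only the `Tuple` calls that appear above (the helper names are hypothetical):

```cpp
// Pack a quota record exactly as TagQuotaValue::toValue does.
Value encodeQuota(double reservedRead, double totalRead, double reservedWrite, double totalWrite) {
	Tuple tuple;
	tuple.appendDouble(reservedRead);
	tuple.appendDouble(totalRead);
	tuple.appendDouble(reservedWrite);
	tuple.appendDouble(totalWrite);
	return tuple.pack();
}

// Pull one field back out; index 1 is totalReadQuota in the order above.
double totalReadQuotaOf(ValueRef value) {
	auto tuple = Tuple::unpack(value);
	ASSERT(tuple.size() == 4);
	return tuple.getDouble(1);
}
```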
diff --git a/fdbclient/Tenant.cpp b/fdbclient/Tenant.cpp
index 216f85afa4..28892db243 100644
--- a/fdbclient/Tenant.cpp
+++ b/fdbclient/Tenant.cpp
@@ -96,12 +96,12 @@ bool TenantMapEntry::matchesConfiguration(TenantMapEntry const& other) const {
 TEST_CASE("/fdbclient/TenantMapEntry/Serialization") {
 	TenantMapEntry entry1(1, ""_sr, TenantState::READY);
 	ASSERT(entry1.prefix == "\x00\x00\x00\x00\x00\x00\x00\x01"_sr);
-	TenantMapEntry entry2 = decodeTenantEntry(encodeTenantEntry(entry1));
+	TenantMapEntry entry2 = TenantMapEntry::decode(entry1.encode());
 	ASSERT(entry1.id == entry2.id && entry1.prefix == entry2.prefix);
 
 	TenantMapEntry entry3(std::numeric_limits<int64_t>::max(), "foo"_sr, TenantState::READY);
 	ASSERT(entry3.prefix == "foo\x7f\xff\xff\xff\xff\xff\xff\xff"_sr);
-	TenantMapEntry entry4 = decodeTenantEntry(encodeTenantEntry(entry3));
+	TenantMapEntry entry4 = TenantMapEntry::decode(entry3.encode());
 	ASSERT(entry3.id == entry4.id && entry3.prefix == entry4.prefix);
 
 	for (int i = 0; i < 100; ++i) {
@@ -120,7 +120,7 @@ TEST_CASE("/fdbclient/TenantMapEntry/Serialization") {
 		       entry.prefix.endsWith(StringRef(reinterpret_cast<uint8_t*>(&bigEndianId), 8)) &&
 		       entry.prefix.size() == subspaceLength + 8);
 
-		TenantMapEntry decodedEntry = decodeTenantEntry(encodeTenantEntry(entry));
+		TenantMapEntry decodedEntry = TenantMapEntry::decode(entry.encode());
 		ASSERT(decodedEntry.id == entry.id && decodedEntry.prefix == entry.prefix);
 	}
diff --git a/fdbclient/include/fdbclient/CommitTransaction.h b/fdbclient/include/fdbclient/CommitTransaction.h
index 3b2c7da874..f6757ac17e 100644
--- a/fdbclient/include/fdbclient/CommitTransaction.h
+++ b/fdbclient/include/fdbclient/CommitTransaction.h
@@ -79,6 +79,7 @@ struct MutationRef {
 		CompareAndClear,
 		Reserved_For_SpanContextMessage /* See fdbserver/SpanContextMessage.h */,
 		Reserved_For_OTELSpanContextMessage,
+		Reserved_For_EncryptedMutationMessage /* See fdbserver/EncryptedMutationMessage.actor.h */,
 		MAX_ATOMIC_OP
 	};
 	// This is stored this way for serialization purposes.
diff --git a/fdbclient/include/fdbclient/DatabaseContext.h b/fdbclient/include/fdbclient/DatabaseContext.h
index c2fbb78ed6..07d27a9365 100644
--- a/fdbclient/include/fdbclient/DatabaseContext.h
+++ b/fdbclient/include/fdbclient/DatabaseContext.h
@@ -116,23 +116,7 @@ public:
 	bool canRecheck() const { return lastCheck < now() - CLIENT_KNOBS->TAG_THROTTLE_RECHECK_INTERVAL; }
 
-	double throttleDuration() const {
-		if (expiration <= now()) {
-			return 0.0;
-		}
-
-		double capacity =
-		    (smoothRate.smoothTotal() - smoothReleased.smoothRate()) * CLIENT_KNOBS->TAG_THROTTLE_SMOOTHING_WINDOW;
-		if (capacity >= 1) {
-			return 0.0;
-		}
-
-		if (tpsRate == 0) {
-			return std::max(0.0, expiration - now());
-		}
-
-		return std::min(expiration - now(), capacity / tpsRate);
-	}
+	double throttleDuration() const;
 };
 
 struct WatchParameters : public ReferenceCounted<WatchParameters> {
@@ -307,10 +291,12 @@ public:
 	Future<Void> splitStorageMetricsStream(PromiseStream<Key> const& resultsStream,
 	                                       KeyRange const& keys,
 	                                       StorageMetrics const& limit,
-	                                       StorageMetrics const& estimated);
+	                                       StorageMetrics const& estimated,
+	                                       Optional<int> const& minSplitBytes = {});
 	Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(KeyRange const& keys,
 	                                                          StorageMetrics const& limit,
-	                                                          StorageMetrics const& estimated);
+	                                                          StorageMetrics const& estimated,
+	                                                          Optional<int> const& minSplitBytes = {});
 
 	Future<Standalone<VectorRef<ReadHotRangeWithMetrics>>> getReadHotRanges(KeyRange const& keys);
diff --git a/fdbclient/include/fdbclient/ManagementAPI.actor.h b/fdbclient/include/fdbclient/ManagementAPI.actor.h
index d4b551eeaa..63f56242f7 100644
--- a/fdbclient/include/fdbclient/ManagementAPI.actor.h
+++ b/fdbclient/include/fdbclient/ManagementAPI.actor.h
@@ -41,6 +41,7 @@ standard API and some knowledge of the contents of the system key space.
 #include "fdbclient/MonitorLeader.h"
 #include "flow/actorcompiler.h" // has to be last include
 
+ACTOR Future<DatabaseConfiguration> getDatabaseConfiguration(Transaction* tr);
 ACTOR Future<DatabaseConfiguration> getDatabaseConfiguration(Database cx);
 ACTOR Future<Void> waitForFullReplication(Database cx);
 
diff --git a/fdbclient/include/fdbclient/NativeAPI.actor.h b/fdbclient/include/fdbclient/NativeAPI.actor.h
index ec5ba530a2..009c22d7cd 100644
--- a/fdbclient/include/fdbclient/NativeAPI.actor.h
+++ b/fdbclient/include/fdbclient/NativeAPI.actor.h
@@ -465,6 +465,7 @@ public:
 	Reference<TransactionState> trState;
 	std::vector<Reference<Watch>> watches;
+	TagSet const& getTags() const;
 	Span span;
 
 	// used in template functions as returned Future type
diff --git a/fdbclient/include/fdbclient/ReadYourWrites.h b/fdbclient/include/fdbclient/ReadYourWrites.h
index 6ddf892774..89de979bc1 100644
--- a/fdbclient/include/fdbclient/ReadYourWrites.h
+++ b/fdbclient/include/fdbclient/ReadYourWrites.h
@@ -196,6 +196,7 @@ public:
 	Transaction& getTransaction() { return tr; }
 	Optional<TenantName> getTenant() { return tr.getTenant(); }
+	TagSet const& getTags() const { return tr.getTags(); }
 
 	// used in template functions as returned Future type
 	template
diff --git a/fdbclient/include/fdbclient/ServerKnobs.h b/fdbclient/include/fdbclient/ServerKnobs.h
index db0525899b..c6c926ab7a 100644
--- a/fdbclient/include/fdbclient/ServerKnobs.h
+++ b/fdbclient/include/fdbclient/ServerKnobs.h
@@ -177,7 +177,7 @@ public:
 	    SHARD_MIN_BYTES_PER_KSEC, // Shards with more than this bandwidth will not be merged
 	    SHARD_SPLIT_BYTES_PER_KSEC; // When splitting a shard, it is split into pieces with less than this bandwidth
 	double SHARD_MAX_READ_DENSITY_RATIO;
-	int64_t SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS;
+	int64_t SHARD_READ_HOT_BANDWIDTH_MIN_PER_KSECONDS;
 	double SHARD_MAX_BYTES_READ_PER_KSEC_JITTER;
 	double STORAGE_METRIC_TIMEOUT;
 	double METRIC_DELAY;
@@ -564,6 +564,7 @@ public:
 	int64_t TLOG_RECOVER_MEMORY_LIMIT;
 	double TLOG_IGNORE_POP_AUTO_ENABLE_DELAY;
 
+	// Tag throttling
 	int64_t MAX_MANUAL_THROTTLED_TRANSACTION_TAGS;
 	int64_t MAX_AUTO_THROTTLED_TRANSACTION_TAGS;
 	double MIN_TAG_COST;
@@ -576,6 +577,17 @@ public:
 	double AUTO_TAG_THROTTLE_UPDATE_FREQUENCY;
 	double TAG_THROTTLE_EXPIRED_CLEANUP_INTERVAL;
 	bool AUTO_TAG_THROTTLING_ENABLED;
+	// Limit to the number of throttling tags each storage server
+	// will track and send to the ratekeeper
+	int64_t SS_THROTTLE_TAGS_TRACKED;
+	// Use global tag throttling strategy, i.e. throttle based on the cluster-wide
+	// throughput for tags and their associated quotas.
+	bool GLOBAL_TAG_THROTTLING;
+	// Minimum number of transactions per second that the global tag throttler must allow for each tag
+	double GLOBAL_TAG_THROTTLING_MIN_RATE;
+	// Used by global tag throttling counters
+	double GLOBAL_TAG_THROTTLING_FOLDING_TIME;
+	double GLOBAL_TAG_THROTTLING_TRACE_INTERVAL;
 
 	double MAX_TRANSACTIONS_PER_BYTE;
 
@@ -603,7 +615,12 @@ public:
 
 	// disk snapshot
 	int64_t MAX_FORKED_PROCESS_OUTPUT;
+	// retry limit after network failures
+	int64_t SNAP_NETWORK_FAILURE_RETRY_LIMIT;
+	// time limit for creating snapshot
 	double SNAP_CREATE_MAX_TIMEOUT;
+	// minimum gap time between two snapshot requests for the same process
+	double SNAP_MINIMUM_TIME_GAP;
 	// Maximum number of storage servers a snapshot can fail to
 	// capture while still succeeding
 	int64_t MAX_STORAGE_SNAPSHOT_FAULT_TOLERANCE;
@@ -672,6 +689,7 @@ public:
 	bool ENABLE_CLEAR_RANGE_EAGER_READS;
 	bool QUICK_GET_VALUE_FALLBACK;
 	bool QUICK_GET_KEY_VALUES_FALLBACK;
+	int MAX_PARALLEL_QUICK_GET_VALUE;
 	int CHECKPOINT_TRANSFER_BLOCK_BYTES;
 	int QUICK_GET_KEY_VALUES_LIMIT;
 	int QUICK_GET_KEY_VALUES_LIMIT_BYTES;
diff --git a/fdbclient/include/fdbclient/StorageServerInterface.h b/fdbclient/include/fdbclient/StorageServerInterface.h
index 006f4c12d3..febbc1311b 100644
--- a/fdbclient/include/fdbclient/StorageServerInterface.h
+++ b/fdbclient/include/fdbclient/StorageServerInterface.h
@@ -716,18 +716,21 @@ struct SplitMetricsRequest {
 	StorageMetrics estimated;
 	bool isLastShard;
 	ReplyPromise<SplitMetricsReply> reply;
+	Optional<int> minSplitBytes;
 
 	SplitMetricsRequest() {}
 	SplitMetricsRequest(KeyRangeRef const& keys,
 	                    StorageMetrics const& limits,
 	                    StorageMetrics const& used,
 	                    StorageMetrics const& estimated,
-	                    bool isLastShard)
-	  : keys(arena, keys), limits(limits), used(used), estimated(estimated), isLastShard(isLastShard) {}
+	                    bool isLastShard,
+	                    Optional<int> minSplitBytes)
+	  : keys(arena, keys), limits(limits), used(used), estimated(estimated), isLastShard(isLastShard),
+	    minSplitBytes(minSplitBytes) {}
 
 	template <class Ar>
 	void serialize(Ar& ar) {
-		serializer(ar, keys, limits, used, estimated, isLastShard, reply, arena);
+		serializer(ar, keys, limits, used, estimated, isLastShard, reply, arena, minSplitBytes);
 	}
 };
diff --git a/fdbclient/include/fdbclient/SystemData.h b/fdbclient/include/fdbclient/SystemData.h
index 857ec09835..1cc462c16d 100644
--- a/fdbclient/include/fdbclient/SystemData.h
+++ b/fdbclient/include/fdbclient/SystemData.h
@@ -390,6 +390,8 @@ extern const KeyRef tagThrottleSignalKey;
 extern const KeyRef tagThrottleAutoEnabledKey;
 extern const KeyRef tagThrottleLimitKey;
 extern const KeyRef tagThrottleCountKey;
+extern const KeyRangeRef tagQuotaKeys;
+extern const KeyRef tagQuotaPrefix;
 
 // Log Range constant variables
 // Used in the backup pipeline to track mutations
@@ -632,9 +634,6 @@ extern const KeyRef tenantDataPrefixKey;
 extern const KeyRangeRef tenantGroupTenantIndexKeys;
 extern const KeyRangeRef tenantTombstoneKeys;
 
-Value encodeTenantEntry(TenantMapEntry const& tenantEntry);
-TenantMapEntry decodeTenantEntry(ValueRef const& value);
-
 // Metacluster keys
 extern const KeyRangeRef dataClusterMetadataKeys;
 extern const KeyRef dataClusterMetadataPrefix;
diff --git a/fdbclient/include/fdbclient/TagThrottle.actor.h b/fdbclient/include/fdbclient/TagThrottle.actor.h
index 3330abb4d9..020fcea568 100644
--- a/fdbclient/include/fdbclient/TagThrottle.actor.h
+++ b/fdbclient/include/fdbclient/TagThrottle.actor.h
@@ -207,6 +207,8 @@ struct ClientTagThrottleLimits {
 	double tpsRate;
 	double expiration;
 
+	static double const NO_EXPIRATION;
+
 	ClientTagThrottleLimits() : tpsRate(0), expiration(0) {}
 	ClientTagThrottleLimits(double tpsRate, double expiration) : tpsRate(tpsRate), expiration(expiration) {}
 
@@ -595,6 +597,38 @@ Future<Void> enableAuto(Reference<DB> db, bool enabled) {
 	}
 }
 
+class TagQuotaValue {
+public:
+	double reservedReadQuota{ 0.0 };
+	double totalReadQuota{ 0.0 };
+	double reservedWriteQuota{ 0.0 };
+	double totalWriteQuota{ 0.0 };
+	bool isValid() const;
+	Value toValue() const;
+	static TagQuotaValue fromValue(ValueRef);
+};
+
+Key getTagQuotaKey(TransactionTagRef);
+
+template <class Tr>
+void setTagQuota(Reference<Tr> tr,
+                 TransactionTagRef tag,
+                 double reservedReadQuota,
+                 double totalReadQuota,
+                 double reservedWriteQuota,
+                 double totalWriteQuota) {
+	TagQuotaValue tagQuotaValue;
+	tagQuotaValue.reservedReadQuota = reservedReadQuota;
+	tagQuotaValue.totalReadQuota = totalReadQuota;
+	tagQuotaValue.reservedWriteQuota = reservedWriteQuota;
+	tagQuotaValue.totalWriteQuota = totalWriteQuota;
+	if (!tagQuotaValue.isValid()) {
+		throw invalid_throttle_quota_value();
+	}
+	tr->set(getTagQuotaKey(tag), tagQuotaValue.toValue());
+	signalThrottleChange(tr);
+}
+
 }; // namespace ThrottleApi
 
 template
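A hedged usage sketch for the `setTagQuota` template above, called from inside an existing transaction retry loop; the function name and the numeric quotas are hypothetical:

```cpp
void applyDefaultQuota(Reference<ITransaction> tr, TransactionTagRef tag) {
	// Each reserved quota must not exceed the corresponding total quota,
	// otherwise setTagQuota throws invalid_throttle_quota_value().
	ThrottleApi::setTagQuota(tr,
	                         tag,
	                         /*reservedReadQuota=*/100.0,
	                         /*totalReadQuota=*/1000.0,
	                         /*reservedWriteQuota=*/100.0,
	                         /*totalWriteQuota=*/1000.0);
	// The caller is expected to commit and retry as usual.
}
```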
diff --git a/fdbclient/include/fdbclient/Tenant.h b/fdbclient/include/fdbclient/Tenant.h
index 1591ff7656..152f204ddf 100644
--- a/fdbclient/include/fdbclient/Tenant.h
+++ b/fdbclient/include/fdbclient/Tenant.h
@@ -59,6 +59,15 @@ struct TenantMapEntry {
 	void setSubspace(KeyRef subspace);
 	bool matchesConfiguration(TenantMapEntry const& other) const;
 
+	Value encode() const { return ObjectWriter::toValue(*this, IncludeVersion(ProtocolVersion::withTenants())); }
+
+	static TenantMapEntry decode(ValueRef const& value) {
+		TenantMapEntry entry;
+		ObjectReader reader(value.begin(), IncludeVersion(ProtocolVersion::withTenants()));
+		reader.deserialize(entry);
+		return entry;
+	}
+
 	template <class Ar>
 	void serialize(Ar& ar) {
 		KeyRef subspace;
diff --git a/fdbclient/include/fdbclient/TenantManagement.actor.h b/fdbclient/include/fdbclient/TenantManagement.actor.h
index 7e96f506d1..6a203c74de 100644
--- a/fdbclient/include/fdbclient/TenantManagement.actor.h
+++ b/fdbclient/include/fdbclient/TenantManagement.actor.h
@@ -41,7 +41,7 @@ Future<Optional<TenantMapEntry>> tryGetTenantTransaction(Transaction tr, TenantName name) {
 	state typename transaction_future_type<Transaction, Optional<Value>>::type tenantFuture = tr->get(tenantMapKey);
 	Optional<Value> val = wait(safeThreadFutureToFuture(tenantFuture));
-	return val.map([](Optional<Value> v) { return decodeTenantEntry(v.get()); });
+	return val.map([](Optional<Value> v) { return TenantMapEntry::decode(v.get()); });
 }
 
 ACTOR template <class Transaction>
@@ -181,7 +181,8 @@ Future<std::pair<Optional<TenantMapEntry>, bool>> createTenantTransaction(
 		tenantEntry.assignedCluster = Optional<ClusterName>();
 	}
 
-	tr->set(tenantMapKey, encodeTenantEntry(tenantEntry));
+	tr->set(tenantMapKey, newTenant.encode());
+
 	if (tenantEntry.tenantGroup.present()) {
 		tr->set(getTenantGroupIndexKey(tenantEntry.tenantGroup.get(), name), ""_sr);
 	}
@@ -355,7 +356,7 @@ Future<std::map<TenantName, TenantMapEntry>> listTenantsTransaction(Transaction tr,
 
 	std::map<TenantName, TenantMapEntry> tenants;
 	for (auto kv : results) {
-		tenants[kv.key.removePrefix(tenantMapPrefix)] = decodeTenantEntry(kv.value);
+		tenants[kv.key.removePrefix(tenantMapPrefix)] = TenantMapEntry::decode(kv.value);
 	}
 
 	return tenants;
diff --git a/fdbrpc/include/fdbrpc/simulator.h b/fdbrpc/include/fdbrpc/simulator.h
index d74a238510..99702bf237 100644
--- a/fdbrpc/include/fdbrpc/simulator.h
+++ b/fdbrpc/include/fdbrpc/simulator.h
@@ -126,6 +126,10 @@ public:
 	Future<KillType> onShutdown() { return shutdownSignal.getFuture(); }
 
+	bool isSpawnedKVProcess() const {
+		// SOMEDAY: a separate bool may be better?
+ return name == "remote flow process"; + } bool isReliable() const { return !failed && fault_injection_p1 == 0 && fault_injection_p2 == 0 && !failedDisk && (!machine || (machine->machineProcess->fault_injection_p1 == 0 && diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 6c508c0279..98bc972b57 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -1328,7 +1328,8 @@ public: std::vector primaryLocalitiesDead, primaryLocalitiesLeft; for (auto processInfo : getAllProcesses()) { - if (processInfo->isAvailableClass() && processInfo->locality.dcId() == dcId) { + if (!processInfo->isSpawnedKVProcess() && processInfo->isAvailableClass() && + processInfo->locality.dcId() == dcId) { if (processInfo->isExcluded() || processInfo->isCleared() || !processInfo->isAvailable()) { primaryProcessesDead.add(processInfo->locality); primaryLocalitiesDead.push_back(processInfo->locality); @@ -1348,7 +1349,6 @@ public: if (usableRegions > 1 && remoteTLogPolicy && !primaryTLogsDead) { primaryTLogsDead = primaryProcessesDead.validate(remoteTLogPolicy); } - return primaryTLogsDead || primaryProcessesDead.validate(storagePolicy); } @@ -1602,7 +1602,7 @@ public: .detail("Protected", protectedAddresses.count(machine->address)) .backtrace(); // This will remove all the "tracked" messages that came from the machine being killed - if (machine->name != "remote flow process") + if (!machine->isSpawnedKVProcess()) latestEventCache.clear(); machine->failed = true; } else if (kt == InjectFaults) { @@ -1631,8 +1631,7 @@ public: } else { ASSERT(false); } - ASSERT(!protectedAddresses.count(machine->address) || machine->rebooting || - machine->name == "remote flow process"); + ASSERT(!protectedAddresses.count(machine->address) || machine->rebooting || machine->isSpawnedKVProcess()); } void rebootProcess(ProcessInfo* process, KillType kt) override { if (kt == RebootProcessAndDelete && protectedAddresses.count(process->address)) { @@ -2498,7 +2497,7 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) { .detail("Rebooting", p->rebooting) .detail("Reliable", p->isReliable()); return; - } else if (p->name == "remote flow process") { + } else if (p->isSpawnedKVProcess()) { TraceEvent(SevDebug, "DoRebootFailed").detail("Name", p->name).detail("Address", p->address); return; } else if (p->getChilds().size()) { diff --git a/fdbserver/ApplyMetadataMutation.cpp b/fdbserver/ApplyMetadataMutation.cpp index e7426e4d43..c43d1b4498 100644 --- a/fdbserver/ApplyMetadataMutation.cpp +++ b/fdbserver/ApplyMetadataMutation.cpp @@ -24,6 +24,7 @@ #include "fdbclient/Notified.h" #include "fdbclient/SystemData.h" #include "fdbserver/ApplyMetadataMutation.h" +#include "fdbserver/EncryptedMutationMessage.h" #include "fdbserver/IKeyValueStore.h" #include "fdbserver/LogProtocolMessage.h" #include "fdbserver/LogSystem.h" @@ -67,13 +68,14 @@ public: ProxyCommitData& proxyCommitData_, Reference logSystem_, LogPushData* toCommit_, + const std::unordered_map>* cipherKeys_, bool& confChange_, Version version, Version popVersion_, bool initialCommit_) : spanContext(spanContext_), dbgid(proxyCommitData_.dbgid), arena(arena_), mutations(mutations_), - txnStateStore(proxyCommitData_.txnStateStore), toCommit(toCommit_), confChange(confChange_), - logSystem(logSystem_), version(version), popVersion(popVersion_), + txnStateStore(proxyCommitData_.txnStateStore), toCommit(toCommit_), cipherKeys(cipherKeys_), + confChange(confChange_), logSystem(logSystem_), version(version), popVersion(popVersion_), 
	    vecBackupKeys(&proxyCommitData_.vecBackupKeys), keyInfo(&proxyCommitData_.keyInfo),
	    cacheInfo(&proxyCommitData_.cacheInfo),
	    uid_applyMutationsData(proxyCommitData_.firstProxy ? &proxyCommitData_.uid_applyMutationsData : nullptr),
@@ -108,6 +110,9 @@ private:
	// non-null if these mutations were part of a new commit handled by this commit proxy
	LogPushData* toCommit = nullptr;

+	// Cipher keys used to encrypt the mutations to be committed
+	const std::unordered_map>* cipherKeys = nullptr;
+
	// Flag indicates if the configure is changed
	bool& confChange;

@@ -152,6 +157,16 @@ private:
	bool dummyConfChange = false;

private:
+	void writeMutation(const MutationRef& m) {
+		if (forResolver || !SERVER_KNOBS->ENABLE_TLOG_ENCRYPTION) {
+			toCommit->writeTypedMessage(m);
+		} else {
+			ASSERT(cipherKeys != nullptr);
+			Arena arena;
+			toCommit->writeTypedMessage(EncryptedMutationMessage::encryptMetadata(arena, *cipherKeys, m));
+		}
+	}
+
	void checkSetKeyServersPrefix(MutationRef m) {
		if (!m.param1.startsWith(keyServersPrefix)) {
			return;
		}
@@ -221,7 +236,7 @@ private:
			    .detail("Tag", tag.toString());

			toCommit->addTag(tag);
-			toCommit->writeTypedMessage(privatized);
+			writeMutation(privatized);
		}
	}

@@ -243,7 +258,7 @@ private:
			toCommit->writeTypedMessage(LogProtocolMessage(), true);
			TraceEvent(SevDebug, "SendingPrivatized_ServerTag", dbgid).detail("M", privatized);
			toCommit->addTag(tag);
-			toCommit->writeTypedMessage(privatized);
+			writeMutation(privatized);
		}
		if (!initialCommit) {
			txnStateStore->set(KeyValueRef(m.param1, m.param2));
@@ -303,7 +318,7 @@ private:
		privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
		TraceEvent(SevDebug, "SendingPrivatized_CacheTag", dbgid).detail("M", privatized);
		toCommit->addTag(cacheTag);
-		toCommit->writeTypedMessage(privatized);
+		writeMutation(privatized);
	}

	void checkSetConfigKeys(MutationRef m) {
@@ -354,7 +369,7 @@ private:
				toCommit->addTags(allSources);
			}
			TraceEvent(SevDebug, "SendingPrivatized_ChangeFeed", dbgid).detail("M", privatized);
-			toCommit->writeTypedMessage(privatized);
+			writeMutation(privatized);
		}
	}

@@ -408,7 +423,7 @@ private:
			if (tagV.present()) {
				TraceEvent(SevDebug, "SendingPrivatized_TSSID", dbgid).detail("M", privatized);
				toCommit->addTag(decodeServerTagValue(tagV.get()));
-				toCommit->writeTypedMessage(privatized);
+				writeMutation(privatized);
			}
		}
	}
@@ -437,7 +452,7 @@ private:
			privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
			TraceEvent(SevDebug, "SendingPrivatized_TSSQuarantine", dbgid).detail("M", privatized);
			toCommit->addTag(decodeServerTagValue(tagV.get()));
-			toCommit->writeTypedMessage(privatized);
+			writeMutation(privatized);
		}
	}

@@ -560,7 +575,7 @@ private:
		privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
		TraceEvent(SevDebug, "SendingPrivatized_GlobalKeys", dbgid).detail("M", privatized);
		toCommit->addTags(allTags);
-		toCommit->writeTypedMessage(privatized);
+		writeMutation(privatized);
	}

	// Generates private mutations for the target storage server, instructing it to create a checkpoint.
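The hunks above route every privatized metadata mutation through the new writeMutation() helper rather than calling toCommit->writeTypedMessage() directly, so the encrypt-or-plaintext decision lives in exactly one place. Below is a minimal standalone sketch of that dispatch pattern; Mutation, MessageSink, and the XOR-based encrypt() are simplified stand-ins, not the real FDB types or the EncryptedMutationMessage API.

#include <cassert>
#include <iostream>
#include <string>
#include <vector>

struct Mutation {
	std::string param1, param2;
};

struct MessageSink {
	std::vector<std::string> messages;
	void writeTypedMessage(const std::string& payload) { messages.push_back(payload); }
};

// Stand-in for EncryptedMutationMessage::encryptMetadata(); any reversible transform works here.
std::string encrypt(const std::string& plaintext, const std::string& key) {
	std::string out = plaintext;
	for (size_t i = 0; i < out.size(); ++i)
		out[i] ^= key[i % key.size()];
	return out;
}

// Mirrors the control flow of writeMutation() above: plaintext unless encryption is
// enabled and we are not running on the resolver (which holds no cipher keys).
void writeMutation(MessageSink& sink, const Mutation& m, bool forResolver, bool encryptionEnabled, const std::string* cipherKey) {
	if (forResolver || !encryptionEnabled) {
		sink.writeTypedMessage(m.param1 + "=" + m.param2);
	} else {
		assert(cipherKey != nullptr); // analogous to ASSERT(cipherKeys != nullptr)
		sink.writeTypedMessage(encrypt(m.param1 + "=" + m.param2, *cipherKey));
	}
}

int main() {
	MessageSink sink;
	std::string key = "0123456789abcdef";
	writeMutation(sink, { "\xff/serverTag", "tag" }, /*forResolver=*/false, /*encryptionEnabled=*/true, &key);
	std::cout << sink.messages.size() << " message(s) written\n";
}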
@@ -582,7 +597,7 @@ private: .detail("Checkpoint", checkpoint.toString()); toCommit->addTag(tag); - toCommit->writeTypedMessage(privatized); + writeMutation(privatized); } } @@ -639,7 +654,7 @@ private: if (tenantMap) { ASSERT(version != invalidVersion); TenantName tenantName = m.param1.removePrefix(tenantMapPrefix); - TenantMapEntry tenantEntry = decodeTenantEntry(m.param2); + TenantMapEntry tenantEntry = TenantMapEntry::decode(m.param2); TraceEvent("CommitProxyInsertTenant", dbgid).detail("Tenant", tenantName).detail("Version", version); (*tenantMap)[tenantName] = tenantEntry; @@ -662,7 +677,7 @@ private: MutationRef privatized = m; privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena); - toCommit->writeTypedMessage(privatized); + writeMutation(privatized); } TEST(true); // Tenant added to map @@ -780,7 +795,7 @@ private: TraceEvent(SevDebug, "SendingPrivatized_ClearServerTag", dbgid).detail("M", privatized); toCommit->addTag(decodeServerTagValue(kv.value)); - toCommit->writeTypedMessage(privatized); + writeMutation(privatized); } } // Might be a tss removal, which doesn't store a tag there. @@ -804,7 +819,7 @@ private: TraceEvent(SevDebug, "SendingPrivatized_TSSClearServerTag", dbgid) .detail("M", privatized); toCommit->addTag(decodeServerTagValue(tagV.get())); - toCommit->writeTypedMessage(privatized); + writeMutation(privatized); } } } @@ -989,7 +1004,7 @@ private: privatized.param2 = m.param2.withPrefix(systemKeys.begin, arena); TraceEvent(SevDebug, "SendingPrivatized_ClearTSSMapping", dbgid).detail("M", privatized); toCommit->addTag(decodeServerTagValue(tagV.get())); - toCommit->writeTypedMessage(privatized); + writeMutation(privatized); } } @@ -1016,7 +1031,7 @@ private: privatized.param2 = m.param2.withPrefix(systemKeys.begin, arena); TraceEvent(SevDebug, "SendingPrivatized_ClearTSSQuarantine", dbgid).detail("M", privatized); toCommit->addTag(decodeServerTagValue(tagV.get())); - toCommit->writeTypedMessage(privatized); + writeMutation(privatized); } } } @@ -1070,7 +1085,7 @@ private: privatized.type = MutationRef::ClearRange; privatized.param1 = range.begin.withPrefix(systemKeys.begin, arena); privatized.param2 = range.end.withPrefix(systemKeys.begin, arena); - toCommit->writeTypedMessage(privatized); + writeMutation(privatized); } TEST(true); // Tenant cleared from map @@ -1179,9 +1194,9 @@ private: .detail("MBegin", mutationBegin) .detail("MEnd", mutationEnd); toCommit->addTags(allTags); - toCommit->writeTypedMessage(mutationBegin); + writeMutation(mutationBegin); toCommit->addTags(allTags); - toCommit->writeTypedMessage(mutationEnd); + writeMutation(mutationEnd); } } @@ -1258,6 +1273,7 @@ void applyMetadataMutations(SpanContext const& spanContext, Reference logSystem, const VectorRef& mutations, LogPushData* toCommit, + const std::unordered_map>* pCipherKeys, bool& confChange, Version version, Version popVersion, @@ -1269,6 +1285,7 @@ void applyMetadataMutations(SpanContext const& spanContext, proxyCommitData, logSystem, toCommit, + pCipherKeys, confChange, version, popVersion, diff --git a/fdbserver/BackupWorker.actor.cpp b/fdbserver/BackupWorker.actor.cpp index 4458937a4f..91c9962577 100644 --- a/fdbserver/BackupWorker.actor.cpp +++ b/fdbserver/BackupWorker.actor.cpp @@ -25,6 +25,8 @@ #include "fdbclient/SystemData.h" #include "fdbserver/BackupInterface.h" #include "fdbserver/BackupProgress.actor.h" +#include "fdbserver/EncryptedMutationMessage.h" +#include "fdbserver/GetEncryptCipherKeys.h" #include "fdbserver/Knobs.h" #include "fdbserver/LogProtocolMessage.h" 
#include "fdbserver/LogSystem.h" @@ -44,6 +46,7 @@ struct VersionedMessage { StringRef message; VectorRef tags; Arena arena; // Keep a reference to the memory containing the message + Arena decryptArena; // Arena used for decrypt buffer. size_t bytes; // arena's size when inserted, which can grow afterwards VersionedMessage(LogMessageVersion v, StringRef m, const VectorRef& t, const Arena& a) @@ -53,7 +56,8 @@ struct VersionedMessage { // Returns true if the message is a mutation that should be backuped, i.e., // either key is not in system key space or is not a metadataVersionKey. - bool isBackupMessage(MutationRef* m) const { + bool isBackupMessage(MutationRef* m, + const std::unordered_map>& cipherKeys) { for (Tag tag : tags) { if (tag.locality == tagLocalitySpecial || tag.locality == tagLocalityTxs) { return false; // skip Txs mutations @@ -71,10 +75,26 @@ struct VersionedMessage { TEST(true); // Returning false for OTELSpanContextMessage return false; } - - reader >> *m; + if (EncryptedMutationMessage::isNextIn(reader)) { + // In case the mutation is encrypted, get the decrypted mutation and also update message to point to + // the decrypted mutation. + // We use dedicated arena for decrypt buffer, as the other arena is used to count towards backup lock bytes. + *m = EncryptedMutationMessage::decrypt(reader, decryptArena, cipherKeys, &message); + } else { + reader >> *m; + } return normalKeys.contains(m->param1) || m->param1 == metadataVersionKey; } + + void collectCipherDetailIfEncrypted(std::unordered_set& cipherDetails) { + ArenaReader reader(arena, message, AssumeVersion(g_network->protocolVersion())); + if (EncryptedMutationMessage::isNextIn(reader)) { + EncryptedMutationMessage emm; + reader >> emm; + cipherDetails.insert(emm.header.cipherTextDetails); + cipherDetails.insert(emm.header.cipherHeaderDetails); + } + } }; struct BackupData { @@ -89,6 +109,7 @@ struct BackupData { Version minKnownCommittedVersion; Version savedVersion; // Largest version saved to blob storage Version popVersion; // Largest version popped in NOOP mode, can be larger than savedVersion. + Reference const> db; AsyncVar> logSystem; Database cx; std::vector messages; @@ -245,7 +266,7 @@ struct BackupData { : myId(id), tag(req.routerTag), totalTags(req.totalTags), startVersion(req.startVersion), endVersion(req.endVersion), recruitedEpoch(req.recruitedEpoch), backupEpoch(req.backupEpoch), minKnownCommittedVersion(invalidVersion), savedVersion(req.startVersion - 1), popVersion(req.startVersion - 1), - pulledVersion(0), paused(false), lock(new FlowLock(SERVER_KNOBS->BACKUP_LOCK_BYTES)), + db(db), pulledVersion(0), paused(false), lock(new FlowLock(SERVER_KNOBS->BACKUP_LOCK_BYTES)), cc("BackupWorker", myId.toString()) { cx = openDBOnServer(db, TaskPriority::DefaultEndpoint, LockAware::True); @@ -682,7 +703,10 @@ ACTOR static Future updateLogBytesWritten(BackupData* self, // Saves messages in the range of [0, numMsg) to a file and then remove these // messages. The file content format is a sequence of (Version, sub#, msgSize, message). // Note only ready backups are saved. 
-ACTOR Future saveMutationsToFile(BackupData* self, Version popVersion, int numMsg) {
+ACTOR Future saveMutationsToFile(BackupData* self,
+                                 Version popVersion,
+                                 int numMsg,
+                                 std::unordered_set cipherDetails) {
	state int blockSize = SERVER_KNOBS->BACKUP_FILE_BLOCK_BYTES;
	state std::vector>> logFileFutures;
	state std::vector> logFiles;
@@ -691,6 +715,7 @@ ACTOR Future saveMutationsToFile(BackupData* self, Version popVersion, int
	state std::vector beginVersions; // logFiles' begin versions
	state KeyRangeMap> keyRangeMap; // range to index in logFileFutures, logFiles, & blockEnds
	state std::vector> mutations;
+	state std::unordered_map> cipherKeys;
	state int idx;

	// Make sure all backups are ready, otherwise mutations will be lost.
@@ -742,11 +767,18 @@ ACTOR Future saveMutationsToFile(BackupData* self, Version popVersion, int
		    .detail("File", logFiles[i]->getFileName());
	}

+	// Fetch cipher keys if any of the messages are encrypted.
+	if (!cipherDetails.empty()) {
+		std::unordered_map> getCipherKeysResult =
+		    wait(getEncryptCipherKeys(self->db, cipherDetails));
+		cipherKeys = getCipherKeysResult;
+	}
+
	blockEnds = std::vector(logFiles.size(), 0);
	for (idx = 0; idx < numMsg; idx++) {
-		const auto& message = self->messages[idx];
+		auto& message = self->messages[idx];
		MutationRef m;
-		if (!message.isBackupMessage(&m))
+		if (!message.isBackupMessage(&m, cipherKeys))
			continue;

		DEBUG_MUTATION("addMutation", message.version.version, m)
@@ -815,6 +847,7 @@ ACTOR Future uploadData(BackupData* self) {
	state Future uploadDelay = delay(SERVER_KNOBS->BACKUP_UPLOAD_DELAY);

	state int numMsg = 0;
+	state std::unordered_set cipherDetails;
	Version lastPopVersion = popVersion;
	// index of last version's end position in self->messages
	int lastVersionIndex = 0;
@@ -826,7 +859,7 @@ ACTOR Future uploadData(BackupData* self) {
			popVersion = std::max(popVersion, self->minKnownCommittedVersion);
		}
	} else {
-		for (const auto& message : self->messages) {
+		for (auto& message : self->messages) {
			// message may be prefetched in peek; uncommitted message should not be uploaded.
			const Version version = message.getVersion();
			if (version > self->maxPopVersion())
			lastVersion = popVersion;
			popVersion = version;
			}
+			message.collectCipherDetailIfEncrypted(cipherDetails);
			numMsg++;
		}
	}
@@ -859,7 +893,7 @@
		    .detail("NumMsg", numMsg)
		    .detail("MsgQ", self->messages.size());
		// save an empty file for old epochs so that log file versions are continuous
-		wait(saveMutationsToFile(self, popVersion, numMsg));
+		wait(saveMutationsToFile(self, popVersion, numMsg, cipherDetails));
		self->eraseMessages(numMsg);
	}
diff --git a/fdbserver/BlobManager.actor.cpp b/fdbserver/BlobManager.actor.cpp
index 671d08ccbb..0b6e940f9f 100644
--- a/fdbserver/BlobManager.actor.cpp
+++ b/fdbserver/BlobManager.actor.cpp
@@ -354,8 +354,9 @@ ACTOR Future>> splitRange(Reference
	state PromiseStream resultStream;
	state Standalone> keys;
-	state Future streamFuture =
-	    bmData->db->splitStorageMetricsStream(resultStream, range, splitMetrics, estimated);
+	// Use splitMetrics.bytes / 3 as the minimum split size, to match the splitThreshold logic above.
+ state Future streamFuture = bmData->db->splitStorageMetricsStream( + resultStream, range, splitMetrics, estimated, splitMetrics.bytes / 3); loop { try { Key k = waitNext(resultStream.getFuture()); @@ -846,7 +847,7 @@ ACTOR Future monitorClientRanges(Reference bmData) { std::vector prefixes; for (auto& it : tenantResults) { TenantNameRef tenantName = it.key.removePrefix(tenantMapPrefix); - TenantMapEntry entry = decodeTenantEntry(it.value); + TenantMapEntry entry = TenantMapEntry::decode(it.value); tenants.push_back(std::pair(tenantName, entry)); prefixes.push_back(entry.prefix); } diff --git a/fdbserver/BlobWorker.actor.cpp b/fdbserver/BlobWorker.actor.cpp index ac4210cc5d..44a90030a5 100644 --- a/fdbserver/BlobWorker.actor.cpp +++ b/fdbserver/BlobWorker.actor.cpp @@ -3198,7 +3198,7 @@ ACTOR Future monitorTenants(Reference bwData) { for (auto& it : tenantResults) { // FIXME: handle removing/moving tenants! TenantNameRef tenantName = it.key.removePrefix(tenantMapPrefix); - TenantMapEntry entry = decodeTenantEntry(it.value); + TenantMapEntry entry = TenantMapEntry::decode(it.value); tenants.push_back(std::pair(tenantName, entry)); } bwData->tenantData.addTenants(tenants); diff --git a/fdbserver/ClusterRecovery.actor.cpp b/fdbserver/ClusterRecovery.actor.cpp index 7a1b6eb18d..3480d60d95 100644 --- a/fdbserver/ClusterRecovery.actor.cpp +++ b/fdbserver/ClusterRecovery.actor.cpp @@ -1013,11 +1013,6 @@ ACTOR Future updateLocalityForDcId(Optional dcId, if (ver == invalidVersion) { ver = oldLogSystem->getKnownCommittedVersion(); } - if (SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) { - // Do not try to split peeks between data centers in peekTxns() to recover mem kvstore. - // This recovery optimization won't work in UNICAST mode. - loc.first = -1; - } locality->set(PeekTxsInfo(loc.first, loc.second, ver)); TraceEvent("UpdatedLocalityForDcId") diff --git a/fdbserver/CommitProxyServer.actor.cpp b/fdbserver/CommitProxyServer.actor.cpp index 9df56a83aa..b4afad5170 100644 --- a/fdbserver/CommitProxyServer.actor.cpp +++ b/fdbserver/CommitProxyServer.actor.cpp @@ -34,7 +34,9 @@ #include "fdbserver/ApplyMetadataMutation.h" #include "fdbserver/ConflictSet.h" #include "fdbserver/DataDistributorInterface.h" +#include "fdbserver/EncryptedMutationMessage.h" #include "fdbserver/FDBExecHelper.actor.h" +#include "fdbserver/GetEncryptCipherKeys.h" #include "fdbserver/IKeyValueStore.h" #include "fdbserver/Knobs.h" #include "fdbserver/LogSystem.h" @@ -48,6 +50,7 @@ #include "fdbserver/WaitFailure.h" #include "fdbserver/WorkerInterface.actor.h" #include "flow/ActorCollection.h" +#include "flow/BlobCipher.h" #include "flow/Error.h" #include "flow/IRandom.h" #include "flow/Knobs.h" @@ -641,6 +644,9 @@ struct CommitBatchContext { std::set writtenTags; // final set tags written to in the batch std::set writtenTagsPreResolution; // tags written to in the batch not including any changes from the resolver. + // Cipher keys to be used to encrypt mutations + std::unordered_map> cipherKeys; + CommitBatchContext(ProxyCommitData*, const std::vector*, const int); void setupTraceBatch(); @@ -897,6 +903,27 @@ ACTOR Future getResolution(CommitBatchContext* self) { self->transactionResolverMap.swap(requests.transactionResolverMap); // Used to report conflicting keys self->txReadConflictRangeIndexMap.swap(requests.txReadConflictRangeIndexMap); + + // Fetch cipher keys if needed. 
+ state Future>> getCipherKeys; + if (SERVER_KNOBS->ENABLE_TLOG_ENCRYPTION) { + static std::unordered_map defaultDomains = { + { SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME }, + { ENCRYPT_HEADER_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME } + }; + std::unordered_map encryptDomains = defaultDomains; + for (int t = 0; t < trs.size(); t++) { + int64_t tenantId = trs[t].tenantInfo.tenantId; + Optional tenantName = trs[t].tenantInfo.name; + // TODO(yiwu): In raw access mode, use tenant prefix to figure out tenant id for user data + if (tenantId != TenantInfo::INVALID_TENANT) { + ASSERT(tenantName.present()); + encryptDomains[tenantId] = tenantName.get(); + } + } + getCipherKeys = getLatestEncryptCipherKeys(pProxyCommitData->db, encryptDomains); + } + self->releaseFuture = releaseResolvingAfter(pProxyCommitData, self->releaseDelay, self->localBatchNumber); if (self->localBatchNumber - self->pProxyCommitData->latestLocalCommitBatchLogging.get() > @@ -922,6 +949,11 @@ ACTOR Future getResolution(CommitBatchContext* self) { "CommitDebug", self->debugID.get().first(), "CommitProxyServer.commitBatch.AfterResolution"); } + if (SERVER_KNOBS->ENABLE_TLOG_ENCRYPTION) { + std::unordered_map> cipherKeys = wait(getCipherKeys); + self->cipherKeys = cipherKeys; + } + return Void(); } @@ -961,6 +993,7 @@ void applyMetadataEffect(CommitBatchContext* self) { self->pProxyCommitData->logSystem, self->resolution[0].stateMutations[versionIndex][transactionIndex].mutations, /* pToCommit= */ nullptr, + /* pCipherKeys= */ nullptr, self->forceRecovery, /* version= */ self->commitVersion, /* popVersion= */ 0, @@ -1060,6 +1093,7 @@ ACTOR Future applyMetadataToCommittedTransactions(CommitBatchContext* self pProxyCommitData->logSystem, trs[t].transaction.mutations, SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS ? nullptr : &self->toCommit, + SERVER_KNOBS->ENABLE_TLOG_ENCRYPTION ? 
&self->cipherKeys : nullptr, self->forceRecovery, self->commitVersion, self->commitVersion + 1, @@ -1111,6 +1145,22 @@ ACTOR Future applyMetadataToCommittedTransactions(CommitBatchContext* self return Void(); } +void writeMutation(CommitBatchContext* self, int64_t tenantId, const MutationRef& mutation) { + static_assert(TenantInfo::INVALID_TENANT == ENCRYPT_INVALID_DOMAIN_ID); + if (!SERVER_KNOBS->ENABLE_TLOG_ENCRYPTION || tenantId == TenantInfo::INVALID_TENANT) { + // TODO(yiwu): In raw access mode, use tenant prefix to figure out tenant id for user data + bool isRawAccess = tenantId == TenantInfo::INVALID_TENANT && !isSystemKey(mutation.param1) && + !(mutation.type == MutationRef::ClearRange && isSystemKey(mutation.param2)) && + self->pProxyCommitData->db->get().client.tenantMode == TenantMode::REQUIRED; + TEST(isRawAccess); // Raw access to tenant key space + self->toCommit.writeTypedMessage(mutation); + } else { + Arena arena; + self->toCommit.writeTypedMessage( + EncryptedMutationMessage::encrypt(arena, self->cipherKeys, tenantId /*domainId*/, mutation)); + } +} + /// This second pass through committed transactions assigns the actual mutations to the appropriate storage servers' /// tags ACTOR Future assignMutationsToStorageServers(CommitBatchContext* self) { @@ -1127,6 +1177,7 @@ ACTOR Future assignMutationsToStorageServers(CommitBatchContext* self) { state Optional* trCost = &trs[self->transactionNum].commitCostEstimation; state int mutationNum = 0; state VectorRef* pMutations = &trs[self->transactionNum].transaction.mutations; + state int64_t tenantId = trs[self->transactionNum].tenantInfo.tenantId; self->toCommit.addTransactionInfo(trs[self->transactionNum].spanContext); @@ -1184,7 +1235,7 @@ ACTOR Future assignMutationsToStorageServers(CommitBatchContext* self) { if (pProxyCommitData->cacheInfo[m.param1]) { self->toCommit.addTag(cacheTag); } - self->toCommit.writeTypedMessage(m); + writeMutation(self, tenantId, m); } else if (m.type == MutationRef::ClearRange) { KeyRangeRef clearRange(KeyRangeRef(m.param1, m.param2)); auto ranges = pProxyCommitData->keyInfo.intersectingRanges(clearRange); @@ -1237,7 +1288,7 @@ ACTOR Future assignMutationsToStorageServers(CommitBatchContext* self) { if (pProxyCommitData->needsCacheTag(clearRange)) { self->toCommit.addTag(cacheTag); } - self->toCommit.writeTypedMessage(m); + writeMutation(self, tenantId, m); } else { UNREACHABLE(); } @@ -2086,21 +2137,32 @@ ACTOR Future proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co throw snap_log_anti_quorum_unsupported(); } - // send a snap request to DD - if (!commitData->db->get().distributor.present()) { - TraceEvent(SevWarnAlways, "DataDistributorNotPresent").detail("Operation", "SnapRequest"); - throw dd_not_found(); - } - state Future> ddSnapReq = commitData->db->get().distributor.get().distributorSnapReq.tryGetReply( - DistributorSnapRequest(snapReq.snapPayload, snapReq.snapUID)); - try { - wait(throwErrorOr(ddSnapReq)); - } catch (Error& e) { - TraceEvent("SnapCommitProxy_DDSnapResponseError") - .errorUnsuppressed(e) - .detail("SnapPayload", snapReq.snapPayload) - .detail("SnapUID", snapReq.snapUID); - throw e; + state int snapReqRetry = 0; + state double snapRetryBackoff = FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; + loop { + // send a snap request to DD + if (!commitData->db->get().distributor.present()) { + TraceEvent(SevWarnAlways, "DataDistributorNotPresent").detail("Operation", "SnapRequest"); + throw dd_not_found(); + } + try { + Future> ddSnapReq = + 
			    commitData->db->get().distributor.get().distributorSnapReq.tryGetReply(
+			        DistributorSnapRequest(snapReq.snapPayload, snapReq.snapUID));
+			wait(throwErrorOr(ddSnapReq));
+			break;
+		} catch (Error& e) {
+			TraceEvent("SnapCommitProxy_DDSnapResponseError")
+			    .errorUnsuppressed(e)
+			    .detail("SnapPayload", snapReq.snapPayload)
+			    .detail("SnapUID", snapReq.snapUID);
+			// Retry if we have network issues
+			if (e.code() != error_code_request_maybe_delivered ||
+			    ++snapReqRetry > SERVER_KNOBS->SNAP_NETWORK_FAILURE_RETRY_LIMIT)
+				throw e;
+			wait(delay(snapRetryBackoff));
+			snapRetryBackoff = snapRetryBackoff * 2; // exponential backoff
+		}
	}
	snapReq.reply.send(Void());
} catch (Error& e) {
@@ -2297,6 +2359,7 @@ ACTOR Future processCompleteTransactionStateRequest(TransactionStateResolv
		                      Reference(),
		                      mutations,
		                      /* pToCommit= */ nullptr,
+		                      /* pCipherKeys= */ nullptr,
		                      confChanges,
		                      /* version= */ 0,
		                      /* popVersion= */ 0,
@@ -2388,7 +2451,8 @@ ACTOR Future commitProxyServerCore(CommitProxyInterface proxy,
	// Wait until we can load the "real" logsystem, since we don't support switching them currently
	while (!(masterLifetime.isEqual(commitData.db->get().masterLifetime) &&
-	         commitData.db->get().recoveryState >= RecoveryState::RECOVERY_TRANSACTION)) {
+	         commitData.db->get().recoveryState >= RecoveryState::RECOVERY_TRANSACTION &&
+	         (!SERVER_KNOBS->ENABLE_TLOG_ENCRYPTION || commitData.db->get().encryptKeyProxy.present()))) {
		//TraceEvent("ProxyInit2", proxy.id()).detail("LSEpoch", db->get().logSystemConfig.epoch).detail("Need", epoch);
		wait(commitData.db->onChange());
	}
diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp
index d80d25c6b2..429b3c4b64 100644
--- a/fdbserver/DataDistribution.actor.cpp
+++ b/fdbserver/DataDistribution.actor.cpp
@@ -876,14 +876,26 @@ Future sendSnapReq(RequestStream stream, Req req, Error e) {
	return Void();
}

-ACTOR template
-Future> trySendSnapReq(RequestStream stream, Req req) {
-	ErrorOr reply = wait(stream.tryGetReply(req));
-	if (reply.isError()) {
-		TraceEvent("SnapDataDistributor_ReqError")
-		    .errorUnsuppressed(reply.getError())
-		    .detail("Peer", stream.getEndpoint().getPrimaryAddress());
-		return ErrorOr(reply.getError());
+ACTOR Future> trySendSnapReq(RequestStream stream, WorkerSnapRequest req) {
+	state int snapReqRetry = 0;
+	state double snapRetryBackoff = FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY;
+	loop {
+		ErrorOr reply = wait(stream.tryGetReply(req));
+		if (reply.isError()) {
+			TraceEvent("SnapDataDistributor_ReqError")
+			    .errorUnsuppressed(reply.getError())
+			    .detail("Peer", stream.getEndpoint().getPrimaryAddress());
+			if (reply.getError().code() != error_code_request_maybe_delivered ||
+			    ++snapReqRetry > SERVER_KNOBS->SNAP_NETWORK_FAILURE_RETRY_LIMIT)
+				return ErrorOr(reply.getError());
+			else {
+				// Retry network failures with the same snap UID, to avoid taking the snapshot twice
+				req = WorkerSnapRequest(req.snapPayload, req.snapUID, req.role);
+				wait(delay(snapRetryBackoff));
+				snapRetryBackoff = snapRetryBackoff * 2;
+			}
+		} else
+			break;
	}
	return ErrorOr(Void());
}
@@ -906,6 +918,124 @@ ACTOR static Future waitForMost(std::vector>> futures
	return Void();
}

+ACTOR Future>> getStatefulWorkers(
+    Database cx,
+    Reference const> dbInfo,
+    std::vector* tlogs,
+    int* storageFaultTolerance) {
+	state std::map> result;
+	state std::map workersMap;
+	state Transaction tr(cx);
+	state DatabaseConfiguration configuration;
+	loop {
+		try {
+			// necessary options
+			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
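Both proxySnapCreate() and trySendSnapReq() above now retry only error_code_request_maybe_delivered, up to SNAP_NETWORK_FAILURE_RETRY_LIMIT attempts with a doubling backoff, and they deliberately reuse the same snap UID so a duplicate delivery cannot trigger a second snapshot. A standalone sketch of that retry policy under simplified assumptions; SendResult, sendRequest(), and the constants are stand-ins for the FDB request stream and knobs:

#include <chrono>
#include <iostream>
#include <stdexcept>
#include <thread>

enum class SendResult { Ok, MaybeDelivered, HardError };

// Stand-in for tryGetReply(): fails with "maybe delivered" twice, then succeeds.
SendResult sendRequest(int attempt) {
	return attempt < 2 ? SendResult::MaybeDelivered : SendResult::Ok;
}

int main() {
	const int retryLimit = 10; // stand-in for SERVER_KNOBS->SNAP_NETWORK_FAILURE_RETRY_LIMIT
	double backoff = 0.1; // seconds; stand-in for FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY
	for (int attempt = 0;; ++attempt) {
		SendResult r = sendRequest(attempt);
		if (r == SendResult::Ok)
			break;
		// Only "request may or may not have been delivered" failures are retried;
		// any other error propagates immediately, as in trySendSnapReq().
		if (r == SendResult::HardError || attempt + 1 > retryLimit)
			throw std::runtime_error("snapshot request failed");
		std::this_thread::sleep_for(std::chrono::duration<double>(backoff));
		backoff *= 2; // exponential backoff between retries
	}
	std::cout << "snapshot request acknowledged\n";
}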
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + + // get database configuration + DatabaseConfiguration _configuration = wait(getDatabaseConfiguration(&tr)); + configuration = _configuration; + + // get storages + RangeResult serverList = wait(tr.getRange(serverListKeys, CLIENT_KNOBS->TOO_MANY)); + ASSERT(!serverList.more && serverList.size() < CLIENT_KNOBS->TOO_MANY); + state std::vector storageServers; + storageServers.reserve(serverList.size()); + for (int i = 0; i < serverList.size(); i++) + storageServers.push_back(decodeServerListValue(serverList[i].value)); + + // get workers + state std::vector workers = wait(getWorkers(dbInfo)); + for (const auto& worker : workers) { + workersMap[worker.interf.address()] = worker.interf; + } + + Optional regionsValue = + wait(tr.get(LiteralStringRef("usable_regions").withPrefix(configKeysPrefix))); + int usableRegions = 1; + if (regionsValue.present()) { + usableRegions = atoi(regionsValue.get().toString().c_str()); + } + auto masterDcId = dbInfo->get().master.locality.dcId(); + int storageFailures = 0; + for (const auto& server : storageServers) { + TraceEvent(SevDebug, "StorageServerDcIdInfo") + .detail("Address", server.address().toString()) + .detail("ServerLocalityID", server.locality.dcId()) + .detail("MasterDcID", masterDcId); + if (usableRegions == 1 || server.locality.dcId() == masterDcId) { + auto itr = workersMap.find(server.address()); + if (itr == workersMap.end()) { + TraceEvent(SevWarn, "GetStorageWorkers") + .detail("Reason", "Could not find worker for storage server") + .detail("SS", server.id()); + ++storageFailures; + } else { + if (result.count(server.address())) { + ASSERT(itr->second.id() == result[server.address()].first.id()); + if (result[server.address()].second.find("storage") == std::string::npos) + result[server.address()].second.append(",storage"); + } else { + result[server.address()] = std::make_pair(itr->second, "storage"); + } + } + } + } + // calculate fault tolerance + *storageFaultTolerance = std::min(static_cast(SERVER_KNOBS->MAX_STORAGE_SNAPSHOT_FAULT_TOLERANCE), + configuration.storageTeamSize - 1) - + storageFailures; + if (*storageFaultTolerance < 0) { + TEST(true); // Too many failed storage servers to complete snapshot + throw snap_storage_failed(); + } + // tlogs + for (const auto& tlog : *tlogs) { + TraceEvent(SevDebug, "GetStatefulWorkersTlog").detail("Addr", tlog.address()); + if (workersMap.find(tlog.address()) == workersMap.end()) { + TraceEvent(SevError, "MissingTlogWorkerInterface").detail("TlogAddress", tlog.address()); + throw snap_tlog_failed(); + } + if (result.count(tlog.address())) { + ASSERT(workersMap[tlog.address()].id() == result[tlog.address()].first.id()); + result[tlog.address()].second.append(",tlog"); + } else { + result[tlog.address()] = std::make_pair(workersMap[tlog.address()], "tlog"); + } + } + + // get coordinators + Optional coordinators = wait(tr.get(coordinatorsKey)); + if (!coordinators.present()) { + throw operation_failed(); + } + ClusterConnectionString ccs(coordinators.get().toString()); + std::vector coordinatorsAddr = wait(ccs.tryResolveHostnames()); + std::set coordinatorsAddrSet(coordinatorsAddr.begin(), coordinatorsAddr.end()); + for (const auto& worker : workers) { + // Note : only considers second address for coordinators, + // as we use primary addresses from storage and tlog interfaces above + NetworkAddress primary = worker.interf.address(); + Optional secondary = worker.interf.tLog.getEndpoint().addresses.secondaryAddress; + if 
(coordinatorsAddrSet.find(primary) != coordinatorsAddrSet.end() ||
+			     (secondary.present() && (coordinatorsAddrSet.find(secondary.get()) != coordinatorsAddrSet.end()))) {
+				if (result.count(primary)) {
+					ASSERT(workersMap[primary].id() == result[primary].first.id());
+					result[primary].second.append(",coord");
+				} else {
+					result[primary] = std::make_pair(workersMap[primary], "coord");
+				}
+			}
+		}
+		return result;
+	} catch (Error& e) {
+		wait(tr.onError(e));
+		result.clear();
+	}
+	}
+}
+
ACTOR Future ddSnapCreateCore(DistributorSnapRequest snapReq, Reference const> db) {
	state Database cx = openDBOnServer(db, TaskPriority::DefaultDelay, LockAware::True);
@@ -942,47 +1072,44 @@ ACTOR Future ddSnapCreateCore(DistributorSnapRequest snapReq, Reference, int> storageWorkersAndFailures =
-	    wait(transformErrors(getStorageWorkers(cx, db, true /* localOnly */), snap_storage_failed()));
-	const auto& [storageWorkers, storageFailures] = storageWorkersAndFailures;
-	auto const storageFaultTolerance =
-	    std::min(static_cast(SERVER_KNOBS->MAX_STORAGE_SNAPSHOT_FAULT_TOLERANCE),
-	             configuration.storageTeamSize - 1) -
-	    storageFailures;
-	if (storageFaultTolerance < 0) {
-		TEST(true); // Too many failed storage servers to complete snapshot
-		throw snap_storage_failed();
-	}
-	TraceEvent("SnapDataDistributor_GotStorageWorkers")
+
+	state int storageFaultTolerance;
+	// snap stateful nodes
+	state std::map> statefulWorkers =
+	    wait(transformErrors(getStatefulWorkers(cx, db, &tlogs, &storageFaultTolerance), snap_storage_failed()));
+
+	TraceEvent("SnapDataDistributor_GotStatefulWorkers")
	    .detail("SnapPayload", snapReq.snapPayload)
	    .detail("SnapUID", snapReq.snapUID);
+
+	// We need to snapshot the storage nodes before snapshotting any tlogs
	std::vector>> storageSnapReqs;
-	storageSnapReqs.reserve(storageWorkers.size());
-	for (const auto& worker : storageWorkers) {
-		storageSnapReqs.push_back(trySendSnapReq(
-		    worker.workerSnapReq, WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "storage"_sr)));
+	for (const auto& [addr, entry] : statefulWorkers) {
+		auto& [interf, role] = entry;
+		if (role.find("storage") != std::string::npos)
+			storageSnapReqs.push_back(trySendSnapReq(
+			    interf.workerSnapReq, WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "storage"_sr)));
	}
	wait(waitForMost(storageSnapReqs, storageFaultTolerance, snap_storage_failed()));
-
	TraceEvent("SnapDataDistributor_AfterSnapStorage")
+	    .detail("FaultTolerance", storageFaultTolerance)
	    .detail("SnapPayload", snapReq.snapPayload)
	    .detail("SnapUID", snapReq.snapUID);

-	// snap local tlog nodes
-	std::vector> tLogSnapReqs;
+
+	std::vector>> tLogSnapReqs;
	tLogSnapReqs.reserve(tlogs.size());
-	for (const auto& tlog : tlogs) {
-		tLogSnapReqs.push_back(sendSnapReq(tlog.snapRequest,
-		                                   TLogSnapRequest{ snapReq.snapPayload, snapReq.snapUID, "tlog"_sr },
-		                                   snap_tlog_failed()));
+	for (const auto& [addr, entry] : statefulWorkers) {
+		auto& [interf, role] = entry;
+		if (role.find("tlog") != std::string::npos)
+			tLogSnapReqs.push_back(trySendSnapReq(
+			    interf.workerSnapReq, WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "tlog"_sr)));
	}
-	wait(waitForAll(tLogSnapReqs));
+	wait(waitForMost(tLogSnapReqs, 0, snap_tlog_failed()));
	TraceEvent("SnapDataDistributor_AfterTLogStorage")
	    .detail("SnapPayload", snapReq.snapPayload)
	    .detail("SnapUID", snapReq.snapUID);
+
	// enable tlog pop on local tlog nodes
	std::vector> enablePops;
	enablePops.reserve(tlogs.size());
@@ -995,20 +1122,18 @@ ACTOR Future ddSnapCreateCore(DistributorSnapRequest snapReq, Reference
coordWorkers = wait(getCoordWorkers(cx, db)); - TraceEvent("SnapDataDistributor_GotCoordWorkers") - .detail("SnapPayload", snapReq.snapPayload) - .detail("SnapUID", snapReq.snapUID); + std::vector>> coordSnapReqs; - coordSnapReqs.reserve(coordWorkers.size()); - for (const auto& worker : coordWorkers) { - coordSnapReqs.push_back(trySendSnapReq( - worker.workerSnapReq, WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "coord"_sr))); + for (const auto& [addr, entry] : statefulWorkers) { + auto& [interf, role] = entry; + if (role.find("coord") != std::string::npos) + coordSnapReqs.push_back(trySendSnapReq( + interf.workerSnapReq, WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "coord"_sr))); } auto const coordFaultTolerance = std::min(std::max(0, coordSnapReqs.size() / 2 - 1), SERVER_KNOBS->MAX_COORDINATOR_SNAPSHOT_FAULT_TOLERANCE); wait(waitForMost(coordSnapReqs, coordFaultTolerance, snap_coord_failed())); + TraceEvent("SnapDataDistributor_AfterSnapCoords") .detail("SnapPayload", snapReq.snapPayload) .detail("SnapUID", snapReq.snapUID); @@ -1056,37 +1181,48 @@ ACTOR Future ddSnapCreateCore(DistributorSnapRequest snapReq, Reference ddSnapCreate(DistributorSnapRequest snapReq, - Reference const> db, - DDEnabledState* ddEnabledState) { +ACTOR Future ddSnapCreate( + DistributorSnapRequest snapReq, + Reference const> db, + DDEnabledState* ddEnabledState, + std::map* ddSnapMap /* ongoing snapshot requests */, + std::map>* + ddSnapResultMap /* finished snapshot requests, expired in SNAP_MINIMUM_TIME_GAP seconds */) { state Future dbInfoChange = db->onChange(); if (!ddEnabledState->setDDEnabled(false, snapReq.snapUID)) { // disable DD before doing snapCreate, if previous snap req has already disabled DD then this operation fails // here - TraceEvent("SnapDDSetDDEnabledFailedInMemoryCheck").log(); - snapReq.reply.sendError(operation_failed()); + TraceEvent("SnapDDSetDDEnabledFailedInMemoryCheck").detail("SnapUID", snapReq.snapUID); + ddSnapMap->at(snapReq.snapUID).reply.sendError(operation_failed()); + ddSnapMap->erase(snapReq.snapUID); + (*ddSnapResultMap)[snapReq.snapUID] = ErrorOr(operation_failed()); return Void(); } - double delayTime = g_network->isSimulated() ? 
70.0 : SERVER_KNOBS->SNAP_CREATE_MAX_TIMEOUT; try { choose { when(wait(dbInfoChange)) { TraceEvent("SnapDDCreateDBInfoChanged") .detail("SnapPayload", snapReq.snapPayload) .detail("SnapUID", snapReq.snapUID); - snapReq.reply.sendError(snap_with_recovery_unsupported()); + ddSnapMap->at(snapReq.snapUID).reply.sendError(snap_with_recovery_unsupported()); + ddSnapMap->erase(snapReq.snapUID); + (*ddSnapResultMap)[snapReq.snapUID] = ErrorOr(snap_with_recovery_unsupported()); } when(wait(ddSnapCreateCore(snapReq, db))) { TraceEvent("SnapDDCreateSuccess") .detail("SnapPayload", snapReq.snapPayload) .detail("SnapUID", snapReq.snapUID); - snapReq.reply.send(Void()); + ddSnapMap->at(snapReq.snapUID).reply.send(Void()); + ddSnapMap->erase(snapReq.snapUID); + (*ddSnapResultMap)[snapReq.snapUID] = ErrorOr(Void()); } - when(wait(delay(delayTime))) { + when(wait(delay(SERVER_KNOBS->SNAP_CREATE_MAX_TIMEOUT))) { TraceEvent("SnapDDCreateTimedOut") .detail("SnapPayload", snapReq.snapPayload) .detail("SnapUID", snapReq.snapUID); - snapReq.reply.sendError(timed_out()); + ddSnapMap->at(snapReq.snapUID).reply.sendError(timed_out()); + ddSnapMap->erase(snapReq.snapUID); + (*ddSnapResultMap)[snapReq.snapUID] = ErrorOr(timed_out()); } } } catch (Error& e) { @@ -1095,7 +1231,9 @@ ACTOR Future ddSnapCreate(DistributorSnapRequest snapReq, .detail("SnapPayload", snapReq.snapPayload) .detail("SnapUID", snapReq.snapUID); if (e.code() != error_code_operation_cancelled) { - snapReq.reply.sendError(e); + ddSnapMap->at(snapReq.snapUID).reply.sendError(e); + ddSnapMap->erase(snapReq.snapUID); + (*ddSnapResultMap)[snapReq.snapUID] = ErrorOr(e); } else { // enable DD should always succeed bool success = ddEnabledState->setDDEnabled(true, snapReq.snapUID); @@ -1246,6 +1384,8 @@ ACTOR Future dataDistributor(DataDistributorInterface di, Reference ddSnapReqMap; + state std::map> ddSnapReqResultMap; self->addActor.send(actors.getResult()); self->addActor.send(traceRole(Role::DATA_DISTRIBUTOR, di.id())); @@ -1273,7 +1413,30 @@ ACTOR Future dataDistributor(DataDistributorInterface di, Referenceerase(snapUID); + return Void(); + }, + delay(SERVER_KNOBS->SNAP_MINIMUM_TIME_GAP))); + } } when(DistributorExclusionSafetyCheckRequest exclCheckReq = waitNext(di.distributorExclCheckReq.getFuture())) { diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index 140ededf82..55b797290d 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -43,7 +43,7 @@ BandwidthStatus getBandwidthStatus(StorageMetrics const& metrics) { } ReadBandwidthStatus getReadBandwidthStatus(StorageMetrics const& metrics) { - if (metrics.bytesReadPerKSecond <= SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS || + if (metrics.bytesReadPerKSecond <= SERVER_KNOBS->SHARD_READ_HOT_BANDWIDTH_MIN_PER_KSECONDS || metrics.bytesReadPerKSecond <= SERVER_KNOBS->SHARD_MAX_READ_DENSITY_RATIO * metrics.bytes * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS) { return ReadBandwidthStatusNormal; @@ -238,7 +238,7 @@ ACTOR Future trackShardMetrics(DataDistributionTracker::SafeAccessor self, std::max((int64_t)(SERVER_KNOBS->SHARD_MAX_READ_DENSITY_RATIO * bytes * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS * (1.0 + SERVER_KNOBS->SHARD_MAX_BYTES_READ_PER_KSEC_JITTER)), - SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS); + SERVER_KNOBS->SHARD_READ_HOT_BANDWIDTH_MIN_PER_KSECONDS); bounds.min.bytesReadPerKSecond = 0; 
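The dataDistributor() changes above keep two maps keyed by snapshot UID: ddSnapReqMap for in-flight requests and ddSnapReqResultMap for finished ones, whose entries are dropped only after SNAP_MINIMUM_TIME_GAP, so a retried request with the same UID is answered from the cache instead of re-running the snapshot. A simplified single-threaded sketch of that de-duplication scheme; SnapResult and handleSnapRequest() are stand-ins, and the real code does this with actors and ReplyPromises:

#include <iostream>
#include <map>
#include <optional>
#include <string>

struct SnapResult {
	bool ok;
	std::string error;
};

std::map<std::string, bool> inFlight; // UID -> request currently being processed
std::map<std::string, SnapResult> recentResults; // UID -> finished result (expires after a gap)

std::optional<SnapResult> handleSnapRequest(const std::string& uid) {
	if (auto it = recentResults.find(uid); it != recentResults.end())
		return it->second; // duplicate of a finished request: replay the cached result
	if (inFlight.count(uid))
		return std::nullopt; // duplicate of an in-flight request: the original reply will serve it
	inFlight[uid] = true;
	// ... ddSnapCreateCore-equivalent work would run here ...
	inFlight.erase(uid);
	// Cache the outcome until the expiry timer (SNAP_MINIMUM_TIME_GAP in the diff) fires.
	return recentResults[uid] = SnapResult{ true, "" };
}

int main() {
	handleSnapRequest("uid-1");
	auto dup = handleSnapRequest("uid-1"); // served from the result cache, no second snapshot
	std::cout << (dup && dup->ok ? "replayed cached success\n" : "unexpected\n");
}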
bounds.permittedError.bytesReadPerKSecond = bounds.min.bytesReadPerKSecond / 4; } else if (readBandwidthStatus == ReadBandwidthStatusHigh) { @@ -291,7 +291,7 @@ ACTOR Future trackShardMetrics(DataDistributionTracker::SafeAccessor self, .detail("Keys", keys) .detail("UpdatedSize", metrics.metrics.bytes) .detail("Bandwidth", metrics.metrics.bytesPerKSecond) - .detail("BandwithStatus", getBandwidthStatus(metrics)) + .detail("BandwidthStatus", getBandwidthStatus(metrics)) .detail("BytesLower", bounds.min.bytes) .detail("BytesUpper", bounds.max.bytes) .detail("BandwidthLower", bounds.min.bytesPerKSecond) @@ -380,7 +380,7 @@ ACTOR Future>> getSplitKeys(DataDistributionTracker state Transaction tr(self->cx); try { Standalone> keys = - wait(self->cx->splitStorageMetrics(splitRange, splitMetrics, estimated)); + wait(self->cx->splitStorageMetrics(splitRange, splitMetrics, estimated, SERVER_KNOBS->MIN_SHARD_BYTES)); return keys; } catch (Error& e) { wait(tr.onError(e)); diff --git a/fdbserver/EncryptKeyProxy.actor.cpp b/fdbserver/EncryptKeyProxy.actor.cpp index f1b66e276d..554221dc19 100644 --- a/fdbserver/EncryptKeyProxy.actor.cpp +++ b/fdbserver/EncryptKeyProxy.actor.cpp @@ -42,6 +42,7 @@ #include "flow/genericactors.actor.h" #include "flow/network.h" +#include #include #include #include diff --git a/fdbserver/FDBExecHelper.actor.cpp b/fdbserver/FDBExecHelper.actor.cpp index 6d2ea7fcb2..a8110f438e 100644 --- a/fdbserver/FDBExecHelper.actor.cpp +++ b/fdbserver/FDBExecHelper.actor.cpp @@ -426,14 +426,12 @@ ACTOR Future execHelper(ExecCmdValueString* execArg, UID snapUID, std::stri } else { // copy the files state std::string folderFrom = folder + "/."; - state std::string folderTo = folder + "-snap-" + uidStr.toString(); - double maxSimDelayTime = 10.0; - folderTo = folder + "-snap-" + uidStr.toString() + "-" + role; + state std::string folderTo = folder + "-snap-" + uidStr.toString() + "-" + role; std::vector paramList; std::string mkdirBin = "/bin/mkdir"; paramList.push_back(mkdirBin); paramList.push_back(folderTo); - cmdErr = spawnProcess(mkdirBin, paramList, maxWaitTime, false /*isSync*/, maxSimDelayTime); + cmdErr = spawnProcess(mkdirBin, paramList, maxWaitTime, false /*isSync*/, 10.0); wait(success(cmdErr)); err = cmdErr.get(); if (err == 0) { diff --git a/fdbserver/GetEncryptCipherKeys.actor.cpp b/fdbserver/GetEncryptCipherKeys.actor.cpp index 21f23753e6..328ff21587 100644 --- a/fdbserver/GetEncryptCipherKeys.actor.cpp +++ b/fdbserver/GetEncryptCipherKeys.actor.cpp @@ -1,5 +1,5 @@ /* - * GetCipherKeys.actor.cpp + * GetEncryptCipherKeys.actor.cpp * * This source file is part of the FoundationDB open source project * @@ -38,7 +38,7 @@ ACTOR Future onEncryptKeyProxyChange(Reference cons break; } } - TraceEvent("GetCipherKeys_EncryptKeyProxyChanged") + TraceEvent("GetEncryptCipherKeys_EncryptKeyProxyChanged") .detail("PreviousProxyId", previousProxyId.orDefault(UID())) .detail("CurrentProxyId", currentProxyId.orDefault(UID())); return Void(); @@ -50,19 +50,19 @@ ACTOR Future getUncachedLatestEncryptCipherKeys Optional proxy = db->get().encryptKeyProxy; if (!proxy.present()) { // Wait for onEncryptKeyProxyChange. 
- TraceEvent("GetLatestCipherKeys_EncryptKeyProxyNotPresent"); + TraceEvent("GetLatestEncryptCipherKeys_EncryptKeyProxyNotPresent"); return Never(); } request.reply.reset(); try { EKPGetLatestBaseCipherKeysReply reply = wait(proxy.get().getLatestBaseCipherKeys.getReply(request)); if (reply.error.present()) { - TraceEvent(SevWarn, "GetLatestCipherKeys_RequestFailed").error(reply.error.get()); + TraceEvent(SevWarn, "GetLatestEncryptCipherKeys_RequestFailed").error(reply.error.get()); throw encrypt_keys_fetch_failed(); } return reply; } catch (Error& e) { - TraceEvent("GetLatestCipherKeys_CaughtError").error(e); + TraceEvent("GetLatestEncryptCipherKeys_CaughtError").error(e); if (e.code() == error_code_broken_promise) { // Wait for onEncryptKeyProxyChange. return Never(); @@ -81,7 +81,7 @@ ACTOR Future> state EKPGetLatestBaseCipherKeysRequest request; if (!db.isValid()) { - TraceEvent(SevError, "GetLatestCipherKeys_ServerDBInfoNotAvailable"); + TraceEvent(SevError, "GetLatestEncryptCipherKeys_ServerDBInfoNotAvailable"); throw encrypt_ops_error(); } @@ -114,7 +114,7 @@ ACTOR Future> // Check for any missing cipher keys. for (auto& domain : request.encryptDomainInfos) { if (cipherKeys.count(domain.domainId) == 0) { - TraceEvent(SevWarn, "GetLatestCipherKeys_KeyMissing").detail("DomainId", domain.domainId); + TraceEvent(SevWarn, "GetLatestEncryptCipherKeys_KeyMissing").detail("DomainId", domain.domainId); throw encrypt_key_not_found(); } } @@ -133,19 +133,19 @@ ACTOR Future getUncachedEncryptCipherKeys(Refere Optional proxy = db->get().encryptKeyProxy; if (!proxy.present()) { // Wait for onEncryptKeyProxyChange. - TraceEvent("GetCipherKeys_EncryptKeyProxyNotPresent"); + TraceEvent("GetEncryptCipherKeys_EncryptKeyProxyNotPresent"); return Never(); } request.reply.reset(); try { EKPGetBaseCipherKeysByIdsReply reply = wait(proxy.get().getBaseCipherKeysByIds.getReply(request)); if (reply.error.present()) { - TraceEvent(SevWarn, "GetCipherKeys_RequestFailed").error(reply.error.get()); + TraceEvent(SevWarn, "GetEncryptCipherKeys_RequestFailed").error(reply.error.get()); throw encrypt_keys_fetch_failed(); } return reply; } catch (Error& e) { - TraceEvent("GetCipherKeys_CaughtError").error(e); + TraceEvent("GetEncryptCipherKeys_CaughtError").error(e); if (e.code() == error_code_broken_promise) { // Wait for onEncryptKeyProxyChange. return Never(); @@ -167,7 +167,7 @@ ACTOR Future>> ge state EKPGetBaseCipherKeysByIdsRequest request; if (!db.isValid()) { - TraceEvent(SevError, "GetCipherKeys_ServerDBInfoNotAvailable"); + TraceEvent(SevError, "GetEncryptCipherKeys_ServerDBInfoNotAvailable"); throw encrypt_ops_error(); } @@ -204,7 +204,7 @@ ACTOR Future>> ge BaseCipherIndex baseIdx = std::make_pair(details.encryptDomainId, details.baseCipherId); const auto& itr = baseCipherKeys.find(baseIdx); if (itr == baseCipherKeys.end()) { - TraceEvent(SevError, "GetCipherKeys_KeyMissing") + TraceEvent(SevError, "GetEncryptCipherKeys_KeyMissing") .detail("DomainId", details.encryptDomainId) .detail("BaseCipherId", details.baseCipherId); throw encrypt_key_not_found(); diff --git a/fdbserver/GlobalTagThrottler.actor.cpp b/fdbserver/GlobalTagThrottler.actor.cpp new file mode 100644 index 0000000000..5320f3671c --- /dev/null +++ b/fdbserver/GlobalTagThrottler.actor.cpp @@ -0,0 +1,533 @@ +/* + * GlobalTagThrottler.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/FDBTypes.h" +#include "fdbclient/TagThrottle.actor.h" +#include "fdbrpc/Smoother.h" +#include "fdbserver/TagThrottler.h" + +#include + +#include "flow/actorcompiler.h" // must be last include + +class GlobalTagThrottlerImpl { + class QuotaAndCounters { + Optional quota; + std::unordered_map ssToReadCostRate; + std::unordered_map ssToWriteCostRate; + Smoother totalReadCostRate; + Smoother totalWriteCostRate; + Smoother transactionCounter; + Smoother perClientRate; + + Optional getReadTPSLimit() const { + if (totalReadCostRate.smoothTotal() > 0) { + return quota.get().totalReadQuota * transactionCounter.smoothRate() / totalReadCostRate.smoothTotal(); + } else { + return {}; + } + } + + Optional getWriteTPSLimit() const { + if (totalWriteCostRate.smoothTotal() > 0) { + return quota.get().totalWriteQuota * transactionCounter.smoothRate() / totalWriteCostRate.smoothTotal(); + } else { + return {}; + } + } + + public: + QuotaAndCounters() + : totalReadCostRate(SERVER_KNOBS->GLOBAL_TAG_THROTTLING_FOLDING_TIME), + totalWriteCostRate(SERVER_KNOBS->GLOBAL_TAG_THROTTLING_FOLDING_TIME), + transactionCounter(SERVER_KNOBS->GLOBAL_TAG_THROTTLING_FOLDING_TIME), + perClientRate(SERVER_KNOBS->GLOBAL_TAG_THROTTLING_FOLDING_TIME) {} + + void setQuota(ThrottleApi::TagQuotaValue const& quota) { this->quota = quota; } + + void updateReadCostRate(UID ssId, double newReadCostRate) { + auto& currentReadCostRate = ssToReadCostRate[ssId]; + auto diff = newReadCostRate - currentReadCostRate; + currentReadCostRate += diff; + totalReadCostRate.addDelta(diff); + } + + void updateWriteCostRate(UID ssId, double newWriteCostRate) { + auto& currentWriteCostRate = ssToWriteCostRate[ssId]; + auto diff = newWriteCostRate - currentWriteCostRate; + currentWriteCostRate += diff; + totalWriteCostRate.addDelta(diff); + } + + void addTransactions(int count) { transactionCounter.addDelta(count); } + + Optional getTargetTotalTPSLimit() const { + if (!quota.present()) + return {}; + auto readLimit = getReadTPSLimit(); + auto writeLimit = getWriteTPSLimit(); + + // TODO: Implement expiration logic + if (!readLimit.present() && !writeLimit.present()) { + return {}; + } else { + if (!readLimit.present()) { + return writeLimit.get(); + } else if (!writeLimit.present()) { + return readLimit.get(); + } else { + return std::min(readLimit.get(), writeLimit.get()); + } + } + } + + Optional updateAndGetPerClientLimit() { + auto targetRate = getTargetTotalTPSLimit(); + if (targetRate.present() && transactionCounter.smoothRate() > 0) { + auto newPerClientRate = std::max( + SERVER_KNOBS->GLOBAL_TAG_THROTTLING_MIN_RATE, + std::min(targetRate.get(), + (targetRate.get() / transactionCounter.smoothRate()) * perClientRate.smoothTotal())); + perClientRate.setTotal(newPerClientRate); + return ClientTagThrottleLimits(perClientRate.getTotal(), ClientTagThrottleLimits::NO_EXPIRATION); + } else { + return {}; + } + } + + void 
processTraceEvent(TraceEvent& te) const { + if (quota.present()) { + te.detail("ProvidedReadTPSLimit", getReadTPSLimit()) + .detail("ProvidedWriteTPSLimit", getWriteTPSLimit()) + .detail("ReadCostRate", totalReadCostRate.smoothTotal()) + .detail("WriteCostRate", totalWriteCostRate.smoothTotal()) + .detail("TotalReadQuota", quota.get().totalReadQuota) + .detail("ReservedReadQuota", quota.get().reservedReadQuota) + .detail("TotalWriteQuota", quota.get().totalWriteQuota) + .detail("ReservedWriteQuota", quota.get().reservedWriteQuota); + } + } + }; + + Database db; + UID id; + std::map trackedTags; + uint64_t throttledTagChangeId{ 0 }; + Future traceActor; + + ACTOR static Future tracer(GlobalTagThrottlerImpl const* self) { + loop { + for (const auto& [tag, quotaAndCounters] : self->trackedTags) { + TraceEvent te("GlobalTagThrottling"); + te.detail("Tag", tag); + quotaAndCounters.processTraceEvent(te); + } + wait(delay(SERVER_KNOBS->GLOBAL_TAG_THROTTLING_TRACE_INTERVAL)); + } + } + + ACTOR static Future monitorThrottlingChanges(GlobalTagThrottlerImpl* self) { + loop { + state ReadYourWritesTransaction tr(self->db); + + loop { + // TODO: Clean up quotas that have been removed + try { + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + + state RangeResult currentQuotas = wait(tr.getRange(tagQuotaKeys, CLIENT_KNOBS->TOO_MANY)); + TraceEvent("GlobalTagThrottler_ReadCurrentQuotas").detail("Size", currentQuotas.size()); + for (auto const kv : currentQuotas) { + auto const tag = kv.key.removePrefix(tagQuotaPrefix); + auto const quota = ThrottleApi::TagQuotaValue::fromValue(kv.value); + self->trackedTags[tag].setQuota(quota); + } + + ++self->throttledTagChangeId; + // FIXME: Should wait on watch instead + // wait(tr.watch(tagThrottleSignalKey)); + wait(delay(5.0)); + TraceEvent("GlobalTagThrottler_ChangeSignaled"); + TEST(true); // Global tag throttler detected quota changes + break; + } catch (Error& e) { + TraceEvent("GlobalTagThrottlerMonitoringChangesError", self->id).error(e); + wait(tr.onError(e)); + } + } + } + } + +public: + GlobalTagThrottlerImpl(Database db, UID id) : db(db), id(id) { traceActor = tracer(this); } + Future monitorThrottlingChanges() { return monitorThrottlingChanges(this); } + void addRequests(TransactionTag tag, int count) { trackedTags[tag].addTransactions(count); } + uint64_t getThrottledTagChangeId() const { return throttledTagChangeId; } + PrioritizedTransactionTagMap getClientRates() { + // TODO: For now, only enforce total throttling rates. + // We should use reserved quotas as well. 
+ PrioritizedTransactionTagMap result; + for (auto& [tag, quotaAndCounters] : trackedTags) { + // Currently there is no differentiation between batch priority and default priority transactions + auto const limit = quotaAndCounters.updateAndGetPerClientLimit(); + if (limit.present()) { + result[TransactionPriority::BATCH][tag] = result[TransactionPriority::DEFAULT][tag] = limit.get(); + } + } + return result; + } + int64_t autoThrottleCount() const { return trackedTags.size(); } + uint32_t busyReadTagCount() const { + // TODO: Implement + return 0; + } + uint32_t busyWriteTagCount() const { + // TODO: Implement + return 0; + } + int64_t manualThrottleCount() const { return trackedTags.size(); } + Future tryUpdateAutoThrottling(StorageQueueInfo const& ss) { + for (const auto& busyReadTag : ss.busiestReadTags) { + trackedTags[busyReadTag.tag].updateReadCostRate(ss.id, busyReadTag.rate); + } + for (const auto& busyWriteTag : ss.busiestWriteTags) { + trackedTags[busyWriteTag.tag].updateWriteCostRate(ss.id, busyWriteTag.rate); + } + // TODO: Call ThrottleApi::throttleTags + return Void(); + } + + void setQuota(TransactionTagRef tag, ThrottleApi::TagQuotaValue const& tagQuotaValue) { + trackedTags[tag].setQuota(tagQuotaValue); + } +}; + +GlobalTagThrottler::GlobalTagThrottler(Database db, UID id) : impl(PImpl::create(db, id)) {} + +GlobalTagThrottler::~GlobalTagThrottler() = default; + +Future GlobalTagThrottler::monitorThrottlingChanges() { + return impl->monitorThrottlingChanges(); +} +void GlobalTagThrottler::addRequests(TransactionTag tag, int count) { + return impl->addRequests(tag, count); +} +uint64_t GlobalTagThrottler::getThrottledTagChangeId() const { + return impl->getThrottledTagChangeId(); +} +PrioritizedTransactionTagMap GlobalTagThrottler::getClientRates() { + return impl->getClientRates(); +} +int64_t GlobalTagThrottler::autoThrottleCount() const { + return impl->autoThrottleCount(); +} +uint32_t GlobalTagThrottler::busyReadTagCount() const { + return impl->busyReadTagCount(); +} +uint32_t GlobalTagThrottler::busyWriteTagCount() const { + return impl->busyWriteTagCount(); +} +int64_t GlobalTagThrottler::manualThrottleCount() const { + return impl->manualThrottleCount(); +} +bool GlobalTagThrottler::isAutoThrottlingEnabled() const { + return true; +} +Future GlobalTagThrottler::tryUpdateAutoThrottling(StorageQueueInfo const& ss) { + return impl->tryUpdateAutoThrottling(ss); +} + +void GlobalTagThrottler::setQuota(TransactionTagRef tag, ThrottleApi::TagQuotaValue const& tagQuotaValue) { + return impl->setQuota(tag, tagQuotaValue); +} + +namespace GlobalTagThrottlerTesting { + +Optional getTPSLimit(GlobalTagThrottler& globalTagThrottler, TransactionTag tag) { + auto clientRates = globalTagThrottler.getClientRates(); + auto it1 = clientRates.find(TransactionPriority::DEFAULT); + if (it1 != clientRates.end()) { + auto it2 = it1->second.find(tag); + if (it2 != it1->second.end()) { + return it2->second.tpsRate; + } + } + return {}; +} + +class StorageServerCollection { + class Cost { + Smoother smoother; + + public: + Cost() : smoother(5.0) {} + Cost& operator+=(double delta) { + smoother.addDelta(delta); + return *this; + } + double smoothRate() const { return smoother.smoothRate(); } + }; + + std::vector> readCosts; + std::vector> writeCosts; + +public: + StorageServerCollection(size_t size) : readCosts(size), writeCosts(size) { ASSERT_GT(size, 0); } + + void addReadCost(TransactionTag tag, double cost) { + auto const costPerSS = cost / readCosts.size(); + for (auto& readCost : 
readCosts) { + readCost[tag] += costPerSS; + } + } + + void addWriteCost(TransactionTag tag, double cost) { + auto const costPerSS = cost / writeCosts.size(); + for (auto& writeCost : writeCosts) { + writeCost[tag] += costPerSS; + } + } + + std::vector getStorageQueueInfos() const { + std::vector result; + result.reserve(readCosts.size()); + for (int i = 0; i < readCosts.size(); ++i) { + StorageQueueInfo sqInfo(UID(i, i), LocalityData{}); + for (const auto& [tag, readCost] : readCosts[i]) { + double fractionalBusyness{ 0.0 }; // unused for global tag throttling + sqInfo.busiestReadTags.emplace_back(tag, readCost.smoothRate(), fractionalBusyness); + } + for (const auto& [tag, writeCost] : writeCosts[i]) { + double fractionalBusyness{ 0.0 }; // unused for global tag throttling + sqInfo.busiestWriteTags.emplace_back(tag, writeCost.smoothRate(), fractionalBusyness); + } + result.push_back(sqInfo); + } + return result; + } +}; + +ACTOR static Future runClient(GlobalTagThrottler* globalTagThrottler, + StorageServerCollection* storageServers, + TransactionTag tag, + double desiredTpsRate, + double costPerTransaction, + bool write) { + loop { + auto tpsLimit = getTPSLimit(*globalTagThrottler, tag); + state double tpsRate = tpsLimit.present() ? std::min(desiredTpsRate, tpsLimit.get()) : desiredTpsRate; + wait(delay(1 / tpsRate)); + if (write) { + storageServers->addWriteCost(tag, costPerTransaction); + } else { + storageServers->addReadCost(tag, costPerTransaction); + } + globalTagThrottler->addRequests(tag, 1); + } +} + +ACTOR static Future monitorClientRates(GlobalTagThrottler* globalTagThrottler, + TransactionTag tag, + double desiredTPSLimit) { + state int successes = 0; + loop { + wait(delay(1.0)); + auto currentTPSLimit = getTPSLimit(*globalTagThrottler, tag); + if (currentTPSLimit.present()) { + TraceEvent("GlobalTagThrottling_RateMonitor") + .detail("Tag", tag) + .detail("CurrentTPSRate", currentTPSLimit.get()) + .detail("DesiredTPSRate", desiredTPSLimit); + if (abs(currentTPSLimit.get() - desiredTPSLimit) < 1.0) { + if (++successes == 3) { + return Void(); + } + } else { + successes = 0; + } + } else { + successes = 0; + } + } +} + +ACTOR static Future updateGlobalTagThrottler(GlobalTagThrottler* globalTagThrottler, + StorageServerCollection const* storageServers) { + loop { + wait(delay(1.0)); + auto const storageQueueInfos = storageServers->getStorageQueueInfos(); + for (const auto& sq : storageQueueInfos) { + globalTagThrottler->tryUpdateAutoThrottling(sq); + } + } +} + +} // namespace GlobalTagThrottlerTesting + +TEST_CASE("/GlobalTagThrottler/Simple") { + state GlobalTagThrottler globalTagThrottler(Database{}, UID{}); + state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10); + ThrottleApi::TagQuotaValue tagQuotaValue; + TransactionTag testTag = "sampleTag1"_sr; + tagQuotaValue.totalReadQuota = 100.0; + globalTagThrottler.setQuota(testTag, tagQuotaValue); + state Future client = + GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag, 5.0, 6.0, false); + state Future monitor = + GlobalTagThrottlerTesting::monitorClientRates(&globalTagThrottler, testTag, 100.0 / 6.0); + state Future updater = + GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers); + wait(timeoutError(monitor || client || updater, 300.0)); + return Void(); +} + +TEST_CASE("/GlobalTagThrottler/WriteThrottling") { + state GlobalTagThrottler globalTagThrottler(Database{}, UID{}); + state GlobalTagThrottlerTesting::StorageServerCollection 
storageServers(10); + ThrottleApi::TagQuotaValue tagQuotaValue; + TransactionTag testTag = "sampleTag1"_sr; + tagQuotaValue.totalWriteQuota = 100.0; + globalTagThrottler.setQuota(testTag, tagQuotaValue); + state Future client = + GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag, 5.0, 6.0, true); + state Future monitor = + GlobalTagThrottlerTesting::monitorClientRates(&globalTagThrottler, testTag, 100.0 / 6.0); + state Future updater = + GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers); + wait(timeoutError(monitor || client || updater, 300.0)); + return Void(); +} + +TEST_CASE("/GlobalTagThrottler/MultiTagThrottling") { + state GlobalTagThrottler globalTagThrottler(Database{}, UID{}); + state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10); + ThrottleApi::TagQuotaValue tagQuotaValue; + TransactionTag testTag1 = "sampleTag1"_sr; + TransactionTag testTag2 = "sampleTag2"_sr; + tagQuotaValue.totalReadQuota = 100.0; + globalTagThrottler.setQuota(testTag1, tagQuotaValue); + globalTagThrottler.setQuota(testTag2, tagQuotaValue); + state std::vector> futures; + state std::vector> monitorFutures; + futures.push_back( + GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag1, 5.0, 6.0, false)); + futures.push_back( + GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag2, 5.0, 6.0, false)); + futures.push_back(GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers)); + monitorFutures.push_back(GlobalTagThrottlerTesting::monitorClientRates(&globalTagThrottler, testTag1, 100.0 / 6.0)); + monitorFutures.push_back(GlobalTagThrottlerTesting::monitorClientRates(&globalTagThrottler, testTag2, 100.0 / 6.0)); + wait(timeoutError(waitForAny(futures) || waitForAll(monitorFutures), 300.0)); + return Void(); +} + +TEST_CASE("/GlobalTagThrottler/ActiveThrottling") { + state GlobalTagThrottler globalTagThrottler(Database{}, UID{}); + state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10); + ThrottleApi::TagQuotaValue tagQuotaValue; + TransactionTag testTag = "sampleTag1"_sr; + tagQuotaValue.totalReadQuota = 100.0; + globalTagThrottler.setQuota(testTag, tagQuotaValue); + state Future client = + GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag, 20.0, 10.0, false); + state Future monitor = GlobalTagThrottlerTesting::monitorClientRates(&globalTagThrottler, testTag, 10.0); + state Future updater = + GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers); + wait(timeoutError(monitor || client || updater, 300.0)); + return Void(); +} + +TEST_CASE("/GlobalTagThrottler/MultiClientThrottling") { + state GlobalTagThrottler globalTagThrottler(Database{}, UID{}); + state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10); + ThrottleApi::TagQuotaValue tagQuotaValue; + TransactionTag testTag = "sampleTag1"_sr; + tagQuotaValue.totalReadQuota = 100.0; + globalTagThrottler.setQuota(testTag, tagQuotaValue); + state Future client = + GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag, 5.0, 6.0, false); + state Future client2 = + GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag, 5.0, 6.0, false); + state Future monitor = + GlobalTagThrottlerTesting::monitorClientRates(&globalTagThrottler, testTag, 100.0 / 6.0); + state Future updater = + 
GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers); + wait(timeoutError(monitor || client || updater, 300.0)); + return Void(); +} + +TEST_CASE("/GlobalTagThrottler/MultiClientActiveThrottling") { + state GlobalTagThrottler globalTagThrottler(Database{}, UID{}); + state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10); + ThrottleApi::TagQuotaValue tagQuotaValue; + TransactionTag testTag = "sampleTag1"_sr; + tagQuotaValue.totalReadQuota = 100.0; + globalTagThrottler.setQuota(testTag, tagQuotaValue); + state Future client = + GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag, 20.0, 10.0, false); + state Future client2 = + GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag, 20.0, 10.0, false); + state Future monitor = GlobalTagThrottlerTesting::monitorClientRates(&globalTagThrottler, testTag, 5.0); + state Future updater = + GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers); + wait(timeoutError(monitor || client || updater, 300.0)); + return Void(); +} + +// Global transaction rate should be 20.0, with a distribution of (5, 15) between the 2 clients +TEST_CASE("/GlobalTagThrottler/SkewedMultiClientActiveThrottling") { + state GlobalTagThrottler globalTagThrottler(Database{}, UID{}); + state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10); + ThrottleApi::TagQuotaValue tagQuotaValue; + TransactionTag testTag = "sampleTag1"_sr; + tagQuotaValue.totalReadQuota = 100.0; + globalTagThrottler.setQuota(testTag, tagQuotaValue); + state Future client = + GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag, 5.0, 5.0, false); + state Future client2 = + GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag, 25.0, 5.0, false); + state Future monitor = GlobalTagThrottlerTesting::monitorClientRates(&globalTagThrottler, testTag, 15.0); + state Future updater = + GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers); + wait(timeoutError(monitor || client || updater, 300.0)); + return Void(); +} + +// Test that the tag throttler can reach equilibrium, then adjust to a new equilibrium once the quota is changed +TEST_CASE("/GlobalTagThrottler/UpdateQuota") { + state GlobalTagThrottler globalTagThrottler(Database{}, UID{}); + state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10); + state ThrottleApi::TagQuotaValue tagQuotaValue; + state TransactionTag testTag = "sampleTag1"_sr; + tagQuotaValue.totalReadQuota = 100.0; + globalTagThrottler.setQuota(testTag, tagQuotaValue); + state Future client = + GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag, 5.0, 6.0, false); + state Future monitor = + GlobalTagThrottlerTesting::monitorClientRates(&globalTagThrottler, testTag, 100.0 / 6.0); + state Future updater = + GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers); + wait(timeoutError(monitor || client || updater, 300.0)); + tagQuotaValue.totalReadQuota = 50.0; + globalTagThrottler.setQuota(testTag, tagQuotaValue); + monitor = GlobalTagThrottlerTesting::monitorClientRates(&globalTagThrottler, testTag, 50.0 / 6.0); + wait(timeoutError(monitor || client || updater, 300.0)); + return Void(); +} diff --git a/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp b/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp index 611f87d11f..958f3e3bf8 100644 --- 
a/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp +++ b/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp @@ -668,7 +668,7 @@ public: TraceEvent("RocksDB").detail("Info", "DBDestroyed"); } - rocksdb::DB* getDb() { return db; } + rocksdb::DB* getDb() const { return db; } std::unordered_map>* getAllShards() { return &physicalShards; } @@ -2092,11 +2092,13 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { } StorageBytes getStorageBytes() const override { - uint64_t total_live = 0; - int64_t total_free = 0; - int64_t total_space = 0; + uint64_t live = 0; + ASSERT(shardManager.getDb()->GetAggregatedIntProperty(rocksdb::DB::Properties::kLiveSstFilesSize, &live)); - return StorageBytes(total_free, total_space, total_live, total_free); + int64_t free; + int64_t total; + g_network->getDiskBytes(path, free, total); + return StorageBytes(free, total, live, free); } std::vector removeRange(KeyRangeRef range) override { return shardManager.removeRange(range); } @@ -2118,7 +2120,6 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { ShardManager shardManager; std::shared_ptr rocksDBMetrics; std::string path; - const std::string dataPath; UID id; Reference writeThread; Reference readThreads; diff --git a/fdbserver/MutationTracking.cpp b/fdbserver/MutationTracking.cpp index fd8f55c313..b0db6b1ed5 100644 --- a/fdbserver/MutationTracking.cpp +++ b/fdbserver/MutationTracking.cpp @@ -21,6 +21,7 @@ #include #include #include "fdbclient/FDBTypes.h" +#include "fdbserver/EncryptedMutationMessage.h" #include "fdbserver/MutationTracking.h" #include "fdbserver/LogProtocolMessage.h" #include "fdbserver/SpanContextMessage.h" @@ -102,6 +103,8 @@ TraceEvent debugTagsAndMessageEnabled(const char* context, Version version, Stri BinaryReader br(mutationData, AssumeVersion(rdr.protocolVersion())); OTELSpanContextMessage scm; br >> scm; + } else if (EncryptedMutationMessage::startsEncryptedMutationMessage(mutationType)) { + throw encrypt_unsupported(); } else { MutationRef m; BinaryReader br(mutationData, AssumeVersion(rdr.protocolVersion())); diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index e1f1d0a163..73345b2e75 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -227,11 +227,6 @@ public: } } - ACTOR static Future monitorThrottlingChanges(Ratekeeper* self) { - wait(self->tagThrottler->monitorThrottlingChanges()); - return Void(); - } - ACTOR static Future run(RatekeeperInterface rkInterf, Reference const> dbInfo) { state Ratekeeper self(rkInterf.id(), openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True)); state Future timeout = Void(); @@ -408,7 +403,7 @@ Future Ratekeeper::trackTLogQueueInfo(TLogInterface tli) { } Future Ratekeeper::monitorThrottlingChanges() { - return RatekeeperImpl::monitorThrottlingChanges(this); + return tagThrottler->monitorThrottlingChanges(); } Future Ratekeeper::run(RatekeeperInterface rkInterf, Reference const> dbInfo) { @@ -436,7 +431,11 @@ Ratekeeper::Ratekeeper(UID id, Database db) SERVER_KNOBS->SPRING_BYTES_TLOG_BATCH, SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE_BATCH, SERVER_KNOBS->TARGET_DURABILITY_LAG_VERSIONS_BATCH) { - tagThrottler = std::make_unique<TagThrottler>(db, id); + if (SERVER_KNOBS->GLOBAL_TAG_THROTTLING) { + tagThrottler = std::make_unique<GlobalTagThrottler>(db, id); + } else { + tagThrottler = std::make_unique<TagThrottler>(db, id); + } } void Ratekeeper::updateCommitCostEstimation( diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index 1954a26b3e..736e73920f 100644 --- 
a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -24,6 +24,7 @@ #include "flow/UnitTest.h" #include "fdbclient/BackupContainer.h" #include "fdbclient/BackupAgent.actor.h" +#include "fdbserver/EncryptedMutationMessage.h" #include "fdbserver/RestoreLoader.actor.h" #include "fdbserver/RestoreRoleCommon.actor.h" #include "fdbserver/MutationTracking.h" @@ -422,6 +423,9 @@ ACTOR static Future _parsePartitionedLogFileOnLoader( ASSERT(inserted); ArenaReader rd(buf.arena(), StringRef(message, msgSize), AssumeVersion(g_network->protocolVersion())); + if (EncryptedMutationMessage::isNextIn(rd)) { + throw encrypt_unsupported(); + } MutationRef mutation; rd >> mutation; diff --git a/fdbserver/RkTagThrottleCollection.cpp b/fdbserver/RkTagThrottleCollection.cpp index 11c376f57d..d0e8cb9892 100644 --- a/fdbserver/RkTagThrottleCollection.cpp +++ b/fdbserver/RkTagThrottleCollection.cpp @@ -22,7 +22,7 @@ #include "fdbserver/Knobs.h" #include "fdbserver/RkTagThrottleCollection.h" -double RkTagThrottleCollection::RkTagThrottleData::getTargetRate(Optional requestRate) { +double RkTagThrottleCollection::RkTagThrottleData::getTargetRate(Optional requestRate) const { if (limits.tpsRate == 0.0 || !requestRate.present() || requestRate.get() == 0.0 || !rateSet) { return limits.tpsRate; } else { @@ -347,10 +347,12 @@ int64_t RkTagThrottleCollection::manualThrottleCount() const { return count; } -void RkTagThrottleCollection::updateBusyTagCount(TagThrottledReason reason) { +void RkTagThrottleCollection::incrementBusyTagCount(TagThrottledReason reason) { if (reason == TagThrottledReason::BUSY_READ) { ++busyReadTagCount; } else if (reason == TagThrottledReason::BUSY_WRITE) { ++busyWriteTagCount; + } else { + ASSERT(false); } } diff --git a/fdbserver/SimKmsConnector.actor.cpp b/fdbserver/SimKmsConnector.actor.cpp index 0b6c37fb7f..91d843345a 100644 --- a/fdbserver/SimKmsConnector.actor.cpp +++ b/fdbserver/SimKmsConnector.actor.cpp @@ -139,7 +139,7 @@ ACTOR Future ekLookupByDomainIds(Reference ctx, req.debugId.present() ? TraceEvent("SimKmsGetsByDomIds", interf.id()) : Optional(); if (dbgDIdTrace.present()) { - dbgDIdTrace.get().detail("DbgId", req.debugId.get()); + dbgDIdTrace.get().setMaxEventLength(16384).detail("DbgId", req.debugId.get()); } // Map encryptionDomainId to corresponding EncryptKeyCtx element using a modulo operation. 
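// Illustrative aside: the modulo mapping described in the comment above, as a standalone
// sketch. The simulated KMS key-pool types are not shown in this patch, so SimEncryptKeyCtx
// and keyForDomain are assumed names.
#include <cstdint>
#include <string>
#include <vector>

struct SimEncryptKeyCtx {
    uint64_t keyId;
    std::string key;
};

// Precondition: pool is non-empty. The same domain id always lands on the same slot, so
// lookups are deterministic, and several domains may intentionally share one key.
const SimEncryptKeyCtx& keyForDomain(const std::vector<SimEncryptKeyCtx>& pool, int64_t domainId) {
    return pool[static_cast<uint64_t>(domainId) % pool.size()];
}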
diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index e1461b0ed7..7c705a4055 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -2793,19 +2793,11 @@ ACTOR Future> getActivePrimaryDC(Database cx, int* fullyReplicat } } -// read storageWigglerStats through Read-only tx, then convert it to JSON field -ACTOR Future storageWigglerStatsFetcher(Optional ddWorker, - DatabaseConfiguration conf, - Database cx, - bool use_system_priority) { +ACTOR Future, Optional>> readStorageWiggleMetrics(Database cx, + bool use_system_priority) { state Reference tr(new ReadYourWritesTransaction(cx)); state Optional primaryV; state Optional remoteV; - state Future> stateFut; - if (ddWorker.present()) { - stateFut = ddWorker.get().storageWigglerState.tryGetReply(GetStorageWigglerStateRequest()); - } - loop { try { if (use_system_priority) { @@ -2813,42 +2805,59 @@ ACTOR Future storageWigglerStatsFetcher(Optionalcommit()); - break; + return std::make_pair(primaryV, remoteV); } catch (Error& e) { wait(tr->onError(e)); } } +} +// read storageWigglerStats through Read-only tx, then convert it to JSON field +ACTOR Future storageWigglerStatsFetcher(Optional ddWorker, + DatabaseConfiguration conf, + Database cx, + bool use_system_priority, + JsonBuilderArray* messages) { + + state Future stateFut; + state Future, Optional>> wiggleMetricsFut = + timeoutError(readStorageWiggleMetrics(cx, use_system_priority), 2.0); + state JsonBuilderObject res; if (ddWorker.present()) { + stateFut = timeoutError(ddWorker.get().storageWigglerState.getReply(GetStorageWigglerStateRequest()), 2.0); wait(ready(stateFut)); + } else { + return res; } - JsonBuilderObject res; - if (primaryV.present()) { - auto obj = ObjectReader::fromStringRef(primaryV.get(), IncludeVersion()).toJSON(); - if (stateFut.canGet() && stateFut.get().present()) { - auto& reply = stateFut.get().get(); + + try { + if (g_network->isSimulated() && BUGGIFY_WITH_PROB(0.01)) { + throw timed_out(); + } + + wait(success(wiggleMetricsFut) && success(stateFut)); + auto [primaryV, remoteV] = wiggleMetricsFut.get(); + if (primaryV.present()) { + auto obj = ObjectReader::fromStringRef(primaryV.get(), IncludeVersion()).toJSON(); + auto& reply = stateFut.get(); obj["state"] = StorageWiggler::getWiggleStateStr(static_cast(reply.primary)); obj["last_state_change_timestamp"] = reply.lastStateChangePrimary; obj["last_state_change_datetime"] = epochsToGMTString(reply.lastStateChangePrimary); + res["primary"] = obj; } - res["primary"] = obj; - } - if (conf.regions.size() > 1 && remoteV.present()) { - auto obj = ObjectReader::fromStringRef(remoteV.get(), IncludeVersion()).toJSON(); - if (stateFut.canGet() && stateFut.get().present()) { - auto& reply = stateFut.get().get(); + if (conf.regions.size() > 1 && remoteV.present()) { + auto obj = ObjectReader::fromStringRef(remoteV.get(), IncludeVersion()).toJSON(); + auto& reply = stateFut.get(); obj["state"] = StorageWiggler::getWiggleStateStr(static_cast(reply.remote)); obj["last_state_change_timestamp"] = reply.lastStateChangeRemote; obj["last_state_change_datetime"] = epochsToGMTString(reply.lastStateChangeRemote); + res["remote"] = obj; } - res["remote"] = obj; - } - if (stateFut.canGet() && stateFut.isError()) { - res["error"] = std::string("Can't get storage wiggler state: ") + stateFut.getError().name(); - TraceEvent(SevWarn, "StorageWigglerStatsFetcher").error(stateFut.getError()); - } else if (stateFut.canGet() && stateFut.get().isError()) { - res["error"] = std::string("Can't get storage 
wiggler state: ") + stateFut.get().getError().name(); - TraceEvent(SevWarn, "StorageWigglerStatsFetcher").error(stateFut.get().getError()); + return res; + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) + throw; + messages->push_back(JsonString::makeMessage("fetch_storage_wiggler_stats_timeout", + "Fetching storage wiggler stats timed out.")); } return res; } @@ -3096,17 +3105,29 @@ ACTOR Future clusterGetStatus( if (configuration.get().perpetualStorageWiggleSpeed > 0) { state Future>> primaryWiggleValues; state Future>> remoteWiggleValues; + double timeout = g_network->isSimulated() && BUGGIFY_WITH_PROB(0.01) ? 0.0 : 2.0; + primaryWiggleValues = timeoutError(readStorageWiggleValues(cx, true, true), timeout); + remoteWiggleValues = timeoutError(readStorageWiggleValues(cx, false, true), timeout); + wait(store( + storageWiggler, + storageWigglerStatsFetcher(db->get().distributor, configuration.get(), cx, true, &messages)) && + ready(primaryWiggleValues) && ready(remoteWiggleValues)); - primaryWiggleValues = readStorageWiggleValues(cx, true, true); - remoteWiggleValues = readStorageWiggleValues(cx, false, true); - wait(store(storageWiggler, - storageWigglerStatsFetcher(db->get().distributor, configuration.get(), cx, true)) && - success(primaryWiggleValues) && success(remoteWiggleValues)); - - for (auto& p : primaryWiggleValues.get()) - wiggleServers.insert(p.first); - for (auto& p : remoteWiggleValues.get()) - wiggleServers.insert(p.first); + if (primaryWiggleValues.canGet()) { + for (auto& p : primaryWiggleValues.get()) + wiggleServers.insert(p.first); + } else { + messages.push_back( + JsonString::makeMessage("fetch_storage_wiggler_stats_timeout", + "Fetching wiggling servers in primary region timed out")); + } + if (remoteWiggleValues.canGet()) { + for (auto& p : remoteWiggleValues.get()) + wiggleServers.insert(p.first); + } else { + messages.push_back(JsonString::makeMessage("fetch_storage_wiggler_stats_timeout", + "Fetching wiggling servers in remote region timed out")); + } } state std::vector workerStatuses = wait(getAll(futures2)); diff --git a/fdbserver/StorageCache.actor.cpp b/fdbserver/StorageCache.actor.cpp index 2e6cbb1850..1be4f63698 100644 --- a/fdbserver/StorageCache.actor.cpp +++ b/fdbserver/StorageCache.actor.cpp @@ -23,6 +23,8 @@ #include "fdbclient/FDBOptions.g.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/SystemData.h" +#include "fdbserver/EncryptedMutationMessage.h" +#include "fdbserver/GetEncryptCipherKeys.h" #include "fdbserver/Knobs.h" #include "fdbserver/ServerDBInfo.h" #include "fdbclient/StorageServerInterface.h" @@ -1874,6 +1876,9 @@ ACTOR Future pullAsyncData(StorageCacheData* data) { state FetchInjectionInfo fii; state Reference cloneCursor2; + state Optional>> cipherKeys; + state bool collectingCipherKeys = false; + // If encrypted mutation is encountered, we collect cipher details and fetch cipher keys, then start over. 
loop { state uint64_t changeCounter = data->cacheRangeChangeCounter; bool epochEnd = false; @@ -1881,6 +1886,8 @@ ACTOR Future<Void> pullAsyncData(StorageCacheData* data) { bool firstMutation = true; bool dbgLastMessageWasProtocol = false; + std::unordered_set<BlobCipherDetails> cipherDetails; + Reference<ILogSystem::IPeekCursor> cloneCursor1 = cursor->cloneNoMore(); cloneCursor2 = cursor->cloneNoMore(); @@ -1904,36 +1911,60 @@ ACTOR Future<Void> pullAsyncData(StorageCacheData* data) { OTELSpanContextMessage::isNextIn(cloneReader)) { OTELSpanContextMessage scm; cloneReader >> scm; + } else if (cloneReader.protocolVersion().hasEncryptionAtRest() && + EncryptedMutationMessage::isNextIn(cloneReader) && !cipherKeys.present()) { + // Encrypted mutation found, but cipher keys haven't been fetched yet. + // Collect cipher details to fetch cipher keys in one batch. + EncryptedMutationMessage emm; + cloneReader >> emm; + cipherDetails.insert(emm.header.cipherTextDetails); + cipherDetails.insert(emm.header.cipherHeaderDetails); + collectingCipherKeys = true; } else { MutationRef msg; - cloneReader >> msg; - - if (firstMutation && msg.param1.startsWith(systemKeys.end)) - hasPrivateData = true; - firstMutation = false; - - if (msg.param1 == lastEpochEndPrivateKey) { - epochEnd = true; - // ASSERT(firstMutation); - ASSERT(dbgLastMessageWasProtocol); + if (cloneReader.protocolVersion().hasEncryptionAtRest() && + EncryptedMutationMessage::isNextIn(cloneReader)) { + ASSERT(cipherKeys.present()); + msg = EncryptedMutationMessage::decrypt(cloneReader, cloneReader.arena(), cipherKeys.get()); + } else { + cloneReader >> msg; } - dbgLastMessageWasProtocol = false; + if (!collectingCipherKeys) { + if (firstMutation && msg.param1.startsWith(systemKeys.end)) + hasPrivateData = true; + firstMutation = false; + + if (msg.param1 == lastEpochEndPrivateKey) { + epochEnd = true; + // ASSERT(firstMutation); + ASSERT(dbgLastMessageWasProtocol); + } + + dbgLastMessageWasProtocol = false; + } } } - // Any fetchKeys which are ready to transition their cacheRanges to the adding,transferred state do so - // now. If there is an epoch end we skip this step, to increase testability and to prevent inserting a - // version in the middle of a rolled back version range. - while (!hasPrivateData && !epochEnd && !data->readyFetchKeys.empty()) { - auto fk = data->readyFetchKeys.back(); - data->readyFetchKeys.pop_back(); - fk.send(&fii); + if (collectingCipherKeys) { + std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>> result = + wait(getEncryptCipherKeys(data->db, cipherDetails)); + cipherKeys = result; + collectingCipherKeys = false; + } else { + // Any fetchKeys which are ready to transition their cacheRanges to the adding,transferred state do + // so now. If there is an epoch end we skip this step, to increase testability and to prevent + // inserting a version in the middle of a rolled back version range. + while (!hasPrivateData && !epochEnd && !data->readyFetchKeys.empty()) { + auto fk = data->readyFetchKeys.back(); + data->readyFetchKeys.pop_back(); + fk.send(&fii); + } + if (data->cacheRangeChangeCounter == changeCounter) + break; + // TEST(true); // A fetchKeys completed while we were doing this, so eager might be outdated. Read + // it again. } - if (data->cacheRangeChangeCounter == changeCounter) - break; - // TEST(true); // A fetchKeys completed while we were doing this, so eager might be outdated. Read it - // again. 
} data->debug_inApplyUpdate = true; @@ -1988,7 +2019,11 @@ ACTOR Future pullAsyncData(StorageCacheData* data) { reader >> oscm; } else { MutationRef msg; - reader >> msg; + if (reader.protocolVersion().hasEncryptionAtRest() && EncryptedMutationMessage::isNextIn(reader)) { + msg = EncryptedMutationMessage::decrypt(reader, reader.arena(), cipherKeys.get()); + } else { + reader >> msg; + } if (ver != invalidVersion) // This change belongs to a version < minVersion { diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index b04d6247e5..f9dd88598d 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -28,7 +28,6 @@ #include "fdbclient/FDBTypes.h" #include "fdbclient/ManagementAPI.actor.h" #include "fdbserver/WorkerInterface.actor.h" -#include "fdbserver/LogProtocolMessage.h" #include "fdbserver/SpanContextMessage.h" #include "fdbserver/TLogInterface.h" #include "fdbserver/Knobs.h" @@ -366,9 +365,9 @@ struct TLogData : NonCopyable { // the set and for callers that unset will // be able to match it up std::string dataFolder; // folder where data is stored - Reference> degraded; // End of fields used by snapshot based backup and restore + Reference> degraded; std::vector tempTagMessages; Reference commitLatencyDist; @@ -2569,42 +2568,6 @@ void getQueuingMetrics(TLogData* self, Reference logData, TLogQueuingMe req.reply.send(reply); } -ACTOR Future tLogSnapCreate(TLogSnapRequest snapReq, TLogData* self, Reference logData) { - if (self->ignorePopUid != snapReq.snapUID.toString()) { - snapReq.reply.sendError(operation_failed()); - return Void(); - } - ExecCmdValueString snapArg(snapReq.snapPayload); - try { - int err = wait(execHelper(&snapArg, snapReq.snapUID, self->dataFolder, snapReq.role.toString())); - - std::string uidStr = snapReq.snapUID.toString(); - TraceEvent("ExecTraceTLog") - .detail("Uid", uidStr) - .detail("Status", err) - .detail("Role", snapReq.role) - .detail("Value", self->dataFolder) - .detail("ExecPayload", snapReq.snapPayload) - .detail("PersistentDataVersion", logData->persistentDataVersion) - .detail("PersistentDatadurableVersion", logData->persistentDataDurableVersion) - .detail("QueueCommittedVersion", logData->queueCommittedVersion.get()) - .detail("Version", logData->version.get()); - - if (err != 0) { - throw operation_failed(); - } - snapReq.reply.send(Void()); - } catch (Error& e) { - TraceEvent("TLogExecHelperError").errorUnsuppressed(e); - if (e.code() != error_code_operation_cancelled) { - snapReq.reply.sendError(e); - } else { - throw e; - } - } - return Void(); -} - ACTOR Future tLogEnablePopReq(TLogEnablePopRequest enablePopReq, TLogData* self, Reference logData) { if (self->ignorePopUid != enablePopReq.snapUID.toString()) { TraceEvent(SevWarn, "TLogPopDisableEnableUidMismatch") @@ -2731,9 +2694,6 @@ ACTOR Future serveTLogInterface(TLogData* self, when(TLogEnablePopRequest enablePopReq = waitNext(tli.enablePopRequest.getFuture())) { logData->addActor.send(tLogEnablePopReq(enablePopReq, self, logData)); } - when(TLogSnapRequest snapReq = waitNext(tli.snapRequest.getFuture())) { - logData->addActor.send(tLogSnapCreate(snapReq, self, logData)); - } } } diff --git a/fdbserver/TagThrottler.actor.cpp b/fdbserver/TagThrottler.actor.cpp index d85e021544..432aa459b1 100644 --- a/fdbserver/TagThrottler.actor.cpp +++ b/fdbserver/TagThrottler.actor.cpp @@ -21,6 +21,7 @@ #include "fdbserver/TagThrottler.h" #include "fdbserver/RkTagThrottleCollection.h" +#include "flow/actorcompiler.h" // must be last include class 
TagThrottlerImpl { Database db; @@ -106,7 +107,7 @@ class TagThrottlerImpl { if (tagKey.throttleType == TagThrottleType::AUTO) { updatedTagThrottles.autoThrottleTag( self->id, tag, 0, tagValue.tpsRate, tagValue.expirationTime); - updatedTagThrottles.updateBusyTagCount(tagValue.reason); + updatedTagThrottles.incrementBusyTagCount(tagValue.reason); } else { updatedTagThrottles.manualThrottleTag(self->id, tag, @@ -143,6 +144,7 @@ class TagThrottlerImpl { if (busyness > SERVER_KNOBS->AUTO_THROTTLE_TARGET_TAG_BUSYNESS && rate > SERVER_KNOBS->MIN_TAG_COST) { TEST(true); // Transaction tag auto-throttled Optional clientRate = throttledTags.autoThrottleTag(id, tag, busyness); + // TODO: Increment tag throttle counts here? if (clientRate.present()) { TagSet tags; tags.addTag(tag); @@ -185,23 +187,21 @@ public: // the future auto storageQueue = ss.getStorageQueueBytes(); auto storageDurabilityLag = ss.getDurabilityLag(); + std::vector> futures; if (storageQueue > SERVER_KNOBS->AUTO_TAG_THROTTLE_STORAGE_QUEUE_BYTES || storageDurabilityLag > SERVER_KNOBS->AUTO_TAG_THROTTLE_DURABILITY_LAG_VERSIONS) { - // TODO: Update once size is potentially > 1 - ASSERT_WE_THINK(ss.busiestWriteTags.size() <= 1); - ASSERT_WE_THINK(ss.busiestReadTags.size() <= 1); for (const auto& busyWriteTag : ss.busiestWriteTags) { - return tryUpdateAutoThrottling(busyWriteTag.tag, - busyWriteTag.rate, - busyWriteTag.fractionalBusyness, - TagThrottledReason::BUSY_WRITE); + futures.push_back(tryUpdateAutoThrottling(busyWriteTag.tag, + busyWriteTag.rate, + busyWriteTag.fractionalBusyness, + TagThrottledReason::BUSY_WRITE)); } for (const auto& busyReadTag : ss.busiestReadTags) { - return tryUpdateAutoThrottling( - busyReadTag.tag, busyReadTag.rate, busyReadTag.fractionalBusyness, TagThrottledReason::BUSY_READ); + futures.push_back(tryUpdateAutoThrottling( + busyReadTag.tag, busyReadTag.rate, busyReadTag.fractionalBusyness, TagThrottledReason::BUSY_READ)); } } - return Void(); + return waitForAll(futures); } }; // class TagThrottlerImpl diff --git a/fdbserver/TenantCache.actor.cpp b/fdbserver/TenantCache.actor.cpp index 8bf00ec3be..7bdad31c62 100644 --- a/fdbserver/TenantCache.actor.cpp +++ b/fdbserver/TenantCache.actor.cpp @@ -48,7 +48,7 @@ public: for (int i = 0; i < tenantList.size(); i++) { TenantName tname = tenantList[i].key.removePrefix(tenantMapPrefix); - TenantMapEntry t = decodeTenantEntry(tenantList[i].value); + TenantMapEntry t = TenantMapEntry::decode(tenantList[i].value); tenantCache->insert(tname, t); @@ -86,7 +86,7 @@ public: for (int i = 0; i < tenantList.size(); i++) { TenantName tname = tenantList[i].key.removePrefix(tenantMapPrefix); - TenantMapEntry t = decodeTenantEntry(tenantList[i].value); + TenantMapEntry t = TenantMapEntry::decode(tenantList[i].value); if (tenantCache->update(tname, t)) { tenantListUpdated = true; diff --git a/fdbserver/TransactionTagCounter.cpp b/fdbserver/TransactionTagCounter.cpp index 1f0a25c2cc..7b7829f676 100644 --- a/fdbserver/TransactionTagCounter.cpp +++ b/fdbserver/TransactionTagCounter.cpp @@ -18,50 +18,193 @@ * limitations under the License. 
*/ +#include "fdbserver/Knobs.h" #include "fdbserver/TransactionTagCounter.h" #include "flow/Trace.h" -TransactionTagCounter::TransactionTagCounter(UID thisServerID) - : thisServerID(thisServerID), - busiestReadTagEventHolder(makeReference(thisServerID.toString() + "/BusiestReadTag")) {} +namespace { -void TransactionTagCounter::addRequest(Optional const& tags, int64_t bytes) { - if (tags.present()) { - TEST(true); // Tracking transaction tag in counter - double cost = costFunction(bytes); - for (auto& tag : tags.get()) { - int64_t& count = intervalCounts[TransactionTag(tag, tags.get().getArena())]; - count += cost; - if (count > busiestTagCount) { - busiestTagCount = count; - busiestTag = tag; +class TopKTags { +public: + struct TagAndCount { + TransactionTag tag; + int64_t count; + bool operator<(TagAndCount const& other) const { return count < other.count; } + explicit TagAndCount(TransactionTag tag, int64_t count) : tag(tag), count(count) {} + }; + +private: + // Because the number of tracked is expected to be small, they can be tracked + // in a simple vector. If the number of tracked tags increases, a more sophisticated + // data structure will be required. + std::vector topTags; + int limit; + +public: + explicit TopKTags(int limit) : limit(limit) { + ASSERT_GT(limit, 0); + topTags.reserve(limit); + } + + void incrementCount(TransactionTag tag, int previousCount, int increase) { + auto iter = std::find_if(topTags.begin(), topTags.end(), [tag](const auto& tc) { return tc.tag == tag; }); + if (iter != topTags.end()) { + ASSERT_EQ(previousCount, iter->count); + iter->count += increase; + } else if (topTags.size() < limit) { + ASSERT_EQ(previousCount, 0); + topTags.emplace_back(tag, increase); + } else { + auto toReplace = std::min_element(topTags.begin(), topTags.end()); + ASSERT_GE(toReplace->count, previousCount); + if (toReplace->count < previousCount + increase) { + toReplace->tag = tag; + toReplace->count = previousCount + increase; } } - - intervalTotalSampledCount += cost; } + + std::vector getBusiestTags(double elapsed, double totalSampleCount) const { + std::vector result; + for (auto const& tagAndCounter : topTags) { + auto rate = (tagAndCounter.count / CLIENT_KNOBS->READ_TAG_SAMPLE_RATE) / elapsed; + if (rate > SERVER_KNOBS->MIN_TAG_READ_PAGES_RATE) { + result.emplace_back(tagAndCounter.tag, rate, tagAndCounter.count / totalSampleCount); + } + } + return result; + } + + void clear() { topTags.clear(); } +}; + +} // namespace + +class TransactionTagCounterImpl { + UID thisServerID; + TransactionTagMap intervalCounts; + int64_t intervalTotalSampledCount = 0; + TopKTags topTags; + double intervalStart = 0; + + std::vector previousBusiestTags; + Reference busiestReadTagEventHolder; + + static int64_t costFunction(int64_t bytes) { return bytes / SERVER_KNOBS->READ_COST_BYTE_FACTOR + 1; } + +public: + TransactionTagCounterImpl(UID thisServerID) + : thisServerID(thisServerID), topTags(SERVER_KNOBS->SS_THROTTLE_TAGS_TRACKED), + busiestReadTagEventHolder(makeReference(thisServerID.toString() + "/BusiestReadTag")) {} + + void addRequest(Optional const& tags, int64_t bytes) { + if (tags.present()) { + TEST(true); // Tracking transaction tag in counter + double cost = costFunction(bytes); + for (auto& tag : tags.get()) { + int64_t& count = intervalCounts[TransactionTag(tag, tags.get().getArena())]; + topTags.incrementCount(tag, count, cost); + count += cost; + } + + intervalTotalSampledCount += cost; + } + } + + void startNewInterval() { + double elapsed = now() - intervalStart; + 
+ previousBusiestTags.clear(); + if (intervalStart > 0 && CLIENT_KNOBS->READ_TAG_SAMPLE_RATE > 0 && elapsed > 0) { + previousBusiestTags = topTags.getBusiestTags(elapsed, intervalTotalSampledCount); + + TraceEvent("BusiestReadTag", thisServerID) + .detail("Elapsed", elapsed) + //.detail("Tag", printable(busiestTag)) + //.detail("TagCost", busiestTagCount) + .detail("TotalSampledCost", intervalTotalSampledCount) + .detail("Reported", previousBusiestTags.size()) + .trackLatest(busiestReadTagEventHolder->trackingKey); + } + + intervalCounts.clear(); + intervalTotalSampledCount = 0; + topTags.clear(); + intervalStart = now(); + } + + std::vector const& getBusiestTags() const { return previousBusiestTags; } +}; + +TransactionTagCounter::TransactionTagCounter(UID thisServerID) + : impl(PImpl<TransactionTagCounterImpl>::create(thisServerID)) {} + +TransactionTagCounter::~TransactionTagCounter() = default; + +void TransactionTagCounter::addRequest(Optional<TagSet> const& tags, int64_t bytes) { + return impl->addRequest(tags, bytes); } void TransactionTagCounter::startNewInterval() { - double elapsed = now() - intervalStart; - previousBusiestTags.clear(); - if (intervalStart > 0 && CLIENT_KNOBS->READ_TAG_SAMPLE_RATE > 0 && elapsed > 0) { - double rate = busiestTagCount / CLIENT_KNOBS->READ_TAG_SAMPLE_RATE / elapsed; - if (rate > SERVER_KNOBS->MIN_TAG_READ_PAGES_RATE) { - previousBusiestTags.emplace_back(busiestTag, rate, (double)busiestTagCount / intervalTotalSampledCount); - } - - TraceEvent("BusiestReadTag", thisServerID) - .detail("Elapsed", elapsed) - .detail("Tag", printable(busiestTag)) - .detail("TagCost", busiestTagCount) - .detail("TotalSampledCost", intervalTotalSampledCount) - .detail("Reported", !previousBusiestTags.empty()) - .trackLatest(busiestReadTagEventHolder->trackingKey); - } - - intervalCounts.clear(); - intervalTotalSampledCount = 0; - busiestTagCount = 0; - intervalStart = now(); + return impl->startNewInterval(); +} + +std::vector const& TransactionTagCounter::getBusiestTags() const { + return impl->getBusiestTags(); +} + +TEST_CASE("/TransactionTagCounter/TopKTags") { + TopKTags topTags(2); + + // Ensure that costs are large enough to show up + auto const costMultiplier = + std::max(1.0, 2 * SERVER_KNOBS->MIN_TAG_READ_PAGES_RATE * CLIENT_KNOBS->READ_TAG_SAMPLE_RATE); + + ASSERT_EQ(topTags.getBusiestTags(1.0, 0).size(), 0); + topTags.incrementCount("a"_sr, 0, 1 * costMultiplier); + { + auto const busiestTags = topTags.getBusiestTags(1.0, 1 * costMultiplier); + ASSERT_EQ(busiestTags.size(), 1); + ASSERT_EQ(std::count_if(busiestTags.begin(), + busiestTags.end(), + [](auto const& tagInfo) { return tagInfo.tag == "a"_sr; }), + 1); + } + topTags.incrementCount("b"_sr, 0, 2 * costMultiplier); + topTags.incrementCount("c"_sr, 0, 3 * costMultiplier); + { + auto busiestTags = topTags.getBusiestTags(1.0, 6 * costMultiplier); + ASSERT_EQ(busiestTags.size(), 2); + ASSERT_EQ(std::count_if(busiestTags.begin(), + busiestTags.end(), + [](auto const& tagInfo) { return tagInfo.tag == "a"_sr; }), + 0); + ASSERT_EQ(std::count_if(busiestTags.begin(), + busiestTags.end(), + [](auto const& tagInfo) { return tagInfo.tag == "b"_sr; }), + 1); + ASSERT_EQ(std::count_if(busiestTags.begin(), + busiestTags.end(), + [](auto const& tagInfo) { return tagInfo.tag == "c"_sr; }), + 1); + } + topTags.incrementCount("a"_sr, 1 * costMultiplier, 3 * costMultiplier); + { + auto busiestTags = topTags.getBusiestTags(1.0, 9 * costMultiplier); + ASSERT_EQ(busiestTags.size(), 2); + ASSERT_EQ(std::count_if(busiestTags.begin(), + busiestTags.end(), + 
[](auto const& tagInfo) { return tagInfo.tag == "a"_sr; }), + 1); + ASSERT_EQ(std::count_if(busiestTags.begin(), + busiestTags.end(), + [](auto const& tagInfo) { return tagInfo.tag == "b"_sr; }), + 0); + ASSERT_EQ(std::count_if(busiestTags.begin(), + busiestTags.end(), + [](auto const& tagInfo) { return tagInfo.tag == "c"_sr; }), + 1); + } + topTags.clear(); + ASSERT_EQ(topTags.getBusiestTags(1.0, 0).size(), 0); + return Void(); } diff --git a/fdbserver/include/fdbserver/ApplyMetadataMutation.h b/fdbserver/include/fdbserver/ApplyMetadataMutation.h index 23f9e3a2f9..015d79b6f7 100644 --- a/fdbserver/include/fdbserver/ApplyMetadataMutation.h +++ b/fdbserver/include/fdbserver/ApplyMetadataMutation.h @@ -33,6 +33,7 @@ #include "fdbserver/LogProtocolMessage.h" #include "fdbserver/LogSystem.h" #include "fdbserver/ProxyCommitData.actor.h" +#include "flow/BlobCipher.h" #include "flow/FastRef.h" // Resolver's data for applyMetadataMutations() calls. @@ -93,6 +94,7 @@ void applyMetadataMutations(SpanContext const& spanContext, Reference logSystem, const VectorRef& mutations, LogPushData* pToCommit, + const std::unordered_map>* pCipherKeys, bool& confChange, Version version, Version popVersion, diff --git a/fdbserver/include/fdbserver/EncryptedMutationMessage.h b/fdbserver/include/fdbserver/EncryptedMutationMessage.h new file mode 100644 index 0000000000..d94e7f7f1f --- /dev/null +++ b/fdbserver/include/fdbserver/EncryptedMutationMessage.h @@ -0,0 +1,117 @@ +/* + * EncryptedMutationMessage.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FDBSERVER_ENCRYPTEDMUTATIONMESSAGE_H +#define FDBSERVER_ENCRYPTEDMUTATIONMESSAGE_H + +#pragma once + +#include "fdbclient/CommitTransaction.h" +#include "flow/BlobCipher.h" + +struct EncryptedMutationMessage { + + BlobCipherEncryptHeader header; + StringRef encrypted; + + EncryptedMutationMessage() {} + + std::string toString() const { + return format("code: %d, encryption info: %s", + MutationRef::Reserved_For_EncryptedMutationMessage, + header.toString().c_str()); + } + + template + void serialize(Ar& ar) { + uint8_t poly = MutationRef::Reserved_For_EncryptedMutationMessage; + serializer(ar, poly, header, encrypted); + } + + static bool startsEncryptedMutationMessage(uint8_t byte) { + return byte == MutationRef::Reserved_For_EncryptedMutationMessage; + } + template + static bool isNextIn(Ar& ar) { + return startsEncryptedMutationMessage(*(const uint8_t*)ar.peekBytes(1)); + } + + // Encrypt given mutation and return an EncryptedMutationMessage. 
+ static EncryptedMutationMessage encrypt( + Arena& arena, + const std::unordered_map>& cipherKeys, + const EncryptCipherDomainId& domainId, + const MutationRef& mutation) { + ASSERT_NE(domainId, ENCRYPT_INVALID_DOMAIN_ID); + auto textCipherItr = cipherKeys.find(domainId); + auto headerCipherItr = cipherKeys.find(ENCRYPT_HEADER_DOMAIN_ID); + ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid()); + ASSERT(headerCipherItr != cipherKeys.end() && headerCipherItr->second.isValid()); + uint8_t iv[AES_256_IV_LENGTH]; + generateRandomData(iv, AES_256_IV_LENGTH); + BinaryWriter bw(AssumeVersion(g_network->protocolVersion())); + bw << mutation; + EncryptedMutationMessage encrypted_mutation; + EncryptBlobCipherAes265Ctr cipher(textCipherItr->second, + headerCipherItr->second, + iv, + AES_256_IV_LENGTH, + ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); + encrypted_mutation.encrypted = + cipher + .encrypt(static_cast(bw.getData()), bw.getLength(), &encrypted_mutation.header, arena) + ->toStringRef(); + return encrypted_mutation; + } + + // Encrypt system key space mutation and return an EncryptedMutationMessage. + static EncryptedMutationMessage encryptMetadata( + Arena& arena, + const std::unordered_map>& cipherKeys, + const MutationRef& mutation) { + return encrypt(arena, cipherKeys, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, mutation); + } + + // Read an EncryptedMutationMessage from given reader, decrypt and return the encrypted mutation. + // Also return decrypt buffer through buf, if it is specified. + template + static MutationRef decrypt(Ar& ar, + Arena& arena, + const std::unordered_map>& cipherKeys, + StringRef* buf = nullptr) { + EncryptedMutationMessage msg; + ar >> msg; + auto textCipherItr = cipherKeys.find(msg.header.cipherTextDetails); + auto headerCipherItr = cipherKeys.find(msg.header.cipherHeaderDetails); + ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid()); + ASSERT(headerCipherItr != cipherKeys.end() && headerCipherItr->second.isValid()); + DecryptBlobCipherAes256Ctr cipher(textCipherItr->second, headerCipherItr->second, msg.header.iv); + StringRef plaintext = + cipher.decrypt(msg.encrypted.begin(), msg.encrypted.size(), msg.header, arena)->toStringRef(); + if (buf != nullptr) { + *buf = plaintext; + } + ArenaReader reader(arena, plaintext, AssumeVersion(g_network->protocolVersion())); + MutationRef mutation; + reader >> mutation; + return mutation; + } +}; +#endif \ No newline at end of file diff --git a/fdbserver/include/fdbserver/GetEncryptCipherKeys.h b/fdbserver/include/fdbserver/GetEncryptCipherKeys.h index 6af7ccc71c..9a060c295e 100644 --- a/fdbserver/include/fdbserver/GetEncryptCipherKeys.h +++ b/fdbserver/include/fdbserver/GetEncryptCipherKeys.h @@ -1,5 +1,5 @@ /* - * GetCipherKeys.h + * GetEncryptCipherKeys.h * * This source file is part of the FoundationDB open source project * diff --git a/fdbserver/include/fdbserver/Ratekeeper.h b/fdbserver/include/fdbserver/Ratekeeper.h index c0b1769c90..948cca851b 100644 --- a/fdbserver/include/fdbserver/Ratekeeper.h +++ b/fdbserver/include/fdbserver/Ratekeeper.h @@ -148,7 +148,7 @@ class Ratekeeper { double lastWarning; double lastSSListFetchedTimestamp; - std::unique_ptr tagThrottler; + std::unique_ptr tagThrottler; RatekeeperLimits normalLimits; RatekeeperLimits batchLimits; diff --git a/fdbserver/include/fdbserver/RkTagThrottleCollection.h b/fdbserver/include/fdbserver/RkTagThrottleCollection.h index 35062cdb7c..ee064685fa 100644 --- a/fdbserver/include/fdbserver/RkTagThrottleCollection.h +++ 
b/fdbserver/include/fdbserver/RkTagThrottleCollection.h @@ -42,7 +42,7 @@ class RkTagThrottleCollection : NonCopyable { bool rateSet = false; RkTagThrottleData() : clientRate(CLIENT_KNOBS->TAG_THROTTLE_SMOOTHING_WINDOW) {} - double getTargetRate(Optional requestRate); + double getTargetRate(Optional requestRate) const; Optional updateAndGetClientRate(Optional requestRate); }; @@ -83,7 +83,7 @@ public: void addRequests(TransactionTag const& tag, int requests); int64_t autoThrottleCount() const { return autoThrottledTags.size(); } int64_t manualThrottleCount() const; - void updateBusyTagCount(TagThrottledReason); + void incrementBusyTagCount(TagThrottledReason); auto getBusyReadTagCount() const { return busyReadTagCount; } auto getBusyWriteTagCount() const { return busyWriteTagCount; } }; diff --git a/fdbserver/include/fdbserver/StorageMetrics.h b/fdbserver/include/fdbserver/StorageMetrics.h index 35da5e14f5..1ecf102e3e 100644 --- a/fdbserver/include/fdbserver/StorageMetrics.h +++ b/fdbserver/include/fdbserver/StorageMetrics.h @@ -357,6 +357,7 @@ struct StorageServerMetrics { } void splitMetrics(SplitMetricsRequest req) const { + int minSplitBytes = req.minSplitBytes.present() ? req.minSplitBytes.get() : SERVER_KNOBS->MIN_SHARD_BYTES; try { SplitMetricsReply reply; KeyRef lastKey = req.keys.begin; @@ -364,10 +365,10 @@ struct StorageServerMetrics { StorageMetrics estimated = req.estimated; StorageMetrics remaining = getMetrics(req.keys) + used; - //TraceEvent("SplitMetrics").detail("Begin", req.keys.begin).detail("End", req.keys.end).detail("Remaining", remaining.bytes).detail("Used", used.bytes); + //TraceEvent("SplitMetrics").detail("Begin", req.keys.begin).detail("End", req.keys.end).detail("Remaining", remaining.bytes).detail("Used", used.bytes).detail("MinSplitBytes", minSplitBytes); while (true) { - if (remaining.bytes < 2 * SERVER_KNOBS->MIN_SHARD_BYTES) + if (remaining.bytes < 2 * minSplitBytes) break; KeyRef key = req.keys.end; bool hasUsed = used.bytes != 0 || used.bytesPerKSecond != 0 || used.iosPerKSecond != 0; @@ -382,10 +383,9 @@ struct StorageServerMetrics { lastKey, key, hasUsed); - if (used.bytes < SERVER_KNOBS->MIN_SHARD_BYTES) - key = std::max(key, - byteSample.splitEstimate(KeyRangeRef(lastKey, req.keys.end), - SERVER_KNOBS->MIN_SHARD_BYTES - used.bytes)); + if (used.bytes < minSplitBytes) + key = std::max( + key, byteSample.splitEstimate(KeyRangeRef(lastKey, req.keys.end), minSplitBytes - used.bytes)); key = getSplitKey(remaining.iosPerKSecond, estimated.iosPerKSecond, req.limits.iosPerKSecond, @@ -532,7 +532,7 @@ struct StorageServerMetrics { auto _ranges = getReadHotRanges(req.keys, SERVER_KNOBS->SHARD_MAX_READ_DENSITY_RATIO, SERVER_KNOBS->READ_HOT_SUB_RANGE_CHUNK_SIZE, - SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS); + SERVER_KNOBS->SHARD_READ_HOT_BANDWIDTH_MIN_PER_KSECONDS); reply.readHotRanges = VectorRef(_ranges.data(), _ranges.size()); req.reply.send(reply); } diff --git a/fdbserver/include/fdbserver/TagPartitionedLogSystem.actor.h b/fdbserver/include/fdbserver/TagPartitionedLogSystem.actor.h index ff2c726a8e..c7d961cec7 100644 --- a/fdbserver/include/fdbserver/TagPartitionedLogSystem.actor.h +++ b/fdbserver/include/fdbserver/TagPartitionedLogSystem.actor.h @@ -32,7 +32,6 @@ #include "fdbrpc/simulator.h" #include "fdbserver/DBCoreState.h" #include "fdbserver/Knobs.h" -#include "fdbserver/LogProtocolMessage.h" #include "fdbserver/LogSystem.h" #include "fdbserver/RecoveryState.h" #include "fdbserver/ServerDBInfo.h" diff --git 
a/fdbserver/include/fdbserver/TagThrottler.h b/fdbserver/include/fdbserver/TagThrottler.h index 69e3909c7d..830c2fea06 100644 --- a/fdbserver/include/fdbserver/TagThrottler.h +++ b/fdbserver/include/fdbserver/TagThrottler.h @@ -23,32 +23,72 @@ #include "fdbclient/PImpl.h" #include "fdbserver/Ratekeeper.h" -class TagThrottler { +class ITagThrottler { +public: + virtual ~ITagThrottler() = default; + + // Poll the system keyspace looking for updates made through the tag throttling API + virtual Future monitorThrottlingChanges() = 0; + + // Increment the number of known requests associated with the specified tag + virtual void addRequests(TransactionTag tag, int count) = 0; + + // This throttled tag change ID is used to coordinate updates with the GRV proxies + virtual uint64_t getThrottledTagChangeId() const = 0; + + // For each tag and priority combination, return the throughput limit and expiration time + // Also, erase expired tags + virtual PrioritizedTransactionTagMap getClientRates() = 0; + + virtual int64_t autoThrottleCount() const = 0; + virtual uint32_t busyReadTagCount() const = 0; + virtual uint32_t busyWriteTagCount() const = 0; + virtual int64_t manualThrottleCount() const = 0; + virtual bool isAutoThrottlingEnabled() const = 0; + + // Based on the busiest read and write tags in the provided storage queue info, update + // tag throttling limits. + virtual Future tryUpdateAutoThrottling(StorageQueueInfo const&) = 0; +}; + +class TagThrottler : public ITagThrottler { PImpl impl; public: TagThrottler(Database db, UID id); ~TagThrottler(); - // Poll the system keyspace looking for updates made through the tag throttling API - Future monitorThrottlingChanges(); - - // Increment the number of known requests associated with the specified tag - void addRequests(TransactionTag tag, int count); - - // This throttled tag change ID is used to coordinate updates with the GRV proxies - uint64_t getThrottledTagChangeId() const; - - // For each tag and priority combination, return the throughput limit and expiration time - PrioritizedTransactionTagMap getClientRates(); - - int64_t autoThrottleCount() const; - uint32_t busyReadTagCount() const; - uint32_t busyWriteTagCount() const; - int64_t manualThrottleCount() const; - bool isAutoThrottlingEnabled() const; - - // Based on the busiest read and write tags in the provided storage queue info, update - // tag throttling limits. 
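// Illustrative aside: the point of extracting this interface is visible in the Ratekeeper
// constructor earlier in the patch, where SERVER_KNOBS->GLOBAL_TAG_THROTTLING selects the
// implementation. A minimal standalone sketch of that pattern with a trimmed-down
// interface (assumed names, not the real classes); the remaining methods of the interface
// are listed in the diff just below.
#include <memory>

struct ITagThrottlerLike {
    virtual ~ITagThrottlerLike() = default;
    virtual bool isAutoThrottlingEnabled() const = 0;
};

struct PerTagThrottler : ITagThrottlerLike { // stand-in for TagThrottler
    bool isAutoThrottlingEnabled() const override { return true; }
};

struct QuotaBasedThrottler : ITagThrottlerLike { // stand-in for GlobalTagThrottler
    bool isAutoThrottlingEnabled() const override { return true; }
};

// Mirrors the knob-guarded construction in Ratekeeper::Ratekeeper.
std::unique_ptr<ITagThrottlerLike> makeTagThrottler(bool globalTagThrottlingKnob) {
    if (globalTagThrottlingKnob) {
        return std::make_unique<QuotaBasedThrottler>();
    }
    return std::make_unique<PerTagThrottler>();
}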
- Future tryUpdateAutoThrottling(StorageQueueInfo const&); + Future monitorThrottlingChanges() override; + void addRequests(TransactionTag tag, int count) override; + uint64_t getThrottledTagChangeId() const override; + PrioritizedTransactionTagMap getClientRates() override; + int64_t autoThrottleCount() const override; + uint32_t busyReadTagCount() const override; + uint32_t busyWriteTagCount() const override; + int64_t manualThrottleCount() const override; + bool isAutoThrottlingEnabled() const override; + Future tryUpdateAutoThrottling(StorageQueueInfo const&) override; +}; + +class GlobalTagThrottler : public ITagThrottler { + PImpl impl; + +public: + GlobalTagThrottler(Database db, UID id); + ~GlobalTagThrottler(); + + Future monitorThrottlingChanges() override; + void addRequests(TransactionTag tag, int count) override; + uint64_t getThrottledTagChangeId() const override; + PrioritizedTransactionTagMap getClientRates() override; + int64_t autoThrottleCount() const override; + uint32_t busyReadTagCount() const override; + uint32_t busyWriteTagCount() const override; + int64_t manualThrottleCount() const override; + bool isAutoThrottlingEnabled() const override; + Future tryUpdateAutoThrottling(StorageQueueInfo const&) override; + + // testing only +public: + void setQuota(TransactionTagRef, ThrottleApi::TagQuotaValue const&); }; diff --git a/fdbserver/include/fdbserver/TransactionTagCounter.h b/fdbserver/include/fdbserver/TransactionTagCounter.h index d520259c5c..6e2b424e6f 100644 --- a/fdbserver/include/fdbserver/TransactionTagCounter.h +++ b/fdbserver/include/fdbserver/TransactionTagCounter.h @@ -20,25 +20,23 @@ #pragma once +#include "fdbclient/PImpl.h" #include "fdbclient/StorageServerInterface.h" #include "fdbclient/TagThrottle.actor.h" -#include "fdbserver/Knobs.h" class TransactionTagCounter { - TransactionTagMap intervalCounts; - int64_t intervalTotalSampledCount = 0; - TransactionTag busiestTag; - int64_t busiestTagCount = 0; - double intervalStart = 0; - - std::vector previousBusiestTags; - UID thisServerID; - Reference busiestReadTagEventHolder; + PImpl impl; public: TransactionTagCounter(UID thisServerID); - static int64_t costFunction(int64_t bytes) { return bytes / SERVER_KNOBS->READ_COST_BYTE_FACTOR + 1; } + ~TransactionTagCounter(); + + // Update counters tracking the busyness of each tag in the current interval void addRequest(Optional const& tags, int64_t bytes); + + // Save current set of busy tags and reset counters for next interval void startNewInterval(); - std::vector const& getBusiestTags() const { return previousBusiestTags; } + + // Returns the set of busiest tags as of the end of the last interval + std::vector const& getBusiestTags() const; }; diff --git a/fdbserver/include/fdbserver/workloads/TPCCWorkload.h b/fdbserver/include/fdbserver/workloads/TPCCWorkload.h deleted file mode 100644 index 466ec21666..0000000000 --- a/fdbserver/include/fdbserver/workloads/TPCCWorkload.h +++ /dev/null @@ -1,321 +0,0 @@ -/* - * TPCCWorkload.h - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef FDBSERVER_TPCCWORKLOAD_H -#define FDBSERVER_TPCCWORKLOAD_H -#pragma once -#include "flow/Arena.h" -#include "fdbclient/FDBTypes.h" -#include -#include - -namespace TPCCWorkload { - -// Schema -#define EXPAND(...) __VA_ARGS__ -#define EMPTY() -#define DEFER(x) x EMPTY() -// An indirection macro to avoid direct recursion -#define BOOST_PP_SEQ_FOR_EACH_ID() BOOST_PP_SEQ_FOR_EACH generators -#define ROW_CONCAT(prefix, name) prefix##name -#define ROW_TO_STRING(str) #str -#define ROW_ELEMENT_NAME(prefix, element) ROW_CONCAT(prefix, element) -#define ROW_MEMBER(r, data, elem) \ - BOOST_PP_TUPLE_ELEM(0, elem) \ - ROW_ELEMENT_NAME(data, BOOST_PP_TUPLE_ELEM(1, elem)); -#define ROW_MEMBERS_SEQ(prefix, seq) BOOST_PP_SEQ_FOR_EACH(ROW_MEMBER, prefix, seq) -#define ROW_MEMBERS(prefix, tuple) ROW_MEMBERS_SEQ(prefix, BOOST_PP_TUPLE_TO_SEQ(tuple)) - -#define ROW_SERIALIZE_ELEMENT(r, data, elem) , ROW_ELEMENT_NAME(data, BOOST_PP_TUPLE_ELEM(1, elem)) -#define ROW_SERIALIZE_ELEMENTS(prefix, seq) BOOST_PP_SEQ_FOR_EACH(ROW_SERIALIZE_ELEMENT, prefix, seq) -#define ROW_SERIALIZE(prefix, tuple) ar ROW_SERIALIZE_ELEMENTS(prefix, BOOST_PP_TUPLE_TO_SEQ(tuple)) - -#define ROW_KEY_STEP(r, data, elem) , ROW_ELEMENT_NAME(data, elem) -#define ROW_KEY_LIST_SEQ_EXP(prefix, seq) BOOST_PP_SEQ_FOR_EACH(ROW_KEY_STEP, prefix, seq) -#define ROW_KEY_LIST_SEQ(prefix, seq) ROW_KEY_LIST_SEQ_EXP(prefix, seq) -#define ROW_KEY_LIST(prefix, a) ROW_KEY_LIST_SEQ(prefix, BOOST_PP_ARRAY_TO_SEQ(a)) -#define ROW_KEY_LIST_TUPLE(prefix, tuple) ROW_KEY_LIST_SEQ(prefix, BOOST_PP_TUPLE_TO_SEQ(tuple)) - -#define ROW_KEY_HAS_KEY(Name, prefix, primary_key) \ - static constexpr bool HAS_KEY = true; \ - StringRef key() { \ - auto s = generateKey(#Name, KEY_SIZE ROW_KEY_LIST(prefix, primary_key)); \ - return StringRef(arena, s); \ - } \ - KeyRangeRef keyRange(int dontInclude) { \ - auto s = generateKey(#Name, KEY_SIZE - dontInclude ROW_KEY_LIST(prefix, primary_key)); \ - KeyRef begin = StringRef(arena, reinterpret_cast(s.c_str()), s.size() + 1); \ - KeyRef end = StringRef(arena, reinterpret_cast(s.c_str()), s.size() + 1); \ - auto sBegin = mutateString(begin); \ - sBegin[s.size()] = uint8_t('/'); \ - auto sEnd = mutateString(end); \ - sEnd[s.size()] = uint8_t('0'); \ - return KeyRangeRef(begin, end); \ - } -#define ROW_KEY_NO_KEY static constexpr bool HAS_KEY = false; -#define ROW_KEY_IMPL(Name, prefix, primary_key, sz) \ - BOOST_PP_IF(sz, ROW_KEY_HAS_KEY(Name, prefix, primary_key), ROW_KEY_NO_KEY) -#define ROW_KEY(Name, prefix, primary_key) ROW_KEY_IMPL(Name, prefix, primary_key, BOOST_PP_ARRAY_SIZE(primary_key)) - -#define ROW_INDEX_NAME_KEY(name) ROW_CONCAT(name, Key) -#define ROW_INDEX_NAME_IMPL2(name) ROW_TO_STRING(name) -#define ROW_INDEX_NAME_IMPL(indexName, name) ROW_INDEX_NAME_IMPL2(ROW_CONCAT(indexName, name)) -#define ROW_INDEX_NAME(nameTuple, index) \ - ROW_INDEX_NAME_IMPL(BOOST_PP_TUPLE_ELEM(0, index), BOOST_PP_TUPLE_ELEM(0, nameTuple)) -#define ROW_GENERATE_INDEX(r, data, index) \ - StringRef ROW_INDEX_NAME_KEY(BOOST_PP_TUPLE_ELEM(0, index))(int dontInclude = 0) { \ - auto s = generateKey(ROW_INDEX_NAME(data, 
index), \ - BOOST_PP_TUPLE_SIZE(index) - dontInclude - \ - 1 ROW_KEY_LIST_TUPLE(BOOST_PP_TUPLE_ELEM(1, data), BOOST_PP_TUPLE_POP_FRONT(index))); \ - return StringRef(arena, s); \ - } -#define ROW_GENERATE_INDEXES_LIST(Name, prefix, indexes) \ - BOOST_PP_LIST_FOR_EACH(ROW_GENERATE_INDEX, (Name, prefix), indexes) -#define ROW_GENERATE_INDEXES(Name, prefix, indexes) \ - ROW_GENERATE_INDEXES_LIST(Name, prefix, BOOST_PP_ARRAY_TO_LIST(indexes)) -#define ROW_INDEXES(Name, prefix, indexes) \ - BOOST_PP_IF(BOOST_PP_ARRAY_SIZE(indexes), ROW_GENERATE_INDEXES(Name, prefix, indexes), BOOST_PP_EMPTY()) - -#define ROW(Name, prefix, tuple, primary_key, indexes) \ - struct Name { \ - constexpr static FileIdentifier file_identifier = __COUNTER__; \ - Arena arena; \ - ROW_MEMBERS(prefix, tuple) \ - template \ - void serialize(Ar& ar) { \ - serializer(ROW_SERIALIZE(prefix, tuple)); \ - } \ - static constexpr int KEY_SIZE = BOOST_PP_ARRAY_SIZE(primary_key); \ - ROW_KEY(Name, prefix, primary_key) \ - ROW_INDEXES(Name, prefix, indexes) \ - } - -template -struct KeyStreamer { - void operator()(std::stringstream& ss, const Value& v) { ss << v; } -}; - -template <> -struct KeyStreamer { - void operator()(std::stringstream& ss, const StringRef& v) { ss << v.toString(); } -}; - -template <> -struct KeyStreamer { - void operator()(std::stringstream& ss, const int v) { ss << std::setfill('0') << std::setw(6) << v; } -}; - -template <> -struct KeyStreamer { - void operator()(std::stringstream& ss, const int v) { ss << std::setfill('0') << std::setw(6) << v; } -}; - -template -struct KeyGenerator; - -template -struct KeyGenerator { - static void generate(std::stringstream& ss, int max, Head h, Tail... tail) { - KeyStreamer streamer; - if (max > 0) { - ss << '/'; - streamer(ss, h); - KeyGenerator::generate(ss, max - 1, tail...); - } - } -}; - -template <> -struct KeyGenerator<> { - static void generate(std::stringstream&, int) {} -}; - -template -std::string generateKey(const std::string& table, int max, Values... 
values) { - std::stringstream ss; - ss << table; - if (max > 0) { - KeyGenerator::generate(ss, max, values...); - } - return ss.str(); -} - -ROW(Warehouse, - w_, - ((int, id), - (StringRef, name), - (StringRef, street_1), - (StringRef, street_2), - (StringRef, city), - (StringRef, state), - (StringRef, zip), - (double, tax), - (double, ytd)), - (1, (id)), - (0, ())); - -ROW(District, - d_, - ((int, id), - (int, w_id), - (StringRef, name), - (StringRef, street_1), - (StringRef, street_2), - (StringRef, city), - (StringRef, state), - (StringRef, zip), - (double, tax), - (double, ytd), - (int, next_o_id)), - (2, (w_id, id)), - (0, ())); - -ROW(Customer, - c_, - ((int, id), - (int, d_id), - (int, w_id), - (StringRef, first), - (StringRef, last), - (StringRef, middle), - (StringRef, street_1), - (StringRef, street_2), - (StringRef, city), - (StringRef, state), - (StringRef, zip), - (StringRef, phone), - (double, since), - (StringRef, credit), - (double, credit_lim), - (double, discount), - (double, balance), - (double, ytd_payment), - (unsigned, payment_cnt), - (unsigned, delivery_count), - (StringRef, data)), - (3, (w_id, d_id, id)), - (1, ((indexLast, w_id, d_id, last, id)))); - -ROW(History, - h_, - ((int, c_id), - (int, c_d_id), - (int, c_w_id), - (int, d_id), - (int, w_id), - (double, date), - (double, amount), - (StringRef, data)), - (0, ()), - (0, ())); - -ROW(NewOrder, no_, ((int, o_id), (int, d_id), (int, w_id)), (3, (w_id, d_id, o_id)), (0, ())); - -ROW(Order, - o_, - ((int, id), - (int, d_id), - (int, w_id), - (int, c_id), - (double, entry_d), - (Optional, carrier_id), - (short, ol_cnt), - (bool, all_local)), - (3, (w_id, d_id, id)), - (0, ())); - -ROW(OrderLine, - ol_, - ((int, o_id), - (int, d_id), - (int, w_id), - (short, number), - (int, i_id), - (int, supply_w_id), - (Optional, delivery_d), - (short, quantity), - (double, amount), - (StringRef, dist_info)), - (4, (w_id, d_id, o_id, number)), - (0, ())); - -ROW(Item, i_, ((int, id), (int, im_id), (StringRef, name), (double, price), (StringRef, data)), (1, (id)), (0, ())); - -ROW(Stock, - s_, - ((int, i_id), - (int, w_id), - (short, quantity), - (StringRef, dist_01), - (StringRef, dist_02), - (StringRef, dist_03), - (StringRef, dist_04), - (StringRef, dist_05), - (StringRef, dist_06), - (StringRef, dist_07), - (StringRef, dist_08), - (StringRef, dist_09), - (StringRef, dist_10), - (int, ytd), - (short, order_cnt), - (short, remote_cnt), - (StringRef, data)), - (2, (w_id, i_id)), - (0, ())); - -#undef FLOW_ACOMPILER_STATE -#define FLOW_ACOMPILER_STATE 1 - -struct GlobalState { - constexpr static FileIdentifier file_identifier = 1064821; - int CLoad, CRun, CDelta, CId, COlIID; - - GlobalState() { - CLoad = deterministicRandom()->randomInt(0, 256); - while (true) { - CDelta = deterministicRandom()->randomInt(65, 120); - if (!(CDelta == 96 || CDelta == 112)) { - break; - } - } - - if (CDelta > CLoad) { - CRun = CLoad + CDelta; - } else { - CRun = deterministicRandom()->coinflip() ? 
CLoad + CDelta : CLoad - CDelta; } - CId = deterministicRandom()->randomInt(1, 3001); - COlIID = deterministicRandom()->randomInt(1, 100001); - } - - template - void serialize(Ar& ar) { - serializer(ar, CLoad, CRun, CDelta, CId, COlIID); - } - - StringRef key() const { return LiteralStringRef("GlobalState"); } -}; - -const std::vector syllables = { - "BAR", "UGHT", "ABLE", "RI", "PRES", "SE", "ANTI", "ALLY", "ATION", "ING", -}; - -} // namespace TPCCWorkload - -#endif diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 3e4bf9c6e2..163420a091 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -52,7 +52,9 @@ #include "fdbclient/SystemData.h" #include "fdbclient/TransactionLineage.h" #include "fdbclient/VersionedMap.h" +#include "fdbserver/EncryptedMutationMessage.h" #include "fdbserver/FDBExecHelper.actor.h" +#include "fdbserver/GetEncryptCipherKeys.h" #include "fdbserver/IKeyValueStore.h" #include "fdbserver/Knobs.h" #include "fdbserver/LatencyBandConfig.h" @@ -112,7 +114,7 @@ bool canReplyWith(Error e) { return true; default: return false; - }; + } } } // namespace @@ -1673,7 +1675,7 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { } return Void(); -}; +} // Pessimistic estimate the number of overhead bytes used by each // watch. Watch key references are stored in an AsyncMap, and actors @@ -2935,7 +2937,7 @@ ACTOR Future quickGetValue(StorageServer* data, } else { throw quick_get_value_miss(); } -}; +} // If limit>=0, it returns the first rows in the range (sorted ascending), otherwise the last rows (sorted descending). // readRange has O(|result|) + O(log |data|) cost @@ -3549,7 +3551,7 @@ ACTOR Future quickGetKeyValues( } else { throw quick_get_key_values_miss(); } -}; +} void unpackKeyTuple(Tuple** referenceTuple, Optional& keyTuple, KeyValueRef* keyValue) { if (!keyTuple.present()) { @@ -3798,6 +3800,36 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") { return Void(); } +// Issues a secondary query (either a range or a point read) and fills results into "kvm". +ACTOR Future mapSubquery(StorageServer* data, + Version version, + GetMappedKeyValuesRequest* pOriginalReq, + Arena* pArena, + int matchIndex, + bool isRangeQuery, + bool isBoundary, + KeyValueRef* it, + MappedKeyValueRef* kvm, + Key mappedKey) { + if (isRangeQuery) { + // Use the mappedKey as the prefix of the range query.
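+ // Illustrative example (editor's sketch, not part of this patch): for a secondary index, the mapper tuple might be { "record"_sr, "{K[2]}"_sr, "{...}"_sr }; applied to an index entry whose key encodes the tuple ("index", "alice", "r1"), constructMappedKey() yields a mappedKey encoding ("record", "r1"), and the range read below then returns every key-value pair under that prefix.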
+ GetRangeReqAndResultRef getRange = wait(quickGetKeyValues(data, mappedKey, version, pArena, pOriginalReq)); + if ((!getRange.result.empty() && matchIndex == MATCH_INDEX_MATCHED_ONLY) || + (getRange.result.empty() && matchIndex == MATCH_INDEX_UNMATCHED_ONLY)) { + kvm->key = it->key; + kvm->value = it->value; + } + + kvm->boundaryAndExist = isBoundary && !getRange.result.empty(); + kvm->reqAndResult = getRange; + } else { + GetValueReqAndResultRef getValue = wait(quickGetValue(data, mappedKey, version, pArena, pOriginalReq)); + kvm->reqAndResult = getValue; + kvm->boundaryAndExist = isBoundary && getValue.result.present(); + } + return Void(); +} + ACTOR Future mapKeyValues(StorageServer* data, GetKeyValuesReply input, StringRef mapper, @@ -3827,43 +3859,49 @@ ACTOR Future mapKeyValues(StorageServer* data, preprocessMappedKey(mappedKeyFormatTuple, vt, isRangeQuery); state int sz = input.data.size(); - state int i = 0; - for (; i < sz; i++) { - state KeyValueRef* it = &input.data[i]; - state MappedKeyValueRef kvm; - state bool isBoundary = i == 0 || i == sz - 1; - // need to keep the boundary, so that caller can use it as a continuation. - if (isBoundary || matchIndex == MATCH_INDEX_ALL) { - kvm.key = it->key; - kvm.value = it->value; - } - - state Key mappedKey = constructMappedKey(it, vt, mappedKeyTuple, mappedKeyFormatTuple); - // Make sure the mappedKey is always available, so that it's good even we want to get key asynchronously. - result.arena.dependsOn(mappedKey.arena()); - - // std::cout << "key:" << printable(kvm.key) << ", value:" << printable(kvm.value) - // << ", mappedKey:" << printable(mappedKey) << std::endl; - - if (isRangeQuery) { - // Use the mappedKey as the prefix of the range query. - GetRangeReqAndResultRef getRange = - wait(quickGetKeyValues(data, mappedKey, input.version, &(result.arena), pOriginalReq)); - if ((!getRange.result.empty() && matchIndex == MATCH_INDEX_MATCHED_ONLY) || - (getRange.result.empty() && matchIndex == MATCH_INDEX_UNMATCHED_ONLY)) { - kvm.key = it->key; - kvm.value = it->value; + const int k = std::min(sz, SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE); + state std::vector kvms(k); + state std::vector> subqueries; + state int offset = 0; + for (; offset < sz; offset += SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE) { + // Divide into batches of MAX_PARALLEL_QUICK_GET_VALUE subqueries + for (int i = 0; i + offset < sz && i < SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE; i++) { + KeyValueRef* it = &input.data[i + offset]; + MappedKeyValueRef* kvm = &kvms[i]; + bool isBoundary = (i + offset) == 0 || (i + offset) == sz - 1; + // need to keep the boundary, so that the caller can use it as a continuation. + if (isBoundary || matchIndex == MATCH_INDEX_ALL) { + kvm->key = it->key; + kvm->value = it->value; + } else { + // Clear key and value to their defaults. + kvm->key = ""_sr; + kvm->value = ""_sr; } - kvm.boundaryAndExist = isBoundary && !getRange.result.empty(); - kvm.reqAndResult = getRange; - } else { - GetValueReqAndResultRef getValue = - wait(quickGetValue(data, mappedKey, input.version, &(result.arena), pOriginalReq)); - kvm.reqAndResult = getValue; - kvm.boundaryAndExist = isBoundary && getValue.result.present(); + Key mappedKey = constructMappedKey(it, vt, mappedKeyTuple, mappedKeyFormatTuple); + // Make sure the mappedKey is always available, so that it remains valid even if we fetch the key asynchronously.
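+ // (That guarantee comes from the dependsOn() call below: the subqueries issued for this batch run concurrently and may still read mappedKey after this loop iteration finishes, so its backing memory must live at least as long as the reply's arena.)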
+ result.arena.dependsOn(mappedKey.arena()); + + // std::cout << "key:" << printable(kvm->key) << ", value:" << printable(kvm->value) + // << ", mappedKey:" << printable(mappedKey) << std::endl; + + subqueries.push_back(mapSubquery(data, + input.version, + pOriginalReq, + &result.arena, + matchIndex, + isRangeQuery, + isBoundary, + it, + kvm, + mappedKey)); + } + wait(waitForAll(subqueries)); + subqueries.clear(); + for (int i = 0; i + offset < sz && i < SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE; i++) { + result.data.push_back(result.arena, kvms[i]); } - result.data.push_back(result.arena, kvm); } return result; } @@ -6225,7 +6263,7 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { } return Void(); -}; +} AddingShard::AddingShard(StorageServer* server, KeyRangeRef const& keys) : keys(keys), server(server), transferredVersion(invalidVersion), fetchVersion(invalidVersion), phase(WaitPrevious) { @@ -6948,7 +6986,7 @@ void StorageServer::insertTenant(TenantNameRef tenantName, tenantMap.createNewVersion(version); tenantPrefixIndex.createNewVersion(version); - TenantMapEntry tenantEntry = decodeTenantEntry(value); + TenantMapEntry tenantEntry = TenantMapEntry::decode(value); tenantMap.insert(tenantName, tenantEntry); tenantPrefixIndex.insert(tenantEntry.prefix, tenantName); @@ -7094,7 +7132,11 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { state UpdateEagerReadInfo eager; state FetchInjectionInfo fii; state Reference cloneCursor2; + state Optional>> cipherKeys; + state bool collectingCipherKeys = false; + // Collect eager read keys. + // If an encrypted mutation is encountered, we collect its cipher details and fetch the cipher keys, then start over. loop { state uint64_t changeCounter = data->shardChangeCounter; bool epochEnd = false; @@ -7102,6 +7144,8 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { bool firstMutation = true; bool dbgLastMessageWasProtocol = false; + std::unordered_set cipherDetails; + Reference cloneCursor1 = cursor->cloneNoMore(); cloneCursor2 = cursor->cloneNoMore(); @@ -7124,47 +7168,72 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { OTELSpanContextMessage::isNextIn(cloneReader)) { OTELSpanContextMessage scm; cloneReader >> scm; + } else if (cloneReader.protocolVersion().hasEncryptionAtRest() && + EncryptedMutationMessage::isNextIn(cloneReader) && !cipherKeys.present()) { + // Encrypted mutation found, but cipher keys haven't been fetched yet. + // Collect cipher details to fetch cipher keys in one batch.
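+ // Sketch of the two-phase flow: this first pass only records each encrypted mutation's { cipherTextDetails, cipherHeaderDetails } in cipherDetails; after the scan, getEncryptCipherKeys() fetches all referenced keys in a single batch, cipherKeys becomes present, and the enclosing loop restarts so the second pass can decrypt each EncryptedMutationMessage inline.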
+ EncryptedMutationMessage emm; + cloneReader >> emm; + cipherDetails.insert(emm.header.cipherTextDetails); + cipherDetails.insert(emm.header.cipherHeaderDetails); + collectingCipherKeys = true; } else { MutationRef msg; - cloneReader >> msg; + if (cloneReader.protocolVersion().hasEncryptionAtRest() && + EncryptedMutationMessage::isNextIn(cloneReader)) { + ASSERT(cipherKeys.present()); + msg = EncryptedMutationMessage::decrypt(cloneReader, eager.arena, cipherKeys.get()); + } else { + cloneReader >> msg; + } // TraceEvent(SevDebug, "SSReadingLog", data->thisServerID).detail("Mutation", msg); - if (firstMutation && msg.param1.startsWith(systemKeys.end)) - hasPrivateData = true; - firstMutation = false; + if (!collectingCipherKeys) { + if (firstMutation && msg.param1.startsWith(systemKeys.end)) + hasPrivateData = true; + firstMutation = false; - if (msg.param1 == lastEpochEndPrivateKey) { - epochEnd = true; - ASSERT(dbgLastMessageWasProtocol); + if (msg.param1 == lastEpochEndPrivateKey) { + epochEnd = true; + ASSERT(dbgLastMessageWasProtocol); + } + + eager.addMutation(msg); + dbgLastMessageWasProtocol = false; } - - eager.addMutation(msg); - dbgLastMessageWasProtocol = false; } } - // Any fetchKeys which are ready to transition their shards to the adding,transferred state do so now. - // If there is an epoch end we skip this step, to increase testability and to prevent inserting a - // version in the middle of a rolled back version range. - while (!hasPrivateData && !epochEnd && !data->readyFetchKeys.empty()) { - auto fk = data->readyFetchKeys.back(); - data->readyFetchKeys.pop_back(); - fk.send(&fii); - // fetchKeys() would put the data it fetched into the fii. The thread will not return back to this - // actor until it was completed. + if (collectingCipherKeys) { + std::unordered_map> getCipherKeysResult = + wait(getEncryptCipherKeys(data->db, cipherDetails)); + cipherKeys = getCipherKeysResult; + collectingCipherKeys = false; + eager = UpdateEagerReadInfo(); + } else { + // Any fetchKeys which are ready to transition their shards to the adding, transferred state do so now. + // If there is an epoch end we skip this step, to increase testability and to prevent inserting a + // version in the middle of a rolled back version range. + while (!hasPrivateData && !epochEnd && !data->readyFetchKeys.empty()) { + auto fk = data->readyFetchKeys.back(); + data->readyFetchKeys.pop_back(); + fk.send(&fii); + // fetchKeys() puts the data it fetches into fii. Control will not return to this + // actor until the fetch has completed. + } + + for (auto& c : fii.changes) + eager.addMutations(c.mutations); + + wait(doEagerReads(data, &eager)); + if (data->shardChangeCounter == changeCounter) + break; + TEST(true); // A fetchKeys completed while we were doing this, so eager might be outdated. Read it + // again.
- // SOMEDAY: Theoretically we could check the change counters of individual shards and retry the reads - // only selectively - eager = UpdateEagerReadInfo(); } data->eagerReadsLatencyHistogram->sampleSeconds(now() - start); @@ -7257,7 +7326,12 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { spanContext = scm.spanContext; } else { MutationRef msg; - rd >> msg; + if (rd.protocolVersion().hasEncryptionAtRest() && EncryptedMutationMessage::isNextIn(rd)) { + ASSERT(cipherKeys.present()); + msg = EncryptedMutationMessage::decrypt(rd, rd.arena(), cipherKeys.get()); + } else { + rd >> msg; + } Span span("SS:update"_loc, spanContext); span.addAttribute("key"_sr, msg.param1); @@ -7437,7 +7511,9 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { return Void(); // update will get called again ASAP } catch (Error& err) { state Error e = err; - if (e.code() != error_code_worker_removed && e.code() != error_code_please_reboot) { + if (e.code() == error_code_encrypt_keys_fetch_failed) { + TraceEvent(SevWarn, "SSUpdateError", data->thisServerID).error(e).backtrace(); + } else if (e.code() != error_code_worker_removed && e.code() != error_code_please_reboot) { TraceEvent(SevError, "SSUpdateError", data->thisServerID).error(e).backtrace(); } else if (e.code() == error_code_please_reboot) { wait(data->durableInProgress); @@ -7791,7 +7867,7 @@ void StorageServerDisk::makeNewStorageServerDurable() { auto view = data->tenantMap.atLatest(); for (auto itr = view.begin(); itr != view.end(); ++itr) { - storage->set(KeyValueRef(itr.key().withPrefix(persistTenantMapKeys.begin), encodeTenantEntry(*itr))); + storage->set(KeyValueRef(itr.key().withPrefix(persistTenantMapKeys.begin), itr->encode())); } } @@ -8272,7 +8348,7 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor for (tenantMapLoc = 0; tenantMapLoc < tenantMap.size(); tenantMapLoc++) { auto const& result = tenantMap[tenantMapLoc]; TenantName tenantName = result.key.substr(persistTenantMapKeys.begin.size()); - TenantMapEntry tenantEntry = decodeTenantEntry(result.value); + TenantMapEntry tenantEntry = TenantMapEntry::decode(result.value); data->tenantMap.insert(tenantName, tenantEntry); data->tenantPrefixIndex.insert(tenantEntry.prefix, tenantName); diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 314e9e9053..c6fe9a594f 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -1415,10 +1415,16 @@ ACTOR Future traceRole(Role role, UID roleId) { } } -ACTOR Future workerSnapCreate(WorkerSnapRequest snapReq, Standalone snapFolder) { +ACTOR Future workerSnapCreate( + WorkerSnapRequest snapReq, + std::string snapFolder, + std::map* snapReqMap /* ongoing snapshot requests */, + std::map>* + snapReqResultMap /* finished snapshot requests, expired in SNAP_MINIMUM_TIME_GAP seconds */) { state ExecCmdValueString snapArg(snapReq.snapPayload); + state std::string snapReqKey = snapReq.snapUID.toString() + snapReq.role.toString(); try { - int err = wait(execHelper(&snapArg, snapReq.snapUID, snapFolder.toString(), snapReq.role.toString())); + int err = wait(execHelper(&snapArg, snapReq.snapUID, snapFolder, snapReq.role.toString())); std::string uidStr = snapReq.snapUID.toString(); TraceEvent("ExecTraceWorker") .detail("Uid", uidStr) @@ -1432,11 +1438,15 @@ ACTOR Future workerSnapCreate(WorkerSnapRequest snapReq, Standaloneat(snapReqKey).reply.send(Void()); + snapReqMap->erase(snapReqKey); + (*snapReqResultMap)[snapReqKey] = ErrorOr(Void()); } catch (Error& e) { 
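+ // Failure path: unless this snapshot actor was cancelled, answer the tracked request with the error and record it in snapReqResultMap, so that a retry of the same UID+role request receives the same result rather than re-running the snapshot.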
TraceEvent("ExecHelperError").errorUnsuppressed(e); if (e.code() != error_code_operation_cancelled) { - snapReq.reply.sendError(e); + snapReqMap->at(snapReqKey).reply.sendError(e); + snapReqMap->erase(snapReqKey); + (*snapReqResultMap)[snapReqKey] = ErrorOr(e); } else { throw e; } @@ -1584,6 +1594,11 @@ ACTOR Future workerServer(Reference connRecord, state WorkerCache backupWorkerCache; state WorkerCache blobWorkerCache; + state WorkerSnapRequest lastSnapReq; + // Here the key is UID+role, as we still send duplicate requests to a process which is both storage and tlog + state std::map snapReqMap; + state std::map> snapReqResultMap; + state double lastSnapTime = -SERVER_KNOBS->SNAP_MINIMUM_TIME_GAP; // always successful for the first Snap Request state std::string coordFolder = abspath(_coordFolder); state WorkerInterface interf(locality); @@ -2497,11 +2512,49 @@ ACTOR Future workerServer(Reference connRecord, loggingTrigger = delay(loggingDelay, TaskPriority::FlushTrace); } when(state WorkerSnapRequest snapReq = waitNext(interf.workerSnapReq.getFuture())) { - Standalone snapFolder = StringRef(folder); - if (snapReq.role.toString() == "coord") { - snapFolder = coordFolder; + std::string snapUID = snapReq.snapUID.toString() + snapReq.role.toString(); + if (snapReqResultMap.count(snapUID)) { + TEST(true); // Worker received a duplicate finished snap request + auto result = snapReqResultMap[snapUID]; + result.isError() ? snapReq.reply.sendError(result.getError()) : snapReq.reply.send(result.get()); + TraceEvent("RetryFinishedWorkerSnapRequest") + .detail("SnapUID", snapUID) + .detail("Role", snapReq.role) + .detail("Result", result.isError() ? result.getError().code() : 0); + } else if (snapReqMap.count(snapUID)) { + TEST(true); // Worker received a duplicate ongoing snap request + TraceEvent("RetryOngoingWorkerSnapRequest").detail("SnapUID", snapUID).detail("Role", snapReq.role); + ASSERT(snapReq.role == snapReqMap[snapUID].role); + ASSERT(snapReq.snapPayload == snapReqMap[snapUID].snapPayload); + snapReqMap[snapUID] = snapReq; + } else { + snapReqMap[snapUID] = snapReq; // set map point to the request + if (g_network->isSimulated() && (now() - lastSnapTime) < SERVER_KNOBS->SNAP_MINIMUM_TIME_GAP) { + // only allow duplicate snapshots on same process in a short time for different roles + auto okay = (lastSnapReq.snapUID == snapReq.snapUID) && lastSnapReq.role != snapReq.role; + TraceEvent(okay ? SevInfo : SevError, "RapidSnapRequestsOnSameProcess") + .detail("CurrSnapUID", snapUID) + .detail("PrevSnapUID", lastSnapReq.snapUID) + .detail("CurrRole", snapReq.role) + .detail("PrevRole", lastSnapReq.role) + .detail("GapTime", now() - lastSnapTime); + } + errorForwarders.add(workerSnapCreate(snapReq, + snapReq.role.toString() == "coord" ? 
coordFolder : folder, + &snapReqMap, + &snapReqResultMap)); + auto* snapReqResultMapPtr = &snapReqResultMap; + errorForwarders.add(fmap( + [snapReqResultMapPtr, snapUID](Void _) { + snapReqResultMapPtr->erase(snapUID); + return Void(); + }, + delay(SERVER_KNOBS->SNAP_MINIMUM_TIME_GAP))); + if (g_network->isSimulated()) { + lastSnapReq = snapReq; + lastSnapTime = now(); + } } - errorForwarders.add(workerSnapCreate(snapReq, snapFolder)); } when(wait(errorForwarders.getResult())) {} when(wait(handleErrors)) {} diff --git a/fdbserver/workloads/GlobalTagThrottling.actor.cpp b/fdbserver/workloads/GlobalTagThrottling.actor.cpp new file mode 100644 index 0000000000..41ea8a630e --- /dev/null +++ b/fdbserver/workloads/GlobalTagThrottling.actor.cpp @@ -0,0 +1,74 @@ +/* + * GlobalTagThrottling.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/TagThrottle.actor.h" +#include "fdbserver/workloads/workloads.actor.h" + +#include "flow/actorcompiler.h" // This must be the last #include. + +class GlobalTagThrottlingWorkload : public TestWorkload { + TransactionTag transactionTag; + double reservedReadQuota{ 0.0 }; + double totalReadQuota{ 0.0 }; + double reservedWriteQuota{ 0.0 }; + double totalWriteQuota{ 0.0 }; + + ACTOR static Future setup(GlobalTagThrottlingWorkload* self, Database cx) { + state Reference tr = makeReference(cx); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + TraceEvent("GlobalTagThrottlingWorkload_SettingTagQuota") + .detail("Tag", self->transactionTag) + .detail("ReservedReadQuota", self->reservedReadQuota) + .detail("TotalReadQuota", self->totalReadQuota) + .detail("ReservedWriteQuota", self->reservedWriteQuota) + .detail("TotalWriteQuota", self->totalWriteQuota); + ThrottleApi::setTagQuota(tr, + self->transactionTag, + self->reservedReadQuota, + self->totalReadQuota, + self->reservedWriteQuota, + self->totalWriteQuota); + wait(tr->commit()); + return Void(); + } catch (Error& e) { + wait(tr->onError(e)); + } + }; + } + +public: + explicit GlobalTagThrottlingWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { + transactionTag = getOption(options, "transactionTag"_sr, "sampleTag"_sr); + reservedReadQuota = getOption(options, "reservedReadQuota"_sr, 0.0); + totalReadQuota = getOption(options, "totalReadQuota"_sr, 0.0); + reservedWriteQuota = getOption(options, "reservedWriteQuota"_sr, 0.0); + totalWriteQuota = getOption(options, "totalWriteQuota"_sr, 0.0); + } + + std::string description() const override { return "GlobalTagThrottling"; } + Future setup(Database const& cx) override { return clientId ? 
Void() : setup(this, cx); } + Future start(Database const& cx) override { return Void(); } + Future check(Database const& cx) override { return true; } + void getMetrics(std::vector& m) override {} +}; + +WorkloadFactory GlobalTagThrottlingWorkloadFactory("GlobalTagThrottling"); diff --git a/fdbserver/workloads/PopulateTPCC.actor.cpp b/fdbserver/workloads/PopulateTPCC.actor.cpp deleted file mode 100644 index eb0b7c4ea4..0000000000 --- a/fdbserver/workloads/PopulateTPCC.actor.cpp +++ /dev/null @@ -1,521 +0,0 @@ -/* - * TPCC.actor.cpp - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "flow/Arena.h" -#include "fdbserver/workloads/workloads.actor.h" -#include "fdbserver/QuietDatabase.h" -#include "fdbserver/workloads/TPCCWorkload.h" -#include "fdbserver/ServerDBInfo.h" - -#include "fdbclient/ReadYourWrites.h" -#include "flow/actorcompiler.h" // needs to be last include - -#undef FLOW_ACOMPILER_STATE -#define FLOW_ACOMPILER_STATE 1 - -using namespace TPCCWorkload; - -namespace { - -constexpr char alphaNumerics[] = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', - 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', - 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', - 'W', 'X', 'Y', 'Z', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0' }; -constexpr char numerics[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' }; - -constexpr const char* originalString = "ORIGINAL"; - -struct PopulateTPCC : TestWorkload { - static constexpr const char* DESCRIPTION = "PopulateTPCC"; - - int actorsPerClient; - int warehousesPerActor; - int clientsUsed; - - GlobalState gState; - - PopulateTPCC(WorkloadContext const& ctx) : TestWorkload(ctx) { - std::string workloadName = DESCRIPTION; - actorsPerClient = getOption(options, LiteralStringRef("actorsPerClient"), 10); - warehousesPerActor = getOption(options, LiteralStringRef("warehousesPerActor"), 30); - clientsUsed = getOption(options, LiteralStringRef("clientsUsed"), 2); - } - - int NURand(int C, int A, int x, int y) { - return (((deterministicRandom()->randomInt(0, A + 1) | deterministicRandom()->randomInt(x, y + 1)) + C) % - (y - x + 1)) + - x; - } - - StringRef aString(Arena& arena, int x, int y) { - int length = deterministicRandom()->randomInt(x, y + 1); - char* res = new (arena) char[length]; - for (int i = 0; i < length; ++i) { - res[i] = alphaNumerics[deterministicRandom()->randomInt(0, sizeof(alphaNumerics))]; - } - return StringRef(reinterpret_cast(res), length); - } - - StringRef nString(Arena& arena, int x, int y) { - int length = deterministicRandom()->randomInt(x, y + 1); - char* res = new (arena) char[length]; - for (int i = 0; i < length; ++i) { - res[i] = numerics[deterministicRandom()->randomInt(0, sizeof(numerics))]; - } - return StringRef(reinterpret_cast(res), length); - } - - 
StringRef genCLast(Arena& arena, int x) { - int l = x % 10; - x /= 10; - int m = x % 10; - x /= 10; - int f = x % 10; - std::stringstream ss; - ss << syllables[f] << syllables[m] << syllables[l]; - return StringRef(arena, ss.str()); - } - - StringRef rndZip(Arena& arena) { - char* result = new (arena) char[9]; - for (int i = 0; i < 4; ++i) { - result[i] = numerics[deterministicRandom()->randomInt(0, sizeof(numerics))]; - } - for (int i = 4; i < 9; ++i) { - result[i] = '1'; - } - return StringRef(reinterpret_cast(result), 9); - } - - StringRef dataString(Arena& arena) { - if (deterministicRandom()->random01() < 0.1) { - auto str = aString(arena, 26, 51 - strlen(originalString)); - char* r = new (arena) char[str.size() + strlen(originalString)]; - int pos = deterministicRandom()->randomInt(0, str.size()); - std::copy(originalString, originalString + strlen(originalString), r + pos); - auto res = reinterpret_cast(r); - std::copy(str.begin(), str.begin() + pos, res); - std::copy(str.begin() + pos, str.end(), res + pos + strlen(originalString)); - return StringRef(res, str.size() + strlen(originalString)); - } else { - return aString(arena, 26, 51); - } - } - - ACTOR static Future writeGlobalState(PopulateTPCC* self, Database cx) { - state ReadYourWritesTransaction tr(cx); - loop { - tr.reset(); - try { - BinaryWriter writer(IncludeVersion()); - serializer(writer, self->gState); - tr.set(self->gState.key(), writer.toValue()); - wait(tr.commit()); - return Void(); - } catch (Error& e) { - wait(tr.onError(e)); - } - } - } - - ACTOR static Future readGlobalState(PopulateTPCC* self, Database cx) { - state ReadYourWritesTransaction tr(cx); - loop { - tr.reset(); - try { - Optional val = wait(tr.get(self->gState.key())); - if (val.present()) { - BinaryReader reader(val.get(), IncludeVersion()); - serializer(reader, self->gState); - } else { - wait(delay(1.0)); - } - return Void(); - } catch (Error& e) { - wait(tr.onError(e)); - } - } - } - - std::string description() const override { return DESCRIPTION; } - - ACTOR static Future populateItems(PopulateTPCC* self, Database cx) { - state Transaction tr(cx); - state int itemStart = 0; - state int i_id; - for (; itemStart < 100000; itemStart += 100) { - TraceEvent("PopulateItems").detail("Status", itemStart); - loop { - try { - tr.reset(); - for (i_id = itemStart; i_id < itemStart + 100; ++i_id) { - Item item; - item.i_id = i_id; - item.i_im_id = deterministicRandom()->randomInt(1, 10001); - item.i_name = self->aString(item.arena, 14, 25); - item.i_price = deterministicRandom()->randomInt64(1.0, 100.0); - item.i_data = self->dataString(item.arena); - BinaryWriter w(IncludeVersion()); - serializer(w, item); - tr.set(item.key(), w.toValue(), AddConflictRange::False); - } - wait(tr.commit()); - break; - } catch (Error& e) { - TraceEvent("PopulateItemsHandleError").error(e); - wait(tr.onError(e)); - } - } - } - TraceEvent("PopulateItemsDone").log(); - return Void(); - } - - ACTOR static Future populateCustomers(PopulateTPCC* self, Database cx, int w_id, int d_id) { - state Transaction tr(cx); - state int cStart; - state int c_id; - for (cStart = 0; cStart < 3000; cStart += 100) { - TraceEvent("PopulateCustomers") - .detail("Warehouse", w_id) - .detail("District", d_id) - .detail("Customer", cStart); - loop { - for (c_id = cStart; c_id < cStart + 100; ++c_id) { - Customer c; - History h; - c.c_id = c_id; - c.c_d_id = d_id; - c.c_w_id = w_id; - if (c_id < 1000) { - c.c_last = self->genCLast(c.arena, c_id); - } else { - c.c_last = self->genCLast(c.arena, 
self->NURand(self->gState.CLoad, 255, 0, 999)); - } - c.c_middle = LiteralStringRef("OE"); - c.c_first = self->aString(c.arena, 8, 16); - c.c_street_1 = self->aString(c.arena, 10, 20); - c.c_street_2 = self->aString(c.arena, 10, 20); - c.c_city = self->aString(c.arena, 10, 20); - c.c_state = self->aString(c.arena, 2, 2); - c.c_zip = self->rndZip(c.arena); - c.c_phone = self->nString(c.arena, 16, 16); - c.c_since = g_network->now(); - if (deterministicRandom()->random01() < 0.1) { - c.c_credit = LiteralStringRef("BC"); - } else { - c.c_credit = LiteralStringRef("GC"); - } - c.c_credit_lim = 50000; - c.c_discount = deterministicRandom()->random01() / 2.0; - c.c_balance = -10.0; - c.c_ytd_payment = 10.0; - c.c_payment_cnt = 1; - c.c_delivery_count = 0; - c.c_data = self->aString(c.arena, 300, 500); - - h.h_c_id = c_id; - h.h_c_d_id = d_id; - h.h_d_id = d_id; - h.h_w_id = w_id; - h.h_c_w_id = w_id; - h.h_date = g_network->now(); - h.h_amount = 10.0; - h.h_data = self->aString(c.arena, 12, 24); - { - BinaryWriter w(IncludeVersion()); - serializer(w, c); - tr.set(c.key(), w.toValue(), AddConflictRange::False); - } - { - // Write index - tr.set(c.indexLastKey(), c.key(), AddConflictRange::False); - } - { - BinaryWriter w(IncludeVersion()); - serializer(w, h); - UID k = deterministicRandom()->randomUniqueID(); - BinaryWriter kW(Unversioned()); - serializer(kW, k); - auto key = kW.toValue().withPrefix(LiteralStringRef("History/")); - tr.set(key, w.toValue(), AddConflictRange::False); - } - } - try { - wait(tr.commit()); - break; - } catch (Error& e) { - TraceEvent("PopulateCustomerHandleError").error(e); - wait(tr.onError(e)); - } - } - } - TraceEvent("PopulateCustomersDone").detail("Warehouse", w_id).detail("District", d_id); - return Void(); - } - - ACTOR static Future populateOrders(PopulateTPCC* self, Database cx, int w_id, int d_id) { - state Transaction tr(cx); - state std::vector customerIds; - state int idStart; - state int o_id; - customerIds.reserve(3000); - for (int i = 0; i < 3000; ++i) { - customerIds.push_back(i); - } - deterministicRandom()->randomShuffle(customerIds); - for (idStart = 0; idStart < 3000; idStart += 100) { - TraceEvent("PopulateOrders").detail("Warehouse", w_id).detail("District", d_id).detail("Order", idStart); - loop { - tr.reset(); - for (o_id = idStart; o_id < idStart + 100; ++o_id) { - Order o; - o.o_id = o_id; - o.o_c_id = customerIds[o_id]; - o.o_d_id = d_id; - o.o_w_id = w_id; - o.o_entry_d = g_network->now(); - if (o_id < 2100) { - o.o_carrier_id = deterministicRandom()->randomInt(1, 11); - } - o.o_ol_cnt = deterministicRandom()->randomInt(5, 16); - o.o_all_local = true; - for (int ol_number = 0; ol_number < o.o_ol_cnt; ++ol_number) { - OrderLine ol; - ol.ol_o_id = o_id; - ol.ol_d_id = d_id; - ol.ol_w_id = w_id; - ol.ol_number = ol_number; - ol.ol_i_id = deterministicRandom()->randomInt(0, 100000); - ol.ol_supply_w_id = w_id; - if (o_id < 2100) { - ol.ol_delivery_d = g_network->now(); - ol.ol_amount = 0.0; - } else { - ol.ol_amount = deterministicRandom()->random01() * 10000.0; - } - ol.ol_quantity = 5; - ol.ol_dist_info = self->aString(ol.arena, 24, 24); - BinaryWriter w(IncludeVersion()); - serializer(w, ol); - tr.set(ol.key(), w.toValue(), AddConflictRange::False); - } - BinaryWriter w(IncludeVersion()); - serializer(w, o); - tr.set(o.key(), w.toValue(), AddConflictRange::False); - } - try { - wait(tr.commit()); - break; - } catch (Error& e) { - TraceEvent("PopulateOrderHandleError").error(e); - wait(tr.onError(e)); - } - } - } - 
TraceEvent("PopulateOrdersDone").detail("Warehouse", w_id).detail("District", d_id); - return Void(); - } - - ACTOR static Future populateNewOrders(PopulateTPCC* self, Database cx, int w_id, int d_id) { - state Transaction tr(cx); - TraceEvent("PopulateNewOrders").detail("Warehouse", w_id).detail("District", d_id); - loop { - tr.reset(); - for (int i = 2100; i < 3000; ++i) { - NewOrder no; - no.no_o_id = i; - no.no_d_id = d_id; - no.no_w_id = w_id; - BinaryWriter w(IncludeVersion()); - serializer(w, no); - tr.set(no.key(), w.toValue(), AddConflictRange::False); - } - try { - wait(tr.commit()); - break; - } catch (Error& e) { - TraceEvent("PopulateNewOrderHandleError").error(e); - wait(tr.onError(e)); - } - } - TraceEvent("PopulateNewOrdersDone").detail("Warehouse", w_id).detail("District", d_id); - return Void(); - } - - ACTOR static Future populateDistricts(PopulateTPCC* self, Database cx, int w_id) { - state Transaction tr(cx); - state int d_id; - for (d_id = 0; d_id < 10; ++d_id) { - TraceEvent("PopulateDistricts").detail("Warehouse", w_id).detail("District", d_id); - loop { - tr.reset(); - District d; - d.d_id = d_id; - d.d_w_id = w_id; - d.d_name = self->aString(d.arena, 6, 10); - d.d_street_1 = self->aString(d.arena, 10, 20); - d.d_street_2 = self->aString(d.arena, 10, 20); - d.d_city = self->aString(d.arena, 10, 20); - d.d_state = self->aString(d.arena, 2, 2); - d.d_zip = self->rndZip(d.arena); - d.d_tax = deterministicRandom()->random01() * 0.2; - d.d_ytd = 30000; - d.d_next_o_id = 3000; - BinaryWriter w(IncludeVersion()); - serializer(w, d); - tr.set(d.key(), w.toValue(), AddConflictRange::False); - try { - wait(tr.commit()); - wait(populateCustomers(self, cx, w_id, d_id)); - wait(populateOrders(self, cx, w_id, d_id)); - wait(populateNewOrders(self, cx, w_id, d_id)); - break; - } catch (Error& e) { - TraceEvent("PopulateDistrictHandleError").error(e); - wait(tr.onError(e)); - } - } - } - TraceEvent("PopulateDistrictsDone").detail("Warehouse", w_id); - return Void(); - } - - ACTOR static Future populateStock(PopulateTPCC* self, Database cx, int w_id) { - state Transaction tr(cx); - state int idStart; - for (idStart = 0; idStart < 100000; idStart += 100) { - TraceEvent("PopulateStock").detail("Warehouse", w_id).detail("i_id", idStart); - loop { - tr.reset(); - for (int i = idStart; i < idStart + 100; ++i) { - Stock s; - s.s_i_id = i; - s.s_w_id = w_id; - s.s_quantity = deterministicRandom()->randomInt(1, 101); - s.s_dist_01 = self->aString(s.arena, 24, 25); - s.s_dist_02 = self->aString(s.arena, 24, 25); - s.s_dist_03 = self->aString(s.arena, 24, 25); - s.s_dist_04 = self->aString(s.arena, 24, 25); - s.s_dist_05 = self->aString(s.arena, 24, 25); - s.s_dist_06 = self->aString(s.arena, 24, 25); - s.s_dist_07 = self->aString(s.arena, 24, 25); - s.s_dist_08 = self->aString(s.arena, 24, 25); - s.s_dist_09 = self->aString(s.arena, 24, 25); - s.s_dist_10 = self->aString(s.arena, 24, 25); - s.s_ytd = 0; - s.s_order_cnt = 0; - s.s_remote_cnt = 0; - s.s_data = self->dataString(s.arena); - BinaryWriter w(IncludeVersion()); - serializer(w, s); - tr.set(s.key(), w.toValue(), AddConflictRange::False); - } - try { - wait(tr.commit()); - break; - } catch (Error& e) { - TraceEvent("PopulateStockHandleError").error(e).detail("Warehouse", w_id); - wait(tr.onError(e)); - } - } - } - TraceEvent("PopulateStockDone").detail("Warehouse", w_id); - return Void(); - } - - ACTOR static Future populateWarehouse(PopulateTPCC* self, Database cx, int w_id) { - state Transaction tr(cx); - 
TraceEvent("PopulateWarehouse").detail("W_ID", w_id); - loop { - tr.reset(); - try { - Warehouse w; - w.w_id = w_id; - w.w_name = self->aString(w.arena, 6, 11); - w.w_street_1 = self->aString(w.arena, 10, 21); - w.w_street_2 = self->aString(w.arena, 10, 21); - w.w_city = self->aString(w.arena, 10, 21); - w.w_state = self->aString(w.arena, 2, 3); - w.w_tax = deterministicRandom()->random01() * 0.2; - w.w_ytd = 300000; - BinaryWriter writer(IncludeVersion()); - serializer(writer, w); - tr.set(w.key(), writer.toValue(), AddConflictRange::False); - wait(tr.commit()); - break; - } catch (Error& e) { - TraceEvent("PopulateWarehouseHandleError").error(e).detail("Warehouse", w_id); - wait(tr.onError(e)); - } - } - wait(populateStock(self, cx, w_id)); - wait(populateDistricts(self, cx, w_id)); - TraceEvent("PopulateWarehouseDone").detail("W_ID", w_id); - return Void(); - } - - ACTOR static Future populateActor(PopulateTPCC* self, Database cx, int actorId) { - state int startWID = - self->clientId * self->actorsPerClient * self->warehousesPerActor + actorId * self->warehousesPerActor; - state int endWID = startWID + self->warehousesPerActor; - state int wid; - for (wid = startWID; wid < endWID; ++wid) { - wait(populateWarehouse(self, cx, wid)); - } - return Void(); - } - - ACTOR static Future populate(PopulateTPCC* self, Database cx) { - if (self->clientId == 0) { - wait(writeGlobalState(self, cx)); - } else { - wait(readGlobalState(self, cx)); - } - if (self->clientId == 0) { - wait(populateItems(self, cx)); - } - - state std::vector> populateActors; - state int actorId; - for (actorId = 0; actorId < self->actorsPerClient; ++actorId) { - populateActors.push_back(populateActor(self, cx, actorId)); - } - wait(waitForAll(populateActors)); - wait(quietDatabase(cx, self->dbInfo, "PopulateTPCC")); - return Void(); - } - - Future setup(Database const& cx) override { - if (clientId >= clientsUsed) - return Void(); - return populate(this, cx); - } - - Future start(Database const& cx) override { return Void(); } - - Future check(Database const& cx) override { return true; } - - void getMetrics(std::vector& m) override {} -}; - -} // namespace - -WorkloadFactory PopulateTPCCWorkloadFactory(PopulateTPCC::DESCRIPTION); diff --git a/fdbserver/workloads/ReadHotDetection.actor.cpp b/fdbserver/workloads/ReadHotDetection.actor.cpp index e5b15ef04d..251c1f90f4 100644 --- a/fdbserver/workloads/ReadHotDetection.actor.cpp +++ b/fdbserver/workloads/ReadHotDetection.actor.cpp @@ -101,7 +101,7 @@ struct ReadHotDetectionWorkload : TestWorkload { StorageMetrics sm = wait(cx->getStorageMetrics(self->wholeRange, 100)); // TraceEvent("RHDCheckPhaseLog") // .detail("KeyRangeSize", sm.bytes) - // .detail("KeyRangeReadBandwith", sm.bytesReadPerKSecond); + // .detail("KeyRangeReadBandwidth", sm.bytesReadPerKSecond); Standalone> keyRanges = wait(cx->getReadHotRanges(self->wholeRange)); // TraceEvent("RHDCheckPhaseLog") // .detail("KeyRangesSize", keyRanges.size()) diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp index 475c3a023c..ef77eaae76 100644 --- a/fdbserver/workloads/ReadWrite.actor.cpp +++ b/fdbserver/workloads/ReadWrite.actor.cpp @@ -61,6 +61,7 @@ struct ReadWriteCommonImpl { throw; } } + ACTOR static Future tracePeriodically(ReadWriteCommon* self) { state double start = now(); state double elapsed = 0.0; @@ -376,6 +377,9 @@ struct ReadWriteWorkload : ReadWriteCommon { bool adjacentReads; // keys are adjacent within a transaction bool adjacentWrites; int 
extraReadConflictRangesPerTransaction, extraWriteConflictRangesPerTransaction; + Optional transactionTag; + + int transactionsTagThrottled{ 0 }; // hot traffic pattern double hotKeyFraction, forceHotProbability = 0; // key based hot traffic setting @@ -397,6 +401,9 @@ struct ReadWriteWorkload : ReadWriteCommon { rampUpConcurrency = getOption(options, LiteralStringRef("rampUpConcurrency"), false); batchPriority = getOption(options, LiteralStringRef("batchPriority"), false); descriptionString = getOption(options, LiteralStringRef("description"), LiteralStringRef("ReadWrite")); + if (hasOption(options, LiteralStringRef("transactionTag"))) { + transactionTag = getOption(options, LiteralStringRef("transactionTag"), ""_sr); + } if (rampUpConcurrency) ASSERT(rampSweepCount == 2); // Implementation is hard coded to ramp up and down @@ -415,15 +422,18 @@ struct ReadWriteWorkload : ReadWriteCommon { } } - std::string description() const override { return descriptionString.toString(); } - template - void setupTransaction(Trans* tr) { + void setupTransaction(Trans& tr) { if (batchPriority) { - tr->setOption(FDBTransactionOptions::PRIORITY_BATCH); + tr.setOption(FDBTransactionOptions::PRIORITY_BATCH); + } + if (transactionTag.present() && tr.getTags().size() == 0) { + tr.setOption(FDBTransactionOptions::AUTO_THROTTLE_TAG, transactionTag.get()); } } + std::string description() const override { return descriptionString.toString(); } + void getMetrics(std::vector& m) override { ReadWriteCommon::getMetrics(m); if (!rampUpLoad) { @@ -449,6 +459,9 @@ struct ReadWriteWorkload : ReadWriteCommon { m.emplace_back("Mean Commit Latency (ms)", 1000 * commitLatencies.mean(), Averaged::True); m.emplace_back("Median Commit Latency (ms, averaged)", 1000 * commitLatencies.median(), Averaged::True); m.emplace_back("Max Commit Latency (ms, averaged)", 1000 * commitLatencies.max(), Averaged::True); + if (transactionTag.present()) { + m.emplace_back("Transaction Tag Throttled", transactionsTagThrottled, Averaged::False); + } } } @@ -494,11 +507,14 @@ struct ReadWriteWorkload : ReadWriteCommon { state Transaction tr(cx); try { - self->setupTransaction(&tr); + self->setupTransaction(tr); wait(self->readOp(&tr, keys, self, false)); wait(tr.warmRange(allKeys)); break; } catch (Error& e) { + if (e.code() == error_code_tag_throttled) { + ++self->transactionsTagThrottled; + } wait(tr.onError(e)); } } @@ -625,7 +641,7 @@ struct ReadWriteWorkload : ReadWriteCommon { loop { try { - self->setupTransaction(&tr); + self->setupTransaction(tr); GRVStartTime = now(); self->transactionFailureMetric->startLatency = -1; diff --git a/fdbserver/workloads/SaveAndKill.actor.cpp b/fdbserver/workloads/SaveAndKill.actor.cpp index c165146175..9ef8252db7 100644 --- a/fdbserver/workloads/SaveAndKill.actor.cpp +++ b/fdbserver/workloads/SaveAndKill.actor.cpp @@ -71,14 +71,12 @@ struct SaveAndKillWorkload : TestWorkload { std::map rebootingProcesses = g_simulator.currentlyRebootingProcesses; std::map allProcessesMap; for (const auto& [_, process] : rebootingProcesses) { - if (allProcessesMap.find(process->dataFolder) == allProcessesMap.end() && - process->name != "remote flow process") { + if (allProcessesMap.find(process->dataFolder) == allProcessesMap.end() && !process->isSpawnedKVProcess()) { allProcessesMap[process->dataFolder] = process; } } for (const auto& process : processes) { - if (allProcessesMap.find(process->dataFolder) == allProcessesMap.end() && - process->name != "remote flow process") { + if (allProcessesMap.find(process->dataFolder) == 
allProcessesMap.end() && !process->isSpawnedKVProcess()) { allProcessesMap[process->dataFolder] = process; } } diff --git a/fdbserver/workloads/TPCC.actor.cpp b/fdbserver/workloads/TPCC.actor.cpp deleted file mode 100644 index bff641fdba..0000000000 --- a/fdbserver/workloads/TPCC.actor.cpp +++ /dev/null @@ -1,825 +0,0 @@ -/* - * TPCC.actor.cpp - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "fdbserver/workloads/workloads.actor.h" -#include "fdbserver/workloads/TPCCWorkload.h" - -#include -#include "flow/actorcompiler.h" // has to be last include - -using namespace TPCCWorkload; - -namespace { - -struct TPCCMetrics { - static constexpr int latenciesStored = 1000; - - uint64_t successfulStockLevelTransactions{ 0 }; - uint64_t failedStockLevelTransactions{ 0 }; - uint64_t successfulDeliveryTransactions{ 0 }; - uint64_t failedDeliveryTransactions{ 0 }; - uint64_t successfulOrderStatusTransactions{ 0 }; - uint64_t failedOrderStatusTransactions{ 0 }; - uint64_t successfulPaymentTransactions{ 0 }; - uint64_t failedPaymentTransactions{ 0 }; - uint64_t successfulNewOrderTransactions{ 0 }; - uint64_t failedNewOrderTransactions{ 0 }; - double stockLevelResponseTime{ 0.0 }; - double deliveryResponseTime{ 0.0 }; - double orderStatusResponseTime{ 0.0 }; - double paymentResponseTime{ 0.0 }; - double newOrderResponseTime{ 0.0 }; - std::vector stockLevelLatencies, deliveryLatencies, orderStatusLatencies, paymentLatencies, - newOrderLatencies; - - void sort() { - std::sort(stockLevelLatencies.begin(), stockLevelLatencies.end()); - std::sort(deliveryLatencies.begin(), deliveryLatencies.end()); - std::sort(orderStatusLatencies.begin(), orderStatusLatencies.end()); - std::sort(paymentLatencies.begin(), paymentLatencies.end()); - std::sort(newOrderLatencies.begin(), newOrderLatencies.end()); - } - - static double median(const std::vector& latencies) { - // assumes latencies is sorted - return latencies[latencies.size() / 2]; - } - - static double percentile_90(const std::vector& latencies) { - // assumes latencies is sorted - return latencies[(9 * latencies.size()) / 10]; - } - - static double percentile_99(const std::vector& latencies) { - // assumes latencies is sorted - return latencies[(99 * latencies.size()) / 100]; - } - - static void updateMetrics(bool committed, - uint64_t& successCounter, - uint64_t& failedCounter, - double txnStartTime, - std::vector& latencies, - double& totalLatency, - std::string txnType) { - auto responseTime = g_network->now() - txnStartTime; - if (committed) { - totalLatency += responseTime; - ++successCounter; - if (successCounter <= latenciesStored) - latencies[successCounter - 1] = responseTime; - else { - auto index = deterministicRandom()->randomInt(0, successCounter); - if (index < latenciesStored) { - latencies[index] = responseTime; - } - } - } else { - ++failedCounter; - } - 
TraceEvent("TransactionComplete") - .detail("TransactionType", txnType) - .detail("Latency", responseTime) - .detail("Begin", txnStartTime) - .detail("End", txnStartTime + responseTime) - .detail("Success", committed); - } -}; - -struct TPCC : TestWorkload { - static constexpr const char* DESCRIPTION = "TPCC"; - - int warehousesPerClient; - int expectedTransactionsPerMinute; - int testDuration; - int warmupTime; - int clientsUsed; - double startTime; - - GlobalState gState; - TPCCMetrics metrics; - - TPCC(WorkloadContext const& ctx) : TestWorkload(ctx) { - std::string workloadName = DESCRIPTION; - warehousesPerClient = getOption(options, LiteralStringRef("warehousesPerClient"), 100); - expectedTransactionsPerMinute = getOption(options, LiteralStringRef("expectedTransactionsPerMinute"), 1000); - testDuration = getOption(options, LiteralStringRef("testDuration"), 600); - warmupTime = getOption(options, LiteralStringRef("warmupTime"), 30); - clientsUsed = getOption(options, LiteralStringRef("clientsUsed"), 40); - } - - int NURand(int C, int A, int x, int y) { - return (((deterministicRandom()->randomInt(0, A + 1) | deterministicRandom()->randomInt(x, y + 1)) + C) % - (y - x + 1)) + - x; - } - - StringRef genCLast(Arena& arena, int x) { - int l = x % 10; - x /= 10; - int m = x % 10; - x /= 10; - int f = x % 10; - std::stringstream ss; - ss << syllables[f] << syllables[m] << syllables[l]; - return StringRef(arena, ss.str()); - } - - // Should call in setup - ACTOR static Future readGlobalState(TPCC* self, Database cx) { - state ReadYourWritesTransaction tr(cx); - loop { - tr.reset(); - try { - Optional val = wait(tr.get(self->gState.key())); - if (val.present()) { - BinaryReader reader(val.get(), IncludeVersion()); - serializer(reader, self->gState); - } else { - wait(delay(1.0)); - } - return Void(); - } catch (Error& e) { - wait(tr.onError(e)); - } - } - } - - std::string description() const override { return DESCRIPTION; } - - // Transactions - - ACTOR static Future newOrder(TPCC* self, Database cx, int w_id) { - state int d_id = deterministicRandom()->randomInt(0, 10); - state int c_id = self->NURand(self->gState.CRun, 1023, 1, 3000) - 1; - state int ol_cnt = deterministicRandom()->randomInt(5, 16); - state bool willRollback = deterministicRandom()->randomInt(1, 100) == 1; - state ReadYourWritesTransaction tr(cx); - try { - state Warehouse warehouse; - warehouse.w_id = w_id; - Optional wValue = wait(tr.get(warehouse.key())); - ASSERT(wValue.present()); - { - BinaryReader r(wValue.get(), IncludeVersion()); - serializer(r, warehouse); - } - state District district; - district.d_w_id = w_id; - district.d_id = d_id; - Optional dValue = wait(tr.get(district.key())); - ASSERT(dValue.present()); - { - BinaryReader r(dValue.get(), IncludeVersion()); - serializer(r, district); - } - state Customer customer; - customer.c_id = c_id; - customer.c_w_id = w_id; - customer.c_d_id = d_id; - Optional cValue = wait(tr.get(customer.key())); - ASSERT(cValue.present()); - { - BinaryReader r(cValue.get(), IncludeVersion()); - serializer(r, customer); - } - state Order order; - order.o_entry_d = g_network->now(); - order.o_c_id = c_id; - order.o_d_id = d_id; - order.o_w_id = w_id; - order.o_ol_cnt = ol_cnt; - order.o_id = district.d_next_o_id; - - ++district.d_next_o_id; - { - BinaryWriter w(IncludeVersion()); - serializer(w, district); - tr.set(district.key(), w.toValue()); - } - - state NewOrder newOrder; - newOrder.no_w_id = w_id; - newOrder.no_d_id = d_id; - newOrder.no_o_id = order.o_id; - state int ol_id = 
0; - state bool allLocal = true; - for (; ol_id < order.o_ol_cnt; ++ol_id) { - if (ol_id + 1 == order.o_ol_cnt && willRollback) { - // Simulated abort - order item not found - return false; - } - state OrderLine orderLine; - orderLine.ol_number = ol_id; - orderLine.ol_w_id = w_id; - orderLine.ol_d_id = d_id; - orderLine.ol_supply_w_id = w_id; - orderLine.ol_o_id = order.o_id; - orderLine.ol_i_id = self->NURand(self->gState.CRun, 8191, 1, 100000) - 1; - orderLine.ol_quantity = deterministicRandom()->randomInt(1, 11); - if (deterministicRandom()->randomInt(0, 100) == 0) { - orderLine.ol_supply_w_id = - deterministicRandom()->randomInt(0, self->clientsUsed * self->warehousesPerClient); - } - state Item item; - item.i_id = orderLine.ol_i_id; - orderLine.ol_i_id = item.i_id; - Optional iValue = wait(tr.get(item.key())); - ASSERT(iValue.present()); - { - BinaryReader r(iValue.get(), IncludeVersion()); - serializer(r, item); - } - state Stock stock; - stock.s_i_id = item.i_id; - stock.s_w_id = orderLine.ol_supply_w_id; - Optional sValue = wait(tr.get(stock.key())); - ASSERT(sValue.present()); - { - BinaryReader r(sValue.get(), IncludeVersion()); - serializer(r, stock); - } - if (stock.s_quantity - orderLine.ol_quantity >= 10) { - stock.s_quantity -= orderLine.ol_quantity; - } else { - stock.s_quantity = (stock.s_quantity - orderLine.ol_quantity) + 91; - } - stock.s_ytd += orderLine.ol_quantity; - stock.s_order_cnt += 1; - if (orderLine.ol_supply_w_id != w_id) { - stock.s_remote_cnt += 1; - allLocal = false; - } - { - BinaryWriter w(IncludeVersion()); - serializer(w, stock); - tr.set(stock.key(), w.toValue()); - } - orderLine.ol_amount = orderLine.ol_quantity * item.i_price; - switch (orderLine.ol_d_id) { - case 0: - orderLine.ol_dist_info = stock.s_dist_01; - break; - case 1: - orderLine.ol_dist_info = stock.s_dist_02; - break; - case 2: - orderLine.ol_dist_info = stock.s_dist_03; - break; - case 3: - orderLine.ol_dist_info = stock.s_dist_04; - break; - case 4: - orderLine.ol_dist_info = stock.s_dist_05; - break; - case 5: - orderLine.ol_dist_info = stock.s_dist_06; - break; - case 6: - orderLine.ol_dist_info = stock.s_dist_07; - break; - case 7: - orderLine.ol_dist_info = stock.s_dist_08; - break; - case 8: - orderLine.ol_dist_info = stock.s_dist_09; - break; - case 9: - orderLine.ol_dist_info = stock.s_dist_10; - break; - } - { - BinaryWriter w(IncludeVersion()); - serializer(w, orderLine); - tr.set(orderLine.key(), w.toValue()); - } - } - order.o_all_local = allLocal; - { - BinaryWriter w(IncludeVersion()); - serializer(w, order); - tr.set(order.key(), w.toValue()); - } - { - BinaryWriter w(IncludeVersion()); - serializer(w, newOrder); - tr.set(newOrder.key(), w.toValue()); - } - wait(tr.commit()); - } catch (Error& e) { - return false; - } - return true; - } - - ACTOR static Future getRandomCustomer(TPCC* self, ReadYourWritesTransaction* tr, int w_id, int d_id) { - state Customer result; - result.c_w_id = w_id; - result.c_d_id = d_id; - if (deterministicRandom()->randomInt(0, 100) >= 85) { - result.c_d_id = deterministicRandom()->randomInt(0, 10); - result.c_w_id = deterministicRandom()->randomInt(0, self->clientsUsed * self->warehousesPerClient); - } - if (deterministicRandom()->randomInt(0, 100) < 60) { - // select through last name - result.c_last = self->genCLast(result.arena, self->NURand(self->gState.CRun, 1023, 1, 3000) - 1); - auto s = result.indexLastKey(1); - auto begin = new (result.arena) uint8_t[s.size() + 1]; - auto end = new (result.arena) uint8_t[s.size() + 1]; - memcpy(begin, 
-    ACTOR static Future<Customer> getRandomCustomer(TPCC* self, ReadYourWritesTransaction* tr, int w_id, int d_id) {
-        state Customer result;
-        result.c_w_id = w_id;
-        result.c_d_id = d_id;
-        if (deterministicRandom()->randomInt(0, 100) >= 85) {
-            result.c_d_id = deterministicRandom()->randomInt(0, 10);
-            result.c_w_id = deterministicRandom()->randomInt(0, self->clientsUsed * self->warehousesPerClient);
-        }
-        if (deterministicRandom()->randomInt(0, 100) < 60) {
-            // select through last name
-            result.c_last = self->genCLast(result.arena, self->NURand(self->gState.CRun, 1023, 1, 3000) - 1);
-            auto s = result.indexLastKey(1);
-            auto begin = new (result.arena) uint8_t[s.size() + 1];
-            auto end = new (result.arena) uint8_t[s.size() + 1];
-            memcpy(begin, s.begin(), s.size());
-            memcpy(end, s.begin(), s.size());
-            begin[s.size()] = '/';
-            end[s.size()] = '0';
-            state RangeResult range =
-                wait(tr->getRange(KeyRangeRef(StringRef(begin, s.size() + 1), StringRef(end, s.size() + 1)), 1000));
-            ASSERT(range.size() > 0);
-
-            state std::vector<Customer> customers;
-            state int i = 0;
-            for (; i < range.size(); ++i) {
-                Optional<Value> cValue = wait(tr->get(range[i].value));
-                ASSERT(cValue.present());
-                BinaryReader r(cValue.get(), IncludeVersion());
-                state Customer customer;
-                serializer(r, customer);
-                customers.push_back(customer);
-            }
-
-            // Sort customers by first name and choose median
-            std::sort(customers.begin(), customers.end(), [](const Customer& cus1, const Customer& cus2) {
-                const std::string cus1Name = cus1.c_first.toString();
-                const std::string cus2Name = cus2.c_first.toString();
-                return (cus1Name.compare(cus2Name) < 0);
-            });
-            result = customers[customers.size() / 2];
-        } else {
-            // select through random id
-            result.c_id = self->NURand(self->gState.CRun, 1023, 1, 3000) - 1;
-            Optional<Value> val = wait(tr->get(result.key()));
-            ASSERT(val.present());
-            BinaryReader r(val.get(), IncludeVersion());
-            serializer(r, result);
-        }
-        return result;
-    }
-
-    ACTOR static Future<bool> payment(TPCC* self, Database cx, int w_id) {
-        state ReadYourWritesTransaction tr(cx);
-        state int d_id = deterministicRandom()->randomInt(0, 10);
-        state History history;
-        state Warehouse warehouse;
-        state District district;
-        history.h_amount = deterministicRandom()->random01() * 4999.0 + 1.0;
-        history.h_date = g_network->now();
-        try {
-            // get the customer
-            state Customer customer = wait(getRandomCustomer(self, &tr, w_id, d_id));
-            warehouse.w_id = w_id;
-            Optional<Value> wValue = wait(tr.get(warehouse.key()));
-            ASSERT(wValue.present());
-            {
-                BinaryReader r(wValue.get(), IncludeVersion());
-                serializer(r, warehouse);
-            }
-            warehouse.w_ytd += history.h_amount;
-            {
-                BinaryWriter w(IncludeVersion());
-                serializer(w, warehouse);
-                tr.set(warehouse.key(), w.toValue());
-            }
-            district.d_w_id = w_id;
-            district.d_id = d_id;
-            Optional<Value> dValue = wait(tr.get(district.key()));
-            ASSERT(dValue.present());
-            {
-                BinaryReader r(dValue.get(), IncludeVersion());
-                serializer(r, district);
-            }
-            district.d_ytd += history.h_amount;
-            customer.c_balance -= history.h_amount;
-            customer.c_ytd_payment += history.h_amount;
-            customer.c_payment_cnt += 1;
-            if (customer.c_credit == LiteralStringRef("BC")) {
-                // we must update c_data
-                std::stringstream ss;
-                ss << customer.c_id << "," << customer.c_d_id << "," << customer.c_w_id << "," << district.d_id << ","
-                   << w_id << history.h_amount << ";";
-                auto s = ss.str();
-                auto len = std::min(int(s.size()) + customer.c_data.size(), 500);
-                auto data = new (customer.arena) uint8_t[len];
-                std::copy(s.begin(), s.end(), reinterpret_cast<char*>(data));
-                std::copy(customer.c_data.begin(), customer.c_data.begin() + len - s.size(), data);
-                customer.c_data = StringRef(data, len);
-            }
-            {
-                BinaryWriter w(IncludeVersion());
-                serializer(w, customer);
-                tr.set(customer.key(), w.toValue());
-            }
-            std::stringstream ss;
-            ss << warehouse.w_name.toString() << " " << district.d_name.toString();
-            history.h_data = StringRef(history.arena, ss.str());
-            history.h_c_id = customer.c_id;
-            history.h_c_d_id = customer.c_d_id;
-            history.h_c_w_id = customer.c_w_id;
-            history.h_d_id = d_id;
-            history.h_w_id = w_id;
-            {
-                BinaryWriter w(IncludeVersion());
-                serializer(w, history);
-                UID k = deterministicRandom()->randomUniqueID();
-                BinaryWriter kW(Unversioned());
-                serializer(kW, k);
-                auto key = kW.toValue().withPrefix(LiteralStringRef("History/"));
-                tr.set(key, w.toValue());
-            }
-            wait(tr.commit());
-        } catch (Error& e) {
-            return false;
-        }
-        return true;
-    }
-
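The skewed id generation used above (customer ids with A = 1023, item ids with A = 8191) follows the standard TPC-C NURand function. A self-contained sketch under that assumption; the per-run constant C corresponds to gState.CRun in this workload, and rnd() stands in for the workload's deterministic random source:

    #include <cstdint>
    #include <random>

    // Uniform random integer in [a, b] (placeholder for deterministicRandom()).
    uint32_t rnd(uint32_t a, uint32_t b) {
        static std::mt19937 gen{ std::random_device{}() };
        return std::uniform_int_distribution<uint32_t>(a, b)(gen);
    }

    // TPC-C NURand(A, x, y): ORs two uniform draws to bias selection toward
    // a subset of ids, then folds the result back into [x, y].
    uint32_t NURand(uint32_t C, uint32_t A, uint32_t x, uint32_t y) {
        return (((rnd(0, A) | rnd(x, y)) + C) % (y - x + 1)) + x;
    }
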
-    ACTOR static Future<bool> orderStatus(TPCC* self, Database cx, int w_id) {
-        state ReadYourWritesTransaction tr(cx);
-        state int d_id = deterministicRandom()->randomInt(0, 10);
-        state int i;
-        state Order order;
-        state std::vector<OrderLine> orderLines;
-        try {
-            state Customer customer = wait(getRandomCustomer(self, &tr, w_id, d_id));
-            order.o_w_id = customer.c_w_id;
-            order.o_d_id = customer.c_d_id;
-            order.o_c_id = customer.c_id;
-            RangeResult range = wait(tr.getRange(order.keyRange(1), 1, Snapshot::False, Reverse::True));
-            ASSERT(range.size() > 0);
-            {
-                BinaryReader r(range[0].value, IncludeVersion());
-                serializer(r, order);
-            }
-            for (i = 0; i < order.o_ol_cnt; ++i) {
-                OrderLine orderLine;
-                orderLine.ol_w_id = order.o_w_id;
-                orderLine.ol_d_id = order.o_d_id;
-                orderLine.ol_o_id = order.o_id;
-                orderLine.ol_number = i;
-                Optional<Value> olValue = wait(tr.get(orderLine.key()));
-                ASSERT(olValue.present());
-                BinaryReader r(olValue.get(), IncludeVersion());
-                OrderLine ol;
-                serializer(r, ol);
-                orderLines.push_back(ol);
-            }
-        } catch (Error& e) {
-            return false;
-        }
-        return true;
-    }
-
-    ACTOR static Future<bool> delivery(TPCC* self, Database cx, int w_id) {
-        state ReadYourWritesTransaction tr(cx);
-        state int carrier_id = deterministicRandom()->randomInt(0, 10);
-        state int d_id;
-        state NewOrder newOrder;
-        state Order order;
-        state double sumAmount = 0.0;
-        state Customer customer;
-        state int i;
-        try {
-            for (d_id = 0; d_id < 10; ++d_id) {
-                newOrder.no_w_id = w_id;
-                newOrder.no_d_id = d_id;
-                RangeResult range = wait(tr.getRange(newOrder.keyRange(1), 1));
-                if (range.size() > 0) {
-                    {
-                        BinaryReader r(range[0].value, IncludeVersion());
-                        serializer(r, newOrder);
-                    }
-                    tr.clear(newOrder.key());
-                    order.o_w_id = w_id;
-                    order.o_d_id = d_id;
-                    order.o_id = newOrder.no_o_id;
-                    Optional<Value> oValue = wait(tr.get(order.key()));
-                    ASSERT(oValue.present());
-                    {
-                        BinaryReader r(oValue.get(), IncludeVersion());
-                        serializer(r, order);
-                    }
-                    order.o_carrier_id = carrier_id;
-                    {
-                        BinaryWriter w(IncludeVersion());
-                        serializer(w, order);
-                        tr.set(order.key(), w.toValue());
-                    }
-                    for (i = 0; i < order.o_ol_cnt; ++i) {
-                        state OrderLine orderLine;
-                        orderLine.ol_w_id = order.o_w_id;
-                        orderLine.ol_d_id = order.o_d_id;
-                        orderLine.ol_o_id = order.o_id;
-                        orderLine.ol_number = i;
-                        Optional<Value> olV = wait(tr.get(orderLine.key()));
-                        ASSERT(olV.present());
-                        BinaryReader r(olV.get(), IncludeVersion());
-                        serializer(r, orderLine);
-                        orderLine.ol_delivery_d = g_network->now();
-                        sumAmount += orderLine.ol_amount;
-                    }
-                    customer.c_w_id = w_id;
-                    customer.c_d_id = d_id;
-                    customer.c_id = order.o_c_id;
-                    Optional<Value> cV = wait(tr.get(customer.key()));
-                    ASSERT(cV.present());
-                    {
-                        BinaryReader r(cV.get(), IncludeVersion());
-                        serializer(r, customer);
-                    }
-                    customer.c_balance += sumAmount;
-                    customer.c_delivery_count += 1;
-                    {
-                        BinaryWriter w(IncludeVersion());
-                        serializer(w, customer);
-                        tr.set(customer.key(), w.toValue());
-                    }
-                    wait(tr.commit());
-                }
-            }
-        } catch (Error& e) {
-            return false;
-        }
-        return true;
-    }
-
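orderStatus above relies on a common ordered-key-value idiom: a customer's order keys sort by increasing order id, so a reverse range read with limit 1 (the getRange(..., 1, Snapshot::False, Reverse::True) call) returns the newest order. A small standalone analogy using std::map; the key layout is illustrative only:

    #include <cassert>
    #include <iterator>
    #include <map>
    #include <string>

    int main() {
        // Orders for one customer, keyed so larger order ids sort later.
        std::map<std::string, std::string> kv = { { "order/7/001", "a" },
                                                  { "order/7/002", "b" },
                                                  { "order/7/003", "c" } };
        // Equivalent of a reverse range scan with limit 1: position past the
        // end of the key range, then step back one element.
        auto it = kv.lower_bound("order/7/\xff");
        assert(it != kv.begin());
        auto newest = std::prev(it); // largest key in the range
        assert(newest->first == "order/7/003");
        return 0;
    }
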
-    ACTOR static Future<bool> stockLevel(TPCC* self, Database cx, int w_id, int d_id) {
-        state int threshold = deterministicRandom()->randomInt(10, 21);
-        state Transaction tr(cx);
-        state District district;
-        state OrderLine orderLine;
-        state Stock stock;
-        state int ol_o_id;
-        state int low_stock = 0;
-        state int i;
-        try {
-            district.d_w_id = w_id;
-            district.d_id = d_id;
-            Optional<Value> dV = wait(tr.get(district.key()));
-            ASSERT(dV.present());
-            {
-                BinaryReader r(dV.get(), IncludeVersion());
-                serializer(r, district);
-            }
-            for (ol_o_id = district.d_next_o_id - 20; ol_o_id < district.d_next_o_id; ++ol_o_id) {
-                orderLine.ol_w_id = w_id;
-                orderLine.ol_d_id = d_id;
-                orderLine.ol_o_id = ol_o_id;
-                state RangeResult range = wait(tr.getRange(orderLine.keyRange(1), CLIENT_KNOBS->TOO_MANY));
-                ASSERT(!range.more);
-                ASSERT(range.size() > 0);
-                for (i = 0; i < range.size(); ++i) {
-                    {
-                        BinaryReader r(range[i].value, IncludeVersion());
-                        serializer(r, orderLine);
-                    }
-                    stock.s_i_id = orderLine.ol_i_id;
-                    stock.s_w_id = orderLine.ol_w_id;
-                    Optional<Value> sV = wait(tr.get(stock.key()));
-                    ASSERT(sV.present());
-                    {
-                        BinaryReader r(sV.get(), IncludeVersion());
-                        serializer(r, stock);
-                    }
-                    if (stock.s_quantity < threshold) {
-                        ++low_stock;
-                    }
-                }
-            }
-        } catch (Error& e) {
-            return false;
-        }
-        return true;
-    }
-
-    ACTOR static Future<Void> emulatedUser(TPCC* self, Database cx, int w_id, int d_id) {
-        // stagger users
-        wait(delay(20.0 * deterministicRandom()->random01()));
-        TraceEvent("StartingEmulatedUser").detail("Warehouse", w_id).detail("District", d_id);
-        loop {
-            auto type = deterministicRandom()->randomInt(0, 100);
-            Future<bool> tx;
-            state double txnStartTime = g_network->now();
-
-            if (type < 4) {
-                tx = stockLevel(self, cx, w_id, d_id);
-                bool committed = wait(tx);
-                if (self->recordMetrics()) {
-                    TPCCMetrics::updateMetrics(committed,
-                                               self->metrics.successfulStockLevelTransactions,
-                                               self->metrics.failedStockLevelTransactions,
-                                               txnStartTime,
-                                               self->metrics.stockLevelLatencies,
-                                               self->metrics.stockLevelResponseTime,
-                                               "StockLevel");
-                }
-                wait(delay(2 + deterministicRandom()->random01() * 10));
-            } else if (type < 8) {
-                tx = delivery(self, cx, w_id);
-                bool committed = wait(tx);
-                if (self->recordMetrics()) {
-                    TPCCMetrics::updateMetrics(committed,
-                                               self->metrics.successfulDeliveryTransactions,
-                                               self->metrics.failedDeliveryTransactions,
-                                               txnStartTime,
-                                               self->metrics.deliveryLatencies,
-                                               self->metrics.deliveryResponseTime,
-                                               "Delivery");
-                }
-                wait(delay(2 + deterministicRandom()->random01() * 10));
-            } else if (type < 12) {
-                tx = orderStatus(self, cx, w_id);
-                bool committed = wait(tx);
-                if (self->recordMetrics()) {
-                    TPCCMetrics::updateMetrics(committed,
-                                               self->metrics.successfulOrderStatusTransactions,
-                                               self->metrics.failedOrderStatusTransactions,
-                                               txnStartTime,
-                                               self->metrics.orderStatusLatencies,
-                                               self->metrics.orderStatusResponseTime,
-                                               "OrderStatus");
-                }
-                wait(delay(2 + deterministicRandom()->random01() * 20));
-            } else if (type < 55) {
-                tx = payment(self, cx, w_id);
-                bool committed = wait(tx);
-                if (self->recordMetrics()) {
-                    TPCCMetrics::updateMetrics(committed,
-                                               self->metrics.successfulPaymentTransactions,
-                                               self->metrics.failedPaymentTransactions,
-                                               txnStartTime,
-                                               self->metrics.paymentLatencies,
-                                               self->metrics.paymentResponseTime,
-                                               "Payment");
-                }
-                wait(delay(3 + deterministicRandom()->random01() * 24));
-            } else {
-                tx = newOrder(self, cx, w_id);
-                bool committed = wait(tx);
-                if (self->recordMetrics()) {
-                    TPCCMetrics::updateMetrics(committed,
-                                               self->metrics.successfulNewOrderTransactions,
-                                               self->metrics.failedNewOrderTransactions,
-                                               txnStartTime,
-                                               self->metrics.newOrderLatencies,
-                                               self->metrics.newOrderResponseTime,
-                                               "NewOrder");
-                }
-                wait(delay(18 + deterministicRandom()->random01() * 24));
-            }
-        }
-    }
-
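The branch thresholds in emulatedUser above encode the transaction mix. A self-contained sketch of the same selection logic (enum and function names are mine), annotated with the resulting percentages, which line up with the TPC-C minimum-mix requirements where NewOrder takes the remainder:

    enum class TxType { StockLevel, Delivery, OrderStatus, Payment, NewOrder };

    // Mirrors the thresholds above for type uniform in [0, 100):
    // 4% StockLevel, 4% Delivery, 4% OrderStatus, 43% Payment, 45% NewOrder.
    TxType pickTransaction(int type) {
        if (type < 4)  return TxType::StockLevel;
        if (type < 8)  return TxType::Delivery;
        if (type < 12) return TxType::OrderStatus;
        if (type < 55) return TxType::Payment;
        return TxType::NewOrder;
    }
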
-    double transactionsPerMinute() const {
-        return metrics.successfulNewOrderTransactions * 60.0 / (testDuration - 2 * warmupTime);
-    }
-
-    bool recordMetrics() const {
-        auto now = g_network->now();
-        return (now > startTime + warmupTime && now < startTime + testDuration - warmupTime);
-    }
-
-    Future<Void> start(Database const& cx) override {
-        if (clientId >= clientsUsed)
-            return Void();
-        return _start(cx, this);
-    }
-
-    ACTOR Future<Void> _start(Database cx, TPCC* self) {
-        wait(readGlobalState(self, cx));
-        self->startTime = g_network->now();
-        int startWID = self->clientId * self->warehousesPerClient;
-        int endWID = startWID + self->warehousesPerClient;
-        state int w_id;
-        state int d_id;
-        state std::vector<Future<Void>> emulatedUsers;
-        for (w_id = startWID; w_id < endWID; ++w_id) {
-            for (d_id = 0; d_id < 10; ++d_id) {
-                emulatedUsers.push_back(timeout(emulatedUser(self, cx, w_id, d_id), self->testDuration, Void()));
-            }
-        }
-        wait(waitForAll(emulatedUsers));
-        return Void();
-    }
-
-    Future<bool> check(Database const& cx) override {
-        return (transactionsPerMinute() > expectedTransactionsPerMinute);
-    }
-
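A note on the pass criterion above: only NewOrder commits count toward throughput, and recordMetrics() excludes a warm-up phase at the start and an equally long cool-down at the end of the run, which is why the denominator subtracts the warm-up time twice:

    tpm = successfulNewOrderTransactions * 60.0 / (testDuration - 2 * warmupTime)

The check passes only when tpm exceeds expectedTransactionsPerMinute.
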
-    void getMetrics(std::vector<PerfMetric>& m) override {
-        double multiplier = static_cast<double>(clientCount) / static_cast<double>(clientsUsed);
-
-        m.emplace_back("Transactions Per Minute", transactionsPerMinute(), Averaged::False);
-
-        m.emplace_back("Successful StockLevel Transactions", metrics.successfulStockLevelTransactions, Averaged::False);
-        m.emplace_back("Successful Delivery Transactions", metrics.successfulDeliveryTransactions, Averaged::False);
-        m.emplace_back(
-            "Successful OrderStatus Transactions", metrics.successfulOrderStatusTransactions, Averaged::False);
-        m.emplace_back("Successful Payment Transactions", metrics.successfulPaymentTransactions, Averaged::False);
-        m.emplace_back("Successful NewOrder Transactions", metrics.successfulNewOrderTransactions, Averaged::False);
-
-        m.emplace_back("Failed StockLevel Transactions", metrics.failedStockLevelTransactions, Averaged::False);
-        m.emplace_back("Failed Delivery Transactions", metrics.failedDeliveryTransactions, Averaged::False);
-        m.emplace_back("Failed OrderStatus Transactions", metrics.failedOrderStatusTransactions, Averaged::False);
-        m.emplace_back("Failed Payment Transactions", metrics.failedPaymentTransactions, Averaged::False);
-        m.emplace_back("Failed NewOrder Transactions", metrics.failedNewOrderTransactions, Averaged::False);
-
-        m.emplace_back("Mean StockLevel Latency",
-                       (clientId < clientsUsed)
-                           ? (multiplier * metrics.stockLevelResponseTime / metrics.successfulStockLevelTransactions)
-                           : 0.0,
-                       Averaged::True);
-        m.emplace_back("Mean Delivery Latency",
-                       (clientId < clientsUsed)
-                           ? (multiplier * metrics.deliveryResponseTime / metrics.successfulDeliveryTransactions)
-                           : 0.0,
-                       Averaged::True);
-        m.emplace_back("Mean OrderStatus Response Time",
-                       (clientId < clientsUsed)
-                           ? (multiplier * metrics.orderStatusResponseTime / metrics.successfulOrderStatusTransactions)
-                           : 0.0,
-                       Averaged::True);
-        m.emplace_back("Mean Payment Latency",
-                       (clientId < clientsUsed)
-                           ? (multiplier * metrics.paymentResponseTime / metrics.successfulPaymentTransactions)
-                           : 0.0,
-                       Averaged::True);
-        m.emplace_back("Mean NewOrder Latency",
-                       (clientId < clientsUsed)
-                           ? (multiplier * metrics.newOrderResponseTime / metrics.successfulNewOrderTransactions)
-                           : 0.0,
-                       Averaged::True);
-
-        metrics.sort();
-
-        m.emplace_back(
-            "Median StockLevel Latency", multiplier * TPCCMetrics::median(metrics.stockLevelLatencies), Averaged::True);
-        m.emplace_back(
-            "Median Delivery Latency", multiplier * TPCCMetrics::median(metrics.deliveryLatencies), Averaged::True);
-        m.emplace_back("Median OrderStatus Latency",
-                       multiplier * TPCCMetrics::median(metrics.orderStatusLatencies),
-                       Averaged::True);
-        m.emplace_back(
-            "Median Payment Latency", multiplier * TPCCMetrics::median(metrics.paymentLatencies), Averaged::True);
-        m.emplace_back(
-            "Median NewOrder Latency", multiplier * TPCCMetrics::median(metrics.newOrderLatencies), Averaged::True);
-
-        m.emplace_back("90th Percentile StockLevel Latency",
-                       multiplier * TPCCMetrics::percentile_90(metrics.stockLevelLatencies),
-                       Averaged::True);
-        m.emplace_back("90th Percentile Delivery Latency",
-                       multiplier * TPCCMetrics::percentile_90(metrics.deliveryLatencies),
-                       Averaged::True);
-        m.emplace_back("90th Percentile OrderStatus Latency",
-                       multiplier * TPCCMetrics::percentile_90(metrics.orderStatusLatencies),
-                       Averaged::True);
-        m.emplace_back("90th Percentile Payment Latency",
-                       multiplier * TPCCMetrics::percentile_90(metrics.paymentLatencies),
-                       Averaged::True);
-        m.emplace_back("90th Percentile NewOrder Latency",
-                       multiplier * TPCCMetrics::percentile_90(metrics.newOrderLatencies),
-                       Averaged::True);
-
-        m.emplace_back("99th Percentile StockLevel Latency",
-                       multiplier * TPCCMetrics::percentile_99(metrics.stockLevelLatencies),
-                       Averaged::True);
-        m.emplace_back("99th Percentile Delivery Latency",
-                       multiplier * TPCCMetrics::percentile_99(metrics.deliveryLatencies),
-                       Averaged::True);
-        m.emplace_back("99th Percentile OrderStatus Latency",
-                       multiplier * TPCCMetrics::percentile_99(metrics.orderStatusLatencies),
-                       Averaged::True);
-        m.emplace_back("99th Percentile Payment Latency",
-                       multiplier * TPCCMetrics::percentile_99(metrics.paymentLatencies),
-                       Averaged::True);
-        m.emplace_back("99th Percentile NewOrder Latency",
-                       multiplier * TPCCMetrics::percentile_99(metrics.newOrderLatencies),
-                       Averaged::True);
-    }
-};
-
-} // namespace
-
-WorkloadFactory<TPCC> TPCCWorkloadFactory(TPCC::DESCRIPTION);
diff --git a/fdbservice/CMakeLists.txt b/fdbservice/CMakeLists.txt
index cffcfc83bc..865f5155d6 100644
--- a/fdbservice/CMakeLists.txt
+++ b/fdbservice/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(FDBSERVICE_SRCS FDBService.cpp
   ServiceBase.cpp)
-
 add_executable(fdbmonitor ${FDBSERVICE_SRCS})
-target_include_directories(fdbmonitor PRIVATE ${CMAKE_BINARY_DIR}/flow/include ${CMAKE_BINARY_DIR}/fdbclient/include)
-add_dependencies(fdbmonitor fdbclient)
\ No newline at end of file
+get_target_property(fdbclient_target_includes fdbclient INCLUDE_DIRECTORIES)
+target_link_libraries(fdbmonitor PUBLIC SimpleOpt)
+target_include_directories(fdbmonitor PUBLIC "${fdbclient_target_includes}")
\ No newline at end of file
diff --git a/flow/EncryptUtils.cpp b/flow/EncryptUtils.cpp
index 2d2a8a77e9..45bae6d52b 100644
--- a/flow/EncryptUtils.cpp
+++ b/flow/EncryptUtils.cpp
@@ -21,6 +21,7 @@
 #include "flow/EncryptUtils.h"
 #include "flow/Trace.h"
 
+#include <boost/algorithm/string.hpp>
 #include <boost/format.hpp>
 
 std::string getEncryptDbgTraceKey(std::string_view prefix,
@@ -29,12 +30,15 @@ std::string getEncryptDbgTraceKey(std::string_view prefix,
                                   Optional<EncryptCipherBaseKeyId> baseCipherId) {
 	// Construct the TraceEvent field key ensuring its uniqueness and compliance to TraceEvent field validator and log
 	// parsing tools
+	std::string dName = domainName.toString();
+	// Underscores are invalid in trace event detail name.
+	boost::replace_all(dName, "_", "-");
 	if (baseCipherId.present()) {
 		boost::format fmter("%s.%lld.%s.%llu");
-		return boost::str(boost::format(fmter % prefix % domainId % domainName.toString() % baseCipherId.get()));
+		return boost::str(boost::format(fmter % prefix % domainId % dName % baseCipherId.get()));
 	} else {
 		boost::format fmter("%s.%lld.%s");
-		return boost::str(boost::format(fmter % prefix % domainId % domainName.toString()));
+		return boost::str(boost::format(fmter % prefix % domainId % dName));
 	}
 }
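A standalone illustration (mine, not part of the patch) of the sanitization added above: boost::replace_all rewrites every underscore in the domain name so the generated trace key passes the TraceEvent field-name validator.

    #include <boost/algorithm/string.hpp>
    #include <cassert>
    #include <string>

    int main() {
        std::string dName = "Fdb_Default_Encrypt_Domain";
        boost::replace_all(dName, "_", "-"); // in-place, replaces all occurrences
        assert(dName == "Fdb-Default-Encrypt-Domain");
        return 0;
    }
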
diff --git a/flow/include/flow/EncryptUtils.h b/flow/include/flow/EncryptUtils.h
index fd1861befb..9cdbe117b4 100644
--- a/flow/include/flow/EncryptUtils.h
+++ b/flow/include/flow/EncryptUtils.h
@@ -29,14 +29,14 @@
 #include <string>
 #include <string_view>
 
-#define ENCRYPT_INVALID_DOMAIN_ID 0
+#define ENCRYPT_INVALID_DOMAIN_ID -1
 #define ENCRYPT_INVALID_CIPHER_KEY_ID 0
 #define ENCRYPT_INVALID_RANDOM_SALT 0
 
 #define AUTH_TOKEN_SIZE 16
 
-#define SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID -1
-#define ENCRYPT_HEADER_DOMAIN_ID -2
+#define SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID -2
+#define ENCRYPT_HEADER_DOMAIN_ID -3
 
 const std::string FDB_DEFAULT_ENCRYPT_DOMAIN_NAME = "FdbDefaultEncryptDomain";
diff --git a/flow/include/flow/ProtocolVersion.h b/flow/include/flow/ProtocolVersion.h
index adaa067264..b7cd3e845a 100644
--- a/flow/include/flow/ProtocolVersion.h
+++ b/flow/include/flow/ProtocolVersion.h
@@ -174,6 +174,7 @@ public: // introduced features
 	PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, SWVersionTracking);
 	PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, Metacluster);
 	PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, TenantGroups);
+	PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, EncryptionAtRest);
 };
 
 template <>
diff --git a/flow/include/flow/error_definitions.h b/flow/include/flow/error_definitions.h
index afdea5ef2a..84e8420806 100755
--- a/flow/include/flow/error_definitions.h
+++ b/flow/include/flow/error_definitions.h
@@ -196,6 +196,7 @@ ERROR( key_not_tuple, 2041, "The key cannot be parsed as a tuple" );
 ERROR( value_not_tuple, 2042, "The value cannot be parsed as a tuple" );
 ERROR( mapper_not_tuple, 2043, "The mapper cannot be parsed as a tuple" );
 ERROR( invalid_checkpoint_format, 2044, "Invalid checkpoint format" )
+ERROR( invalid_throttle_quota_value, 2045, "Failed to deserialize or initialize throttle quota value" )
 
 ERROR( incompatible_protocol_version, 2100, "Incompatible protocol version" )
 ERROR( transaction_too_large, 2101, "Transaction exceeds byte limit" )
@@ -326,6 +327,7 @@ ERROR( encrypt_update_cipher, 2705, "Attempt to update encryption cipher key")
 ERROR( encrypt_invalid_id, 2706, "Invalid encryption cipher details")
 ERROR( encrypt_keys_fetch_failed, 2707, "Encryption keys fetch from external KMS failed")
 ERROR( encrypt_invalid_kms_config, 2708, "Invalid encryption/kms configuration: discovery-url, validation-token, endpoint etc.")
+ERROR( encrypt_unsupported, 2709, "Encryption not supported")
 
 // 4xxx Internal errors (those that should be generated only by bugs) are decimal 4xxx
 ERROR( unknown_error, 4000, "An unknown error occurred" ) // C++ exception not of type Error
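A sketch of how error codes declared this way are typically used in flow-based code: each ERROR(...) entry expands to a factory function plus an error_code_* constant, and callers throw and match on the Error value. The surrounding function here is hypothetical, for illustration only.

    #include "flow/Error.h" // pulls in the generated error factories

    void requireEncryptionSupport(bool supported) {
        if (!supported) {
            throw encrypt_unsupported(); // Error carrying code 2709
        }
    }

    // Callers usually match on the generated constant:
    //   try { requireEncryptionSupport(false); }
    //   catch (Error& e) { ASSERT(e.code() == error_code_encrypt_unsupported); }
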
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index d8570cca3c..42ace99f10 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -104,8 +104,6 @@ if(WITH_PYTHON)
   add_fdb_test(TEST_FILES SystemData.txt)
   add_fdb_test(TEST_FILES ThreadSafety.txt IGNORE)
   add_fdb_test(TEST_FILES TraceEventMetrics.txt IGNORE)
-  add_fdb_test(TEST_FILES PopulateTPCC.txt IGNORE)
-  add_fdb_test(TEST_FILES TPCC.txt IGNORE)
   add_fdb_test(TEST_FILES default.txt IGNORE)
   add_fdb_test(TEST_FILES errors.txt IGNORE)
   add_fdb_test(TEST_FILES fail.txt IGNORE)
@@ -208,6 +206,7 @@ if(WITH_PYTHON)
   add_fdb_test(TEST_FILES rare/CycleWithKills.toml)
   add_fdb_test(TEST_FILES rare/CycleWithDeadHall.toml)
   add_fdb_test(TEST_FILES rare/FuzzTest.toml)
+  add_fdb_test(TEST_FILES rare/GlobalTagThrottling.toml IGNORE)
   add_fdb_test(TEST_FILES rare/HighContentionPrefixAllocator.toml)
   add_fdb_test(TEST_FILES rare/InventoryTestHeavyWrites.toml)
   add_fdb_test(TEST_FILES rare/LargeApiCorrectness.toml)
diff --git a/tests/PopulateTPCC.txt b/tests/PopulateTPCC.txt
deleted file mode 100644
index 92b9eb62a9..0000000000
--- a/tests/PopulateTPCC.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-testTitle=PopulateTPCCTest
-timeout=3600000
-clearAfterTest=false
-runConsistencyCheck=false
-
-    testName=PopulateTPCC
-    clientsUsed=2
-    actors=1
-    warehousesPerActor=200
diff --git a/tests/TPCC.txt b/tests/TPCC.txt
deleted file mode 100644
index 416693b5bd..0000000000
--- a/tests/TPCC.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-testTitle=PopulateTPCCTest
-clearAfterTest=false
-runConsistencyCheck=false
-timeout=3600000
-
-    testName=PopulateTPCC
-    clientsUsed=2
-    actorsPerClient=1
-    warehousesPerActor=80
-
-testTitle=TPCCTest
-timeout=14400
-
-    testName=TPCC
-    warehousesPerClient=4
-    testDuration=3600
-    warmupTime=300
-    clientsUsed=40
-    expectedTransactionsPerMinute=1000
diff --git a/tests/rare/GlobalTagThrottling.toml b/tests/rare/GlobalTagThrottling.toml
new file mode 100644
index 0000000000..58cda2312e
--- /dev/null
+++ b/tests/rare/GlobalTagThrottling.toml
@@ -0,0 +1,41 @@
+[[test]]
+testTitle='GlobalTagThrottling'
+
+    [[test.knobs]]
+    min_tag_read_pages_rate=1.0
+    global_tag_throttling=true
+
+    [[test.workload]]
+    testName='GlobalTagThrottling'
+    transactionTag='sampleTag1'
+    totalReadQuota=1.0
+
+    [[test.workload]]
+    testName='ReadWrite'
+    testDuration=600.0
+    transactionsPerSecond=100
+    writesPerTransactionA=0
+    readsPerTransactionA=10
+    writesPerTransactionB=0
+    readsPerTransactionB=0
+    alpha=0.0
+    nodeCount=10000
+    valueBytes=1000
+    minValueBytes=1000
+    warmingDelay=60.0
+    transactionTag='sampleTag1'
+
+    [[test.workload]]
+    testName='ReadWrite'
+    testDuration=600.0
+    transactionsPerSecond=100
+    writesPerTransactionA=0
+    readsPerTransactionA=10
+    writesPerTransactionB=0
+    readsPerTransactionB=0
+    alpha=0.0
+    nodeCount=10000
+    valueBytes=1000
+    minValueBytes=1000
+    warmingDelay=60.0
+    transactionTag='sampleTag2'