Merge branch 'main' into feature-metacluster

This commit is contained in:
A.J. Beamon 2022-06-30 15:08:09 -07:00
commit aea4d802c6
96 changed files with 2748 additions and 2258 deletions

View File

@ -119,6 +119,7 @@ if(NOT WIN32)
set(API_TESTER_SRCS
test/apitester/fdb_c_api_tester.cpp
test/apitester/TesterAtomicOpsCorrectnessWorkload.cpp
test/apitester/TesterApiWorkload.cpp
test/apitester/TesterApiWorkload.h
test/apitester/TesterTestSpec.cpp

View File

@ -0,0 +1,330 @@
/*
* TesterAtomicOpsCorrectnessWorkload.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "TesterApiWorkload.h"
#include "TesterUtil.h"
#include "fdb_c_options.g.h"
#include "fmt/core.h"
#include "test/fdb_api.hpp"
#include <cctype>
#include <memory>
#include <fmt/format.h>
namespace FdbApiTester {
using fdb::Key;
using fdb::Value;
using fdb::ValueRef;
class AtomicOpsCorrectnessWorkload : public ApiWorkload {
public:
AtomicOpsCorrectnessWorkload(const WorkloadConfig& config) : ApiWorkload(config) {}
private:
typedef std::function<uint64_t(uint64_t, uint64_t)> IntAtomicOpFunction;
typedef std::function<Value(ValueRef, ValueRef)> AtomicOpFunction;
enum OpType {
OP_ATOMIC_ADD,
OP_ATOMIC_BIT_AND,
OP_ATOMIC_BIT_OR,
OP_ATOMIC_BIT_XOR,
OP_ATOMIC_APPEND_IF_FITS,
OP_ATOMIC_MAX,
OP_ATOMIC_MIN,
OP_ATOMIC_VERSIONSTAMPED_KEY,
OP_ATOMIC_VERSIONSTAMPED_VALUE,
OP_ATOMIC_BYTE_MIN,
OP_ATOMIC_BYTE_MAX,
OP_ATOMIC_COMPARE_AND_CLEAR,
OP_LAST = OP_ATOMIC_COMPARE_AND_CLEAR
};
void randomOperation(TTaskFct cont) override {
OpType txType = (OpType)Random::get().randomInt(0, OP_LAST);
switch (txType) {
case OP_ATOMIC_ADD:
testIntAtomicOp(
FDBMutationType::FDB_MUTATION_TYPE_ADD, [](uint64_t val1, uint64_t val2) { return val1 + val2; }, cont);
break;
case OP_ATOMIC_BIT_AND:
testIntAtomicOp(
FDBMutationType::FDB_MUTATION_TYPE_BIT_AND,
[](uint64_t val1, uint64_t val2) { return val1 & val2; },
cont);
break;
case OP_ATOMIC_BIT_OR:
testIntAtomicOp(
FDBMutationType::FDB_MUTATION_TYPE_BIT_OR,
[](uint64_t val1, uint64_t val2) { return val1 | val2; },
cont);
break;
case OP_ATOMIC_BIT_XOR:
testIntAtomicOp(
FDBMutationType::FDB_MUTATION_TYPE_BIT_XOR,
[](uint64_t val1, uint64_t val2) { return val1 ^ val2; },
cont);
break;
case OP_ATOMIC_APPEND_IF_FITS: {
Value val1 = randomValue();
Value val2 = randomValue();
testAtomicOp(
FDBMutationType::FDB_MUTATION_TYPE_APPEND_IF_FITS,
val1,
val2,
[](ValueRef val1, ValueRef val2) { return Value(val1) + Value(val2); },
cont);
break;
}
case OP_ATOMIC_MAX:
testIntAtomicOp(
FDBMutationType::FDB_MUTATION_TYPE_MAX,
[](uint64_t val1, uint64_t val2) { return std::max(val1, val2); },
cont);
break;
case OP_ATOMIC_MIN:
testIntAtomicOp(
FDBMutationType::FDB_MUTATION_TYPE_MIN,
[](uint64_t val1, uint64_t val2) { return std::min(val1, val2); },
cont);
break;
case OP_ATOMIC_VERSIONSTAMPED_KEY:
testAtomicVersionstampedKeyOp(cont);
break;
case OP_ATOMIC_VERSIONSTAMPED_VALUE:
testAtomicVersionstampedValueOp(cont);
break;
case OP_ATOMIC_BYTE_MIN: {
Value val1 = randomValue();
Value val2 = randomValue();
testAtomicOp(
FDBMutationType::FDB_MUTATION_TYPE_BYTE_MIN,
val1,
val2,
[](ValueRef val1, ValueRef val2) { return Value(std::min(val1, val2)); },
cont);
break;
}
case OP_ATOMIC_BYTE_MAX: {
Value val1 = randomValue();
Value val2 = randomValue();
testAtomicOp(
FDBMutationType::FDB_MUTATION_TYPE_BYTE_MAX,
val1,
val2,
[](ValueRef val1, ValueRef val2) { return Value(std::max(val1, val2)); },
cont);
break;
}
case OP_ATOMIC_COMPARE_AND_CLEAR:
testAtomicCompareAndClearOp(cont);
break;
}
}
void testIntAtomicOp(FDBMutationType opType, IntAtomicOpFunction opFunc, TTaskFct cont) {
uint64_t intValue1 = Random::get().randomInt(0, 10000000);
uint64_t intValue2 = Random::get().randomInt(0, 10000000);
Value val1 = toByteString(intValue1);
Value val2 = toByteString(intValue2);
testAtomicOp(
opType,
val1,
val2,
[opFunc](ValueRef val1, ValueRef val2) {
return toByteString(opFunc(toInteger<uint64_t>(val1), toInteger<uint64_t>(val2)));
},
cont);
}
void testAtomicOp(FDBMutationType opType, Value val1, Value val2, AtomicOpFunction opFunc, TTaskFct cont) {
Key key(randomKeyName());
execTransaction(
// 1. Set the key to val1
[key, val1](auto ctx) {
ctx->tx().set(key, val1);
ctx->commit();
},
[this, opType, opFunc, key, val1, val2, cont]() {
execTransaction(
// 2. Perform the given atomic operation with val2, but only if it hasn't been applied yet; otherwise,
// retries after commit_unknown_result could cause the operation to be applied multiple times. See
// https://github.com/apple/foundationdb/issues/1321.
[key, opType, val1, val2](auto ctx) {
auto f = ctx->tx().get(key, false);
ctx->continueAfter(f, [ctx, f, opType, key, val1, val2]() {
auto outputVal = f.get();
ASSERT(outputVal.has_value());
if (outputVal.value() == val1) {
ctx->tx().atomicOp(key, val2, opType);
ctx->commit();
} else {
ctx->done();
}
});
},
[this, opFunc, key, val1, val2, cont]() {
auto result = std::make_shared<Value>();
execTransaction(
// 3. Fetch the final value.
[key, result](auto ctx) {
auto f = ctx->tx().get(key, false);
ctx->continueAfter(
f,
[ctx, f, result]() {
auto outputVal = f.get();
ASSERT(outputVal.has_value());
*result = outputVal.value();
ctx->done();
},
true);
},
[this, opFunc, key, val1, val2, result, cont]() {
// 4. Assert expectation.
auto expected = opFunc(val1, val2);
if (*result != expected) {
error(fmt::format("testAtomicOp expected: {} actual: {}",
fdb::toCharsRef(expected),
fdb::toCharsRef(*result)));
ASSERT(false);
}
schedule(cont);
});
});
});
}
void testAtomicVersionstampedKeyOp(TTaskFct cont) {
Key keyPrefix(randomKeyName());
Key key = keyPrefix + fdb::ByteString(10, '\0') + toByteString((uint32_t)keyPrefix.size());
Value val = randomValue();
auto versionstamp_f = std::make_shared<fdb::TypedFuture<fdb::future_var::KeyRef>>();
execTransaction(
// 1. Perform SetVersionstampedKey operation.
[key, val, versionstamp_f](auto ctx) {
ctx->tx().atomicOp(key, val, FDBMutationType::FDB_MUTATION_TYPE_SET_VERSIONSTAMPED_KEY);
*versionstamp_f = ctx->tx().getVersionstamp();
ctx->commit();
},
[this, keyPrefix, val, versionstamp_f, cont]() {
ASSERT(versionstamp_f->ready());
auto resultKey = keyPrefix + Key(versionstamp_f->get());
auto resultVal = std::make_shared<Value>();
execTransaction(
// 2. Fetch the resulting versionstamped key and value.
[keyPrefix, resultKey, resultVal](auto ctx) {
auto fv = ctx->tx().get(resultKey, false);
ctx->continueAfter(fv, [ctx, fv, resultVal]() {
auto outputVal = fv.get();
ASSERT(outputVal.has_value());
*resultVal = outputVal.value();
ctx->done();
});
},
[this, keyPrefix, val, resultVal, cont]() {
// 3. Assert expectation.
ASSERT(*resultVal == val);
schedule(cont);
});
});
}
void testAtomicVersionstampedValueOp(TTaskFct cont) {
Key key(randomKeyName());
Value valPrefix = randomValue();
Value val = valPrefix + fdb::ByteString(10, '\0') + toByteString((uint32_t)valPrefix.size());
auto versionstamp_f = std::make_shared<fdb::TypedFuture<fdb::future_var::KeyRef>>();
execTransaction(
// 1. Perform SetVersionstampedValue operation.
[key, val, versionstamp_f](auto ctx) {
ctx->tx().atomicOp(key, val, FDBMutationType::FDB_MUTATION_TYPE_SET_VERSIONSTAMPED_VALUE);
*versionstamp_f = ctx->tx().getVersionstamp();
ctx->commit();
},
[this, key, valPrefix, versionstamp_f, cont]() {
versionstamp_f->blockUntilReady();
auto versionstamp = Key(versionstamp_f->get());
auto result = std::make_shared<Value>();
execTransaction(
// 2. Fetch the resulting versionstamped value.
[key, result](auto ctx) {
auto f = ctx->tx().get(key, false);
ctx->continueAfter(
f,
[ctx, f, result]() {
auto outputVal = f.get();
ASSERT(outputVal.has_value());
*result = outputVal.value();
ctx->done();
},
true);
},
[this, key, valPrefix, result, versionstamp, cont]() {
// 3. Assert expectation.
ASSERT(*result == valPrefix + versionstamp);
schedule(cont);
});
});
}
void testAtomicCompareAndClearOp(TTaskFct cont) {
Key key(randomKeyName());
Value val = randomValue();
execTransaction(
// 1. Set the key to initial value
[key, val](auto ctx) {
ctx->tx().set(key, val);
ctx->commit();
},
[this, key, val, cont]() {
execTransaction(
// 2. Perform CompareAndClear operation.
[key, val](auto ctx) {
ctx->tx().atomicOp(key, val, FDBMutationType::FDB_MUTATION_TYPE_COMPARE_AND_CLEAR);
ctx->commit();
},
[this, key, cont]() {
execTransaction(
// 3. Verify that the key was cleared.
[key](auto ctx) {
auto f = ctx->tx().get(key, false);
ctx->continueAfter(
f,
[ctx, f]() {
auto outputVal = f.get();
ASSERT(!outputVal.has_value());
ctx->done();
},
true);
},
[this, cont]() { schedule(cont); });
});
});
}
};
WorkloadFactory<AtomicOpsCorrectnessWorkload> AtomicOpsCorrectnessWorkloadFactory("AtomicOpsCorrectness");
} // namespace FdbApiTester
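The guard in step 2 of testAtomicOp above exists because commit_unknown_result does not tell the client whether the commit actually applied. A sketch of the hazard it prevents (illustrative only, not part of the diff):
// A naive retry loop that blindly re-issues the atomic op is not idempotent:
//
//   ctx->tx().atomicOp(key, val2, FDB_MUTATION_TYPE_ADD);  // attempt 1
//   ctx->commit();                                         // -> commit_unknown_result (but actually committed)
//   ctx->tx().atomicOp(key, val2, FDB_MUTATION_TYPE_ADD);  // attempt 2: the retried body runs again
//   ctx->commit();                                         // key now holds val1 + 2*val2
//
// Re-reading the key and issuing the atomic op only while it still equals val1 makes the
// transaction body safe to retry (https://github.com/apple/foundationdb/issues/1321).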

View File

@ -30,7 +30,7 @@ public:
ApiCorrectnessWorkload(const WorkloadConfig& config) : ApiWorkload(config) {}
private:
enum OpType { OP_INSERT, OP_GET, OP_CLEAR, OP_CLEAR_RANGE, OP_COMMIT_READ, OP_LAST = OP_COMMIT_READ };
enum OpType { OP_INSERT, OP_GET, OP_CLEAR, OP_GET_RANGE, OP_CLEAR_RANGE, OP_COMMIT_READ, OP_LAST = OP_COMMIT_READ };
void randomCommitReadOp(TTaskFct cont) {
int numKeys = Random::get().randomInt(1, maxKeysPerTransaction);
@ -125,6 +125,71 @@ private:
});
}
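// Reads the range [begin, end) in pages: repeatedly issues getRange, appends each batch to
// 'results', and resumes from firstGreaterThan(last returned key) while the 'more' flag is set.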
void getRangeLoop(std::shared_ptr<ITransactionContext> ctx,
fdb::KeySelector begin,
fdb::KeySelector end,
std::shared_ptr<std::vector<fdb::KeyValue>> results) {
auto f = ctx->tx().getRange(begin,
end,
0 /*limit*/,
0 /*target_bytes*/,
FDB_STREAMING_MODE_WANT_ALL,
0 /*iteration*/,
false /*snapshot*/,
false /*reverse*/);
ctx->continueAfter(f, [this, ctx, f, end, results]() {
auto out = copyKeyValueArray(f.get());
results->insert(results->end(), out.first.begin(), out.first.end());
const bool more = out.second;
if (more) {
// Fetch the remaining results.
getRangeLoop(ctx, fdb::key_select::firstGreaterThan(results->back().key), end, results);
} else {
ctx->done();
}
});
}
void randomGetRangeOp(TTaskFct cont) {
auto begin = randomKey(readExistingKeysRatio);
auto end = randomKey(readExistingKeysRatio);
auto results = std::make_shared<std::vector<fdb::KeyValue>>();
execTransaction(
[this, begin, end, results](auto ctx) {
// Clear the results vector, in case the transaction is retried.
results->clear();
getRangeLoop(ctx,
fdb::key_select::firstGreaterOrEqual(begin),
fdb::key_select::firstGreaterOrEqual(end),
results);
},
[this, begin, end, results, cont]() {
auto expected = store.getRange(begin, end, results->size() + 10, false);
if (results->size() != expected.size()) {
error(fmt::format("randomGetRangeOp mismatch. expected {} keys, actual {} keys",
expected.size(),
results->size()));
} else {
auto expected_kv = expected.begin();
for (auto actual_kv : *results) {
if (actual_kv.key != expected_kv->key || actual_kv.value != expected_kv->value) {
error(fmt::format(
"randomGetRangeOp mismatch. expected key: {} actual key: {} expected value: "
"{:.80} actual value: {:.80}",
fdb::toCharsRef(expected_kv->key),
fdb::toCharsRef(actual_kv.key),
fdb::toCharsRef(expected_kv->value),
fdb::toCharsRef(actual_kv.value)));
}
expected_kv++;
}
}
schedule(cont);
});
}
void randomOperation(TTaskFct cont) {
OpType txType = (store.size() == 0) ? OP_INSERT : (OpType)Random::get().randomInt(0, OP_LAST);
switch (txType) {
@ -137,6 +202,9 @@ private:
case OP_CLEAR:
randomClearOp(cont);
break;
case OP_GET_RANGE:
randomGetRangeOp(cont);
break;
case OP_CLEAR_RANGE:
randomClearRangeOp(cont);
break;

View File

@ -120,6 +120,25 @@ KeyValueArray copyKeyValueArray(fdb::future_var::KeyValueRefArray::Type array);
using KeyRangeArray = std::vector<fdb::KeyRange>;
KeyRangeArray copyKeyRangeArray(fdb::future_var::KeyRangeRefArray::Type array);
static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Non-little-endian systems are not supported");
// Converts a little-endian encoded number into an integral type.
template <class T, typename = std::enable_if_t<std::is_integral<T>::value>>
static T toInteger(fdb::BytesRef value) {
ASSERT(value.size() == sizeof(T));
T output;
memcpy(&output, value.data(), value.size());
return output;
}
// Converts an integral type to a little-endian encoded byte string.
template <class T, typename = std::enable_if_t<std::is_integral<T>::value>>
static fdb::ByteString toByteString(T value) {
fdb::ByteString output(sizeof(T), 0);
memcpy(output.data(), (const uint8_t*)&value, sizeof(value));
return output;
}
} // namespace FdbApiTester
#endif
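A minimal usage sketch for the byte-string helpers above (not part of the diff; assumes fdb::ByteString compares like std::basic_string and relies on FDB's integer atomic ops operating on little-endian encodings):
uint64_t a = 5, b = 7;
fdb::ByteString enc = toByteString(a);                     // 8 bytes, little-endian: 05 00 00 00 00 00 00 00
ASSERT(toInteger<uint64_t>(enc) == a);                     // round-trip back to the integer
ASSERT(toByteString(a + b) == toByteString<uint64_t>(12)); // expected result of an ADD of b, as computed in testIntAtomicOp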

View File

@ -12,14 +12,18 @@ maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100

View File

@ -15,10 +15,15 @@ maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100

View File

@ -15,10 +15,15 @@ maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100

View File

@ -14,10 +14,15 @@ maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100

View File

@ -7,10 +7,15 @@ multiThreaded = false
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100

View File

@ -516,6 +516,8 @@ public:
return out;
}
TypedFuture<future_var::KeyRef> getVersionstamp() { return native::fdb_transaction_get_versionstamp(tr.get()); }
TypedFuture<future_var::KeyRef> getKey(KeySelector sel, bool snapshot) {
return native::fdb_transaction_get_key(tr.get(), sel.key, sel.keyLength, sel.orEqual, sel.offset, snapshot);
}
@ -577,6 +579,11 @@ public:
native::fdb_transaction_set(tr.get(), key.data(), intSize(key), value.data(), intSize(value));
}
void atomicOp(KeyRef key, ValueRef param, FDBMutationType operationType) {
native::fdb_transaction_atomic_op(
tr.get(), key.data(), intSize(key), param.data(), intSize(param), operationType);
}
void clear(KeyRef key) { native::fdb_transaction_clear(tr.get(), key.data(), intSize(key)); }
void clearRange(KeyRef begin, KeyRef end) {

View File

@ -193,10 +193,9 @@ endif()
find_package(toml11 QUIET)
if(toml11_FOUND)
add_library(toml11_target INTERFACE)
add_dependencies(toml11_target INTERFACE toml11::toml11)
target_link_libraries(toml11_target INTERFACE toml11::toml11)
else()
include(ExternalProject)
include(ExternalProject)
ExternalProject_add(toml11Project
URL "https://github.com/ToruNiina/toml11/archive/v3.4.0.tar.gz"
URL_HASH SHA256=bc6d733efd9216af8c119d8ac64a805578c79cc82b813e4d1d880ca128bd154d

View File

@ -4,6 +4,7 @@ target_include_directories(rapidjson INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/rapid
add_subdirectory(crc32)
add_subdirectory(stacktrace)
add_subdirectory(folly_memcpy)
add_subdirectory(rapidxml)
add_subdirectory(sqlite)
add_subdirectory(SimpleOpt)
add_subdirectory(fmt-8.1.1)

View File

@ -35,6 +35,10 @@
#endif
#endif
#ifdef _WIN32
#include <intrin.h>
#endif
#include "crc32/crc32c.h"
#if !defined(__aarch64__) && !defined(__powerpc64__)

View File

@ -0,0 +1,2 @@
add_library(rapidxml INTERFACE)
target_include_directories(rapidxml INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/include")

View File

@ -0,0 +1,178 @@
/*
* QuotaCommand.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbcli/fdbcli.actor.h"
#include "flow/actorcompiler.h" // This must be the last include
namespace {
enum class LimitType { RESERVED, TOTAL };
enum class OpType { READ, WRITE };
Optional<TransactionTag> parseTag(StringRef token) {
if (token.size() > CLIENT_KNOBS->MAX_TRANSACTION_TAG_LENGTH) {
return {};
} else {
return token;
}
}
Optional<LimitType> parseLimitType(StringRef token) {
if (token == "reserved"_sr) {
return LimitType::RESERVED;
} else if (token == "total"_sr) {
return LimitType::TOTAL;
} else {
return {};
}
}
Optional<OpType> parseOpType(StringRef token) {
if (token == "read"_sr) {
return OpType::READ;
} else if (token == "write"_sr) {
return OpType::WRITE;
} else {
return {};
}
}
Optional<double> parseLimitValue(StringRef token) {
try {
return std::stod(token.toString());
} catch (...) {
return {};
}
}
ACTOR Future<Void> getQuota(Reference<IDatabase> db, TransactionTag tag, LimitType limitType, OpType opType) {
state Reference<ITransaction> tr = db->createTransaction();
loop {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
try {
state ThreadFuture<Optional<Value>> resultFuture = tr->get(tag.withPrefix(tagQuotaPrefix));
Optional<Value> v = wait(safeThreadFutureToFuture(resultFuture));
if (!v.present()) {
fmt::print("<empty>\n");
} else {
auto const quota = ThrottleApi::TagQuotaValue::fromValue(v.get());
if (limitType == LimitType::TOTAL && opType == OpType::READ) {
fmt::print("{}\n", quota.totalReadQuota);
} else if (limitType == LimitType::TOTAL && opType == OpType::WRITE) {
fmt::print("{}\n", quota.totalWriteQuota);
} else if (limitType == LimitType::RESERVED && opType == OpType::READ) {
fmt::print("{}\n", quota.reservedReadQuota);
} else if (limitType == LimitType::RESERVED && opType == OpType::WRITE) {
fmt::print("{}\n", quota.reservedWriteQuota);
}
}
return Void();
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
ACTOR Future<Void> setQuota(Reference<IDatabase> db,
TransactionTag tag,
LimitType limitType,
OpType opType,
double value) {
state Reference<ITransaction> tr = db->createTransaction();
state Key key = tag.withPrefix(tagQuotaPrefix);
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
state ThreadFuture<Optional<Value>> resultFuture = tr->get(key);
Optional<Value> v = wait(safeThreadFutureToFuture(resultFuture));
ThrottleApi::TagQuotaValue quota;
if (v.present()) {
quota = ThrottleApi::TagQuotaValue::fromValue(v.get());
}
if (limitType == LimitType::TOTAL && opType == OpType::READ) {
quota.totalReadQuota = value;
} else if (limitType == LimitType::TOTAL && opType == OpType::WRITE) {
quota.totalWriteQuota = value;
} else if (limitType == LimitType::RESERVED && opType == OpType::READ) {
quota.reservedReadQuota = value;
} else if (limitType == LimitType::RESERVED && opType == OpType::WRITE) {
quota.reservedWriteQuota = value;
}
ThrottleApi::setTagQuota(tr,
tag,
quota.reservedReadQuota,
quota.totalReadQuota,
quota.reservedWriteQuota,
quota.totalWriteQuota);
wait(safeThreadFutureToFuture(tr->commit()));
return Void();
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
constexpr auto usage =
"quota [get <tag> [reserved|total] [read|write]|set <tag> [reserved|total] [read|write] <value>]";
bool exitFailure() {
fmt::print(usage);
return false;
}
} // namespace
namespace fdb_cli {
ACTOR Future<bool> quotaCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens) {
state bool result = true;
if (tokens.size() != 5 && tokens.size() != 6) {
return exitFailure();
} else {
auto tag = parseTag(tokens[2]);
auto limitType = parseLimitType(tokens[3]);
auto opType = parseOpType(tokens[4]);
if (!tag.present() || !limitType.present() || !opType.present()) {
return exitFailure();
}
if (tokens[1] == "get"_sr) {
if (tokens.size() != 5) {
return exitFailure();
}
wait(getQuota(db, tag.get(), limitType.get(), opType.get()));
return true;
} else if (tokens[1] == "set"_sr) {
if (tokens.size() != 6) {
return exitFailure();
}
auto const limitValue = parseLimitValue(tokens[5]);
if (!limitValue.present()) {
return exitFailure();
}
wait(setQuota(db, tag.get(), limitType.get(), opType.get(), limitValue.get()));
return true;
} else {
return exitFailure();
}
}
}
} // namespace fdb_cli
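For reference, a hypothetical fdbcli session exercising the grammar above (tag name and values are illustrative; the printed numbers follow the fmt::print calls in getQuota, and a successful set prints nothing):
fdb> quota get web_backend total read
<empty>
fdb> quota set web_backend total read 2000
fdb> quota get web_backend total read
2000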

View File

@ -509,6 +509,10 @@ void initHelp() {
CommandHelp("getversion",
"Fetch the current read version",
"Displays the current read version of the database or currently running transaction.");
helpMap["quota"] =
CommandHelp("quota",
"quota [get <tag> [reserved|total] [read|write]|set <tag> [reserved|total] [read|write] <value>]",
"Get or modify the throughput quota for the specified tag.");
helpMap["reset"] =
CommandHelp("reset",
"reset the current transaction",
@ -1468,6 +1472,14 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
continue;
}
if (tokencmp(tokens[0], "quota")) {
bool _result = wait(makeInterruptable(quotaCommandActor(db, tokens)));
if (!_result) {
is_error = true;
}
continue;
}
if (tokencmp(tokens[0], "reset")) {
if (tokens.size() != 1) {
printUsage(tokens[0]);

View File

@ -222,6 +222,8 @@ ACTOR Future<bool> profileCommandActor(Database db,
Reference<ITransaction> tr,
std::vector<StringRef> tokens,
bool intrans);
// quota command
ACTOR Future<bool> quotaCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
// setclass command
ACTOR Future<bool> setClassCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
// snapshot command

View File

@ -78,7 +78,7 @@ add_flow_target(STATIC_LIBRARY NAME fdbclient SRCS ${FDBCLIENT_SRCS} ADDL_SRCS $
target_include_directories(fdbclient PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_BINARY_DIR}/include")
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/versions.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/include/fdbclient/versions.h)
add_dependencies(fdbclient fdboptions)
target_link_libraries(fdbclient PUBLIC fdbrpc msgpack)
target_link_libraries(fdbclient PUBLIC fdbrpc msgpack PRIVATE rapidxml)
# Create a separate fdbclient library with sampling enabled. This lets
# fdbserver retain sampling functionality in client code while disabling
@ -86,7 +86,7 @@ target_link_libraries(fdbclient PUBLIC fdbrpc msgpack)
add_flow_target(STATIC_LIBRARY NAME fdbclient_sampling SRCS ${FDBCLIENT_SRCS} ADDL_SRCS ${options_srcs})
target_include_directories(fdbclient_sampling PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_BINARY_DIR}/include")
add_dependencies(fdbclient_sampling fdboptions)
target_link_libraries(fdbclient_sampling PUBLIC fdbrpc_sampling msgpack)
target_link_libraries(fdbclient_sampling PUBLIC fdbrpc_sampling msgpack PRIVATE rapidxml)
target_compile_definitions(fdbclient_sampling PRIVATE -DENABLE_SAMPLING)
if(WIN32)

View File

@ -450,16 +450,21 @@ bool isCompleteConfiguration(std::map<std::string, std::string> const& options)
options.count(p + "storage_engine") == 1;
}
ACTOR Future<DatabaseConfiguration> getDatabaseConfiguration(Transaction* tr) {
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
RangeResult res = wait(tr->getRange(configKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT(res.size() < CLIENT_KNOBS->TOO_MANY);
DatabaseConfiguration config;
config.fromKeyValues((VectorRef<KeyValueRef>)res);
return config;
}
ACTOR Future<DatabaseConfiguration> getDatabaseConfiguration(Database cx) {
state Transaction tr(cx);
loop {
try {
tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
RangeResult res = wait(tr.getRange(configKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT(res.size() < CLIENT_KNOBS->TOO_MANY);
DatabaseConfiguration config;
config.fromKeyValues((VectorRef<KeyValueRef>)res);
DatabaseConfiguration config = wait(getDatabaseConfiguration(&tr));
return config;
} catch (Error& e) {
wait(tr.onError(e));

View File

@ -5158,8 +5158,9 @@ Future<Optional<Value>> Transaction::get(const Key& key, Snapshot snapshot) {
if (!ver.isReady() || metadataVersion.isSet()) {
return metadataVersion.getFuture();
} else {
if (ver.isError())
if (ver.isError()) {
return ver.getError();
}
if (ver.get() == trState->cx->metadataVersionCache[trState->cx->mvCacheInsertLocation].first) {
return trState->cx->metadataVersionCache[trState->cx->mvCacheInsertLocation].second;
}
@ -5763,6 +5764,10 @@ void Transaction::resetImpl(bool generateNewSpan) {
cancelWatches();
}
TagSet const& Transaction::getTags() const {
return trState->options.tags;
}
void Transaction::reset() {
resetImpl(false);
}
@ -7067,6 +7072,25 @@ Future<ProtocolVersion> DatabaseContext::getClusterProtocol(Optional<ProtocolVer
return getClusterProtocolImpl(coordinator, expectedVersion);
}
double ClientTagThrottleData::throttleDuration() const {
if (expiration <= now()) {
return 0.0;
}
double capacity =
(smoothRate.smoothTotal() - smoothReleased.smoothRate()) * CLIENT_KNOBS->TAG_THROTTLE_SMOOTHING_WINDOW;
if (capacity >= 1) {
return 0.0;
}
if (tpsRate == 0) {
return std::max(0.0, expiration - now());
}
return std::min(expiration - now(), capacity / tpsRate);
}
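// Worked example of the capacity check above (numbers illustrative only): with
// expiration - now() == 10s, smoothTotal() == 150, smoothRate() == 100 and
// TAG_THROTTLE_SMOOTHING_WINDOW == 2s, capacity = (150 - 100) * 2 = 100 >= 1, so the
// duration is 0 (no waiting). If instead capacity were 0.5 with tpsRate == 5, the
// duration would be min(10, 0.5 / 5) = 0.1 seconds.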
uint32_t Transaction::getSize() {
auto s = tr.transaction.mutations.expectedSize() + tr.transaction.read_conflict_ranges.expectedSize() +
tr.transaction.write_conflict_ranges.expectedSize();
@ -7892,7 +7916,8 @@ ACTOR Future<Void> splitStorageMetricsStream(PromiseStream<Key> resultStream,
Database cx,
KeyRange keys,
StorageMetrics limit,
StorageMetrics estimated) {
StorageMetrics estimated,
Optional<int> minSplitBytes) {
state Span span("NAPI:SplitStorageMetricsStream"_loc);
state Key beginKey = keys.begin;
state Key globalLastKey = beginKey;
@ -7923,7 +7948,8 @@ ACTOR Future<Void> splitStorageMetricsStream(PromiseStream<Key> resultStream,
limit,
localUsed,
estimated,
i == locations.size() - 1 && keys.end <= locations.back().range.end);
i == locations.size() - 1 && keys.end <= locations.back().range.end,
minSplitBytes);
SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(),
&StorageServerInterface::splitMetrics,
req,
@ -7986,15 +8012,17 @@ ACTOR Future<Void> splitStorageMetricsStream(PromiseStream<Key> resultStream,
Future<Void> DatabaseContext::splitStorageMetricsStream(const PromiseStream<Key>& resultStream,
KeyRange const& keys,
StorageMetrics const& limit,
StorageMetrics const& estimated) {
StorageMetrics const& estimated,
Optional<int> const& minSplitBytes) {
return ::splitStorageMetricsStream(
resultStream, Database(Reference<DatabaseContext>::addRef(this)), keys, limit, estimated);
resultStream, Database(Reference<DatabaseContext>::addRef(this)), keys, limit, estimated, minSplitBytes);
}
ACTOR Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(Database cx,
KeyRange keys,
StorageMetrics limit,
StorageMetrics estimated) {
StorageMetrics estimated,
Optional<int> minSplitBytes) {
state Span span("NAPI:SplitStorageMetrics"_loc);
loop {
state std::vector<KeyRangeLocationInfo> locations =
@ -8023,7 +8051,8 @@ ACTOR Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(Database cx,
state int i = 0;
for (; i < locations.size(); i++) {
SplitMetricsRequest req(locations[i].range, limit, used, estimated, i == locations.size() - 1);
SplitMetricsRequest req(
locations[i].range, limit, used, estimated, i == locations.size() - 1, minSplitBytes);
SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(),
&StorageServerInterface::splitMetrics,
req,
@ -8067,8 +8096,10 @@ ACTOR Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(Database cx,
Future<Standalone<VectorRef<KeyRef>>> DatabaseContext::splitStorageMetrics(KeyRange const& keys,
StorageMetrics const& limit,
StorageMetrics const& estimated) {
return ::splitStorageMetrics(Database(Reference<DatabaseContext>::addRef(this)), keys, limit, estimated);
StorageMetrics const& estimated,
Optional<int> const& minSplitBytes) {
return ::splitStorageMetrics(
Database(Reference<DatabaseContext>::addRef(this)), keys, limit, estimated, minSplitBytes);
}
void Transaction::checkDeferredError() const {

View File

@ -40,7 +40,7 @@
#include "flow/IAsyncFile.h"
#include "flow/Hostname.h"
#include "flow/UnitTest.h"
#include "fdbclient/rapidxml/rapidxml.hpp"
#include "rapidxml/rapidxml.hpp"
#ifdef BUILD_AWS_BACKUP
#include "fdbclient/FDBAWSCredentialsProvider.h"
#endif

View File

@ -575,7 +575,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"duplicate_mutation_streams",
"duplicate_mutation_fetch_timeout",
"primary_dc_missing",
"fetch_primary_dc_timeout"
"fetch_primary_dc_timeout",
"fetch_storage_wiggler_stats_timeout"
]
},
"issues":[

View File

@ -20,6 +20,7 @@
#include "fdbclient/ServerKnobs.h"
#include "flow/IRandom.h"
#include "flow/flow.h"
#define init(...) KNOB_FN(__VA_ARGS__, INIT_ATOMIC_KNOB, INIT_KNOB)(__VA_ARGS__)
@ -35,12 +36,13 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( MAX_VERSIONS_IN_FLIGHT, 100 * VERSIONS_PER_SECOND );
init( MAX_VERSIONS_IN_FLIGHT_FORCED, 6e5 * VERSIONS_PER_SECOND ); //one week of versions
init( ENABLE_VERSION_VECTOR, false );
init( ENABLE_VERSION_VECTOR_TLOG_UNICAST, false );
bool buggifyShortReadWindow = randomize && BUGGIFY && !ENABLE_VERSION_VECTOR;
init( MAX_READ_TRANSACTION_LIFE_VERSIONS, 5 * VERSIONS_PER_SECOND ); if (randomize && BUGGIFY) MAX_READ_TRANSACTION_LIFE_VERSIONS = VERSIONS_PER_SECOND; else if (buggifyShortReadWindow) MAX_READ_TRANSACTION_LIFE_VERSIONS = std::max<int>(1, 0.1 * VERSIONS_PER_SECOND); else if( randomize && BUGGIFY ) MAX_READ_TRANSACTION_LIFE_VERSIONS = 10 * VERSIONS_PER_SECOND;
init( MAX_WRITE_TRANSACTION_LIFE_VERSIONS, 5 * VERSIONS_PER_SECOND ); if (randomize && BUGGIFY) MAX_WRITE_TRANSACTION_LIFE_VERSIONS=std::max<int>(1, 1 * VERSIONS_PER_SECOND);
init( MAX_COMMIT_BATCH_INTERVAL, 2.0 ); if( randomize && BUGGIFY ) MAX_COMMIT_BATCH_INTERVAL = 0.5; // Each commit proxy generates a CommitTransactionBatchRequest at least this often, so that versions always advance smoothly
MAX_COMMIT_BATCH_INTERVAL = std::min(MAX_COMMIT_BATCH_INTERVAL, MAX_READ_TRANSACTION_LIFE_VERSIONS/double(2*VERSIONS_PER_SECOND)); // Ensure that the proxy commits 2 times every MAX_READ_TRANSACTION_LIFE_VERSIONS, otherwise the master will not give out versions fast enough
init( ENABLE_VERSION_VECTOR_TLOG_UNICAST, false );
init( MAX_VERSION_RATE_MODIFIER, 0.1 );
init( MAX_VERSION_RATE_OFFSET, VERSIONS_PER_SECOND ); // If the calculated version is more than this amount away from the expected version, it will be clamped to this value. This prevents huge version jumps.
init( ENABLE_VERSION_VECTOR_HA_OPTIMIZATION, false );
@ -117,7 +119,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// disk snapshot max timeout, to be put in TLog, storage and coordinator nodes
init( MAX_FORKED_PROCESS_OUTPUT, 1024 );
init( SNAP_CREATE_MAX_TIMEOUT, 300.0 );
init( SNAP_CREATE_MAX_TIMEOUT, isSimulated ? 70.0 : 300.0 );
init( SNAP_MINIMUM_TIME_GAP, 5.0 );
init( SNAP_NETWORK_FAILURE_RETRY_LIMIT, 10 );
init( MAX_STORAGE_SNAPSHOT_FAULT_TOLERANCE, 1 );
init( MAX_COORDINATOR_SNAPSHOT_FAULT_TOLERANCE, 1 );
@ -181,7 +185,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
/*
The bytesRead/byteSize ratio. Will be declared as read hot when larger than this. 8.0 was chosen to avoid reporting a table scan as read hot.
*/
init ( SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS, 1666667 * 1000);
init ( SHARD_READ_HOT_BANDWIDTH_MIN_PER_KSECONDS, 1666667 * 1000);
/*
The read bandwidth of a given shard needs to be larger than this value for the shard to be evaluated as read hot. The roughly 1.67MB per second is calculated as follows:
- Heuristic data suggests that each storage process can do at most 500K read operations per second
@ -662,6 +666,11 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( AUTO_TAG_THROTTLE_UPDATE_FREQUENCY, 10.0 ); if(randomize && BUGGIFY) AUTO_TAG_THROTTLE_UPDATE_FREQUENCY = 0.5;
init( TAG_THROTTLE_EXPIRED_CLEANUP_INTERVAL, 30.0 ); if(randomize && BUGGIFY) TAG_THROTTLE_EXPIRED_CLEANUP_INTERVAL = 1.0;
init( AUTO_TAG_THROTTLING_ENABLED, true ); if(randomize && BUGGIFY) AUTO_TAG_THROTTLING_ENABLED = false;
init( SS_THROTTLE_TAGS_TRACKED, 1 ); if(randomize && BUGGIFY) SS_THROTTLE_TAGS_TRACKED = deterministicRandom()->randomInt(1, 10);
init( GLOBAL_TAG_THROTTLING, false );
init( GLOBAL_TAG_THROTTLING_MIN_RATE, 1.0 );
init( GLOBAL_TAG_THROTTLING_FOLDING_TIME, 10.0 );
init( GLOBAL_TAG_THROTTLING_TRACE_INTERVAL, 5.0 );
//Storage Metrics
init( STORAGE_METRICS_AVERAGE_INTERVAL, 120.0 );
@ -725,6 +734,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( CHECKPOINT_TRANSFER_BLOCK_BYTES, 40e6 );
init( QUICK_GET_VALUE_FALLBACK, true );
init( QUICK_GET_KEY_VALUES_FALLBACK, true );
init( MAX_PARALLEL_QUICK_GET_VALUE, 50 ); if ( randomize && BUGGIFY ) MAX_PARALLEL_QUICK_GET_VALUE = deterministicRandom()->randomInt(1, 100);
init( QUICK_GET_KEY_VALUES_LIMIT, 2000 );
init( QUICK_GET_KEY_VALUES_LIMIT_BYTES, 1e7 );

View File

@ -1404,17 +1404,6 @@ BlobWorkerInterface decodeBlobWorkerListValue(ValueRef const& value) {
return interf;
}
Value encodeTenantEntry(TenantMapEntry const& tenantEntry) {
return ObjectWriter::toValue(tenantEntry, IncludeVersion());
}
TenantMapEntry decodeTenantEntry(ValueRef const& value) {
TenantMapEntry entry;
ObjectReader reader(value.begin(), IncludeVersion());
reader.deserialize(entry);
return entry;
}
const KeyRangeRef tenantMapKeys("\xff/tenantMap/"_sr, "\xff/tenantMap0"_sr);
const KeyRef tenantMapPrefix = tenantMapKeys.begin;
const KeyRef tenantMapPrivatePrefix = "\xff\xff/tenantMap/"_sr;

View File

@ -18,12 +18,16 @@
* limitations under the License.
*/
#include "fdbclient/TagThrottle.actor.h"
#include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/TagThrottle.actor.h"
#include "fdbclient/Tuple.h"
#include "flow/actorcompiler.h" // has to be last include
double const ClientTagThrottleLimits::NO_EXPIRATION = std::numeric_limits<double>::max();
void TagSet::addTag(TransactionTagRef tag) {
ASSERT(CLIENT_KNOBS->MAX_TRANSACTION_TAG_LENGTH < 256); // Tag length is encoded with a single byte
ASSERT(CLIENT_KNOBS->MAX_TAGS_PER_TRANSACTION < 256); // Number of tags is encoded with a single byte
@ -124,6 +128,53 @@ TagThrottleValue TagThrottleValue::fromValue(const ValueRef& value) {
return throttleValue;
}
KeyRangeRef const tagQuotaKeys = KeyRangeRef("\xff/tagQuota/"_sr, "\xff/tagQuota0"_sr);
KeyRef const tagQuotaPrefix = tagQuotaKeys.begin;
Key ThrottleApi::getTagQuotaKey(TransactionTagRef tag) {
return tag.withPrefix(tagQuotaPrefix);
}
bool ThrottleApi::TagQuotaValue::isValid() const {
return reservedReadQuota <= totalReadQuota && reservedWriteQuota <= totalWriteQuota && reservedReadQuota >= 0 &&
reservedWriteQuota >= 0;
}
Value ThrottleApi::TagQuotaValue::toValue() const {
Tuple tuple;
tuple.appendDouble(reservedReadQuota);
tuple.appendDouble(totalReadQuota);
tuple.appendDouble(reservedWriteQuota);
tuple.appendDouble(totalWriteQuota);
return tuple.pack();
}
ThrottleApi::TagQuotaValue ThrottleApi::TagQuotaValue::fromValue(ValueRef value) {
auto tuple = Tuple::unpack(value);
if (tuple.size() != 4) {
throw invalid_throttle_quota_value();
}
TagQuotaValue result;
try {
result.reservedReadQuota = tuple.getDouble(0);
result.totalReadQuota = tuple.getDouble(1);
result.reservedWriteQuota = tuple.getDouble(2);
result.totalWriteQuota = tuple.getDouble(3);
} catch (Error& e) {
TraceEvent(SevWarnAlways, "TagQuotaValueFailedToDeserialize").error(e);
throw invalid_throttle_quota_value();
}
if (!result.isValid()) {
TraceEvent(SevWarnAlways, "TagQuotaValueInvalidQuotas")
.detail("ReservedReadQuota", result.reservedReadQuota)
.detail("TotalReadQuota", result.totalReadQuota)
.detail("ReservedWriteQuota", result.reservedWriteQuota)
.detail("TotalWriteQuota", result.totalWriteQuota);
throw invalid_throttle_quota_value();
}
return result;
}
FDB_DEFINE_BOOLEAN_PARAM(ContainsRecommended);
FDB_DEFINE_BOOLEAN_PARAM(Capitalize);

View File

@ -96,12 +96,12 @@ bool TenantMapEntry::matchesConfiguration(TenantMapEntry const& other) const {
TEST_CASE("/fdbclient/TenantMapEntry/Serialization") {
TenantMapEntry entry1(1, ""_sr, TenantState::READY);
ASSERT(entry1.prefix == "\x00\x00\x00\x00\x00\x00\x00\x01"_sr);
TenantMapEntry entry2 = decodeTenantEntry(encodeTenantEntry(entry1));
TenantMapEntry entry2 = TenantMapEntry::decode(entry1.encode());
ASSERT(entry1.id == entry2.id && entry1.prefix == entry2.prefix);
TenantMapEntry entry3(std::numeric_limits<int64_t>::max(), "foo"_sr, TenantState::READY);
ASSERT(entry3.prefix == "foo\x7f\xff\xff\xff\xff\xff\xff\xff"_sr);
TenantMapEntry entry4 = decodeTenantEntry(encodeTenantEntry(entry3));
TenantMapEntry entry4 = TenantMapEntry::decode(entry3.encode());
ASSERT(entry3.id == entry4.id && entry3.prefix == entry4.prefix);
for (int i = 0; i < 100; ++i) {
@ -120,7 +120,7 @@ TEST_CASE("/fdbclient/TenantMapEntry/Serialization") {
entry.prefix.endsWith(StringRef(reinterpret_cast<uint8_t*>(&bigEndianId), 8)) &&
entry.prefix.size() == subspaceLength + 8);
TenantMapEntry decodedEntry = decodeTenantEntry(encodeTenantEntry(entry));
TenantMapEntry decodedEntry = TenantMapEntry::decode(entry.encode());
ASSERT(decodedEntry.id == entry.id && decodedEntry.prefix == entry.prefix);
}

View File

@ -79,6 +79,7 @@ struct MutationRef {
CompareAndClear,
Reserved_For_SpanContextMessage /* See fdbserver/SpanContextMessage.h */,
Reserved_For_OTELSpanContextMessage,
Reserved_For_EncryptedMutationMessage /* See fdbserver/EncryptedMutationMessage.actor.h */,
MAX_ATOMIC_OP
};
// This is stored this way for serialization purposes.

View File

@ -116,23 +116,7 @@ public:
bool canRecheck() const { return lastCheck < now() - CLIENT_KNOBS->TAG_THROTTLE_RECHECK_INTERVAL; }
double throttleDuration() const {
if (expiration <= now()) {
return 0.0;
}
double capacity =
(smoothRate.smoothTotal() - smoothReleased.smoothRate()) * CLIENT_KNOBS->TAG_THROTTLE_SMOOTHING_WINDOW;
if (capacity >= 1) {
return 0.0;
}
if (tpsRate == 0) {
return std::max(0.0, expiration - now());
}
return std::min(expiration - now(), capacity / tpsRate);
}
double throttleDuration() const;
};
struct WatchParameters : public ReferenceCounted<WatchParameters> {
@ -307,10 +291,12 @@ public:
Future<Void> splitStorageMetricsStream(PromiseStream<Key> const& resultsStream,
KeyRange const& keys,
StorageMetrics const& limit,
StorageMetrics const& estimated);
StorageMetrics const& estimated,
Optional<int> const& minSplitBytes = {});
Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(KeyRange const& keys,
StorageMetrics const& limit,
StorageMetrics const& estimated);
StorageMetrics const& estimated,
Optional<int> const& minSplitBytes = {});
Future<Standalone<VectorRef<ReadHotRangeWithMetrics>>> getReadHotRanges(KeyRange const& keys);

View File

@ -41,6 +41,7 @@ standard API and some knowledge of the contents of the system key space.
#include "fdbclient/MonitorLeader.h"
#include "flow/actorcompiler.h" // has to be last include
ACTOR Future<DatabaseConfiguration> getDatabaseConfiguration(Transaction* tr);
ACTOR Future<DatabaseConfiguration> getDatabaseConfiguration(Database cx);
ACTOR Future<Void> waitForFullReplication(Database cx);

View File

@ -465,6 +465,7 @@ public:
Reference<TransactionState> trState;
std::vector<Reference<Watch>> watches;
TagSet const& getTags() const;
Span span;
// used in template functions as returned Future type

View File

@ -196,6 +196,7 @@ public:
Transaction& getTransaction() { return tr; }
Optional<TenantName> getTenant() { return tr.getTenant(); }
TagSet const& getTags() const { return tr.getTags(); }
// used in template functions as returned Future type
template <typename Type>

View File

@ -177,7 +177,7 @@ public:
SHARD_MIN_BYTES_PER_KSEC, // Shards with more than this bandwidth will not be merged
SHARD_SPLIT_BYTES_PER_KSEC; // When splitting a shard, it is split into pieces with less than this bandwidth
double SHARD_MAX_READ_DENSITY_RATIO;
int64_t SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS;
int64_t SHARD_READ_HOT_BANDWIDTH_MIN_PER_KSECONDS;
double SHARD_MAX_BYTES_READ_PER_KSEC_JITTER;
double STORAGE_METRIC_TIMEOUT;
double METRIC_DELAY;
@ -564,6 +564,7 @@ public:
int64_t TLOG_RECOVER_MEMORY_LIMIT;
double TLOG_IGNORE_POP_AUTO_ENABLE_DELAY;
// Tag throttling
int64_t MAX_MANUAL_THROTTLED_TRANSACTION_TAGS;
int64_t MAX_AUTO_THROTTLED_TRANSACTION_TAGS;
double MIN_TAG_COST;
@ -576,6 +577,17 @@ public:
double AUTO_TAG_THROTTLE_UPDATE_FREQUENCY;
double TAG_THROTTLE_EXPIRED_CLEANUP_INTERVAL;
bool AUTO_TAG_THROTTLING_ENABLED;
// Limit to the number of throttling tags each storage server
// will track and send to the ratekeeper
int64_t SS_THROTTLE_TAGS_TRACKED;
// Use global tag throttling strategy. i.e. throttle based on the cluster-wide
// throughput for tags and their associated quotas.
bool GLOBAL_TAG_THROTTLING;
// Minimum number of transactions per second that the global tag throttler must allow for each tag
double GLOBAL_TAG_THROTTLING_MIN_RATE;
// Used by global tag throttling counters
double GLOBAL_TAG_THROTTLING_FOLDING_TIME;
double GLOBAL_TAG_THROTTLING_TRACE_INTERVAL;
double MAX_TRANSACTIONS_PER_BYTE;
@ -603,7 +615,12 @@ public:
// disk snapshot
int64_t MAX_FORKED_PROCESS_OUTPUT;
// retry limit after network failures
int64_t SNAP_NETWORK_FAILURE_RETRY_LIMIT;
// time limit for creating snapshot
double SNAP_CREATE_MAX_TIMEOUT;
// minimum gap time between two snapshot requests for the same process
double SNAP_MINIMUM_TIME_GAP;
// Maximum number of storage servers a snapshot can fail to
// capture while still succeeding
int64_t MAX_STORAGE_SNAPSHOT_FAULT_TOLERANCE;
@ -672,6 +689,7 @@ public:
bool ENABLE_CLEAR_RANGE_EAGER_READS;
bool QUICK_GET_VALUE_FALLBACK;
bool QUICK_GET_KEY_VALUES_FALLBACK;
int MAX_PARALLEL_QUICK_GET_VALUE;
int CHECKPOINT_TRANSFER_BLOCK_BYTES;
int QUICK_GET_KEY_VALUES_LIMIT;
int QUICK_GET_KEY_VALUES_LIMIT_BYTES;

View File

@ -716,18 +716,21 @@ struct SplitMetricsRequest {
StorageMetrics estimated;
bool isLastShard;
ReplyPromise<SplitMetricsReply> reply;
Optional<int> minSplitBytes;
SplitMetricsRequest() {}
SplitMetricsRequest(KeyRangeRef const& keys,
StorageMetrics const& limits,
StorageMetrics const& used,
StorageMetrics const& estimated,
bool isLastShard)
: keys(arena, keys), limits(limits), used(used), estimated(estimated), isLastShard(isLastShard) {}
bool isLastShard,
Optional<int> minSplitBytes)
: keys(arena, keys), limits(limits), used(used), estimated(estimated), isLastShard(isLastShard),
minSplitBytes(minSplitBytes) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, keys, limits, used, estimated, isLastShard, reply, arena);
serializer(ar, keys, limits, used, estimated, isLastShard, reply, arena, minSplitBytes);
}
};

View File

@ -390,6 +390,8 @@ extern const KeyRef tagThrottleSignalKey;
extern const KeyRef tagThrottleAutoEnabledKey;
extern const KeyRef tagThrottleLimitKey;
extern const KeyRef tagThrottleCountKey;
extern const KeyRangeRef tagQuotaKeys;
extern const KeyRef tagQuotaPrefix;
// Log Range constant variables
// Used in the backup pipeline to track mutations
@ -632,9 +634,6 @@ extern const KeyRef tenantDataPrefixKey;
extern const KeyRangeRef tenantGroupTenantIndexKeys;
extern const KeyRangeRef tenantTombstoneKeys;
Value encodeTenantEntry(TenantMapEntry const& tenantEntry);
TenantMapEntry decodeTenantEntry(ValueRef const& value);
// Metacluster keys
extern const KeyRangeRef dataClusterMetadataKeys;
extern const KeyRef dataClusterMetadataPrefix;

View File

@ -207,6 +207,8 @@ struct ClientTagThrottleLimits {
double tpsRate;
double expiration;
static double const NO_EXPIRATION;
ClientTagThrottleLimits() : tpsRate(0), expiration(0) {}
ClientTagThrottleLimits(double tpsRate, double expiration) : tpsRate(tpsRate), expiration(expiration) {}
@ -595,6 +597,38 @@ Future<Void> enableAuto(Reference<DB> db, bool enabled) {
}
}
class TagQuotaValue {
public:
double reservedReadQuota{ 0.0 };
double totalReadQuota{ 0.0 };
double reservedWriteQuota{ 0.0 };
double totalWriteQuota{ 0.0 };
bool isValid() const;
Value toValue() const;
static TagQuotaValue fromValue(ValueRef);
};
Key getTagQuotaKey(TransactionTagRef);
template <class Tr>
void setTagQuota(Reference<Tr> tr,
TransactionTagRef tag,
double reservedReadQuota,
double totalReadQuota,
double reservedWriteQuota,
double totalWriteQuota) {
TagQuotaValue tagQuotaValue;
tagQuotaValue.reservedReadQuota = reservedReadQuota;
tagQuotaValue.totalReadQuota = totalReadQuota;
tagQuotaValue.reservedWriteQuota = reservedWriteQuota;
tagQuotaValue.totalWriteQuota = totalWriteQuota;
if (!tagQuotaValue.isValid()) {
throw invalid_throttle_quota_value();
}
tr->set(getTagQuotaKey(tag), tagQuotaValue.toValue());
signalThrottleChange(tr);
}
}; // namespace ThrottleApi
template <class Value>

View File

@ -59,6 +59,15 @@ struct TenantMapEntry {
void setSubspace(KeyRef subspace);
bool matchesConfiguration(TenantMapEntry const& other) const;
Value encode() const { return ObjectWriter::toValue(*this, IncludeVersion(ProtocolVersion::withTenants())); }
static TenantMapEntry decode(ValueRef const& value) {
TenantMapEntry entry;
ObjectReader reader(value.begin(), IncludeVersion(ProtocolVersion::withTenants()));
reader.deserialize(entry);
return entry;
}
template <class Ar>
void serialize(Ar& ar) {
KeyRef subspace;

View File

@ -41,7 +41,7 @@ Future<Optional<TenantMapEntry>> tryGetTenantTransaction(Transaction tr, TenantN
state typename transaction_future_type<Transaction, Optional<Value>>::type tenantFuture = tr->get(tenantMapKey);
Optional<Value> val = wait(safeThreadFutureToFuture(tenantFuture));
return val.map<TenantMapEntry>([](Optional<Value> v) { return decodeTenantEntry(v.get()); });
return val.map<TenantMapEntry>([](Optional<Value> v) { return TenantMapEntry::decode(v.get()); });
}
ACTOR template <class DB>
@ -181,7 +181,8 @@ Future<std::pair<Optional<TenantMapEntry>, bool>> createTenantTransaction(
tenantEntry.assignedCluster = Optional<ClusterName>();
}
tr->set(tenantMapKey, encodeTenantEntry(tenantEntry));
tr->set(tenantMapKey, newTenant.encode());
if (tenantEntry.tenantGroup.present()) {
tr->set(getTenantGroupIndexKey(tenantEntry.tenantGroup.get(), name), ""_sr);
}
@ -355,7 +356,7 @@ Future<std::map<TenantName, TenantMapEntry>> listTenantsTransaction(Transaction
std::map<TenantName, TenantMapEntry> tenants;
for (auto kv : results) {
tenants[kv.key.removePrefix(tenantMapPrefix)] = decodeTenantEntry(kv.value);
tenants[kv.key.removePrefix(tenantMapPrefix)] = TenantMapEntry::decode(kv.value);
}
return tenants;

View File

@ -126,6 +126,10 @@ public:
Future<KillType> onShutdown() { return shutdownSignal.getFuture(); }
bool isSpawnedKVProcess() const {
// SOMEDAY: using a separate bool may be better
return name == "remote flow process";
}
bool isReliable() const {
return !failed && fault_injection_p1 == 0 && fault_injection_p2 == 0 && !failedDisk &&
(!machine || (machine->machineProcess->fault_injection_p1 == 0 &&

View File

@ -1328,7 +1328,8 @@ public:
std::vector<LocalityData> primaryLocalitiesDead, primaryLocalitiesLeft;
for (auto processInfo : getAllProcesses()) {
if (processInfo->isAvailableClass() && processInfo->locality.dcId() == dcId) {
if (!processInfo->isSpawnedKVProcess() && processInfo->isAvailableClass() &&
processInfo->locality.dcId() == dcId) {
if (processInfo->isExcluded() || processInfo->isCleared() || !processInfo->isAvailable()) {
primaryProcessesDead.add(processInfo->locality);
primaryLocalitiesDead.push_back(processInfo->locality);
@ -1348,7 +1349,6 @@ public:
if (usableRegions > 1 && remoteTLogPolicy && !primaryTLogsDead) {
primaryTLogsDead = primaryProcessesDead.validate(remoteTLogPolicy);
}
return primaryTLogsDead || primaryProcessesDead.validate(storagePolicy);
}
@ -1602,7 +1602,7 @@ public:
.detail("Protected", protectedAddresses.count(machine->address))
.backtrace();
// This will remove all the "tracked" messages that came from the machine being killed
if (machine->name != "remote flow process")
if (!machine->isSpawnedKVProcess())
latestEventCache.clear();
machine->failed = true;
} else if (kt == InjectFaults) {
@ -1631,8 +1631,7 @@ public:
} else {
ASSERT(false);
}
ASSERT(!protectedAddresses.count(machine->address) || machine->rebooting ||
machine->name == "remote flow process");
ASSERT(!protectedAddresses.count(machine->address) || machine->rebooting || machine->isSpawnedKVProcess());
}
void rebootProcess(ProcessInfo* process, KillType kt) override {
if (kt == RebootProcessAndDelete && protectedAddresses.count(process->address)) {
@ -2498,7 +2497,7 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
.detail("Rebooting", p->rebooting)
.detail("Reliable", p->isReliable());
return;
} else if (p->name == "remote flow process") {
} else if (p->isSpawnedKVProcess()) {
TraceEvent(SevDebug, "DoRebootFailed").detail("Name", p->name).detail("Address", p->address);
return;
} else if (p->getChilds().size()) {

View File

@ -24,6 +24,7 @@
#include "fdbclient/Notified.h"
#include "fdbclient/SystemData.h"
#include "fdbserver/ApplyMetadataMutation.h"
#include "fdbserver/EncryptedMutationMessage.h"
#include "fdbserver/IKeyValueStore.h"
#include "fdbserver/LogProtocolMessage.h"
#include "fdbserver/LogSystem.h"
@ -67,13 +68,14 @@ public:
ProxyCommitData& proxyCommitData_,
Reference<ILogSystem> logSystem_,
LogPushData* toCommit_,
const std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>* cipherKeys_,
bool& confChange_,
Version version,
Version popVersion_,
bool initialCommit_)
: spanContext(spanContext_), dbgid(proxyCommitData_.dbgid), arena(arena_), mutations(mutations_),
txnStateStore(proxyCommitData_.txnStateStore), toCommit(toCommit_), confChange(confChange_),
logSystem(logSystem_), version(version), popVersion(popVersion_),
txnStateStore(proxyCommitData_.txnStateStore), toCommit(toCommit_), cipherKeys(cipherKeys_),
confChange(confChange_), logSystem(logSystem_), version(version), popVersion(popVersion_),
vecBackupKeys(&proxyCommitData_.vecBackupKeys), keyInfo(&proxyCommitData_.keyInfo),
cacheInfo(&proxyCommitData_.cacheInfo),
uid_applyMutationsData(proxyCommitData_.firstProxy ? &proxyCommitData_.uid_applyMutationsData : nullptr),
@ -108,6 +110,9 @@ private:
// non-null if these mutations were part of a new commit handled by this commit proxy
LogPushData* toCommit = nullptr;
// Cipher keys used to encrypt the mutations that are about to be committed
const std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>* cipherKeys = nullptr;
// Flag indicating whether the configuration has changed
bool& confChange;
@ -152,6 +157,16 @@ private:
bool dummyConfChange = false;
private:
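// Writes a mutation to the commit stream. When TLog encryption is enabled and this mutation is
// not being applied on behalf of a resolver, the mutation is first wrapped in an
// EncryptedMutationMessage using the commit-time cipher keys.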
void writeMutation(const MutationRef& m) {
if (forResolver || !SERVER_KNOBS->ENABLE_TLOG_ENCRYPTION) {
toCommit->writeTypedMessage(m);
} else {
ASSERT(cipherKeys != nullptr);
Arena arena;
toCommit->writeTypedMessage(EncryptedMutationMessage::encryptMetadata(arena, *cipherKeys, m));
}
}
void checkSetKeyServersPrefix(MutationRef m) {
if (!m.param1.startsWith(keyServersPrefix)) {
return;
@ -221,7 +236,7 @@ private:
.detail("Tag", tag.toString());
toCommit->addTag(tag);
toCommit->writeTypedMessage(privatized);
writeMutation(privatized);
}
}
@ -243,7 +258,7 @@ private:
toCommit->writeTypedMessage(LogProtocolMessage(), true);
TraceEvent(SevDebug, "SendingPrivatized_ServerTag", dbgid).detail("M", privatized);
toCommit->addTag(tag);
toCommit->writeTypedMessage(privatized);
writeMutation(privatized);
}
if (!initialCommit) {
txnStateStore->set(KeyValueRef(m.param1, m.param2));
@ -303,7 +318,7 @@ private:
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
TraceEvent(SevDebug, "SendingPrivatized_CacheTag", dbgid).detail("M", privatized);
toCommit->addTag(cacheTag);
toCommit->writeTypedMessage(privatized);
writeMutation(privatized);
}
void checkSetConfigKeys(MutationRef m) {
@ -354,7 +369,7 @@ private:
toCommit->addTags(allSources);
}
TraceEvent(SevDebug, "SendingPrivatized_ChangeFeed", dbgid).detail("M", privatized);
toCommit->writeTypedMessage(privatized);
writeMutation(privatized);
}
}
@ -408,7 +423,7 @@ private:
if (tagV.present()) {
TraceEvent(SevDebug, "SendingPrivatized_TSSID", dbgid).detail("M", privatized);
toCommit->addTag(decodeServerTagValue(tagV.get()));
toCommit->writeTypedMessage(privatized);
writeMutation(privatized);
}
}
}
@ -437,7 +452,7 @@ private:
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
TraceEvent(SevDebug, "SendingPrivatized_TSSQuarantine", dbgid).detail("M", privatized);
toCommit->addTag(decodeServerTagValue(tagV.get()));
toCommit->writeTypedMessage(privatized);
writeMutation(privatized);
}
}
@ -560,7 +575,7 @@ private:
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
TraceEvent(SevDebug, "SendingPrivatized_GlobalKeys", dbgid).detail("M", privatized);
toCommit->addTags(allTags);
toCommit->writeTypedMessage(privatized);
writeMutation(privatized);
}
// Generates private mutations for the target storage server, instructing it to create a checkpoint.
@ -582,7 +597,7 @@ private:
.detail("Checkpoint", checkpoint.toString());
toCommit->addTag(tag);
toCommit->writeTypedMessage(privatized);
writeMutation(privatized);
}
}
@ -639,7 +654,7 @@ private:
if (tenantMap) {
ASSERT(version != invalidVersion);
TenantName tenantName = m.param1.removePrefix(tenantMapPrefix);
TenantMapEntry tenantEntry = decodeTenantEntry(m.param2);
TenantMapEntry tenantEntry = TenantMapEntry::decode(m.param2);
TraceEvent("CommitProxyInsertTenant", dbgid).detail("Tenant", tenantName).detail("Version", version);
(*tenantMap)[tenantName] = tenantEntry;
@ -662,7 +677,7 @@ private:
MutationRef privatized = m;
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
toCommit->writeTypedMessage(privatized);
writeMutation(privatized);
}
TEST(true); // Tenant added to map
@ -780,7 +795,7 @@ private:
TraceEvent(SevDebug, "SendingPrivatized_ClearServerTag", dbgid).detail("M", privatized);
toCommit->addTag(decodeServerTagValue(kv.value));
toCommit->writeTypedMessage(privatized);
writeMutation(privatized);
}
}
// Might be a tss removal, which doesn't store a tag there.
@ -804,7 +819,7 @@ private:
TraceEvent(SevDebug, "SendingPrivatized_TSSClearServerTag", dbgid)
.detail("M", privatized);
toCommit->addTag(decodeServerTagValue(tagV.get()));
toCommit->writeTypedMessage(privatized);
writeMutation(privatized);
}
}
}
@ -989,7 +1004,7 @@ private:
privatized.param2 = m.param2.withPrefix(systemKeys.begin, arena);
TraceEvent(SevDebug, "SendingPrivatized_ClearTSSMapping", dbgid).detail("M", privatized);
toCommit->addTag(decodeServerTagValue(tagV.get()));
toCommit->writeTypedMessage(privatized);
writeMutation(privatized);
}
}
@ -1016,7 +1031,7 @@ private:
privatized.param2 = m.param2.withPrefix(systemKeys.begin, arena);
TraceEvent(SevDebug, "SendingPrivatized_ClearTSSQuarantine", dbgid).detail("M", privatized);
toCommit->addTag(decodeServerTagValue(tagV.get()));
toCommit->writeTypedMessage(privatized);
writeMutation(privatized);
}
}
}
@ -1070,7 +1085,7 @@ private:
privatized.type = MutationRef::ClearRange;
privatized.param1 = range.begin.withPrefix(systemKeys.begin, arena);
privatized.param2 = range.end.withPrefix(systemKeys.begin, arena);
toCommit->writeTypedMessage(privatized);
writeMutation(privatized);
}
TEST(true); // Tenant cleared from map
@ -1179,9 +1194,9 @@ private:
.detail("MBegin", mutationBegin)
.detail("MEnd", mutationEnd);
toCommit->addTags(allTags);
toCommit->writeTypedMessage(mutationBegin);
writeMutation(mutationBegin);
toCommit->addTags(allTags);
toCommit->writeTypedMessage(mutationEnd);
writeMutation(mutationEnd);
}
}
@ -1258,6 +1273,7 @@ void applyMetadataMutations(SpanContext const& spanContext,
Reference<ILogSystem> logSystem,
const VectorRef<MutationRef>& mutations,
LogPushData* toCommit,
const std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>* pCipherKeys,
bool& confChange,
Version version,
Version popVersion,
@ -1269,6 +1285,7 @@ void applyMetadataMutations(SpanContext const& spanContext,
proxyCommitData,
logSystem,
toCommit,
pCipherKeys,
confChange,
version,
popVersion,

View File

@ -25,6 +25,8 @@
#include "fdbclient/SystemData.h"
#include "fdbserver/BackupInterface.h"
#include "fdbserver/BackupProgress.actor.h"
#include "fdbserver/EncryptedMutationMessage.h"
#include "fdbserver/GetEncryptCipherKeys.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/LogProtocolMessage.h"
#include "fdbserver/LogSystem.h"
@ -44,6 +46,7 @@ struct VersionedMessage {
StringRef message;
VectorRef<Tag> tags;
Arena arena; // Keep a reference to the memory containing the message
Arena decryptArena; // Arena used for decrypt buffer.
size_t bytes; // arena's size when inserted, which can grow afterwards
VersionedMessage(LogMessageVersion v, StringRef m, const VectorRef<Tag>& t, const Arena& a)
@ -53,7 +56,8 @@ struct VersionedMessage {
// Returns true if the message is a mutation that should be backed up, i.e.,
// the key is either outside the system key space or is the metadataVersionKey.
bool isBackupMessage(MutationRef* m) const {
bool isBackupMessage(MutationRef* m,
const std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>>& cipherKeys) {
for (Tag tag : tags) {
if (tag.locality == tagLocalitySpecial || tag.locality == tagLocalityTxs) {
return false; // skip Txs mutations
@ -71,10 +75,26 @@ struct VersionedMessage {
TEST(true); // Returning false for OTELSpanContextMessage
return false;
}
reader >> *m;
if (EncryptedMutationMessage::isNextIn(reader)) {
// In case the mutation is encrypted, get the decrypted mutation and also update message to point to
// the decrypted mutation.
// We use a dedicated arena for the decrypt buffer, as the other arena is used to count towards backup lock bytes.
*m = EncryptedMutationMessage::decrypt(reader, decryptArena, cipherKeys, &message);
} else {
reader >> *m;
}
return normalKeys.contains(m->param1) || m->param1 == metadataVersionKey;
}
void collectCipherDetailIfEncrypted(std::unordered_set<BlobCipherDetails>& cipherDetails) {
ArenaReader reader(arena, message, AssumeVersion(g_network->protocolVersion()));
if (EncryptedMutationMessage::isNextIn(reader)) {
EncryptedMutationMessage emm;
reader >> emm;
cipherDetails.insert(emm.header.cipherTextDetails);
cipherDetails.insert(emm.header.cipherHeaderDetails);
}
}
};
struct BackupData {
@ -89,6 +109,7 @@ struct BackupData {
Version minKnownCommittedVersion;
Version savedVersion; // Largest version saved to blob storage
Version popVersion; // Largest version popped in NOOP mode, can be larger than savedVersion.
Reference<AsyncVar<ServerDBInfo> const> db;
AsyncVar<Reference<ILogSystem>> logSystem;
Database cx;
std::vector<VersionedMessage> messages;
@ -245,7 +266,7 @@ struct BackupData {
: myId(id), tag(req.routerTag), totalTags(req.totalTags), startVersion(req.startVersion),
endVersion(req.endVersion), recruitedEpoch(req.recruitedEpoch), backupEpoch(req.backupEpoch),
minKnownCommittedVersion(invalidVersion), savedVersion(req.startVersion - 1), popVersion(req.startVersion - 1),
pulledVersion(0), paused(false), lock(new FlowLock(SERVER_KNOBS->BACKUP_LOCK_BYTES)),
db(db), pulledVersion(0), paused(false), lock(new FlowLock(SERVER_KNOBS->BACKUP_LOCK_BYTES)),
cc("BackupWorker", myId.toString()) {
cx = openDBOnServer(db, TaskPriority::DefaultEndpoint, LockAware::True);
@ -682,7 +703,10 @@ ACTOR static Future<Void> updateLogBytesWritten(BackupData* self,
// Saves messages in the range of [0, numMsg) to a file and then removes these
// messages. The file content format is a sequence of (Version, sub#, msgSize, message).
// Note only ready backups are saved.
ACTOR Future<Void> saveMutationsToFile(BackupData* self, Version popVersion, int numMsg) {
ACTOR Future<Void> saveMutationsToFile(BackupData* self,
Version popVersion,
int numMsg,
std::unordered_set<BlobCipherDetails> cipherDetails) {
state int blockSize = SERVER_KNOBS->BACKUP_FILE_BLOCK_BYTES;
state std::vector<Future<Reference<IBackupFile>>> logFileFutures;
state std::vector<Reference<IBackupFile>> logFiles;
@ -691,6 +715,7 @@ ACTOR Future<Void> saveMutationsToFile(BackupData* self, Version popVersion, int
state std::vector<Version> beginVersions; // logFiles' begin versions
state KeyRangeMap<std::set<int>> keyRangeMap; // range to index in logFileFutures, logFiles, & blockEnds
state std::vector<Standalone<StringRef>> mutations;
state std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>> cipherKeys;
state int idx;
// Make sure all backups are ready, otherwise mutations will be lost.
@ -742,11 +767,18 @@ ACTOR Future<Void> saveMutationsToFile(BackupData* self, Version popVersion, int
.detail("File", logFiles[i]->getFileName());
}
// Fetch cipher keys if any of the messages are encrypted.
if (!cipherDetails.empty()) {
std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>> getCipherKeysResult =
wait(getEncryptCipherKeys(self->db, cipherDetails));
cipherKeys = getCipherKeysResult;
}
blockEnds = std::vector<int64_t>(logFiles.size(), 0);
for (idx = 0; idx < numMsg; idx++) {
const auto& message = self->messages[idx];
auto& message = self->messages[idx];
MutationRef m;
if (!message.isBackupMessage(&m))
if (!message.isBackupMessage(&m, cipherKeys))
continue;
DEBUG_MUTATION("addMutation", message.version.version, m)
@ -815,6 +847,7 @@ ACTOR Future<Void> uploadData(BackupData* self) {
state Future<Void> uploadDelay = delay(SERVER_KNOBS->BACKUP_UPLOAD_DELAY);
state int numMsg = 0;
state std::unordered_set<BlobCipherDetails> cipherDetails;
Version lastPopVersion = popVersion;
// index of last version's end position in self->messages
int lastVersionIndex = 0;
@ -826,7 +859,7 @@ ACTOR Future<Void> uploadData(BackupData* self) {
popVersion = std::max(popVersion, self->minKnownCommittedVersion);
}
} else {
for (const auto& message : self->messages) {
for (auto& message : self->messages) {
// A message may be prefetched in peek; uncommitted messages should not be uploaded.
const Version version = message.getVersion();
if (version > self->maxPopVersion())
@ -836,6 +869,7 @@ ACTOR Future<Void> uploadData(BackupData* self) {
lastVersion = popVersion;
popVersion = version;
}
message.collectCipherDetailIfEncrypted(cipherDetails);
numMsg++;
}
}
@ -859,7 +893,7 @@ ACTOR Future<Void> uploadData(BackupData* self) {
.detail("NumMsg", numMsg)
.detail("MsgQ", self->messages.size());
// save an empty file for old epochs so that log file versions are continuous
wait(saveMutationsToFile(self, popVersion, numMsg));
wait(saveMutationsToFile(self, popVersion, numMsg, cipherDetails));
self->eraseMessages(numMsg);
}
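// Editor's sketch (not part of this diff): the backup worker change above follows a
// two-pass pattern -- scan messages once to collect the cipher details of any encrypted
// payloads, fetch all needed keys in a single batch, then decode each message with the
// keys in hand. The toy types below (Message, fetchKeys, decodeAll) are hypothetical
// stand-ins used only to illustrate the shape of that pattern.
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

struct Message {
	bool encrypted;
	int cipherId;
	std::string payload;
};

// Stand-in for one batched round trip to a key server.
std::unordered_map<int, std::string> fetchKeys(const std::unordered_set<int>& ids) {
	std::unordered_map<int, std::string> keys;
	for (int id : ids)
		keys[id] = "key-" + std::to_string(id);
	return keys;
}

std::vector<std::string> decodeAll(const std::vector<Message>& msgs) {
	std::unordered_set<int> needed;
	for (const auto& m : msgs) // pass 1: collect cipher details of encrypted messages
		if (m.encrypted)
			needed.insert(m.cipherId);
	const auto keys = fetchKeys(needed); // one batched key fetch instead of one per message
	std::vector<std::string> out;
	for (const auto& m : msgs) // pass 2: decode, now that every needed key is available
		out.push_back(m.encrypted ? keys.at(m.cipherId) + ":" + m.payload : m.payload);
	return out;
}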

View File

@ -354,8 +354,9 @@ ACTOR Future<Standalone<VectorRef<KeyRef>>> splitRange(Reference<BlobManagerData
state PromiseStream<Key> resultStream;
state Standalone<VectorRef<KeyRef>> keys;
state Future<Void> streamFuture =
bmData->db->splitStorageMetricsStream(resultStream, range, splitMetrics, estimated);
// Use splitMetrics.bytes / 3 as the minimum split size, following the same splitThreshold logic as above.
state Future<Void> streamFuture = bmData->db->splitStorageMetricsStream(
resultStream, range, splitMetrics, estimated, splitMetrics.bytes / 3);
loop {
try {
Key k = waitNext(resultStream.getFuture());
@ -846,7 +847,7 @@ ACTOR Future<Void> monitorClientRanges(Reference<BlobManagerData> bmData) {
std::vector<Key> prefixes;
for (auto& it : tenantResults) {
TenantNameRef tenantName = it.key.removePrefix(tenantMapPrefix);
TenantMapEntry entry = decodeTenantEntry(it.value);
TenantMapEntry entry = TenantMapEntry::decode(it.value);
tenants.push_back(std::pair(tenantName, entry));
prefixes.push_back(entry.prefix);
}

View File

@ -3198,7 +3198,7 @@ ACTOR Future<Void> monitorTenants(Reference<BlobWorkerData> bwData) {
for (auto& it : tenantResults) {
// FIXME: handle removing/moving tenants!
TenantNameRef tenantName = it.key.removePrefix(tenantMapPrefix);
TenantMapEntry entry = decodeTenantEntry(it.value);
TenantMapEntry entry = TenantMapEntry::decode(it.value);
tenants.push_back(std::pair(tenantName, entry));
}
bwData->tenantData.addTenants(tenants);

View File

@ -1013,11 +1013,6 @@ ACTOR Future<Void> updateLocalityForDcId(Optional<Key> dcId,
if (ver == invalidVersion) {
ver = oldLogSystem->getKnownCommittedVersion();
}
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR_TLOG_UNICAST) {
// Do not try to split peeks between data centers in peekTxns() to recover mem kvstore.
// This recovery optimization won't work in UNICAST mode.
loc.first = -1;
}
locality->set(PeekTxsInfo(loc.first, loc.second, ver));
TraceEvent("UpdatedLocalityForDcId")

View File

@ -34,7 +34,9 @@
#include "fdbserver/ApplyMetadataMutation.h"
#include "fdbserver/ConflictSet.h"
#include "fdbserver/DataDistributorInterface.h"
#include "fdbserver/EncryptedMutationMessage.h"
#include "fdbserver/FDBExecHelper.actor.h"
#include "fdbserver/GetEncryptCipherKeys.h"
#include "fdbserver/IKeyValueStore.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/LogSystem.h"
@ -48,6 +50,7 @@
#include "fdbserver/WaitFailure.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "flow/ActorCollection.h"
#include "flow/BlobCipher.h"
#include "flow/Error.h"
#include "flow/IRandom.h"
#include "flow/Knobs.h"
@ -641,6 +644,9 @@ struct CommitBatchContext {
std::set<Tag> writtenTags; // final set tags written to in the batch
std::set<Tag> writtenTagsPreResolution; // tags written to in the batch not including any changes from the resolver.
// Cipher keys to be used to encrypt mutations
std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>> cipherKeys;
CommitBatchContext(ProxyCommitData*, const std::vector<CommitTransactionRequest>*, const int);
void setupTraceBatch();
@ -897,6 +903,27 @@ ACTOR Future<Void> getResolution(CommitBatchContext* self) {
self->transactionResolverMap.swap(requests.transactionResolverMap);
// Used to report conflicting keys
self->txReadConflictRangeIndexMap.swap(requests.txReadConflictRangeIndexMap);
// Fetch cipher keys if needed.
state Future<std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>> getCipherKeys;
if (SERVER_KNOBS->ENABLE_TLOG_ENCRYPTION) {
static std::unordered_map<EncryptCipherDomainId, EncryptCipherDomainName> defaultDomains = {
{ SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME },
{ ENCRYPT_HEADER_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME }
};
std::unordered_map<EncryptCipherDomainId, EncryptCipherDomainName> encryptDomains = defaultDomains;
for (int t = 0; t < trs.size(); t++) {
int64_t tenantId = trs[t].tenantInfo.tenantId;
Optional<TenantName> tenantName = trs[t].tenantInfo.name;
// TODO(yiwu): In raw access mode, use tenant prefix to figure out tenant id for user data
if (tenantId != TenantInfo::INVALID_TENANT) {
ASSERT(tenantName.present());
encryptDomains[tenantId] = tenantName.get();
}
}
getCipherKeys = getLatestEncryptCipherKeys(pProxyCommitData->db, encryptDomains);
}
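// Editor's note (not part of this diff): the map built above always contains the system
// keyspace and encryption-header domains, plus one domain per tenant seen in this batch,
// so a single getLatestEncryptCipherKeys() call can cover every mutation the batch might
// need to encrypt.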
self->releaseFuture = releaseResolvingAfter(pProxyCommitData, self->releaseDelay, self->localBatchNumber);
if (self->localBatchNumber - self->pProxyCommitData->latestLocalCommitBatchLogging.get() >
@ -922,6 +949,11 @@ ACTOR Future<Void> getResolution(CommitBatchContext* self) {
"CommitDebug", self->debugID.get().first(), "CommitProxyServer.commitBatch.AfterResolution");
}
if (SERVER_KNOBS->ENABLE_TLOG_ENCRYPTION) {
std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>> cipherKeys = wait(getCipherKeys);
self->cipherKeys = cipherKeys;
}
return Void();
}
@ -961,6 +993,7 @@ void applyMetadataEffect(CommitBatchContext* self) {
self->pProxyCommitData->logSystem,
self->resolution[0].stateMutations[versionIndex][transactionIndex].mutations,
/* pToCommit= */ nullptr,
/* pCipherKeys= */ nullptr,
self->forceRecovery,
/* version= */ self->commitVersion,
/* popVersion= */ 0,
@ -1060,6 +1093,7 @@ ACTOR Future<Void> applyMetadataToCommittedTransactions(CommitBatchContext* self
pProxyCommitData->logSystem,
trs[t].transaction.mutations,
SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS ? nullptr : &self->toCommit,
SERVER_KNOBS->ENABLE_TLOG_ENCRYPTION ? &self->cipherKeys : nullptr,
self->forceRecovery,
self->commitVersion,
self->commitVersion + 1,
@ -1111,6 +1145,22 @@ ACTOR Future<Void> applyMetadataToCommittedTransactions(CommitBatchContext* self
return Void();
}
void writeMutation(CommitBatchContext* self, int64_t tenantId, const MutationRef& mutation) {
static_assert(TenantInfo::INVALID_TENANT == ENCRYPT_INVALID_DOMAIN_ID);
if (!SERVER_KNOBS->ENABLE_TLOG_ENCRYPTION || tenantId == TenantInfo::INVALID_TENANT) {
// TODO(yiwu): In raw access mode, use tenant prefix to figure out tenant id for user data
bool isRawAccess = tenantId == TenantInfo::INVALID_TENANT && !isSystemKey(mutation.param1) &&
!(mutation.type == MutationRef::ClearRange && isSystemKey(mutation.param2)) &&
self->pProxyCommitData->db->get().client.tenantMode == TenantMode::REQUIRED;
TEST(isRawAccess); // Raw access to tenant key space
self->toCommit.writeTypedMessage(mutation);
} else {
Arena arena;
self->toCommit.writeTypedMessage(
EncryptedMutationMessage::encrypt(arena, self->cipherKeys, tenantId /*domainId*/, mutation));
}
}
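// Editor's note (not part of this diff): the static_assert above lets the transaction's
// tenant id double as the encryption domain id, so writeMutation() only has to branch on
// whether TLog encryption is enabled and whether the mutation belongs to a tenant; system
// and tenant-less mutations still go through writeTypedMessage() unencrypted.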
/// This second pass through committed transactions assigns the actual mutations to the appropriate storage servers'
/// tags
ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
@ -1127,6 +1177,7 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
state Optional<ClientTrCommitCostEstimation>* trCost = &trs[self->transactionNum].commitCostEstimation;
state int mutationNum = 0;
state VectorRef<MutationRef>* pMutations = &trs[self->transactionNum].transaction.mutations;
state int64_t tenantId = trs[self->transactionNum].tenantInfo.tenantId;
self->toCommit.addTransactionInfo(trs[self->transactionNum].spanContext);
@ -1184,7 +1235,7 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
if (pProxyCommitData->cacheInfo[m.param1]) {
self->toCommit.addTag(cacheTag);
}
self->toCommit.writeTypedMessage(m);
writeMutation(self, tenantId, m);
} else if (m.type == MutationRef::ClearRange) {
KeyRangeRef clearRange(KeyRangeRef(m.param1, m.param2));
auto ranges = pProxyCommitData->keyInfo.intersectingRanges(clearRange);
@ -1237,7 +1288,7 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
if (pProxyCommitData->needsCacheTag(clearRange)) {
self->toCommit.addTag(cacheTag);
}
self->toCommit.writeTypedMessage(m);
writeMutation(self, tenantId, m);
} else {
UNREACHABLE();
}
@ -2086,21 +2137,32 @@ ACTOR Future<Void> proxySnapCreate(ProxySnapRequest snapReq, ProxyCommitData* co
throw snap_log_anti_quorum_unsupported();
}
// send a snap request to DD
if (!commitData->db->get().distributor.present()) {
TraceEvent(SevWarnAlways, "DataDistributorNotPresent").detail("Operation", "SnapRequest");
throw dd_not_found();
}
state Future<ErrorOr<Void>> ddSnapReq = commitData->db->get().distributor.get().distributorSnapReq.tryGetReply(
DistributorSnapRequest(snapReq.snapPayload, snapReq.snapUID));
try {
wait(throwErrorOr(ddSnapReq));
} catch (Error& e) {
TraceEvent("SnapCommitProxy_DDSnapResponseError")
.errorUnsuppressed(e)
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
throw e;
state int snapReqRetry = 0;
state double snapRetryBackoff = FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY;
loop {
// send a snap request to DD
if (!commitData->db->get().distributor.present()) {
TraceEvent(SevWarnAlways, "DataDistributorNotPresent").detail("Operation", "SnapRequest");
throw dd_not_found();
}
try {
Future<ErrorOr<Void>> ddSnapReq =
commitData->db->get().distributor.get().distributorSnapReq.tryGetReply(
DistributorSnapRequest(snapReq.snapPayload, snapReq.snapUID));
wait(throwErrorOr(ddSnapReq));
break;
} catch (Error& e) {
TraceEvent("SnapCommitProxy_DDSnapResponseError")
.errorUnsuppressed(e)
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
// Retry if we have network issues
if (e.code() != error_code_request_maybe_delivered ||
++snapReqRetry > SERVER_KNOBS->SNAP_NETWORK_FAILURE_RETRY_LIMIT)
throw e;
wait(delay(snapRetryBackoff));
snapRetryBackoff = snapRetryBackoff * 2; // exponential backoff
}
}
snapReq.reply.send(Void());
} catch (Error& e) {
@ -2297,6 +2359,7 @@ ACTOR Future<Void> processCompleteTransactionStateRequest(TransactionStateResolv
Reference<ILogSystem>(),
mutations,
/* pToCommit= */ nullptr,
/* pCipherKeys= */ nullptr,
confChanges,
/* version= */ 0,
/* popVersion= */ 0,
@ -2388,7 +2451,8 @@ ACTOR Future<Void> commitProxyServerCore(CommitProxyInterface proxy,
// Wait until we can load the "real" logsystem, since we don't support switching them currently
while (!(masterLifetime.isEqual(commitData.db->get().masterLifetime) &&
commitData.db->get().recoveryState >= RecoveryState::RECOVERY_TRANSACTION)) {
commitData.db->get().recoveryState >= RecoveryState::RECOVERY_TRANSACTION &&
(!SERVER_KNOBS->ENABLE_TLOG_ENCRYPTION || commitData.db->get().encryptKeyProxy.present()))) {
//TraceEvent("ProxyInit2", proxy.id()).detail("LSEpoch", db->get().logSystemConfig.epoch).detail("Need", epoch);
wait(commitData.db->onChange());
}

View File

@ -876,14 +876,26 @@ Future<Void> sendSnapReq(RequestStream<Req> stream, Req req, Error e) {
return Void();
}
ACTOR template <class Req>
Future<ErrorOr<Void>> trySendSnapReq(RequestStream<Req> stream, Req req) {
ErrorOr<REPLY_TYPE(Req)> reply = wait(stream.tryGetReply(req));
if (reply.isError()) {
TraceEvent("SnapDataDistributor_ReqError")
.errorUnsuppressed(reply.getError())
.detail("Peer", stream.getEndpoint().getPrimaryAddress());
return ErrorOr<Void>(reply.getError());
ACTOR Future<ErrorOr<Void>> trySendSnapReq(RequestStream<WorkerSnapRequest> stream, WorkerSnapRequest req) {
state int snapReqRetry = 0;
state double snapRetryBackoff = FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY;
loop {
ErrorOr<REPLY_TYPE(WorkerSnapRequest)> reply = wait(stream.tryGetReply(req));
if (reply.isError()) {
TraceEvent("SnapDataDistributor_ReqError")
.errorUnsuppressed(reply.getError())
.detail("Peer", stream.getEndpoint().getPrimaryAddress());
if (reply.getError().code() != error_code_request_maybe_delivered ||
++snapReqRetry > SERVER_KNOBS->SNAP_NETWORK_FAILURE_RETRY_LIMIT)
return ErrorOr<Void>(reply.getError());
else {
// Retry network failures with the same snap UID to avoid taking the snapshot twice
req = WorkerSnapRequest(req.snapPayload, req.snapUID, req.role);
wait(delay(snapRetryBackoff));
snapRetryBackoff = snapRetryBackoff * 2;
}
} else
break;
}
return ErrorOr<Void>(Void());
}
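// Editor's sketch (not part of this diff): both retry loops added in this change follow the
// same bounded exponential-backoff pattern. The self-contained helper below shows that
// pattern with the standard library only; attempt(), maxRetries and initialDelaySec are
// hypothetical stand-ins for tryGetReply(), SNAP_NETWORK_FAILURE_RETRY_LIMIT and
// PREVENT_FAST_SPIN_DELAY.
#include <chrono>
#include <functional>
#include <thread>

bool retryWithBackoff(const std::function<bool()>& attempt, int maxRetries, double initialDelaySec) {
	double backoff = initialDelaySec;
	for (int retry = 0;; ++retry) {
		if (attempt())
			return true; // success: stop retrying
		if (retry >= maxRetries)
			return false; // give up once the retry limit is exhausted
		std::this_thread::sleep_for(std::chrono::duration<double>(backoff));
		backoff *= 2; // double the delay between consecutive attempts
	}
}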
@ -906,6 +918,124 @@ ACTOR static Future<Void> waitForMost(std::vector<Future<ErrorOr<Void>>> futures
return Void();
}
ACTOR Future<std::map<NetworkAddress, std::pair<WorkerInterface, std::string>>> getStatefulWorkers(
Database cx,
Reference<AsyncVar<ServerDBInfo> const> dbInfo,
std::vector<TLogInterface>* tlogs,
int* storageFaultTolerance) {
state std::map<NetworkAddress, std::pair<WorkerInterface, std::string>> result;
state std::map<NetworkAddress, WorkerInterface> workersMap;
state Transaction tr(cx);
state DatabaseConfiguration configuration;
loop {
try {
// necessary options
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
// get database configuration
DatabaseConfiguration _configuration = wait(getDatabaseConfiguration(&tr));
configuration = _configuration;
// get storage servers
RangeResult serverList = wait(tr.getRange(serverListKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT(!serverList.more && serverList.size() < CLIENT_KNOBS->TOO_MANY);
state std::vector<StorageServerInterface> storageServers;
storageServers.reserve(serverList.size());
for (int i = 0; i < serverList.size(); i++)
storageServers.push_back(decodeServerListValue(serverList[i].value));
// get workers
state std::vector<WorkerDetails> workers = wait(getWorkers(dbInfo));
for (const auto& worker : workers) {
workersMap[worker.interf.address()] = worker.interf;
}
Optional<Value> regionsValue =
wait(tr.get(LiteralStringRef("usable_regions").withPrefix(configKeysPrefix)));
int usableRegions = 1;
if (regionsValue.present()) {
usableRegions = atoi(regionsValue.get().toString().c_str());
}
auto masterDcId = dbInfo->get().master.locality.dcId();
int storageFailures = 0;
for (const auto& server : storageServers) {
TraceEvent(SevDebug, "StorageServerDcIdInfo")
.detail("Address", server.address().toString())
.detail("ServerLocalityID", server.locality.dcId())
.detail("MasterDcID", masterDcId);
if (usableRegions == 1 || server.locality.dcId() == masterDcId) {
auto itr = workersMap.find(server.address());
if (itr == workersMap.end()) {
TraceEvent(SevWarn, "GetStorageWorkers")
.detail("Reason", "Could not find worker for storage server")
.detail("SS", server.id());
++storageFailures;
} else {
if (result.count(server.address())) {
ASSERT(itr->second.id() == result[server.address()].first.id());
if (result[server.address()].second.find("storage") == std::string::npos)
result[server.address()].second.append(",storage");
} else {
result[server.address()] = std::make_pair(itr->second, "storage");
}
}
}
}
// calculate fault tolerance
*storageFaultTolerance = std::min(static_cast<int>(SERVER_KNOBS->MAX_STORAGE_SNAPSHOT_FAULT_TOLERANCE),
configuration.storageTeamSize - 1) -
storageFailures;
if (*storageFaultTolerance < 0) {
TEST(true); // Too many failed storage servers to complete snapshot
throw snap_storage_failed();
}
// tlogs
for (const auto& tlog : *tlogs) {
TraceEvent(SevDebug, "GetStatefulWorkersTlog").detail("Addr", tlog.address());
if (workersMap.find(tlog.address()) == workersMap.end()) {
TraceEvent(SevError, "MissingTlogWorkerInterface").detail("TlogAddress", tlog.address());
throw snap_tlog_failed();
}
if (result.count(tlog.address())) {
ASSERT(workersMap[tlog.address()].id() == result[tlog.address()].first.id());
result[tlog.address()].second.append(",tlog");
} else {
result[tlog.address()] = std::make_pair(workersMap[tlog.address()], "tlog");
}
}
// get coordinators
Optional<Value> coordinators = wait(tr.get(coordinatorsKey));
if (!coordinators.present()) {
throw operation_failed();
}
ClusterConnectionString ccs(coordinators.get().toString());
std::vector<NetworkAddress> coordinatorsAddr = wait(ccs.tryResolveHostnames());
std::set<NetworkAddress> coordinatorsAddrSet(coordinatorsAddr.begin(), coordinatorsAddr.end());
for (const auto& worker : workers) {
// Note: the secondary address is only considered for coordinators,
// as we use primary addresses from the storage and tlog interfaces above
NetworkAddress primary = worker.interf.address();
Optional<NetworkAddress> secondary = worker.interf.tLog.getEndpoint().addresses.secondaryAddress;
if (coordinatorsAddrSet.find(primary) != coordinatorsAddrSet.end() ||
(secondary.present() && (coordinatorsAddrSet.find(secondary.get()) != coordinatorsAddrSet.end()))) {
if (result.count(primary)) {
ASSERT(workersMap[primary].id() == result[primary].first.id());
result[primary].second.append(",coord");
} else {
result[primary] = std::make_pair(workersMap[primary], "coord");
}
}
}
return result;
} catch (Error& e) {
wait(tr.onError(e));
result.clear();
}
}
}
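// Editor's note (not part of this diff): getStatefulWorkers() folds every stateful role a
// worker hosts into one comma-separated string keyed by address ("storage", ",tlog",
// ",coord"), so each worker address appears once and ddSnapCreateCore() below sends one
// snapshot request per role to each address rather than one per interface.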
ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<AsyncVar<ServerDBInfo> const> db) {
state Database cx = openDBOnServer(db, TaskPriority::DefaultDelay, LockAware::True);
@ -942,47 +1072,44 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
TraceEvent("SnapDataDistributor_AfterDisableTLogPop")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
// snap local storage nodes
// TODO: Atomically read configuration and storage worker list in a single transaction
state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx));
std::pair<std::vector<WorkerInterface>, int> storageWorkersAndFailures =
wait(transformErrors(getStorageWorkers(cx, db, true /* localOnly */), snap_storage_failed()));
const auto& [storageWorkers, storageFailures] = storageWorkersAndFailures;
auto const storageFaultTolerance =
std::min(static_cast<int>(SERVER_KNOBS->MAX_STORAGE_SNAPSHOT_FAULT_TOLERANCE),
configuration.storageTeamSize - 1) -
storageFailures;
if (storageFaultTolerance < 0) {
TEST(true); // Too many failed storage servers to complete snapshot
throw snap_storage_failed();
}
TraceEvent("SnapDataDistributor_GotStorageWorkers")
state int storageFaultTolerance;
// snap stateful nodes
state std::map<NetworkAddress, std::pair<WorkerInterface, std::string>> statefulWorkers =
wait(transformErrors(getStatefulWorkers(cx, db, &tlogs, &storageFaultTolerance), snap_storage_failed()));
TraceEvent("SnapDataDistributor_GotStatefulWorkers")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
// we need to snapshot storage nodes before snapshotting any tlogs
std::vector<Future<ErrorOr<Void>>> storageSnapReqs;
storageSnapReqs.reserve(storageWorkers.size());
for (const auto& worker : storageWorkers) {
storageSnapReqs.push_back(trySendSnapReq(
worker.workerSnapReq, WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "storage"_sr)));
for (const auto& [addr, entry] : statefulWorkers) {
auto& [interf, role] = entry;
if (role.find("storage") != std::string::npos)
storageSnapReqs.push_back(trySendSnapReq(
interf.workerSnapReq, WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "storage"_sr)));
}
wait(waitForMost(storageSnapReqs, storageFaultTolerance, snap_storage_failed()));
TraceEvent("SnapDataDistributor_AfterSnapStorage")
.detail("FaultTolerance", storageFaultTolerance)
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
// snap local tlog nodes
std::vector<Future<Void>> tLogSnapReqs;
std::vector<Future<ErrorOr<Void>>> tLogSnapReqs;
tLogSnapReqs.reserve(tlogs.size());
for (const auto& tlog : tlogs) {
tLogSnapReqs.push_back(sendSnapReq(tlog.snapRequest,
TLogSnapRequest{ snapReq.snapPayload, snapReq.snapUID, "tlog"_sr },
snap_tlog_failed()));
for (const auto& [addr, entry] : statefulWorkers) {
auto& [interf, role] = entry;
if (role.find("tlog") != std::string::npos)
tLogSnapReqs.push_back(trySendSnapReq(
interf.workerSnapReq, WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "tlog"_sr)));
}
wait(waitForAll(tLogSnapReqs));
wait(waitForMost(tLogSnapReqs, 0, snap_tlog_failed()));
TraceEvent("SnapDataDistributor_AfterTLogStorage")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
// enable tlog pop on local tlog nodes
std::vector<Future<Void>> enablePops;
enablePops.reserve(tlogs.size());
@ -995,20 +1122,18 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
TraceEvent("SnapDataDistributor_AfterEnableTLogPops")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
// snap the coordinators
std::vector<WorkerInterface> coordWorkers = wait(getCoordWorkers(cx, db));
TraceEvent("SnapDataDistributor_GotCoordWorkers")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
std::vector<Future<ErrorOr<Void>>> coordSnapReqs;
coordSnapReqs.reserve(coordWorkers.size());
for (const auto& worker : coordWorkers) {
coordSnapReqs.push_back(trySendSnapReq(
worker.workerSnapReq, WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "coord"_sr)));
for (const auto& [addr, entry] : statefulWorkers) {
auto& [interf, role] = entry;
if (role.find("coord") != std::string::npos)
coordSnapReqs.push_back(trySendSnapReq(
interf.workerSnapReq, WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "coord"_sr)));
}
auto const coordFaultTolerance = std::min<int>(std::max<int>(0, coordSnapReqs.size() / 2 - 1),
SERVER_KNOBS->MAX_COORDINATOR_SNAPSHOT_FAULT_TOLERANCE);
wait(waitForMost(coordSnapReqs, coordFaultTolerance, snap_coord_failed()));
TraceEvent("SnapDataDistributor_AfterSnapCoords")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
@ -1056,37 +1181,48 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
return Void();
}
ACTOR Future<Void> ddSnapCreate(DistributorSnapRequest snapReq,
Reference<AsyncVar<ServerDBInfo> const> db,
DDEnabledState* ddEnabledState) {
ACTOR Future<Void> ddSnapCreate(
DistributorSnapRequest snapReq,
Reference<AsyncVar<ServerDBInfo> const> db,
DDEnabledState* ddEnabledState,
std::map<UID, DistributorSnapRequest>* ddSnapMap /* ongoing snapshot requests */,
std::map<UID, ErrorOr<Void>>*
ddSnapResultMap /* finished snapshot requests, expired in SNAP_MINIMUM_TIME_GAP seconds */) {
state Future<Void> dbInfoChange = db->onChange();
if (!ddEnabledState->setDDEnabled(false, snapReq.snapUID)) {
// Disable DD before doing snapCreate; if a previous snap request has already disabled DD,
// then this operation fails here
TraceEvent("SnapDDSetDDEnabledFailedInMemoryCheck").log();
snapReq.reply.sendError(operation_failed());
TraceEvent("SnapDDSetDDEnabledFailedInMemoryCheck").detail("SnapUID", snapReq.snapUID);
ddSnapMap->at(snapReq.snapUID).reply.sendError(operation_failed());
ddSnapMap->erase(snapReq.snapUID);
(*ddSnapResultMap)[snapReq.snapUID] = ErrorOr<Void>(operation_failed());
return Void();
}
double delayTime = g_network->isSimulated() ? 70.0 : SERVER_KNOBS->SNAP_CREATE_MAX_TIMEOUT;
try {
choose {
when(wait(dbInfoChange)) {
TraceEvent("SnapDDCreateDBInfoChanged")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
snapReq.reply.sendError(snap_with_recovery_unsupported());
ddSnapMap->at(snapReq.snapUID).reply.sendError(snap_with_recovery_unsupported());
ddSnapMap->erase(snapReq.snapUID);
(*ddSnapResultMap)[snapReq.snapUID] = ErrorOr<Void>(snap_with_recovery_unsupported());
}
when(wait(ddSnapCreateCore(snapReq, db))) {
TraceEvent("SnapDDCreateSuccess")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
snapReq.reply.send(Void());
ddSnapMap->at(snapReq.snapUID).reply.send(Void());
ddSnapMap->erase(snapReq.snapUID);
(*ddSnapResultMap)[snapReq.snapUID] = ErrorOr<Void>(Void());
}
when(wait(delay(delayTime))) {
when(wait(delay(SERVER_KNOBS->SNAP_CREATE_MAX_TIMEOUT))) {
TraceEvent("SnapDDCreateTimedOut")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
snapReq.reply.sendError(timed_out());
ddSnapMap->at(snapReq.snapUID).reply.sendError(timed_out());
ddSnapMap->erase(snapReq.snapUID);
(*ddSnapResultMap)[snapReq.snapUID] = ErrorOr<Void>(timed_out());
}
}
} catch (Error& e) {
@ -1095,7 +1231,9 @@ ACTOR Future<Void> ddSnapCreate(DistributorSnapRequest snapReq,
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
if (e.code() != error_code_operation_cancelled) {
snapReq.reply.sendError(e);
ddSnapMap->at(snapReq.snapUID).reply.sendError(e);
ddSnapMap->erase(snapReq.snapUID);
(*ddSnapResultMap)[snapReq.snapUID] = ErrorOr<Void>(e);
} else {
// enable DD should always succeed
bool success = ddEnabledState->setDDEnabled(true, snapReq.snapUID);
@ -1246,6 +1384,8 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
state Database cx = openDBOnServer(db, TaskPriority::DefaultDelay, LockAware::True);
state ActorCollection actors(false);
state DDEnabledState ddEnabledState;
state std::map<UID, DistributorSnapRequest> ddSnapReqMap;
state std::map<UID, ErrorOr<Void>> ddSnapReqResultMap;
self->addActor.send(actors.getResult());
self->addActor.send(traceRole(Role::DATA_DISTRIBUTOR, di.id()));
@ -1273,7 +1413,30 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
actors.add(ddGetMetrics(req, getShardMetricsList));
}
when(DistributorSnapRequest snapReq = waitNext(di.distributorSnapReq.getFuture())) {
actors.add(ddSnapCreate(snapReq, db, &ddEnabledState));
auto& snapUID = snapReq.snapUID;
if (ddSnapReqResultMap.count(snapUID)) {
TEST(true); // Data distributor received a duplicate finished snap request
auto result = ddSnapReqResultMap[snapUID];
result.isError() ? snapReq.reply.sendError(result.getError()) : snapReq.reply.send(result.get());
TraceEvent("RetryFinishedDistributorSnapRequest")
.detail("SnapUID", snapUID)
.detail("Result", result.isError() ? result.getError().code() : 0);
} else if (ddSnapReqMap.count(snapReq.snapUID)) {
TEST(true); // Data distributor received a duplicate ongoing snap request
TraceEvent("RetryOngoingDistributorSnapRequest").detail("SnapUID", snapUID);
ASSERT(snapReq.snapPayload == ddSnapReqMap[snapUID].snapPayload);
ddSnapReqMap[snapUID] = snapReq;
} else {
ddSnapReqMap[snapUID] = snapReq;
actors.add(ddSnapCreate(snapReq, db, &ddEnabledState, &ddSnapReqMap, &ddSnapReqResultMap));
auto* ddSnapReqResultMapPtr = &ddSnapReqResultMap;
actors.add(fmap(
[ddSnapReqResultMapPtr, snapUID](Void _) {
ddSnapReqResultMapPtr->erase(snapUID);
return Void();
},
delay(SERVER_KNOBS->SNAP_MINIMUM_TIME_GAP)));
}
}
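// Editor's note (not part of this diff): ddSnapReqMap tracks in-flight snapshot requests so
// a retried request with the same UID only swaps in the new reply promise, while
// ddSnapReqResultMap caches finished results for SNAP_MINIMUM_TIME_GAP seconds so late
// retries are answered from the cache instead of triggering a second snapshot.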
when(DistributorExclusionSafetyCheckRequest exclCheckReq =
waitNext(di.distributorExclCheckReq.getFuture())) {

View File

@ -43,7 +43,7 @@ BandwidthStatus getBandwidthStatus(StorageMetrics const& metrics) {
}
ReadBandwidthStatus getReadBandwidthStatus(StorageMetrics const& metrics) {
if (metrics.bytesReadPerKSecond <= SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS ||
if (metrics.bytesReadPerKSecond <= SERVER_KNOBS->SHARD_READ_HOT_BANDWIDTH_MIN_PER_KSECONDS ||
metrics.bytesReadPerKSecond <= SERVER_KNOBS->SHARD_MAX_READ_DENSITY_RATIO * metrics.bytes *
SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS) {
return ReadBandwidthStatusNormal;
@ -238,7 +238,7 @@ ACTOR Future<Void> trackShardMetrics(DataDistributionTracker::SafeAccessor self,
std::max((int64_t)(SERVER_KNOBS->SHARD_MAX_READ_DENSITY_RATIO * bytes *
SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS *
(1.0 + SERVER_KNOBS->SHARD_MAX_BYTES_READ_PER_KSEC_JITTER)),
SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS);
SERVER_KNOBS->SHARD_READ_HOT_BANDWIDTH_MIN_PER_KSECONDS);
bounds.min.bytesReadPerKSecond = 0;
bounds.permittedError.bytesReadPerKSecond = bounds.min.bytesReadPerKSecond / 4;
} else if (readBandwidthStatus == ReadBandwidthStatusHigh) {
@ -291,7 +291,7 @@ ACTOR Future<Void> trackShardMetrics(DataDistributionTracker::SafeAccessor self,
.detail("Keys", keys)
.detail("UpdatedSize", metrics.metrics.bytes)
.detail("Bandwidth", metrics.metrics.bytesPerKSecond)
.detail("BandwithStatus", getBandwidthStatus(metrics))
.detail("BandwidthStatus", getBandwidthStatus(metrics))
.detail("BytesLower", bounds.min.bytes)
.detail("BytesUpper", bounds.max.bytes)
.detail("BandwidthLower", bounds.min.bytesPerKSecond)
@ -380,7 +380,7 @@ ACTOR Future<Standalone<VectorRef<KeyRef>>> getSplitKeys(DataDistributionTracker
state Transaction tr(self->cx);
try {
Standalone<VectorRef<KeyRef>> keys =
wait(self->cx->splitStorageMetrics(splitRange, splitMetrics, estimated));
wait(self->cx->splitStorageMetrics(splitRange, splitMetrics, estimated, SERVER_KNOBS->MIN_SHARD_BYTES));
return keys;
} catch (Error& e) {
wait(tr.onError(e));

View File

@ -42,6 +42,7 @@
#include "flow/genericactors.actor.h"
#include "flow/network.h"
#include <boost/functional/hash.hpp>
#include <boost/mpl/not.hpp>
#include <limits>
#include <string>

View File

@ -426,14 +426,12 @@ ACTOR Future<int> execHelper(ExecCmdValueString* execArg, UID snapUID, std::stri
} else {
// copy the files
state std::string folderFrom = folder + "/.";
state std::string folderTo = folder + "-snap-" + uidStr.toString();
double maxSimDelayTime = 10.0;
folderTo = folder + "-snap-" + uidStr.toString() + "-" + role;
state std::string folderTo = folder + "-snap-" + uidStr.toString() + "-" + role;
std::vector<std::string> paramList;
std::string mkdirBin = "/bin/mkdir";
paramList.push_back(mkdirBin);
paramList.push_back(folderTo);
cmdErr = spawnProcess(mkdirBin, paramList, maxWaitTime, false /*isSync*/, maxSimDelayTime);
cmdErr = spawnProcess(mkdirBin, paramList, maxWaitTime, false /*isSync*/, 10.0);
wait(success(cmdErr));
err = cmdErr.get();
if (err == 0) {

View File

@ -1,5 +1,5 @@
/*
* GetCipherKeys.actor.cpp
* GetEncryptCipherKeys.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
@ -38,7 +38,7 @@ ACTOR Future<Void> onEncryptKeyProxyChange(Reference<AsyncVar<ServerDBInfo> cons
break;
}
}
TraceEvent("GetCipherKeys_EncryptKeyProxyChanged")
TraceEvent("GetEncryptCipherKeys_EncryptKeyProxyChanged")
.detail("PreviousProxyId", previousProxyId.orDefault(UID()))
.detail("CurrentProxyId", currentProxyId.orDefault(UID()));
return Void();
@ -50,19 +50,19 @@ ACTOR Future<EKPGetLatestBaseCipherKeysReply> getUncachedLatestEncryptCipherKeys
Optional<EncryptKeyProxyInterface> proxy = db->get().encryptKeyProxy;
if (!proxy.present()) {
// Wait for onEncryptKeyProxyChange.
TraceEvent("GetLatestCipherKeys_EncryptKeyProxyNotPresent");
TraceEvent("GetLatestEncryptCipherKeys_EncryptKeyProxyNotPresent");
return Never();
}
request.reply.reset();
try {
EKPGetLatestBaseCipherKeysReply reply = wait(proxy.get().getLatestBaseCipherKeys.getReply(request));
if (reply.error.present()) {
TraceEvent(SevWarn, "GetLatestCipherKeys_RequestFailed").error(reply.error.get());
TraceEvent(SevWarn, "GetLatestEncryptCipherKeys_RequestFailed").error(reply.error.get());
throw encrypt_keys_fetch_failed();
}
return reply;
} catch (Error& e) {
TraceEvent("GetLatestCipherKeys_CaughtError").error(e);
TraceEvent("GetLatestEncryptCipherKeys_CaughtError").error(e);
if (e.code() == error_code_broken_promise) {
// Wait for onEncryptKeyProxyChange.
return Never();
@ -81,7 +81,7 @@ ACTOR Future<std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>
state EKPGetLatestBaseCipherKeysRequest request;
if (!db.isValid()) {
TraceEvent(SevError, "GetLatestCipherKeys_ServerDBInfoNotAvailable");
TraceEvent(SevError, "GetLatestEncryptCipherKeys_ServerDBInfoNotAvailable");
throw encrypt_ops_error();
}
@ -114,7 +114,7 @@ ACTOR Future<std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>
// Check for any missing cipher keys.
for (auto& domain : request.encryptDomainInfos) {
if (cipherKeys.count(domain.domainId) == 0) {
TraceEvent(SevWarn, "GetLatestCipherKeys_KeyMissing").detail("DomainId", domain.domainId);
TraceEvent(SevWarn, "GetLatestEncryptCipherKeys_KeyMissing").detail("DomainId", domain.domainId);
throw encrypt_key_not_found();
}
}
@ -133,19 +133,19 @@ ACTOR Future<EKPGetBaseCipherKeysByIdsReply> getUncachedEncryptCipherKeys(Refere
Optional<EncryptKeyProxyInterface> proxy = db->get().encryptKeyProxy;
if (!proxy.present()) {
// Wait for onEncryptKeyProxyChange.
TraceEvent("GetCipherKeys_EncryptKeyProxyNotPresent");
TraceEvent("GetEncryptCipherKeys_EncryptKeyProxyNotPresent");
return Never();
}
request.reply.reset();
try {
EKPGetBaseCipherKeysByIdsReply reply = wait(proxy.get().getBaseCipherKeysByIds.getReply(request));
if (reply.error.present()) {
TraceEvent(SevWarn, "GetCipherKeys_RequestFailed").error(reply.error.get());
TraceEvent(SevWarn, "GetEncryptCipherKeys_RequestFailed").error(reply.error.get());
throw encrypt_keys_fetch_failed();
}
return reply;
} catch (Error& e) {
TraceEvent("GetCipherKeys_CaughtError").error(e);
TraceEvent("GetEncryptCipherKeys_CaughtError").error(e);
if (e.code() == error_code_broken_promise) {
// Wait for onEncryptKeyProxyChange.
return Never();
@ -167,7 +167,7 @@ ACTOR Future<std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>>> ge
state EKPGetBaseCipherKeysByIdsRequest request;
if (!db.isValid()) {
TraceEvent(SevError, "GetCipherKeys_ServerDBInfoNotAvailable");
TraceEvent(SevError, "GetEncryptCipherKeys_ServerDBInfoNotAvailable");
throw encrypt_ops_error();
}
@ -204,7 +204,7 @@ ACTOR Future<std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>>> ge
BaseCipherIndex baseIdx = std::make_pair(details.encryptDomainId, details.baseCipherId);
const auto& itr = baseCipherKeys.find(baseIdx);
if (itr == baseCipherKeys.end()) {
TraceEvent(SevError, "GetCipherKeys_KeyMissing")
TraceEvent(SevError, "GetEncryptCipherKeys_KeyMissing")
.detail("DomainId", details.encryptDomainId)
.detail("BaseCipherId", details.baseCipherId);
throw encrypt_key_not_found();

View File

@ -0,0 +1,533 @@
/*
* GlobalTagThrottler.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/FDBTypes.h"
#include "fdbclient/TagThrottle.actor.h"
#include "fdbrpc/Smoother.h"
#include "fdbserver/TagThrottler.h"
#include <limits>
#include "flow/actorcompiler.h" // must be last include
class GlobalTagThrottlerImpl {
class QuotaAndCounters {
Optional<ThrottleApi::TagQuotaValue> quota;
std::unordered_map<UID, double> ssToReadCostRate;
std::unordered_map<UID, double> ssToWriteCostRate;
Smoother totalReadCostRate;
Smoother totalWriteCostRate;
Smoother transactionCounter;
Smoother perClientRate;
Optional<double> getReadTPSLimit() const {
if (totalReadCostRate.smoothTotal() > 0) {
return quota.get().totalReadQuota * transactionCounter.smoothRate() / totalReadCostRate.smoothTotal();
} else {
return {};
}
}
Optional<double> getWriteTPSLimit() const {
if (totalWriteCostRate.smoothTotal() > 0) {
return quota.get().totalWriteQuota * transactionCounter.smoothRate() / totalWriteCostRate.smoothTotal();
} else {
return {};
}
}
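// Editor's note (worked example, not part of this diff): with totalReadQuota = 100 and
// clients whose transactions each cost 6 read units, totalReadCostRate settles at 6x the
// transaction rate, so the read TPS limit converges to 100 * rate / (6 * rate) = 100 / 6
// ~= 16.7 TPS -- the target the "Simple" and "MultiClientThrottling" test cases below
// wait for.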
public:
QuotaAndCounters()
: totalReadCostRate(SERVER_KNOBS->GLOBAL_TAG_THROTTLING_FOLDING_TIME),
totalWriteCostRate(SERVER_KNOBS->GLOBAL_TAG_THROTTLING_FOLDING_TIME),
transactionCounter(SERVER_KNOBS->GLOBAL_TAG_THROTTLING_FOLDING_TIME),
perClientRate(SERVER_KNOBS->GLOBAL_TAG_THROTTLING_FOLDING_TIME) {}
void setQuota(ThrottleApi::TagQuotaValue const& quota) { this->quota = quota; }
void updateReadCostRate(UID ssId, double newReadCostRate) {
auto& currentReadCostRate = ssToReadCostRate[ssId];
auto diff = newReadCostRate - currentReadCostRate;
currentReadCostRate += diff;
totalReadCostRate.addDelta(diff);
}
void updateWriteCostRate(UID ssId, double newWriteCostRate) {
auto& currentWriteCostRate = ssToWriteCostRate[ssId];
auto diff = newWriteCostRate - currentWriteCostRate;
currentWriteCostRate += diff;
totalWriteCostRate.addDelta(diff);
}
void addTransactions(int count) { transactionCounter.addDelta(count); }
Optional<double> getTargetTotalTPSLimit() const {
if (!quota.present())
return {};
auto readLimit = getReadTPSLimit();
auto writeLimit = getWriteTPSLimit();
// TODO: Implement expiration logic
if (!readLimit.present() && !writeLimit.present()) {
return {};
} else {
if (!readLimit.present()) {
return writeLimit.get();
} else if (!writeLimit.present()) {
return readLimit.get();
} else {
return std::min(readLimit.get(), writeLimit.get());
}
}
}
Optional<ClientTagThrottleLimits> updateAndGetPerClientLimit() {
auto targetRate = getTargetTotalTPSLimit();
if (targetRate.present() && transactionCounter.smoothRate() > 0) {
auto newPerClientRate = std::max(
SERVER_KNOBS->GLOBAL_TAG_THROTTLING_MIN_RATE,
std::min(targetRate.get(),
(targetRate.get() / transactionCounter.smoothRate()) * perClientRate.smoothTotal()));
perClientRate.setTotal(newPerClientRate);
return ClientTagThrottleLimits(perClientRate.getTotal(), ClientTagThrottleLimits::NO_EXPIRATION);
} else {
return {};
}
}
void processTraceEvent(TraceEvent& te) const {
if (quota.present()) {
te.detail("ProvidedReadTPSLimit", getReadTPSLimit())
.detail("ProvidedWriteTPSLimit", getWriteTPSLimit())
.detail("ReadCostRate", totalReadCostRate.smoothTotal())
.detail("WriteCostRate", totalWriteCostRate.smoothTotal())
.detail("TotalReadQuota", quota.get().totalReadQuota)
.detail("ReservedReadQuota", quota.get().reservedReadQuota)
.detail("TotalWriteQuota", quota.get().totalWriteQuota)
.detail("ReservedWriteQuota", quota.get().reservedWriteQuota);
}
}
};
Database db;
UID id;
std::map<TransactionTag, QuotaAndCounters> trackedTags;
uint64_t throttledTagChangeId{ 0 };
Future<Void> traceActor;
ACTOR static Future<Void> tracer(GlobalTagThrottlerImpl const* self) {
loop {
for (const auto& [tag, quotaAndCounters] : self->trackedTags) {
TraceEvent te("GlobalTagThrottling");
te.detail("Tag", tag);
quotaAndCounters.processTraceEvent(te);
}
wait(delay(SERVER_KNOBS->GLOBAL_TAG_THROTTLING_TRACE_INTERVAL));
}
}
ACTOR static Future<Void> monitorThrottlingChanges(GlobalTagThrottlerImpl* self) {
loop {
state ReadYourWritesTransaction tr(self->db);
loop {
// TODO: Clean up quotas that have been removed
try {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
state RangeResult currentQuotas = wait(tr.getRange(tagQuotaKeys, CLIENT_KNOBS->TOO_MANY));
TraceEvent("GlobalTagThrottler_ReadCurrentQuotas").detail("Size", currentQuotas.size());
for (auto const kv : currentQuotas) {
auto const tag = kv.key.removePrefix(tagQuotaPrefix);
auto const quota = ThrottleApi::TagQuotaValue::fromValue(kv.value);
self->trackedTags[tag].setQuota(quota);
}
++self->throttledTagChangeId;
// FIXME: Should wait on watch instead
// wait(tr.watch(tagThrottleSignalKey));
wait(delay(5.0));
TraceEvent("GlobalTagThrottler_ChangeSignaled");
TEST(true); // Global tag throttler detected quota changes
break;
} catch (Error& e) {
TraceEvent("GlobalTagThrottlerMonitoringChangesError", self->id).error(e);
wait(tr.onError(e));
}
}
}
}
public:
GlobalTagThrottlerImpl(Database db, UID id) : db(db), id(id) { traceActor = tracer(this); }
Future<Void> monitorThrottlingChanges() { return monitorThrottlingChanges(this); }
void addRequests(TransactionTag tag, int count) { trackedTags[tag].addTransactions(count); }
uint64_t getThrottledTagChangeId() const { return throttledTagChangeId; }
PrioritizedTransactionTagMap<ClientTagThrottleLimits> getClientRates() {
// TODO: For now, only enforce total throttling rates.
// We should use reserved quotas as well.
PrioritizedTransactionTagMap<ClientTagThrottleLimits> result;
for (auto& [tag, quotaAndCounters] : trackedTags) {
// Currently there is no differentiation between batch priority and default priority transactions
auto const limit = quotaAndCounters.updateAndGetPerClientLimit();
if (limit.present()) {
result[TransactionPriority::BATCH][tag] = result[TransactionPriority::DEFAULT][tag] = limit.get();
}
}
return result;
}
int64_t autoThrottleCount() const { return trackedTags.size(); }
uint32_t busyReadTagCount() const {
// TODO: Implement
return 0;
}
uint32_t busyWriteTagCount() const {
// TODO: Implement
return 0;
}
int64_t manualThrottleCount() const { return trackedTags.size(); }
Future<Void> tryUpdateAutoThrottling(StorageQueueInfo const& ss) {
for (const auto& busyReadTag : ss.busiestReadTags) {
trackedTags[busyReadTag.tag].updateReadCostRate(ss.id, busyReadTag.rate);
}
for (const auto& busyWriteTag : ss.busiestWriteTags) {
trackedTags[busyWriteTag.tag].updateWriteCostRate(ss.id, busyWriteTag.rate);
}
// TODO: Call ThrottleApi::throttleTags
return Void();
}
void setQuota(TransactionTagRef tag, ThrottleApi::TagQuotaValue const& tagQuotaValue) {
trackedTags[tag].setQuota(tagQuotaValue);
}
};
GlobalTagThrottler::GlobalTagThrottler(Database db, UID id) : impl(PImpl<GlobalTagThrottlerImpl>::create(db, id)) {}
GlobalTagThrottler::~GlobalTagThrottler() = default;
Future<Void> GlobalTagThrottler::monitorThrottlingChanges() {
return impl->monitorThrottlingChanges();
}
void GlobalTagThrottler::addRequests(TransactionTag tag, int count) {
return impl->addRequests(tag, count);
}
uint64_t GlobalTagThrottler::getThrottledTagChangeId() const {
return impl->getThrottledTagChangeId();
}
PrioritizedTransactionTagMap<ClientTagThrottleLimits> GlobalTagThrottler::getClientRates() {
return impl->getClientRates();
}
int64_t GlobalTagThrottler::autoThrottleCount() const {
return impl->autoThrottleCount();
}
uint32_t GlobalTagThrottler::busyReadTagCount() const {
return impl->busyReadTagCount();
}
uint32_t GlobalTagThrottler::busyWriteTagCount() const {
return impl->busyWriteTagCount();
}
int64_t GlobalTagThrottler::manualThrottleCount() const {
return impl->manualThrottleCount();
}
bool GlobalTagThrottler::isAutoThrottlingEnabled() const {
return true;
}
Future<Void> GlobalTagThrottler::tryUpdateAutoThrottling(StorageQueueInfo const& ss) {
return impl->tryUpdateAutoThrottling(ss);
}
void GlobalTagThrottler::setQuota(TransactionTagRef tag, ThrottleApi::TagQuotaValue const& tagQuotaValue) {
return impl->setQuota(tag, tagQuotaValue);
}
namespace GlobalTagThrottlerTesting {
Optional<double> getTPSLimit(GlobalTagThrottler& globalTagThrottler, TransactionTag tag) {
auto clientRates = globalTagThrottler.getClientRates();
auto it1 = clientRates.find(TransactionPriority::DEFAULT);
if (it1 != clientRates.end()) {
auto it2 = it1->second.find(tag);
if (it2 != it1->second.end()) {
return it2->second.tpsRate;
}
}
return {};
}
class StorageServerCollection {
class Cost {
Smoother smoother;
public:
Cost() : smoother(5.0) {}
Cost& operator+=(double delta) {
smoother.addDelta(delta);
return *this;
}
double smoothRate() const { return smoother.smoothRate(); }
};
std::vector<std::map<TransactionTag, Cost>> readCosts;
std::vector<std::map<TransactionTag, Cost>> writeCosts;
public:
StorageServerCollection(size_t size) : readCosts(size), writeCosts(size) { ASSERT_GT(size, 0); }
void addReadCost(TransactionTag tag, double cost) {
auto const costPerSS = cost / readCosts.size();
for (auto& readCost : readCosts) {
readCost[tag] += costPerSS;
}
}
void addWriteCost(TransactionTag tag, double cost) {
auto const costPerSS = cost / writeCosts.size();
for (auto& writeCost : writeCosts) {
writeCost[tag] += costPerSS;
}
}
std::vector<StorageQueueInfo> getStorageQueueInfos() const {
std::vector<StorageQueueInfo> result;
result.reserve(readCosts.size());
for (int i = 0; i < readCosts.size(); ++i) {
StorageQueueInfo sqInfo(UID(i, i), LocalityData{});
for (const auto& [tag, readCost] : readCosts[i]) {
double fractionalBusyness{ 0.0 }; // unused for global tag throttling
sqInfo.busiestReadTags.emplace_back(tag, readCost.smoothRate(), fractionalBusyness);
}
for (const auto& [tag, writeCost] : writeCosts[i]) {
double fractionalBusyness{ 0.0 }; // unused for global tag throttling
sqInfo.busiestWriteTags.emplace_back(tag, writeCost.smoothRate(), fractionalBusyness);
}
result.push_back(sqInfo);
}
return result;
}
};
ACTOR static Future<Void> runClient(GlobalTagThrottler* globalTagThrottler,
StorageServerCollection* storageServers,
TransactionTag tag,
double desiredTpsRate,
double costPerTransaction,
bool write) {
loop {
auto tpsLimit = getTPSLimit(*globalTagThrottler, tag);
state double tpsRate = tpsLimit.present() ? std::min<double>(desiredTpsRate, tpsLimit.get()) : desiredTpsRate;
wait(delay(1 / tpsRate));
if (write) {
storageServers->addWriteCost(tag, costPerTransaction);
} else {
storageServers->addReadCost(tag, costPerTransaction);
}
globalTagThrottler->addRequests(tag, 1);
}
}
ACTOR static Future<Void> monitorClientRates(GlobalTagThrottler* globalTagThrottler,
TransactionTag tag,
double desiredTPSLimit) {
state int successes = 0;
loop {
wait(delay(1.0));
auto currentTPSLimit = getTPSLimit(*globalTagThrottler, tag);
if (currentTPSLimit.present()) {
TraceEvent("GlobalTagThrottling_RateMonitor")
.detail("Tag", tag)
.detail("CurrentTPSRate", currentTPSLimit.get())
.detail("DesiredTPSRate", desiredTPSLimit);
if (abs(currentTPSLimit.get() - desiredTPSLimit) < 1.0) {
if (++successes == 3) {
return Void();
}
} else {
successes = 0;
}
} else {
successes = 0;
}
}
}
ACTOR static Future<Void> updateGlobalTagThrottler(GlobalTagThrottler* globalTagThrottler,
StorageServerCollection const* storageServers) {
loop {
wait(delay(1.0));
auto const storageQueueInfos = storageServers->getStorageQueueInfos();
for (const auto& sq : storageQueueInfos) {
globalTagThrottler->tryUpdateAutoThrottling(sq);
}
}
}
} // namespace GlobalTagThrottlerTesting
TEST_CASE("/GlobalTagThrottler/Simple") {
state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10);
ThrottleApi::TagQuotaValue tagQuotaValue;
TransactionTag testTag = "sampleTag1"_sr;
tagQuotaValue.totalReadQuota = 100.0;
globalTagThrottler.setQuota(testTag, tagQuotaValue);
state Future<Void> client =
GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag, 5.0, 6.0, false);
state Future<Void> monitor =
GlobalTagThrottlerTesting::monitorClientRates(&globalTagThrottler, testTag, 100.0 / 6.0);
state Future<Void> updater =
GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
wait(timeoutError(monitor || client || updater, 300.0));
return Void();
}
TEST_CASE("/GlobalTagThrottler/WriteThrottling") {
state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10);
ThrottleApi::TagQuotaValue tagQuotaValue;
TransactionTag testTag = "sampleTag1"_sr;
tagQuotaValue.totalWriteQuota = 100.0;
globalTagThrottler.setQuota(testTag, tagQuotaValue);
state Future<Void> client =
GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag, 5.0, 6.0, true);
state Future<Void> monitor =
GlobalTagThrottlerTesting::monitorClientRates(&globalTagThrottler, testTag, 100.0 / 6.0);
state Future<Void> updater =
GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
wait(timeoutError(monitor || client || updater, 300.0));
return Void();
}
TEST_CASE("/GlobalTagThrottler/MultiTagThrottling") {
state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10);
ThrottleApi::TagQuotaValue tagQuotaValue;
TransactionTag testTag1 = "sampleTag1"_sr;
TransactionTag testTag2 = "sampleTag2"_sr;
tagQuotaValue.totalReadQuota = 100.0;
globalTagThrottler.setQuota(testTag1, tagQuotaValue);
globalTagThrottler.setQuota(testTag2, tagQuotaValue);
state std::vector<Future<Void>> futures;
state std::vector<Future<Void>> monitorFutures;
futures.push_back(
GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag1, 5.0, 6.0, false));
futures.push_back(
GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag2, 5.0, 6.0, false));
futures.push_back(GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers));
monitorFutures.push_back(GlobalTagThrottlerTesting::monitorClientRates(&globalTagThrottler, testTag1, 100.0 / 6.0));
monitorFutures.push_back(GlobalTagThrottlerTesting::monitorClientRates(&globalTagThrottler, testTag2, 100.0 / 6.0));
wait(timeoutError(waitForAny(futures) || waitForAll(monitorFutures), 300.0));
return Void();
}
TEST_CASE("/GlobalTagThrottler/ActiveThrottling") {
state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10);
ThrottleApi::TagQuotaValue tagQuotaValue;
TransactionTag testTag = "sampleTag1"_sr;
tagQuotaValue.totalReadQuota = 100.0;
globalTagThrottler.setQuota(testTag, tagQuotaValue);
state Future<Void> client =
GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag, 20.0, 10.0, false);
state Future<Void> monitor = GlobalTagThrottlerTesting::monitorClientRates(&globalTagThrottler, testTag, 10.0);
state Future<Void> updater =
GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
wait(timeoutError(monitor || client || updater, 300.0));
return Void();
}
TEST_CASE("/GlobalTagThrottler/MultiClientThrottling") {
state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10);
ThrottleApi::TagQuotaValue tagQuotaValue;
TransactionTag testTag = "sampleTag1"_sr;
tagQuotaValue.totalReadQuota = 100.0;
globalTagThrottler.setQuota(testTag, tagQuotaValue);
state Future<Void> client =
GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag, 5.0, 6.0, false);
state Future<Void> client2 =
GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag, 5.0, 6.0, false);
state Future<Void> monitor =
GlobalTagThrottlerTesting::monitorClientRates(&globalTagThrottler, testTag, 100.0 / 6.0);
state Future<Void> updater =
GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
wait(timeoutError(monitor || client || updater, 300.0));
return Void();
}
TEST_CASE("/GlobalTagThrottler/MultiClientActiveThrottling") {
state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10);
ThrottleApi::TagQuotaValue tagQuotaValue;
TransactionTag testTag = "sampleTag1"_sr;
tagQuotaValue.totalReadQuota = 100.0;
globalTagThrottler.setQuota(testTag, tagQuotaValue);
state Future<Void> client =
GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag, 20.0, 10.0, false);
state Future<Void> client2 =
GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag, 20.0, 10.0, false);
state Future<Void> monitor = GlobalTagThrottlerTesting::monitorClientRates(&globalTagThrottler, testTag, 5.0);
state Future<Void> updater =
GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
wait(timeoutError(monitor || client || updater, 300.0));
return Void();
}
// Global transaction rate should be 20.0, with a distribution of (5, 15) between the 2 clients
TEST_CASE("/GlobalTagThrottler/SkewedMultiClientActiveThrottling") {
state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10);
ThrottleApi::TagQuotaValue tagQuotaValue;
TransactionTag testTag = "sampleTag1"_sr;
tagQuotaValue.totalReadQuota = 100.0;
globalTagThrottler.setQuota(testTag, tagQuotaValue);
state Future<Void> client =
GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag, 5.0, 5.0, false);
state Future<Void> client2 =
GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag, 25.0, 5.0, false);
state Future<Void> monitor = GlobalTagThrottlerTesting::monitorClientRates(&globalTagThrottler, testTag, 15.0);
state Future<Void> updater =
GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
wait(timeoutError(monitor || client || updater, 300.0));
return Void();
}
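// Illustrative note (not part of the original change): a hedged reading of the skew above,
// assuming the runClient arguments are (desired tps, pages per transaction) and the quota is
// expressed in pages/sec:
//
//   double quota = 100.0, pagesPerTx = 5.0;
//   double totalTps = quota / pagesPerTx;        // 20 tps available for the tag overall
//   double client1Tps = std::min(5.0, totalTps); // the first client only asks for 5 tps
//   double client2Tps = totalTps - client1Tps;   // the remaining 15 tps flows to the second client
//   // monitorClientRates(..., 15.0) above checks the rate granted to the saturated client.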
// Test that the tag throttler can reach equilibrium, then adjust to a new equilibrium once the quota is changed
TEST_CASE("/GlobalTagThrottler/UpdateQuota") {
state GlobalTagThrottler globalTagThrottler(Database{}, UID{});
state GlobalTagThrottlerTesting::StorageServerCollection storageServers(10);
state ThrottleApi::TagQuotaValue tagQuotaValue;
state TransactionTag testTag = "sampleTag1"_sr;
tagQuotaValue.totalReadQuota = 100.0;
globalTagThrottler.setQuota(testTag, tagQuotaValue);
state Future<Void> client =
GlobalTagThrottlerTesting::runClient(&globalTagThrottler, &storageServers, testTag, 5.0, 6.0, false);
state Future<Void> monitor =
GlobalTagThrottlerTesting::monitorClientRates(&globalTagThrottler, testTag, 100.0 / 6.0);
state Future<Void> updater =
GlobalTagThrottlerTesting::updateGlobalTagThrottler(&globalTagThrottler, &storageServers);
wait(timeoutError(monitor || client || updater, 300.0));
tagQuotaValue.totalReadQuota = 50.0;
globalTagThrottler.setQuota(testTag, tagQuotaValue);
monitor = GlobalTagThrottlerTesting::monitorClientRates(&globalTagThrottler, testTag, 50.0 / 6.0);
wait(timeoutError(monitor || client || updater, 300.0));
return Void();
}

View File

@ -668,7 +668,7 @@ public:
TraceEvent("RocksDB").detail("Info", "DBDestroyed");
}
rocksdb::DB* getDb() { return db; }
rocksdb::DB* getDb() const { return db; }
std::unordered_map<std::string, std::shared_ptr<PhysicalShard>>* getAllShards() { return &physicalShards; }
@ -2092,11 +2092,13 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
}
StorageBytes getStorageBytes() const override {
uint64_t total_live = 0;
int64_t total_free = 0;
int64_t total_space = 0;
uint64_t live = 0;
ASSERT(shardManager.getDb()->GetAggregatedIntProperty(rocksdb::DB::Properties::kLiveSstFilesSize, &live));
return StorageBytes(total_free, total_space, total_live, total_free);
int64_t free;
int64_t total;
g_network->getDiskBytes(path, free, total);
return StorageBytes(free, total, live, free);
}
std::vector<std::string> removeRange(KeyRangeRef range) override { return shardManager.removeRange(range); }
@ -2118,7 +2120,6 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
ShardManager shardManager;
std::shared_ptr<RocksDBMetrics> rocksDBMetrics;
std::string path;
const std::string dataPath;
UID id;
Reference<IThreadPool> writeThread;
Reference<IThreadPool> readThreads;

View File

@ -21,6 +21,7 @@
#include <algorithm>
#include <vector>
#include "fdbclient/FDBTypes.h"
#include "fdbserver/EncryptedMutationMessage.h"
#include "fdbserver/MutationTracking.h"
#include "fdbserver/LogProtocolMessage.h"
#include "fdbserver/SpanContextMessage.h"
@ -102,6 +103,8 @@ TraceEvent debugTagsAndMessageEnabled(const char* context, Version version, Stri
BinaryReader br(mutationData, AssumeVersion(rdr.protocolVersion()));
OTELSpanContextMessage scm;
br >> scm;
} else if (EncryptedMutationMessage::startsEncryptedMutationMessage(mutationType)) {
throw encrypt_unsupported();
} else {
MutationRef m;
BinaryReader br(mutationData, AssumeVersion(rdr.protocolVersion()));

View File

@ -227,11 +227,6 @@ public:
}
}
ACTOR static Future<Void> monitorThrottlingChanges(Ratekeeper* self) {
wait(self->tagThrottler->monitorThrottlingChanges());
return Void();
}
ACTOR static Future<Void> run(RatekeeperInterface rkInterf, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
state Ratekeeper self(rkInterf.id(), openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True));
state Future<Void> timeout = Void();
@ -408,7 +403,7 @@ Future<Void> Ratekeeper::trackTLogQueueInfo(TLogInterface tli) {
}
Future<Void> Ratekeeper::monitorThrottlingChanges() {
return RatekeeperImpl::monitorThrottlingChanges(this);
return tagThrottler->monitorThrottlingChanges();
}
Future<Void> Ratekeeper::run(RatekeeperInterface rkInterf, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
@ -436,7 +431,11 @@ Ratekeeper::Ratekeeper(UID id, Database db)
SERVER_KNOBS->SPRING_BYTES_TLOG_BATCH,
SERVER_KNOBS->MAX_TL_SS_VERSION_DIFFERENCE_BATCH,
SERVER_KNOBS->TARGET_DURABILITY_LAG_VERSIONS_BATCH) {
tagThrottler = std::make_unique<TagThrottler>(db, id);
if (SERVER_KNOBS->GLOBAL_TAG_THROTTLING) {
tagThrottler = std::make_unique<GlobalTagThrottler>(db, id);
} else {
tagThrottler = std::make_unique<TagThrottler>(db, id);
}
}
void Ratekeeper::updateCommitCostEstimation(

View File

@ -24,6 +24,7 @@
#include "flow/UnitTest.h"
#include "fdbclient/BackupContainer.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbserver/EncryptedMutationMessage.h"
#include "fdbserver/RestoreLoader.actor.h"
#include "fdbserver/RestoreRoleCommon.actor.h"
#include "fdbserver/MutationTracking.h"
@ -422,6 +423,9 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
ASSERT(inserted);
ArenaReader rd(buf.arena(), StringRef(message, msgSize), AssumeVersion(g_network->protocolVersion()));
if (EncryptedMutationMessage::isNextIn(rd)) {
throw encrypt_unsupported();
}
MutationRef mutation;
rd >> mutation;

View File

@ -22,7 +22,7 @@
#include "fdbserver/Knobs.h"
#include "fdbserver/RkTagThrottleCollection.h"
double RkTagThrottleCollection::RkTagThrottleData::getTargetRate(Optional<double> requestRate) {
double RkTagThrottleCollection::RkTagThrottleData::getTargetRate(Optional<double> requestRate) const {
if (limits.tpsRate == 0.0 || !requestRate.present() || requestRate.get() == 0.0 || !rateSet) {
return limits.tpsRate;
} else {
@ -347,10 +347,12 @@ int64_t RkTagThrottleCollection::manualThrottleCount() const {
return count;
}
void RkTagThrottleCollection::updateBusyTagCount(TagThrottledReason reason) {
void RkTagThrottleCollection::incrementBusyTagCount(TagThrottledReason reason) {
if (reason == TagThrottledReason::BUSY_READ) {
++busyReadTagCount;
} else if (reason == TagThrottledReason::BUSY_WRITE) {
++busyWriteTagCount;
} else {
ASSERT(false);
}
}

View File

@ -139,7 +139,7 @@ ACTOR Future<Void> ekLookupByDomainIds(Reference<SimKmsConnectorContext> ctx,
req.debugId.present() ? TraceEvent("SimKmsGetsByDomIds", interf.id()) : Optional<TraceEvent>();
if (dbgDIdTrace.present()) {
dbgDIdTrace.get().detail("DbgId", req.debugId.get());
dbgDIdTrace.get().setMaxEventLength(16384).detail("DbgId", req.debugId.get());
}
// Map encryptionDomainId to corresponding EncryptKeyCtx element using a modulo operation. This

View File

@ -2793,19 +2793,11 @@ ACTOR Future<Optional<Value>> getActivePrimaryDC(Database cx, int* fullyReplicat
}
}
// read storageWigglerStats through Read-only tx, then convert it to JSON field
ACTOR Future<JsonBuilderObject> storageWigglerStatsFetcher(Optional<DataDistributorInterface> ddWorker,
DatabaseConfiguration conf,
Database cx,
bool use_system_priority) {
ACTOR Future<std::pair<Optional<Value>, Optional<Value>>> readStorageWiggleMetrics(Database cx,
bool use_system_priority) {
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
state Optional<Value> primaryV;
state Optional<Value> remoteV;
state Future<ErrorOr<GetStorageWigglerStateReply>> stateFut;
if (ddWorker.present()) {
stateFut = ddWorker.get().storageWigglerState.tryGetReply(GetStorageWigglerStateRequest());
}
loop {
try {
if (use_system_priority) {
@ -2813,42 +2805,59 @@ ACTOR Future<JsonBuilderObject> storageWigglerStatsFetcher(Optional<DataDistribu
}
wait(store(primaryV, StorageWiggleMetrics::runGetTransaction(tr, true)) &&
store(remoteV, StorageWiggleMetrics::runGetTransaction(tr, false)));
wait(tr->commit());
break;
return std::make_pair(primaryV, remoteV);
} catch (Error& e) {
wait(tr->onError(e));
}
}
}
// read storageWigglerStats through Read-only tx, then convert it to JSON field
ACTOR Future<JsonBuilderObject> storageWigglerStatsFetcher(Optional<DataDistributorInterface> ddWorker,
DatabaseConfiguration conf,
Database cx,
bool use_system_priority,
JsonBuilderArray* messages) {
state Future<GetStorageWigglerStateReply> stateFut;
state Future<std::pair<Optional<Value>, Optional<Value>>> wiggleMetricsFut =
timeoutError(readStorageWiggleMetrics(cx, use_system_priority), 2.0);
state JsonBuilderObject res;
if (ddWorker.present()) {
stateFut = timeoutError(ddWorker.get().storageWigglerState.getReply(GetStorageWigglerStateRequest()), 2.0);
wait(ready(stateFut));
} else {
return res;
}
JsonBuilderObject res;
if (primaryV.present()) {
auto obj = ObjectReader::fromStringRef<StorageWiggleMetrics>(primaryV.get(), IncludeVersion()).toJSON();
if (stateFut.canGet() && stateFut.get().present()) {
auto& reply = stateFut.get().get();
try {
if (g_network->isSimulated() && BUGGIFY_WITH_PROB(0.01)) {
throw timed_out();
}
wait(success(wiggleMetricsFut) && success(stateFut));
auto [primaryV, remoteV] = wiggleMetricsFut.get();
if (primaryV.present()) {
auto obj = ObjectReader::fromStringRef<StorageWiggleMetrics>(primaryV.get(), IncludeVersion()).toJSON();
auto& reply = stateFut.get();
obj["state"] = StorageWiggler::getWiggleStateStr(static_cast<StorageWiggler::State>(reply.primary));
obj["last_state_change_timestamp"] = reply.lastStateChangePrimary;
obj["last_state_change_datetime"] = epochsToGMTString(reply.lastStateChangePrimary);
res["primary"] = obj;
}
res["primary"] = obj;
}
if (conf.regions.size() > 1 && remoteV.present()) {
auto obj = ObjectReader::fromStringRef<StorageWiggleMetrics>(remoteV.get(), IncludeVersion()).toJSON();
if (stateFut.canGet() && stateFut.get().present()) {
auto& reply = stateFut.get().get();
if (conf.regions.size() > 1 && remoteV.present()) {
auto obj = ObjectReader::fromStringRef<StorageWiggleMetrics>(remoteV.get(), IncludeVersion()).toJSON();
auto& reply = stateFut.get();
obj["state"] = StorageWiggler::getWiggleStateStr(static_cast<StorageWiggler::State>(reply.remote));
obj["last_state_change_timestamp"] = reply.lastStateChangeRemote;
obj["last_state_change_datetime"] = epochsToGMTString(reply.lastStateChangeRemote);
res["remote"] = obj;
}
res["remote"] = obj;
}
if (stateFut.canGet() && stateFut.isError()) {
res["error"] = std::string("Can't get storage wiggler state: ") + stateFut.getError().name();
TraceEvent(SevWarn, "StorageWigglerStatsFetcher").error(stateFut.getError());
} else if (stateFut.canGet() && stateFut.get().isError()) {
res["error"] = std::string("Can't get storage wiggler state: ") + stateFut.get().getError().name();
TraceEvent(SevWarn, "StorageWigglerStatsFetcher").error(stateFut.get().getError());
return res;
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled)
throw;
messages->push_back(JsonString::makeMessage("fetch_storage_wiggler_stats_timeout",
"Fetching storage wiggler stats timed out."));
}
return res;
}
@ -3096,17 +3105,29 @@ ACTOR Future<StatusReply> clusterGetStatus(
if (configuration.get().perpetualStorageWiggleSpeed > 0) {
state Future<std::vector<std::pair<UID, StorageWiggleValue>>> primaryWiggleValues;
state Future<std::vector<std::pair<UID, StorageWiggleValue>>> remoteWiggleValues;
double timeout = g_network->isSimulated() && BUGGIFY_WITH_PROB(0.01) ? 0.0 : 2.0;
primaryWiggleValues = timeoutError(readStorageWiggleValues(cx, true, true), timeout);
remoteWiggleValues = timeoutError(readStorageWiggleValues(cx, false, true), timeout);
wait(store(
storageWiggler,
storageWigglerStatsFetcher(db->get().distributor, configuration.get(), cx, true, &messages)) &&
ready(primaryWiggleValues) && ready(remoteWiggleValues));
primaryWiggleValues = readStorageWiggleValues(cx, true, true);
remoteWiggleValues = readStorageWiggleValues(cx, false, true);
wait(store(storageWiggler,
storageWigglerStatsFetcher(db->get().distributor, configuration.get(), cx, true)) &&
success(primaryWiggleValues) && success(remoteWiggleValues));
for (auto& p : primaryWiggleValues.get())
wiggleServers.insert(p.first);
for (auto& p : remoteWiggleValues.get())
wiggleServers.insert(p.first);
if (primaryWiggleValues.canGet()) {
for (auto& p : primaryWiggleValues.get())
wiggleServers.insert(p.first);
} else {
messages.push_back(
JsonString::makeMessage("fetch_storage_wiggler_stats_timeout",
"Fetching wiggling servers in primary region timed out"));
}
if (remoteWiggleValues.canGet()) {
for (auto& p : remoteWiggleValues.get())
wiggleServers.insert(p.first);
} else {
messages.push_back(JsonString::makeMessage("fetch_storage_wiggler_stats_timeout",
"Fetching wiggling servers in remote region timed out"));
}
}
state std::vector<JsonBuilderObject> workerStatuses = wait(getAll(futures2));

View File

@ -23,6 +23,8 @@
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/SystemData.h"
#include "fdbserver/EncryptedMutationMessage.h"
#include "fdbserver/GetEncryptCipherKeys.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbclient/StorageServerInterface.h"
@ -1874,6 +1876,9 @@ ACTOR Future<Void> pullAsyncData(StorageCacheData* data) {
state FetchInjectionInfo fii;
state Reference<ILogSystem::IPeekCursor> cloneCursor2;
state Optional<std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>>> cipherKeys;
state bool collectingCipherKeys = false;
// If an encrypted mutation is encountered, we collect its cipher details and fetch cipher keys, then start over.
loop {
state uint64_t changeCounter = data->cacheRangeChangeCounter;
bool epochEnd = false;
@ -1881,6 +1886,8 @@ ACTOR Future<Void> pullAsyncData(StorageCacheData* data) {
bool firstMutation = true;
bool dbgLastMessageWasProtocol = false;
std::unordered_set<BlobCipherDetails> cipherDetails;
Reference<ILogSystem::IPeekCursor> cloneCursor1 = cursor->cloneNoMore();
cloneCursor2 = cursor->cloneNoMore();
@ -1904,36 +1911,60 @@ ACTOR Future<Void> pullAsyncData(StorageCacheData* data) {
OTELSpanContextMessage::isNextIn(cloneReader)) {
OTELSpanContextMessage scm;
cloneReader >> scm;
} else if (cloneReader.protocolVersion().hasEncryptionAtRest() &&
EncryptedMutationMessage::isNextIn(cloneReader) && !cipherKeys.present()) {
// Encrypted mutation found, but cipher keys haven't been fetched.
// Collect cipher details to fetch cipher keys in one batch.
EncryptedMutationMessage emm;
cloneReader >> emm;
cipherDetails.insert(emm.header.cipherTextDetails);
cipherDetails.insert(emm.header.cipherHeaderDetails);
collectingCipherKeys = true;
} else {
MutationRef msg;
cloneReader >> msg;
if (firstMutation && msg.param1.startsWith(systemKeys.end))
hasPrivateData = true;
firstMutation = false;
if (msg.param1 == lastEpochEndPrivateKey) {
epochEnd = true;
// ASSERT(firstMutation);
ASSERT(dbgLastMessageWasProtocol);
if (cloneReader.protocolVersion().hasEncryptionAtRest() &&
EncryptedMutationMessage::isNextIn(cloneReader)) {
ASSERT(cipherKeys.present());
msg = EncryptedMutationMessage::decrypt(cloneReader, cloneReader.arena(), cipherKeys.get());
} else {
cloneReader >> msg;
}
dbgLastMessageWasProtocol = false;
if (!collectingCipherKeys) {
if (firstMutation && msg.param1.startsWith(systemKeys.end))
hasPrivateData = true;
firstMutation = false;
if (msg.param1 == lastEpochEndPrivateKey) {
epochEnd = true;
// ASSERT(firstMutation);
ASSERT(dbgLastMessageWasProtocol);
}
dbgLastMessageWasProtocol = false;
}
}
}
// Any fetchKeys which are ready to transition their cacheRanges to the adding,transferred state do so
// now. If there is an epoch end we skip this step, to increase testability and to prevent inserting a
// version in the middle of a rolled back version range.
while (!hasPrivateData && !epochEnd && !data->readyFetchKeys.empty()) {
auto fk = data->readyFetchKeys.back();
data->readyFetchKeys.pop_back();
fk.send(&fii);
if (collectingCipherKeys) {
std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>> result =
wait(getEncryptCipherKeys(data->db, cipherDetails));
cipherKeys = result;
collectingCipherKeys = false;
} else {
// Any fetchKeys which are ready to transition their cacheRanges to the adding,transferred state do
// so now. If there is an epoch end we skip this step, to increase testability and to prevent
// inserting a version in the middle of a rolled back version range.
while (!hasPrivateData && !epochEnd && !data->readyFetchKeys.empty()) {
auto fk = data->readyFetchKeys.back();
data->readyFetchKeys.pop_back();
fk.send(&fii);
}
if (data->cacheRangeChangeCounter == changeCounter)
break;
// TEST(true); // A fetchKeys completed while we were doing this, so eager might be outdated. Read
// it again.
}
if (data->cacheRangeChangeCounter == changeCounter)
break;
// TEST(true); // A fetchKeys completed while we were doing this, so eager might be outdated. Read it
// again.
}
data->debug_inApplyUpdate = true;
@ -1988,7 +2019,11 @@ ACTOR Future<Void> pullAsyncData(StorageCacheData* data) {
reader >> oscm;
} else {
MutationRef msg;
reader >> msg;
if (reader.protocolVersion().hasEncryptionAtRest() && EncryptedMutationMessage::isNextIn(reader)) {
msg = EncryptedMutationMessage::decrypt(reader, reader.arena(), cipherKeys.get());
} else {
reader >> msg;
}
if (ver != invalidVersion) // This change belongs to a version < minVersion
{

View File

@ -28,7 +28,6 @@
#include "fdbclient/FDBTypes.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/LogProtocolMessage.h"
#include "fdbserver/SpanContextMessage.h"
#include "fdbserver/TLogInterface.h"
#include "fdbserver/Knobs.h"
@ -366,9 +365,9 @@ struct TLogData : NonCopyable {
// the set and for callers that unset will
// be able to match it up
std::string dataFolder; // folder where data is stored
Reference<AsyncVar<bool>> degraded;
// End of fields used by snapshot based backup and restore
Reference<AsyncVar<bool>> degraded;
std::vector<TagsAndMessage> tempTagMessages;
Reference<Histogram> commitLatencyDist;
@ -2569,42 +2568,6 @@ void getQueuingMetrics(TLogData* self, Reference<LogData> logData, TLogQueuingMe
req.reply.send(reply);
}
ACTOR Future<Void> tLogSnapCreate(TLogSnapRequest snapReq, TLogData* self, Reference<LogData> logData) {
if (self->ignorePopUid != snapReq.snapUID.toString()) {
snapReq.reply.sendError(operation_failed());
return Void();
}
ExecCmdValueString snapArg(snapReq.snapPayload);
try {
int err = wait(execHelper(&snapArg, snapReq.snapUID, self->dataFolder, snapReq.role.toString()));
std::string uidStr = snapReq.snapUID.toString();
TraceEvent("ExecTraceTLog")
.detail("Uid", uidStr)
.detail("Status", err)
.detail("Role", snapReq.role)
.detail("Value", self->dataFolder)
.detail("ExecPayload", snapReq.snapPayload)
.detail("PersistentDataVersion", logData->persistentDataVersion)
.detail("PersistentDatadurableVersion", logData->persistentDataDurableVersion)
.detail("QueueCommittedVersion", logData->queueCommittedVersion.get())
.detail("Version", logData->version.get());
if (err != 0) {
throw operation_failed();
}
snapReq.reply.send(Void());
} catch (Error& e) {
TraceEvent("TLogExecHelperError").errorUnsuppressed(e);
if (e.code() != error_code_operation_cancelled) {
snapReq.reply.sendError(e);
} else {
throw e;
}
}
return Void();
}
ACTOR Future<Void> tLogEnablePopReq(TLogEnablePopRequest enablePopReq, TLogData* self, Reference<LogData> logData) {
if (self->ignorePopUid != enablePopReq.snapUID.toString()) {
TraceEvent(SevWarn, "TLogPopDisableEnableUidMismatch")
@ -2731,9 +2694,6 @@ ACTOR Future<Void> serveTLogInterface(TLogData* self,
when(TLogEnablePopRequest enablePopReq = waitNext(tli.enablePopRequest.getFuture())) {
logData->addActor.send(tLogEnablePopReq(enablePopReq, self, logData));
}
when(TLogSnapRequest snapReq = waitNext(tli.snapRequest.getFuture())) {
logData->addActor.send(tLogSnapCreate(snapReq, self, logData));
}
}
}

View File

@ -21,6 +21,7 @@
#include "fdbserver/TagThrottler.h"
#include "fdbserver/RkTagThrottleCollection.h"
#include "flow/actorcompiler.h" // must be last include
class TagThrottlerImpl {
Database db;
@ -106,7 +107,7 @@ class TagThrottlerImpl {
if (tagKey.throttleType == TagThrottleType::AUTO) {
updatedTagThrottles.autoThrottleTag(
self->id, tag, 0, tagValue.tpsRate, tagValue.expirationTime);
updatedTagThrottles.updateBusyTagCount(tagValue.reason);
updatedTagThrottles.incrementBusyTagCount(tagValue.reason);
} else {
updatedTagThrottles.manualThrottleTag(self->id,
tag,
@ -143,6 +144,7 @@ class TagThrottlerImpl {
if (busyness > SERVER_KNOBS->AUTO_THROTTLE_TARGET_TAG_BUSYNESS && rate > SERVER_KNOBS->MIN_TAG_COST) {
TEST(true); // Transaction tag auto-throttled
Optional<double> clientRate = throttledTags.autoThrottleTag(id, tag, busyness);
// TODO: Increment tag throttle counts here?
if (clientRate.present()) {
TagSet tags;
tags.addTag(tag);
@ -185,23 +187,21 @@ public:
// the future
auto storageQueue = ss.getStorageQueueBytes();
auto storageDurabilityLag = ss.getDurabilityLag();
std::vector<Future<Void>> futures;
if (storageQueue > SERVER_KNOBS->AUTO_TAG_THROTTLE_STORAGE_QUEUE_BYTES ||
storageDurabilityLag > SERVER_KNOBS->AUTO_TAG_THROTTLE_DURABILITY_LAG_VERSIONS) {
// TODO: Update once size is potentially > 1
ASSERT_WE_THINK(ss.busiestWriteTags.size() <= 1);
ASSERT_WE_THINK(ss.busiestReadTags.size() <= 1);
for (const auto& busyWriteTag : ss.busiestWriteTags) {
return tryUpdateAutoThrottling(busyWriteTag.tag,
busyWriteTag.rate,
busyWriteTag.fractionalBusyness,
TagThrottledReason::BUSY_WRITE);
futures.push_back(tryUpdateAutoThrottling(busyWriteTag.tag,
busyWriteTag.rate,
busyWriteTag.fractionalBusyness,
TagThrottledReason::BUSY_WRITE));
}
for (const auto& busyReadTag : ss.busiestReadTags) {
return tryUpdateAutoThrottling(
busyReadTag.tag, busyReadTag.rate, busyReadTag.fractionalBusyness, TagThrottledReason::BUSY_READ);
futures.push_back(tryUpdateAutoThrottling(
busyReadTag.tag, busyReadTag.rate, busyReadTag.fractionalBusyness, TagThrottledReason::BUSY_READ));
}
}
return Void();
return waitForAll(futures);
}
}; // class TagThrottlerImpl

View File

@ -48,7 +48,7 @@ public:
for (int i = 0; i < tenantList.size(); i++) {
TenantName tname = tenantList[i].key.removePrefix(tenantMapPrefix);
TenantMapEntry t = decodeTenantEntry(tenantList[i].value);
TenantMapEntry t = TenantMapEntry::decode(tenantList[i].value);
tenantCache->insert(tname, t);
@ -86,7 +86,7 @@ public:
for (int i = 0; i < tenantList.size(); i++) {
TenantName tname = tenantList[i].key.removePrefix(tenantMapPrefix);
TenantMapEntry t = decodeTenantEntry(tenantList[i].value);
TenantMapEntry t = TenantMapEntry::decode(tenantList[i].value);
if (tenantCache->update(tname, t)) {
tenantListUpdated = true;

View File

@ -18,50 +18,193 @@
* limitations under the License.
*/
#include "fdbserver/Knobs.h"
#include "fdbserver/TransactionTagCounter.h"
#include "flow/Trace.h"
TransactionTagCounter::TransactionTagCounter(UID thisServerID)
: thisServerID(thisServerID),
busiestReadTagEventHolder(makeReference<EventCacheHolder>(thisServerID.toString() + "/BusiestReadTag")) {}
namespace {
void TransactionTagCounter::addRequest(Optional<TagSet> const& tags, int64_t bytes) {
if (tags.present()) {
TEST(true); // Tracking transaction tag in counter
double cost = costFunction(bytes);
for (auto& tag : tags.get()) {
int64_t& count = intervalCounts[TransactionTag(tag, tags.get().getArena())];
count += cost;
if (count > busiestTagCount) {
busiestTagCount = count;
busiestTag = tag;
class TopKTags {
public:
struct TagAndCount {
TransactionTag tag;
int64_t count;
bool operator<(TagAndCount const& other) const { return count < other.count; }
explicit TagAndCount(TransactionTag tag, int64_t count) : tag(tag), count(count) {}
};
private:
// Because the number of tracked tags is expected to be small, they can be tracked
// in a simple vector. If the number of tracked tags increases, a more sophisticated
// data structure will be required.
std::vector<TagAndCount> topTags;
int limit;
public:
explicit TopKTags(int limit) : limit(limit) {
ASSERT_GT(limit, 0);
topTags.reserve(limit);
}
void incrementCount(TransactionTag tag, int previousCount, int increase) {
auto iter = std::find_if(topTags.begin(), topTags.end(), [tag](const auto& tc) { return tc.tag == tag; });
if (iter != topTags.end()) {
ASSERT_EQ(previousCount, iter->count);
iter->count += increase;
} else if (topTags.size() < limit) {
ASSERT_EQ(previousCount, 0);
topTags.emplace_back(tag, increase);
} else {
auto toReplace = std::min_element(topTags.begin(), topTags.end());
ASSERT_GE(toReplace->count, previousCount);
if (toReplace->count < previousCount + increase) {
toReplace->tag = tag;
toReplace->count = previousCount + increase;
}
}
intervalTotalSampledCount += cost;
}
std::vector<StorageQueuingMetricsReply::TagInfo> getBusiestTags(double elapsed, double totalSampleCount) const {
std::vector<StorageQueuingMetricsReply::TagInfo> result;
for (auto const& tagAndCounter : topTags) {
auto rate = (tagAndCounter.count / CLIENT_KNOBS->READ_TAG_SAMPLE_RATE) / elapsed;
if (rate > SERVER_KNOBS->MIN_TAG_READ_PAGES_RATE) {
result.emplace_back(tagAndCounter.tag, rate, tagAndCounter.count / totalSampleCount);
}
}
return result;
}
void clear() { topTags.clear(); }
};
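// Illustrative trace (not part of the original change): how incrementCount above evicts the
// current minimum only when the new total would exceed it, assuming limit == 2:
//
//   TopKTags top(2);
//   top.incrementCount("a"_sr, 0, 10); // tracked: {a:10}
//   top.incrementCount("b"_sr, 0, 5);  // tracked: {a:10, b:5}
//   top.incrementCount("c"_sr, 0, 7);  // "b" is the minimum and 7 > 5, so tracked becomes {a:10, c:7}
//   top.incrementCount("b"_sr, 5, 1);  // minimum is 7 and 5 + 1 == 6 < 7, so nothing changes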
} // namespace
class TransactionTagCounterImpl {
UID thisServerID;
TransactionTagMap<int64_t> intervalCounts;
int64_t intervalTotalSampledCount = 0;
TopKTags topTags;
double intervalStart = 0;
std::vector<StorageQueuingMetricsReply::TagInfo> previousBusiestTags;
Reference<EventCacheHolder> busiestReadTagEventHolder;
static int64_t costFunction(int64_t bytes) { return bytes / SERVER_KNOBS->READ_COST_BYTE_FACTOR + 1; }
public:
TransactionTagCounterImpl(UID thisServerID)
: thisServerID(thisServerID), topTags(SERVER_KNOBS->SS_THROTTLE_TAGS_TRACKED),
busiestReadTagEventHolder(makeReference<EventCacheHolder>(thisServerID.toString() + "/BusiestReadTag")) {}
void addRequest(Optional<TagSet> const& tags, int64_t bytes) {
if (tags.present()) {
TEST(true); // Tracking transaction tag in counter
double cost = costFunction(bytes);
for (auto& tag : tags.get()) {
int64_t& count = intervalCounts[TransactionTag(tag, tags.get().getArena())];
topTags.incrementCount(tag, count, cost);
count += cost;
}
intervalTotalSampledCount += cost;
}
}
void startNewInterval() {
double elapsed = now() - intervalStart;
previousBusiestTags.clear();
if (intervalStart > 0 && CLIENT_KNOBS->READ_TAG_SAMPLE_RATE > 0 && elapsed > 0) {
previousBusiestTags = topTags.getBusiestTags(elapsed, intervalTotalSampledCount);
TraceEvent("BusiestReadTag", thisServerID)
.detail("Elapsed", elapsed)
//.detail("Tag", printable(busiestTag))
//.detail("TagCost", busiestTagCount)
.detail("TotalSampledCost", intervalTotalSampledCount)
.detail("Reported", previousBusiestTags.size())
.trackLatest(busiestReadTagEventHolder->trackingKey);
}
intervalCounts.clear();
intervalTotalSampledCount = 0;
topTags.clear();
intervalStart = now();
}
std::vector<StorageQueuingMetricsReply::TagInfo> const& getBusiestTags() const { return previousBusiestTags; }
};
TransactionTagCounter::TransactionTagCounter(UID thisServerID)
: impl(PImpl<TransactionTagCounterImpl>::create(thisServerID)) {}
TransactionTagCounter::~TransactionTagCounter() = default;
void TransactionTagCounter::addRequest(Optional<TagSet> const& tags, int64_t bytes) {
return impl->addRequest(tags, bytes);
}
void TransactionTagCounter::startNewInterval() {
double elapsed = now() - intervalStart;
previousBusiestTags.clear();
if (intervalStart > 0 && CLIENT_KNOBS->READ_TAG_SAMPLE_RATE > 0 && elapsed > 0) {
double rate = busiestTagCount / CLIENT_KNOBS->READ_TAG_SAMPLE_RATE / elapsed;
if (rate > SERVER_KNOBS->MIN_TAG_READ_PAGES_RATE) {
previousBusiestTags.emplace_back(busiestTag, rate, (double)busiestTagCount / intervalTotalSampledCount);
}
TraceEvent("BusiestReadTag", thisServerID)
.detail("Elapsed", elapsed)
.detail("Tag", printable(busiestTag))
.detail("TagCost", busiestTagCount)
.detail("TotalSampledCost", intervalTotalSampledCount)
.detail("Reported", !previousBusiestTags.empty())
.trackLatest(busiestReadTagEventHolder->trackingKey);
}
intervalCounts.clear();
intervalTotalSampledCount = 0;
busiestTagCount = 0;
intervalStart = now();
return impl->startNewInterval();
}
std::vector<StorageQueuingMetricsReply::TagInfo> const& TransactionTagCounter::getBusiestTags() const {
return impl->getBusiestTags();
}
TEST_CASE("/TransactionTagCounter/TopKTags") {
TopKTags topTags(2);
// Ensure that costs are large enough to show up
auto const costMultiplier =
std::max<double>(1.0, 2 * SERVER_KNOBS->MIN_TAG_READ_PAGES_RATE * CLIENT_KNOBS->READ_TAG_SAMPLE_RATE);
ASSERT_EQ(topTags.getBusiestTags(1.0, 0).size(), 0);
topTags.incrementCount("a"_sr, 0, 1 * costMultiplier);
{
auto const busiestTags = topTags.getBusiestTags(1.0, 1 * costMultiplier);
ASSERT_EQ(busiestTags.size(), 1);
ASSERT_EQ(std::count_if(busiestTags.begin(),
busiestTags.end(),
[](auto const& tagInfo) { return tagInfo.tag == "a"_sr; }),
1);
}
topTags.incrementCount("b"_sr, 0, 2 * costMultiplier);
topTags.incrementCount("c"_sr, 0, 3 * costMultiplier);
{
auto busiestTags = topTags.getBusiestTags(1.0, 6 * costMultiplier);
ASSERT_EQ(busiestTags.size(), 2);
ASSERT_EQ(std::count_if(busiestTags.begin(),
busiestTags.end(),
[](auto const& tagInfo) { return tagInfo.tag == "a"_sr; }),
0);
ASSERT_EQ(std::count_if(busiestTags.begin(),
busiestTags.end(),
[](auto const& tagInfo) { return tagInfo.tag == "b"_sr; }),
1);
ASSERT_EQ(std::count_if(busiestTags.begin(),
busiestTags.end(),
[](auto const& tagInfo) { return tagInfo.tag == "c"_sr; }),
1);
}
topTags.incrementCount("a"_sr, 1 * costMultiplier, 3 * costMultiplier);
{
auto busiestTags = topTags.getBusiestTags(1.0, 9 * costMultiplier);
ASSERT_EQ(busiestTags.size(), 2);
ASSERT_EQ(std::count_if(busiestTags.begin(),
busiestTags.end(),
[](auto const& tagInfo) { return tagInfo.tag == "a"_sr; }),
1);
ASSERT_EQ(std::count_if(busiestTags.begin(),
busiestTags.end(),
[](auto const& tagInfo) { return tagInfo.tag == "b"_sr; }),
0);
ASSERT_EQ(std::count_if(busiestTags.begin(),
busiestTags.end(),
[](auto const& tagInfo) { return tagInfo.tag == "c"_sr; }),
1);
}
topTags.clear();
ASSERT_EQ(topTags.getBusiestTags(1.0, 0).size(), 0);
return Void();
}

View File

@ -33,6 +33,7 @@
#include "fdbserver/LogProtocolMessage.h"
#include "fdbserver/LogSystem.h"
#include "fdbserver/ProxyCommitData.actor.h"
#include "flow/BlobCipher.h"
#include "flow/FastRef.h"
// Resolver's data for applyMetadataMutations() calls.
@ -93,6 +94,7 @@ void applyMetadataMutations(SpanContext const& spanContext,
Reference<ILogSystem> logSystem,
const VectorRef<MutationRef>& mutations,
LogPushData* pToCommit,
const std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>* pCipherKeys,
bool& confChange,
Version version,
Version popVersion,

View File

@ -0,0 +1,117 @@
/*
* EncryptedMutationMessage.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBSERVER_ENCRYPTEDMUTATIONMESSAGE_H
#define FDBSERVER_ENCRYPTEDMUTATIONMESSAGE_H
#pragma once
#include "fdbclient/CommitTransaction.h"
#include "flow/BlobCipher.h"
struct EncryptedMutationMessage {
BlobCipherEncryptHeader header;
StringRef encrypted;
EncryptedMutationMessage() {}
std::string toString() const {
return format("code: %d, encryption info: %s",
MutationRef::Reserved_For_EncryptedMutationMessage,
header.toString().c_str());
}
template <class Ar>
void serialize(Ar& ar) {
uint8_t poly = MutationRef::Reserved_For_EncryptedMutationMessage;
serializer(ar, poly, header, encrypted);
}
static bool startsEncryptedMutationMessage(uint8_t byte) {
return byte == MutationRef::Reserved_For_EncryptedMutationMessage;
}
template <class Ar>
static bool isNextIn(Ar& ar) {
return startsEncryptedMutationMessage(*(const uint8_t*)ar.peekBytes(1));
}
// Encrypt the given mutation and return an EncryptedMutationMessage.
static EncryptedMutationMessage encrypt(
Arena& arena,
const std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>& cipherKeys,
const EncryptCipherDomainId& domainId,
const MutationRef& mutation) {
ASSERT_NE(domainId, ENCRYPT_INVALID_DOMAIN_ID);
auto textCipherItr = cipherKeys.find(domainId);
auto headerCipherItr = cipherKeys.find(ENCRYPT_HEADER_DOMAIN_ID);
ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid());
ASSERT(headerCipherItr != cipherKeys.end() && headerCipherItr->second.isValid());
uint8_t iv[AES_256_IV_LENGTH];
generateRandomData(iv, AES_256_IV_LENGTH);
BinaryWriter bw(AssumeVersion(g_network->protocolVersion()));
bw << mutation;
EncryptedMutationMessage encrypted_mutation;
EncryptBlobCipherAes265Ctr cipher(textCipherItr->second,
headerCipherItr->second,
iv,
AES_256_IV_LENGTH,
ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE);
encrypted_mutation.encrypted =
cipher
.encrypt(static_cast<const uint8_t*>(bw.getData()), bw.getLength(), &encrypted_mutation.header, arena)
->toStringRef();
return encrypted_mutation;
}
// Encrypt system key space mutation and return an EncryptedMutationMessage.
static EncryptedMutationMessage encryptMetadata(
Arena& arena,
const std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>& cipherKeys,
const MutationRef& mutation) {
return encrypt(arena, cipherKeys, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, mutation);
}
// Read an EncryptedMutationMessage from the given reader, decrypt it, and return the decrypted mutation.
// Also return the decrypt buffer through buf, if it is specified.
template <class Ar>
static MutationRef decrypt(Ar& ar,
Arena& arena,
const std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>>& cipherKeys,
StringRef* buf = nullptr) {
EncryptedMutationMessage msg;
ar >> msg;
auto textCipherItr = cipherKeys.find(msg.header.cipherTextDetails);
auto headerCipherItr = cipherKeys.find(msg.header.cipherHeaderDetails);
ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid());
ASSERT(headerCipherItr != cipherKeys.end() && headerCipherItr->second.isValid());
DecryptBlobCipherAes256Ctr cipher(textCipherItr->second, headerCipherItr->second, msg.header.iv);
StringRef plaintext =
cipher.decrypt(msg.encrypted.begin(), msg.encrypted.size(), msg.header, arena)->toStringRef();
if (buf != nullptr) {
*buf = plaintext;
}
ArenaReader reader(arena, plaintext, AssumeVersion(g_network->protocolVersion()));
MutationRef mutation;
reader >> mutation;
return mutation;
}
};
#endif
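// Illustrative sketch (not part of the original change): a possible round trip through the
// helpers above. The cipher-key maps are assumptions here — domainKeys keyed by
// EncryptCipherDomainId for encrypt, detailKeys keyed by BlobCipherDetails for decrypt —
// presumed to have been fetched already, e.g. via the GetEncryptCipherKeys helpers:
//
//   Arena arena;
//   MutationRef m(MutationRef::SetValue, "foo"_sr, "bar"_sr);
//   EncryptedMutationMessage emm = EncryptedMutationMessage::encrypt(arena, domainKeys, domainId, m);
//   BinaryWriter wr(AssumeVersion(g_network->protocolVersion()));
//   wr << emm;
//   ArenaReader rd(arena, wr.toValue(), AssumeVersion(g_network->protocolVersion()));
//   ASSERT(EncryptedMutationMessage::isNextIn(rd));
//   MutationRef decrypted = EncryptedMutationMessage::decrypt(rd, arena, detailKeys);
//   ASSERT(decrypted.param1 == m.param1 && decrypted.param2 == m.param2);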

View File

@ -1,5 +1,5 @@
/*
* GetCipherKeys.h
* GetEncryptCipherKeys.h
*
* This source file is part of the FoundationDB open source project
*

View File

@ -148,7 +148,7 @@ class Ratekeeper {
double lastWarning;
double lastSSListFetchedTimestamp;
std::unique_ptr<class TagThrottler> tagThrottler;
std::unique_ptr<class ITagThrottler> tagThrottler;
RatekeeperLimits normalLimits;
RatekeeperLimits batchLimits;

View File

@ -42,7 +42,7 @@ class RkTagThrottleCollection : NonCopyable {
bool rateSet = false;
RkTagThrottleData() : clientRate(CLIENT_KNOBS->TAG_THROTTLE_SMOOTHING_WINDOW) {}
double getTargetRate(Optional<double> requestRate);
double getTargetRate(Optional<double> requestRate) const;
Optional<double> updateAndGetClientRate(Optional<double> requestRate);
};
@ -83,7 +83,7 @@ public:
void addRequests(TransactionTag const& tag, int requests);
int64_t autoThrottleCount() const { return autoThrottledTags.size(); }
int64_t manualThrottleCount() const;
void updateBusyTagCount(TagThrottledReason);
void incrementBusyTagCount(TagThrottledReason);
auto getBusyReadTagCount() const { return busyReadTagCount; }
auto getBusyWriteTagCount() const { return busyWriteTagCount; }
};

View File

@ -357,6 +357,7 @@ struct StorageServerMetrics {
}
void splitMetrics(SplitMetricsRequest req) const {
int minSplitBytes = req.minSplitBytes.present() ? req.minSplitBytes.get() : SERVER_KNOBS->MIN_SHARD_BYTES;
try {
SplitMetricsReply reply;
KeyRef lastKey = req.keys.begin;
@ -364,10 +365,10 @@ struct StorageServerMetrics {
StorageMetrics estimated = req.estimated;
StorageMetrics remaining = getMetrics(req.keys) + used;
//TraceEvent("SplitMetrics").detail("Begin", req.keys.begin).detail("End", req.keys.end).detail("Remaining", remaining.bytes).detail("Used", used.bytes);
//TraceEvent("SplitMetrics").detail("Begin", req.keys.begin).detail("End", req.keys.end).detail("Remaining", remaining.bytes).detail("Used", used.bytes).detail("MinSplitBytes", minSplitBytes);
while (true) {
if (remaining.bytes < 2 * SERVER_KNOBS->MIN_SHARD_BYTES)
if (remaining.bytes < 2 * minSplitBytes)
break;
KeyRef key = req.keys.end;
bool hasUsed = used.bytes != 0 || used.bytesPerKSecond != 0 || used.iosPerKSecond != 0;
@ -382,10 +383,9 @@ struct StorageServerMetrics {
lastKey,
key,
hasUsed);
if (used.bytes < SERVER_KNOBS->MIN_SHARD_BYTES)
key = std::max(key,
byteSample.splitEstimate(KeyRangeRef(lastKey, req.keys.end),
SERVER_KNOBS->MIN_SHARD_BYTES - used.bytes));
if (used.bytes < minSplitBytes)
key = std::max(
key, byteSample.splitEstimate(KeyRangeRef(lastKey, req.keys.end), minSplitBytes - used.bytes));
key = getSplitKey(remaining.iosPerKSecond,
estimated.iosPerKSecond,
req.limits.iosPerKSecond,
@ -532,7 +532,7 @@ struct StorageServerMetrics {
auto _ranges = getReadHotRanges(req.keys,
SERVER_KNOBS->SHARD_MAX_READ_DENSITY_RATIO,
SERVER_KNOBS->READ_HOT_SUB_RANGE_CHUNK_SIZE,
SERVER_KNOBS->SHARD_READ_HOT_BANDWITH_MIN_PER_KSECONDS);
SERVER_KNOBS->SHARD_READ_HOT_BANDWIDTH_MIN_PER_KSECONDS);
reply.readHotRanges = VectorRef(_ranges.data(), _ranges.size());
req.reply.send(reply);
}

View File

@ -32,7 +32,6 @@
#include "fdbrpc/simulator.h"
#include "fdbserver/DBCoreState.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/LogProtocolMessage.h"
#include "fdbserver/LogSystem.h"
#include "fdbserver/RecoveryState.h"
#include "fdbserver/ServerDBInfo.h"

View File

@ -23,32 +23,72 @@
#include "fdbclient/PImpl.h"
#include "fdbserver/Ratekeeper.h"
class TagThrottler {
class ITagThrottler {
public:
virtual ~ITagThrottler() = default;
// Poll the system keyspace looking for updates made through the tag throttling API
virtual Future<Void> monitorThrottlingChanges() = 0;
// Increment the number of known requests associated with the specified tag
virtual void addRequests(TransactionTag tag, int count) = 0;
// This throttled tag change ID is used to coordinate updates with the GRV proxies
virtual uint64_t getThrottledTagChangeId() const = 0;
// For each tag and priority combination, return the throughput limit and expiration time
// Also, erase expired tags
virtual PrioritizedTransactionTagMap<ClientTagThrottleLimits> getClientRates() = 0;
virtual int64_t autoThrottleCount() const = 0;
virtual uint32_t busyReadTagCount() const = 0;
virtual uint32_t busyWriteTagCount() const = 0;
virtual int64_t manualThrottleCount() const = 0;
virtual bool isAutoThrottlingEnabled() const = 0;
// Based on the busiest read and write tags in the provided storage queue info, update
// tag throttling limits.
virtual Future<Void> tryUpdateAutoThrottling(StorageQueueInfo const&) = 0;
};
class TagThrottler : public ITagThrottler {
PImpl<class TagThrottlerImpl> impl;
public:
TagThrottler(Database db, UID id);
~TagThrottler();
// Poll the system keyspace looking for updates made through the tag throttling API
Future<Void> monitorThrottlingChanges();
// Increment the number of known requests associated with the specified tag
void addRequests(TransactionTag tag, int count);
// This throttled tag change ID is used to coordinate updates with the GRV proxies
uint64_t getThrottledTagChangeId() const;
// For each tag and priority combination, return the throughput limit and expiration time
PrioritizedTransactionTagMap<ClientTagThrottleLimits> getClientRates();
int64_t autoThrottleCount() const;
uint32_t busyReadTagCount() const;
uint32_t busyWriteTagCount() const;
int64_t manualThrottleCount() const;
bool isAutoThrottlingEnabled() const;
// Based on the busiest read and write tags in the provided storage queue info, update
// tag throttling limits.
Future<Void> tryUpdateAutoThrottling(StorageQueueInfo const&);
Future<Void> monitorThrottlingChanges() override;
void addRequests(TransactionTag tag, int count) override;
uint64_t getThrottledTagChangeId() const override;
PrioritizedTransactionTagMap<ClientTagThrottleLimits> getClientRates() override;
int64_t autoThrottleCount() const override;
uint32_t busyReadTagCount() const override;
uint32_t busyWriteTagCount() const override;
int64_t manualThrottleCount() const override;
bool isAutoThrottlingEnabled() const override;
Future<Void> tryUpdateAutoThrottling(StorageQueueInfo const&) override;
};
class GlobalTagThrottler : public ITagThrottler {
PImpl<class GlobalTagThrottlerImpl> impl;
public:
GlobalTagThrottler(Database db, UID id);
~GlobalTagThrottler();
Future<Void> monitorThrottlingChanges() override;
void addRequests(TransactionTag tag, int count) override;
uint64_t getThrottledTagChangeId() const override;
PrioritizedTransactionTagMap<ClientTagThrottleLimits> getClientRates() override;
int64_t autoThrottleCount() const override;
uint32_t busyReadTagCount() const override;
uint32_t busyWriteTagCount() const override;
int64_t manualThrottleCount() const override;
bool isAutoThrottlingEnabled() const override;
Future<Void> tryUpdateAutoThrottling(StorageQueueInfo const&) override;
// testing only
public:
void setQuota(TransactionTagRef, ThrottleApi::TagQuotaValue const&);
};

View File

@ -20,25 +20,23 @@
#pragma once
#include "fdbclient/PImpl.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/TagThrottle.actor.h"
#include "fdbserver/Knobs.h"
class TransactionTagCounter {
TransactionTagMap<int64_t> intervalCounts;
int64_t intervalTotalSampledCount = 0;
TransactionTag busiestTag;
int64_t busiestTagCount = 0;
double intervalStart = 0;
std::vector<StorageQueuingMetricsReply::TagInfo> previousBusiestTags;
UID thisServerID;
Reference<EventCacheHolder> busiestReadTagEventHolder;
PImpl<class TransactionTagCounterImpl> impl;
public:
TransactionTagCounter(UID thisServerID);
static int64_t costFunction(int64_t bytes) { return bytes / SERVER_KNOBS->READ_COST_BYTE_FACTOR + 1; }
~TransactionTagCounter();
// Update counters tracking the busyness of each tag in the current interval
void addRequest(Optional<TagSet> const& tags, int64_t bytes);
// Save current set of busy tags and reset counters for next interval
void startNewInterval();
std::vector<StorageQueuingMetricsReply::TagInfo> const& getBusiestTags() const { return previousBusiestTags; }
// Returns the set of busiest tags as of the end of the last interval
std::vector<StorageQueuingMetricsReply::TagInfo> const& getBusiestTags() const;
};
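// Illustrative sketch (not part of the original change): the call pattern suggested by the
// comments above. The surrounding names (thisServerID, req, resultBytes) are assumptions
// used only for illustration:
//
//   TransactionTagCounter counter(thisServerID);
//   counter.addRequest(req.tags, resultBytes); // per read request, while the interval is open
//   ...
//   counter.startNewInterval();                // on the metrics polling boundary
//   for (const auto& tagInfo : counter.getBusiestTags()) {
//       // report tagInfo (busiest tags of the previous interval) to ratekeeper
//   }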

View File

@ -1,321 +0,0 @@
/*
* TPCCWorkload.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBSERVER_TPCCWORKLOAD_H
#define FDBSERVER_TPCCWORKLOAD_H
#pragma once
#include "flow/Arena.h"
#include "fdbclient/FDBTypes.h"
#include <boost/preprocessor.hpp>
#include <iomanip>
namespace TPCCWorkload {
// Schema
#define EXPAND(...) __VA_ARGS__
#define EMPTY()
#define DEFER(x) x EMPTY()
// An indirection macro to avoid direct recursion
#define BOOST_PP_SEQ_FOR_EACH_ID() BOOST_PP_SEQ_FOR_EACH generators
#define ROW_CONCAT(prefix, name) prefix##name
#define ROW_TO_STRING(str) #str
#define ROW_ELEMENT_NAME(prefix, element) ROW_CONCAT(prefix, element)
#define ROW_MEMBER(r, data, elem) \
BOOST_PP_TUPLE_ELEM(0, elem) \
ROW_ELEMENT_NAME(data, BOOST_PP_TUPLE_ELEM(1, elem));
#define ROW_MEMBERS_SEQ(prefix, seq) BOOST_PP_SEQ_FOR_EACH(ROW_MEMBER, prefix, seq)
#define ROW_MEMBERS(prefix, tuple) ROW_MEMBERS_SEQ(prefix, BOOST_PP_TUPLE_TO_SEQ(tuple))
#define ROW_SERIALIZE_ELEMENT(r, data, elem) , ROW_ELEMENT_NAME(data, BOOST_PP_TUPLE_ELEM(1, elem))
#define ROW_SERIALIZE_ELEMENTS(prefix, seq) BOOST_PP_SEQ_FOR_EACH(ROW_SERIALIZE_ELEMENT, prefix, seq)
#define ROW_SERIALIZE(prefix, tuple) ar ROW_SERIALIZE_ELEMENTS(prefix, BOOST_PP_TUPLE_TO_SEQ(tuple))
#define ROW_KEY_STEP(r, data, elem) , ROW_ELEMENT_NAME(data, elem)
#define ROW_KEY_LIST_SEQ_EXP(prefix, seq) BOOST_PP_SEQ_FOR_EACH(ROW_KEY_STEP, prefix, seq)
#define ROW_KEY_LIST_SEQ(prefix, seq) ROW_KEY_LIST_SEQ_EXP(prefix, seq)
#define ROW_KEY_LIST(prefix, a) ROW_KEY_LIST_SEQ(prefix, BOOST_PP_ARRAY_TO_SEQ(a))
#define ROW_KEY_LIST_TUPLE(prefix, tuple) ROW_KEY_LIST_SEQ(prefix, BOOST_PP_TUPLE_TO_SEQ(tuple))
#define ROW_KEY_HAS_KEY(Name, prefix, primary_key) \
static constexpr bool HAS_KEY = true; \
StringRef key() { \
auto s = generateKey(#Name, KEY_SIZE ROW_KEY_LIST(prefix, primary_key)); \
return StringRef(arena, s); \
} \
KeyRangeRef keyRange(int dontInclude) { \
auto s = generateKey(#Name, KEY_SIZE - dontInclude ROW_KEY_LIST(prefix, primary_key)); \
KeyRef begin = StringRef(arena, reinterpret_cast<const uint8_t*>(s.c_str()), s.size() + 1); \
KeyRef end = StringRef(arena, reinterpret_cast<const uint8_t*>(s.c_str()), s.size() + 1); \
auto sBegin = mutateString(begin); \
sBegin[s.size()] = uint8_t('/'); \
auto sEnd = mutateString(end); \
sEnd[s.size()] = uint8_t('0'); \
return KeyRangeRef(begin, end); \
}
#define ROW_KEY_NO_KEY static constexpr bool HAS_KEY = false;
#define ROW_KEY_IMPL(Name, prefix, primary_key, sz) \
BOOST_PP_IF(sz, ROW_KEY_HAS_KEY(Name, prefix, primary_key), ROW_KEY_NO_KEY)
#define ROW_KEY(Name, prefix, primary_key) ROW_KEY_IMPL(Name, prefix, primary_key, BOOST_PP_ARRAY_SIZE(primary_key))
#define ROW_INDEX_NAME_KEY(name) ROW_CONCAT(name, Key)
#define ROW_INDEX_NAME_IMPL2(name) ROW_TO_STRING(name)
#define ROW_INDEX_NAME_IMPL(indexName, name) ROW_INDEX_NAME_IMPL2(ROW_CONCAT(indexName, name))
#define ROW_INDEX_NAME(nameTuple, index) \
ROW_INDEX_NAME_IMPL(BOOST_PP_TUPLE_ELEM(0, index), BOOST_PP_TUPLE_ELEM(0, nameTuple))
#define ROW_GENERATE_INDEX(r, data, index) \
StringRef ROW_INDEX_NAME_KEY(BOOST_PP_TUPLE_ELEM(0, index))(int dontInclude = 0) { \
auto s = generateKey(ROW_INDEX_NAME(data, index), \
BOOST_PP_TUPLE_SIZE(index) - dontInclude - \
1 ROW_KEY_LIST_TUPLE(BOOST_PP_TUPLE_ELEM(1, data), BOOST_PP_TUPLE_POP_FRONT(index))); \
return StringRef(arena, s); \
}
#define ROW_GENERATE_INDEXES_LIST(Name, prefix, indexes) \
BOOST_PP_LIST_FOR_EACH(ROW_GENERATE_INDEX, (Name, prefix), indexes)
#define ROW_GENERATE_INDEXES(Name, prefix, indexes) \
ROW_GENERATE_INDEXES_LIST(Name, prefix, BOOST_PP_ARRAY_TO_LIST(indexes))
#define ROW_INDEXES(Name, prefix, indexes) \
BOOST_PP_IF(BOOST_PP_ARRAY_SIZE(indexes), ROW_GENERATE_INDEXES(Name, prefix, indexes), BOOST_PP_EMPTY())
#define ROW(Name, prefix, tuple, primary_key, indexes) \
struct Name { \
constexpr static FileIdentifier file_identifier = __COUNTER__; \
Arena arena; \
ROW_MEMBERS(prefix, tuple) \
template <class Ar> \
void serialize(Ar& ar) { \
serializer(ROW_SERIALIZE(prefix, tuple)); \
} \
static constexpr int KEY_SIZE = BOOST_PP_ARRAY_SIZE(primary_key); \
ROW_KEY(Name, prefix, primary_key) \
ROW_INDEXES(Name, prefix, indexes) \
}
template <class Value>
struct KeyStreamer {
void operator()(std::stringstream& ss, const Value& v) { ss << v; }
};
template <>
struct KeyStreamer<StringRef> {
void operator()(std::stringstream& ss, const StringRef& v) { ss << v.toString(); }
};
template <>
struct KeyStreamer<int> {
void operator()(std::stringstream& ss, const int v) { ss << std::setfill('0') << std::setw(6) << v; }
};
template <>
struct KeyStreamer<short> {
void operator()(std::stringstream& ss, const int v) { ss << std::setfill('0') << std::setw(6) << v; }
};
template <class... Values>
struct KeyGenerator;
template <class Head, class... Tail>
struct KeyGenerator<Head, Tail...> {
static void generate(std::stringstream& ss, int max, Head h, Tail... tail) {
KeyStreamer<Head> streamer;
if (max > 0) {
ss << '/';
streamer(ss, h);
KeyGenerator<Tail...>::generate(ss, max - 1, tail...);
}
}
};
template <>
struct KeyGenerator<> {
static void generate(std::stringstream&, int) {}
};
template <class... Values>
std::string generateKey(const std::string& table, int max, Values... values) {
std::stringstream ss;
ss << table;
if (max > 0) {
KeyGenerator<Values...>::generate(ss, max, values...);
}
return ss.str();
}
ROW(Warehouse,
w_,
((int, id),
(StringRef, name),
(StringRef, street_1),
(StringRef, street_2),
(StringRef, city),
(StringRef, state),
(StringRef, zip),
(double, tax),
(double, ytd)),
(1, (id)),
(0, ()));
ROW(District,
d_,
((int, id),
(int, w_id),
(StringRef, name),
(StringRef, street_1),
(StringRef, street_2),
(StringRef, city),
(StringRef, state),
(StringRef, zip),
(double, tax),
(double, ytd),
(int, next_o_id)),
(2, (w_id, id)),
(0, ()));
ROW(Customer,
c_,
((int, id),
(int, d_id),
(int, w_id),
(StringRef, first),
(StringRef, last),
(StringRef, middle),
(StringRef, street_1),
(StringRef, street_2),
(StringRef, city),
(StringRef, state),
(StringRef, zip),
(StringRef, phone),
(double, since),
(StringRef, credit),
(double, credit_lim),
(double, discount),
(double, balance),
(double, ytd_payment),
(unsigned, payment_cnt),
(unsigned, delivery_count),
(StringRef, data)),
(3, (w_id, d_id, id)),
(1, ((indexLast, w_id, d_id, last, id))));
ROW(History,
h_,
((int, c_id),
(int, c_d_id),
(int, c_w_id),
(int, d_id),
(int, w_id),
(double, date),
(double, amount),
(StringRef, data)),
(0, ()),
(0, ()));
ROW(NewOrder, no_, ((int, o_id), (int, d_id), (int, w_id)), (3, (w_id, d_id, o_id)), (0, ()));
ROW(Order,
o_,
((int, id),
(int, d_id),
(int, w_id),
(int, c_id),
(double, entry_d),
(Optional<short>, carrier_id),
(short, ol_cnt),
(bool, all_local)),
(3, (w_id, d_id, id)),
(0, ()));
ROW(OrderLine,
ol_,
((int, o_id),
(int, d_id),
(int, w_id),
(short, number),
(int, i_id),
(int, supply_w_id),
(Optional<double>, delivery_d),
(short, quantity),
(double, amount),
(StringRef, dist_info)),
(4, (w_id, d_id, o_id, number)),
(0, ()));
ROW(Item, i_, ((int, id), (int, im_id), (StringRef, name), (double, price), (StringRef, data)), (1, (id)), (0, ()));
ROW(Stock,
s_,
((int, i_id),
(int, w_id),
(short, quantity),
(StringRef, dist_01),
(StringRef, dist_02),
(StringRef, dist_03),
(StringRef, dist_04),
(StringRef, dist_05),
(StringRef, dist_06),
(StringRef, dist_07),
(StringRef, dist_08),
(StringRef, dist_09),
(StringRef, dist_10),
(int, ytd),
(short, order_cnt),
(short, remote_cnt),
(StringRef, data)),
(2, (w_id, i_id)),
(0, ()));
#undef FLOW_ACOMPILER_STATE
#define FLOW_ACOMPILER_STATE 1
struct GlobalState {
constexpr static FileIdentifier file_identifier = 1064821;
int CLoad, CRun, CDelta, CId, COlIID;
GlobalState() {
CLoad = deterministicRandom()->randomInt(0, 256);
while (true) {
CDelta = deterministicRandom()->randomInt(65, 120);
if (!(CDelta == 96 || CDelta == 112)) {
break;
}
}
if (CDelta > CLoad) {
CRun = CLoad + CDelta;
} else {
CRun = deterministicRandom()->coinflip() ? CLoad + CDelta : CLoad - CDelta;
}
CId = deterministicRandom()->randomInt(1, 3001);
COlIID = deterministicRandom()->randomInt(1, 100001);
}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, CLoad, CRun, CDelta, CId, COlIID);
}
StringRef key() const { return LiteralStringRef("GlobalState"); }
};
const std::vector<std::string> syllables = {
"BAR", "UGHT", "ABLE", "RI", "PRES", "SE", "ANTI", "ALLY", "ATION", "ING",
};
} // namespace TPCCWorkload
#endif

View File

@ -52,7 +52,9 @@
#include "fdbclient/SystemData.h"
#include "fdbclient/TransactionLineage.h"
#include "fdbclient/VersionedMap.h"
#include "fdbserver/EncryptedMutationMessage.h"
#include "fdbserver/FDBExecHelper.actor.h"
#include "fdbserver/GetEncryptCipherKeys.h"
#include "fdbserver/IKeyValueStore.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/LatencyBandConfig.h"
@ -112,7 +114,7 @@ bool canReplyWith(Error e) {
return true;
default:
return false;
};
}
}
} // namespace
@ -1673,7 +1675,7 @@ ACTOR Future<Void> getValueQ(StorageServer* data, GetValueRequest req) {
}
return Void();
};
}
// Pessimistic estimate the number of overhead bytes used by each
// watch. Watch key references are stored in an AsyncMap<Key,bool>, and actors
@ -2935,7 +2937,7 @@ ACTOR Future<GetValueReqAndResultRef> quickGetValue(StorageServer* data,
} else {
throw quick_get_value_miss();
}
};
}
// If limit>=0, it returns the first rows in the range (sorted ascending), otherwise the last rows (sorted descending).
// readRange has O(|result|) + O(log |data|) cost
@ -3549,7 +3551,7 @@ ACTOR Future<GetRangeReqAndResultRef> quickGetKeyValues(
} else {
throw quick_get_key_values_miss();
}
};
}
void unpackKeyTuple(Tuple** referenceTuple, Optional<Tuple>& keyTuple, KeyValueRef* keyValue) {
if (!keyTuple.present()) {
@ -3798,6 +3800,36 @@ TEST_CASE("/fdbserver/storageserver/constructMappedKey") {
return Void();
}
// Issues a secondary query (either a range or a point read) and fills results into "kvm".
ACTOR Future<Void> mapSubquery(StorageServer* data,
Version version,
GetMappedKeyValuesRequest* pOriginalReq,
Arena* pArena,
int matchIndex,
bool isRangeQuery,
bool isBoundary,
KeyValueRef* it,
MappedKeyValueRef* kvm,
Key mappedKey) {
if (isRangeQuery) {
// Use the mappedKey as the prefix of the range query.
GetRangeReqAndResultRef getRange = wait(quickGetKeyValues(data, mappedKey, version, pArena, pOriginalReq));
if ((!getRange.result.empty() && matchIndex == MATCH_INDEX_MATCHED_ONLY) ||
(getRange.result.empty() && matchIndex == MATCH_INDEX_UNMATCHED_ONLY)) {
kvm->key = it->key;
kvm->value = it->value;
}
kvm->boundaryAndExist = isBoundary && !getRange.result.empty();
kvm->reqAndResult = getRange;
} else {
GetValueReqAndResultRef getValue = wait(quickGetValue(data, mappedKey, version, pArena, pOriginalReq));
kvm->reqAndResult = getValue;
kvm->boundaryAndExist = isBoundary && getValue.result.present();
}
return Void();
}
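// mapKeyValues constructs a mapped key for each row of the input and issues mapSubquery calls for them in
// parallel batches.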
ACTOR Future<GetMappedKeyValuesReply> mapKeyValues(StorageServer* data,
GetKeyValuesReply input,
StringRef mapper,
@ -3827,43 +3859,49 @@ ACTOR Future<GetMappedKeyValuesReply> mapKeyValues(StorageServer* data,
preprocessMappedKey(mappedKeyFormatTuple, vt, isRangeQuery);
state int sz = input.data.size();
state int i = 0;
for (; i < sz; i++) {
state KeyValueRef* it = &input.data[i];
state MappedKeyValueRef kvm;
state bool isBoundary = i == 0 || i == sz - 1;
// need to keep the boundary, so that the caller can use it as a continuation.
if (isBoundary || matchIndex == MATCH_INDEX_ALL) {
kvm.key = it->key;
kvm.value = it->value;
}
state Key mappedKey = constructMappedKey(it, vt, mappedKeyTuple, mappedKeyFormatTuple);
// Make sure the mappedKey is always available, so that it's valid even if we want to get the key asynchronously.
result.arena.dependsOn(mappedKey.arena());
// std::cout << "key:" << printable(kvm.key) << ", value:" << printable(kvm.value)
// << ", mappedKey:" << printable(mappedKey) << std::endl;
if (isRangeQuery) {
// Use the mappedKey as the prefix of the range query.
GetRangeReqAndResultRef getRange =
wait(quickGetKeyValues(data, mappedKey, input.version, &(result.arena), pOriginalReq));
if ((!getRange.result.empty() && matchIndex == MATCH_INDEX_MATCHED_ONLY) ||
(getRange.result.empty() && matchIndex == MATCH_INDEX_UNMATCHED_ONLY)) {
kvm.key = it->key;
kvm.value = it->value;
const int k = std::min(sz, SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE);
state std::vector<MappedKeyValueRef> kvms(k);
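// kvms buffers the results of one batch of parallel subqueries; it is reused for every batch.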
state std::vector<Future<Void>> subqueries;
state int offset = 0;
for (; offset < sz; offset += SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE) {
// Divide into batches of MAX_PARALLEL_QUICK_GET_VALUE subqueries
for (int i = 0; i + offset < sz && i < SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE; i++) {
KeyValueRef* it = &input.data[i + offset];
MappedKeyValueRef* kvm = &kvms[i];
bool isBoundary = (i + offset) == 0 || (i + offset) == sz - 1;
// need to keep the boundary, so that the caller can use it as a continuation.
if (isBoundary || matchIndex == MATCH_INDEX_ALL) {
kvm->key = it->key;
kvm->value = it->value;
} else {
// Clear the key and value to their defaults.
kvm->key = ""_sr;
kvm->value = ""_sr;
}
kvm.boundaryAndExist = isBoundary && !getRange.result.empty();
kvm.reqAndResult = getRange;
} else {
GetValueReqAndResultRef getValue =
wait(quickGetValue(data, mappedKey, input.version, &(result.arena), pOriginalReq));
kvm.reqAndResult = getValue;
kvm.boundaryAndExist = isBoundary && getValue.result.present();
Key mappedKey = constructMappedKey(it, vt, mappedKeyTuple, mappedKeyFormatTuple);
// Make sure the mappedKey is always available, so that it's valid even if we want to get the key asynchronously.
result.arena.dependsOn(mappedKey.arena());
// std::cout << "key:" << printable(kvm->key) << ", value:" << printable(kvm->value)
// << ", mappedKey:" << printable(mappedKey) << std::endl;
subqueries.push_back(mapSubquery(data,
input.version,
pOriginalReq,
&result.arena,
matchIndex,
isRangeQuery,
isBoundary,
it,
kvm,
mappedKey));
}
wait(waitForAll(subqueries));
subqueries.clear();
for (int i = 0; i + offset < sz && i < SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE; i++) {
result.data.push_back(result.arena, kvms[i]);
}
result.data.push_back(result.arena, kvm);
}
return result;
}
@ -6225,7 +6263,7 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
}
return Void();
};
}
AddingShard::AddingShard(StorageServer* server, KeyRangeRef const& keys)
: keys(keys), server(server), transferredVersion(invalidVersion), fetchVersion(invalidVersion), phase(WaitPrevious) {
@ -6948,7 +6986,7 @@ void StorageServer::insertTenant(TenantNameRef tenantName,
tenantMap.createNewVersion(version);
tenantPrefixIndex.createNewVersion(version);
TenantMapEntry tenantEntry = decodeTenantEntry(value);
TenantMapEntry tenantEntry = TenantMapEntry::decode(value);
tenantMap.insert(tenantName, tenantEntry);
tenantPrefixIndex.insert(tenantEntry.prefix, tenantName);
@ -7094,7 +7132,11 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
state UpdateEagerReadInfo eager;
state FetchInjectionInfo fii;
state Reference<ILogSystem::IPeekCursor> cloneCursor2;
state Optional<std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>>> cipherKeys;
state bool collectingCipherKeys = false;
// Collect eager read keys.
// If an encrypted mutation is encountered, we collect its cipher details and fetch the cipher keys, then start over.
loop {
state uint64_t changeCounter = data->shardChangeCounter;
bool epochEnd = false;
@ -7102,6 +7144,8 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
bool firstMutation = true;
bool dbgLastMessageWasProtocol = false;
std::unordered_set<BlobCipherDetails> cipherDetails;
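// Cipher details of encrypted mutations seen while scanning; used to batch-fetch the decryption keys if needed.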
Reference<ILogSystem::IPeekCursor> cloneCursor1 = cursor->cloneNoMore();
cloneCursor2 = cursor->cloneNoMore();
@ -7124,47 +7168,72 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
OTELSpanContextMessage::isNextIn(cloneReader)) {
OTELSpanContextMessage scm;
cloneReader >> scm;
} else if (cloneReader.protocolVersion().hasEncryptionAtRest() &&
EncryptedMutationMessage::isNextIn(cloneReader) && !cipherKeys.present()) {
// Encrypted mutation found, but the cipher keys haven't been fetched yet.
// Collect cipher details to fetch cipher keys in one batch.
EncryptedMutationMessage emm;
cloneReader >> emm;
cipherDetails.insert(emm.header.cipherTextDetails);
cipherDetails.insert(emm.header.cipherHeaderDetails);
collectingCipherKeys = true;
} else {
MutationRef msg;
cloneReader >> msg;
if (cloneReader.protocolVersion().hasEncryptionAtRest() &&
EncryptedMutationMessage::isNextIn(cloneReader)) {
ASSERT(cipherKeys.present());
msg = EncryptedMutationMessage::decrypt(cloneReader, eager.arena, cipherKeys.get());
} else {
cloneReader >> msg;
}
// TraceEvent(SevDebug, "SSReadingLog", data->thisServerID).detail("Mutation", msg);
if (firstMutation && msg.param1.startsWith(systemKeys.end))
hasPrivateData = true;
firstMutation = false;
if (!collectingCipherKeys) {
if (firstMutation && msg.param1.startsWith(systemKeys.end))
hasPrivateData = true;
firstMutation = false;
if (msg.param1 == lastEpochEndPrivateKey) {
epochEnd = true;
ASSERT(dbgLastMessageWasProtocol);
if (msg.param1 == lastEpochEndPrivateKey) {
epochEnd = true;
ASSERT(dbgLastMessageWasProtocol);
}
eager.addMutation(msg);
dbgLastMessageWasProtocol = false;
}
eager.addMutation(msg);
dbgLastMessageWasProtocol = false;
}
}
// Any fetchKeys which are ready to transition their shards to the adding, transferred state do so now.
// If there is an epoch end we skip this step, to increase testability and to prevent inserting a
// version in the middle of a rolled back version range.
while (!hasPrivateData && !epochEnd && !data->readyFetchKeys.empty()) {
auto fk = data->readyFetchKeys.back();
data->readyFetchKeys.pop_back();
fk.send(&fii);
// fetchKeys() will put the data it fetched into fii. Control will not return to this
// actor until it has completed.
if (collectingCipherKeys) {
std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>> getCipherKeysResult =
wait(getEncryptCipherKeys(data->db, cipherDetails));
cipherKeys = getCipherKeysResult;
collectingCipherKeys = false;
eager = UpdateEagerReadInfo();
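// Cipher keys are now available: discard the partial eager reads and rescan the cursor so encrypted
// mutations can be decrypted on the next pass.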
} else {
// Any fetchKeys which are ready to transition their shards to the adding, transferred state do so now.
// If there is an epoch end we skip this step, to increase testability and to prevent inserting a
// version in the middle of a rolled back version range.
while (!hasPrivateData && !epochEnd && !data->readyFetchKeys.empty()) {
auto fk = data->readyFetchKeys.back();
data->readyFetchKeys.pop_back();
fk.send(&fii);
// fetchKeys() will put the data it fetched into fii. Control will not return to this
// actor until it has completed.
}
for (auto& c : fii.changes)
eager.addMutations(c.mutations);
wait(doEagerReads(data, &eager));
if (data->shardChangeCounter == changeCounter)
break;
TEST(true); // A fetchKeys completed while we were doing this, so eager might be outdated. Read it
// again.
// SOMEDAY: Theoretically we could check the change counters of individual shards and retry the reads
// only selectively
eager = UpdateEagerReadInfo();
}
for (auto& c : fii.changes)
eager.addMutations(c.mutations);
wait(doEagerReads(data, &eager));
if (data->shardChangeCounter == changeCounter)
break;
TEST(true); // A fetchKeys completed while we were doing this, so eager might be outdated. Read it
// again.
// SOMEDAY: Theoretically we could check the change counters of individual shards and retry the reads
// only selectively
eager = UpdateEagerReadInfo();
}
data->eagerReadsLatencyHistogram->sampleSeconds(now() - start);
@ -7257,7 +7326,12 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
spanContext = scm.spanContext;
} else {
MutationRef msg;
rd >> msg;
if (rd.protocolVersion().hasEncryptionAtRest() && EncryptedMutationMessage::isNextIn(rd)) {
ASSERT(cipherKeys.present());
msg = EncryptedMutationMessage::decrypt(rd, rd.arena(), cipherKeys.get());
} else {
rd >> msg;
}
Span span("SS:update"_loc, spanContext);
span.addAttribute("key"_sr, msg.param1);
@ -7437,7 +7511,9 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
return Void(); // update will get called again ASAP
} catch (Error& err) {
state Error e = err;
if (e.code() != error_code_worker_removed && e.code() != error_code_please_reboot) {
if (e.code() == error_code_encrypt_keys_fetch_failed) {
TraceEvent(SevWarn, "SSUpdateError", data->thisServerID).error(e).backtrace();
} else if (e.code() != error_code_worker_removed && e.code() != error_code_please_reboot) {
TraceEvent(SevError, "SSUpdateError", data->thisServerID).error(e).backtrace();
} else if (e.code() == error_code_please_reboot) {
wait(data->durableInProgress);
@ -7791,7 +7867,7 @@ void StorageServerDisk::makeNewStorageServerDurable() {
auto view = data->tenantMap.atLatest();
for (auto itr = view.begin(); itr != view.end(); ++itr) {
storage->set(KeyValueRef(itr.key().withPrefix(persistTenantMapKeys.begin), encodeTenantEntry(*itr)));
storage->set(KeyValueRef(itr.key().withPrefix(persistTenantMapKeys.begin), itr->encode()));
}
}
@ -8272,7 +8348,7 @@ ACTOR Future<bool> restoreDurableState(StorageServer* data, IKeyValueStore* stor
for (tenantMapLoc = 0; tenantMapLoc < tenantMap.size(); tenantMapLoc++) {
auto const& result = tenantMap[tenantMapLoc];
TenantName tenantName = result.key.substr(persistTenantMapKeys.begin.size());
TenantMapEntry tenantEntry = decodeTenantEntry(result.value);
TenantMapEntry tenantEntry = TenantMapEntry::decode(result.value);
data->tenantMap.insert(tenantName, tenantEntry);
data->tenantPrefixIndex.insert(tenantEntry.prefix, tenantName);

View File

@ -1415,10 +1415,16 @@ ACTOR Future<Void> traceRole(Role role, UID roleId) {
}
}
ACTOR Future<Void> workerSnapCreate(WorkerSnapRequest snapReq, Standalone<StringRef> snapFolder) {
ACTOR Future<Void> workerSnapCreate(
WorkerSnapRequest snapReq,
std::string snapFolder,
std::map<std::string, WorkerSnapRequest>* snapReqMap /* ongoing snapshot requests */,
std::map<std::string, ErrorOr<Void>>*
snapReqResultMap /* finished snapshot requests, expired in SNAP_MINIMUM_TIME_GAP seconds */) {
state ExecCmdValueString snapArg(snapReq.snapPayload);
state std::string snapReqKey = snapReq.snapUID.toString() + snapReq.role.toString();
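// snapReqKey (UID + role) identifies this request in snapReqMap and snapReqResultMap.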
try {
int err = wait(execHelper(&snapArg, snapReq.snapUID, snapFolder.toString(), snapReq.role.toString()));
int err = wait(execHelper(&snapArg, snapReq.snapUID, snapFolder, snapReq.role.toString()));
std::string uidStr = snapReq.snapUID.toString();
TraceEvent("ExecTraceWorker")
.detail("Uid", uidStr)
@ -1432,11 +1438,15 @@ ACTOR Future<Void> workerSnapCreate(WorkerSnapRequest snapReq, Standalone<String
if (snapReq.role.toString() == "storage") {
printStorageVersionInfo();
}
snapReq.reply.send(Void());
snapReqMap->at(snapReqKey).reply.send(Void());
snapReqMap->erase(snapReqKey);
(*snapReqResultMap)[snapReqKey] = ErrorOr<Void>(Void());
} catch (Error& e) {
TraceEvent("ExecHelperError").errorUnsuppressed(e);
if (e.code() != error_code_operation_cancelled) {
snapReq.reply.sendError(e);
snapReqMap->at(snapReqKey).reply.sendError(e);
snapReqMap->erase(snapReqKey);
(*snapReqResultMap)[snapReqKey] = ErrorOr<Void>(e);
} else {
throw e;
}
@ -1584,6 +1594,11 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
state WorkerCache<InitializeBackupReply> backupWorkerCache;
state WorkerCache<InitializeBlobWorkerReply> blobWorkerCache;
state WorkerSnapRequest lastSnapReq;
// Here the key is UID+role, since duplicate requests may still be sent to a process that is both a storage and a tlog
state std::map<std::string, WorkerSnapRequest> snapReqMap;
state std::map<std::string, ErrorOr<Void>> snapReqResultMap;
state double lastSnapTime = -SERVER_KNOBS->SNAP_MINIMUM_TIME_GAP; // ensures the first snap request always passes the time-gap check
state std::string coordFolder = abspath(_coordFolder);
state WorkerInterface interf(locality);
@ -2497,11 +2512,49 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
loggingTrigger = delay(loggingDelay, TaskPriority::FlushTrace);
}
when(state WorkerSnapRequest snapReq = waitNext(interf.workerSnapReq.getFuture())) {
Standalone<StringRef> snapFolder = StringRef(folder);
if (snapReq.role.toString() == "coord") {
snapFolder = coordFolder;
std::string snapUID = snapReq.snapUID.toString() + snapReq.role.toString();
if (snapReqResultMap.count(snapUID)) {
TEST(true); // Worker received a duplicate finished snap request
auto result = snapReqResultMap[snapUID];
result.isError() ? snapReq.reply.sendError(result.getError()) : snapReq.reply.send(result.get());
TraceEvent("RetryFinishedWorkerSnapRequest")
.detail("SnapUID", snapUID)
.detail("Role", snapReq.role)
.detail("Result", result.isError() ? result.getError().code() : 0);
} else if (snapReqMap.count(snapUID)) {
TEST(true); // Worker received a duplicate ongoing snap request
TraceEvent("RetryOngoingWorkerSnapRequest").detail("SnapUID", snapUID).detail("Role", snapReq.role);
ASSERT(snapReq.role == snapReqMap[snapUID].role);
ASSERT(snapReq.snapPayload == snapReqMap[snapUID].snapPayload);
snapReqMap[snapUID] = snapReq;
} else {
snapReqMap[snapUID] = snapReq; // record the ongoing request
if (g_network->isSimulated() && (now() - lastSnapTime) < SERVER_KNOBS->SNAP_MINIMUM_TIME_GAP) {
// within a short time window, duplicate snapshots on the same process are only allowed for different roles
auto okay = (lastSnapReq.snapUID == snapReq.snapUID) && lastSnapReq.role != snapReq.role;
TraceEvent(okay ? SevInfo : SevError, "RapidSnapRequestsOnSameProcess")
.detail("CurrSnapUID", snapUID)
.detail("PrevSnapUID", lastSnapReq.snapUID)
.detail("CurrRole", snapReq.role)
.detail("PrevRole", lastSnapReq.role)
.detail("GapTime", now() - lastSnapTime);
}
errorForwarders.add(workerSnapCreate(snapReq,
snapReq.role.toString() == "coord" ? coordFolder : folder,
&snapReqMap,
&snapReqResultMap));
auto* snapReqResultMapPtr = &snapReqResultMap;
errorForwarders.add(fmap(
[snapReqResultMapPtr, snapUID](Void _) {
snapReqResultMapPtr->erase(snapUID);
return Void();
},
delay(SERVER_KNOBS->SNAP_MINIMUM_TIME_GAP)));
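// Keep the cached result for SNAP_MINIMUM_TIME_GAP seconds so retries of the same snapshot request can be
// answered, then drop it.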
if (g_network->isSimulated()) {
lastSnapReq = snapReq;
lastSnapTime = now();
}
}
errorForwarders.add(workerSnapCreate(snapReq, snapFolder));
}
when(wait(errorForwarders.getResult())) {}
when(wait(handleErrors)) {}

View File

@ -0,0 +1,74 @@
/*
* GlobalTagThrottling.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/TagThrottle.actor.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
class GlobalTagThrottlingWorkload : public TestWorkload {
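// The tag and quota values below come from the workload options; setup() writes them via ThrottleApi::setTagQuota.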
TransactionTag transactionTag;
double reservedReadQuota{ 0.0 };
double totalReadQuota{ 0.0 };
double reservedWriteQuota{ 0.0 };
double totalWriteQuota{ 0.0 };
ACTOR static Future<Void> setup(GlobalTagThrottlingWorkload* self, Database cx) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
TraceEvent("GlobalTagThrottlingWorkload_SettingTagQuota")
.detail("Tag", self->transactionTag)
.detail("ReservedReadQuota", self->reservedReadQuota)
.detail("TotalReadQuota", self->totalReadQuota)
.detail("ReservedWriteQuota", self->reservedWriteQuota)
.detail("TotalWriteQuota", self->totalWriteQuota);
ThrottleApi::setTagQuota(tr,
self->transactionTag,
self->reservedReadQuota,
self->totalReadQuota,
self->reservedWriteQuota,
self->totalWriteQuota);
wait(tr->commit());
return Void();
} catch (Error& e) {
wait(tr->onError(e));
}
};
}
public:
explicit GlobalTagThrottlingWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
transactionTag = getOption(options, "transactionTag"_sr, "sampleTag"_sr);
reservedReadQuota = getOption(options, "reservedReadQuota"_sr, 0.0);
totalReadQuota = getOption(options, "totalReadQuota"_sr, 0.0);
reservedWriteQuota = getOption(options, "reservedWriteQuota"_sr, 0.0);
totalWriteQuota = getOption(options, "totalWriteQuota"_sr, 0.0);
}
std::string description() const override { return "GlobalTagThrottling"; }
Future<Void> setup(Database const& cx) override { return clientId ? Void() : setup(this, cx); }
Future<Void> start(Database const& cx) override { return Void(); }
Future<bool> check(Database const& cx) override { return true; }
void getMetrics(std::vector<PerfMetric>& m) override {}
};
WorkloadFactory<GlobalTagThrottlingWorkload> GlobalTagThrottlingWorkloadFactory("GlobalTagThrottling");

View File

@ -1,521 +0,0 @@
/*
* TPCC.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "flow/Arena.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbserver/QuietDatabase.h"
#include "fdbserver/workloads/TPCCWorkload.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbclient/ReadYourWrites.h"
#include "flow/actorcompiler.h" // needs to be last include
#undef FLOW_ACOMPILER_STATE
#define FLOW_ACOMPILER_STATE 1
using namespace TPCCWorkload;
namespace {
constexpr char alphaNumerics[] = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F',
'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
'W', 'X', 'Y', 'Z', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0' };
constexpr char numerics[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' };
constexpr const char* originalString = "ORIGINAL";
struct PopulateTPCC : TestWorkload {
static constexpr const char* DESCRIPTION = "PopulateTPCC";
int actorsPerClient;
int warehousesPerActor;
int clientsUsed;
GlobalState gState;
PopulateTPCC(WorkloadContext const& ctx) : TestWorkload(ctx) {
std::string workloadName = DESCRIPTION;
actorsPerClient = getOption(options, LiteralStringRef("actorsPerClient"), 10);
warehousesPerActor = getOption(options, LiteralStringRef("warehousesPerActor"), 30);
clientsUsed = getOption(options, LiteralStringRef("clientsUsed"), 2);
}
int NURand(int C, int A, int x, int y) {
return (((deterministicRandom()->randomInt(0, A + 1) | deterministicRandom()->randomInt(x, y + 1)) + C) %
(y - x + 1)) +
x;
}
StringRef aString(Arena& arena, int x, int y) {
int length = deterministicRandom()->randomInt(x, y + 1);
char* res = new (arena) char[length];
for (int i = 0; i < length; ++i) {
res[i] = alphaNumerics[deterministicRandom()->randomInt(0, sizeof(alphaNumerics))];
}
return StringRef(reinterpret_cast<uint8_t*>(res), length);
}
StringRef nString(Arena& arena, int x, int y) {
int length = deterministicRandom()->randomInt(x, y + 1);
char* res = new (arena) char[length];
for (int i = 0; i < length; ++i) {
res[i] = numerics[deterministicRandom()->randomInt(0, sizeof(numerics))];
}
return StringRef(reinterpret_cast<uint8_t*>(res), length);
}
StringRef genCLast(Arena& arena, int x) {
int l = x % 10;
x /= 10;
int m = x % 10;
x /= 10;
int f = x % 10;
std::stringstream ss;
ss << syllables[f] << syllables[m] << syllables[l];
return StringRef(arena, ss.str());
}
StringRef rndZip(Arena& arena) {
char* result = new (arena) char[9];
for (int i = 0; i < 4; ++i) {
result[i] = numerics[deterministicRandom()->randomInt(0, sizeof(numerics))];
}
for (int i = 4; i < 9; ++i) {
result[i] = '1';
}
return StringRef(reinterpret_cast<uint8_t*>(result), 9);
}
StringRef dataString(Arena& arena) {
if (deterministicRandom()->random01() < 0.1) {
auto str = aString(arena, 26, 51 - strlen(originalString));
char* r = new (arena) char[str.size() + strlen(originalString)];
int pos = deterministicRandom()->randomInt(0, str.size());
std::copy(originalString, originalString + strlen(originalString), r + pos);
auto res = reinterpret_cast<uint8_t*>(r);
std::copy(str.begin(), str.begin() + pos, res);
std::copy(str.begin() + pos, str.end(), res + pos + strlen(originalString));
return StringRef(res, str.size() + strlen(originalString));
} else {
return aString(arena, 26, 51);
}
}
ACTOR static Future<Void> writeGlobalState(PopulateTPCC* self, Database cx) {
state ReadYourWritesTransaction tr(cx);
loop {
tr.reset();
try {
BinaryWriter writer(IncludeVersion());
serializer(writer, self->gState);
tr.set(self->gState.key(), writer.toValue());
wait(tr.commit());
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
ACTOR static Future<Void> readGlobalState(PopulateTPCC* self, Database cx) {
state ReadYourWritesTransaction tr(cx);
loop {
tr.reset();
try {
Optional<Value> val = wait(tr.get(self->gState.key()));
if (val.present()) {
BinaryReader reader(val.get(), IncludeVersion());
serializer(reader, self->gState);
} else {
wait(delay(1.0));
}
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
std::string description() const override { return DESCRIPTION; }
ACTOR static Future<Void> populateItems(PopulateTPCC* self, Database cx) {
state Transaction tr(cx);
state int itemStart = 0;
state int i_id;
for (; itemStart < 100000; itemStart += 100) {
TraceEvent("PopulateItems").detail("Status", itemStart);
loop {
try {
tr.reset();
for (i_id = itemStart; i_id < itemStart + 100; ++i_id) {
Item item;
item.i_id = i_id;
item.i_im_id = deterministicRandom()->randomInt(1, 10001);
item.i_name = self->aString(item.arena, 14, 25);
item.i_price = deterministicRandom()->randomInt64(1.0, 100.0);
item.i_data = self->dataString(item.arena);
BinaryWriter w(IncludeVersion());
serializer(w, item);
tr.set(item.key(), w.toValue(), AddConflictRange::False);
}
wait(tr.commit());
break;
} catch (Error& e) {
TraceEvent("PopulateItemsHandleError").error(e);
wait(tr.onError(e));
}
}
}
TraceEvent("PopulateItemsDone").log();
return Void();
}
ACTOR static Future<Void> populateCustomers(PopulateTPCC* self, Database cx, int w_id, int d_id) {
state Transaction tr(cx);
state int cStart;
state int c_id;
for (cStart = 0; cStart < 3000; cStart += 100) {
TraceEvent("PopulateCustomers")
.detail("Warehouse", w_id)
.detail("District", d_id)
.detail("Customer", cStart);
loop {
for (c_id = cStart; c_id < cStart + 100; ++c_id) {
Customer c;
History h;
c.c_id = c_id;
c.c_d_id = d_id;
c.c_w_id = w_id;
if (c_id < 1000) {
c.c_last = self->genCLast(c.arena, c_id);
} else {
c.c_last = self->genCLast(c.arena, self->NURand(self->gState.CLoad, 255, 0, 999));
}
c.c_middle = LiteralStringRef("OE");
c.c_first = self->aString(c.arena, 8, 16);
c.c_street_1 = self->aString(c.arena, 10, 20);
c.c_street_2 = self->aString(c.arena, 10, 20);
c.c_city = self->aString(c.arena, 10, 20);
c.c_state = self->aString(c.arena, 2, 2);
c.c_zip = self->rndZip(c.arena);
c.c_phone = self->nString(c.arena, 16, 16);
c.c_since = g_network->now();
if (deterministicRandom()->random01() < 0.1) {
c.c_credit = LiteralStringRef("BC");
} else {
c.c_credit = LiteralStringRef("GC");
}
c.c_credit_lim = 50000;
c.c_discount = deterministicRandom()->random01() / 2.0;
c.c_balance = -10.0;
c.c_ytd_payment = 10.0;
c.c_payment_cnt = 1;
c.c_delivery_count = 0;
c.c_data = self->aString(c.arena, 300, 500);
h.h_c_id = c_id;
h.h_c_d_id = d_id;
h.h_d_id = d_id;
h.h_w_id = w_id;
h.h_c_w_id = w_id;
h.h_date = g_network->now();
h.h_amount = 10.0;
h.h_data = self->aString(c.arena, 12, 24);
{
BinaryWriter w(IncludeVersion());
serializer(w, c);
tr.set(c.key(), w.toValue(), AddConflictRange::False);
}
{
// Write index
tr.set(c.indexLastKey(), c.key(), AddConflictRange::False);
}
{
BinaryWriter w(IncludeVersion());
serializer(w, h);
UID k = deterministicRandom()->randomUniqueID();
BinaryWriter kW(Unversioned());
serializer(kW, k);
auto key = kW.toValue().withPrefix(LiteralStringRef("History/"));
tr.set(key, w.toValue(), AddConflictRange::False);
}
}
try {
wait(tr.commit());
break;
} catch (Error& e) {
TraceEvent("PopulateCustomerHandleError").error(e);
wait(tr.onError(e));
}
}
}
TraceEvent("PopulateCustomersDone").detail("Warehouse", w_id).detail("District", d_id);
return Void();
}
ACTOR static Future<Void> populateOrders(PopulateTPCC* self, Database cx, int w_id, int d_id) {
state Transaction tr(cx);
state std::vector<int> customerIds;
state int idStart;
state int o_id;
customerIds.reserve(3000);
for (int i = 0; i < 3000; ++i) {
customerIds.push_back(i);
}
deterministicRandom()->randomShuffle(customerIds);
for (idStart = 0; idStart < 3000; idStart += 100) {
TraceEvent("PopulateOrders").detail("Warehouse", w_id).detail("District", d_id).detail("Order", idStart);
loop {
tr.reset();
for (o_id = idStart; o_id < idStart + 100; ++o_id) {
Order o;
o.o_id = o_id;
o.o_c_id = customerIds[o_id];
o.o_d_id = d_id;
o.o_w_id = w_id;
o.o_entry_d = g_network->now();
if (o_id < 2100) {
o.o_carrier_id = deterministicRandom()->randomInt(1, 11);
}
o.o_ol_cnt = deterministicRandom()->randomInt(5, 16);
o.o_all_local = true;
for (int ol_number = 0; ol_number < o.o_ol_cnt; ++ol_number) {
OrderLine ol;
ol.ol_o_id = o_id;
ol.ol_d_id = d_id;
ol.ol_w_id = w_id;
ol.ol_number = ol_number;
ol.ol_i_id = deterministicRandom()->randomInt(0, 100000);
ol.ol_supply_w_id = w_id;
if (o_id < 2100) {
ol.ol_delivery_d = g_network->now();
ol.ol_amount = 0.0;
} else {
ol.ol_amount = deterministicRandom()->random01() * 10000.0;
}
ol.ol_quantity = 5;
ol.ol_dist_info = self->aString(ol.arena, 24, 24);
BinaryWriter w(IncludeVersion());
serializer(w, ol);
tr.set(ol.key(), w.toValue(), AddConflictRange::False);
}
BinaryWriter w(IncludeVersion());
serializer(w, o);
tr.set(o.key(), w.toValue(), AddConflictRange::False);
}
try {
wait(tr.commit());
break;
} catch (Error& e) {
TraceEvent("PopulateOrderHandleError").error(e);
wait(tr.onError(e));
}
}
}
TraceEvent("PopulateOrdersDone").detail("Warehouse", w_id).detail("District", d_id);
return Void();
}
ACTOR static Future<Void> populateNewOrders(PopulateTPCC* self, Database cx, int w_id, int d_id) {
state Transaction tr(cx);
TraceEvent("PopulateNewOrders").detail("Warehouse", w_id).detail("District", d_id);
loop {
tr.reset();
for (int i = 2100; i < 3000; ++i) {
NewOrder no;
no.no_o_id = i;
no.no_d_id = d_id;
no.no_w_id = w_id;
BinaryWriter w(IncludeVersion());
serializer(w, no);
tr.set(no.key(), w.toValue(), AddConflictRange::False);
}
try {
wait(tr.commit());
break;
} catch (Error& e) {
TraceEvent("PopulateNewOrderHandleError").error(e);
wait(tr.onError(e));
}
}
TraceEvent("PopulateNewOrdersDone").detail("Warehouse", w_id).detail("District", d_id);
return Void();
}
ACTOR static Future<Void> populateDistricts(PopulateTPCC* self, Database cx, int w_id) {
state Transaction tr(cx);
state int d_id;
for (d_id = 0; d_id < 10; ++d_id) {
TraceEvent("PopulateDistricts").detail("Warehouse", w_id).detail("District", d_id);
loop {
tr.reset();
District d;
d.d_id = d_id;
d.d_w_id = w_id;
d.d_name = self->aString(d.arena, 6, 10);
d.d_street_1 = self->aString(d.arena, 10, 20);
d.d_street_2 = self->aString(d.arena, 10, 20);
d.d_city = self->aString(d.arena, 10, 20);
d.d_state = self->aString(d.arena, 2, 2);
d.d_zip = self->rndZip(d.arena);
d.d_tax = deterministicRandom()->random01() * 0.2;
d.d_ytd = 30000;
d.d_next_o_id = 3000;
BinaryWriter w(IncludeVersion());
serializer(w, d);
tr.set(d.key(), w.toValue(), AddConflictRange::False);
try {
wait(tr.commit());
wait(populateCustomers(self, cx, w_id, d_id));
wait(populateOrders(self, cx, w_id, d_id));
wait(populateNewOrders(self, cx, w_id, d_id));
break;
} catch (Error& e) {
TraceEvent("PopulateDistrictHandleError").error(e);
wait(tr.onError(e));
}
}
}
TraceEvent("PopulateDistrictsDone").detail("Warehouse", w_id);
return Void();
}
ACTOR static Future<Void> populateStock(PopulateTPCC* self, Database cx, int w_id) {
state Transaction tr(cx);
state int idStart;
for (idStart = 0; idStart < 100000; idStart += 100) {
TraceEvent("PopulateStock").detail("Warehouse", w_id).detail("i_id", idStart);
loop {
tr.reset();
for (int i = idStart; i < idStart + 100; ++i) {
Stock s;
s.s_i_id = i;
s.s_w_id = w_id;
s.s_quantity = deterministicRandom()->randomInt(1, 101);
s.s_dist_01 = self->aString(s.arena, 24, 25);
s.s_dist_02 = self->aString(s.arena, 24, 25);
s.s_dist_03 = self->aString(s.arena, 24, 25);
s.s_dist_04 = self->aString(s.arena, 24, 25);
s.s_dist_05 = self->aString(s.arena, 24, 25);
s.s_dist_06 = self->aString(s.arena, 24, 25);
s.s_dist_07 = self->aString(s.arena, 24, 25);
s.s_dist_08 = self->aString(s.arena, 24, 25);
s.s_dist_09 = self->aString(s.arena, 24, 25);
s.s_dist_10 = self->aString(s.arena, 24, 25);
s.s_ytd = 0;
s.s_order_cnt = 0;
s.s_remote_cnt = 0;
s.s_data = self->dataString(s.arena);
BinaryWriter w(IncludeVersion());
serializer(w, s);
tr.set(s.key(), w.toValue(), AddConflictRange::False);
}
try {
wait(tr.commit());
break;
} catch (Error& e) {
TraceEvent("PopulateStockHandleError").error(e).detail("Warehouse", w_id);
wait(tr.onError(e));
}
}
}
TraceEvent("PopulateStockDone").detail("Warehouse", w_id);
return Void();
}
ACTOR static Future<Void> populateWarehouse(PopulateTPCC* self, Database cx, int w_id) {
state Transaction tr(cx);
TraceEvent("PopulateWarehouse").detail("W_ID", w_id);
loop {
tr.reset();
try {
Warehouse w;
w.w_id = w_id;
w.w_name = self->aString(w.arena, 6, 11);
w.w_street_1 = self->aString(w.arena, 10, 21);
w.w_street_2 = self->aString(w.arena, 10, 21);
w.w_city = self->aString(w.arena, 10, 21);
w.w_state = self->aString(w.arena, 2, 3);
w.w_tax = deterministicRandom()->random01() * 0.2;
w.w_ytd = 300000;
BinaryWriter writer(IncludeVersion());
serializer(writer, w);
tr.set(w.key(), writer.toValue(), AddConflictRange::False);
wait(tr.commit());
break;
} catch (Error& e) {
TraceEvent("PopulateWarehouseHandleError").error(e).detail("Warehouse", w_id);
wait(tr.onError(e));
}
}
wait(populateStock(self, cx, w_id));
wait(populateDistricts(self, cx, w_id));
TraceEvent("PopulateWarehouseDone").detail("W_ID", w_id);
return Void();
}
ACTOR static Future<Void> populateActor(PopulateTPCC* self, Database cx, int actorId) {
state int startWID =
self->clientId * self->actorsPerClient * self->warehousesPerActor + actorId * self->warehousesPerActor;
state int endWID = startWID + self->warehousesPerActor;
state int wid;
for (wid = startWID; wid < endWID; ++wid) {
wait(populateWarehouse(self, cx, wid));
}
return Void();
}
ACTOR static Future<Void> populate(PopulateTPCC* self, Database cx) {
if (self->clientId == 0) {
wait(writeGlobalState(self, cx));
} else {
wait(readGlobalState(self, cx));
}
if (self->clientId == 0) {
wait(populateItems(self, cx));
}
state std::vector<Future<Void>> populateActors;
state int actorId;
for (actorId = 0; actorId < self->actorsPerClient; ++actorId) {
populateActors.push_back(populateActor(self, cx, actorId));
}
wait(waitForAll(populateActors));
wait(quietDatabase(cx, self->dbInfo, "PopulateTPCC"));
return Void();
}
Future<Void> setup(Database const& cx) override {
if (clientId >= clientsUsed)
return Void();
return populate(this, cx);
}
Future<Void> start(Database const& cx) override { return Void(); }
Future<bool> check(Database const& cx) override { return true; }
void getMetrics(std::vector<PerfMetric>& m) override {}
};
} // namespace
WorkloadFactory<PopulateTPCC> PopulateTPCCWorkloadFactory(PopulateTPCC::DESCRIPTION);

View File

@ -101,7 +101,7 @@ struct ReadHotDetectionWorkload : TestWorkload {
StorageMetrics sm = wait(cx->getStorageMetrics(self->wholeRange, 100));
// TraceEvent("RHDCheckPhaseLog")
// .detail("KeyRangeSize", sm.bytes)
// .detail("KeyRangeReadBandwith", sm.bytesReadPerKSecond);
// .detail("KeyRangeReadBandwidth", sm.bytesReadPerKSecond);
Standalone<VectorRef<ReadHotRangeWithMetrics>> keyRanges = wait(cx->getReadHotRanges(self->wholeRange));
// TraceEvent("RHDCheckPhaseLog")
// .detail("KeyRangesSize", keyRanges.size())

View File

@ -61,6 +61,7 @@ struct ReadWriteCommonImpl {
throw;
}
}
ACTOR static Future<Void> tracePeriodically(ReadWriteCommon* self) {
state double start = now();
state double elapsed = 0.0;
@ -376,6 +377,9 @@ struct ReadWriteWorkload : ReadWriteCommon {
bool adjacentReads; // keys are adjacent within a transaction
bool adjacentWrites;
int extraReadConflictRangesPerTransaction, extraWriteConflictRangesPerTransaction;
Optional<Key> transactionTag;
int transactionsTagThrottled{ 0 };
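// Number of times a transaction hit a tag_throttled error; reported as a metric when transactionTag is set.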
// hot traffic pattern
double hotKeyFraction, forceHotProbability = 0; // key based hot traffic setting
@ -397,6 +401,9 @@ struct ReadWriteWorkload : ReadWriteCommon {
rampUpConcurrency = getOption(options, LiteralStringRef("rampUpConcurrency"), false);
batchPriority = getOption(options, LiteralStringRef("batchPriority"), false);
descriptionString = getOption(options, LiteralStringRef("description"), LiteralStringRef("ReadWrite"));
if (hasOption(options, LiteralStringRef("transactionTag"))) {
transactionTag = getOption(options, LiteralStringRef("transactionTag"), ""_sr);
}
if (rampUpConcurrency)
ASSERT(rampSweepCount == 2); // Implementation is hard coded to ramp up and down
@ -415,15 +422,18 @@ struct ReadWriteWorkload : ReadWriteCommon {
}
}
std::string description() const override { return descriptionString.toString(); }
template <class Trans>
void setupTransaction(Trans* tr) {
void setupTransaction(Trans& tr) {
if (batchPriority) {
tr->setOption(FDBTransactionOptions::PRIORITY_BATCH);
tr.setOption(FDBTransactionOptions::PRIORITY_BATCH);
}
if (transactionTag.present() && tr.getTags().size() == 0) {
tr.setOption(FDBTransactionOptions::AUTO_THROTTLE_TAG, transactionTag.get());
}
}
std::string description() const override { return descriptionString.toString(); }
void getMetrics(std::vector<PerfMetric>& m) override {
ReadWriteCommon::getMetrics(m);
if (!rampUpLoad) {
@ -449,6 +459,9 @@ struct ReadWriteWorkload : ReadWriteCommon {
m.emplace_back("Mean Commit Latency (ms)", 1000 * commitLatencies.mean(), Averaged::True);
m.emplace_back("Median Commit Latency (ms, averaged)", 1000 * commitLatencies.median(), Averaged::True);
m.emplace_back("Max Commit Latency (ms, averaged)", 1000 * commitLatencies.max(), Averaged::True);
if (transactionTag.present()) {
m.emplace_back("Transaction Tag Throttled", transactionsTagThrottled, Averaged::False);
}
}
}
@ -494,11 +507,14 @@ struct ReadWriteWorkload : ReadWriteCommon {
state Transaction tr(cx);
try {
self->setupTransaction(&tr);
self->setupTransaction(tr);
wait(self->readOp(&tr, keys, self, false));
wait(tr.warmRange(allKeys));
break;
} catch (Error& e) {
if (e.code() == error_code_tag_throttled) {
++self->transactionsTagThrottled;
}
wait(tr.onError(e));
}
}
@ -625,7 +641,7 @@ struct ReadWriteWorkload : ReadWriteCommon {
loop {
try {
self->setupTransaction(&tr);
self->setupTransaction(tr);
GRVStartTime = now();
self->transactionFailureMetric->startLatency = -1;

View File

@ -71,14 +71,12 @@ struct SaveAndKillWorkload : TestWorkload {
std::map<NetworkAddress, ISimulator::ProcessInfo*> rebootingProcesses = g_simulator.currentlyRebootingProcesses;
std::map<std::string, ISimulator::ProcessInfo*> allProcessesMap;
for (const auto& [_, process] : rebootingProcesses) {
if (allProcessesMap.find(process->dataFolder) == allProcessesMap.end() &&
process->name != "remote flow process") {
if (allProcessesMap.find(process->dataFolder) == allProcessesMap.end() && !process->isSpawnedKVProcess()) {
allProcessesMap[process->dataFolder] = process;
}
}
for (const auto& process : processes) {
if (allProcessesMap.find(process->dataFolder) == allProcessesMap.end() &&
process->name != "remote flow process") {
if (allProcessesMap.find(process->dataFolder) == allProcessesMap.end() && !process->isSpawnedKVProcess()) {
allProcessesMap[process->dataFolder] = process;
}
}

View File

@ -1,825 +0,0 @@
/*
* TPCC.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbserver/workloads/TPCCWorkload.h"
#include <fdbclient/ReadYourWrites.h>
#include "flow/actorcompiler.h" // has to be last include
using namespace TPCCWorkload;
namespace {
struct TPCCMetrics {
static constexpr int latenciesStored = 1000;
uint64_t successfulStockLevelTransactions{ 0 };
uint64_t failedStockLevelTransactions{ 0 };
uint64_t successfulDeliveryTransactions{ 0 };
uint64_t failedDeliveryTransactions{ 0 };
uint64_t successfulOrderStatusTransactions{ 0 };
uint64_t failedOrderStatusTransactions{ 0 };
uint64_t successfulPaymentTransactions{ 0 };
uint64_t failedPaymentTransactions{ 0 };
uint64_t successfulNewOrderTransactions{ 0 };
uint64_t failedNewOrderTransactions{ 0 };
double stockLevelResponseTime{ 0.0 };
double deliveryResponseTime{ 0.0 };
double orderStatusResponseTime{ 0.0 };
double paymentResponseTime{ 0.0 };
double newOrderResponseTime{ 0.0 };
std::vector<double> stockLevelLatencies, deliveryLatencies, orderStatusLatencies, paymentLatencies,
newOrderLatencies;
void sort() {
std::sort(stockLevelLatencies.begin(), stockLevelLatencies.end());
std::sort(deliveryLatencies.begin(), deliveryLatencies.end());
std::sort(orderStatusLatencies.begin(), orderStatusLatencies.end());
std::sort(paymentLatencies.begin(), paymentLatencies.end());
std::sort(newOrderLatencies.begin(), newOrderLatencies.end());
}
static double median(const std::vector<double>& latencies) {
// assumes latencies is sorted
return latencies[latencies.size() / 2];
}
static double percentile_90(const std::vector<double>& latencies) {
// assumes latencies is sorted
return latencies[(9 * latencies.size()) / 10];
}
static double percentile_99(const std::vector<double>& latencies) {
// assumes latencies is sorted
return latencies[(99 * latencies.size()) / 100];
}
static void updateMetrics(bool committed,
uint64_t& successCounter,
uint64_t& failedCounter,
double txnStartTime,
std::vector<double>& latencies,
double& totalLatency,
std::string txnType) {
auto responseTime = g_network->now() - txnStartTime;
if (committed) {
totalLatency += responseTime;
++successCounter;
if (successCounter <= latenciesStored)
latencies[successCounter - 1] = responseTime;
else {
auto index = deterministicRandom()->randomInt(0, successCounter);
if (index < latenciesStored) {
latencies[index] = responseTime;
}
}
} else {
++failedCounter;
}
TraceEvent("TransactionComplete")
.detail("TransactionType", txnType)
.detail("Latency", responseTime)
.detail("Begin", txnStartTime)
.detail("End", txnStartTime + responseTime)
.detail("Success", committed);
}
};
struct TPCC : TestWorkload {
static constexpr const char* DESCRIPTION = "TPCC";
int warehousesPerClient;
int expectedTransactionsPerMinute;
int testDuration;
int warmupTime;
int clientsUsed;
double startTime;
GlobalState gState;
TPCCMetrics metrics;
TPCC(WorkloadContext const& ctx) : TestWorkload(ctx) {
std::string workloadName = DESCRIPTION;
warehousesPerClient = getOption(options, LiteralStringRef("warehousesPerClient"), 100);
expectedTransactionsPerMinute = getOption(options, LiteralStringRef("expectedTransactionsPerMinute"), 1000);
testDuration = getOption(options, LiteralStringRef("testDuration"), 600);
warmupTime = getOption(options, LiteralStringRef("warmupTime"), 30);
clientsUsed = getOption(options, LiteralStringRef("clientsUsed"), 40);
}
int NURand(int C, int A, int x, int y) {
return (((deterministicRandom()->randomInt(0, A + 1) | deterministicRandom()->randomInt(x, y + 1)) + C) %
(y - x + 1)) +
x;
}
StringRef genCLast(Arena& arena, int x) {
int l = x % 10;
x /= 10;
int m = x % 10;
x /= 10;
int f = x % 10;
std::stringstream ss;
ss << syllables[f] << syllables[m] << syllables[l];
return StringRef(arena, ss.str());
}
// Should be called in setup
ACTOR static Future<Void> readGlobalState(TPCC* self, Database cx) {
state ReadYourWritesTransaction tr(cx);
loop {
tr.reset();
try {
Optional<Value> val = wait(tr.get(self->gState.key()));
if (val.present()) {
BinaryReader reader(val.get(), IncludeVersion());
serializer(reader, self->gState);
} else {
wait(delay(1.0));
}
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
std::string description() const override { return DESCRIPTION; }
// Transactions
ACTOR static Future<bool> newOrder(TPCC* self, Database cx, int w_id) {
state int d_id = deterministicRandom()->randomInt(0, 10);
state int c_id = self->NURand(self->gState.CRun, 1023, 1, 3000) - 1;
state int ol_cnt = deterministicRandom()->randomInt(5, 16);
state bool willRollback = deterministicRandom()->randomInt(1, 100) == 1;
state ReadYourWritesTransaction tr(cx);
try {
state Warehouse warehouse;
warehouse.w_id = w_id;
Optional<Value> wValue = wait(tr.get(warehouse.key()));
ASSERT(wValue.present());
{
BinaryReader r(wValue.get(), IncludeVersion());
serializer(r, warehouse);
}
state District district;
district.d_w_id = w_id;
district.d_id = d_id;
Optional<Value> dValue = wait(tr.get(district.key()));
ASSERT(dValue.present());
{
BinaryReader r(dValue.get(), IncludeVersion());
serializer(r, district);
}
state Customer customer;
customer.c_id = c_id;
customer.c_w_id = w_id;
customer.c_d_id = d_id;
Optional<Value> cValue = wait(tr.get(customer.key()));
ASSERT(cValue.present());
{
BinaryReader r(cValue.get(), IncludeVersion());
serializer(r, customer);
}
state Order order;
order.o_entry_d = g_network->now();
order.o_c_id = c_id;
order.o_d_id = d_id;
order.o_w_id = w_id;
order.o_ol_cnt = ol_cnt;
order.o_id = district.d_next_o_id;
++district.d_next_o_id;
{
BinaryWriter w(IncludeVersion());
serializer(w, district);
tr.set(district.key(), w.toValue());
}
state NewOrder newOrder;
newOrder.no_w_id = w_id;
newOrder.no_d_id = d_id;
newOrder.no_o_id = order.o_id;
state int ol_id = 0;
state bool allLocal = true;
for (; ol_id < order.o_ol_cnt; ++ol_id) {
if (ol_id + 1 == order.o_ol_cnt && willRollback) {
// Simulated abort - order item not found
return false;
}
state OrderLine orderLine;
orderLine.ol_number = ol_id;
orderLine.ol_w_id = w_id;
orderLine.ol_d_id = d_id;
orderLine.ol_supply_w_id = w_id;
orderLine.ol_o_id = order.o_id;
orderLine.ol_i_id = self->NURand(self->gState.CRun, 8191, 1, 100000) - 1;
orderLine.ol_quantity = deterministicRandom()->randomInt(1, 11);
if (deterministicRandom()->randomInt(0, 100) == 0) {
orderLine.ol_supply_w_id =
deterministicRandom()->randomInt(0, self->clientsUsed * self->warehousesPerClient);
}
state Item item;
item.i_id = orderLine.ol_i_id;
orderLine.ol_i_id = item.i_id;
Optional<Value> iValue = wait(tr.get(item.key()));
ASSERT(iValue.present());
{
BinaryReader r(iValue.get(), IncludeVersion());
serializer(r, item);
}
state Stock stock;
stock.s_i_id = item.i_id;
stock.s_w_id = orderLine.ol_supply_w_id;
Optional<Value> sValue = wait(tr.get(stock.key()));
ASSERT(sValue.present());
{
BinaryReader r(sValue.get(), IncludeVersion());
serializer(r, stock);
}
if (stock.s_quantity - orderLine.ol_quantity >= 10) {
stock.s_quantity -= orderLine.ol_quantity;
} else {
stock.s_quantity = (stock.s_quantity - orderLine.ol_quantity) + 91;
}
stock.s_ytd += orderLine.ol_quantity;
stock.s_order_cnt += 1;
if (orderLine.ol_supply_w_id != w_id) {
stock.s_remote_cnt += 1;
allLocal = false;
}
{
BinaryWriter w(IncludeVersion());
serializer(w, stock);
tr.set(stock.key(), w.toValue());
}
orderLine.ol_amount = orderLine.ol_quantity * item.i_price;
switch (orderLine.ol_d_id) {
case 0:
orderLine.ol_dist_info = stock.s_dist_01;
break;
case 1:
orderLine.ol_dist_info = stock.s_dist_02;
break;
case 2:
orderLine.ol_dist_info = stock.s_dist_03;
break;
case 3:
orderLine.ol_dist_info = stock.s_dist_04;
break;
case 4:
orderLine.ol_dist_info = stock.s_dist_05;
break;
case 5:
orderLine.ol_dist_info = stock.s_dist_06;
break;
case 6:
orderLine.ol_dist_info = stock.s_dist_07;
break;
case 7:
orderLine.ol_dist_info = stock.s_dist_08;
break;
case 8:
orderLine.ol_dist_info = stock.s_dist_09;
break;
case 9:
orderLine.ol_dist_info = stock.s_dist_10;
break;
}
{
BinaryWriter w(IncludeVersion());
serializer(w, orderLine);
tr.set(orderLine.key(), w.toValue());
}
}
order.o_all_local = allLocal;
{
BinaryWriter w(IncludeVersion());
serializer(w, order);
tr.set(order.key(), w.toValue());
}
{
BinaryWriter w(IncludeVersion());
serializer(w, newOrder);
tr.set(newOrder.key(), w.toValue());
}
wait(tr.commit());
} catch (Error& e) {
return false;
}
return true;
}
ACTOR static Future<Customer> getRandomCustomer(TPCC* self, ReadYourWritesTransaction* tr, int w_id, int d_id) {
state Customer result;
result.c_w_id = w_id;
result.c_d_id = d_id;
if (deterministicRandom()->randomInt(0, 100) >= 85) {
result.c_d_id = deterministicRandom()->randomInt(0, 10);
result.c_w_id = deterministicRandom()->randomInt(0, self->clientsUsed * self->warehousesPerClient);
}
if (deterministicRandom()->randomInt(0, 100) < 60) {
// select through last name
result.c_last = self->genCLast(result.arena, self->NURand(self->gState.CRun, 1023, 1, 3000) - 1);
auto s = result.indexLastKey(1);
auto begin = new (result.arena) uint8_t[s.size() + 1];
auto end = new (result.arena) uint8_t[s.size() + 1];
memcpy(begin, s.begin(), s.size());
memcpy(end, s.begin(), s.size());
begin[s.size()] = '/';
end[s.size()] = '0';
state RangeResult range =
wait(tr->getRange(KeyRangeRef(StringRef(begin, s.size() + 1), StringRef(end, s.size() + 1)), 1000));
ASSERT(range.size() > 0);
state std::vector<Customer> customers;
state int i = 0;
for (; i < range.size(); ++i) {
Optional<Value> cValue = wait(tr->get(range[i].value));
ASSERT(cValue.present());
BinaryReader r(cValue.get(), IncludeVersion());
state Customer customer;
serializer(r, customer);
customers.push_back(customer);
}
// Sort customers by first name and choose median
std::sort(customers.begin(), customers.end(), [](const Customer& cus1, const Customer& cus2) {
const std::string cus1Name = cus1.c_first.toString();
const std::string cus2Name = cus2.c_first.toString();
return (cus1Name.compare(cus2Name) < 0);
});
result = customers[customers.size() / 2];
} else {
// select through random id
result.c_id = self->NURand(self->gState.CRun, 1023, 1, 3000) - 1;
Optional<Value> val = wait(tr->get(result.key()));
ASSERT(val.present());
BinaryReader r(val.get(), IncludeVersion());
serializer(r, result);
}
return result;
}
ACTOR static Future<bool> payment(TPCC* self, Database cx, int w_id) {
state ReadYourWritesTransaction tr(cx);
state int d_id = deterministicRandom()->randomInt(0, 10);
state History history;
state Warehouse warehouse;
state District district;
history.h_amount = deterministicRandom()->random01() * 4999.0 + 1.0;
history.h_date = g_network->now();
try {
// get the customer
state Customer customer = wait(getRandomCustomer(self, &tr, w_id, d_id));
warehouse.w_id = w_id;
Optional<Value> wValue = wait(tr.get(warehouse.key()));
ASSERT(wValue.present());
{
BinaryReader r(wValue.get(), IncludeVersion());
serializer(r, warehouse);
}
warehouse.w_ytd += history.h_amount;
{
BinaryWriter w(IncludeVersion());
serializer(w, warehouse);
tr.set(warehouse.key(), w.toValue());
}
district.d_w_id = w_id;
district.d_id = d_id;
Optional<Value> dValue = wait(tr.get(district.key()));
ASSERT(dValue.present());
{
BinaryReader r(dValue.get(), IncludeVersion());
serializer(r, district);
}
district.d_ytd += history.h_amount;
customer.c_balance -= history.h_amount;
customer.c_ytd_payment += history.h_amount;
customer.c_payment_cnt += 1;
if (customer.c_credit == LiteralStringRef("BC")) {
// we must update c_data
std::stringstream ss;
ss << customer.c_id << "," << customer.c_d_id << "," << customer.c_w_id << "," << district.d_id << ","
<< w_id << history.h_amount << ";";
auto s = ss.str();
auto len = std::min(int(s.size()) + customer.c_data.size(), 500);
auto data = new (customer.arena) uint8_t[len];
std::copy(s.begin(), s.end(), reinterpret_cast<char*>(data));
std::copy(customer.c_data.begin(), customer.c_data.begin() + len - s.size(), data);
customer.c_data = StringRef(data, len);
}
{
BinaryWriter w(IncludeVersion());
serializer(w, customer);
tr.set(customer.key(), w.toValue());
}
std::stringstream ss;
ss << warehouse.w_name.toString() << " " << district.d_name.toString();
history.h_data = StringRef(history.arena, ss.str());
history.h_c_id = customer.c_id;
history.h_c_d_id = customer.c_d_id;
history.h_c_w_id = customer.c_w_id;
history.h_d_id = d_id;
history.h_w_id = w_id;
{
BinaryWriter w(IncludeVersion());
serializer(w, history);
UID k = deterministicRandom()->randomUniqueID();
BinaryWriter kW(Unversioned());
serializer(kW, k);
auto key = kW.toValue().withPrefix(LiteralStringRef("History/"));
tr.set(key, w.toValue());
}
wait(tr.commit());
} catch (Error& e) {
return false;
}
return true;
}
ACTOR static Future<bool> orderStatus(TPCC* self, Database cx, int w_id) {
state ReadYourWritesTransaction tr(cx);
state int d_id = deterministicRandom()->randomInt(0, 10);
state int i;
state Order order;
state std::vector<OrderLine> orderLines;
try {
state Customer customer = wait(getRandomCustomer(self, &tr, w_id, d_id));
order.o_w_id = customer.c_w_id;
order.o_d_id = customer.c_d_id;
order.o_c_id = customer.c_id;
RangeResult range = wait(tr.getRange(order.keyRange(1), 1, Snapshot::False, Reverse::True));
ASSERT(range.size() > 0);
{
BinaryReader r(range[0].value, IncludeVersion());
serializer(r, order);
}
for (i = 0; i < order.o_ol_cnt; ++i) {
OrderLine orderLine;
orderLine.ol_w_id = order.o_w_id;
orderLine.ol_d_id = order.o_d_id;
orderLine.ol_o_id = order.o_id;
orderLine.ol_number = i;
Optional<Value> olValue = wait(tr.get(orderLine.key()));
ASSERT(olValue.present());
BinaryReader r(olValue.get(), IncludeVersion());
OrderLine ol;
serializer(r, ol);
orderLines.push_back(ol);
}
} catch (Error& e) {
return false;
}
return true;
}
ACTOR static Future<bool> delivery(TPCC* self, Database cx, int w_id) {
state ReadYourWritesTransaction tr(cx);
state int carrier_id = deterministicRandom()->randomInt(0, 10);
state int d_id;
state NewOrder newOrder;
state Order order;
state double sumAmount = 0.0;
state Customer customer;
state int i;
try {
for (d_id = 0; d_id < 10; ++d_id) {
newOrder.no_w_id = w_id;
newOrder.no_d_id = d_id;
RangeResult range = wait(tr.getRange(newOrder.keyRange(1), 1));
if (range.size() > 0) {
{
BinaryReader r(range[0].value, IncludeVersion());
serializer(r, newOrder);
}
tr.clear(newOrder.key());
order.o_w_id = w_id;
order.o_d_id = d_id;
order.o_id = newOrder.no_o_id;
Optional<Value> oValue = wait(tr.get(order.key()));
ASSERT(oValue.present());
{
BinaryReader r(oValue.get(), IncludeVersion());
serializer(r, order);
}
order.o_carrier_id = carrier_id;
{
BinaryWriter w(IncludeVersion());
serializer(w, order);
tr.set(order.key(), w.toValue());
}
for (i = 0; i < order.o_ol_cnt; ++i) {
state OrderLine orderLine;
orderLine.ol_w_id = order.o_w_id;
orderLine.ol_d_id = order.o_d_id;
orderLine.ol_o_id = order.o_id;
orderLine.ol_number = i;
Optional<Value> olV = wait(tr.get(orderLine.key()));
ASSERT(olV.present());
BinaryReader r(olV.get(), IncludeVersion());
serializer(r, orderLine);
orderLine.ol_delivery_d = g_network->now();
sumAmount += orderLine.ol_amount;
}
customer.c_w_id = w_id;
customer.c_d_id = d_id;
customer.c_id = order.o_c_id;
Optional<Value> cV = wait(tr.get(customer.key()));
ASSERT(cV.present());
{
BinaryReader r(cV.get(), IncludeVersion());
serializer(r, customer);
}
customer.c_balance += sumAmount;
customer.c_delivery_count += 1;
{
BinaryWriter w(IncludeVersion());
serializer(w, customer);
tr.set(customer.key(), w.toValue());
}
wait(tr.commit());
}
}
} catch (Error& e) {
return false;
}
return true;
}
ACTOR static Future<bool> stockLevel(TPCC* self, Database cx, int w_id, int d_id) {
state int threshold = deterministicRandom()->randomInt(10, 21);
state Transaction tr(cx);
state District district;
state OrderLine orderLine;
state Stock stock;
state int ol_o_id;
state int low_stock = 0;
state int i;
try {
district.d_w_id = w_id;
district.d_id = d_id;
Optional<Value> dV = wait(tr.get(district.key()));
ASSERT(dV.present());
{
BinaryReader r(dV.get(), IncludeVersion());
serializer(r, district);
}
for (ol_o_id = district.d_next_o_id - 20; ol_o_id < district.d_next_o_id; ++ol_o_id) {
orderLine.ol_w_id = w_id;
orderLine.ol_d_id = d_id;
orderLine.ol_o_id = ol_o_id;
state RangeResult range = wait(tr.getRange(orderLine.keyRange(1), CLIENT_KNOBS->TOO_MANY));
ASSERT(!range.more);
ASSERT(range.size() > 0);
for (i = 0; i < range.size(); ++i) {
{
BinaryReader r(range[i].value, IncludeVersion());
serializer(r, orderLine);
}
stock.s_i_id = orderLine.ol_i_id;
stock.s_w_id = orderLine.ol_w_id;
Optional<Value> sV = wait(tr.get(stock.key()));
ASSERT(sV.present());
{
BinaryReader r(sV.get(), IncludeVersion());
serializer(r, stock);
}
if (stock.s_quantity < threshold) {
++low_stock;
}
}
}
} catch (Error& e) {
return false;
}
return true;
}
ACTOR static Future<Void> emulatedUser(TPCC* self, Database cx, int w_id, int d_id) {
// stagger users
wait(delay(20.0 * deterministicRandom()->random01()));
TraceEvent("StartingEmulatedUser").detail("Warehouse", w_id).detail("District", d_id);
loop {
auto type = deterministicRandom()->randomInt(0, 100);
Future<bool> tx;
state double txnStartTime = g_network->now();
if (type < 4) {
tx = stockLevel(self, cx, w_id, d_id);
bool committed = wait(tx);
if (self->recordMetrics()) {
TPCCMetrics::updateMetrics(committed,
self->metrics.successfulStockLevelTransactions,
self->metrics.failedStockLevelTransactions,
txnStartTime,
self->metrics.stockLevelLatencies,
self->metrics.stockLevelResponseTime,
"StockLevel");
}
wait(delay(2 + deterministicRandom()->random01() * 10));
} else if (type < 8) {
tx = delivery(self, cx, w_id);
bool committed = wait(tx);
if (self->recordMetrics()) {
TPCCMetrics::updateMetrics(committed,
self->metrics.successfulDeliveryTransactions,
self->metrics.failedDeliveryTransactions,
txnStartTime,
self->metrics.deliveryLatencies,
self->metrics.deliveryResponseTime,
"Delivery");
}
wait(delay(2 + deterministicRandom()->random01() * 10));
} else if (type < 12) {
tx = orderStatus(self, cx, w_id);
bool committed = wait(tx);
if (self->recordMetrics()) {
TPCCMetrics::updateMetrics(committed,
self->metrics.successfulOrderStatusTransactions,
self->metrics.failedOrderStatusTransactions,
txnStartTime,
self->metrics.orderStatusLatencies,
self->metrics.orderStatusResponseTime,
"OrderStatus");
}
wait(delay(2 + deterministicRandom()->random01() * 20));
} else if (type < 55) {
tx = payment(self, cx, w_id);
bool committed = wait(tx);
if (self->recordMetrics()) {
TPCCMetrics::updateMetrics(committed,
self->metrics.successfulPaymentTransactions,
self->metrics.failedPaymentTransactions,
txnStartTime,
self->metrics.paymentLatencies,
self->metrics.paymentResponseTime,
"Payment");
}
wait(delay(3 + deterministicRandom()->random01() * 24));
} else {
tx = newOrder(self, cx, w_id);
bool committed = wait(tx);
if (self->recordMetrics()) {
TPCCMetrics::updateMetrics(committed,
self->metrics.successfulNewOrderTransactions,
self->metrics.failedNewOrderTransactions,
txnStartTime,
self->metrics.newOrderLatencies,
self->metrics.newOrderResponseTime,
"NewOrder");
}
wait(delay(18 + deterministicRandom()->random01() * 24));
}
}
}
double transactionsPerMinute() const {
return metrics.successfulNewOrderTransactions * 60.0 / (testDuration - 2 * warmupTime);
}
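// Worked example (added for illustration; values taken from the removed TPCC.txt test file):
// with testDuration=3600s and warmupTime=300s the measurement window is 3600 - 2*300 = 3000
// seconds = 50 minutes, so e.g. 5000 successful NewOrder transactions would be reported as
// 5000 * 60.0 / 3000 = 100 transactions per minute.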
bool recordMetrics() const {
auto now = g_network->now();
return (now > startTime + warmupTime && now < startTime + testDuration - warmupTime);
}
Future<Void> start(Database const& cx) override {
if (clientId >= clientsUsed)
return Void();
return _start(cx, this);
}
ACTOR Future<Void> _start(Database cx, TPCC* self) {
wait(readGlobalState(self, cx));
self->startTime = g_network->now();
int startWID = self->clientId * self->warehousesPerClient;
int endWID = startWID + self->warehousesPerClient;
state int w_id;
state int d_id;
state std::vector<Future<Void>> emulatedUsers;
for (w_id = startWID; w_id < endWID; ++w_id) {
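// One emulated user is started per district; TPC-C models 10 districts per warehouse.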
for (d_id = 0; d_id < 10; ++d_id) {
emulatedUsers.push_back(timeout(emulatedUser(self, cx, w_id, d_id), self->testDuration, Void()));
}
}
wait(waitForAll(emulatedUsers));
return Void();
}
Future<bool> check(Database const& cx) override {
return (transactionsPerMinute() > expectedTransactionsPerMinute);
}
void getMetrics(std::vector<PerfMetric>& m) override {
double multiplier = static_cast<double>(clientCount) / static_cast<double>(clientsUsed);
m.emplace_back("Transactions Per Minute", transactionsPerMinute(), Averaged::False);
m.emplace_back("Successful StockLevel Transactions", metrics.successfulStockLevelTransactions, Averaged::False);
m.emplace_back("Successful Delivery Transactions", metrics.successfulDeliveryTransactions, Averaged::False);
m.emplace_back(
"Successful OrderStatus Transactions", metrics.successfulOrderStatusTransactions, Averaged::False);
m.emplace_back("Successful Payment Transactions", metrics.successfulPaymentTransactions, Averaged::False);
m.emplace_back("Successful NewOrder Transactions", metrics.successfulNewOrderTransactions, Averaged::False);
m.emplace_back("Failed StockLevel Transactions", metrics.failedStockLevelTransactions, Averaged::False);
m.emplace_back("Failed Delivery Transactions", metrics.failedDeliveryTransactions, Averaged::False);
m.emplace_back("Failed OrderStatus Transactions", metrics.failedOrderStatusTransactions, Averaged::False);
m.emplace_back("Failed Payment Transactions", metrics.failedPaymentTransactions, Averaged::False);
m.emplace_back("Failed NewOrder Transactions", metrics.failedNewOrderTransactions, Averaged::False);
m.emplace_back("Mean StockLevel Latency",
(clientId < clientsUsed)
? (multiplier * metrics.stockLevelResponseTime / metrics.successfulStockLevelTransactions)
: 0.0,
Averaged::True);
m.emplace_back("Mean Delivery Latency",
(clientId < clientsUsed)
? (multiplier * metrics.deliveryResponseTime / metrics.successfulDeliveryTransactions)
: 0.0,
Averaged::True);
m.emplace_back("Mean OrderStatus Repsonse Time",
(clientId < clientsUsed)
? (multiplier * metrics.orderStatusResponseTime / metrics.successfulOrderStatusTransactions)
: 0.0,
Averaged::True);
m.emplace_back("Mean Payment Latency",
(clientId < clientsUsed)
? (multiplier * metrics.paymentResponseTime / metrics.successfulPaymentTransactions)
: 0.0,
Averaged::True);
m.emplace_back("Mean NewOrder Latency",
(clientId < clientsUsed)
? (multiplier * metrics.newOrderResponseTime / metrics.successfulNewOrderTransactions)
: 0.0,
Averaged::True);
metrics.sort();
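// The latency vectors are sorted so the median/percentile values below are taken from ordered
// samples; TPCCMetrics::median/percentile_90/percentile_99 are not shown in this hunk and are
// presumably simple rank-based lookups such as sorted[size_t(0.9 * (sorted.size() - 1))].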
m.emplace_back(
"Median StockLevel Latency", multiplier * TPCCMetrics::median(metrics.stockLevelLatencies), Averaged::True);
m.emplace_back(
"Median Delivery Latency", multiplier * TPCCMetrics::median(metrics.deliveryLatencies), Averaged::True);
m.emplace_back("Median OrderStatus Latency",
multiplier * TPCCMetrics::median(metrics.orderStatusLatencies),
Averaged::True);
m.emplace_back(
"Median Payment Latency", multiplier * TPCCMetrics::median(metrics.paymentLatencies), Averaged::True);
m.emplace_back(
"Median NewOrder Latency", multiplier * TPCCMetrics::median(metrics.newOrderLatencies), Averaged::True);
m.emplace_back("90th Percentile StockLevel Latency",
multiplier * TPCCMetrics::percentile_90(metrics.stockLevelLatencies),
Averaged::True);
m.emplace_back("90th Percentile Delivery Latency",
multiplier * TPCCMetrics::percentile_90(metrics.deliveryLatencies),
Averaged::True);
m.emplace_back("90th Percentile OrderStatus Latency",
multiplier * TPCCMetrics::percentile_90(metrics.orderStatusLatencies),
Averaged::True);
m.emplace_back("90th Percentile Payment Latency",
multiplier * TPCCMetrics::percentile_90(metrics.paymentLatencies),
Averaged::True);
m.emplace_back("90th Percentile NewOrder Latency",
multiplier * TPCCMetrics::percentile_90(metrics.newOrderLatencies),
Averaged::True);
m.emplace_back("99th Percentile StockLevel Latency",
multiplier * TPCCMetrics::percentile_99(metrics.stockLevelLatencies),
Averaged::True);
m.emplace_back("99th Percentile Delivery Latency",
multiplier * TPCCMetrics::percentile_99(metrics.deliveryLatencies),
Averaged::True);
m.emplace_back("99th Percentile OrderStatus Latency",
multiplier * TPCCMetrics::percentile_99(metrics.orderStatusLatencies),
Averaged::True);
m.emplace_back("99th Percentile Payment Latency",
multiplier * TPCCMetrics::percentile_99(metrics.paymentLatencies),
Averaged::True);
m.emplace_back("99th Percentile NewOrder Latency",
multiplier * TPCCMetrics::percentile_99(metrics.newOrderLatencies),
Averaged::True);
}
};
} // namespace
WorkloadFactory<TPCC> TPCCWorkloadFactory(TPCC::DESCRIPTION);

@ -1,6 +1,6 @@
set(FDBSERVICE_SRCS FDBService.cpp ServiceBase.cpp)
add_executable(fdbmonitor ${FDBSERVICE_SRCS})
target_include_directories(fdbmonitor PRIVATE ${CMAKE_BINARY_DIR}/flow/include ${CMAKE_BINARY_DIR}/fdbclient/include)
add_dependencies(fdbmonitor fdbclient)
get_target_property(fdbclient_target_includes fdbclient INCLUDE_DIRECTORIES)
target_link_libraries(fdbmonitor PUBLIC SimpleOpt)
target_include_directories(fdbmonitor PUBLIC "${fdbclient_target_includes}")

@ -21,6 +21,7 @@
#include "flow/EncryptUtils.h"
#include "flow/Trace.h"
#include <boost/algorithm/string.hpp>
#include <boost/format.hpp>
std::string getEncryptDbgTraceKey(std::string_view prefix,
@ -29,12 +30,15 @@ std::string getEncryptDbgTraceKey(std::string_view prefix,
Optional<EncryptCipherBaseKeyId> baseCipherId) {
// Construct the TraceEvent field key, ensuring its uniqueness and compliance with the TraceEvent
// field validator and log parsing tools.
std::string dName = domainName.toString();
// Underscores are invalid in trace event detail name.
boost::replace_all(dName, "_", "-");
if (baseCipherId.present()) {
boost::format fmter("%s.%lld.%s.%llu");
return boost::str(boost::format(fmter % prefix % domainId % domainName.toString() % baseCipherId.get()));
return boost::str(boost::format(fmter % prefix % domainId % dName % baseCipherId.get()));
} else {
boost::format fmter("%s.%lld.%s");
return boost::str(boost::format(fmter % prefix % domainId % domainName.toString()));
return boost::str(boost::format(fmter % prefix % domainId % dName));
}
}
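// Illustrative example (not part of the original source; the argument values are hypothetical):
// after this change, prefix "BaseCipherKey", domainId 1, domainName "tenant_1" and baseCipherId 42
// would have the underscore rewritten to '-' and produce the key "BaseCipherKey.1.tenant-1.42";
// without a baseCipherId the result would be "BaseCipherKey.1.tenant-1".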

@ -29,14 +29,14 @@
#include <string>
#include <string_view>
#define ENCRYPT_INVALID_DOMAIN_ID 0
#define ENCRYPT_INVALID_DOMAIN_ID -1
#define ENCRYPT_INVALID_CIPHER_KEY_ID 0
#define ENCRYPT_INVALID_RANDOM_SALT 0
#define AUTH_TOKEN_SIZE 16
#define SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID -1
#define ENCRYPT_HEADER_DOMAIN_ID -2
#define SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID -2
#define ENCRYPT_HEADER_DOMAIN_ID -3
const std::string FDB_DEFAULT_ENCRYPT_DOMAIN_NAME = "FdbDefaultEncryptDomain";

@ -174,6 +174,7 @@ public: // introduced features
PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, SWVersionTracking);
PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, Metacluster);
PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, TenantGroups);
PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, EncryptionAtRest);
};
template <>

@ -196,6 +196,7 @@ ERROR( key_not_tuple, 2041, "The key cannot be parsed as a tuple" );
ERROR( value_not_tuple, 2042, "The value cannot be parsed as a tuple" );
ERROR( mapper_not_tuple, 2043, "The mapper cannot be parsed as a tuple" );
ERROR( invalid_checkpoint_format, 2044, "Invalid checkpoint format" )
ERROR( invalid_throttle_quota_value, 2045, "Failed to deserialize or initialize throttle quota value" )
ERROR( incompatible_protocol_version, 2100, "Incompatible protocol version" )
ERROR( transaction_too_large, 2101, "Transaction exceeds byte limit" )
@ -326,6 +327,7 @@ ERROR( encrypt_update_cipher, 2705, "Attempt to update encryption cipher key")
ERROR( encrypt_invalid_id, 2706, "Invalid encryption cipher details")
ERROR( encrypt_keys_fetch_failed, 2707, "Encryption keys fetch from external KMS failed")
ERROR( encrypt_invalid_kms_config, 2708, "Invalid encryption/kms configuration: discovery-url, validation-token, endpoint etc.")
ERROR( encrypt_unsupported, 2709, "Encryption not supported")
// 4xxx Internal errors (those that should be generated only by bugs) are decimal 4xxx
ERROR( unknown_error, 4000, "An unknown error occurred" ) // C++ exception not of type Error

@ -104,8 +104,6 @@ if(WITH_PYTHON)
add_fdb_test(TEST_FILES SystemData.txt)
add_fdb_test(TEST_FILES ThreadSafety.txt IGNORE)
add_fdb_test(TEST_FILES TraceEventMetrics.txt IGNORE)
add_fdb_test(TEST_FILES PopulateTPCC.txt IGNORE)
add_fdb_test(TEST_FILES TPCC.txt IGNORE)
add_fdb_test(TEST_FILES default.txt IGNORE)
add_fdb_test(TEST_FILES errors.txt IGNORE)
add_fdb_test(TEST_FILES fail.txt IGNORE)
@ -208,6 +206,7 @@ if(WITH_PYTHON)
add_fdb_test(TEST_FILES rare/CycleWithKills.toml)
add_fdb_test(TEST_FILES rare/CycleWithDeadHall.toml)
add_fdb_test(TEST_FILES rare/FuzzTest.toml)
add_fdb_test(TEST_FILES rare/GlobalTagThrottling.toml IGNORE)
add_fdb_test(TEST_FILES rare/HighContentionPrefixAllocator.toml)
add_fdb_test(TEST_FILES rare/InventoryTestHeavyWrites.toml)
add_fdb_test(TEST_FILES rare/LargeApiCorrectness.toml)

@ -1,9 +0,0 @@
testTitle=PopulateTPCCTest
timeout=3600000
clearAfterTest=false
runConsistencyCheck=false
testName=PopulateTPCC
clientsUsed=2
actors=1
warehousesPerActor=200

@ -1,19 +0,0 @@
testTitle=PopulateTPCCTest
clearAfterTest=false
runConsistencyCheck=false
timeout=3600000
testName=PopulateTPCC
clientsUsed=2
actorsPerClient=1
warehousesPerActor=80
testTitle=TPCCTest
timeout=14400
testName=TPCC
warehousesPerClient=4
testDuration=3600
warmupTime=300
clientsUsed=40
expectedTransactionsPerMinute=1000

@ -0,0 +1,41 @@
[[test]]
testTitle='GlobalTagThrottling'
[[test.knobs]]
min_tag_read_pages_rate=1.0
global_tag_throttling=true
[[test.workload]]
testName='GlobalTagThrottling'
transactionTag='sampleTag1'
totalReadQuota=1.0
[[test.workload]]
testName='ReadWrite'
testDuration=600.0
transactionsPerSecond=100
writesPerTransactionA=0
readsPerTransactionA=10
writesPerTransactionB=0
readsPerTransactionB=0
alpha=0.0
nodeCount=10000
valueBytes=1000
minValueBytes=1000
warmingDelay=60.0
transactionTag='sampleTag1'
[[test.workload]]
testName='ReadWrite'
testDuration=600.0
transactionsPerSecond=100
writesPerTransactionA=0
readsPerTransactionA=10
writesPerTransactionB=0
readsPerTransactionB=0
alpha=0.0
nodeCount=10000
valueBytes=1000
minValueBytes=1000
warmingDelay=60.0
transactionTag='sampleTag2'
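# Note (added for illustration, not part of the original file): the two ReadWrite workloads above
# issue identical 10-read transactions at 100 transactions/sec each, tagged 'sampleTag1' and
# 'sampleTag2'; only 'sampleTag1' is given a read quota by the GlobalTagThrottling workload,
# presumably so throttled and unthrottled tags can be compared.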