Merge branch 'main' of github.com:apple/foundationdb into getsizetenant

Commit 0f9da9d1ad by Ankita Kejriwal, 2022-10-14 16:32:38 -07:00
255 changed files with 5554 additions and 1929 deletions

View File

@@ -166,6 +166,7 @@ void ApiWorkload::populateDataTx(TTaskFct cont, std::optional<int> tenantId) {
     execTransaction(
         [kvPairs](auto ctx) {
             for (const fdb::KeyValue& kv : *kvPairs) {
+                ctx->tx().addReadConflictRange(kv.key, kv.key + fdb::Key(1, '\x00'));
                 ctx->tx().set(kv.key, kv.value);
             }
             ctx->commit();
@@ -257,6 +258,7 @@ void ApiWorkload::randomInsertOp(TTaskFct cont, std::optional<int> tenantId) {
     execTransaction(
         [kvPairs](auto ctx) {
             for (const fdb::KeyValue& kv : *kvPairs) {
+                ctx->tx().addReadConflictRange(kv.key, kv.key + fdb::Key(1, '\x00'));
                 ctx->tx().set(kv.key, kv.value);
             }
             ctx->commit();
@@ -279,6 +281,7 @@ void ApiWorkload::randomClearOp(TTaskFct cont, std::optional<int> tenantId) {
     execTransaction(
         [keys](auto ctx) {
             for (const auto& key : *keys) {
+                ctx->tx().addReadConflictRange(key, key + fdb::Key(1, '\x00'));
                 ctx->tx().clear(key);
             }
             ctx->commit();
@@ -300,6 +303,7 @@ void ApiWorkload::randomClearRangeOp(TTaskFct cont, std::optional<int> tenantId) {
     }
     execTransaction(
         [begin, end](auto ctx) {
+            ctx->tx().addReadConflictRange(begin, end);
            ctx->tx().clearRange(begin, end);
            ctx->commit();
        },
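The hunks above all apply one idiom: appending a single zero byte to a key yields the smallest key strictly greater than it, so `[key, key + '\x00')` is the one-key range covering exactly `key`. A minimal sketch of the idea, assuming the tester's `fdb::` wrapper types used above (`selfConflictingSet` is a hypothetical helper, not part of the tester):

```cpp
// Registering a read conflict range over the written key turns an otherwise
// blind write into a self-conflicting one: if a timed-out attempt actually
// committed, a restarted attempt now conflicts and retries via onError()
// instead of silently double-applying.
void selfConflictingSet(fdb::Transaction& tx, const fdb::Key& key, const fdb::Value& value) {
	tx.addReadConflictRange(key, key + fdb::Key(1, '\x00')); // range [key, key\x00)
	tx.set(key, value);
}
```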

View File

@@ -160,6 +160,7 @@ private:
     execTransaction(
         // 1. Set the key to val1
         [key, val1](auto ctx) {
+            ctx->tx().addReadConflictRange(key, key + fdb::Key(1, '\x00'));
             ctx->tx().set(key, val1);
             ctx->commit();
         },
@@ -296,6 +297,7 @@ private:
         // 1. Set the key to initial value
         [key, val](auto ctx) {
             ctx->tx().set(key, val);
+            ctx->tx().addReadConflictRange(key, key + fdb::Key(1, '\x00'));
             ctx->commit();
         },
         [this, key, val, cont]() {

View File

@@ -50,6 +50,7 @@ private:
     execTransaction(
         [kvPairs](auto ctx) {
             for (const fdb::KeyValue& kv : *kvPairs) {
+                ctx->tx().addReadConflictRange(kv.key, kv.key + fdb::Key(1, '\x00'));
                 ctx->tx().set(kv.key, kv.value);
             }
             ctx->commit();

View File

@@ -77,10 +77,11 @@ public:
                            int retryLimit,
                            std::string bgBasePath,
                            std::optional<fdb::BytesRef> tenantName,
-                           bool transactional)
+                           bool transactional,
+                           bool restartOnTimeout)
   : executor(executor), startFct(startFct), contAfterDone(cont), scheduler(scheduler), retryLimit(retryLimit),
     txState(TxState::IN_PROGRESS), commitCalled(false), bgBasePath(bgBasePath), tenantName(tenantName),
-    transactional(transactional) {
+    transactional(transactional), restartOnTimeout(restartOnTimeout) {
     databaseCreateErrorInjected = executor->getOptions().injectDatabaseCreateErrors &&
                                   Random::get().randomBool(executor->getOptions().databaseCreateErrorRatio);
     if (databaseCreateErrorInjected) {
@@ -177,7 +178,8 @@ public:
     ASSERT(!onErrorFuture);
-    if (databaseCreateErrorInjected && canBeInjectedDatabaseCreateError(err.code())) {
+    if ((databaseCreateErrorInjected && canBeInjectedDatabaseCreateError(err.code())) ||
+        (restartOnTimeout && err.code() == error_code_transaction_timed_out)) {
         // Failed to create a database because of failure injection
         // Restart by recreating the transaction in a valid database
         recreateAndRestartTransaction();
@@ -235,7 +237,11 @@ protected:
     fdb::Error err = onErrorFuture.error();
     onErrorFuture = {};
     if (err) {
-        transactionFailed(err);
+        if (restartOnTimeout && err.code() == error_code_transaction_timed_out) {
+            recreateAndRestartTransaction();
+        } else {
+            transactionFailed(err);
+        }
     } else {
         restartTransaction();
     }
@@ -359,6 +365,9 @@ protected:
     // Accessed on initialization and in ON_ERROR state only (no need for mutex)
     bool databaseCreateErrorInjected;

+    // Restart the transaction automatically on timeout errors
+    const bool restartOnTimeout;
+
     // The tenant that we will run this transaction in
     const std::optional<fdb::BytesRef> tenantName;
@@ -378,9 +387,17 @@ public:
                             int retryLimit,
                             std::string bgBasePath,
                             std::optional<fdb::BytesRef> tenantName,
-                            bool transactional)
-  : TransactionContextBase(executor, startFct, cont, scheduler, retryLimit, bgBasePath, tenantName, transactional) {
-  }
+                            bool transactional,
+                            bool restartOnTimeout)
+  : TransactionContextBase(executor,
+                           startFct,
+                           cont,
+                           scheduler,
+                           retryLimit,
+                           bgBasePath,
+                           tenantName,
+                           transactional,
+                           restartOnTimeout) {}

 protected:
     void doContinueAfter(fdb::Future f, TTaskFct cont, bool retryOnError) override {
@@ -456,9 +473,17 @@ public:
                             int retryLimit,
                             std::string bgBasePath,
                             std::optional<fdb::BytesRef> tenantName,
-                            bool transactional)
-  : TransactionContextBase(executor, startFct, cont, scheduler, retryLimit, bgBasePath, tenantName, transactional) {
-  }
+                            bool transactional,
+                            bool restartOnTimeout)
+  : TransactionContextBase(executor,
+                           startFct,
+                           cont,
+                           scheduler,
+                           retryLimit,
+                           bgBasePath,
+                           tenantName,
+                           transactional,
+                           restartOnTimeout) {}

 protected:
     void doContinueAfter(fdb::Future f, TTaskFct cont, bool retryOnError) override {
@@ -470,7 +495,7 @@ protected:
         lock.unlock();
         try {
             f.then([this](fdb::Future f) { futureReadyCallback(f, this); });
-        } catch (std::runtime_error& err) {
+        } catch (std::exception& err) {
             lock.lock();
             callbackMap.erase(f);
             lock.unlock();
@@ -482,7 +507,7 @@ protected:
         try {
             AsyncTransactionContext* txCtx = (AsyncTransactionContext*)param;
             txCtx->onFutureReady(f);
-        } catch (std::runtime_error& err) {
+        } catch (std::exception& err) {
             fmt::print("Unexpected exception in callback {}\n", err.what());
             abort();
         } catch (...) {
@@ -544,7 +569,7 @@ protected:
         try {
             AsyncTransactionContext* txCtx = (AsyncTransactionContext*)param;
             txCtx->onErrorReady(f);
-        } catch (std::runtime_error& err) {
+        } catch (std::exception& err) {
             fmt::print("Unexpected exception in callback {}\n", err.what());
             abort();
         } catch (...) {
@@ -673,7 +698,8 @@ public:
     void execute(TOpStartFct startFct,
                  TOpContFct cont,
                  std::optional<fdb::BytesRef> tenantName,
-                 bool transactional) override {
+                 bool transactional,
+                 bool restartOnTimeout) override {
         try {
             std::shared_ptr<ITransactionContext> ctx;
             if (options.blockOnFutures) {
@@ -684,7 +710,8 @@ public:
                                                            options.transactionRetryLimit,
                                                            bgBasePath,
                                                            tenantName,
-                                                           transactional);
+                                                           transactional,
+                                                           restartOnTimeout);
             } else {
                 ctx = std::make_shared<AsyncTransactionContext>(this,
                                                                 startFct,
@@ -693,7 +720,8 @@ public:
                                                                 options.transactionRetryLimit,
                                                                 bgBasePath,
                                                                 tenantName,
-                                                                transactional);
+                                                                transactional,
+                                                                restartOnTimeout);
             }
             startFct(ctx);
         } catch (...) {

View File

@@ -116,7 +116,8 @@ public:
     virtual void execute(TOpStartFct start,
                          TOpContFct cont,
                          std::optional<fdb::BytesRef> tenantName,
-                         bool transactional) = 0;
+                         bool transactional,
+                         bool restartOnTimeout) = 0;
     virtual fdb::Database selectDatabase() = 0;
     virtual std::string getClusterFileForErrorInjection() = 0;
     virtual const TransactionExecutorOptions& getOptions() = 0;

View File

@@ -20,6 +20,7 @@
 #include "TesterWorkload.h"
 #include "TesterUtil.h"
+#include "fdb_c_options.g.h"
 #include "fmt/core.h"
 #include "test/apitester/TesterScheduler.h"
 #include <cstdlib>
@@ -82,6 +83,8 @@ WorkloadBase::WorkloadBase(const WorkloadConfig& config)
   : manager(nullptr), tasksScheduled(0), numErrors(0), clientId(config.clientId), numClients(config.numClients),
     failed(false), numTxCompleted(0), numTxStarted(0), inProgress(false) {
     maxErrors = config.getIntOption("maxErrors", 10);
+    minTxTimeoutMs = config.getIntOption("minTxTimeoutMs", 0);
+    maxTxTimeoutMs = config.getIntOption("maxTxTimeoutMs", 0);
     workloadId = fmt::format("{}{}", config.name, clientId);
 }
@@ -129,9 +132,15 @@ void WorkloadBase::doExecute(TOpStartFct startFct,
     }
     tasksScheduled++;
     numTxStarted++;
-    manager->txExecutor->execute(
-        startFct,
-        [this, startFct, cont, failOnError](fdb::Error err) {
+    manager->txExecutor->execute( //
+        [this, transactional, cont, startFct](auto ctx) {
+            if (transactional && maxTxTimeoutMs > 0) {
+                int timeoutMs = Random::get().randomInt(minTxTimeoutMs, maxTxTimeoutMs);
+                ctx->tx().setOption(FDB_TR_OPTION_TIMEOUT, timeoutMs);
+            }
+            startFct(ctx);
+        },
+        [this, cont, failOnError](fdb::Error err) {
            numTxCompleted++;
            if (err.code() == error_code_success) {
                cont();
@@ -148,7 +157,8 @@ void WorkloadBase::doExecute(TOpStartFct startFct,
            scheduledTaskDone();
        },
        tenant,
-       transactional);
+       transactional,
+       maxTxTimeoutMs > 0);
 }

 void WorkloadBase::info(const std::string& msg) {

View File

@@ -166,6 +166,12 @@ protected:
     // The maximum number of errors before stopping the workload
     int maxErrors;

+    // The timeout (in ms) automatically set for all transactions to a random value
+    // in the range [minTxTimeoutMs, maxTxTimeoutMs]
+    // If maxTxTimeoutMs <= 0, no timeout is set
+    int minTxTimeoutMs;
+    int maxTxTimeoutMs;
+
     // Workload identifier, consisting of workload name and client ID
     std::string workloadId;

View File

@@ -429,7 +429,7 @@ bool runWorkloads(TesterOptions& options) {
         }
         workloadMgr.run();
         return !workloadMgr.failed();
-    } catch (const std::runtime_error& err) {
+    } catch (const std::exception& err) {
         fmt::print(stderr, "ERROR: {}\n", err.what());
         return false;
     }
@@ -461,7 +461,7 @@ int main(int argc, char** argv) {
         fdb_check(fdb::network::stop());
         network_thread.join();
-    } catch (const std::runtime_error& err) {
+    } catch (const std::exception& err) {
         fmt::print(stderr, "ERROR: {}\n", err.what());
         retCode = 1;
     }

View File

@@ -0,0 +1,25 @@
[[test]]
title = 'Cancel Transactions with Timeouts'
multiThreaded = true
buggify = true
minFdbThreads = 2
maxFdbThreads = 8
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8

[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 10
maxTxTimeoutMs = 10000

View File

@@ -0,0 +1,33 @@
[[test]]
title = 'API Correctness with Timeouts'
multiThreaded = true
buggify = true
minFdbThreads = 2
maxFdbThreads = 8
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8

[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000

[[test.workload]]
name = 'AtomicOpsCorrectness'
initialSize = 0
numRandomOperations = 100
minTxTimeoutMs = 100
maxTxTimeoutMs = 10000

View File

@@ -1199,6 +1199,8 @@ void usage() {
 	printf("%-24s %s\n", " --flatbuffers", "Use flatbuffers");
 	printf("%-24s %s\n", " --streaming", "Streaming mode: all (default), iterator, small, medium, large, serial");
 	printf("%-24s %s\n", " --disable_ryw", "Disable snapshot read-your-writes");
+	printf(
+	    "%-24s %s\n", " --disable_client_bypass", "Disable client-bypass forcing mako to use multi-version client");
 	printf("%-24s %s\n", " --json_report=PATH", "Output stats to the specified json file (Default: mako.json)");
 	printf("%-24s %s\n",
 	       " --bg_file_path=PATH",

View File

@@ -392,6 +392,11 @@ func (o DatabaseOptions) SetTransactionIncludePortInAddress() error {
 	return o.setOpt(505, nil)
 }

+// Set a random idempotency id for all transactions. See the transaction option description for more information.
+func (o DatabaseOptions) SetTransactionAutomaticIdempotency() error {
+	return o.setOpt(506, nil)
+}
+
 // Allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. This sets the ``bypass_unreadable`` option of each transaction created by this database. See the transaction option description for more information.
 func (o DatabaseOptions) SetTransactionBypassUnreadable() error {
 	return o.setOpt(700, nil)
@@ -551,6 +556,18 @@ func (o TransactionOptions) SetSizeLimit(param int64) error {
 	return o.setOpt(503, int64ToBytes(param))
 }

+// Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes.
+//
+// Parameter: Unique ID
+func (o TransactionOptions) SetIdempotencyId(param string) error {
+	return o.setOpt(504, []byte(param))
+}
+
+// Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future.
+func (o TransactionOptions) SetAutomaticIdempotency() error {
+	return o.setOpt(505, nil)
+}
+
 // Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior.
 func (o TransactionOptions) SetSnapshotRywEnable() error {
 	return o.setOpt(600, nil)

View File

@@ -56,7 +56,7 @@ endfunction()
 # all these tests in serialized order and within the same directory. This is
 # useful for restart tests
 function(add_fdb_test)
-  set(options UNIT IGNORE)
+  set(options UNIT IGNORE LONG_RUNNING)
   set(oneValueArgs TEST_NAME TIMEOUT)
   set(multiValueArgs TEST_FILES)
   cmake_parse_arguments(ADD_FDB_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}")
@@ -106,6 +106,9 @@ function(add_fdb_test)
   if(ADD_FDB_TEST_UNIT)
     message(STATUS
       "ADDING UNIT TEST ${assigned_id} ${test_name}")
+  elseif(ADD_FDB_TEST_LONG_RUNNING)
+    message(STATUS
+      "ADDING LONG RUNNING TEST ${assigned_id} ${test_name}")
   else()
     message(STATUS
       "ADDING SIMULATOR TEST ${assigned_id} ${test_name}")
@@ -150,9 +153,15 @@ function(add_fdb_test)
     endif()
   endif()
   # set variables used for generating test packages
-  set(TEST_NAMES ${TEST_NAMES} ${test_name} PARENT_SCOPE)
-  set(TEST_FILES_${test_name} ${ADD_FDB_TEST_TEST_FILES} PARENT_SCOPE)
-  set(TEST_TYPE_${test_name} ${test_type} PARENT_SCOPE)
+  if(ADD_FDB_TEST_LONG_RUNNING)
+    set(LONG_RUNNING_TEST_NAMES ${LONG_RUNNING_TEST_NAMES} ${test_name} PARENT_SCOPE)
+    set(LONG_RUNNING_TEST_FILES_${test_name} ${ADD_FDB_TEST_TEST_FILES} PARENT_SCOPE)
+    set(LONG_RUNNING_TEST_TYPE_${test_name} ${test_type} PARENT_SCOPE)
+  else()
+    set(TEST_NAMES ${TEST_NAMES} ${test_name} PARENT_SCOPE)
+    set(TEST_FILES_${test_name} ${ADD_FDB_TEST_TEST_FILES} PARENT_SCOPE)
+    set(TEST_TYPE_${test_name} ${test_type} PARENT_SCOPE)
+  endif()
 endfunction()

 if(NOT WIN32)
@@ -167,14 +176,21 @@ endif()
 # - OUT_DIR the directory where files will be staged
 # - CONTEXT the type of correctness package being built (e.g. 'valgrind correctness')
 function(stage_correctness_package)
+  set(options LONG_RUNNING)
   set(oneValueArgs OUT_DIR CONTEXT OUT_FILES)
-  cmake_parse_arguments(STAGE "" "${oneValueArgs}" "" "${ARGN}")
+  set(multiValueArgs TEST_LIST)
+  cmake_parse_arguments(STAGE "${options}" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}")
   file(MAKE_DIRECTORY ${STAGE_OUT_DIR}/bin)
-  string(LENGTH "${CMAKE_SOURCE_DIR}/tests/" base_length)
-  foreach(test IN LISTS TEST_NAMES)
+  foreach(test IN LISTS STAGE_TEST_LIST)
     if((${test} MATCHES ${TEST_PACKAGE_INCLUDE}) AND
        (NOT ${test} MATCHES ${TEST_PACKAGE_EXCLUDE}))
-      foreach(file IN LISTS TEST_FILES_${test})
+      string(LENGTH "${CMAKE_SOURCE_DIR}/tests/" base_length)
+      if(STAGE_LONG_RUNNING)
+        set(TEST_FILES_PREFIX "LONG_RUNNING_TEST_FILES")
+      else()
+        set(TEST_FILES_PREFIX "TEST_FILES")
+      endif()
+      foreach(file IN LISTS ${TEST_FILES_PREFIX}_${test})
        string(SUBSTRING ${file} ${base_length} -1 rel_out_file)
        set(out_file ${STAGE_OUT_DIR}/tests/${rel_out_file})
        list(APPEND test_files ${out_file})
@@ -265,7 +281,7 @@ function(create_correctness_package)
    return()
  endif()
  set(out_dir "${CMAKE_BINARY_DIR}/correctness")
-  stage_correctness_package(OUT_DIR ${out_dir} CONTEXT "correctness" OUT_FILES package_files)
+  stage_correctness_package(OUT_DIR ${out_dir} CONTEXT "correctness" OUT_FILES package_files TEST_LIST "${TEST_NAMES}")
  set(tar_file ${CMAKE_BINARY_DIR}/packages/correctness-${FDB_VERSION}.tar.gz)
  add_custom_command(
    OUTPUT ${tar_file}
@@ -294,13 +310,47 @@ function(create_correctness_package)
  add_dependencies(package_tests_u package_tests)
 endfunction()

+function(create_long_running_correctness_package)
+  if(WIN32)
+    return()
+  endif()
+  set(out_dir "${CMAKE_BINARY_DIR}/long_running_correctness")
+  stage_correctness_package(OUT_DIR ${out_dir} CONTEXT "long running correctness" OUT_FILES package_files TEST_LIST "${LONG_RUNNING_TEST_NAMES}" LONG_RUNNING)
+  set(tar_file ${CMAKE_BINARY_DIR}/packages/long-running-correctness-${FDB_VERSION}.tar.gz)
+  add_custom_command(
+    OUTPUT ${tar_file}
+    DEPENDS ${package_files}
+            ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
+            ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTest.sh
+            ${out_dir}/joshua_test
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/correctnessTimeout.sh
+            ${out_dir}/joshua_timeout
+    COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} ${package_files}
+            ${out_dir}/joshua_test
+            ${out_dir}/joshua_timeout
+    WORKING_DIRECTORY ${out_dir}
+    COMMENT "Package long running correctness archive"
+  )
+  add_custom_target(package_long_running_tests ALL DEPENDS ${tar_file})
+  add_dependencies(package_long_running_tests strip_only_fdbserver TestHarness)
+  set(unversioned_tar_file "${CMAKE_BINARY_DIR}/packages/long_running_correctness.tar.gz")
+  add_custom_command(
+    OUTPUT "${unversioned_tar_file}"
+    DEPENDS "${tar_file}"
+    COMMAND ${CMAKE_COMMAND} -E copy "${tar_file}" "${unversioned_tar_file}"
+    COMMENT "Copy long running correctness package to ${unversioned_tar_file}")
+  add_custom_target(package_long_running_tests_u DEPENDS "${unversioned_tar_file}")
+  add_dependencies(package_long_running_tests_u package_long_running_tests)
+endfunction()
+
 function(create_valgrind_correctness_package)
  if(WIN32)
    return()
  endif()
  if(USE_VALGRIND)
    set(out_dir "${CMAKE_BINARY_DIR}/valgrind_correctness")
-    stage_correctness_package(OUT_DIR ${out_dir} CONTEXT "valgrind correctness" OUT_FILES package_files)
+    stage_correctness_package(OUT_DIR ${out_dir} CONTEXT "valgrind correctness" OUT_FILES package_files TEST_LIST "${TEST_NAMES}")
    set(tar_file ${CMAKE_BINARY_DIR}/packages/valgrind-${FDB_VERSION}.tar.gz)
    add_custom_command(
      OUTPUT ${tar_file}

View File

@@ -26,6 +26,7 @@ env_set(TRACE_PC_GUARD_INSTRUMENTATION_LIB "" STRING "Path to a library containi
 env_set(PROFILE_INSTR_GENERATE OFF BOOL "If set, build FDB as an instrumentation build to generate profiles")
 env_set(PROFILE_INSTR_USE "" STRING "If set, build FDB with profile")
 env_set(FULL_DEBUG_SYMBOLS OFF BOOL "Generate full debug symbols")
+env_set(ENABLE_LONG_RUNNING_TESTS OFF BOOL "Add a long running tests package")

 set(USE_SANITIZER OFF)
 if(USE_ASAN OR USE_VALGRIND OR USE_MSAN OR USE_TSAN OR USE_UBSAN)

View File

@@ -128,6 +128,35 @@
 set_knob(db, 'min_trace_severity', '10', None, 'description')
 set_knob(db, 'min_trace_severity', '20', 'az-1', 'description')
 ```

+### CLI Usage
+Users may also use `fdbcli` to set and update knobs dynamically. Usage is as follows:
+```
+setknob <knob_name> <knob_value> [config_class]
+getknob <knob_name> [config_class]
+```
+Here `knob_name` is an existing knob, `knob_value` is the desired value, and `config_class` is the optional configuration class. `setknob` may also be combined within a `begin\commit` block to update multiple knobs atomically. In that case a description must follow `commit`; otherwise a prompt will ask for one. The description must be non-empty. An example follows.
+```
+begin
+setknob min_trace_severity 30
+setknob tracing_udp_listener_addr 192.168.0.1
+commit "fdbcli change"
+```
+Knob configuration changes may only be combined with other knob configuration changes in the same transaction. For example, the following is not permitted and will raise an error.
+```
+begin
+set foo bar
+setknob max_metric_size 1000
+commit "change"
+```
+Specifically, `set`, `clear`, `get`, `getrange`, and `clearrange` cannot be combined in any transaction with a `setknob` or `getknob`.
+
+An individual `setknob` outside a `begin\commit` block will likewise prompt for a description.
+
+#### Type checking
+Knobs have implicit types attached to them when defined. For example, the knob `tracing_udp_listener_addr` defaults to `"127.0.0.1"`, so its type is string. If a user invokes `setknob` on this knob with a value that is not a string, the transaction will fail.
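As a hypothetical illustration of this type check (the exact error text may differ), committing a non-numeric value for an integer knob fails:

```
begin
setknob min_trace_severity not-a-number
commit "attempt an invalid knob value"
```

The commit is rejected and the knob keeps its previous value; the `knobmanagement` fdbcli test added later in this commit exercises exactly this case.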
 ### Disable the Configuration Database

 The configuration database includes both client and server changes and is

design/idempotency_ids.md (new file, 106 lines)
View File

@@ -0,0 +1,106 @@
# Goals
The main goal is to make transactions safer and easier to reason about. New users should get a "just works" experience. One of the main selling points of FoundationDB is that it solves the hard distributed systems problems for you, so that you only need to concern yourself with your business logic. Non-idempotent transactions are probably the biggest "gotcha" that users need to be made aware of -- and they won't discover it organically. In order to achieve this "just works" experience I believe it is necessary to make automatic idempotency have low-enough overhead so that we can enable it by default.
As an intermediate goal, I plan to introduce this feature disabled by default. The long-term plan is to make it the default.
# API
Introduce a new transaction option `IDEMPOTENCY_ID`, which will be validated to be at most 255 bytes.
Add
```
FDBFuture* fdb_transaction_commit_result(FDBTransaction* tr, uint8_t const* idempotency_id, int idempotency_id_length)
```
, which can be used to determine the result of a commit that failed with `transaction_timed_out`.
Commits for transactions with idempotency ids would not fail with `commit_unknown_result`, but in (extremely) rare cases could fail with a new error that clients are expected to handle by restarting the process.
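A rough sketch of how a client might drive this proposed API (`fdb_transaction_commit_result` is the proposal above, not a shipped C API function; `waitAndGetError` is a hypothetical helper combining `fdb_future_block_until_ready` and `fdb_future_get_error`):

```cpp
#include <foundationdb/fdb_c.h>

// Hypothetical helper: block on the future, then return its error code.
fdb_error_t waitAndGetError(FDBFuture* f) {
	fdb_error_t blockErr = fdb_future_block_until_ready(f);
	return blockErr ? blockErr : fdb_future_get_error(f);
}

fdb_error_t commitWithIdempotencyId(FDBTransaction* tr, const uint8_t* id, int idLen) {
	FDBFuture* commitF = fdb_transaction_commit(tr);
	fdb_error_t err = waitAndGetError(commitF);
	fdb_future_destroy(commitF);
	if (err != 1031 /* transaction_timed_out */)
		return err; // success, or an error handled by the usual on_error loop
	// The commit may or may not have happened; ask the cluster using the id.
	FDBFuture* resultF = fdb_transaction_commit_result(tr, id, idLen);
	err = waitAndGetError(resultF);
	fdb_future_destroy(resultF);
	return err; // 0 if the earlier, timed-out attempt actually committed
}
```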
# Background
- https://forums.foundationdb.org/t/automatically-providing-transaction-idempotency/1873
- https://github.com/apple/foundationdb/issues/1321
- https://docs.google.com/document/d/19LDQuurg4Tt8eUcig3-8g2VOG9ZpQvtWrp_691RqMo8/edit#
# Data model
Commit proxies would combine idempotency IDs for transactions within a batch. The purpose of this is to try to limit the number of distinct database keys that need to be written, and to lessen the number of extra mutation bytes for idempotency IDs.
## Key format
```
\xff\x02/idmp/${commit_version_big_endian (8 bytes)}${high_order_byte_of_batch_index (1 byte)}
```
- `commit_version_big_endian` the commit version stored big-endian so that the cleaner worker can find the oldest idempotency ids easily, and also so that "unknown_committed" transactions can recover their commit version.
- `high_order_byte_of_batch_index` this limits us to 256 idempotency ids per value
## Value format
```
${protocol_version}(${n (1 byte)}${idempotency_id (n bytes)}${low_order_byte_of_batch_index})*
```
The batch index for each idempotency id can be reconstructed from the high-order and low-order bytes stored in the key and value, respectively. This is necessary for an "unknown_committed" transaction to recover its full versionstamp. The batch index is a `short int`, i.e. 2 bytes.
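A minimal sketch of this layout (helper names hypothetical; the prefix and byte positions are taken from the format strings above):

```cpp
#include <cstdint>
#include <string>

// Key: \xff\x02/idmp/ + 8-byte big-endian commit version + high-order batch-index byte.
std::string idmpKey(uint64_t commitVersion, uint16_t batchIndex) {
	std::string key("\xff\x02/idmp/", 8);
	for (int i = 7; i >= 0; --i) // big-endian, so the oldest versions sort first
		key.push_back(static_cast<char>((commitVersion >> (8 * i)) & 0xff));
	key.push_back(static_cast<char>(batchIndex >> 8)); // high-order byte only
	return key;
}

// The low-order byte is stored in the value next to each idempotency id; an
// "unknown_committed" transaction recombines the two to recover the 2-byte
// batch index of its 10-byte versionstamp (8-byte version + 2-byte batch index).
uint16_t recoverBatchIndex(uint8_t highByteFromKey, uint8_t lowByteFromValue) {
	return static_cast<uint16_t>((highByteFromKey << 8) | lowByteFromValue);
}
```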
# Cleaning up old idempotency ids
After learning the result of an attempt to commit a transaction with an
idempotency id, the client may inform the cluster that it's no longer interested
in that id and the cluster can reclaim the space used to store the idempotency
id. The happy-path reply to a CommitTransactionRequest will say which proxy this
request should be sent to, and all idempotency ids for a database key will be
sent to the same proxy so that it can clear the key once it receives all of
them. The first proxy will also periodically clean up the oldest idempotency ids, based on a policy determined by two knobs. One knob will control the minimum lifetime of an idempotency id (i.e. don't delete anything younger than 1 day), and the other will control the target byte size of the idempotency keys (e.g. keep 100 MB of idempotency keys around).
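For instance, the cleaner's decision could look like the following sketch (names hypothetical; the two dials are exactly the knobs described above):

```cpp
#include <cstdint>

struct IdmpCleanerPolicy {
	int64_t minIdAgeSeconds; // never delete ids younger than this (e.g. one day)
	int64_t targetIdmpBytes; // keep roughly this many bytes of idempotency keys (e.g. 100 MB)
};

// Delete from the oldest end of the \xff\x02/idmp/ range only while the range
// is over its byte budget AND the oldest id has outlived the minimum lifetime.
bool shouldDeleteOldest(const IdmpCleanerPolicy& policy, int64_t idmpRangeBytes, int64_t oldestIdAgeSeconds) {
	return idmpRangeBytes > policy.targetIdmpBytes && oldestIdAgeSeconds > policy.minIdAgeSeconds;
}
```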
# Commit protocol
The basic change will be that a commit future will not become ready until the client confirms whether or not the commit succeeded. (`transaction_timed_out` is an unfortunate exception here)
The idempotency id will be automatically added to both the read conflict range and the write conflict range, before makeSelfConflicting is called so that we don't duplicate that work. We can reuse the `\xff/SC/` self-conflicting key space here.
## Did I already commit?
The first version of this scans the keys in the idmp key range to check for the idempotency ids. The plan for the next version is the following:
Storage servers would have a new endpoint that clients can use to ask if the transaction for an idempotency id already committed. Clients would need to check every possible shard that their idempotency id may have ended up in.
Storage servers would maintain a map from idempotency id to versionstamp in memory, and clients would need to contact all storage servers responsible for the `[\xff\x02/idmp/, \xff\x02/idmp0)` keyspace to be sure of their commit status. Assuming an idempotency id + versionstamp is 16 + 10 bytes, and that the lifetime of most idempotency ids is less than 1 second, that corresponds to at least 260 MB of memory on the storage server at 1,000,000 transactions/s, which seems acceptable. Let's double that to account for things like hash table load factor and allocating extra memory to ensure amortized constant time insertion. Still seems acceptable. We probably want to use a hashtable with open addressing to avoid frequent heap allocations. I _think_ [swisstables](https://abseil.io/about/design/swisstables) would work here.
When a transaction learns that it did in fact commit, the commit future succeeds, and the versionstamp gets filled with the original, successful transaction's versionstamp. After the successful commit is reported, it's no longer necessary to store its idempotency ID. The client will send an RPC to the cleaner role indicating that it can remove this idempotency ID.
If a transaction learns that it did in fact _not_ commit, the commit future will fail with an error that indicates that the transaction did not commit. Perhaps `transaction_too_old`.
If a transaction learns that it has been in-flight so long that its idempotency id could have been expired, then it will fail with a new, non-retriable error. It is expected that this will be rare enough that crashing the application is acceptable.
# Considerations
- Additional storage space on the cluster. This can be controlled directly via an idempotency id target bytes knob/config.
- Potential write hot spot.
# Multi-version client
The multi-version client will generate its own idempotency id for a transaction and manage its lifecycle. It will duplicate the logic in NativeApi to achieve the same guarantees. As part of this change we will also ensure that the previous commit attempt is no longer in-flight before allowing the commit future to become ready. This will fix a potential "causal-write-risky" issue if a commit attempt fails with `cluster_version_changed`.
# Experiments
- Initial experiments show that this is about 1% overhead for the worst case workload which is transactions that only update a single key.
```
Single replication redwood cluster with dedicated ebs disks for tlog and storage. All tests saturated the tlog disk's IOPs.
volume_type: gp3
volume_size: 384
iops: 9000
throughput: 250
$ bin/mako --mode run --rows 1000000 -x u1 -p 8 -t 8 --cluster=$HOME/fdb.cluster --seconds 100 # already warm, but quiesced
Baseline:
19714.67 TPS
"user space" method of writing idempotency id -> versionstamp in every transaction:
13831.00 TPS
"combine idempotency ids in transaction batch" method:
19515.62 TPS
```

View File

@@ -203,6 +203,13 @@ The ``get`` command fetches the value of a given key. Its syntax is ``get <KEY>``
 Note that :ref:`characters can be escaped <cli-escaping>` when specifying keys (or values) in ``fdbcli``.

+getknob
+-------
+
+The ``getknob`` command fetches the value of a given knob that has been populated by ``setknob``. Its syntax is ``getknob <KNOBNAME> [CONFIGCLASS]``. It displays the value of ``<KNOBNAME>`` if present in the database, and ``not found`` otherwise.
+
+Note that :ref:`characters can be escaped <cli-escaping>` when specifying keys (or values) in ``fdbcli``.
+
 getrange
 --------
@@ -395,6 +402,13 @@ The ``setclass`` command can be used to change the :ref:`process class <guidelin
 The available process classes are ``unset``, ``storage``, ``transaction``, ``resolution``, ``grv_proxy``, ``commit_proxy``, ``master``, ``test``, ``unset``, ``stateless``, ``log``, ``router``, ``cluster_controller``, ``fast_restore``, ``data_distributor``, ``coordinator``, ``ratekeeper``, ``storage_cache``, ``backup``, and ``default``.

+setknob
+-------
+
+The ``setknob`` command can be used to set knobs dynamically. Its syntax is ``setknob <KNOBNAME> <KNOBVALUE> [CONFIGCLASS]``. If not run inside a ``begin\commit`` block, the CLI will prompt for a description of the change.
+
+Note that :ref:`characters can be escaped <cli-escaping>` when specifying keys (or values) in ``fdbcli``.
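A hypothetical session combining the two commands (the prompt text and output format follow the fdbcli changes elsewhere in this commit):

```
fdb> setknob min_trace_severity 30
Please set a description for the change. Description must be non-empty.
description: raise minimum trace severity
Committed (...)
fdb> getknob min_trace_severity
`min_trace_severity' is `30'
```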
 sleep
 -----

View File

@@ -499,11 +499,14 @@ void initHelp() {
 	    "transaction, and are automatically committed for you. By explicitly beginning a transaction, "
 	    "successive operations are all performed as part of a single transaction.\n\nTo commit the "
 	    "transaction, use the commit command. To discard the transaction, use the reset command.");
-	helpMap["commit"] = CommandHelp("commit",
+	helpMap["commit"] = CommandHelp("commit [description]",
 	                                "commit the current transaction",
 	                                "Any sets or clears executed after the start of the current transaction will be "
 	                                "committed to the database. On success, the committed version number is displayed. "
-	                                "If commit fails, the error is displayed and the transaction must be retried.");
+	                                "If commit fails, the error is displayed and the transaction must be retried. The "
+	                                "command optionally allows for a description in case the transaction targets the "
+	                                "configuration database. If no description is provided in the command, a prompt "
+	                                "will be shown asking for a relevant description of the configuration change");
 	helpMap["clear"] = CommandHelp(
 	    "clear <KEY>",
 	    "clear a key from the database",
@@ -552,6 +555,14 @@ void initHelp() {
 	helpMap["set"] = CommandHelp("set <KEY> <VALUE>",
 	                             "set a value for a given key",
 	                             "If KEY is not already present in the database, it will be created." ESCAPINGKV);
+	helpMap["setknob"] = CommandHelp("setknob <KEY> <VALUE> [CONFIG_CLASS]",
+	                                 "updates a knob to specified value",
+	                                 "setknob will prompt for a description of the changes" ESCAPINGKV);
+	helpMap["getknob"] = CommandHelp(
+	    "getknob <KEY> [CONFIG_CLASS]", "gets the value of the specified knob", "CONFIG_CLASS is optional." ESCAPINGK);
 	helpMap["option"] = CommandHelp(
 	    "option <STATE> <OPTION> <ARG>",
 	    "enables or disables an option",
@@ -1050,12 +1061,17 @@ Future<T> stopNetworkAfter(Future<T> what) {
 	}
 }

+enum TransType { Db = 0, Config, None };
+
 ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterConnectionFile> ccf) {
 	state LineNoise& linenoise = *plinenoise;
 	state bool intrans = false;
+	state TransType transtype = TransType::None;
+	state bool isCommitDesc = false;
 	state Database localDb;
 	state Reference<IDatabase> db;
+	state Reference<IDatabase> configDb;
 	state Reference<ITenant> tenant;
 	state Optional<TenantName> tenantName;
 	state Optional<TenantMapEntry> tenantEntry;
@@ -1064,6 +1080,7 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
 	state const Reference<ITenant> managementTenant;
 	state Reference<ITransaction> tr;
+	state Reference<ITransaction> config_tr;
 	state Transaction trx;
 	state bool writeMode = false;
@@ -1085,6 +1102,8 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
 			printf("Using cluster file `%s'.\n", ccf->getLocation().c_str());
 		}
 		db = API->createDatabase(opt.clusterFile.c_str());
+		configDb = API->createDatabase(opt.clusterFile.c_str());
+		configDb->setOption(FDBDatabaseOptions::USE_CONFIG_DATABASE);
 	} catch (Error& e) {
 		fprintf(stderr, "ERROR: %s (%d)\n", e.what(), e.code());
 		printf("Unable to connect to cluster from `%s'\n", ccf->getLocation().c_str());
@@ -1442,23 +1461,46 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
 				} else {
 					activeOptions = FdbOptions(globalOptions);
 					options = &activeOptions;
-					getTransaction(db, tenant, tr, options, false);
 					intrans = true;
+					transtype = TransType::None;
+					getTransaction(db, tenant, tr, options, false);
 					printf("Transaction started\n");
 				}
 				continue;
 			}

 			if (tokencmp(tokens[0], "commit")) {
-				if (tokens.size() != 1) {
+				if (tokens.size() > 2) {
 					printUsage(tokens[0]);
 					is_error = true;
 				} else if (!intrans) {
 					fprintf(stderr, "ERROR: No active transaction\n");
 					is_error = true;
 				} else {
-					wait(commitTransaction(tr));
+					if (isCommitDesc && tokens.size() == 1) {
+						// prompt for description and add to txn
+						state Optional<std::string> raw;
+						while (!raw.present() || raw.get().empty()) {
+							fprintf(stdout,
+							        "Please set a description for the change. Description must be non-empty.\n");
+							state Optional<std::string> rawline =
+							    wait(makeInterruptable(linenoise.read("description: ")));
+							raw = rawline;
+						}
+						std::string line = raw.get();
+						config_tr->set("\xff\xff/description"_sr, line);
+					}
+					if (transtype == TransType::Db) {
+						wait(commitTransaction(tr));
+					} else {
+						if (tokens.size() > 1) {
+							config_tr->set("\xff\xff/description"_sr, tokens[1]);
+						}
+						wait(commitTransaction(config_tr));
+					}
+					isCommitDesc = false;
 					intrans = false;
+					transtype = TransType::None;
 					options = &globalOptions;
 				}
@@ -1481,10 +1523,16 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
 					fprintf(stderr, "ERROR: No active transaction\n");
 					is_error = true;
 				} else {
-					tr->reset();
-					activeOptions = FdbOptions(globalOptions);
-					options = &activeOptions;
-					options->apply(tr);
+					if (transtype == TransType::Config) {
+						config_tr->reset();
+					} else {
+						tr->reset();
+						activeOptions = FdbOptions(globalOptions);
+						options = &activeOptions;
+						options->apply(tr);
+					}
+					isCommitDesc = false;
+					transtype = TransType::None;
 					printf("Transaction reset\n");
 				}
 				continue;
@@ -1510,6 +1558,15 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
 					printUsage(tokens[0]);
 					is_error = true;
 				} else {
+					if (intrans) {
+						if (transtype == TransType::None) {
+							transtype = TransType::Db;
+						} else if (transtype == TransType::Config) {
+							fprintf(stderr, "ERROR: Cannot perform get in configuration transaction\n");
+							is_error = true;
+							continue;
+						}
+					}
 					state ThreadFuture<Optional<Value>> valueF =
 					    getTransaction(db, tenant, tr, options, intrans)->get(tokens[1]);
 					Optional<Standalone<StringRef>> v = wait(makeInterruptable(safeThreadFutureToFuture(valueF)));
@@ -1618,7 +1675,17 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
 				} else {
 					state int limit;
 					bool valid = true;
+					if (intrans) {
+						if (transtype == TransType::None) {
+							transtype = TransType::Db;
+						} else if (transtype == TransType::Config) {
+							fprintf(
+							    stderr,
+							    "ERROR: Cannot perform getrange or getrangekeys in configuration transaction\n");
+							is_error = true;
+							continue;
+						}
+					}
 					if (tokens.size() == 4) {
 						// INT_MAX is 10 digits; rather than
 						// worrying about overflow we'll just cap
@@ -1707,6 +1774,15 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
 					printUsage(tokens[0]);
 					is_error = true;
 				} else {
+					if (intrans) {
+						if (transtype == TransType::None) {
+							transtype = TransType::Db;
+						} else if (transtype == TransType::Config) {
+							fprintf(stderr, "ERROR: Cannot perform set in configuration transaction\n");
+							is_error = true;
+							continue;
+						}
+					}
 					getTransaction(db, tenant, tr, options, intrans);
 					tr->set(tokens[1], tokens[2]);
@@ -1717,6 +1793,91 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
 				continue;
 			}

+			if (tokencmp(tokens[0], "setknob")) {
+				if (tokens.size() > 4 || tokens.size() < 3) {
+					printUsage(tokens[0]);
+					is_error = true;
+				} else {
+					if (intrans) {
+						if (transtype == TransType::None) {
+							transtype = TransType::Config;
+						} else if (transtype == TransType::Db) {
+							fprintf(stderr, "ERROR: Cannot perform setknob in database transaction\n");
+							is_error = true;
+							isCommitDesc = false;
+							continue;
+						}
+					}
+					Tuple t;
+					if (tokens.size() == 4) {
+						t.append(tokens[3]);
+					} else {
+						t.appendNull();
+					}
+					t.append(tokens[1]);
+					getTransaction(configDb, tenant, config_tr, options, intrans);
+					config_tr->set(t.pack(), tokens[2]);
+					if (!intrans) {
+						// prompt for description and add to txn
+						state Optional<std::string> raw_desc;
+						while (!raw_desc.present() || raw_desc.get().empty()) {
+							fprintf(stdout,
+							        "Please set a description for the change. Description must be non-empty\n");
+							state Optional<std::string> rawline_knob =
+							    wait(makeInterruptable(linenoise.read("description: ")));
+							raw_desc = rawline_knob;
+						}
+						std::string line = raw_desc.get();
+						config_tr->set("\xff\xff/description"_sr, line);
+						wait(commitTransaction(config_tr));
+					} else {
+						isCommitDesc = true;
+					}
+				}
+				continue;
+			}
+
+			if (tokencmp(tokens[0], "getknob")) {
+				if (tokens.size() > 3 || tokens.size() < 2) {
+					printUsage(tokens[0]);
+					is_error = true;
+				} else {
+					if (intrans) {
+						if (transtype == TransType::None) {
+							transtype = TransType::Config;
+						} else if (transtype == TransType::Db) {
+							fprintf(stderr, "ERROR: Cannot perform getknob in database transaction\n");
+							is_error = true;
+							continue;
+						}
+					}
+					Tuple t;
+					if (tokens.size() == 2) {
+						t.appendNull();
+					} else {
+						t.append(tokens[2]);
+					}
+					t.append(tokens[1]);
+					state ThreadFuture<Optional<Value>> valueF_knob =
+					    getTransaction(configDb, tenant, config_tr, options, intrans)->get(t.pack());
+					Optional<Standalone<StringRef>> v =
+					    wait(makeInterruptable(safeThreadFutureToFuture(valueF_knob)));
+					std::string knob_class = printable(tokens[1]);
+					if (tokens.size() == 3) {
+						std::string config_class = (" in configuration class " + printable(tokens[2]));
+						knob_class += config_class;
+					}
+					if (v.present())
+						printf("`%s' is `%s'\n",
+						       knob_class.c_str(),
+						       Tuple::tupleToString(Tuple::unpack(v.get())).c_str());
+					else
+						printf("`%s' is not found\n", knob_class.c_str());
+				}
+				continue;
+			}
+
 			if (tokencmp(tokens[0], "clear")) {
 				if (!writeMode) {
 					fprintf(stderr, "ERROR: writemode must be enabled to set or clear keys in the database.\n");
@@ -1728,6 +1889,15 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
 					printUsage(tokens[0]);
 					is_error = true;
 				} else {
+					if (intrans) {
+						if (transtype == TransType::None) {
+							transtype = TransType::Db;
+						} else if (transtype == TransType::Config) {
+							fprintf(stderr, "ERROR: Cannot perform clear in configuration transaction\n");
+							is_error = true;
+							continue;
+						}
+					}
 					getTransaction(db, tenant, tr, options, intrans);
 					tr->clear(tokens[1]);
@@ -1749,6 +1919,15 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
 					printUsage(tokens[0]);
 					is_error = true;
 				} else {
+					if (intrans) {
+						if (transtype == TransType::None) {
+							transtype = TransType::Db;
+						} else if (transtype == TransType::Config) {
+							fprintf(stderr, "ERROR: Cannot perform clearrange in configuration transaction\n");
+							is_error = true;
+							continue;
+						}
+					}
 					getTransaction(db, tenant, tr, options, intrans);
 					tr->clear(KeyRangeRef(tokens[1], tokens[2]));
@@ -1928,7 +2107,6 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
 			}
 			TraceEvent(SevInfo, "CLICommandLog", randomID).detail("Command", line).detail("IsError", is_error);
 		} catch (Error& e) {
-
 			if (e.code() == error_code_operation_cancelled) {
 				throw;

View File

@@ -334,6 +334,57 @@ def consistencycheck(logger):
     assert output3 == consistency_check_on_output


+@enable_logging()
+def knobmanagement(logger):
+    # this test will set knobs and verify that the knobs are properly set
+    # must use begin/commit to avoid prompt for description
+
+    # Incorrect arguments
+    output = run_fdbcli_command('setknob')
+    assert output == "Usage: setknob <KEY> <VALUE> [CONFIG_CLASS]"
+    output = run_fdbcli_command('setknob', 'min_trace_severity')
+    assert output == "Usage: setknob <KEY> <VALUE> [CONFIG_CLASS]"
+    output = run_fdbcli_command('getknob')
+    assert output == "Usage: getknob <KEY> [CONFIG_CLASS]"
+    logger.debug("incorrect args passed")
+
+    # Invalid knob name
+    err = run_fdbcli_command_and_get_error('begin; setknob dummy_knob 20; commit \"fdbcli change\";')
+    logger.debug("err is: {}".format(err))
+    assert len(err) > 0
+    logger.debug("invalid knob name passed")
+
+    # Invalid type for knob
+    err = run_fdbcli_command_and_get_error('begin; setknob min_trace_severity dummy-text; commit \"fdbcli change\";')
+    logger.debug("err is: {}".format(err))
+    assert len(err) > 0
+    logger.debug("invalid knob type passed")
+
+    # Verifying we can't do a normal set, clear, get, getrange, clearrange
+    # with a setknob
+    err = run_fdbcli_command_and_get_error('writemode on; begin; set foo bar; setknob max_metric_size 1000; commit;')
+    logger.debug("err is: {}".format(err))
+    assert len(err) > 0
+
+    err = run_fdbcli_command_and_get_error('writemode on; begin; clear foo; setknob max_metric_size 1000; commit')
+    logger.debug("err is: {}".format(err))
+    assert len(err) > 0
+
+    # Various setknobs and verified by getknob
+    output = run_fdbcli_command('begin; setknob min_trace_severity 30; setknob max_metric_size 1000; \
setknob tracing_udp_listener_addr 192.168.0.1; \
setknob tracing_sample_rate 0.3; \
commit \"This is an fdbcli test for knobs\";')
+    assert "Committed" in output
+
+    output = run_fdbcli_command('getknob', 'min_trace_severity')
+    assert r"`min_trace_severity' is `30'" == output
+
+    output = run_fdbcli_command('getknob', 'max_metric_size')
+    assert r"`max_metric_size' is `1000'" == output
+
+    output = run_fdbcli_command('getknob', 'tracing_udp_listener_addr')
+    assert r"`tracing_udp_listener_addr' is `'192.168.0.1''" == output
+
+    output = run_fdbcli_command('getknob', 'tracing_sample_rate')
+    assert r"`tracing_sample_rate' is `0.300000'" == output
+
+
 @enable_logging()
 def cache_range(logger):
     # this command is currently experimental
@@ -983,6 +1034,7 @@ if __name__ == '__main__':
         versionepoch()
         integer_options()
         tls_address_suffix()
+        knobmanagement()
     else:
         assert args.process_number > 1, "Process number should be positive"
         coordinators()

View File

@ -86,6 +86,26 @@ BlobCipherMetrics::BlobCipherMetrics()
traceFuture = traceCounters("BlobCipherMetrics", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, &cc); traceFuture = traceCounters("BlobCipherMetrics", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, &cc);
} }
std::string toString(BlobCipherMetrics::UsageType type) {
switch (type) {
case BlobCipherMetrics::UsageType::TLOG:
return "TLog";
case BlobCipherMetrics::UsageType::KV_MEMORY:
return "KVMemory";
case BlobCipherMetrics::UsageType::KV_REDWOOD:
return "KVRedwood";
case BlobCipherMetrics::UsageType::BLOB_GRANULE:
return "BlobGranule";
case BlobCipherMetrics::UsageType::BACKUP:
return "Backup";
case BlobCipherMetrics::UsageType::TEST:
return "Test";
default:
ASSERT(false);
return "";
}
}
// BlobCipherKey class methods // BlobCipherKey class methods
BlobCipherKey::BlobCipherKey(const EncryptCipherDomainId& domainId, BlobCipherKey::BlobCipherKey(const EncryptCipherDomainId& domainId,
@ -636,34 +656,17 @@ Reference<EncryptBuf> EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plainte
} else { } else {
// Populate header authToken details // Populate header authToken details
if (header->flags.authTokenMode == EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE) { ASSERT_EQ(header->flags.authTokenMode, EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE);
ASSERT_GE(allocSize, (bytes + finalBytes)); ASSERT_GE(allocSize, (bytes + finalBytes));
ASSERT_GE(encryptBuf->getLogicalSize(), (bytes + finalBytes)); ASSERT_GE(encryptBuf->getLogicalSize(), (bytes + finalBytes));
computeAuthToken({ { ciphertext, bytes + finalBytes }, computeAuthToken({ { ciphertext, bytes + finalBytes },
{ reinterpret_cast<const uint8_t*>(header), sizeof(BlobCipherEncryptHeader) } }, { reinterpret_cast<const uint8_t*>(header), sizeof(BlobCipherEncryptHeader) } },
headerCipherKey->rawCipher(), headerCipherKey->rawCipher(),
AES_256_KEY_LENGTH, AES_256_KEY_LENGTH,
&header->singleAuthToken.authToken[0], &header->singleAuthToken.authToken[0],
(EncryptAuthTokenAlgo)header->flags.authTokenAlgo, (EncryptAuthTokenAlgo)header->flags.authTokenAlgo,
AUTH_TOKEN_MAX_SIZE); AUTH_TOKEN_MAX_SIZE);
} else {
ASSERT_EQ(header->flags.authTokenMode, EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);
// TOOD: Use HMAC_SHA encyrption authentication scheme as AES_CMAC needs minimum 16 bytes cipher key
computeAuthToken({ { ciphertext, bytes + finalBytes } },
reinterpret_cast<const uint8_t*>(&header->cipherTextDetails.salt),
sizeof(EncryptCipherRandomSalt),
&header->multiAuthTokens.cipherTextAuthToken[0],
EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA,
AUTH_TOKEN_MAX_SIZE);
computeAuthToken({ { reinterpret_cast<const uint8_t*>(header), sizeof(BlobCipherEncryptHeader) } },
headerCipherKey->rawCipher(),
AES_256_KEY_LENGTH,
&header->multiAuthTokens.headerAuthToken[0],
(EncryptAuthTokenAlgo)header->flags.authTokenAlgo,
AUTH_TOKEN_MAX_SIZE);
}
} }
encryptBuf->setLogicalSize(plaintextLen); encryptBuf->setLogicalSize(plaintextLen);
@ -708,44 +711,6 @@ DecryptBlobCipherAes256Ctr::DecryptBlobCipherAes256Ctr(Reference<BlobCipherKey>
} }
} }
void DecryptBlobCipherAes256Ctr::verifyHeaderAuthToken(const BlobCipherEncryptHeader& header, Arena& arena) {
if (header.flags.authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI) {
// NoneAuthToken mode; no authToken is generated; nothing to do
// SingleAuthToken mode; verification will happen as part of decryption.
return;
}
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);
ASSERT(isEncryptHeaderAuthTokenAlgoValid((EncryptAuthTokenAlgo)header.flags.authTokenAlgo));
BlobCipherEncryptHeader headerCopy;
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
memset(reinterpret_cast<uint8_t*>(&headerCopy.multiAuthTokens.headerAuthToken), 0, AUTH_TOKEN_MAX_SIZE);
uint8_t computedHeaderAuthToken[AUTH_TOKEN_MAX_SIZE]{};
computeAuthToken({ { reinterpret_cast<const uint8_t*>(&headerCopy), sizeof(BlobCipherEncryptHeader) } },
headerCipherKey->rawCipher(),
AES_256_KEY_LENGTH,
&computedHeaderAuthToken[0],
(EncryptAuthTokenAlgo)header.flags.authTokenAlgo,
AUTH_TOKEN_MAX_SIZE);
int authTokenSize = getEncryptHeaderAuthTokenSize(header.flags.authTokenAlgo);
ASSERT_LE(authTokenSize, AUTH_TOKEN_MAX_SIZE);
if (memcmp(&header.multiAuthTokens.headerAuthToken[0], &computedHeaderAuthToken[0], authTokenSize) != 0) {
TraceEvent(SevWarn, "BlobCipherVerifyEncryptBlobHeaderAuthTokenMismatch")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderMode", header.flags.encryptMode)
.detail("MultiAuthHeaderAuthToken",
StringRef(arena, &header.multiAuthTokens.headerAuthToken[0], AUTH_TOKEN_MAX_SIZE).toString())
.detail("ComputedHeaderAuthToken", StringRef(computedHeaderAuthToken, AUTH_TOKEN_MAX_SIZE));
throw encrypt_header_authtoken_mismatch();
}
headerAuthTokenValidationDone = true;
}
void DecryptBlobCipherAes256Ctr::verifyHeaderSingleAuthToken(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
@ -759,7 +724,7 @@ void DecryptBlobCipherAes256Ctr::verifyHeaderSingleAuthToken(const uint8_t* ciph
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
memset(reinterpret_cast<uint8_t*>(&headerCopy.singleAuthToken), 0, 2 * AUTH_TOKEN_MAX_SIZE);
memset(reinterpret_cast<uint8_t*>(&headerCopy.singleAuthToken), 0, AUTH_TOKEN_MAX_SIZE);
uint8_t computed[AUTH_TOKEN_MAX_SIZE];
computeAuthToken({ { ciphertext, ciphertextLen },
{ reinterpret_cast<const uint8_t*>(&headerCopy), sizeof(BlobCipherEncryptHeader) } },
@ -782,43 +747,12 @@ void DecryptBlobCipherAes256Ctr::verifyHeaderSingleAuthToken(const uint8_t* ciph
}
}
void DecryptBlobCipherAes256Ctr::verifyHeaderMultiAuthToken(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
Arena& arena) {
if (!headerAuthTokenValidationDone) {
verifyHeaderAuthToken(header, arena);
}
uint8_t computedCipherTextAuthToken[AUTH_TOKEN_MAX_SIZE];
// TODO: Use HMAC_SHA encryption authentication scheme as AES_CMAC needs minimum 16 bytes cipher key
computeAuthToken({ { ciphertext, ciphertextLen } },
reinterpret_cast<const uint8_t*>(&header.cipherTextDetails.salt),
sizeof(EncryptCipherRandomSalt),
&computedCipherTextAuthToken[0],
EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA,
AUTH_TOKEN_MAX_SIZE);
if (memcmp(&header.multiAuthTokens.cipherTextAuthToken[0], &computedCipherTextAuthToken[0], AUTH_TOKEN_MAX_SIZE) !=
0) {
TraceEvent(SevWarn, "BlobCipherVerifyEncryptBlobHeaderAuthTokenMismatch")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderMode", header.flags.encryptMode)
.detail("MultiAuthCipherTextAuthToken",
StringRef(arena, &header.multiAuthTokens.cipherTextAuthToken[0], AUTH_TOKEN_MAX_SIZE).toString())
.detail("ComputedCipherTextAuthToken", StringRef(computedCipherTextAuthToken, AUTH_TOKEN_MAX_SIZE));
throw encrypt_header_authtoken_mismatch();
}
}
void DecryptBlobCipherAes256Ctr::verifyAuthTokens(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
Arena& arena) {
if (header.flags.authTokenMode == EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE) {
ASSERT_EQ(header.flags.authTokenMode, EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE);
verifyHeaderSingleAuthToken(ciphertext, ciphertextLen, header, arena);
} else {
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);
verifyHeaderMultiAuthToken(ciphertext, ciphertextLen, header, arena);
}
authTokensValidationDone = true;
}
@ -1504,266 +1438,6 @@ TEST_CASE("flow/BlobCipher") {
TraceEvent("SingleAuthModeAesCmacDone");
}
// validate basic encrypt followed by decrypt operation for AUTH_TOKEN_MODE_MULTI
// HMAC_SHA authToken algorithm
{
TraceEvent("MultiAuthModeHmacShaStart").log();
EncryptBlobCipherAes265Ctr encryptor(cipherKey,
headerCipherKey,
iv,
AES_256_IV_LENGTH,
EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI,
EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA,
BlobCipherMetrics::TEST);
BlobCipherEncryptHeader header;
Reference<EncryptBuf> encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
ASSERT_EQ(encrypted->getLogicalSize(), bufLen);
ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0);
ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR);
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);
ASSERT_EQ(header.flags.authTokenAlgo, EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA);
TraceEvent("BlobCipherTestEncryptDone")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderEncryptMode", header.flags.encryptMode)
.detail("HeaderEncryptAuthTokenMode", header.flags.authTokenMode)
.detail("HeaderEncryptAuthTokenAlgo", header.flags.authTokenAlgo)
.detail("DomainId", header.cipherTextDetails.encryptDomainId)
.detail("BaseCipherId", header.cipherTextDetails.baseCipherId)
.detail("HeaderAuthToken",
StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_HMAC_SHA_SIZE).toString());
Reference<BlobCipherKey> tCipherKey = cipherKeyCache->getCipherKey(header.cipherTextDetails.encryptDomainId,
header.cipherTextDetails.baseCipherId,
header.cipherTextDetails.salt);
Reference<BlobCipherKey> hCipherKey = cipherKeyCache->getCipherKey(header.cipherHeaderDetails.encryptDomainId,
header.cipherHeaderDetails.baseCipherId,
header.cipherHeaderDetails.salt);
ASSERT(tCipherKey->isEqual(cipherKey));
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST);
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
ASSERT_EQ(decrypted->getLogicalSize(), bufLen);
ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0);
TraceEvent("BlobCipherTestDecryptDone").log();
// induce encryption header corruption - headerVersion corrupted
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
headerCopy.flags.headerVersion += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
}
}
// induce encryption header corruption - encryptionMode corrupted
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
headerCopy.flags.encryptMode += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
}
}
// induce encryption header corruption - cipherText authToken mismatch
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
int hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_HMAC_SHA_SIZE - 1);
headerCopy.multiAuthTokens.cipherTextAuthToken[hIdx] += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
throw;
}
}
// induce encryption header corruption - header authToken mismatch
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_HMAC_SHA_SIZE - 1);
headerCopy.multiAuthTokens.headerAuthToken[hIdx] += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
throw;
}
}
try {
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
uint8_t temp[bufLen];
memcpy(encrypted->begin(), &temp[0], bufLen);
int tIdx = deterministicRandom()->randomInt(0, bufLen - 1);
temp[tIdx] += 1;
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST);
decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena);
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
throw;
}
}
TraceEvent("MultiAuthModeHmacShaDone");
}
// AES_CMAC authToken algorithm
{
TraceEvent("MultiAuthModeAesCmacStart");
EncryptBlobCipherAes265Ctr encryptor(cipherKey,
headerCipherKey,
iv,
AES_256_IV_LENGTH,
EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI,
EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC,
BlobCipherMetrics::TEST);
BlobCipherEncryptHeader header;
Reference<EncryptBuf> encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
ASSERT_EQ(encrypted->getLogicalSize(), bufLen);
ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0);
ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR);
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);
ASSERT_EQ(header.flags.authTokenAlgo, EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC);
TraceEvent("BlobCipherTestEncryptDone")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderEncryptMode", header.flags.encryptMode)
.detail("HeaderEncryptAuthTokenMode", header.flags.authTokenMode)
.detail("HeaderEncryptAuthTokenAlgo", header.flags.authTokenAlgo)
.detail("DomainId", header.cipherTextDetails.encryptDomainId)
.detail("BaseCipherId", header.cipherTextDetails.baseCipherId)
.detail("HeaderAuthToken",
StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_AES_CMAC_SIZE).toString());
Reference<BlobCipherKey> tCipherKey = cipherKeyCache->getCipherKey(header.cipherTextDetails.encryptDomainId,
header.cipherTextDetails.baseCipherId,
header.cipherTextDetails.salt);
Reference<BlobCipherKey> hCipherKey = cipherKeyCache->getCipherKey(header.cipherHeaderDetails.encryptDomainId,
header.cipherHeaderDetails.baseCipherId,
header.cipherHeaderDetails.salt);
ASSERT(tCipherKey->isEqual(cipherKey));
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST);
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
ASSERT_EQ(decrypted->getLogicalSize(), bufLen);
ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0);
TraceEvent("BlobCipherTestDecryptDone").log();
// induce encryption header corruption - headerVersion corrupted
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
headerCopy.flags.headerVersion += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
}
}
// induce encryption header corruption - encryptionMode corrupted
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
headerCopy.flags.encryptMode += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
}
}
// induce encryption header corruption - cipherText authToken mismatch
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
int hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_AES_CMAC_SIZE - 1);
headerCopy.multiAuthTokens.cipherTextAuthToken[hIdx] += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
throw;
}
}
// induce encryption header corruption - header authToken mismatch
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_AES_CMAC_SIZE - 1);
headerCopy.multiAuthTokens.headerAuthToken[hIdx] += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
throw;
}
}
try {
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
uint8_t temp[bufLen];
memcpy(encrypted->begin(), &temp[0], bufLen);
int tIdx = deterministicRandom()->randomInt(0, bufLen - 1);
temp[tIdx] += 1;
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, header.iv, BlobCipherMetrics::TEST);
decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena);
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
throw;
}
}
TraceEvent("MultiAuthModeAesCmacDone");
}
// Validate dropping encryptDomainId cached keys
const EncryptCipherDomainId candidate = deterministicRandom()->randomInt(minDomainId, maxDomainId);
cipherKeyCache->resetEncryptDomainId(candidate);

@ -33,6 +33,12 @@ public:
SingleBlobConnectionProvider(std::string url) { conn = BackupContainerFileSystem::openContainerFS(url, {}, {}); }
bool needsRefresh() const { return false; }
bool isExpired() const { return false; }
void update(Standalone<BlobMetadataDetailsRef> newBlobMetadata) { ASSERT(false); }
private:
Reference<BackupContainerFileSystem> conn;
};
@ -44,18 +50,42 @@ struct PartitionedBlobConnectionProvider : BlobConnectionProvider {
return std::pair(conn, metadata.partitions[writePartition].toString() + newFileName);
}
Reference<BackupContainerFileSystem> getForRead(std::string filePath) { return conn; }
Reference<BackupContainerFileSystem> getForRead(std::string filePath) {
CODE_PROBE(isExpired(), "partitioned blob connection using expired blob metadata for read!");
return conn;
}
PartitionedBlobConnectionProvider(const Standalone<BlobMetadataDetailsRef> metadata) : metadata(metadata) {
ASSERT(metadata.base.present());
ASSERT(metadata.partitions.size() >= 2);
conn = BackupContainerFileSystem::openContainerFS(metadata.base.get().toString(), {}, {});
for (auto& it : metadata.partitions) {
void updateMetadata(const Standalone<BlobMetadataDetailsRef>& newMetadata, bool checkPrevious) {
ASSERT(newMetadata.base.present());
ASSERT(newMetadata.partitions.size() >= 2);
for (auto& it : newMetadata.partitions) {
// these should be suffixes, not whole blob urls
ASSERT(it.toString().find("://") == std::string::npos);
}
if (checkPrevious) {
if (newMetadata.expireAt <= metadata.expireAt) {
return;
}
// FIXME: validate only the credentials changed and the location is the same
ASSERT(newMetadata.partitions.size() == metadata.partitions.size());
for (int i = 0; i < newMetadata.partitions.size(); i++) {
ASSERT(newMetadata.partitions[i] == metadata.partitions[i]);
}
}
metadata = newMetadata;
conn = BackupContainerFileSystem::openContainerFS(metadata.base.get().toString(), {}, {});
}
PartitionedBlobConnectionProvider(const Standalone<BlobMetadataDetailsRef> metadata) {
updateMetadata(metadata, false);
}
bool needsRefresh() const { return now() >= metadata.refreshAt; }
bool isExpired() const { return now() >= metadata.expireAt; }
void update(Standalone<BlobMetadataDetailsRef> newBlobMetadata) { updateMetadata(newBlobMetadata, true); }
private:
Standalone<BlobMetadataDetailsRef> metadata;
Reference<BackupContainerFileSystem> conn;
@ -72,6 +102,7 @@ struct StorageLocationBlobConnectionProvider : BlobConnectionProvider {
}
Reference<BackupContainerFileSystem> getForRead(std::string filePath) {
CODE_PROBE(isExpired(), "storage location blob connection using expired blob metadata for read!");
size_t slash = filePath.find("/");
ASSERT(slash != std::string::npos);
int partition = stoi(filePath.substr(0, slash));
@ -80,9 +111,18 @@ struct StorageLocationBlobConnectionProvider : BlobConnectionProvider {
return partitions[partition];
}
StorageLocationBlobConnectionProvider(const Standalone<BlobMetadataDetailsRef> metadata) {
ASSERT(!metadata.base.present());
ASSERT(metadata.partitions.size() >= 2);
void updateMetadata(const Standalone<BlobMetadataDetailsRef>& newMetadata, bool checkPrevious) {
ASSERT(!newMetadata.base.present());
ASSERT(newMetadata.partitions.size() >= 2);
if (checkPrevious) {
// FIXME: validate only the credentials changed and the locations are the same
ASSERT(newMetadata.partitions.size() == partitions.size());
if (newMetadata.expireAt <= metadata.expireAt) {
return;
}
}
metadata = newMetadata;
partitions.clear();
for (auto& it : metadata.partitions) {
// these should be whole blob urls
ASSERT(it.toString().find("://") != std::string::npos);
@ -90,7 +130,18 @@ struct StorageLocationBlobConnectionProvider : BlobConnectionProvider {
}
}
StorageLocationBlobConnectionProvider(const Standalone<BlobMetadataDetailsRef> metadata) {
updateMetadata(metadata, false);
}
bool needsRefresh() const { return now() >= metadata.refreshAt; }
bool isExpired() const { return now() >= metadata.expireAt; }
void update(Standalone<BlobMetadataDetailsRef> newBlobMetadata) { updateMetadata(newBlobMetadata, true); }
private:
Standalone<BlobMetadataDetailsRef> metadata;
std::vector<Reference<BackupContainerFileSystem>> partitions;
};
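// Illustrative refresh flow for the API above (a sketch, not part of this change; fetchBlobMetadata
// is a hypothetical stand-in for however the caller re-fetches metadata):
//   if (provider->needsRefresh()) {
//       Standalone<BlobMetadataDetailsRef> fresh = wait(fetchBlobMetadata(domainId)); // hypothetical
//       provider->update(fresh); // ignored unless fresh.expireAt advances past the current metadata
//   }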

@ -198,6 +198,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( DEFAULT_AUTO_LOGS, 3 );
init( DEFAULT_COMMIT_GRV_PROXIES_RATIO, 3 );
init( DEFAULT_MAX_GRV_PROXIES, 4 );
init( DELETE_NATIVE_LIB_AFTER_LOADING, true ); // if false, don't delete libfdb_c in tmp directory on client connect.
init( GLOBAL_CONFIG_REFRESH_BACKOFF, 0.5 );
init( GLOBAL_CONFIG_REFRESH_MAX_BACKOFF, 60.0 );

@ -44,19 +44,20 @@ ConfigKey ConfigKeyRef::decodeKey(KeyRef const& key) {
}
Value KnobValueRef::ToValueFunc::operator()(int v) const {
return BinaryWriter::toValue(v, Unversioned());
// return BinaryWriter::toValue(v, Unversioned());
return Tuple::makeTuple(v).pack();
}
Value KnobValueRef::ToValueFunc::operator()(int64_t v) const {
return BinaryWriter::toValue(v, Unversioned());
return Tuple::makeTuple(v).pack();
}
Value KnobValueRef::ToValueFunc::operator()(bool v) const {
return BinaryWriter::toValue(v, Unversioned());
return Tuple::makeTuple(v).pack();
}
Value KnobValueRef::ToValueFunc::operator()(ValueRef v) const {
return v;
return Tuple::makeTuple(v).pack();
}
Value KnobValueRef::ToValueFunc::operator()(double v) const {
return BinaryWriter::toValue(v, Unversioned());
return Tuple::makeTuple(v).pack();
}
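// Illustrative round-trip under the new encoding (a sketch, not part of the diff; assumes the
// standard fdbclient Tuple API):
//   Value packed = Tuple::makeTuple((int64_t)42).pack();
//   Tuple t = Tuple::unpack(packed);
//   ASSERT(t.getInt(0) == 42);
// Unlike raw Unversioned BinaryWriter bytes, Tuple-packed knob values are self-describing.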
KnobValue KnobValueRef::CreatorFunc::operator()(NoKnobFound) const {

@ -23,6 +23,7 @@
#include "fdbclient/BackupContainer.h" #include "fdbclient/BackupContainer.h"
#include "fdbclient/BlobCipher.h" #include "fdbclient/BlobCipher.h"
#include "fdbclient/DatabaseContext.h" #include "fdbclient/DatabaseContext.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/GetEncryptCipherKeys.actor.h" #include "fdbclient/GetEncryptCipherKeys.actor.h"
#include "fdbclient/JsonBuilder.h" #include "fdbclient/JsonBuilder.h"
#include "fdbclient/KeyBackedTypes.h" #include "fdbclient/KeyBackedTypes.h"
@ -649,10 +650,8 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
return Void();
}
ACTOR static Future<Void> updateEncryptionKeysCtx(EncryptedRangeFileWriter* self,
KeyRef key,
Reference<TenantEntryCache<Void>> cache) {
state std::pair<int64_t, TenantName> curTenantInfo = wait(getEncryptionDomainDetails(key, cache));
ACTOR static Future<Void> updateEncryptionKeysCtx(EncryptedRangeFileWriter* self, KeyRef key) {
state std::pair<int64_t, TenantName> curTenantInfo = wait(getEncryptionDomainDetails(key, self));
state Reference<AsyncVar<ClientDBInfo> const> dbInfo = self->cx->clientInfo;
// Get text and header cipher key
@ -694,13 +693,12 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
static bool isSystemKey(KeyRef key) { return key.size() && key[0] == systemKeys.begin[0]; }
ACTOR static Future<std::pair<int64_t, TenantName>> getEncryptionDomainDetailsImpl(
KeyRef key,
Reference<TenantEntryCache<Void>> tenantCache) {
ACTOR static Future<std::pair<int64_t, TenantName>>
getEncryptionDomainDetailsImpl(KeyRef key, Reference<TenantEntryCache<Void>> tenantCache, bool useTenantCache) {
if (isSystemKey(key)) {
return std::make_pair(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
}
if (key.size() < TENANT_PREFIX_SIZE) {
if (key.size() < TENANT_PREFIX_SIZE || !useTenantCache) {
return std::make_pair(FDB_DEFAULT_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
}
KeyRef tenantPrefix = KeyRef(key.begin(), TENANT_PREFIX_SIZE);
@ -712,10 +710,21 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
return std::make_pair(FDB_DEFAULT_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
}
static Future<std::pair<int64_t, TenantName>> getEncryptionDomainDetails(
KeyRef key,
Reference<TenantEntryCache<Void>> tenantCache) {
return getEncryptionDomainDetailsImpl(key, tenantCache);
static Future<std::pair<int64_t, TenantName>> getEncryptionDomainDetails(KeyRef key,
EncryptedRangeFileWriter* self) {
// If tenants are disabled on a cluster then don't use the TenantEntryCache as it will result in a lot of
// unnecessary cache misses. For a cluster configured in TenantMode::Optional, the backup performance may
// degrade if most of the mutations belong to an invalid tenant
TenantMode mode = self->cx->clientInfo->get().tenantMode;
bool useTenantCache = mode != TenantMode::DISABLED;
if (g_network->isSimulated() && mode == TenantMode::OPTIONAL_TENANT) {
// TODO: Currently simulation tests run with optional tenant mode but most data does not belong to any
// tenant. This results in many timeouts so disable using the tenant cache until optional tenant mode
// support with backups is more performant
useTenantCache = false;
}
CODE_PROBE(useTenantCache, "using tenant cache");
return getEncryptionDomainDetailsImpl(key, self->tenantCache, useTenantCache);
}
// Handles the first block and internal blocks. Ends current block if needed.
@ -813,7 +822,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
appendStringRefWithLenToBuffer(self, &endKey); appendStringRefWithLenToBuffer(self, &endKey);
appendStringRefWithLenToBuffer(self, &newValue); appendStringRefWithLenToBuffer(self, &newValue);
wait(newBlock(self, 0, endKey, writeValue)); wait(newBlock(self, 0, endKey, writeValue));
wait(updateEncryptionKeysCtx(self, self->lastKey, self->tenantCache)); wait(updateEncryptionKeysCtx(self, self->lastKey));
return Void(); return Void();
} }
@ -825,9 +834,8 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
if (self->lastKey.size() == 0 || k.size() == 0) {
return false;
}
state std::pair<int64_t, TenantName> curKeyTenantInfo = wait(getEncryptionDomainDetails(k, self->tenantCache));
state std::pair<int64_t, TenantName> prevKeyTenantInfo =
wait(getEncryptionDomainDetails(self->lastKey, self->tenantCache));
state std::pair<int64_t, TenantName> curKeyTenantInfo = wait(getEncryptionDomainDetails(k, self));
state std::pair<int64_t, TenantName> prevKeyTenantInfo = wait(getEncryptionDomainDetails(self->lastKey, self));
// crossing tenant boundaries so finish the current block using only the tenant prefix of the new key
if (curKeyTenantInfo.first != prevKeyTenantInfo.first) {
CODE_PROBE(true, "crossed tenant boundaries");
@ -840,7 +848,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
// Start a new block if needed, then write the key and value
ACTOR static Future<Void> writeKV_impl(EncryptedRangeFileWriter* self, Key k, Value v) {
if (!self->cipherKeys.headerCipherKey.isValid() || !self->cipherKeys.textCipherKey.isValid()) {
wait(updateEncryptionKeysCtx(self, k, self->tenantCache));
wait(updateEncryptionKeysCtx(self, k));
}
state int toWrite = sizeof(int32_t) + k.size() + sizeof(int32_t) + v.size();
wait(newBlockIfNeeded(self, toWrite));
@ -862,7 +870,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
// TODO (Nim): Is it possible to write empty begin and end keys?
if (k.size() > 0 &&
(!self->cipherKeys.headerCipherKey.isValid() || !self->cipherKeys.textCipherKey.isValid())) {
wait(updateEncryptionKeysCtx(self, k, self->tenantCache));
wait(updateEncryptionKeysCtx(self, k));
}
// Need to account for extra "empty" value being written in the case of crossing tenant boundaries
@ -1035,8 +1043,7 @@ private:
ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
Standalone<VectorRef<KeyValueRef>>* results,
bool encryptedBlock,
Optional<Database> cx,
Reference<TenantEntryCache<Void>> tenantCache) {
Optional<Database> cx) {
// Read begin key, if this fails then block was invalid.
state uint32_t kLen = reader->consumeNetworkUInt32();
state const uint8_t* k = reader->consume(kLen);
@ -1091,7 +1098,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
// BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION
int32_t file_version = reader.consume<int32_t>();
if (file_version == BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {
wait(decodeKVPairs(&reader, &results, false, cx, Reference<TenantEntryCache<Void>>()));
wait(decodeKVPairs(&reader, &results, false, cx));
} else if (file_version == BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) {
CODE_PROBE(true, "decoding encrypted block");
ASSERT(cx.present());
@ -1114,8 +1121,7 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
StringRef decryptedData =
wait(EncryptedRangeFileWriter::decrypt(cx.get(), header, dataPayloadStart, dataLen, &results.arena()));
reader = StringRefReader(decryptedData, restore_corrupted_data());
Reference<TenantEntryCache<Void>> tenantCache = makeReference<TenantEntryCache<Void>>(cx.get());
wait(decodeKVPairs(&reader, &results, true, cx, tenantCache));
wait(decodeKVPairs(&reader, &results, true, cx));
} else {
throw restore_unsupported_file_version();
}

fdbclient/IdempotencyId.cpp (new file)
@ -0,0 +1,174 @@
/*
* IdempotencyId.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/IdempotencyId.h"
#include "fdbclient/SystemData.h"
#include "flow/UnitTest.h"
struct IdempotencyIdKVBuilderImpl {
Optional<Version> commitVersion;
Optional<uint8_t> batchIndexHighOrderByte;
BinaryWriter value{ IncludeVersion() };
};
IdempotencyIdKVBuilder::IdempotencyIdKVBuilder() : impl(PImpl<IdempotencyIdKVBuilderImpl>::create()) {}
void IdempotencyIdKVBuilder::setCommitVersion(Version commitVersion) {
impl->commitVersion = commitVersion;
}
void IdempotencyIdKVBuilder::add(const IdempotencyIdRef& id, uint16_t batchIndex) {
ASSERT(id.valid());
if (impl->batchIndexHighOrderByte.present()) {
ASSERT((batchIndex >> 8) == impl->batchIndexHighOrderByte.get());
} else {
impl->batchIndexHighOrderByte = batchIndex >> 8;
}
StringRef s = id.asStringRefUnsafe();
impl->value << uint8_t(s.size());
impl->value.serializeBytes(s);
impl->value << uint8_t(batchIndex); // Low order byte of batchIndex
}
Optional<KeyValue> IdempotencyIdKVBuilder::buildAndClear() {
ASSERT(impl->commitVersion.present());
if (!impl->batchIndexHighOrderByte.present()) {
return {};
}
BinaryWriter key{ Unversioned() };
key.serializeBytes(idempotencyIdKeys.begin);
key << bigEndian64(impl->commitVersion.get());
key << impl->batchIndexHighOrderByte.get();
Value v = impl->value.toValue();
impl->value = BinaryWriter(IncludeVersion());
impl->batchIndexHighOrderByte = Optional<uint8_t>();
Optional<KeyValue> result = KeyValue();
result.get().arena() = v.arena();
result.get().key = key.toValue(result.get().arena());
result.get().value = v;
return result;
}
IdempotencyIdKVBuilder::~IdempotencyIdKVBuilder() = default;
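// Resulting layout, as read off the builder above (for reference, not normative):
//   key   = idempotencyIdKeys.begin + bigEndian64(commitVersion) + batchIndexHighOrderByte
//   value = IncludeVersion() header, then per id: uint8_t idLength | id bytes | uint8_t batchIndexLowOrderByte
// so every id sharing a commit version and high-order batch byte packs into one key-value pair.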
Optional<CommitResult> kvContainsIdempotencyId(const KeyValueRef& kv, const IdempotencyIdRef& id) {
ASSERT(id.valid());
StringRef needle = id.asStringRefUnsafe();
StringRef haystack = kv.value;
#ifndef _WIN32
// The common case is that the kv does not contain the idempotency id, so early return if memmem is available
if (memmem(haystack.begin(), haystack.size(), needle.begin(), needle.size()) == nullptr) {
return {};
}
#endif
// Even if id is a substring of value, it may still not actually contain it.
BinaryReader reader(kv.value.begin(), kv.value.size(), IncludeVersion());
while (!reader.empty()) {
uint8_t length;
reader >> length;
StringRef candidate{ reinterpret_cast<const uint8_t*>(reader.readBytes(length)), length };
uint8_t lowOrderBatchIndex;
reader >> lowOrderBatchIndex;
if (candidate == needle) {
BinaryReader reader(kv.key.begin(), kv.key.size(), Unversioned());
reader.readBytes(idempotencyIdKeys.begin.size());
Version commitVersion;
reader >> commitVersion;
commitVersion = bigEndian64(commitVersion);
uint8_t highOrderBatchIndex;
reader >> highOrderBatchIndex;
return CommitResult{ commitVersion,
static_cast<uint16_t>((uint16_t(highOrderBatchIndex) << 8) |
uint16_t(lowOrderBatchIndex)) };
}
}
return {};
}
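// Illustrative lookup (a sketch, assuming `range` was read from the idempotencyIdKeys keyspace
// and `myId` is the IdempotencyIdRef being searched for):
//   for (const KeyValueRef& kv : range) {
//       Optional<CommitResult> r = kvContainsIdempotencyId(kv, myId);
//       if (r.present()) {
//           // committed at r.get().commitVersion with batch index r.get().batchIndex
//       }
//   }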
void forceLinkIdempotencyIdTests() {}
namespace {
IdempotencyIdRef generate(Arena& arena) {
int length = deterministicRandom()->coinflip() ? deterministicRandom()->randomInt(16, 256) : 16;
StringRef id = makeString(length, arena);
deterministicRandom()->randomBytes(mutateString(id), length);
return IdempotencyIdRef(id);
}
} // namespace
TEST_CASE("/fdbclient/IdempotencyId/basic") {
Arena arena;
uint16_t firstBatchIndex = deterministicRandom()->randomUInt32();
uint16_t batchIndex = firstBatchIndex;
Version commitVersion = deterministicRandom()->randomInt64(0, std::numeric_limits<Version>::max());
std::vector<IdempotencyIdRef> idVector; // Reference
std::unordered_set<IdempotencyIdRef> idSet; // Make sure hash+equals works
IdempotencyIdKVBuilder builder; // Check kv data format
builder.setCommitVersion(commitVersion);
for (int i = 0; i < 5; ++i) {
auto id = generate(arena);
idVector.emplace_back(id);
idSet.emplace(id);
builder.add(id, batchIndex++);
}
batchIndex = firstBatchIndex;
Optional<KeyValue> kvOpt = builder.buildAndClear();
ASSERT(kvOpt.present());
const auto& kv = kvOpt.get();
ASSERT(idSet.size() == idVector.size());
for (const auto& id : idVector) {
auto commitResult = kvContainsIdempotencyId(kv, id);
ASSERT(commitResult.present());
ASSERT(commitResult.get().commitVersion == commitVersion);
ASSERT(commitResult.get().batchIndex == batchIndex++);
ASSERT(idSet.find(id) != idSet.end());
idSet.erase(id);
ASSERT(idSet.find(id) == idSet.end());
}
ASSERT(idSet.size() == 0);
ASSERT(!kvContainsIdempotencyId(kv, generate(arena)).present());
return Void();
}
TEST_CASE("/fdbclient/IdempotencyId/serialization") {
ASSERT(ObjectReader::fromStringRef<IdempotencyIdRef>(ObjectWriter::toValue(IdempotencyIdRef(), Unversioned()),
Unversioned()) == IdempotencyIdRef());
for (int i = 0; i < 1000; ++i) {
Arena arena;
auto id = generate(arena);
auto serialized = ObjectWriter::toValue(id, Unversioned());
IdempotencyIdRef t;
ObjectReader reader(serialized.begin(), Unversioned());
reader.deserialize(t);
ASSERT(t == id);
}
return Void();
}

@ -2356,6 +2356,21 @@ ACTOR Future<Void> forceRecovery(Reference<IClusterConnectionRecord> clusterFile
}
}
ACTOR Future<UID> auditStorage(Reference<IClusterConnectionRecord> clusterFile, KeyRange range, AuditType type) {
state Reference<AsyncVar<Optional<ClusterInterface>>> clusterInterface(new AsyncVar<Optional<ClusterInterface>>);
state Future<Void> leaderMon = monitorLeader<ClusterInterface>(clusterFile, clusterInterface);
loop {
while (!clusterInterface->get().present()) {
wait(clusterInterface->onChange());
}
UID auditId = wait(clusterInterface->get().get().triggerAudit.getReply(TriggerAuditRequest(type, range)));
TraceEvent(SevDebug, "ManagementAPIAuditStorageEnd").detail("AuditID", auditId);
return auditId;
}
}
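// Illustrative call site (a sketch; the AuditType value here is an assumption, not fixed by this diff):
//   UID auditId = wait(auditStorage(clusterFile, allKeys, AuditType::ValidateHA));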
ACTOR Future<Void> waitForPrimaryDC(Database cx, StringRef dcId) {
state ReadYourWritesTransaction tr(cx);

@ -2546,8 +2546,9 @@ void MultiVersionApi::setupNetwork() {
externalClients[filename] = {};
auto libCopies = copyExternalLibraryPerThread(path);
for (int idx = 0; idx < libCopies.size(); ++idx) {
bool unlinkOnLoad = libCopies[idx].second && CLIENT_KNOBS->DELETE_NATIVE_LIB_AFTER_LOADING;
externalClients[filename].push_back(Reference<ClientInfo>(
new ClientInfo(new DLApi(libCopies[idx].first, libCopies[idx].second /*unlink on load*/),
new ClientInfo(new DLApi(libCopies[idx].first, unlinkOnLoad /*unlink on load*/),
path,
useFutureVersion,
idx)));

@ -32,6 +32,7 @@
#include <vector>
#include "boost/algorithm/string.hpp"
#include "flow/CodeProbe.h"
#include "fmt/format.h"
@ -49,6 +50,7 @@
#include "fdbclient/ClusterConnectionFile.h" #include "fdbclient/ClusterConnectionFile.h"
#include "fdbclient/ClusterConnectionMemoryRecord.h" #include "fdbclient/ClusterConnectionMemoryRecord.h"
#include "fdbclient/CoordinationInterface.h" #include "fdbclient/CoordinationInterface.h"
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/DatabaseContext.h" #include "fdbclient/DatabaseContext.h"
#include "fdbclient/GlobalConfig.actor.h" #include "fdbclient/GlobalConfig.actor.h"
#include "fdbclient/IKnobCollection.h" #include "fdbclient/IKnobCollection.h"
@ -190,6 +192,8 @@ void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageSe
TSSEndpointData(tssi.id(), tssi.getMappedKeyValues.getEndpoint(), metrics));
queueModel.updateTssEndpoint(ssi.getKeyValuesStream.getEndpoint().token.first(),
TSSEndpointData(tssi.id(), tssi.getKeyValuesStream.getEndpoint(), metrics));
queueModel.updateTssEndpoint(ssi.changeFeedStream.getEndpoint().token.first(),
TSSEndpointData(tssi.id(), tssi.changeFeedStream.getEndpoint(), metrics));
// non-data requests duplicated for load
queueModel.updateTssEndpoint(ssi.watchValue.getEndpoint().token.first(),
@ -200,6 +204,12 @@ void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageSe
TSSEndpointData(tssi.id(), tssi.getReadHotRanges.getEndpoint(), metrics));
queueModel.updateTssEndpoint(ssi.getRangeSplitPoints.getEndpoint().token.first(),
TSSEndpointData(tssi.id(), tssi.getRangeSplitPoints.getEndpoint(), metrics));
queueModel.updateTssEndpoint(ssi.overlappingChangeFeeds.getEndpoint().token.first(),
TSSEndpointData(tssi.id(), tssi.overlappingChangeFeeds.getEndpoint(), metrics));
// duplicated to ensure feed data cleanup
queueModel.updateTssEndpoint(ssi.changeFeedPop.getEndpoint().token.first(),
TSSEndpointData(tssi.id(), tssi.changeFeedPop.getEndpoint(), metrics));
}
}
@ -6113,6 +6123,61 @@ ACTOR static Future<Void> commitDummyTransaction(Reference<TransactionState> trS
}
}
ACTOR static Future<Optional<CommitResult>> determineCommitStatus(Reference<TransactionState> trState,
Version minPossibleCommitVersion,
Version maxPossibleCommitVersion,
IdempotencyIdRef idempotencyId) {
state Transaction tr(trState->cx);
state int retries = 0;
state Span span("NAPI:determineCommitStatus"_loc, trState->spanContext);
tr.span.setParent(span.context);
loop {
try {
tr.trState->options = trState->options;
tr.trState->taskID = trState->taskID;
tr.trState->authToken = trState->authToken;
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
Version rv = wait(tr.getReadVersion());
TraceEvent("DetermineCommitStatusAttempt")
.detail("IdempotencyId", idempotencyId.asStringRefUnsafe())
.detail("Retries", retries)
.detail("ReadVersion", rv)
.detail("MinPossibleCommitVersion", minPossibleCommitVersion)
.detail("MaxPossibleCommitVersion", maxPossibleCommitVersion);
KeyRange possibleRange =
KeyRangeRef(BinaryWriter::toValue(bigEndian64(minPossibleCommitVersion), Unversioned())
.withPrefix(idempotencyIdKeys.begin),
BinaryWriter::toValue(bigEndian64(maxPossibleCommitVersion + 1), Unversioned())
.withPrefix(idempotencyIdKeys.begin));
RangeResult range = wait(tr.getRange(possibleRange, CLIENT_KNOBS->TOO_MANY));
ASSERT(!range.more);
for (const auto& kv : range) {
auto commitResult = kvContainsIdempotencyId(kv, idempotencyId);
if (commitResult.present()) {
TraceEvent("DetermineCommitStatus")
.detail("Committed", 1)
.detail("IdempotencyId", idempotencyId.asStringRefUnsafe())
.detail("Retries", retries);
return commitResult;
}
}
TraceEvent("DetermineCommitStatus")
.detail("Committed", 0)
.detail("IdempotencyId", idempotencyId.asStringRefUnsafe())
.detail("Retries", retries);
return Optional<CommitResult>();
} catch (Error& e) {
TraceEvent("DetermineCommitStatusError")
.errorUnsuppressed(e)
.detail("IdempotencyId", idempotencyId.asStringRefUnsafe())
.detail("Retries", retries);
wait(tr.onError(e));
}
++retries;
}
}
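// The range construction above, spelled out (for reference): commit versions are stored
// big-endian under idempotencyIdKeys, so the scan
//   [prefix + bigEndian64(minPossibleCommitVersion), prefix + bigEndian64(maxPossibleCommitVersion + 1))
// covers exactly the commit versions the in-doubt transaction could have landed in.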
void Transaction::cancelWatches(Error const& e) {
for (int i = 0; i < watches.size(); ++i)
if (!watches[i]->onChangeTrigger.isSet())
@ -6420,7 +6485,7 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
if (e.code() == error_code_request_maybe_delivered || e.code() == error_code_commit_unknown_result) {
// We don't know if the commit happened, and it might even still be in flight.
if (!trState->options.causalWriteRisky) {
if (!trState->options.causalWriteRisky || req.idempotencyId.valid()) {
// Make sure it's not still in flight, either by ensuring the master we submitted to is dead, or the
// version we submitted with is dead, or by committing a conflicting transaction successfully
// if ( cx->getCommitProxies()->masterGeneration <= originalMasterGeneration )
@ -6437,6 +6502,24 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
wait(
commitDummyTransaction(trState, singleKeyRange(selfConflictingRange.begin), tenantPrefixPrepended));
if (req.idempotencyId.valid()) {
Optional<CommitResult> commitResult = wait(determineCommitStatus(
trState,
req.transaction.read_snapshot,
req.transaction.read_snapshot + 5e6 /* Based on MAX_WRITE_TRANSACTION_LIFE_VERSIONS */,
req.idempotencyId));
if (commitResult.present()) {
Standalone<StringRef> ret = makeString(10);
placeVersionstamp(
mutateString(ret), commitResult.get().commitVersion, commitResult.get().batchIndex);
trState->versionstampPromise.send(ret);
CODE_PROBE(true, "AutomaticIdempotencyCommitted");
return Void();
} else {
CODE_PROBE(true, "AutomaticIdempotencyNotCommitted");
throw transaction_too_old();
}
}
}
// The user needs to be informed that we aren't sure whether the commit happened. Standard retry loops
@ -6519,6 +6602,18 @@ Future<Void> Transaction::commitMutations() {
tr.transaction.read_conflict_ranges.emplace_back(
tr.arena, extraConflictRanges[i].get().first, extraConflictRanges[i].get().second);
if (tr.idempotencyId.valid()) {
// We need to be able to confirm that this transaction is no longer
// in flight, and if the idempotency id is in the read and write
// conflict ranges we can use that.
BinaryWriter wr(Unversioned());
wr.serializeBytes("\xFF/SC/"_sr);
wr.serializeBytes(tr.idempotencyId.asStringRefUnsafe());
auto r = singleKeyRange(wr.toValue(), tr.arena);
tr.transaction.read_conflict_ranges.push_back(tr.arena, r);
tr.transaction.write_conflict_ranges.push_back(tr.arena, r);
}
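// Net effect (my reading of the code above): the transaction self-conflicts on the single key
// "\xFF/SC/<idempotencyId>", so a conflicting transaction committed against that key later can
// prove this commit is no longer in flight.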
if (!trState->options.causalWriteRisky &&
!intersects(tr.transaction.write_conflict_ranges, tr.transaction.read_conflict_ranges).present())
makeSelfConflicting();
@ -6829,6 +6924,23 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optional<Strin
else
trState->authToken.reset();
break;
case FDBTransactionOptions::IDEMPOTENCY_ID:
validateOptionValuePresent(value);
if (!(value.get().size() >= 16 && value.get().size() < 256)) {
Error e = invalid_option();
TraceEvent(SevWarn, "IdempotencyIdInvalidSize")
.error(e)
.detail("IdempotencyId", value.get().printable())
.detail("Recommendation", "Use an idempotency id that's at least 16 bytes and less than 256 bytes");
throw e;
}
tr.idempotencyId = IdempotencyIdRef(tr.arena, IdempotencyIdRef(value.get()));
break;
case FDBTransactionOptions::AUTOMATIC_IDEMPOTENCY:
validateOptionValueNotPresent(value);
tr.idempotencyId = IdempotencyIdRef(
tr.arena, IdempotencyIdRef(BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned())));
break;
default:
break;
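// Illustrative client usage of the two new options (a sketch; the literal id is arbitrary
// but must be at least 16 and fewer than 256 bytes):
//   tr.setOption(FDBTransactionOptions::IDEMPOTENCY_ID, "my-16-byte-id-123"_sr);
// or let the client generate one:
//   tr.setOption(FDBTransactionOptions::AUTOMATIC_IDEMPOTENCY);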
@ -9039,6 +9151,273 @@ void DatabaseContext::setDesiredChangeFeedVersion(Version v) {
}
}
// Because two storage servers can have different representations of a clear at the same version, depending on
// their shard maps at the time of the mutation, it is non-trivial to directly compare change feed streams.
// Instead we compare the presence of data at each version. This both saves on the CPU cost of validation and
// still catches the common failures, because historically most change feed corruption bugs manifest as the
// absence of entire versions, not as a subset of mutations missing within a version.
struct ChangeFeedTSSValidationData {
PromiseStream<Version> ssStreamSummary;
ReplyPromiseStream<ChangeFeedStreamReply> tssStream;
Future<Void> validatorFuture;
std::deque<std::pair<Version, Version>> rollbacks;
Version popVersion = invalidVersion;
bool done = false;
ChangeFeedTSSValidationData() {}
ChangeFeedTSSValidationData(ReplyPromiseStream<ChangeFeedStreamReply> tssStream) : tssStream(tssStream) {}
void updatePopped(Version newPopVersion) { popVersion = std::max(popVersion, newPopVersion); }
bool checkRollback(const MutationsAndVersionRef& m) {
if (m.mutations.size() == 1 && m.mutations.back().param1 == lastEpochEndPrivateKey) {
if (rollbacks.empty() || rollbacks.back().second < m.version) {
Version rollbackVersion;
BinaryReader br(m.mutations.back().param2, Unversioned());
br >> rollbackVersion;
if (!rollbacks.empty()) {
ASSERT(rollbacks.back().second <= rollbackVersion);
}
rollbacks.push_back({ rollbackVersion, m.version });
}
return true;
} else {
return false;
}
}
bool shouldAddMutation(const MutationsAndVersionRef& m) {
return !done && !m.mutations.empty() && !checkRollback(m);
}
bool isRolledBack(Version v) {
return !rollbacks.empty() && rollbacks.front().first < v && rollbacks.front().second > v;
}
void send(const ChangeFeedStreamReply& ssReply) {
if (done) {
return;
}
updatePopped(ssReply.popVersion);
for (auto& it : ssReply.mutations) {
if (shouldAddMutation(it)) {
ssStreamSummary.send(it.version);
}
}
}
void complete() {
done = true;
// destroy TSS stream to stop server actor
tssStream.reset();
}
};
void handleTSSChangeFeedMismatch(const ChangeFeedStreamRequest& request,
const TSSEndpointData& tssData,
int64_t matchesFound,
Version lastMatchingVersion,
Version ssVersion,
Version tssVersion,
Version popVersion) {
if (request.canReadPopped) {
// There is a known issue where this can return different data between an SS and TSS when a feed was popped but
// the SS restarted before the pop could be persisted, for reads that can read popped data. As such, only count
// this as a mismatch when !req.canReadPopped
return;
}
CODE_PROBE(true, "TSS mismatch in stream comparison");
if (tssData.metrics->shouldRecordDetailedMismatch()) {
TraceEvent mismatchEvent(
(g_network->isSimulated() && g_simulator->tssMode == ISimulator::TSSMode::EnabledDropMutations)
? SevWarnAlways
: SevError,
"TSSMismatchChangeFeedStream");
mismatchEvent.setMaxEventLength(FLOW_KNOBS->TSS_LARGE_TRACE_SIZE);
// request info
mismatchEvent.detail("TSSID", tssData.tssId);
mismatchEvent.detail("FeedID", request.rangeID);
mismatchEvent.detail("BeginVersion", request.begin);
mismatchEvent.detail("EndVersion", request.end);
mismatchEvent.detail("StartKey", request.range.begin);
mismatchEvent.detail("EndKey", request.range.end);
mismatchEvent.detail("CanReadPopped", request.canReadPopped);
mismatchEvent.detail("PopVersion", popVersion);
mismatchEvent.detail("DebugUID", request.debugUID);
// mismatch info
mismatchEvent.detail("MatchesFound", matchesFound);
mismatchEvent.detail("LastMatchingVersion", lastMatchingVersion);
mismatchEvent.detail("SSVersion", ssVersion);
mismatchEvent.detail("TSSVersion", tssVersion);
CODE_PROBE(FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL,
"Tracing Full TSS Feed Mismatch in stream comparison");
CODE_PROBE(!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL,
"Tracing Partial TSS Feed Mismatch in stream comparison and storing the rest in FDB");
if (!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL) {
mismatchEvent.disable();
UID mismatchUID = deterministicRandom()->randomUniqueID();
tssData.metrics->recordDetailedMismatchData(mismatchUID, mismatchEvent.getFields().toString());
// record a summarized trace event instead
TraceEvent summaryEvent(
(g_network->isSimulated() && g_simulator->tssMode == ISimulator::TSSMode::EnabledDropMutations)
? SevWarnAlways
: SevError,
"TSSMismatchChangeFeedStream");
summaryEvent.detail("TSSID", tssData.tssId)
.detail("MismatchId", mismatchUID)
.detail("FeedDebugUID", request.debugUID);
}
}
}
ACTOR Future<Void> changeFeedTSSValidator(ChangeFeedStreamRequest req,
Optional<ChangeFeedTSSValidationData>* data,
TSSEndpointData tssData) {
state bool ssDone = false;
state bool tssDone = false;
state std::deque<Version> ssSummary;
state std::deque<Version> tssSummary;
ASSERT(data->present());
state int64_t matchesFound = 0;
state Version lastMatchingVersion = req.begin - 1;
loop {
// If SS stream gets error, whole stream data gets reset, so it's ok to cancel this actor
if (!ssDone && ssSummary.empty()) {
try {
Version next = waitNext(data->get().ssStreamSummary.getFuture());
ssSummary.push_back(next);
} catch (Error& e) {
if (e.code() != error_code_end_of_stream) {
data->get().complete();
if (e.code() != error_code_operation_cancelled) {
tssData.metrics->ssError(e.code());
}
throw e;
}
ssDone = true;
if (tssDone) {
data->get().complete();
return Void();
}
}
}
if (!tssDone && tssSummary.empty()) {
try {
choose {
when(ChangeFeedStreamReply nextTss = waitNext(data->get().tssStream.getFuture())) {
data->get().updatePopped(nextTss.popVersion);
for (auto& it : nextTss.mutations) {
if (data->get().shouldAddMutation(it)) {
tssSummary.push_back(it.version);
}
}
}
// if ss has result, tss needs to return it
when(wait((ssDone || !ssSummary.empty()) ? delay(2.0 * FLOW_KNOBS->LOAD_BALANCE_TSS_TIMEOUT)
: Never())) {
++tssData.metrics->tssTimeouts;
data->get().complete();
return Void();
}
}
} catch (Error& e) {
if (e.code() == error_code_operation_cancelled) {
throw e;
}
if (e.code() == error_code_end_of_stream) {
tssDone = true;
if (ssDone) {
data->get().complete();
return Void();
}
} else {
tssData.metrics->tssError(e.code());
data->get().complete();
return Void();
}
}
}
// handle rollbacks and concurrent pops
while (!ssSummary.empty() &&
(ssSummary.front() < data->get().popVersion || data->get().isRolledBack(ssSummary.front()))) {
ssSummary.pop_front();
}
while (!tssSummary.empty() &&
(tssSummary.front() < data->get().popVersion || data->get().isRolledBack(tssSummary.front()))) {
tssSummary.pop_front();
}
while (!ssSummary.empty() && !tssSummary.empty()) {
CODE_PROBE(true, "Comparing TSS change feed data");
if (ssSummary.front() != tssSummary.front()) {
CODE_PROBE(true, "TSS change feed mismatch");
handleTSSChangeFeedMismatch(req,
tssData,
matchesFound,
lastMatchingVersion,
ssSummary.front(),
tssSummary.front(),
data->get().popVersion);
data->get().complete();
return Void();
}
matchesFound++;
lastMatchingVersion = ssSummary.front();
ssSummary.pop_front();
tssSummary.pop_front();
while (!data->get().rollbacks.empty() && data->get().rollbacks.front().second <= lastMatchingVersion) {
data->get().rollbacks.pop_front();
}
}
ASSERT(!ssDone || !tssDone); // both shouldn't be done, otherwise we shouldn't have looped
if ((ssDone && !tssSummary.empty()) || (tssDone && !ssSummary.empty())) {
CODE_PROBE(true, "TSS change feed mismatch at end of stream");
handleTSSChangeFeedMismatch(req,
tssData,
matchesFound,
lastMatchingVersion,
ssDone ? -1 : ssSummary.front(),
tssDone ? -1 : tssSummary.front(),
data->get().popVersion);
data->get().complete();
return Void();
}
}
}
void maybeDuplicateTSSChangeFeedStream(ChangeFeedStreamRequest& req,
const RequestStream<ChangeFeedStreamRequest>& stream,
QueueModel* model,
Optional<ChangeFeedTSSValidationData>* tssData) {
if (model) {
Optional<TSSEndpointData> tssPair = model->getTssData(stream.getEndpoint().token.first());
if (tssPair.present()) {
CODE_PROBE(true, "duplicating feed stream to TSS");
resetReply(req);
RequestStream<ChangeFeedStreamRequest> tssRequestStream(tssPair.get().endpoint);
*tssData = Optional<ChangeFeedTSSValidationData>(
ChangeFeedTSSValidationData(tssRequestStream.getReplyStream(req)));
// tie validator actor to the lifetime of the stream being active
tssData->get().validatorFuture = changeFeedTSSValidator(req, tssData, tssPair.get());
}
}
}
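// A distilled, self-contained sketch (illustrative only, not part of this change) of the
// comparison step inside changeFeedTSSValidator: drop versions below the pop version on
// both sides, then require the fronts to match pairwise. Plain deques stand in for the
// actor state, and rollback handling is omitted for brevity.
#include <deque>
static bool summariesMatchSketch(std::deque<Version>& ss, std::deque<Version>& tss, Version popVersion) {
	while (!ss.empty() && ss.front() < popVersion) {
		ss.pop_front();
	}
	while (!tss.empty() && tss.front() < popVersion) {
		tss.pop_front();
	}
	while (!ss.empty() && !tss.empty()) {
		if (ss.front() != tss.front()) {
			return false; // the real actor reports this via handleTSSChangeFeedMismatch
		}
		ss.pop_front();
		tss.pop_front();
	}
	return true;
}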
ChangeFeedStorageData::~ChangeFeedStorageData() {
	if (context) {
		context->changeFeedUpdaters.erase(interfToken);
@@ -9160,7 +9539,8 @@ ACTOR Future<Void> partialChangeFeedStream(StorageServerInterface interf,
                                           Version end,
                                           Reference<ChangeFeedData> feedData,
                                           Reference<ChangeFeedStorageData> storageData,
                                           UID debugUID,
                                           Optional<ChangeFeedTSSValidationData>* tssData) {
	// calling lastReturnedVersion's callbacks could cause us to be cancelled
	state Promise<Void> refresh = feedData->refresh;
@@ -9204,6 +9584,9 @@ ACTOR Future<Void> partialChangeFeedStream(StorageServerInterface interf,
				if (rep.popVersion > feedData->popVersion) {
					feedData->popVersion = rep.popVersion;
				}
				if (tssData->present()) {
					tssData->get().updatePopped(rep.popVersion);
				}
				if (lastEmpty != invalidVersion && !results.isEmpty()) {
					for (auto& it : feedData->storageData) {
@@ -9218,6 +9601,10 @@ ACTOR Future<Void> partialChangeFeedStream(StorageServerInterface interf,
			while (resultLoc < rep.mutations.size()) {
				wait(results.onEmpty());
				if (rep.mutations[resultLoc].version >= nextVersion) {
					if (tssData->present() && tssData->get().shouldAddMutation(rep.mutations[resultLoc])) {
						tssData->get().ssStreamSummary.send(rep.mutations[resultLoc].version);
					}
					results.send(rep.mutations[resultLoc]);
					if (DEBUG_CF_CLIENT_TRACE) {
@@ -9414,6 +9801,11 @@ ACTOR Future<Void> mergeChangeFeedStream(Reference<DatabaseContext> db,
	state std::vector<Future<Void>> fetchers(interfs.size());
	state std::vector<Future<Void>> onErrors(interfs.size());
	state std::vector<MutationAndVersionStream> streams(interfs.size());
	state std::vector<Optional<ChangeFeedTSSValidationData>> tssDatas;
	tssDatas.reserve(interfs.size());
	for (int i = 0; i < interfs.size(); i++) {
		tssDatas.push_back({});
	}
	CODE_PROBE(interfs.size() > 10, "Large change feed merge cursor");
	CODE_PROBE(interfs.size() > 100, "Very large change feed merge cursor");
@@ -9421,12 +9813,12 @@ ACTOR Future<Void> mergeChangeFeedStream(Reference<DatabaseContext> db,
	state UID mergeCursorUID = UID();
	state std::vector<UID> debugUIDs;
	results->streams.clear();
	for (int i = 0; i < interfs.size(); i++) {
		ChangeFeedStreamRequest req;
		req.rangeID = rangeID;
		req.begin = *begin;
		req.end = end;
		req.range = interfs[i].second;
		req.canReadPopped = canReadPopped;
		// divide total buffer size among sub-streams, but keep individual streams large enough to be efficient
		req.replyBufferSize = replyBufferSize / interfs.size();
@@ -9438,7 +9830,11 @@ ACTOR Future<Void> mergeChangeFeedStream(Reference<DatabaseContext> db,
		mergeCursorUID =
		    UID(mergeCursorUID.first() ^ req.debugUID.first(), mergeCursorUID.second() ^ req.debugUID.second());
		results->streams.push_back(interfs[i].first.changeFeedStream.getReplyStream(req));
		maybeDuplicateTSSChangeFeedStream(req,
		                                  interfs[i].first.changeFeedStream,
		                                  db->enableLocalityLoadBalance ? &db->queueModel : nullptr,
		                                  &tssDatas[i]);
	}
	results->maxSeenVersion = invalidVersion;
@@ -9475,7 +9871,8 @@ ACTOR Future<Void> mergeChangeFeedStream(Reference<DatabaseContext> db,
		                                 end,
		                                 results,
		                                 results->storageData[i],
		                                 debugUIDs[i],
		                                 &tssDatas[i]);
	}
	wait(waitForAny(onErrors) || mergeChangeFeedStreamInternal(results, interfs, streams, begin, end, mergeCursorUID));
@@ -9529,7 +9926,8 @@ ACTOR Future<Void> singleChangeFeedStreamInternal(KeyRange range,
                                                  Reference<ChangeFeedData> results,
                                                  Key rangeID,
                                                  Version* begin,
                                                  Version end,
                                                  Optional<ChangeFeedTSSValidationData>* tssData) {
	state Promise<Void> refresh = results->refresh;
	ASSERT(results->streams.size() == 1);
@@ -9564,6 +9962,9 @@ ACTOR Future<Void> singleChangeFeedStreamInternal(KeyRange range,
		if (feedReply.popVersion > results->popVersion) {
			results->popVersion = feedReply.popVersion;
		}
		if (tssData->present()) {
			tssData->get().updatePopped(feedReply.popVersion);
		}
		// don't send completely empty set of mutations to promise stream
		bool anyMutations = false;
@@ -9578,6 +9979,10 @@ ACTOR Future<Void> singleChangeFeedStreamInternal(KeyRange range,
			// stream. Anything with mutations should be strictly greater than lastReturnedVersion
			ASSERT(feedReply.mutations.front().version > results->lastReturnedVersion.get());
			if (tssData->present()) {
				tssData->get().send(feedReply);
			}
			results->mutations.send(
			    Standalone<VectorRef<MutationsAndVersionRef>>(feedReply.mutations, feedReply.arena));
@@ -9629,6 +10034,7 @@ ACTOR Future<Void> singleChangeFeedStream(Reference<DatabaseContext> db,
                                          bool canReadPopped) {
	state Database cx(db);
	state ChangeFeedStreamRequest req;
	state Optional<ChangeFeedTSSValidationData> tssData;
	req.rangeID = rangeID;
	req.begin = *begin;
	req.end = end;
@@ -9662,7 +10068,11 @@ ACTOR Future<Void> singleChangeFeedStream(Reference<DatabaseContext> db,
	}
	refresh.send(Void());
	maybeDuplicateTSSChangeFeedStream(
	    req, interf.changeFeedStream, cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr, &tssData);
	wait(results->streams[0].onError() ||
	     singleChangeFeedStreamInternal(range, results, rangeID, begin, end, &tssData));
	return Void();
}
@@ -10008,6 +10418,8 @@ ACTOR Future<Void> popChangeFeedMutationsActor(Reference<DatabaseContext> db, Ke
		return Void();
	}
	auto model = cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr;
	bool foundFailed = false;
	for (int i = 0; i < locations.size() && !foundFailed; i++) {
		for (int j = 0; j < locations[i].locations->size() && !foundFailed; j++) {
@@ -10016,6 +10428,15 @@ ACTOR Future<Void> popChangeFeedMutationsActor(Reference<DatabaseContext> db, Ke
			        .isFailed()) {
				foundFailed = true;
			}
			// for now, if any popping SS has a TSS pair, just always use the backup method
			if (model && model
			                 ->getTssData(locations[i]
			                                  .locations->get(j, &StorageServerInterface::changeFeedPop)
			                                  .getEndpoint()
			                                  .token.first())
			                 .present()) {
				foundFailed = true;
			}
		}
	}

View File

@@ -570,6 +570,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
	init( RATEKEEPER_FAILURE_TIME, 1.0 );
	init( CONSISTENCYSCAN_FAILURE_TIME, 1.0 );
	init( BLOB_MANAGER_FAILURE_TIME, 1.0 );
	init( BLOB_MIGRATOR_FAILURE_TIME, 1.0 );
	init( REPLACE_INTERFACE_DELAY, 60.0 );
	init( REPLACE_INTERFACE_CHECK_DELAY, 5.0 );
	init( COORDINATOR_REGISTER_INTERVAL, 5.0 );
@@ -752,6 +753,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
	init( FETCH_KEYS_PARALLELISM_FULL, 6 );
	init( FETCH_KEYS_LOWER_PRIORITY, 0 );
	init( SERVE_FETCH_CHECKPOINT_PARALLELISM, 4 );
	init( SERVE_AUDIT_STORAGE_PARALLELISM, 2 );
	init( CHANGE_FEED_DISK_READS_PARALLELISM, 1000 ); if( randomize && BUGGIFY ) CHANGE_FEED_DISK_READS_PARALLELISM = 20;
	init( BUGGIFY_BLOCK_BYTES, 10000 );
	init( STORAGE_RECOVERY_VERSION_LAG_LIMIT, 2 * MAX_READ_TRANSACTION_LIFE_VERSIONS );
@@ -987,8 +989,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
	// Blob Metadata
	init( BLOB_METADATA_CACHE_TTL, isSimulated ? 120 : 24 * 60 * 60 );
	if ( randomize && BUGGIFY) { BLOB_METADATA_CACHE_TTL = deterministicRandom()->randomInt(50, 100); }
	init( BLOB_METADATA_REFRESH_INTERVAL, isSimulated ? 60 : 60 * 60 );
	if ( randomize && BUGGIFY) { BLOB_METADATA_REFRESH_INTERVAL = deterministicRandom()->randomInt(5, 120); }
	// HTTP KMS Connector
	init( REST_KMS_CONNECTOR_KMS_DISCOVERY_URL_MODE, "file");
@@ -1003,6 +1005,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
	// NOTE: Care must be taken when attempting to update below configurations for an up/running FDB cluster.
	init( REST_KMS_CONNECTOR_DISCOVER_KMS_URL_FILE, "");
	init( REST_KMS_CONNECTOR_GET_ENCRYPTION_KEYS_ENDPOINT, "");
	init( REST_KMS_CONNECTOR_GET_BLOB_METADATA_ENDPOINT, "");
	// Details to fetch validation token from a localhost file
	// acceptable format: "<token_name1>#<absolute_file_path1>,<token_name2>#<absolute_file_path2>,.."
	// NOTE: 'token-name' can NOT contain '#' character

View File

@@ -342,7 +342,7 @@ void TSS_traceMismatch(TraceEvent& event,
// change feed
template <>
bool TSS_doCompare(const OverlappingChangeFeedsReply& src, const OverlappingChangeFeedsReply& tss) {
	ASSERT(false); // We duplicate for load, no need to validate replies
	return true;
}

View File

@@ -286,6 +286,41 @@ const KeyRangeRef writeConflictRangeKeysRange = KeyRangeRef("\xff\xff/transactio
const KeyRef clusterIdKey = "\xff/clusterId"_sr;
const KeyRangeRef auditRange = KeyRangeRef("\xff/audit/"_sr, "\xff/audit0"_sr);
const KeyRef auditPrefix = auditRange.begin;
const Key auditRangeKey(const AuditType type, const UID& auditId, const KeyRef& key) {
BinaryWriter wr(Unversioned());
wr.serializeBytes(auditPrefix);
wr << static_cast<uint8_t>(type);
wr.serializeBytes("/"_sr);
wr << auditId;
wr.serializeBytes("/"_sr);
wr.serializeBytes(key);
return wr.toValue();
}
const Key auditRangePrefix(const AuditType type, const UID& auditId) {
BinaryWriter wr(Unversioned());
wr.serializeBytes(auditPrefix);
wr << static_cast<uint8_t>(type);
wr.serializeBytes("/"_sr);
wr << auditId;
wr.serializeBytes("/"_sr);
return wr.toValue();
}
const Value auditStorageStateValue(const AuditStorageState& auditStorageState) {
return ObjectWriter::toValue(auditStorageState, IncludeVersion());
}
AuditStorageState decodeAuditStorageState(const ValueRef& value) {
AuditStorageState auditState;
ObjectReader reader(value.begin(), IncludeVersion());
reader.deserialize(auditState);
return auditState;
}
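// A minimal usage sketch of the helpers above (illustrative only): auditRangeKey is
// auditRangePrefix followed by the caller's key, so a single prefix scan visits all
// per-key audit state for one audit.
static void auditKeyLayoutExample() {
	UID auditId = deterministicRandom()->randomUniqueID();
	Key prefix = auditRangePrefix(AuditType::ValidateHA, auditId);
	Key full = auditRangeKey(AuditType::ValidateHA, auditId, "some_key"_sr);
	ASSERT(full.startsWith(prefix)); // prefix + key, by construction
}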
const KeyRef checkpointPrefix = "\xff/checkpoint/"_sr;
const Key checkpointKeyFor(UID checkpointID) {
@@ -1629,6 +1664,9 @@ Key storageQuotaKey(StringRef tenantName) {
	return tenantName.withPrefix(storageQuotaPrefix);
}
const KeyRangeRef idempotencyIdKeys("\xff\x02/idmp/"_sr, "\xff\x02/idmp0"_sr);
const KeyRef idempotencyIdsExpiredVersion("\xff\x02/idmpExpiredVersion"_sr);
// for tests
void testSSISerdes(StorageServerInterface const& ssi) {
	printf("ssi=\nid=%s\nlocality=%s\nisTss=%s\ntssId=%s\nacceptingRequests=%s\naddress=%s\ngetValue=%s\n\n\n",

View File

@@ -99,6 +99,48 @@ Tuple Tuple::unpack(StringRef const& str, bool exclude_incomplete) {
	return Tuple(str, exclude_incomplete);
}
std::string Tuple::tupleToString(const Tuple& tuple) {
std::string str;
if (tuple.size() > 1) {
str += "(";
}
for (int i = 0; i < tuple.size(); ++i) {
Tuple::ElementType type = tuple.getType(i);
if (type == Tuple::NULL_TYPE) {
str += "NULL";
} else if (type == Tuple::BYTES || type == Tuple::UTF8) {
if (type == Tuple::UTF8) {
str += "u";
}
str += "\'" + tuple.getString(i).printable() + "\'";
} else if (type == Tuple::INT) {
str += format("%ld", tuple.getInt(i));
} else if (type == Tuple::FLOAT) {
str += format("%f", tuple.getFloat(i));
} else if (type == Tuple::DOUBLE) {
str += format("%f", tuple.getDouble(i));
} else if (type == Tuple::BOOL) {
str += tuple.getBool(i) ? "true" : "false";
} else if (type == Tuple::VERSIONSTAMP) {
TupleVersionstamp versionstamp = tuple.getVersionstamp(i);
str += format("Transaction Version: '%ld', BatchNumber: '%hd', UserVersion : '%hd'",
versionstamp.getVersion(),
versionstamp.getBatchNumber(),
versionstamp.getUserVersion());
} else {
ASSERT(false);
}
if (i < tuple.size() - 1) {
str += ", ";
}
}
if (tuple.size() > 1) {
str += ")";
}
return str;
}
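// A hedged usage sketch of tupleToString (illustrative only; assumes the existing Tuple
// appenders for integers and strings): multi-element tuples are wrapped in parentheses
// and each element is rendered by the type dispatch above.
static std::string tupleToStringExample() {
	Tuple t;
	t.append((int64_t)42);
	t.append("hello"_sr);
	return Tuple::tupleToString(t); // renders as (42, 'hello')
}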
Tuple Tuple::unpackUserType(StringRef const& str, bool exclude_incomplete) {
	return Tuple(str, exclude_incomplete, true);
}

View File

@@ -0,0 +1,111 @@
/*
* Audit.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBCLIENT_AUDIT_H
#define FDBCLIENT_AUDIT_H
#pragma once
#include "fdbclient/FDBTypes.h"
#include "fdbrpc/fdbrpc.h"
enum class AuditPhase : uint8_t {
Invalid = 0,
Running = 1,
Complete = 2,
Error = 3,
Failed = 4,
};
enum class AuditType : uint8_t {
Invalid = 0,
ValidateHA = 1,
};
struct AuditStorageState {
constexpr static FileIdentifier file_identifier = 13804340;
AuditStorageState() = default;
AuditStorageState(UID id, AuditType type) : id(id), type(static_cast<uint8_t>(type)) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, id, type, phase, error);
}
void setType(AuditType type) { this->type = static_cast<uint8_t>(type); }
AuditType getType() const { return static_cast<AuditType>(this->type); }
void setPhase(AuditPhase phase) { this->phase = static_cast<uint8_t>(phase); }
AuditPhase getPhase() const { return static_cast<AuditPhase>(this->phase); }
UID id;
uint8_t type;
uint8_t phase;
std::string error;
};
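// A minimal sketch (illustrative only) of the accessor pattern above: the raw uint8_t
// fields keep the serialized form stable while callers round-trip through the typed enums.
inline AuditStorageState exampleAuditState(UID id) {
	AuditStorageState s(id, AuditType::ValidateHA);
	s.setPhase(AuditPhase::Running);
	ASSERT(s.getType() == AuditType::ValidateHA && s.getPhase() == AuditPhase::Running);
	return s;
}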
struct AuditStorageRequest {
constexpr static FileIdentifier file_identifier = 13804341;
AuditStorageRequest() = default;
AuditStorageRequest(UID id, KeyRange range, AuditType type)
: id(id), range(range), type(static_cast<uint8_t>(type)) {}
void setType(AuditType type) { this->type = static_cast<uint8_t>(type); }
AuditType getType() const { return static_cast<AuditType>(this->type); }
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, id, range, type, targetServers, reply);
}
UID id;
KeyRange range;
uint8_t type;
std::vector<UID> targetServers;
ReplyPromise<AuditStorageState> reply;
};
// Triggers an audit of the specific type; an audit id is returned if the audit is scheduled successfully.
// If there is already a running audit, its id is returned, unless force is true.
// When force is set, the ongoing audit is cancelled and a new audit is scheduled.
struct TriggerAuditRequest {
constexpr static FileIdentifier file_identifier = 1384445;
TriggerAuditRequest() = default;
TriggerAuditRequest(AuditType type, KeyRange range)
: type(static_cast<uint8_t>(type)), range(range), force(false), async(false) {}
void setType(AuditType type) { this->type = static_cast<uint8_t>(type); }
AuditType getType() const { return static_cast<AuditType>(this->type); }
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, type, range, force, async, reply);
}
uint8_t type;
KeyRange range;
bool force;
bool async;
ReplyPromise<UID> reply;
};
#endif

View File

@@ -0,0 +1,34 @@
/*
* AuditUtils.actor.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBCLIENT_AUDITUTILS_ACTOR_H
#define FDBCLIENT_AUDITUTILS_ACTOR_H
#pragma once
#include "fdbclient/Audit.h"
#include "fdbclient/FDBTypes.h"
#include "fdbrpc/fdbrpc.h"
#include "flow/actorcompiler.h" // has to be last include
ACTOR Future<Void> persistAuditStorageState(Key key, AuditStorageState auditState);
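// A hedged usage sketch of the declaration above (illustrative only; assumes the audit
// key helpers declared in fdbclient/SystemData.h are also available): persist a Running
// state under the audit's per-key slot for the start of a range.
ACTOR Future<Void> persistRunningAuditExample(UID auditId, KeyRange range) {
	AuditStorageState st(auditId, AuditType::ValidateHA);
	st.setPhase(AuditPhase::Running);
	wait(persistAuditStorageState(auditRangeKey(st.getType(), st.id, range.begin), st));
	return Void();
}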
#include "flow/unactorcompiler.h"
#endif

View File

@@ -106,6 +106,8 @@ public:
	std::array<CounterSet, int(UsageType::MAX)> counterSets;
};
std::string toString(BlobCipherMetrics::UsageType type);
// Encryption operations buffer management
// Approach limits number of copies needed during encryption or decryption operations.
// For encryption EncryptBuf is allocated using client supplied Arena and provided to AES library to capture
@@ -185,7 +187,7 @@ struct hash<BlobCipherDetails> {
#pragma pack(push, 1) // exact fit - no padding
typedef struct BlobCipherEncryptHeader {
	static constexpr int headerSize = 104;
	union {
		struct {
			uint8_t size; // reading first byte is sufficient to determine header
@@ -210,29 +212,22 @@ typedef struct BlobCipherEncryptHeader {
	// reads. FIPS compliance recommendation is to leverage cryptographic digest mechanism to generate 'authentication
	// token' (crypto-secure) to protect against malicious tampering and/or bit rot/flip scenarios.
	// Encryption header supports two modes of generating 'authentication tokens':
	// 1) SingleAuthTokenMode: the scheme generates a single crypto-secure auth token to protect the {cipherText +
	// header} payload. The scheme is geared towards optimizing cost due to crypto-secure auth-token generation;
	// however, on decryption the client needs to read 'header' + 'encrypted-buffer' to validate the 'auth-token'.
	// The scheme is ideal for use cases where the payload represented by the encryptionHeader is not large and it is
	// desirable to minimize CPU/latency penalty due to crypto-secure ops, such as: CommitProxies encrypted inline
	// transactions, StorageServer encrypting pages etc.
	// SOMEDAY: Another potential scheme could be 'MultiAuthTokenMode': the scheme generates separate authTokens
	// for 'encrypted buffer' & 'encryption-header'. The scheme is ideal where the payload represented by the
	// encryptionHeader is large enough such that it is desirable to optimize the cost of upfront reading the full
	// 'encrypted buffer', compared to reading only the encryptionHeader and ensuring its sanity; for instance:
	// backup-files.
	struct {
		uint8_t authToken[AUTH_TOKEN_MAX_SIZE]{};
	} singleAuthToken;
	BlobCipherEncryptHeader() {}
@@ -628,10 +623,6 @@ private:
	                               const int ciphertextLen,
	                               const BlobCipherEncryptHeader& header,
	                               Arena& arena);
};
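// A hedged sketch of what SingleAuthTokenMode implies for readers (names and entry point
// below are illustrative, not the library's actual API): the token covers the whole
// {ciphertext + header} payload, so both must be read before validation can complete.
#include <cstring>
static bool checkSingleAuthTokenSketch(const uint8_t* recomputedToken, const BlobCipherEncryptHeader& header) {
	// Illustration only: real validation recomputes an HMAC-SHA256 digest over the full
	// payload and should compare in constant time.
	return std::memcmp(recomputedToken, header.singleAuthToken.authToken, AUTH_TOKEN_MAX_SIZE) == 0;
}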
class HmacSha256DigestGen final : NonCopyable {

View File

@@ -33,13 +33,15 @@ struct BlobConnectionProvider : NonCopyable, ReferenceCounted<BlobConnectionProv
	// something returned from createForWrite
	virtual Reference<BackupContainerFileSystem> getForRead(std::string filePath) = 0;
	virtual bool isExpired() const = 0;
	virtual bool needsRefresh() const = 0;
	virtual void update(Standalone<BlobMetadataDetailsRef> newBlobMetadata) = 0;
	virtual ~BlobConnectionProvider() {}
	static Reference<BlobConnectionProvider> newBlobConnectionProvider(std::string blobUrl);
	static Reference<BlobConnectionProvider> newBlobConnectionProvider(Standalone<BlobMetadataDetailsRef> blobMetadata);
};
#endif

View File

@@ -44,18 +44,41 @@ struct BlobMetadataDetailsRef {
	Optional<StringRef> base;
	VectorRef<StringRef> partitions;
	// cache options
	double refreshAt;
	double expireAt;
	BlobMetadataDetailsRef() {}
	BlobMetadataDetailsRef(Arena& arena, const BlobMetadataDetailsRef& from)
	  : domainId(from.domainId), domainName(arena, from.domainName), partitions(arena, from.partitions),
	    refreshAt(from.refreshAt), expireAt(from.expireAt) {
		if (from.base.present()) {
			base = StringRef(arena, from.base.get());
		}
	}
	explicit BlobMetadataDetailsRef(Arena& ar,
	                                BlobMetadataDomainId domainId,
	                                BlobMetadataDomainNameRef domainName,
	                                Optional<StringRef> base,
	                                VectorRef<StringRef> partitions,
	                                int64_t refreshAt,
	                                int64_t expireAt)
	  : domainId(domainId), domainName(ar, domainName), partitions(ar, partitions), refreshAt(refreshAt),
	    expireAt(expireAt) {
		if (base.present()) {
			this->base = StringRef(ar, base.get());
		}
	}
	explicit BlobMetadataDetailsRef(BlobMetadataDomainId domainId,
	                                BlobMetadataDomainNameRef domainName,
	                                Optional<StringRef> base,
	                                VectorRef<StringRef> partitions,
	                                double refreshAt,
	                                double expireAt)
	  : domainId(domainId), domainName(domainName), base(base), partitions(partitions), refreshAt(refreshAt),
	    expireAt(expireAt) {}
	int expectedSize() const {
		return sizeof(BlobMetadataDetailsRef) + domainName.size() + (base.present() ? base.get().size() : 0) +
@@ -64,7 +87,7 @@ struct BlobMetadataDetailsRef {
	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, domainId, domainName, base, partitions, refreshAt, expireAt);
	}
};
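// A hedged sketch (illustrative only) of how a cache can consume the two timestamps
// above, mirroring the needsRefresh/isExpired virtuals added to BlobConnectionProvider:
// refresh proactively once refreshAt passes, and stop serving the entry at expireAt.
inline bool blobMetadataNeedsRefresh(const BlobMetadataDetailsRef& d, double currentTime) {
	return currentTime >= d.refreshAt;
}
inline bool blobMetadataIsExpired(const BlobMetadataDetailsRef& d, double currentTime) {
	return currentTime >= d.expireAt;
}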

View File

@@ -0,0 +1,58 @@
/*
* BuildIdempotencyIdMutations.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBCLIENT_BUILD_IDEMPOTENCY_ID_MUTATIONS_H
#define FDBCLIENT_BUILD_IDEMPOTENCY_ID_MUTATIONS_H
#include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/IdempotencyId.h"
#pragma once
// Iterate through trs looking for idempotency ids for committed transactions. Call onKvReady for each constructed key
// value pair.
template <class OnKVReady>
void buildIdempotencyIdMutations(const std::vector<CommitTransactionRequest>& trs,
IdempotencyIdKVBuilder& idempotencyKVBuilder,
Version commitVersion,
const std::vector<uint8_t>& committed,
uint8_t committedValue,
bool locked,
const OnKVReady& onKvReady) {
idempotencyKVBuilder.setCommitVersion(commitVersion);
for (int h = 0; h < trs.size(); h += 256) {
int end = std::min<int>(trs.size() - h, 256);
for (int l = 0; l < end; ++l) {
uint16_t batchIndex = h + l;
if ((committed[batchIndex] == committedValue && (!locked || trs[batchIndex].isLockAware()))) {
const auto& idempotency_id = trs[batchIndex].idempotencyId;
if (idempotency_id.valid()) {
idempotencyKVBuilder.add(idempotency_id, batchIndex);
}
}
}
Optional<KeyValue> kv = idempotencyKVBuilder.buildAndClear();
if (kv.present()) {
onKvReady(kv.get());
}
}
}
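// A hedged usage sketch of the template above (illustrative only; the output vector and
// surrounding state are hypothetical): collect each completed idempotency KV pair.
inline void collectIdempotencyIdMutationsExample(const std::vector<CommitTransactionRequest>& trs,
                                                 IdempotencyIdKVBuilder& builder,
                                                 Version commitVersion,
                                                 const std::vector<uint8_t>& committed,
                                                 uint8_t committedValue,
                                                 bool locked,
                                                 std::vector<KeyValue>& out) {
	buildIdempotencyIdMutations(trs, builder, commitVersion, committed, committedValue, locked,
	                            [&out](const KeyValue& kv) { out.push_back(kv); });
}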
#endif

View File

@@ -199,6 +199,7 @@ public:
	int32_t DEFAULT_MAX_GRV_PROXIES;
	int32_t DEFAULT_AUTO_RESOLVERS;
	int32_t DEFAULT_AUTO_LOGS;
	bool DELETE_NATIVE_LIB_AFTER_LOADING;
	double GLOBAL_CONFIG_REFRESH_BACKOFF;
	double GLOBAL_CONFIG_REFRESH_MAX_BACKOFF;
View File

@@ -40,6 +40,7 @@ struct ClusterInterface {
	RequestStream<struct MoveShardRequest> moveShard;
	RequestStream<struct RepairSystemDataRequest> repairSystemData;
	RequestStream<struct SplitShardRequest> splitShard;
	RequestStream<struct TriggerAuditRequest> triggerAudit;
	bool operator==(ClusterInterface const& r) const { return id() == r.id(); }
	bool operator!=(ClusterInterface const& r) const { return id() != r.id(); }
@@ -51,7 +52,7 @@ struct ClusterInterface {
		       databaseStatus.getFuture().isReady() || ping.getFuture().isReady() ||
		       getClientWorkers.getFuture().isReady() || forceRecovery.getFuture().isReady() ||
		       moveShard.getFuture().isReady() || repairSystemData.getFuture().isReady() ||
		       splitShard.getFuture().isReady() || triggerAudit.getFuture().isReady();
	}
	void initEndpoints() {
@@ -64,6 +65,7 @@ struct ClusterInterface {
		moveShard.getEndpoint(TaskPriority::ClusterController);
		repairSystemData.getEndpoint(TaskPriority::ClusterController);
		splitShard.getEndpoint(TaskPriority::ClusterController);
		triggerAudit.getEndpoint(TaskPriority::ClusterController);
	}
	template <class Ar>
@@ -77,7 +79,8 @@ struct ClusterInterface {
		           forceRecovery,
		           moveShard,
		           repairSystemData,
		           splitShard,
		           triggerAudit);
	}
};
View File

@@ -30,6 +30,7 @@
#include "fdbclient/FDBTypes.h"
#include "fdbclient/GlobalConfig.h"
#include "fdbclient/GrvProxyInterface.h"
#include "fdbclient/IdempotencyId.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/TagThrottle.actor.h"
#include "fdbclient/VersionVector.h"
@@ -186,6 +187,7 @@ struct CommitTransactionRequest : TimedRequest {
	Optional<UID> debugID;
	Optional<ClientTrCommitCostEstimation> commitCostEstimation;
	Optional<TagSet> tagSet;
	IdempotencyIdRef idempotencyId;
	TenantInfo tenantInfo;
@@ -196,8 +198,17 @@ struct CommitTransactionRequest : TimedRequest {
	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar,
		           transaction,
		           reply,
		           flags,
		           debugID,
		           commitCostEstimation,
		           tagSet,
		           spanContext,
		           tenantInfo,
		           idempotencyId,
		           arena);
	}
};
@@ -224,6 +235,7 @@ struct GetReadVersionReply : public BasicLoadBalancedReply {
	bool rkBatchThrottled = false;
	TransactionTagMap<ClientTagThrottleLimits> tagThrottleInfo;
	double proxyTagThrottledDuration{ 0.0 };
	VersionVector ssVersionVectorDelta;
	UID proxyId; // GRV proxy ID to detect old GRV proxies at client side
@@ -242,7 +254,8 @@ struct GetReadVersionReply : public BasicLoadBalancedReply {
		           rkDefaultThrottled,
		           rkBatchThrottled,
		           ssVersionVectorDelta,
		           proxyId,
		           proxyTagThrottledDuration);
	}
};
@@ -267,6 +280,10 @@ struct GetReadVersionRequest : TimedRequest {
	TransactionPriority priority;
	TransactionTagMap<uint32_t> tags;
	// Not serialized, because this field does not need to be sent to master.
	// It is used for reporting to clients the amount of time spent delayed by
	// the TagQueue
	double proxyTagThrottledDuration{ 0.0 };
	Optional<UID> debugID;
	ReplyPromise<GetReadVersionReply> reply;
@@ -303,6 +320,8 @@ struct GetReadVersionRequest : TimedRequest {
	bool operator<(GetReadVersionRequest const& rhs) const { return priority < rhs.priority; }
	bool isTagged() const { return !tags.empty(); }
	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, transactionCount, flags, tags, debugID, reply, spanContext, maxVersion);

View File

@@ -59,11 +59,12 @@ Future<Void> onEncryptKeyProxyChange(Reference<AsyncVar<T> const> db) {
ACTOR template <class T>
Future<EKPGetLatestBaseCipherKeysReply> getUncachedLatestEncryptCipherKeys(Reference<AsyncVar<T> const> db,
                                                                           EKPGetLatestBaseCipherKeysRequest request,
                                                                           BlobCipherMetrics::UsageType usageType) {
	Optional<EncryptKeyProxyInterface> proxy = db->get().encryptKeyProxy;
	if (!proxy.present()) {
		// Wait for onEncryptKeyProxyChange.
		TraceEvent("GetLatestEncryptCipherKeys_EncryptKeyProxyNotPresent").detail("UsageType", toString(usageType));
		return Never();
	}
	request.reply.reset();
@@ -117,7 +118,7 @@ Future<std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>> getL
	// Fetch any uncached cipher keys.
	state double startTime = now();
	loop choose {
		when(EKPGetLatestBaseCipherKeysReply reply = wait(getUncachedLatestEncryptCipherKeys(db, request, usageType))) {
			// Insert base cipher keys into cache and construct result.
			for (const EKPBaseCipherDetails& details : reply.baseCipherDetails) {
				EncryptCipherDomainId domainId = details.encryptDomainId;
@@ -167,11 +168,12 @@ Future<Reference<BlobCipherKey>> getLatestEncryptCipherKey(Reference<AsyncVar<T>
ACTOR template <class T>
Future<EKPGetBaseCipherKeysByIdsReply> getUncachedEncryptCipherKeys(Reference<AsyncVar<T> const> db,
                                                                    EKPGetBaseCipherKeysByIdsRequest request,
                                                                    BlobCipherMetrics::UsageType usageType) {
	Optional<EncryptKeyProxyInterface> proxy = db->get().encryptKeyProxy;
	if (!proxy.present()) {
		// Wait for onEncryptKeyProxyChange.
		TraceEvent("GetEncryptCipherKeys_EncryptKeyProxyNotPresent").detail("UsageType", toString(usageType));
		return Never();
	}
	request.reply.reset();
@@ -232,7 +234,7 @@ Future<std::unordered_map<BlobCipherDetails, Reference<BlobCipherKey>>> getEncry
	// Fetch any uncached cipher keys.
	state double startTime = now();
	loop choose {
		when(EKPGetBaseCipherKeysByIdsReply reply = wait(getUncachedEncryptCipherKeys(db, request, usageType))) {
			std::unordered_map<BaseCipherIndex, EKPBaseCipherDetails, boost::hash<BaseCipherIndex>> baseCipherKeys;
			for (const EKPBaseCipherDetails& baseDetails : reply.baseCipherDetails) {
				BaseCipherIndex baseIdx = std::make_pair(baseDetails.encryptDomainId, baseDetails.baseCipherId);

View File

@@ -0,0 +1,166 @@
/*
* IdempotencyId.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBCLIENT_IDEMPOTENCYID_H
#define FDBCLIENT_IDEMPOTENCYID_H
#pragma once
#include "fdbclient/FDBTypes.h"
#include "fdbclient/PImpl.h"
#include "flow/Arena.h"
#include "flow/IRandom.h"
#include "flow/serialize.h"
struct CommitResult {
Version commitVersion;
uint16_t batchIndex;
};
// See design/idempotency_ids.md for more information. Designed so that the common case of a random 16 byte id does not
// usually require indirection. Either invalid or an id with length >= 16 and < 256.
struct IdempotencyIdRef {
static constexpr auto file_identifier = 3858470;
// Create an invalid IdempotencyIdRef
IdempotencyIdRef() : first(0) {}
// Borrows memory from the StringRef
explicit IdempotencyIdRef(StringRef id) {
if (id.empty()) {
first = 0;
return;
}
ASSERT(id.size() >= 16);
ASSERT(id.size() < 256);
if (id.size() == 16 &&
/* If it's 16 bytes but first < 256 we still need to use an indirection to avoid ambiguity. */
reinterpret_cast<const uint64_t*>(id.begin())[0] >= 256) {
first = reinterpret_cast<const uint64_t*>(id.begin())[0];
second.id = reinterpret_cast<const uint64_t*>(id.begin())[1];
} else {
first = id.size();
second.ptr = id.begin();
}
}
IdempotencyIdRef(Arena& arena, IdempotencyIdRef t)
: IdempotencyIdRef(t.valid() && t.indirect() ? StringRef(arena, t.asStringRefUnsafe()) : t.asStringRefUnsafe()) {}
int expectedSize() const {
if (valid() && indirect()) {
return first;
}
return 0;
}
bool operator==(const IdempotencyIdRef& other) const { return asStringRefUnsafe() == other.asStringRefUnsafe(); }
IdempotencyIdRef(IdempotencyIdRef&& other) = default;
IdempotencyIdRef& operator=(IdempotencyIdRef&& other) = default;
IdempotencyIdRef(const IdempotencyIdRef& other) = default;
IdempotencyIdRef& operator=(const IdempotencyIdRef& other) = default;
template <class Archive>
void serialize(Archive& ar) {
// Only support network messages/object serializer for now
ASSERT(false);
}
bool valid() const { return first != 0; }
// Result may reference this, so *this must outlive result.
StringRef asStringRefUnsafe() const {
if (!valid()) {
return StringRef();
}
if (indirect()) {
return StringRef(second.ptr, first);
} else {
return StringRef(reinterpret_cast<const uint8_t*>(this), sizeof(*this));
}
}
private:
bool indirect() const { return first < 256; }
// first == 0 means this id is invalid. This representation is not ambiguous
// because if first < 256, then first is the length of the id, but a valid
// id is at least 16 bytes long.
uint64_t first;
union {
uint64_t id;
const uint8_t* ptr;
} second; // If first < 256, then ptr is valid. Otherwise id is valid.
};
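// A minimal sketch of the representation rule above (illustrative only): a 16-byte id
// whose first 8 bytes decode to a value >= 256 is stored inline in the two words, and
// every other valid id goes through the (length, pointer) indirection.
inline bool wouldStoreInlineSketch(StringRef id) {
	return id.size() == 16 && reinterpret_cast<const uint64_t*>(id.begin())[0] >= 256;
}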
using IdempotencyId = Standalone<IdempotencyIdRef>;
namespace std {
template <>
struct hash<IdempotencyIdRef> {
std::size_t operator()(const IdempotencyIdRef& id) const { return std::hash<StringRef>{}(id.asStringRefUnsafe()); }
};
template <>
struct hash<IdempotencyId> {
std::size_t operator()(const IdempotencyId& id) const { return std::hash<StringRef>{}(id.asStringRefUnsafe()); }
};
} // namespace std
template <>
struct dynamic_size_traits<IdempotencyIdRef> : std::true_type {
template <class Context>
static size_t size(const IdempotencyIdRef& t, Context&) {
return t.asStringRefUnsafe().size();
}
template <class Context>
static void save(uint8_t* out, const IdempotencyIdRef& t, Context&) {
StringRef s = t.asStringRefUnsafe();
std::copy(s.begin(), s.end(), out);
}
template <class Context>
static void load(const uint8_t* ptr, size_t sz, IdempotencyIdRef& id, Context& context) {
id = IdempotencyIdRef(StringRef(context.tryReadZeroCopy(ptr, sz), sz));
}
};
// The plan is to use this as a key in a potentially large hashtable, so it should be compact.
static_assert(sizeof(IdempotencyIdRef) == 16);
// Use in the commit proxy to construct a kv pair according to the format described in design/idempotency_ids.md
struct IdempotencyIdKVBuilder : NonCopyable {
IdempotencyIdKVBuilder();
void setCommitVersion(Version commitVersion);
// All calls to add must share the same high order byte of batchIndex (until the next call to buildAndClear)
void add(const IdempotencyIdRef& id, uint16_t batchIndex);
// Must call setCommitVersion before calling buildAndClear. After calling buildAndClear, this object is ready to
// start a new kv pair for the high order byte of batchIndex.
Optional<KeyValue> buildAndClear();
~IdempotencyIdKVBuilder();
private:
PImpl<struct IdempotencyIdKVBuilderImpl> impl;
};
// Check if id is present in kv, and if so return the commit version and batchIndex
Optional<CommitResult> kvContainsIdempotencyId(const KeyValueRef& kv, const IdempotencyIdRef& id);
#endif

View File

@@ -138,6 +138,9 @@ ACTOR Future<int> setDDMode(Database cx, int mode);
ACTOR Future<Void> forceRecovery(Reference<IClusterConnectionRecord> clusterFile, Standalone<StringRef> dcId);
// Start an audit of the specific type on the given range.
ACTOR Future<UID> auditStorage(Reference<IClusterConnectionRecord> clusterFile, KeyRange range, AuditType type);
ACTOR Future<Void> printHealthyZone(Database cx);
ACTOR Future<bool> clearHealthyZone(Database cx, bool printWarning = false, bool clearSSFailureZoneString = false);
ACTOR Future<bool> setHealthyZone(Database cx, StringRef zoneId, double seconds, bool printWarning = false);
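// A hedged usage sketch of auditStorage (illustrative only; the trace event name is
// hypothetical): schedule an HA validation audit over the whole keyspace.
ACTOR Future<Void> triggerHaAuditExample(Reference<IClusterConnectionRecord> clusterFile) {
	UID auditId = wait(auditStorage(clusterFile, allKeys, AuditType::ValidateHA));
	TraceEvent("ExampleAuditScheduled").detail("AuditId", auditId);
	return Void();
}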

View File

@@ -480,6 +480,7 @@ public:
	double RATEKEEPER_FAILURE_TIME;
	double CONSISTENCYSCAN_FAILURE_TIME;
	double BLOB_MANAGER_FAILURE_TIME;
	double BLOB_MIGRATOR_FAILURE_TIME;
	double REPLACE_INTERFACE_DELAY;
	double REPLACE_INTERFACE_CHECK_DELAY;
	double COORDINATOR_REGISTER_INTERVAL;
@@ -703,6 +704,7 @@ public:
	int FETCH_KEYS_PARALLELISM_FULL;
	int FETCH_KEYS_LOWER_PRIORITY;
	int SERVE_FETCH_CHECKPOINT_PARALLELISM;
	int SERVE_AUDIT_STORAGE_PARALLELISM;
	int CHANGE_FEED_DISK_READS_PARALLELISM;
	int BUGGIFY_BLOCK_BYTES;
	int64_t STORAGE_RECOVERY_VERSION_LAG_LIMIT;
@@ -973,6 +975,7 @@ public:
	bool REST_KMS_CONNECTOR_REFRESH_KMS_URLS;
	double REST_KMS_CONNECTOR_REFRESH_KMS_URLS_INTERVAL_SEC;
	std::string REST_KMS_CONNECTOR_GET_ENCRYPTION_KEYS_ENDPOINT;
	std::string REST_KMS_CONNECTOR_GET_BLOB_METADATA_ENDPOINT;
	ServerKnobs(Randomize, ClientKnobs*, IsSimulated);
	void initialize(Randomize, ClientKnobs*, IsSimulated);
View File

@@ -22,6 +22,7 @@
#define FDBCLIENT_STORAGESERVERINTERFACE_H
#pragma once
#include "fdbclient/Audit.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/StorageCheckpoint.h"
#include "fdbclient/StorageServerShard.h"
@@ -120,8 +121,8 @@ struct StorageServerInterface {
	RequestStream<struct GetCheckpointRequest> checkpoint;
	RequestStream<struct FetchCheckpointRequest> fetchCheckpoint;
	RequestStream<struct FetchCheckpointKeyValuesRequest> fetchCheckpointKeyValues;
	RequestStream<struct UpdateCommitCostRequest> updateCommitCostRequest;
	RequestStream<struct AuditStorageRequest> auditStorage;
private:
	bool acceptingRequests;
@@ -195,6 +196,8 @@ public:
			    getValue.getEndpoint().getAdjustedEndpoint(21));
			updateCommitCostRequest =
			    RequestStream<struct UpdateCommitCostRequest>(getValue.getEndpoint().getAdjustedEndpoint(22));
			auditStorage =
			    RequestStream<struct AuditStorageRequest>(getValue.getEndpoint().getAdjustedEndpoint(23));
		}
	} else {
		ASSERT(Ar::isDeserializing);
@@ -246,6 +249,7 @@ public:
		streams.push_back(fetchCheckpoint.getReceiver());
		streams.push_back(fetchCheckpointKeyValues.getReceiver());
		streams.push_back(updateCommitCostRequest.getReceiver());
		streams.push_back(auditStorage.getReceiver());
		FlowTransport::transport().addEndpoints(streams);
	}
};

View File

@@ -94,6 +94,13 @@ void decodeKeyServersValue(RangeResult result,
extern const KeyRef clusterIdKey;
extern const KeyRangeRef auditRange;
extern const KeyRef auditPrefix;
const Key auditRangeKey(const AuditType type, const UID& auditId, const KeyRef& key);
const Key auditRangePrefix(const AuditType type, const UID& auditId);
const Value auditStorageStateValue(const AuditStorageState& auditStorageState);
AuditStorageState decodeAuditStorageState(const ValueRef& value);
// "\xff/checkpoint/[[UID]] := [[CheckpointMetaData]]"
extern const KeyRef checkpointPrefix;
const Key checkpointKeyFor(UID checkpointID);
@@ -708,6 +715,9 @@ extern const KeyRangeRef storageQuotaKeys;
extern const KeyRef storageQuotaPrefix;
Key storageQuotaKey(StringRef tenantName);
extern const KeyRangeRef idempotencyIdKeys;
extern const KeyRef idempotencyIdsExpiredVersion;
#pragma clang diagnostic pop
#endif

View File

@@ -48,6 +48,7 @@ struct Tuple {
	// Note that strings can't be incomplete because they are parsed such that the end of the packed
	// byte string is considered the end of the string in lieu of a specific end.
	static Tuple unpack(StringRef const& str, bool exclude_incomplete = false);
	static std::string tupleToString(Tuple const& tuple);
	static Tuple unpackUserType(StringRef const& str, bool exclude_incomplete = false);
	Tuple& append(Tuple const& tuple);

View File

@@ -201,6 +201,9 @@ description is not currently required but encouraged.
	<Option name="transaction_include_port_in_address" code="505"
	        description="Deprecated. Addresses returned by get_addresses_for_key include the port when enabled. As of api version 630, this option is enabled by default and setting this has no effect."
	        defaultFor="23"/>
	<Option name="transaction_automatic_idempotency" code="506"
	        description="Set a random idempotency id for all transactions. See the transaction option description for more information."
	        defaultFor="505"/>
	<Option name="transaction_bypass_unreadable" code="700"
	        description="Allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. This sets the ``bypass_unreadable`` option of each transaction created by this database. See the transaction option description for more information."
	        defaultFor="1100"/>
@@ -273,6 +276,11 @@ description is not currently required but encouraged.
	<Option name="size_limit" code="503"
	        paramType="Int" paramDescription="value in bytes"
	        description="Set the transaction size limit in bytes. The size is calculated by combining the sizes of all keys and values written or mutated, all key ranges cleared, and all read and write conflict ranges. (In other words, it includes the total size of all data included in the request to the cluster to commit the transaction.) Large transactions can cause performance problems on FoundationDB clusters, so setting this limit to a smaller value than the default can help prevent the client from accidentally degrading the cluster's performance. This value must be at least 32 and cannot be set to higher than 10,000,000, the default transaction size limit." />
<Option name="idempotency_id" code="504"
paramType="String" paramDescription="Unique ID"
description="Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes." />
<Option name="automatic_idempotency" code="505"
description="Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future." />
<Option name="snapshot_ryw_enable" code="600" <Option name="snapshot_ryw_enable" code="600"
description="Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior." /> description="Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior." />
<Option name="snapshot_ryw_disable" code="601" <Option name="snapshot_ryw_disable" code="601"
View File
@ -283,6 +283,15 @@ ProcessClass::Fitness ProcessClass::machineClassFitness(ClusterRole role) const
default: default:
return ProcessClass::NeverAssign; return ProcessClass::NeverAssign;
} }
case ProcessClass::BlobMigrator:
switch (_class) {
case ProcessClass::StatelessClass:
return ProcessClass::GoodFit;
case ProcessClass::MasterClass:
return ProcessClass::OkayFit;
default:
return ProcessClass::NeverAssign;
}
case ProcessClass::StorageCache: case ProcessClass::StorageCache:
switch (_class) { switch (_class) {
case ProcessClass::StorageCacheClass: case ProcessClass::StorageCacheClass:
View File
@ -8,6 +8,7 @@
#include "flow/network.h" #include "flow/network.h"
#include <boost/unordered_map.hpp> #include <boost/unordered_map.hpp>
#include <boost/unordered_set.hpp>
#include <fmt/format.h> #include <fmt/format.h>
#include <list> #include <list>
@ -123,20 +124,70 @@ TEST_CASE("/fdbrpc/authz/LRUCache") {
return Void(); return Void();
} }
struct TokenCacheImpl { struct CacheEntry {
struct CacheEntry { Arena arena;
Arena arena; VectorRef<TenantNameRef> tenants;
VectorRef<TenantNameRef> tenants; Optional<StringRef> tokenId;
double expirationTime = 0.0; double expirationTime = 0.0;
}; };
struct AuditEntry {
NetworkAddress address;
Optional<Standalone<StringRef>> tokenId;
explicit AuditEntry(NetworkAddress const& address, CacheEntry const& cacheEntry)
: address(address),
tokenId(cacheEntry.tokenId.present() ? Standalone<StringRef>(cacheEntry.tokenId.get(), cacheEntry.arena)
: Optional<Standalone<StringRef>>()) {}
};
bool operator==(AuditEntry const& lhs, AuditEntry const& rhs) {
return (lhs.address == rhs.address) && (lhs.tokenId.present() == rhs.tokenId.present()) &&
(!lhs.tokenId.present() || lhs.tokenId.get() == rhs.tokenId.get());
}
std::size_t hash_value(AuditEntry const& value) {
std::size_t seed = 0;
boost::hash_combine(seed, value.address);
if (value.tokenId.present()) {
boost::hash_combine(seed, value.tokenId.get());
}
return seed;
}
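
For reference, a standalone sketch (illustrative types, not FDB's) of the boost hashing idiom used above: a free hash_value overload found by argument-dependent lookup is what lets boost::unordered_set<AuditEntry> deduplicate entries without a std::hash specialization.

#include <boost/functional/hash.hpp>
#include <boost/unordered_set.hpp>
#include <string>

struct Entry {
    std::string address;
    std::string tokenId;
};

bool operator==(Entry const& a, Entry const& b) {
    return a.address == b.address && a.tokenId == b.tokenId;
}

std::size_t hash_value(Entry const& e) {
    std::size_t seed = 0;
    boost::hash_combine(seed, e.address);
    boost::hash_combine(seed, e.tokenId);
    return seed;
}

int main() {
    boost::unordered_set<Entry> s;
    s.insert({ "10.0.0.1:4500", "token-a" });
    s.insert({ "10.0.0.1:4500", "token-a" }); // duplicate, not inserted
    return s.size() == 1 ? 0 : 1;
}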
struct TokenCacheImpl {
LRUCache<StringRef, CacheEntry> cache; LRUCache<StringRef, CacheEntry> cache;
TokenCacheImpl() : cache(FLOW_KNOBS->TOKEN_CACHE_SIZE) {} boost::unordered_set<AuditEntry> usedTokens;
Future<Void> auditor;
TokenCacheImpl();
bool validate(TenantNameRef tenant, StringRef token); bool validate(TenantNameRef tenant, StringRef token);
bool validateAndAdd(double currentTime, StringRef token, NetworkAddress const& peer); bool validateAndAdd(double currentTime, StringRef token, NetworkAddress const& peer);
}; };
ACTOR Future<Void> tokenCacheAudit(TokenCacheImpl* self) {
state boost::unordered_set<AuditEntry> audits;
state boost::unordered_set<AuditEntry>::iterator iter;
state double lastLoggedTime = 0;
loop {
auto const timeSinceLog = g_network->timer() - lastLoggedTime;
if (timeSinceLog < FLOW_KNOBS->AUDIT_TIME_WINDOW) {
wait(delay(FLOW_KNOBS->AUDIT_TIME_WINDOW - timeSinceLog));
}
lastLoggedTime = g_network->timer();
audits.swap(self->usedTokens);
for (iter = audits.begin(); iter != audits.end(); ++iter) {
CODE_PROBE(true, "Audit Logging Running");
TraceEvent("AuditTokenUsed").detail("Client", iter->address).detail("TokenId", iter->tokenId).log();
wait(yield());
}
audits.clear();
}
}
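
The audit loop above batches token uses per time window and swaps the set out before logging, so validation never blocks on logging. The same pattern in plain, runnable C++ (a 5-second window is assumed here; FDB uses FLOW_KNOBS->AUDIT_TIME_WINDOW and the flow scheduler rather than a sleeping thread):

#include <chrono>
#include <cstdio>
#include <set>
#include <string>
#include <thread>
#include <utility>

int main() {
    std::set<std::pair<std::string, std::string>> used, draining;
    used.insert({ "10.0.0.1:4500", "token-a" });
    used.insert({ "10.0.0.1:4500", "token-a" }); // deduplicated within the window
    std::this_thread::sleep_for(std::chrono::seconds(5));
    draining.swap(used); // cheap handoff; new uses land in the fresh set
    for (const auto& [client, token] : draining)
        std::printf("AuditTokenUsed client=%s token=%s\n", client.c_str(), token.c_str());
}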
TokenCacheImpl::TokenCacheImpl() : cache(FLOW_KNOBS->TOKEN_CACHE_SIZE) {
auditor = tokenCacheAudit(this);
}
TokenCache::TokenCache() : impl(new TokenCacheImpl()) {} TokenCache::TokenCache() : impl(new TokenCacheImpl()) {}
TokenCache::~TokenCache() { TokenCache::~TokenCache() {
delete impl; delete impl;
@ -212,6 +263,9 @@ bool TokenCacheImpl::validateAndAdd(double currentTime, StringRef token, Network
for (auto tenant : t.tenants.get()) { for (auto tenant : t.tenants.get()) {
c.tenants.push_back_deep(c.arena, tenant); c.tenants.push_back_deep(c.arena, tenant);
} }
if (t.tokenId.present()) {
c.tokenId = StringRef(c.arena, t.tokenId.get());
}
cache.insert(StringRef(c.arena, token), c); cache.insert(StringRef(c.arena, token), c);
return true; return true;
} }
@ -250,6 +304,8 @@ bool TokenCacheImpl::validate(TenantNameRef name, StringRef token) {
TraceEvent(SevWarn, "TenantTokenMismatch").detail("From", peer).detail("Tenant", name.toString()); TraceEvent(SevWarn, "TenantTokenMismatch").detail("From", peer).detail("Tenant", name.toString());
return false; return false;
} }
// audit logging
usedTokens.insert(AuditEntry(peer, *cachedEntry.get()));
return true; return true;
} }
View File
@ -50,6 +50,7 @@ struct ProcessClass {
BlobWorkerClass, BlobWorkerClass,
EncryptKeyProxyClass, EncryptKeyProxyClass,
ConsistencyScanClass, ConsistencyScanClass,
BlobMigratorClass,
InvalidClass = -1 InvalidClass = -1
}; };
@ -77,6 +78,7 @@ struct ProcessClass {
static_assert(ProcessClass::BlobWorkerClass == 19); static_assert(ProcessClass::BlobWorkerClass == 19);
static_assert(ProcessClass::EncryptKeyProxyClass == 20); static_assert(ProcessClass::EncryptKeyProxyClass == 20);
static_assert(ProcessClass::ConsistencyScanClass == 21); static_assert(ProcessClass::ConsistencyScanClass == 21);
static_assert(ProcessClass::BlobMigratorClass == 22);
static_assert(ProcessClass::InvalidClass == -1); static_assert(ProcessClass::InvalidClass == -1);
enum Fitness { enum Fitness {
@ -102,6 +104,7 @@ struct ProcessClass {
ConsistencyScan, ConsistencyScan,
BlobManager, BlobManager,
BlobWorker, BlobWorker,
BlobMigrator,
StorageCache, StorageCache,
Backup, Backup,
EncryptKeyProxy, EncryptKeyProxy,
View File
@ -59,10 +59,9 @@ public:
const UID& dbgid_, const UID& dbgid_,
Arena& arena_, Arena& arena_,
const VectorRef<MutationRef>& mutations_, const VectorRef<MutationRef>& mutations_,
IKeyValueStore* txnStateStore_, IKeyValueStore* txnStateStore_)
Reference<AsyncVar<ServerDBInfo> const> db)
: spanContext(spanContext_), dbgid(dbgid_), arena(arena_), mutations(mutations_), txnStateStore(txnStateStore_), : spanContext(spanContext_), dbgid(dbgid_), arena(arena_), mutations(mutations_), txnStateStore(txnStateStore_),
confChange(dummyConfChange), dbInfo(db) {} confChange(dummyConfChange) {}
ApplyMetadataMutationsImpl(const SpanContext& spanContext_, ApplyMetadataMutationsImpl(const SpanContext& spanContext_,
Arena& arena_, Arena& arena_,
@ -84,17 +83,16 @@ public:
commit(proxyCommitData_.commit), cx(proxyCommitData_.cx), committedVersion(&proxyCommitData_.committedVersion), commit(proxyCommitData_.commit), cx(proxyCommitData_.cx), committedVersion(&proxyCommitData_.committedVersion),
storageCache(&proxyCommitData_.storageCache), tag_popped(&proxyCommitData_.tag_popped), storageCache(&proxyCommitData_.storageCache), tag_popped(&proxyCommitData_.tag_popped),
tssMapping(&proxyCommitData_.tssMapping), tenantMap(&proxyCommitData_.tenantMap), tssMapping(&proxyCommitData_.tssMapping), tenantMap(&proxyCommitData_.tenantMap),
tenantIdIndex(&proxyCommitData_.tenantIdIndex), initialCommit(initialCommit_), dbInfo(proxyCommitData_.db) {} tenantIdIndex(&proxyCommitData_.tenantIdIndex), initialCommit(initialCommit_) {}
ApplyMetadataMutationsImpl(const SpanContext& spanContext_, ApplyMetadataMutationsImpl(const SpanContext& spanContext_,
ResolverData& resolverData_, ResolverData& resolverData_,
const VectorRef<MutationRef>& mutations_, const VectorRef<MutationRef>& mutations_)
Reference<AsyncVar<ServerDBInfo> const> db)
: spanContext(spanContext_), dbgid(resolverData_.dbgid), arena(resolverData_.arena), mutations(mutations_), : spanContext(spanContext_), dbgid(resolverData_.dbgid), arena(resolverData_.arena), mutations(mutations_),
txnStateStore(resolverData_.txnStateStore), toCommit(resolverData_.toCommit), txnStateStore(resolverData_.txnStateStore), toCommit(resolverData_.toCommit),
confChange(resolverData_.confChanges), logSystem(resolverData_.logSystem), popVersion(resolverData_.popVersion), confChange(resolverData_.confChanges), logSystem(resolverData_.logSystem), popVersion(resolverData_.popVersion),
keyInfo(resolverData_.keyInfo), storageCache(resolverData_.storageCache), keyInfo(resolverData_.keyInfo), storageCache(resolverData_.storageCache),
initialCommit(resolverData_.initialCommit), forResolver(true), dbInfo(db) {} initialCommit(resolverData_.initialCommit), forResolver(true) {}
private: private:
// The following variables are incoming parameters // The following variables are incoming parameters
@ -142,8 +140,6 @@ private:
// true if called from Resolver // true if called from Resolver
bool forResolver = false; bool forResolver = false;
Reference<AsyncVar<ServerDBInfo> const> dbInfo;
private: private:
// The following variables are used internally // The following variables are used internally
@ -164,7 +160,7 @@ private:
private: private:
void writeMutation(const MutationRef& m) { void writeMutation(const MutationRef& m) {
if (forResolver || !isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION, dbInfo->get().client)) { if (forResolver || !isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION)) {
toCommit->writeTypedMessage(m); toCommit->writeTypedMessage(m);
} else { } else {
ASSERT(cipherKeys != nullptr); ASSERT(cipherKeys != nullptr);
@ -1347,16 +1343,14 @@ void applyMetadataMutations(SpanContext const& spanContext,
void applyMetadataMutations(SpanContext const& spanContext, void applyMetadataMutations(SpanContext const& spanContext,
ResolverData& resolverData, ResolverData& resolverData,
const VectorRef<MutationRef>& mutations, const VectorRef<MutationRef>& mutations) {
Reference<AsyncVar<ServerDBInfo> const> dbInfo) { ApplyMetadataMutationsImpl(spanContext, resolverData, mutations).apply();
ApplyMetadataMutationsImpl(spanContext, resolverData, mutations, dbInfo).apply();
} }
void applyMetadataMutations(SpanContext const& spanContext, void applyMetadataMutations(SpanContext const& spanContext,
const UID& dbgid, const UID& dbgid,
Arena& arena, Arena& arena,
const VectorRef<MutationRef>& mutations, const VectorRef<MutationRef>& mutations,
IKeyValueStore* txnStateStore, IKeyValueStore* txnStateStore) {
Reference<AsyncVar<ServerDBInfo> const> dbInfo) { ApplyMetadataMutationsImpl(spanContext, dbgid, arena, mutations, txnStateStore).apply();
ApplyMetadataMutationsImpl(spanContext, dbgid, arena, mutations, txnStateStore, dbInfo).apply();
} }
View File
@ -483,7 +483,7 @@ ACTOR Future<Void> loadBlobMetadataForTenants(
} }
auto dataEntry = self->tenantData.rangeContaining(info->second.prefix); auto dataEntry = self->tenantData.rangeContaining(info->second.prefix);
ASSERT(dataEntry.begin() == info->second.prefix); ASSERT(dataEntry.begin() == info->second.prefix);
dataEntry.cvalue()->setBStore(BlobConnectionProvider::newBlobConnectionProvider(metadata)); dataEntry.cvalue()->updateBStore(metadata);
} }
return Void(); return Void();
} }
@ -492,6 +492,14 @@ ACTOR Future<Void> loadBlobMetadataForTenants(
} }
} }
Future<Void> loadBlobMetadataForTenant(BGTenantMap* self,
BlobMetadataDomainId domainId,
BlobMetadataDomainName domainName) {
std::vector<std::pair<BlobMetadataDomainId, BlobMetadataDomainName>> toLoad;
toLoad.push_back({ domainId, domainName });
return loadBlobMetadataForTenants(self, toLoad);
}
// list of tenants that may or may not already exist // list of tenants that may or may not already exist
void BGTenantMap::addTenants(std::vector<std::pair<TenantName, TenantMapEntry>> tenants) { void BGTenantMap::addTenants(std::vector<std::pair<TenantName, TenantMapEntry>> tenants) {
std::vector<std::pair<BlobMetadataDomainId, BlobMetadataDomainName>> tenantsToLoad; std::vector<std::pair<BlobMetadataDomainId, BlobMetadataDomainName>> tenantsToLoad;
@ -526,11 +534,41 @@ Optional<TenantMapEntry> BGTenantMap::getTenantById(int64_t id) {
} }
} }
// TODO: handle case where tenant isn't loaded yet // FIXME: batch requests for refresh?
Reference<GranuleTenantData> BGTenantMap::getDataForGranule(const KeyRangeRef& keyRange) { // FIXME: don't double fetch if multiple accesses to refreshing/expired metadata
auto tenant = tenantData.rangeContaining(keyRange.begin); // FIXME: log warning if after refresh, data is still expired!
ASSERT(tenant.begin() <= keyRange.begin); ACTOR Future<Reference<GranuleTenantData>> getDataForGranuleActor(BGTenantMap* self, KeyRange keyRange) {
ASSERT(tenant.end() >= keyRange.end); state int loopCount = 0;
loop {
loopCount++;
auto tenant = self->tenantData.rangeContaining(keyRange.begin);
ASSERT(tenant.begin() <= keyRange.begin);
ASSERT(tenant.end() >= keyRange.end);
return tenant.cvalue(); if (!tenant.cvalue().isValid() || !tenant.cvalue()->bstore.isValid()) {
return tenant.cvalue();
} else if (tenant.cvalue()->bstore->isExpired()) {
CODE_PROBE(true, "re-fetching expired blob metadata");
// fetch again
Future<Void> reload = loadBlobMetadataForTenant(self, tenant.cvalue()->entry.id, tenant.cvalue()->name);
wait(reload);
if (loopCount > 1) {
TraceEvent(SevWarn, "BlobMetadataStillExpired").suppressFor(5.0).detail("LoopCount", loopCount);
wait(delay(0.001));
}
} else {
// handle refresh in background if tenant needs refresh
if (tenant.cvalue()->bstore->needsRefresh()) {
Future<Void> reload =
loadBlobMetadataForTenant(self, tenant.cvalue()->entry.id, tenant.cvalue()->name);
self->addActor.send(reload);
}
return tenant.cvalue();
}
}
}
// TODO: handle case where tenant isn't loaded yet
Future<Reference<GranuleTenantData>> BGTenantMap::getDataForGranule(const KeyRangeRef& keyRange) {
return getDataForGranuleActor(this, keyRange);
} }
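
In outline, the policy implemented above: hand back invalid or not-yet-loaded data unchanged, reload synchronously when the metadata has already expired, and only schedule a background refresh when it has merely crossed the refresh threshold. A plain-C++ sketch of that decision (types and thresholds are illustrative, not FDB's):

#include <cstdio>
#include <functional>

enum class Freshness { Fresh, NeedsRefresh, Expired };

int getCached(int cached,
              Freshness f,
              std::function<int()> reloadNow,
              std::function<void()> backgroundReload) {
    if (f == Freshness::Expired)
        return reloadNow(); // block the caller: the data is unusable
    if (f == Freshness::NeedsRefresh)
        backgroundReload(); // caller proceeds with the current data
    return cached;
}

int main() {
    auto reload = [] { std::puts("synchronous reload"); return 42; };
    auto bg = [] { std::puts("background refresh scheduled"); };
    std::printf("%d\n", getCached(7, Freshness::NeedsRefresh, reload, bg));
}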
View File
@ -562,11 +562,12 @@ ACTOR Future<BlobGranuleSplitPoints> alignKeys(Reference<BlobManagerData> bmData
state Transaction tr = Transaction(bmData->db); state Transaction tr = Transaction(bmData->db);
state int idx = 1; state int idx = 1;
state Reference<GranuleTenantData> tenantData = bmData->tenantData.getDataForGranule(granuleRange); state Reference<GranuleTenantData> tenantData;
wait(store(tenantData, bmData->tenantData.getDataForGranule(granuleRange)));
while (SERVER_KNOBS->BG_METADATA_SOURCE == "tenant" && !tenantData.isValid()) { while (SERVER_KNOBS->BG_METADATA_SOURCE == "tenant" && !tenantData.isValid()) {
// this is a bit of a hack, but if we know this range is supposed to have a tenant, and it doesn't, just wait // this is a bit of a hack, but if we know this range is supposed to have a tenant, and it doesn't, just wait
wait(delay(1.0)); wait(delay(1.0));
tenantData = bmData->tenantData.getDataForGranule(granuleRange); wait(store(tenantData, bmData->tenantData.getDataForGranule(granuleRange)));
} }
for (; idx < splits.size() - 1; idx++) { for (; idx < splits.size() - 1; idx++) {
loop { loop {
@ -4212,7 +4213,8 @@ ACTOR Future<Reference<BlobConnectionProvider>> getBStoreForGranule(Reference<Bl
return self->bstore; return self->bstore;
} }
loop { loop {
state Reference<GranuleTenantData> data = self->tenantData.getDataForGranule(granuleRange); state Reference<GranuleTenantData> data;
wait(store(data, self->tenantData.getDataForGranule(granuleRange)));
if (data.isValid()) { if (data.isValid()) {
wait(data->bstoreLoaded.getFuture()); wait(data->bstoreLoaded.getFuture());
wait(delay(0)); wait(delay(0));
View File
@ -0,0 +1,83 @@
/*
* BlobMigrator.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbserver/BlobMigratorInterface.h"
#include "fdbserver/Knobs.h"
#include "flow/ActorCollection.h"
#include "flow/FastRef.h"
#include "flow/IRandom.h"
#include "flow/flow.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/BlobConnectionProvider.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbserver/ServerDBInfo.actor.h"
#include "fdbserver/WaitFailure.h"
#include "flow/actorcompiler.h" // has to be last include
// BlobMigrator manages data migration from blob storage to storage servers. It implements the minimal set of
// StorageServerInterface APIs needed for the DataDistributor to start data migration.
class BlobMigrator : public NonCopyable, public ReferenceCounted<BlobMigrator> {
public:
BlobMigrator(Reference<AsyncVar<ServerDBInfo> const> dbInfo, BlobMigratorInterface interf)
: blobMigratorInterf(interf), actors(false) {
if (!blobConn.isValid() && SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
blobConn = BlobConnectionProvider::newBlobConnectionProvider(SERVER_KNOBS->BG_URL);
}
db = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True);
}
~BlobMigrator() {}
ACTOR static Future<Void> start(Reference<BlobMigrator> self) {
self->actors.add(waitFailureServer(self->blobMigratorInterf.waitFailure.getFuture()));
loop {
choose {
when(HaltBlobMigratorRequest req = waitNext(self->blobMigratorInterf.haltBlobMigrator.getFuture())) {
req.reply.send(Void());
TraceEvent("BlobMigratorHalted", self->blobMigratorInterf.id()).detail("ReqID", req.requesterID);
break;
}
when(wait(self->actors.getResult())) {}
}
}
return Void();
}
private:
Database db;
Reference<BlobConnectionProvider> blobConn;
BlobMigratorInterface blobMigratorInterf;
ActorCollection actors;
};
// Main entry point
ACTOR Future<Void> blobMigrator(BlobMigratorInterface ssi, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
fmt::print("Start blob migrator {} \n", ssi.id().toString());
try {
Reference<BlobMigrator> self = makeReference<BlobMigrator>(dbInfo, ssi);
wait(BlobMigrator::start(self));
} catch (Error& e) {
fmt::print("unexpected blob migrator error {}\n", e.what());
}
return Void();
}
View File
@ -225,8 +225,7 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
resnapshotLock(new FlowLock(SERVER_KNOBS->BLOB_WORKER_RESNAPSHOT_PARALLELISM)), resnapshotLock(new FlowLock(SERVER_KNOBS->BLOB_WORKER_RESNAPSHOT_PARALLELISM)),
deltaWritesLock(new FlowLock(SERVER_KNOBS->BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM)), deltaWritesLock(new FlowLock(SERVER_KNOBS->BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM)),
stats(id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, initialSnapshotLock, resnapshotLock, deltaWritesLock), stats(id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, initialSnapshotLock, resnapshotLock, deltaWritesLock),
isEncryptionEnabled( isEncryptionEnabled(isEncryptionOpSupported(EncryptOperationType::BLOB_GRANULE_ENCRYPTION)) {}
isEncryptionOpSupported(EncryptOperationType::BLOB_GRANULE_ENCRYPTION, db->clientInfo->get())) {}
bool managerEpochOk(int64_t epoch) { bool managerEpochOk(int64_t epoch) {
if (epoch < currentManagerEpoch) { if (epoch < currentManagerEpoch) {
@ -367,7 +366,7 @@ ACTOR Future<BlobGranuleCipherKeysCtx> getLatestGranuleCipherKeys(Reference<Blob
KeyRange keyRange, KeyRange keyRange,
Arena* arena) { Arena* arena) {
state BlobGranuleCipherKeysCtx cipherKeysCtx; state BlobGranuleCipherKeysCtx cipherKeysCtx;
state Reference<GranuleTenantData> tenantData = bwData->tenantData.getDataForGranule(keyRange); state Reference<GranuleTenantData> tenantData = wait(bwData->tenantData.getDataForGranule(keyRange));
ASSERT(tenantData.isValid()); ASSERT(tenantData.isValid());
@ -1195,8 +1194,7 @@ ACTOR Future<BlobFileIndex> compactFromBlob(Reference<BlobWorkerData> bwData,
deltaF = files.deltaFiles[deltaIdx]; deltaF = files.deltaFiles[deltaIdx];
if (deltaF.cipherKeysMeta.present()) { if (deltaF.cipherKeysMeta.present()) {
ASSERT(isEncryptionOpSupported(EncryptOperationType::BLOB_GRANULE_ENCRYPTION, ASSERT(isEncryptionOpSupported(EncryptOperationType::BLOB_GRANULE_ENCRYPTION));
bwData->dbInfo->get().client));
BlobGranuleCipherKeysCtx keysCtx = BlobGranuleCipherKeysCtx keysCtx =
wait(getGranuleCipherKeysFromKeysMeta(bwData, deltaF.cipherKeysMeta.get(), &filenameArena)); wait(getGranuleCipherKeysFromKeysMeta(bwData, deltaF.cipherKeysMeta.get(), &filenameArena));
@ -4097,7 +4095,8 @@ ACTOR Future<Reference<BlobConnectionProvider>> loadBStoreForTenant(Reference<Bl
KeyRange keyRange) { KeyRange keyRange) {
state int retryCount = 0; state int retryCount = 0;
loop { loop {
state Reference<GranuleTenantData> data = bwData->tenantData.getDataForGranule(keyRange); state Reference<GranuleTenantData> data;
wait(store(data, bwData->tenantData.getDataForGranule(keyRange)));
if (data.isValid()) { if (data.isValid()) {
wait(data->bstoreLoaded.getFuture()); wait(data->bstoreLoaded.getFuture());
wait(delay(0)); wait(delay(0));
View File
@ -29,6 +29,8 @@
#include "fdbclient/DatabaseContext.h" #include "fdbclient/DatabaseContext.h"
#include "fdbrpc/FailureMonitor.h" #include "fdbrpc/FailureMonitor.h"
#include "fdbclient/EncryptKeyProxyInterface.h" #include "fdbclient/EncryptKeyProxyInterface.h"
#include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "fdbserver/BlobMigratorInterface.h"
#include "fdbserver/Knobs.h" #include "fdbserver/Knobs.h"
#include "flow/ActorCollection.h" #include "flow/ActorCollection.h"
#include "fdbclient/ClusterConnectionMemoryRecord.h" #include "fdbclient/ClusterConnectionMemoryRecord.h"
@ -198,6 +200,32 @@ struct BlobManagerSingleton : Singleton<BlobManagerInterface> {
} }
}; };
struct BlobMigratorSingleton : Singleton<BlobMigratorInterface> {
BlobMigratorSingleton(const Optional<BlobMigratorInterface>& interface) : Singleton(interface) {}
Role getRole() const { return Role::BLOB_MIGRATOR; }
ProcessClass::ClusterRole getClusterRole() const { return ProcessClass::BlobMigrator; }
void setInterfaceToDbInfo(ClusterControllerData* cc) const {
if (interface.present()) {
TraceEvent("CCMG_SetInf", cc->id).detail("Id", interface.get().id());
cc->db.setBlobMigrator(interface.get());
}
}
void halt(ClusterControllerData* cc, Optional<Standalone<StringRef>> pid) const {
if (interface.present()) {
TraceEvent("CCMG_Halt", cc->id).detail("Id", interface.get().id());
cc->id_worker[pid].haltBlobMigrator =
brokenPromiseToNever(interface.get().haltBlobMigrator.getReply(HaltBlobMigratorRequest(cc->id)));
}
}
void recruit(ClusterControllerData* cc) const {
cc->lastRecruitTime = now();
cc->recruitBlobMigrator.set(true);
}
};
struct EncryptKeyProxySingleton : Singleton<EncryptKeyProxyInterface> { struct EncryptKeyProxySingleton : Singleton<EncryptKeyProxyInterface> {
EncryptKeyProxySingleton(const Optional<EncryptKeyProxyInterface>& interface) : Singleton(interface) {} EncryptKeyProxySingleton(const Optional<EncryptKeyProxyInterface>& interface) : Singleton(interface) {}
@ -275,6 +303,7 @@ ACTOR Future<Void> clusterWatchDatabase(ClusterControllerData* cluster,
dbInfo.distributor = db->serverInfo->get().distributor; dbInfo.distributor = db->serverInfo->get().distributor;
dbInfo.ratekeeper = db->serverInfo->get().ratekeeper; dbInfo.ratekeeper = db->serverInfo->get().ratekeeper;
dbInfo.blobManager = db->serverInfo->get().blobManager; dbInfo.blobManager = db->serverInfo->get().blobManager;
dbInfo.blobMigrator = db->serverInfo->get().blobMigrator;
dbInfo.encryptKeyProxy = db->serverInfo->get().encryptKeyProxy; dbInfo.encryptKeyProxy = db->serverInfo->get().encryptKeyProxy;
dbInfo.consistencyScan = db->serverInfo->get().consistencyScan; dbInfo.consistencyScan = db->serverInfo->get().consistencyScan;
dbInfo.latencyBandConfig = db->serverInfo->get().latencyBandConfig; dbInfo.latencyBandConfig = db->serverInfo->get().latencyBandConfig;
@ -656,8 +685,12 @@ void checkBetterSingletons(ClusterControllerData* self) {
WorkerDetails newCSWorker = findNewProcessForSingleton(self, ProcessClass::ConsistencyScan, id_used); WorkerDetails newCSWorker = findNewProcessForSingleton(self, ProcessClass::ConsistencyScan, id_used);
WorkerDetails newBMWorker; WorkerDetails newBMWorker;
WorkerDetails newMGWorker;
if (self->db.blobGranulesEnabled.get()) { if (self->db.blobGranulesEnabled.get()) {
newBMWorker = findNewProcessForSingleton(self, ProcessClass::BlobManager, id_used); newBMWorker = findNewProcessForSingleton(self, ProcessClass::BlobManager, id_used);
if (isFullRestoreMode()) {
newMGWorker = findNewProcessForSingleton(self, ProcessClass::BlobMigrator, id_used);
}
} }
WorkerDetails newEKPWorker; WorkerDetails newEKPWorker;
@ -671,8 +704,12 @@ void checkBetterSingletons(ClusterControllerData* self) {
auto bestFitnessForCS = findBestFitnessForSingleton(self, newCSWorker, ProcessClass::ConsistencyScan); auto bestFitnessForCS = findBestFitnessForSingleton(self, newCSWorker, ProcessClass::ConsistencyScan);
ProcessClass::Fitness bestFitnessForBM; ProcessClass::Fitness bestFitnessForBM;
ProcessClass::Fitness bestFitnessForMG;
if (self->db.blobGranulesEnabled.get()) { if (self->db.blobGranulesEnabled.get()) {
bestFitnessForBM = findBestFitnessForSingleton(self, newBMWorker, ProcessClass::BlobManager); bestFitnessForBM = findBestFitnessForSingleton(self, newBMWorker, ProcessClass::BlobManager);
if (isFullRestoreMode()) {
bestFitnessForMG = findBestFitnessForSingleton(self, newMGWorker, ProcessClass::BlobMigrator);
}
} }
ProcessClass::Fitness bestFitnessForEKP; ProcessClass::Fitness bestFitnessForEKP;
@ -685,6 +722,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
auto ddSingleton = DataDistributorSingleton(db.distributor); auto ddSingleton = DataDistributorSingleton(db.distributor);
ConsistencyScanSingleton csSingleton(db.consistencyScan); ConsistencyScanSingleton csSingleton(db.consistencyScan);
BlobManagerSingleton bmSingleton(db.blobManager); BlobManagerSingleton bmSingleton(db.blobManager);
BlobMigratorSingleton mgSingleton(db.blobMigrator);
EncryptKeyProxySingleton ekpSingleton(db.encryptKeyProxy); EncryptKeyProxySingleton ekpSingleton(db.encryptKeyProxy);
// Check if the singletons are healthy. // Check if the singletons are healthy.
@ -699,9 +737,14 @@ void checkBetterSingletons(ClusterControllerData* self) {
self, newCSWorker, csSingleton, bestFitnessForCS, self->recruitingConsistencyScanID); self, newCSWorker, csSingleton, bestFitnessForCS, self->recruitingConsistencyScanID);
bool bmHealthy = true; bool bmHealthy = true;
bool mgHealthy = true;
if (self->db.blobGranulesEnabled.get()) { if (self->db.blobGranulesEnabled.get()) {
bmHealthy = isHealthySingleton<BlobManagerInterface>( bmHealthy = isHealthySingleton<BlobManagerInterface>(
self, newBMWorker, bmSingleton, bestFitnessForBM, self->recruitingBlobManagerID); self, newBMWorker, bmSingleton, bestFitnessForBM, self->recruitingBlobManagerID);
if (isFullRestoreMode()) {
mgHealthy = isHealthySingleton<BlobMigratorInterface>(
self, newMGWorker, mgSingleton, bestFitnessForMG, self->recruitingBlobMigratorID);
}
} }
bool ekpHealthy = true; bool ekpHealthy = true;
@ -711,7 +754,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
} }
// if any of the singletons are unhealthy (rerecruited or not stable), then do not // if any of the singletons are unhealthy (rerecruited or not stable), then do not
// consider any further re-recruitments // consider any further re-recruitments
if (!(rkHealthy && ddHealthy && bmHealthy && ekpHealthy && csHealthy)) { if (!(rkHealthy && ddHealthy && bmHealthy && ekpHealthy && csHealthy && mgHealthy)) {
return; return;
} }
@ -725,9 +768,14 @@ void checkBetterSingletons(ClusterControllerData* self) {
Optional<Standalone<StringRef>> newCSProcessId = newCSWorker.interf.locality.processId(); Optional<Standalone<StringRef>> newCSProcessId = newCSWorker.interf.locality.processId();
Optional<Standalone<StringRef>> currBMProcessId, newBMProcessId; Optional<Standalone<StringRef>> currBMProcessId, newBMProcessId;
Optional<Standalone<StringRef>> currMGProcessId, newMGProcessId;
if (self->db.blobGranulesEnabled.get()) { if (self->db.blobGranulesEnabled.get()) {
currBMProcessId = bmSingleton.interface.get().locality.processId(); currBMProcessId = bmSingleton.interface.get().locality.processId();
newBMProcessId = newBMWorker.interf.locality.processId(); newBMProcessId = newBMWorker.interf.locality.processId();
if (isFullRestoreMode()) {
currMGProcessId = mgSingleton.interface.get().locality.processId();
newMGProcessId = newMGWorker.interf.locality.processId();
}
} }
Optional<Standalone<StringRef>> currEKPProcessId, newEKPProcessId; Optional<Standalone<StringRef>> currEKPProcessId, newEKPProcessId;
@ -741,6 +789,10 @@ void checkBetterSingletons(ClusterControllerData* self) {
if (self->db.blobGranulesEnabled.get()) { if (self->db.blobGranulesEnabled.get()) {
currPids.emplace_back(currBMProcessId); currPids.emplace_back(currBMProcessId);
newPids.emplace_back(newBMProcessId); newPids.emplace_back(newBMProcessId);
if (isFullRestoreMode()) {
currPids.emplace_back(currMGProcessId);
newPids.emplace_back(newMGProcessId);
}
} }
if (SERVER_KNOBS->ENABLE_ENCRYPTION) { if (SERVER_KNOBS->ENABLE_ENCRYPTION) {
@ -755,6 +807,10 @@ void checkBetterSingletons(ClusterControllerData* self) {
if (!self->db.blobGranulesEnabled.get()) { if (!self->db.blobGranulesEnabled.get()) {
ASSERT(currColocMap[currBMProcessId] == 0); ASSERT(currColocMap[currBMProcessId] == 0);
ASSERT(newColocMap[newBMProcessId] == 0); ASSERT(newColocMap[newBMProcessId] == 0);
if (isFullRestoreMode()) {
ASSERT(currColocMap[currMGProcessId] == 0);
ASSERT(newColocMap[newMGProcessId] == 0);
}
} }
// if the knob is disabled, the EKP coloc counts should have no affect on the coloc counts check below // if the knob is disabled, the EKP coloc counts should have no affect on the coloc counts check below
@ -767,6 +823,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
if (newColocMap[newRKProcessId] <= currColocMap[currRKProcessId] && if (newColocMap[newRKProcessId] <= currColocMap[currRKProcessId] &&
newColocMap[newDDProcessId] <= currColocMap[currDDProcessId] && newColocMap[newDDProcessId] <= currColocMap[currDDProcessId] &&
newColocMap[newBMProcessId] <= currColocMap[currBMProcessId] && newColocMap[newBMProcessId] <= currColocMap[currBMProcessId] &&
newColocMap[newMGProcessId] <= currColocMap[currMGProcessId] &&
newColocMap[newEKPProcessId] <= currColocMap[currEKPProcessId] && newColocMap[newEKPProcessId] <= currColocMap[currEKPProcessId] &&
newColocMap[newCSProcessId] <= currColocMap[currCSProcessId]) { newColocMap[newCSProcessId] <= currColocMap[currCSProcessId]) {
// rerecruit the singleton for which we have found a better process, if any // rerecruit the singleton for which we have found a better process, if any
@ -776,6 +833,9 @@ void checkBetterSingletons(ClusterControllerData* self) {
ddSingleton.recruit(self); ddSingleton.recruit(self);
} else if (self->db.blobGranulesEnabled.get() && newColocMap[newBMProcessId] < currColocMap[currBMProcessId]) { } else if (self->db.blobGranulesEnabled.get() && newColocMap[newBMProcessId] < currColocMap[currBMProcessId]) {
bmSingleton.recruit(self); bmSingleton.recruit(self);
} else if (self->db.blobGranulesEnabled.get() && isFullRestoreMode() &&
newColocMap[newMGProcessId] < currColocMap[currMGProcessId]) {
mgSingleton.recruit(self);
} else if (SERVER_KNOBS->ENABLE_ENCRYPTION && newColocMap[newEKPProcessId] < currColocMap[currEKPProcessId]) { } else if (SERVER_KNOBS->ENABLE_ENCRYPTION && newColocMap[newEKPProcessId] < currColocMap[currEKPProcessId]) {
ekpSingleton.recruit(self); ekpSingleton.recruit(self);
} else if (newColocMap[newCSProcessId] < currColocMap[currCSProcessId]) { } else if (newColocMap[newCSProcessId] < currColocMap[currCSProcessId]) {
@ -1330,12 +1390,18 @@ ACTOR Future<Void> registerWorker(RegisterWorkerRequest req,
self, w, currSingleton, registeringSingleton, self->recruitingRatekeeperID); self, w, currSingleton, registeringSingleton, self->recruitingRatekeeperID);
} }
if (self->db.blobGranulesEnabled.get() && req.blobManagerInterf.present()) { if (self->db.blobGranulesEnabled.get() && isFullRestoreMode() && req.blobManagerInterf.present()) {
auto currSingleton = BlobManagerSingleton(self->db.serverInfo->get().blobManager); auto currSingleton = BlobManagerSingleton(self->db.serverInfo->get().blobManager);
auto registeringSingleton = BlobManagerSingleton(req.blobManagerInterf); auto registeringSingleton = BlobManagerSingleton(req.blobManagerInterf);
haltRegisteringOrCurrentSingleton<BlobManagerInterface>( haltRegisteringOrCurrentSingleton<BlobManagerInterface>(
self, w, currSingleton, registeringSingleton, self->recruitingBlobManagerID); self, w, currSingleton, registeringSingleton, self->recruitingBlobManagerID);
} }
if (req.blobMigratorInterf.present()) {
auto currSingleton = BlobMigratorSingleton(self->db.serverInfo->get().blobMigrator);
auto registeringSingleton = BlobMigratorSingleton(req.blobMigratorInterf);
haltRegisteringOrCurrentSingleton<BlobMigratorInterface>(
self, w, currSingleton, registeringSingleton, self->recruitingBlobMigratorID);
}
if (SERVER_KNOBS->ENABLE_ENCRYPTION && req.encryptKeyProxyInterf.present()) { if (SERVER_KNOBS->ENABLE_ENCRYPTION && req.encryptKeyProxyInterf.present()) {
auto currSingleton = EncryptKeyProxySingleton(self->db.serverInfo->get().encryptKeyProxy); auto currSingleton = EncryptKeyProxySingleton(self->db.serverInfo->get().encryptKeyProxy);
@ -2013,6 +2079,53 @@ ACTOR Future<Void> handleForcedRecoveries(ClusterControllerData* self, ClusterCo
} }
} }
ACTOR Future<Void> triggerAuditStorage(ClusterControllerData* self, TriggerAuditRequest req) {
TraceEvent(SevInfo, "CCTriggerAuditStorageBegin", self->id)
.detail("Range", req.range)
.detail("AuditType", req.type);
state UID auditId;
try {
while (self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS ||
!self->db.serverInfo->get().distributor.present()) {
wait(self->db.serverInfo->onChange());
}
TriggerAuditRequest fReq(req.getType(), req.range);
UID auditId_ = wait(self->db.serverInfo->get().distributor.get().triggerAudit.getReply(fReq));
auditId = auditId_;
TraceEvent(SevDebug, "CCTriggerAuditStorageEnd", self->id)
.detail("AuditID", auditId)
.detail("Range", req.range)
.detail("AuditType", req.type);
if (!req.reply.isSet()) {
req.reply.send(auditId);
}
} catch (Error& e) {
TraceEvent(SevDebug, "CCTriggerAuditStorageError", self->id)
.errorUnsuppressed(e)
.detail("AuditID", auditId)
.detail("Range", req.range)
.detail("AuditType", req.type);
if (!req.reply.isSet()) {
req.reply.sendError(audit_storage_failed());
}
}
return Void();
}
ACTOR Future<Void> handleTriggerAuditStorage(ClusterControllerData* self, ClusterControllerFullInterface interf) {
loop {
TriggerAuditRequest req = waitNext(interf.clientInterface.triggerAudit.getFuture());
TraceEvent(SevDebug, "TriggerAuditStorageReceived", self->id)
.detail("ClusterControllerDcId", self->clusterControllerDcId)
.detail("Range", req.range)
.detail("AuditType", req.type);
self->addActor.send(triggerAuditStorage(self, req));
}
}
struct SingletonRecruitThrottler { struct SingletonRecruitThrottler {
double lastRecruitStart; double lastRecruitStart;
@ -2426,6 +2539,104 @@ ACTOR Future<int64_t> getNextBMEpoch(ClusterControllerData* self) {
} }
} }
ACTOR Future<Void> startBlobMigrator(ClusterControllerData* self, double waitTime) {
// If master fails at the same time, give it a chance to clear master PID.
// Also wait to avoid too many consecutive recruits in a small time window.
wait(delay(waitTime));
TraceEvent("CCStartBlobMigrator", self->id).log();
loop {
try {
state bool noBlobMigrator = !self->db.serverInfo->get().blobMigrator.present();
while (!self->masterProcessId.present() ||
self->masterProcessId != self->db.serverInfo->get().master.locality.processId() ||
self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
wait(self->db.serverInfo->onChange() || delay(SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY));
}
if (noBlobMigrator && self->db.serverInfo->get().blobMigrator.present()) {
// Existing instance registers while waiting, so skip.
return Void();
}
std::map<Optional<Standalone<StringRef>>, int> id_used = self->getUsedIds();
WorkerFitnessInfo blobMigratorWorker = self->getWorkerForRoleInDatacenter(self->clusterControllerDcId,
ProcessClass::BlobMigrator,
ProcessClass::NeverAssign,
self->db.config,
id_used);
InitializeBlobMigratorRequest req(deterministicRandom()->randomUniqueID());
state WorkerDetails worker = blobMigratorWorker.worker;
if (self->onMasterIsBetter(worker, ProcessClass::BlobMigrator)) {
worker = self->id_worker[self->masterProcessId.get()].details;
}
self->recruitingBlobMigratorID = req.reqId;
TraceEvent("CCRecruitBlobMigrator", self->id)
.detail("Addr", worker.interf.address())
.detail("MGID", req.reqId);
ErrorOr<BlobMigratorInterface> interf = wait(worker.interf.blobMigrator.getReplyUnlessFailedFor(
req, SERVER_KNOBS->WAIT_FOR_BLOB_MANAGER_JOIN_DELAY, 0));
if (interf.present()) {
self->recruitBlobMigrator.set(false);
self->recruitingBlobMigratorID = interf.get().id();
const auto& blobMigrator = self->db.serverInfo->get().blobMigrator;
TraceEvent("CCBlobMigratorRecruited", self->id)
.detail("Addr", worker.interf.address())
.detail("MGID", interf.get().id());
if (blobMigrator.present() && blobMigrator.get().id() != interf.get().id() &&
self->id_worker.count(blobMigrator.get().locality.processId())) {
TraceEvent("CCHaltBlobMigratorAfterRecruit", self->id)
.detail("MGID", blobMigrator.get().id())
.detail("DcID", printable(self->clusterControllerDcId));
BlobMigratorSingleton(blobMigrator).halt(self, blobMigrator.get().locality.processId());
}
if (!blobMigrator.present() || blobMigrator.get().id() != interf.get().id()) {
self->db.setBlobMigrator(interf.get());
}
checkOutstandingRequests(self);
return Void();
}
} catch (Error& e) {
TraceEvent("CCBlobMigratorRecruitError", self->id).error(e);
if (e.code() != error_code_no_more_servers) {
throw;
}
}
wait(lowPriorityDelay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY));
}
}
ACTOR Future<Void> monitorBlobMigrator(ClusterControllerData* self) {
state SingletonRecruitThrottler recruitThrottler;
while (self->db.serverInfo->get().recoveryState < RecoveryState::ACCEPTING_COMMITS) {
wait(self->db.serverInfo->onChange());
}
loop {
if (self->db.serverInfo->get().blobMigrator.present() && !self->recruitBlobMigrator.get()) {
state Future<Void> wfClient = waitFailureClient(self->db.serverInfo->get().blobMigrator.get().waitFailure,
SERVER_KNOBS->BLOB_MIGRATOR_FAILURE_TIME);
loop {
choose {
when(wait(wfClient)) {
TraceEvent("CCBlobMigratorDied", self->id)
.detail("MGID", self->db.serverInfo->get().blobMigrator.get().id());
self->db.clearInterf(ProcessClass::BlobMigratorClass);
break;
}
when(wait(self->recruitBlobMigrator.onChange())) {}
}
}
} else if (self->db.blobGranulesEnabled.get() && isFullRestoreMode()) {
// if there is no blob migrator present but blob granules are enabled and we are in full restore mode, recruit a blob migrator
wait(startBlobMigrator(self, recruitThrottler.newRecruitment()));
} else {
wait(self->db.blobGranulesEnabled.onChange());
}
}
}
ACTOR Future<Void> startBlobManager(ClusterControllerData* self, double waitTime) { ACTOR Future<Void> startBlobManager(ClusterControllerData* self, double waitTime) {
// If master fails at the same time, give it a chance to clear master PID. // If master fails at the same time, give it a chance to clear master PID.
// Also wait to avoid too many consecutive recruits in a small time window. // Also wait to avoid too many consecutive recruits in a small time window.
@ -2552,6 +2763,10 @@ ACTOR Future<Void> monitorBlobManager(ClusterControllerData* self) {
const auto& blobManager = self->db.serverInfo->get().blobManager; const auto& blobManager = self->db.serverInfo->get().blobManager;
BlobManagerSingleton(blobManager) BlobManagerSingleton(blobManager)
.haltBlobGranules(self, blobManager.get().locality.processId()); .haltBlobGranules(self, blobManager.get().locality.processId());
if (isFullRestoreMode()) {
const auto& blobMigrator = self->db.serverInfo->get().blobMigrator;
BlobMigratorSingleton(blobMigrator).halt(self, blobMigrator.get().locality.processId());
}
break; break;
} }
} }
@ -2782,9 +2997,11 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
self.addActor.send(updatedChangedDatacenters(&self)); self.addActor.send(updatedChangedDatacenters(&self));
self.addActor.send(updateDatacenterVersionDifference(&self)); self.addActor.send(updateDatacenterVersionDifference(&self));
self.addActor.send(handleForcedRecoveries(&self, interf)); self.addActor.send(handleForcedRecoveries(&self, interf));
self.addActor.send(handleTriggerAuditStorage(&self, interf));
self.addActor.send(monitorDataDistributor(&self)); self.addActor.send(monitorDataDistributor(&self));
self.addActor.send(monitorRatekeeper(&self)); self.addActor.send(monitorRatekeeper(&self));
self.addActor.send(monitorBlobManager(&self)); self.addActor.send(monitorBlobManager(&self));
self.addActor.send(monitorBlobMigrator(&self));
self.addActor.send(watchBlobGranulesConfigKey(&self)); self.addActor.send(watchBlobGranulesConfigKey(&self));
self.addActor.send(monitorConsistencyScan(&self)); self.addActor.send(monitorConsistencyScan(&self));
self.addActor.send(metaclusterMetricsUpdater(&self)); self.addActor.send(metaclusterMetricsUpdater(&self));
View File
@ -1056,18 +1056,19 @@ ACTOR Future<Void> readTransactionSystemState(Reference<ClusterRecoveryData> sel
// Sets self->configuration to the configuration (FF/conf/ keys) at self->lastEpochEnd // Sets self->configuration to the configuration (FF/conf/ keys) at self->lastEpochEnd
// Recover transaction state store // Recover transaction state store
bool enableEncryptionForTxnStateStore = isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION);
CODE_PROBE(enableEncryptionForTxnStateStore, "Enable encryption for txnStateStore");
if (self->txnStateStore) if (self->txnStateStore)
self->txnStateStore->close(); self->txnStateStore->close();
self->txnStateLogAdapter = openDiskQueueAdapter(oldLogSystem, myLocality, txsPoppedVersion); self->txnStateLogAdapter = openDiskQueueAdapter(oldLogSystem, myLocality, txsPoppedVersion);
self->txnStateStore = keyValueStoreLogSystem( self->txnStateStore = keyValueStoreLogSystem(self->txnStateLogAdapter,
self->txnStateLogAdapter, self->dbInfo,
self->dbInfo, self->dbgid,
self->dbgid, self->memoryLimit,
self->memoryLimit, false,
false, false,
false, true,
true, enableEncryptionForTxnStateStore);
isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION, self->dbInfo->get().client));
// Version 0 occurs at the version epoch. The version epoch is the number // Version 0 occurs at the version epoch. The version epoch is the number
// of microseconds since the Unix epoch. It can be set through fdbcli. // of microseconds since the Unix epoch. It can be set through fdbcli.
@ -1688,8 +1689,7 @@ ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self) {
self->dbgid, self->dbgid,
recoveryCommitRequest.arena, recoveryCommitRequest.arena,
tr.mutations.slice(mmApplied, tr.mutations.size()), tr.mutations.slice(mmApplied, tr.mutations.size()),
self->txnStateStore, self->txnStateStore);
self->dbInfo);
mmApplied = tr.mutations.size(); mmApplied = tr.mutations.size();
tr.read_snapshot = self->recoveryTransactionVersion; // lastEpochEnd would make more sense, but isn't in the initial tr.read_snapshot = self->recoveryTransactionVersion; // lastEpochEnd would make more sense, but isn't in the initial
View File
@ -24,9 +24,11 @@
#include "fdbclient/Atomic.h" #include "fdbclient/Atomic.h"
#include "fdbclient/BackupAgent.actor.h" #include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/BlobCipher.h" #include "fdbclient/BlobCipher.h"
#include "fdbclient/BuildIdempotencyIdMutations.h"
#include "fdbclient/CommitTransaction.h" #include "fdbclient/CommitTransaction.h"
#include "fdbclient/DatabaseContext.h" #include "fdbclient/DatabaseContext.h"
#include "fdbclient/FDBTypes.h" #include "fdbclient/FDBTypes.h"
#include "fdbclient/IdempotencyId.h"
#include "fdbclient/Knobs.h" #include "fdbclient/Knobs.h"
#include "fdbclient/CommitProxyInterface.h" #include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/NativeAPI.actor.h" #include "fdbclient/NativeAPI.actor.h"
@ -55,6 +57,7 @@
#include "fdbserver/WaitFailure.h" #include "fdbserver/WaitFailure.h"
#include "fdbserver/WorkerInterface.actor.h" #include "fdbserver/WorkerInterface.actor.h"
#include "flow/ActorCollection.h" #include "flow/ActorCollection.h"
#include "flow/CodeProbe.h"
#include "flow/EncryptUtils.h" #include "flow/EncryptUtils.h"
#include "flow/Error.h" #include "flow/Error.h"
#include "flow/IRandom.h" #include "flow/IRandom.h"
@ -663,6 +666,8 @@ struct CommitBatchContext {
// Cipher keys to be used to encrypt mutations // Cipher keys to be used to encrypt mutations
std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>> cipherKeys; std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>> cipherKeys;
IdempotencyIdKVBuilder idempotencyKVBuilder;
CommitBatchContext(ProxyCommitData*, const std::vector<CommitTransactionRequest>*, const int); CommitBatchContext(ProxyCommitData*, const std::vector<CommitTransactionRequest>*, const int);
void setupTraceBatch(); void setupTraceBatch();
@ -998,7 +1003,7 @@ ACTOR Future<Void> getResolution(CommitBatchContext* self) {
// Fetch cipher keys if needed. // Fetch cipher keys if needed.
state Future<std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>> getCipherKeys; state Future<std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>> getCipherKeys;
if (pProxyCommitData->isEncryptionEnabled) { if (pProxyCommitData->isEncryptionEnabled) {
static std::unordered_map<EncryptCipherDomainId, EncryptCipherDomainName> defaultDomains = { static const std::unordered_map<EncryptCipherDomainId, EncryptCipherDomainName> defaultDomains = {
{ SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME }, { SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME },
{ ENCRYPT_HEADER_DOMAIN_ID, FDB_ENCRYPT_HEADER_DOMAIN_NAME }, { ENCRYPT_HEADER_DOMAIN_ID, FDB_ENCRYPT_HEADER_DOMAIN_NAME },
{ FDB_DEFAULT_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME } { FDB_DEFAULT_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME }
@ -1091,6 +1096,7 @@ void applyMetadataEffect(CommitBatchContext* self) {
committed = committed =
committed && self->resolution[resolver].stateMutations[versionIndex][transactionIndex].committed; committed && self->resolution[resolver].stateMutations[versionIndex][transactionIndex].committed;
if (committed) { if (committed) {
// Note: since we are not going to commit these mutations here, we don't need to pass cipherKeys for encryption.
applyMetadataMutations(SpanContext(), applyMetadataMutations(SpanContext(),
*self->pProxyCommitData, *self->pProxyCommitData,
self->arena, self->arena,
@ -1594,6 +1600,22 @@ ACTOR Future<Void> postResolution(CommitBatchContext* self) {
&self->computeStart)); &self->computeStart));
} }
buildIdempotencyIdMutations(self->trs,
self->idempotencyKVBuilder,
self->commitVersion,
self->committed,
ConflictBatch::TransactionCommitted,
self->locked,
[&](const KeyValue& kv) {
MutationRef idempotencyIdSet;
idempotencyIdSet.type = MutationRef::Type::SetValue;
idempotencyIdSet.param1 = kv.key;
idempotencyIdSet.param2 = kv.value;
auto& tags = pProxyCommitData->tagsForKey(kv.key);
self->toCommit.addTags(tags);
self->toCommit.writeTypedMessage(idempotencyIdSet);
});
self->toCommit.saveTags(self->writtenTags); self->toCommit.saveTags(self->writtenTags);
pProxyCommitData->stats.mutations += self->mutationCount; pProxyCommitData->stats.mutations += self->mutationCount;
@ -2488,6 +2510,17 @@ ACTOR Future<Void> processCompleteTransactionStateRequest(TransactionStateResolv
tag_uid[decodeServerTagValue(kv.value)] = decodeServerTagKey(kv.key); tag_uid[decodeServerTagValue(kv.value)] = decodeServerTagKey(kv.key);
} }
state std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>> cipherKeys;
if (pContext->pCommitData->isEncryptionEnabled) {
static const std::unordered_map<EncryptCipherDomainId, EncryptCipherDomainName> metadataDomains = {
{ SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME },
{ ENCRYPT_HEADER_DOMAIN_ID, FDB_ENCRYPT_HEADER_DOMAIN_NAME }
};
std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>> cks =
wait(getLatestEncryptCipherKeys(pContext->pCommitData->db, metadataDomains, BlobCipherMetrics::TLOG));
cipherKeys = cks;
}
loop { loop {
wait(yield()); wait(yield());
@ -2545,13 +2578,16 @@ ACTOR Future<Void> processCompleteTransactionStateRequest(TransactionStateResolv
Arena arena; Arena arena;
bool confChanges; bool confChanges;
CODE_PROBE(
pContext->pCommitData->isEncryptionEnabled,
"Commit proxy apply metadata mutations from txnStateStore on recovery, with encryption-at-rest enabled");
applyMetadataMutations(SpanContext(), applyMetadataMutations(SpanContext(),
*pContext->pCommitData, *pContext->pCommitData,
arena, arena,
Reference<ILogSystem>(), Reference<ILogSystem>(),
mutations, mutations,
/* pToCommit= */ nullptr, /* pToCommit= */ nullptr,
/* pCipherKeys= */ nullptr, pContext->pCommitData->isEncryptionEnabled ? &cipherKeys : nullptr,
confChanges, confChanges,
/* version= */ 0, /* version= */ 0,
/* popVersion= */ 0, /* popVersion= */ 0,
@ -2643,7 +2679,7 @@ ACTOR Future<Void> commitProxyServerCore(CommitProxyInterface proxy,
// Wait until we can load the "real" logsystem, since we don't support switching them currently // Wait until we can load the "real" logsystem, since we don't support switching them currently
while (!(masterLifetime.isEqual(commitData.db->get().masterLifetime) && while (!(masterLifetime.isEqual(commitData.db->get().masterLifetime) &&
commitData.db->get().recoveryState >= RecoveryState::RECOVERY_TRANSACTION && commitData.db->get().recoveryState >= RecoveryState::RECOVERY_TRANSACTION &&
(!isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION, db->get().client) || (!isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION) ||
commitData.db->get().encryptKeyProxy.present()))) { commitData.db->get().encryptKeyProxy.present()))) {
//TraceEvent("ProxyInit2", proxy.id()).detail("LSEpoch", db->get().logSystemConfig.epoch).detail("Need", epoch); //TraceEvent("ProxyInit2", proxy.id()).detail("LSEpoch", db->get().logSystemConfig.epoch).detail("Need", epoch);
wait(commitData.db->onChange()); wait(commitData.db->onChange());
@ -2668,15 +2704,14 @@ ACTOR Future<Void> commitProxyServerCore(CommitProxyInterface proxy,
commitData.logSystem = ILogSystem::fromServerDBInfo(proxy.id(), commitData.db->get(), false, addActor); commitData.logSystem = ILogSystem::fromServerDBInfo(proxy.id(), commitData.db->get(), false, addActor);
commitData.logAdapter = commitData.logAdapter =
new LogSystemDiskQueueAdapter(commitData.logSystem, Reference<AsyncVar<PeekTxsInfo>>(), 1, false); new LogSystemDiskQueueAdapter(commitData.logSystem, Reference<AsyncVar<PeekTxsInfo>>(), 1, false);
commitData.txnStateStore = commitData.txnStateStore = keyValueStoreLogSystem(commitData.logAdapter,
keyValueStoreLogSystem(commitData.logAdapter, commitData.db,
commitData.db, proxy.id(),
proxy.id(), 2e9,
2e9, true,
true, true,
true, true,
true, isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION));
isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION, db->get().client));
createWhitelistBinPathVec(whitelistBinPaths, commitData.whitelistedBinPathVec); createWhitelistBinPathVec(whitelistBinPaths, commitData.whitelistedBinPathVec);
commitData.updateLatencyBandConfig(commitData.db->get().latencyBandConfig); commitData.updateLatencyBandConfig(commitData.db->get().latencyBandConfig);
View File
@ -344,7 +344,7 @@ class TransactionEnvironment {
state Key configKey = encodeConfigKey(configClass, knobName); state Key configKey = encodeConfigKey(configClass, knobName);
state Optional<Value> value = wait(tr->get(configKey)); state Optional<Value> value = wait(tr->get(configKey));
if (expected.present()) { if (expected.present()) {
ASSERT_EQ(BinaryReader::fromStringRef<int64_t>(value.get(), Unversioned()), expected.get()); ASSERT_EQ(Tuple::unpack(value.get()).getInt(0), expected.get());
} else { } else {
ASSERT(!value.present()); ASSERT(!value.present());
} }
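
The assertion change reflects that config values are now Tuple-encoded rather than written as raw int64s. A sketch of the round trip using fdbclient's Tuple class (assumes the in-tree Tuple API and flow's ASSERT_EQ; not a standalone program):

#include "fdbclient/Tuple.h"
#include "flow/Error.h"

// Pack an int64 the way the config layer now stores it, then unpack it
// exactly as the test above does.
void tupleRoundTrip() {
    Standalone<StringRef> packed = Tuple().append((int64_t)5).pack();
    int64_t v = Tuple::unpack(packed).getInt(0);
    ASSERT_EQ(v, 5);
}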
View File
@ -106,6 +106,62 @@ class DDTxnProcessorImpl {
return IDDTxnProcessor::SourceServers{ std::vector<UID>(servers.begin(), servers.end()), completeSources }; return IDDTxnProcessor::SourceServers{ std::vector<UID>(servers.begin(), servers.end()), completeSources };
} }
ACTOR static Future<std::vector<IDDTxnProcessor::DDRangeLocations>> getSourceServerInterfacesForRange(
Database cx,
KeyRangeRef range) {
state std::vector<IDDTxnProcessor::DDRangeLocations> res;
state Transaction tr(cx);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
loop {
res.clear();
try {
state RangeResult shards = wait(krmGetRanges(&tr,
keyServersPrefix,
range,
SERVER_KNOBS->MOVE_SHARD_KRM_ROW_LIMIT,
SERVER_KNOBS->MOVE_SHARD_KRM_BYTE_LIMIT));
ASSERT(!shards.empty());
state RangeResult UIDtoTagMap = wait(tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT(!UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY);
state int i = 0;
for (i = 0; i < shards.size() - 1; ++i) {
state std::vector<UID> src;
std::vector<UID> dest;
UID srcId, destId;
decodeKeyServersValue(UIDtoTagMap, shards[i].value, src, dest, srcId, destId);
std::vector<Future<Optional<Value>>> serverListEntries;
for (int j = 0; j < src.size(); ++j) {
serverListEntries.push_back(tr.get(serverListKeyFor(src[j])));
}
std::vector<Optional<Value>> serverListValues = wait(getAll(serverListEntries));
IDDTxnProcessor::DDRangeLocations current(KeyRangeRef(shards[i].key, shards[i + 1].key));
for (int j = 0; j < serverListValues.size(); ++j) {
if (!serverListValues[j].present()) {
TraceEvent(SevWarnAlways, "GetSourceServerInterfacesMissing")
.detail("StorageServer", src[j])
.detail("Range", KeyRangeRef(shards[i].key, shards[i + 1].key));
continue;
}
StorageServerInterface ssi = decodeServerListValue(serverListValues[j].get());
current.servers[ssi.locality.describeDcId()].push_back(ssi);
}
res.push_back(current);
}
break;
} catch (Error& e) {
TraceEvent(SevWarnAlways, "GetSourceServerInterfacesError").errorUnsuppressed(e).detail("Range", range);
wait(tr.onError(e));
}
}
return res;
}
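
A hypothetical call-site sketch (flow-style pseudocode, not a real call site in this commit; the member names range and servers are assumed from the constructor and the describeDcId() keying above):

// Group the returned source interfaces by DC for a range.
std::vector<IDDTxnProcessor::DDRangeLocations> locations =
    wait(txnProcessor->getSourceServerInterfacesForRange(range));
for (const auto& loc : locations) {
    for (const auto& [dcId, interfaces] : loc.servers) {
        TraceEvent("SourceServersForRange")
            .detail("Range", loc.range)
            .detail("DC", dcId)
            .detail("Servers", interfaces.size());
    }
}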
// set the system key space // set the system key space
ACTOR static Future<Void> updateReplicaKeys(Database cx, ACTOR static Future<Void> updateReplicaKeys(Database cx,
std::vector<Optional<Key>> primaryDcId, std::vector<Optional<Key>> primaryDcId,
@ -537,6 +593,11 @@ Future<IDDTxnProcessor::SourceServers> DDTxnProcessor::getSourceServersForRange(
return DDTxnProcessorImpl::getSourceServersForRange(cx, range); return DDTxnProcessorImpl::getSourceServersForRange(cx, range);
} }
Future<std::vector<IDDTxnProcessor::DDRangeLocations>> DDTxnProcessor::getSourceServerInterfacesForRange(
const KeyRangeRef range) {
return DDTxnProcessorImpl::getSourceServerInterfacesForRange(cx, range);
}
Future<ServerWorkerInfos> DDTxnProcessor::getServerListAndProcessClasses() { Future<ServerWorkerInfos> DDTxnProcessor::getServerListAndProcessClasses() {
return DDTxnProcessorImpl::getServerListAndProcessClasses(cx); return DDTxnProcessorImpl::getServerListAndProcessClasses(cx);
} }
View File
@ -21,6 +21,7 @@
#include <set> #include <set>
#include <string> #include <string>
#include "fdbclient/Audit.h"
#include "fdbclient/DatabaseContext.h" #include "fdbclient/DatabaseContext.h"
#include "fdbclient/FDBOptions.g.h" #include "fdbclient/FDBOptions.g.h"
#include "fdbclient/FDBTypes.h" #include "fdbclient/FDBTypes.h"
@ -52,6 +53,17 @@
#include "fdbserver/DDSharedContext.h" #include "fdbserver/DDSharedContext.h"
#include "flow/actorcompiler.h" // This must be the last #include. #include "flow/actorcompiler.h" // This must be the last #include.
struct DDAudit {
DDAudit(UID id, KeyRange range, AuditType type)
: id(id), range(range), type(type), auditMap(AuditPhase::Invalid, allKeys.end), actors(true) {}
const UID id;
KeyRange range;
const AuditType type;
KeyRangeMap<AuditPhase> auditMap;
ActorCollection actors;
};
void DataMove::validateShard(const DDShardInfo& shard, KeyRangeRef range, int priority) { void DataMove::validateShard(const DDShardInfo& shard, KeyRangeRef range, int priority) {
if (!valid) { if (!valid) {
if (shard.hasDest && shard.destId != anonymousShardId) { if (shard.hasDest && shard.destId != anonymousShardId) {
@ -276,6 +288,10 @@ public:
StorageQuotaInfo storageQuotaInfo; StorageQuotaInfo storageQuotaInfo;
Promise<Void> initialized;
std::unordered_map<AuditType, std::vector<std::shared_ptr<DDAudit>>> audits;
DataDistributor(Reference<AsyncVar<ServerDBInfo> const> const& db, UID id, Reference<DDSharedContext> context)
: dbInfo(db), context(context), ddId(id), txnProcessor(nullptr),
initialDDEventHolder(makeReference<EventCacheHolder>("InitialDD")),
@ -577,6 +593,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
state bool ddIsTenantAware = SERVER_KNOBS->DD_TENANT_AWARENESS_ENABLED;
loop {
trackerCancelled = false;
self->initialized = Promise<Void>();
// Stored outside of data distribution tracker to avoid slow tasks
// when tracker is cancelled
@ -594,7 +611,6 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
state PromiseStream<GetTopKMetricsRequest> getTopKShardMetrics;
state Reference<AsyncVar<bool>> processingUnhealthy(new AsyncVar<bool>(false));
state Reference<AsyncVar<bool>> processingWiggle(new AsyncVar<bool>(false));
state Optional<Reference<TenantCache>> ddTenantCache;
if (ddIsTenantAware) {
@ -636,7 +652,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
getTopKShardMetrics.getFuture(),
getShardMetricsList,
getAverageShardBytes.getFuture(),
self->initialized,
anyZeroHealthyTeams,
self->ddId,
&shards,
@ -688,7 +704,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
self->configuration,
self->primaryDcId,
self->configuration.usableRegions > 1 ? self->remoteDcIds : std::vector<Optional<Key>>(),
self->initialized.getFuture(),
zeroHealthyTeams[0],
IsPrimary::True,
processingUnhealthy,
@ -709,7 +725,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
self->configuration,
self->remoteDcIds,
Optional<std::vector<Optional<Key>>>(),
self->initialized.getFuture() && remoteRecovered(self->dbInfo),
zeroHealthyTeams[1],
IsPrimary::False,
processingUnhealthy,
@ -1327,6 +1343,157 @@ ACTOR Future<Void> ddGetMetrics(GetDataDistributorMetricsRequest req,
return Void();
}
ACTOR Future<Void> auditStorage(Reference<DataDistributor> self, TriggerAuditRequest req);
ACTOR Future<Void> scheduleAuditForRange(Reference<DataDistributor> self,
std::shared_ptr<DDAudit> audit,
KeyRange range);
ACTOR Future<Void> doAuditOnStorageServer(Reference<DataDistributor> self,
std::shared_ptr<DDAudit> audit,
StorageServerInterface ssi,
AuditStorageRequest req);
ACTOR Future<Void> auditStorage(Reference<DataDistributor> self, TriggerAuditRequest req) {
// TODO(heliu): Load running audit, and create one if no audit is running.
state std::shared_ptr<DDAudit> audit;
auto it = self->audits.find(req.getType());
if (it != self->audits.end() && !it->second.empty()) {
ASSERT_EQ(it->second.size(), 1);
auto& currentAudit = it->second.front();
if (currentAudit->range.contains(req.range)) {
audit = it->second.front();
} else {
req.reply.sendError(audit_storage_exceeded_request_limit());
return Void();
}
} else {
const UID auditId = deterministicRandom()->randomUniqueID();
audit = std::make_shared<DDAudit>(auditId, req.range, req.getType());
self->audits[req.getType()].push_back(audit);
audit->actors.add(scheduleAuditForRange(self, audit, req.range));
TraceEvent(SevDebug, "DDAuditStorageBegin", audit->id).detail("Range", req.range).detail("AuditType", req.type);
}
if (req.async && !req.reply.isSet()) {
req.reply.send(audit->id);
}
try {
wait(audit->actors.getResult());
TraceEvent(SevDebug, "DDAuditStorageEnd", audit->id).detail("Range", req.range).detail("AuditType", req.type);
// TODO(heliu): Set the audit result, and clear auditId.
if (!req.async && !req.reply.isSet()) {
TraceEvent(SevDebug, "DDAuditStorageReply", audit->id)
.detail("Range", req.range)
.detail("AuditType", req.type);
req.reply.send(audit->id);
}
} catch (Error& e) {
TraceEvent(SevWarnAlways, "DDAuditStorageOperationError", audit->id)
.errorUnsuppressed(e)
.detail("Range", req.range)
.detail("AuditType", req.type);
}
return Void();
}
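A condensed standalone sketch of the admission rule implemented above (one running audit per type; a request joins it only if its range is contained, otherwise it is rejected), with FDB types reduced to STL stand-ins and illustrative names:

#include <iostream>
#include <map>
#include <optional>
#include <string>

struct Audit { int begin, end; }; // illustrative stand-in for DDAudit's range

// Returns the audit the request joins, or nullopt for "exceeded request limit".
std::optional<Audit> admit(std::map<std::string, Audit>& running, const std::string& type, Audit req) {
	auto it = running.find(type);
	if (it == running.end()) {
		running[type] = req; // no audit of this type yet: start a new one
		return req;
	}
	const Audit& cur = it->second;
	if (cur.begin <= req.begin && req.end <= cur.end) {
		return cur; // contained in the running audit's range: reuse it
	}
	return std::nullopt; // mirrors audit_storage_exceeded_request_limit
}

int main() {
	std::map<std::string, Audit> running;
	admit(running, "ValidateHA", { 0, 100 });
	std::cout << (admit(running, "ValidateHA", { 10, 20 }) ? "joined" : "rejected") << "\n";
	return 0;
}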
ACTOR Future<Void> scheduleAuditForRange(Reference<DataDistributor> self,
std::shared_ptr<DDAudit> audit,
KeyRange range) {
TraceEvent(SevDebug, "DDScheduleAuditForRangeBegin", audit->id)
.detail("Range", range)
.detail("AuditType", audit->type);
// TODO(heliu): Load the audit map for `range`.
state Key begin = range.begin;
state KeyRange currentRange = range;
while (begin < range.end) {
currentRange = KeyRangeRef(begin, range.end);
// Find the first keyrange that hasn't been validated.
auto f = audit->auditMap.intersectingRanges(currentRange);
for (auto it = f.begin(); it != f.end(); ++it) {
if (it->value() != AuditPhase::Invalid && it->value() != AuditPhase::Failed) {
begin = it->range().end;
currentRange = KeyRangeRef(it->range().end, currentRange.end);
} else {
currentRange = KeyRangeRef(it->range().begin, it->range().end) & currentRange;
break;
}
}
try {
state std::vector<IDDTxnProcessor::DDRangeLocations> rangeLocations =
wait(self->txnProcessor->getSourceServerInterfacesForRange(currentRange));
state int i = 0;
for (i = 0; i < rangeLocations.size(); ++i) {
AuditStorageRequest req(audit->id, rangeLocations[i].range, audit->type);
if (audit->type == AuditType::ValidateHA && rangeLocations[i].servers.size() >= 2) {
auto it = rangeLocations[i].servers.begin();
const int idx = deterministicRandom()->randomInt(0, it->second.size());
StorageServerInterface& targetServer = it->second[idx];
++it;
for (; it != rangeLocations[i].servers.end(); ++it) {
const int idx = deterministicRandom()->randomInt(0, it->second.size());
req.targetServers.push_back(it->second[idx].id());
}
audit->actors.add(doAuditOnStorageServer(self, audit, targetServer, req));
}
begin = rangeLocations[i].range.end;
wait(delay(0.01));
}
} catch (Error& e) {
TraceEvent(SevWarnAlways, "DDScheduleAuditRangeError", audit->id)
.errorUnsuppressed(e)
.detail("Range", range);
if (e.code() == error_code_actor_cancelled) {
throw e;
}
}
}
return Void();
}
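The scan above resumes at the first sub-range still marked Invalid or Failed. A minimal sketch of the same idea over a plain ordered map of range-begin to phase (a stand-in for KeyRangeMap; keys and phases are made up):

#include <iostream>
#include <map>
#include <string>

enum class Phase { Invalid, Running, Complete, Failed };

int main() {
	// range-begin -> phase of the range starting at that key (stand-in for KeyRangeMap)
	std::map<std::string, Phase> auditMap{ { "a", Phase::Complete }, { "g", Phase::Failed }, { "m", Phase::Running } };
	std::string cursor = "a"; // start of the requested range
	for (const auto& [begin, phase] : auditMap) {
		if (phase == Phase::Invalid || phase == Phase::Failed) {
			cursor = begin; // first sub-range that still needs auditing
			break;
		}
	}
	std::cout << "resume at " << cursor << "\n";
	return 0;
}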
ACTOR Future<Void> doAuditOnStorageServer(Reference<DataDistributor> self,
std::shared_ptr<DDAudit> audit,
StorageServerInterface ssi,
AuditStorageRequest req) {
TraceEvent(SevDebug, "DDDoAuditOnStorageServerBegin", req.id)
.detail("Range", req.range)
.detail("AuditType", req.type)
.detail("StorageServer", ssi.toString())
.detail("TargetServers", describe(req.targetServers));
try {
audit->auditMap.insert(req.range, AuditPhase::Running);
ErrorOr<AuditStorageState> vResult = wait(ssi.auditStorage.getReplyUnlessFailedFor(
req, /*sustainedFailureDuration=*/2.0, /*sustainedFailureSlope=*/0));
if (vResult.isError()) {
throw vResult.getError();
}
TraceEvent e(vResult.get().error.empty() ? SevInfo : SevWarnAlways, "DDAuditStorageState", req.id);
e.detail("Range", req.range);
e.detail("StorageServer", ssi.toString());
if (!vResult.get().error.empty()) {
e.detail("ErrorMessage", vResult.get().error);
}
} catch (Error& e) {
TraceEvent(SevWarn, "DDDoAuditOnStorageServerError", req.id)
.errorUnsuppressed(e)
.detail("Range", req.range)
.detail("StorageServer", ssi.toString())
.detail("TargetServers", describe(req.targetServers));
if (e.code() != error_code_actor_cancelled) {
audit->auditMap.insert(req.range, AuditPhase::Failed);
audit->actors.add(scheduleAuditForRange(self, audit, req.range));
}
}
return Void();
}
ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncVar<ServerDBInfo> const> db) {
state Reference<DDSharedContext> context(new DDSharedContext(di.id()));
state Reference<DataDistributor> self(new DataDistributor(db, di.id(), context));
@ -1393,6 +1560,9 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
when(GetStorageWigglerStateRequest req = waitNext(di.storageWigglerState.getFuture())) {
req.reply.send(getStorageWigglerStates(self));
}
when(TriggerAuditRequest req = waitNext(di.triggerAudit.getFuture())) {
actors.add(auditStorage(self, req));
}
}
} catch (Error& err) {
if (normalDataDistributorErrors().count(err.code()) == 0) {


@ -18,6 +18,7 @@
* limitations under the License.
*/
#include "fdbclient/BlobMetadataUtils.h"
#include "fdbclient/EncryptKeyProxyInterface.h"
#include "fdbrpc/Locality.h"
@ -594,10 +595,21 @@ bool isCipherKeyEligibleForRefresh(const EncryptBaseCipherKey& cipherKey, int64_
// Candidate eligible for refresh iff either is true:
// 1. CipherKey cell is either expired/needs-refresh right now.
// 2. CipherKey cell 'will' be expired/needs-refresh before next refresh cycle interval (proactive refresh)
if (BUGGIFY_WITH_PROB(0.01)) {
return true;
}
int64_t nextRefreshCycleTS = currTS + FLOW_KNOBS->ENCRYPT_KEY_REFRESH_INTERVAL;
return nextRefreshCycleTS > cipherKey.expireAt || nextRefreshCycleTS > cipherKey.refreshAt;
}
bool isBlobMetadataEligibleForRefresh(const BlobMetadataDetailsRef& blobMetadata, int64_t currTS) {
if (BUGGIFY_WITH_PROB(0.01)) {
return true;
}
int64_t nextRefreshCycleTS = currTS + SERVER_KNOBS->BLOB_METADATA_REFRESH_INTERVAL;
return nextRefreshCycleTS > blobMetadata.expireAt || nextRefreshCycleTS > blobMetadata.refreshAt;
}
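A worked example of the proactive-refresh predicate with made-up numbers: given a 60s refresh interval, an entry with refreshAt = 100 becomes eligible at any currTS > 40, one full cycle before it actually goes stale. A standalone sketch without the knobs or BUGGIFY:

#include <cassert>
#include <cstdint>

// Stand-in for isBlobMetadataEligibleForRefresh, minus knobs and BUGGIFY.
bool eligible(int64_t refreshAt, int64_t expireAt, int64_t currTS, int64_t refreshInterval) {
	const int64_t nextRefreshCycleTS = currTS + refreshInterval;
	return nextRefreshCycleTS > expireAt || nextRefreshCycleTS > refreshAt;
}

int main() {
	// refreshAt = 100, expireAt = 200, interval = 60 (all hypothetical)
	assert(!eligible(100, 200, /*currTS=*/30, 60)); // 90 <= 100: not yet
	assert(eligible(100, 200, /*currTS=*/41, 60)); // 101 > 100: refresh proactively
	return 0;
}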
ACTOR Future<Void> refreshEncryptionKeysCore(Reference<EncryptKeyProxyData> ekpProxyData,
KmsConnectorInterface kmsConnectorInf) {
state UID debugId = deterministicRandom()->randomUniqueID();
@ -710,7 +722,8 @@ ACTOR Future<Void> getLatestBlobMetadata(Reference<EncryptKeyProxyData> ekpProxy
for (auto& info : dedupedDomainInfos) {
const auto itr = ekpProxyData->blobMetadataDomainIdCache.find(info.first);
if (itr != ekpProxyData->blobMetadataDomainIdCache.end() && itr->second.isValid() &&
now() <= itr->second.metadataDetails.expireAt) {
metadataDetails.arena().dependsOn(itr->second.metadataDetails.arena());
metadataDetails.push_back(metadataDetails.arena(), itr->second.metadataDetails);
@ -760,6 +773,7 @@ ACTOR Future<Void> getLatestBlobMetadata(Reference<EncryptKeyProxyData> ekpProxy
ACTOR Future<Void> refreshBlobMetadataCore(Reference<EncryptKeyProxyData> ekpProxyData,
KmsConnectorInterface kmsConnectorInf) {
state UID debugId = deterministicRandom()->randomUniqueID();
state double startTime;
state TraceEvent t("RefreshBlobMetadata_Start", ekpProxyData->myId);
t.setMaxEventLength(SERVER_KNOBS->ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH);
@ -769,13 +783,28 @@ ACTOR Future<Void> refreshBlobMetadataCore(Reference<EncryptKeyProxyData> ekpPro
try {
KmsConnBlobMetadataReq req;
req.debugId = debugId;
int64_t currTS = (int64_t)now();
for (auto itr = ekpProxyData->blobMetadataDomainIdCache.begin();
itr != ekpProxyData->blobMetadataDomainIdCache.end();) {
if (isBlobMetadataEligibleForRefresh(itr->second.metadataDetails, currTS)) {
req.domainInfos.emplace_back_deep(
req.domainInfos.arena(), itr->first, itr->second.metadataDetails.domainName);
}
// Garbage collect expired cached Blob Metadata
if (itr->second.metadataDetails.expireAt < currTS) {
itr = ekpProxyData->blobMetadataDomainIdCache.erase(itr);
} else {
itr++;
}
}
if (req.domainInfos.empty()) {
return Void();
}
startTime = now();
KmsConnBlobMetadataRep rep = wait(kmsConnectorInf.blobMetadataReq.getReply(req));
ekpProxyData->kmsBlobMetadataReqLatency.addMeasurement(now() - startTime);
for (auto& item : rep.metadataDetails) {


@ -400,8 +400,8 @@ public:
void addRequests(TransactionTag tag, int count) { tagStatistics[tag].addTransactions(static_cast<double>(count)); }
uint64_t getThrottledTagChangeId() const { return throttledTagChangeId; }
TransactionTagMap<double> getProxyRates(int numProxies) {
TransactionTagMap<double> result;
lastBusyTagCount = 0;
for (auto& [tag, stats] : tagStatistics) {
@ -414,8 +414,7 @@
}
if (targetTps.present()) {
auto const smoothedTargetTps = stats.updateAndGetTargetLimit(targetTps.get());
result[tag] = smoothedTargetTps / numProxies;
} else {
te.disable();
}
@ -497,7 +496,7 @@ uint64_t GlobalTagThrottler::getThrottledTagChangeId() const {
PrioritizedTransactionTagMap<ClientTagThrottleLimits> GlobalTagThrottler::getClientRates() {
return impl->getClientRates();
}
TransactionTagMap<double> GlobalTagThrottler::getProxyRates(int numProxies) {
return impl->getProxyRates(numProxies);
}
int64_t GlobalTagThrottler::autoThrottleCount() const {
@ -679,12 +678,9 @@ bool isNear(Optional<double> a, Optional<double> b) {
bool targetRateIsNear(GlobalTagThrottler& globalTagThrottler, TransactionTag tag, Optional<double> expected) {
Optional<double> rate;
auto targetRates = globalTagThrottler.getProxyRates(1);
auto it = targetRates.find(tag);
if (it != targetRates.end()) {
rate = it->second;
}
TraceEvent("GlobalTagThrottling_RateMonitor")
.detail("Tag", tag)


@ -24,11 +24,13 @@
#include "fdbclient/Notified.h" #include "fdbclient/Notified.h"
#include "fdbclient/TransactionLineage.h" #include "fdbclient/TransactionLineage.h"
#include "fdbclient/Tuple.h" #include "fdbclient/Tuple.h"
#include "fdbserver/LogSystem.h"
#include "fdbserver/LogSystemDiskQueueAdapter.h"
#include "fdbclient/CommitProxyInterface.h" #include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/GrvProxyInterface.h" #include "fdbclient/GrvProxyInterface.h"
#include "fdbclient/VersionVector.h" #include "fdbclient/VersionVector.h"
#include "fdbserver/GrvProxyTransactionTagThrottler.h"
#include "fdbserver/GrvTransactionRateInfo.h"
#include "fdbserver/LogSystem.h"
#include "fdbserver/LogSystemDiskQueueAdapter.h"
#include "fdbserver/WaitFailure.h" #include "fdbserver/WaitFailure.h"
#include "fdbserver/WorkerInterface.actor.h" #include "fdbserver/WorkerInterface.actor.h"
#include "fdbrpc/sim_validation.h" #include "fdbrpc/sim_validation.h"
@ -158,83 +160,6 @@ struct GrvProxyStats {
} }
}; };
struct GrvProxyData {
GrvProxyInterface proxy;
UID dbgid;
@ -437,7 +362,7 @@ ACTOR Future<Void> getRate(UID myID,
GetHealthMetricsReply* detailedHealthMetricsReply,
TransactionTagMap<uint64_t>* transactionTagCounter,
PrioritizedTransactionTagMap<ClientTagThrottleLimits>* clientThrottledTags,
GrvProxyTransactionTagThrottler* tagThrottler,
GrvProxyStats* stats,
GrvProxyData* proxyData) {
state Future<Void> nextRequestTimer = Never();
@ -498,12 +423,7 @@ ACTOR Future<Void> getRate(UID myID,
*clientThrottledTags = std::move(rep.clientThrottledTags.get());
}
if (rep.proxyThrottledTags.present()) {
tagThrottler->updateRates(rep.proxyThrottledTags.get());
}
}
when(wait(leaseTimeout)) {
@ -537,20 +457,19 @@ void dropRequestFromQueue(Deque<GetReadVersionRequest>* queue, GrvProxyStats* st
}
// Put a GetReadVersion request into the queue corresponding to its priority.
ACTOR Future<Void> queueGetReadVersionRequests(Reference<AsyncVar<ServerDBInfo> const> db,
SpannedDeque<GetReadVersionRequest>* systemQueue,
SpannedDeque<GetReadVersionRequest>* defaultQueue,
SpannedDeque<GetReadVersionRequest>* batchQueue,
FutureStream<GetReadVersionRequest> readVersionRequests,
PromiseStream<Void> GRVTimer,
double* lastGRVTime,
double* GRVBatchTime,
FutureStream<double> normalGRVLatency,
GrvProxyStats* stats,
GrvTransactionRateInfo* batchRateInfo,
TransactionTagMap<uint64_t>* transactionTagCounter,
GrvProxyTransactionTagThrottler* tagThrottler) {
getCurrentLineage()->modify(&TransactionLineage::operation) =
TransactionLineage::Operation::GetConsistentReadVersion;
loop choose {
@ -617,12 +536,16 @@ ACTOR Future<Void> queueGetReadVersionRequests(
stats->txnStartIn += req.transactionCount;
stats->txnDefaultPriorityStartIn += req.transactionCount;
++stats->defaultGRVQueueSize;
if (SERVER_KNOBS->ENFORCE_TAG_THROTTLING_ON_PROXIES && req.isTagged()) {
tagThrottler->addRequest(req);
} else {
defaultQueue->push_back(req);
}
// defaultQueue->span.addParent(req.spanContext);
} else {
// Return error for batch_priority GRV requests
int64_t proxiesCount = std::max((int)db->get().client.grvProxies.size(), 1);
if (batchRateInfo->getRate() <= (1.0 / proxiesCount)) {
req.reply.sendError(batch_transaction_throttled());
stats->txnThrottled += req.transactionCount;
} else {
@ -630,7 +553,11 @@ ACTOR Future<Void> queueGetReadVersionRequests(
stats->txnStartIn += req.transactionCount;
stats->txnBatchPriorityStartIn += req.transactionCount;
++stats->batchGRVQueueSize;
if (SERVER_KNOBS->ENFORCE_TAG_THROTTLING_ON_PROXIES && req.isTagged()) {
tagThrottler->addRequest(req);
} else {
batchQueue->push_back(req);
}
// batchQueue->span.addParent(req.spanContext);
}
}
@ -791,6 +718,7 @@ ACTOR Future<Void> sendGrvReplies(Future<GetReadVersionReply> replyFuture,
grvProxyData->versionVectorSizeOnGRVReply.addMeasurement(reply.ssVersionVectorDelta.size());
}
reply.proxyId = grvProxyData->dbgid;
reply.proxyTagThrottledDuration = request.proxyTagThrottledDuration;
if (!request.tags.empty()) {
auto& priorityThrottledTags = clientThrottledTags[request.priority];
@ -895,7 +823,7 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
state int64_t batchTransactionCount = 0;
state GrvTransactionRateInfo normalRateInfo(10);
state GrvTransactionRateInfo batchRateInfo(0);
state GrvProxyTransactionTagThrottler tagThrottler;
state SpannedDeque<GetReadVersionRequest> systemQueue("GP:transactionStarterSystemQueue"_loc);
state SpannedDeque<GetReadVersionRequest> defaultQueue("GP:transactionStarterDefaultQueue"_loc);
@ -922,7 +850,7 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
detailedHealthMetricsReply,
&transactionTagCounter,
&clientThrottledTags,
&tagThrottler,
&grvProxyData->stats,
grvProxyData));
addActor.send(queueGetReadVersionRequests(db,
@ -937,7 +865,7 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
&grvProxyData->stats,
&batchRateInfo,
&transactionTagCounter,
&tagThrottler));
while (std::find(db->get().client.grvProxies.begin(), db->get().client.grvProxies.end(), proxy) ==
db->get().client.grvProxies.end()) {
@ -960,11 +888,12 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
elapsed = 1e-15;
}
tagThrottler.releaseTransactions(elapsed, defaultQueue, batchQueue);
normalRateInfo.startReleaseWindow();
batchRateInfo.startReleaseWindow();
grvProxyData->stats.transactionLimit = normalRateInfo.getLimit();
grvProxyData->stats.batchTransactionLimit = batchRateInfo.getLimit();
int transactionsStarted[2] = { 0, 0 };
int systemTransactionsStarted[2] = { 0, 0 };
@ -1071,11 +1000,11 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
transactionCount += transactionsStarted[0] + transactionsStarted[1];
batchTransactionCount += batchTotalStarted;
normalRateInfo.endReleaseWindow(
systemTotalStarted + normalTotalStarted, systemQueue.empty() && defaultQueue.empty(), elapsed);
batchRateInfo.endReleaseWindow(systemTotalStarted + normalTotalStarted + batchTotalStarted,
systemQueue.empty() && defaultQueue.empty() && batchQueue.empty(),
elapsed);
if (debugID.present()) {
g_traceBatch.addEvent("TransactionDebug",


@ -0,0 +1,399 @@
/*
* GrvProxyTransactionTagThrottler.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbserver/GrvProxyTransactionTagThrottler.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // must be last include
uint64_t GrvProxyTransactionTagThrottler::DelayedRequest::lastSequenceNumber = 0;
void GrvProxyTransactionTagThrottler::DelayedRequest::updateProxyTagThrottledDuration() {
req.proxyTagThrottledDuration = now() - startTime;
}
void GrvProxyTransactionTagThrottler::TagQueue::setRate(double rate) {
if (rateInfo.present()) {
rateInfo.get().setRate(rate);
} else {
rateInfo = GrvTransactionRateInfo(rate);
}
}
void GrvProxyTransactionTagThrottler::updateRates(TransactionTagMap<double> const& newRates) {
for (const auto& [tag, rate] : newRates) {
auto it = queues.find(tag);
if (it == queues.end()) {
queues[tag] = TagQueue(rate);
} else {
it->second.setRate(rate);
}
}
// Clean up tags that did not appear in newRates
for (auto& [tag, queue] : queues) {
if (newRates.find(tag) == newRates.end()) {
queue.rateInfo.reset();
}
}
// TODO: Use std::erase_if in C++20
for (auto it = queues.begin(); it != queues.end();) {
const auto& [tag, queue] = *it;
if (queue.requests.empty() && !queue.rateInfo.present()) {
it = queues.erase(it);
} else {
++it;
}
}
}
void GrvProxyTransactionTagThrottler::addRequest(GetReadVersionRequest const& req) {
ASSERT(req.isTagged());
auto const& tag = req.tags.begin()->first;
if (req.tags.size() > 1) {
// The GrvProxyTransactionTagThrottler assumes that each GetReadVersionRequest
// has at most one tag. If a transaction uses multiple tags and
// SERVER_KNOBS->ENFORCE_TAG_THROTTLING_ON_PROXIES is enabled, there may be
// unexpected behaviour, because only one tag is used for throttling.
TraceEvent(SevWarnAlways, "GrvProxyTransactionTagThrottler_MultipleTags")
.detail("NumTags", req.tags.size())
.detail("UsingTag", printable(tag));
}
queues[tag].requests.emplace_back(req);
}
void GrvProxyTransactionTagThrottler::releaseTransactions(double elapsed,
SpannedDeque<GetReadVersionRequest>& outBatchPriority,
SpannedDeque<GetReadVersionRequest>& outDefaultPriority) {
// Pointer to a TagQueue with some extra metadata stored alongside
struct TagQueueHandle {
// Store pointers here to avoid frequent std::unordered_map lookups
TagQueue* queue;
// Cannot be stored directly because we need to update the count stored in transactionsReleased
uint32_t* numReleased;
// Sequence number of the first queued request
int64_t nextSeqNo;
bool operator>(TagQueueHandle const& rhs) const { return nextSeqNo > rhs.nextSeqNo; }
explicit TagQueueHandle(TagQueue& queue, uint32_t& numReleased) : queue(&queue), numReleased(&numReleased) {
ASSERT(!this->queue->requests.empty());
nextSeqNo = this->queue->requests.front().sequenceNumber;
}
};
// Priority queue of queues for each tag, ordered by the sequence number of the
// next request to process in each queue
std::priority_queue<TagQueueHandle, std::vector<TagQueueHandle>, std::greater<TagQueueHandle>> pqOfQueues;
// Track transactions released for each tag
std::vector<std::pair<TransactionTag, uint32_t>> transactionsReleased;
transactionsReleased.reserve(queues.size());
auto const transactionsReleasedInitialCapacity = transactionsReleased.capacity();
for (auto& [tag, queue] : queues) {
if (queue.rateInfo.present()) {
queue.rateInfo.get().startReleaseWindow();
}
if (!queue.requests.empty()) {
// First place the count in the transactionsReleased object,
// then pass a reference to the count to the TagQueueHandle object
// emplaced into pqOfQueues.
//
// Because we've reserved enough space in transactionsReleased
// to avoid resizing, this reference should remain valid.
// This allows each TagQueueHandle to update its number of
// numReleased counter without incurring the cost of a std::unordered_map lookup.
auto& [_, count] = transactionsReleased.emplace_back(tag, 0);
pqOfQueues.emplace(queue, count);
}
}
while (!pqOfQueues.empty()) {
auto tagQueueHandle = pqOfQueues.top();
pqOfQueues.pop();
// Used to determine when it is time to start processing another tag
auto const nextQueueSeqNo =
pqOfQueues.empty() ? std::numeric_limits<int64_t>::max() : pqOfQueues.top().nextSeqNo;
while (!tagQueueHandle.queue->requests.empty()) {
auto& delayedReq = tagQueueHandle.queue->requests.front();
auto count = delayedReq.req.tags.begin()->second;
ASSERT_EQ(tagQueueHandle.nextSeqNo, delayedReq.sequenceNumber);
if (tagQueueHandle.queue->rateInfo.present() &&
!tagQueueHandle.queue->rateInfo.get().canStart(*(tagQueueHandle.numReleased), count)) {
// Cannot release any more transaction from this tag (don't push the tag queue handle back into
// pqOfQueues)
CODE_PROBE(true, "GrvProxyTransactionTagThrottler throttling transaction");
break;
} else {
if (tagQueueHandle.nextSeqNo < nextQueueSeqNo) {
// Releasing transaction
*(tagQueueHandle.numReleased) += count;
delayedReq.updateProxyTagThrottledDuration();
if (delayedReq.req.priority == TransactionPriority::BATCH) {
outBatchPriority.push_back(delayedReq.req);
} else if (delayedReq.req.priority == TransactionPriority::DEFAULT) {
outDefaultPriority.push_back(delayedReq.req);
} else {
// Immediate priority transactions should bypass the GrvProxyTransactionTagThrottler
ASSERT(false);
}
tagQueueHandle.queue->requests.pop_front();
if (!tagQueueHandle.queue->requests.empty()) {
tagQueueHandle.nextSeqNo = tagQueueHandle.queue->requests.front().sequenceNumber;
}
} else {
CODE_PROBE(true, "GrvProxyTransactionTagThrottler switching tags to preserve FIFO");
pqOfQueues.push(tagQueueHandle);
break;
}
}
}
}
// End release windows for queues with valid rateInfo
{
TransactionTagMap<uint32_t> transactionsReleasedMap;
for (const auto& [tag, count] : transactionsReleased) {
transactionsReleasedMap[tag] = count;
}
for (auto& [tag, queue] : queues) {
if (queue.rateInfo.present()) {
queue.rateInfo.get().endReleaseWindow(transactionsReleasedMap[tag], false, elapsed);
}
}
}
// If the capacity is increased, that means the vector has been illegally resized, potentially
// corrupting memory
ASSERT_EQ(transactionsReleased.capacity(), transactionsReleasedInitialCapacity);
}
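The reserve() call above is load-bearing: each TagQueueHandle keeps a raw pointer into transactionsReleased, which stays valid only because the vector never reallocates. A minimal standalone sketch of that invariant:

#include <cassert>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

int main() {
	std::vector<std::pair<std::string, uint32_t>> released;
	released.reserve(2); // fixed capacity up front, so emplace_back never reallocates
	auto& first = released.emplace_back("tagA", 0);
	uint32_t* counter = &first.second; // analogous to TagQueueHandle::numReleased
	released.emplace_back("tagB", 0); // no reallocation: counter is still valid
	*counter += 5;
	assert(released[0].second == 5);
	assert(released.capacity() == 2); // mirrors the ASSERT_EQ above
	return 0;
}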
uint32_t GrvProxyTransactionTagThrottler::size() {
return queues.size();
}
ACTOR static Future<Void> mockClient(GrvProxyTransactionTagThrottler* throttler,
TransactionPriority priority,
TagSet tagSet,
int batchSize,
double desiredRate,
TransactionTagMap<uint32_t>* counters) {
state Future<Void> timer;
state TransactionTagMap<uint32_t> tags;
for (const auto& tag : tagSet) {
tags[tag] = batchSize;
}
loop {
timer = delayJittered(static_cast<double>(batchSize) / desiredRate);
GetReadVersionRequest req;
req.tags = tags;
req.priority = priority;
throttler->addRequest(req);
wait(success(req.reply.getFuture()) && timer);
for (auto& [tag, _] : tags) {
(*counters)[tag] += batchSize;
}
}
}
ACTOR static Future<Void> mockFifoClient(GrvProxyTransactionTagThrottler* throttler) {
state TransactionTagMap<uint32_t> tagSet1;
state TransactionTagMap<uint32_t> tagSet2;
state std::vector<GetReadVersionRequest> reqs;
state int i = 0;
// Used to track the order in which replies are received
state std::vector<int> replyIndices;
// Tag half of requests with one tag, half with another, then randomly shuffle
tagSet1["sampleTag1"_sr] = 1;
tagSet2["sampleTag2"_sr] = 1;
for (i = 0; i < 2000; ++i) {
auto& req = reqs.emplace_back();
req.priority = TransactionPriority::DEFAULT;
if (i < 1000) {
req.tags = tagSet1;
} else {
req.tags = tagSet2;
}
}
deterministicRandom()->randomShuffle(reqs);
// Send requests to throttler and assert that responses are received in FIFO order
for (const auto& req : reqs) {
throttler->addRequest(req);
}
state std::vector<Future<Void>> futures;
for (int j = 0; j < 2000; ++j) {
// Flow hack to capture replyIndices
auto* _replyIndices = &replyIndices;
futures.push_back(map(reqs[j].reply.getFuture(), [_replyIndices, j](auto const&) {
(*_replyIndices).push_back(j);
return Void();
}));
}
wait(waitForAll(futures));
for (i = 0; i < 2000; ++i) {
ASSERT_EQ(replyIndices[i], i);
}
return Void();
}
ACTOR static Future<Void> mockServer(GrvProxyTransactionTagThrottler* throttler) {
state SpannedDeque<GetReadVersionRequest> outBatchPriority("TestGrvProxyTransactionTagThrottler_Batch"_loc);
state SpannedDeque<GetReadVersionRequest> outDefaultPriority("TestGrvProxyTransactionTagThrottler_Default"_loc);
loop {
state double elapsed = (0.009 + 0.002 * deterministicRandom()->random01());
wait(delay(elapsed));
throttler->releaseTransactions(elapsed, outBatchPriority, outDefaultPriority);
while (!outBatchPriority.empty()) {
outBatchPriority.front().reply.send(GetReadVersionReply{});
outBatchPriority.pop_front();
}
while (!outDefaultPriority.empty()) {
outDefaultPriority.front().reply.send(GetReadVersionReply{});
outDefaultPriority.pop_front();
}
}
}
static TransactionTag getRandomTag() {
TransactionTag result;
auto arr = new (result.arena()) uint8_t[32];
for (int i = 0; i < 32; ++i) {
arr[i] = (uint8_t)deterministicRandom()->randomInt(0, 256);
}
result.contents() = TransactionTagRef(arr, 32);
return result;
}
static bool isNear(double desired, int64_t actual) {
return std::abs(desired - actual) * 10 < desired;
}
// Rate limit set at 10, but client attempts 20 transactions per second.
// Client should be throttled to only 10 transactions per second.
TEST_CASE("/GrvProxyTransactionTagThrottler/Simple") {
state GrvProxyTransactionTagThrottler throttler;
state TagSet tagSet;
state TransactionTagMap<uint32_t> counters;
{
TransactionTagMap<double> rates;
rates["sampleTag"_sr] = 10.0;
throttler.updateRates(rates);
}
tagSet.addTag("sampleTag"_sr);
state Future<Void> client = mockClient(&throttler, TransactionPriority::DEFAULT, tagSet, 1, 20.0, &counters);
state Future<Void> server = mockServer(&throttler);
wait(timeout(client && server, 60.0, Void()));
TraceEvent("TagQuotaTest_Simple").detail("Counter", counters["sampleTag"_sr]);
ASSERT(isNear(counters["sampleTag"_sr], 60.0 * 10.0));
return Void();
}
// Clients share the available 30 transaction/second budget
TEST_CASE("/GrvProxyTransactionTagThrottler/MultiClient") {
state GrvProxyTransactionTagThrottler throttler;
state TagSet tagSet;
state TransactionTagMap<uint32_t> counters;
{
TransactionTagMap<double> rates;
rates["sampleTag"_sr] = 30.0;
throttler.updateRates(rates);
}
tagSet.addTag("sampleTag"_sr);
state std::vector<Future<Void>> clients;
clients.reserve(10);
for (int i = 0; i < 10; ++i) {
clients.push_back(mockClient(&throttler, TransactionPriority::DEFAULT, tagSet, 1, 10.0, &counters));
}
state Future<Void> server = mockServer(&throttler);
wait(timeout(waitForAll(clients) && server, 60.0, Void()));
TraceEvent("TagQuotaTest_MultiClient").detail("Counter", counters["sampleTag"_sr]);
ASSERT(isNear(counters["sampleTag"_sr], 60.0 * 30.0));
return Void();
}
// Test processing GetReadVersionRequests that batch several transactions
TEST_CASE("/GrvProxyTransactionTagThrottler/Batch") {
state GrvProxyTransactionTagThrottler throttler;
state TagSet tagSet;
state TransactionTagMap<uint32_t> counters;
{
TransactionTagMap<double> rates;
rates["sampleTag"_sr] = 10.0;
throttler.updateRates(rates);
}
tagSet.addTag("sampleTag"_sr);
state Future<Void> client = mockClient(&throttler, TransactionPriority::DEFAULT, tagSet, 5, 20.0, &counters);
state Future<Void> server = mockServer(&throttler);
wait(timeout(client && server, 60.0, Void()));
TraceEvent("TagQuotaTest_Batch").detail("Counter", counters["sampleTag"_sr]);
ASSERT(isNear(counters["sampleTag"_sr], 60.0 * 10.0));
return Void();
}
// Tests cleanup of tags that are no longer throttled.
TEST_CASE("/GrvProxyTransactionTagThrottler/Cleanup1") {
GrvProxyTransactionTagThrottler throttler;
for (int i = 0; i < 1000; ++i) {
auto const tag = getRandomTag();
TransactionTagMap<double> rates;
rates[tag] = 10.0;
throttler.updateRates(rates);
ASSERT_EQ(throttler.size(), 1);
}
return Void();
}
// Tests cleanup of tags once queues have been emptied
TEST_CASE("/GrvProxyTransactionTagThrottler/Cleanup2") {
GrvProxyTransactionTagThrottler throttler;
{
GetReadVersionRequest req;
req.tags["sampleTag"_sr] = 1;
req.priority = TransactionPriority::DEFAULT;
throttler.addRequest(req);
}
ASSERT_EQ(throttler.size(), 1);
throttler.updateRates(TransactionTagMap<double>{});
ASSERT_EQ(throttler.size(), 1);
{
SpannedDeque<GetReadVersionRequest> outBatchPriority("TestGrvProxyTransactionTagThrottler_Batch"_loc);
SpannedDeque<GetReadVersionRequest> outDefaultPriority("TestGrvProxyTransactionTagThrottler_Default"_loc);
throttler.releaseTransactions(0.1, outBatchPriority, outDefaultPriority);
}
// Calling updateRates again cleans up the now-empty queue in the throttler
throttler.updateRates(TransactionTagMap<double>{});
ASSERT_EQ(throttler.size(), 0);
return Void();
}
// Tests that unthrottled transactions are released in FIFO order, even when they
// have different tags
TEST_CASE("/GrvProxyTransactionTagThrottler/Fifo") {
state GrvProxyTransactionTagThrottler throttler;
state Future<Void> server = mockServer(&throttler);
wait(mockFifoClient(&throttler));
return Void();
}
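The Fifo test exercises the core merging rule of releaseTransactions: every request carries a global sequence number, and the throttler always drains the queue whose head has the smallest one. A tiny standalone sketch of that k-way merge (plain STL, illustrative data):

#include <cassert>
#include <deque>
#include <functional>
#include <queue>
#include <utility>
#include <vector>

int main() {
	// Per-tag queues of global sequence numbers (illustrative data)
	std::vector<std::deque<int>> tagQueues = { { 0, 3, 4 }, { 1, 2, 5 } };
	// Min-heap of (head sequence number, queue index), like pqOfQueues above
	std::priority_queue<std::pair<int, int>, std::vector<std::pair<int, int>>, std::greater<>> pq;
	for (int i = 0; i < (int)tagQueues.size(); ++i) {
		pq.push({ tagQueues[i].front(), i });
	}
	std::vector<int> released;
	while (!pq.empty()) {
		auto [seq, i] = pq.top();
		pq.pop();
		released.push_back(seq);
		tagQueues[i].pop_front();
		if (!tagQueues[i].empty()) {
			pq.push({ tagQueues[i].front(), i });
		}
	}
	for (int i = 0; i < (int)released.size(); ++i) {
		assert(released[i] == i); // global FIFO preserved across tags
	}
	return 0;
}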


@ -0,0 +1,123 @@
/*
* GrvTransactionRateInfo.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbserver/GrvTransactionRateInfo.h"
#include "fdbserver/Knobs.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // must be last include
GrvTransactionRateInfo::GrvTransactionRateInfo(double rate)
: rate(rate), smoothRate(SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW),
smoothReleased(SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW) {
smoothRate.setTotal(rate);
}
bool GrvTransactionRateInfo::canStart(int64_t numAlreadyStarted, int64_t count) const {
return numAlreadyStarted + count <=
std::min(limit + budget, SERVER_KNOBS->START_TRANSACTION_MAX_TRANSACTIONS_TO_START);
}
void GrvTransactionRateInfo::endReleaseWindow(int64_t numStartedAtPriority, bool queueEmptyAtPriority, double elapsed) {
// Update the budget to accumulate any extra capacity available or remove any excess that was used.
// The actual delta is the portion of the limit we didn't use multiplied by the fraction of the rate window that
// elapsed.
//
// We may have exceeded our limit due to the budget or because of higher priority transactions, in which case
// this delta will be negative. The delta can also be negative in the event that our limit was negative, which
// can happen if we had already started more transactions in our rate window than our rate would have allowed.
//
// This budget has the property that when the budget is required to start transactions (because batches are
// big), the sum limit+budget will increase linearly from 0 to the batch size over time and decrease by the
// batch size upon starting a batch. In other words, this works equivalently to a model where we linearly
// accumulate budget over time in the case that our batches are too big to take advantage of the rate window based
// limits.
//
// Note that "rate window" here indicates a period of SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW seconds,
// whereas "release window" is the period between wait statements, with duration indicated by "elapsed."
budget =
std::max(0.0, budget + elapsed * (limit - numStartedAtPriority) / SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW);
// If we are emptying out the queue of requests, then we don't need to carry much budget forward
// If we did keep accumulating budget, then our responsiveness to changes in workflow could be compromised
if (queueEmptyAtPriority) {
budget = std::min(budget, SERVER_KNOBS->START_TRANSACTION_MAX_EMPTY_QUEUE_BUDGET);
}
smoothReleased.addDelta(numStartedAtPriority);
}
void GrvTransactionRateInfo::disable() {
disabled = true;
// Use smoothRate.setTotal(0) instead of setting rate to 0 so txns will not be throttled immediately.
smoothRate.setTotal(0);
}
void GrvTransactionRateInfo::setRate(double rate) {
ASSERT(rate >= 0 && rate != std::numeric_limits<double>::infinity() && !std::isnan(rate));
this->rate = rate;
if (disabled) {
smoothRate.reset(rate);
disabled = false;
} else {
smoothRate.setTotal(rate);
}
}
void GrvTransactionRateInfo::startReleaseWindow() {
// Determine the number of transactions that this proxy is allowed to release
// Roughly speaking, this is done by computing the number of transactions over some historical window that we
// could have started but didn't, and making that our limit. More precisely, we track a smoothed rate limit and
// release rate, the difference of which is the rate of additional transactions that we could have released
// based on that window. Then we multiply by the window size to get a number of transactions.
//
// Limit can be negative in the event that we are releasing more transactions than we are allowed (due to the
// use of our budget or because of higher priority transactions).
double releaseRate = smoothRate.smoothTotal() - smoothReleased.smoothRate();
limit = SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW * releaseRate;
}
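Putting the two window functions together with hypothetical numbers: with a 2s rate window, a smoothed allowed rate of 100 tps and a smoothed release rate of 90 tps, startReleaseWindow sets limit = 2 * (100 - 90) = 20; canStart then admits transactions while started + count stays within limit + budget; and endReleaseWindow banks the unused portion of the limit, scaled by the elapsed fraction of the window. A self-contained sketch of just that arithmetic, independent of the knobs and Smoother types:

#include <algorithm>
#include <cassert>

int main() {
	const double rateWindow = 2.0; // stand-in for SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW
	// startReleaseWindow: headroom between allowed rate and observed release rate
	double limit = rateWindow * (100.0 - 90.0); // hypothetical smoothed rates, in tps
	assert(limit == 20.0);
	// canStart: admit while started + count stays within limit + budget
	double budget = 0.0;
	long long started = 15;
	assert(started + 1 <= limit + budget);
	// endReleaseWindow: bank the unused portion of the limit, scaled by elapsed time
	double elapsed = 0.5;
	budget = std::max(0.0, budget + elapsed * (limit - started) / rateWindow);
	assert(budget == 1.25); // 0.5 * (20 - 15) / 2
	return 0;
}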
static bool isNear(double desired, int64_t actual) {
return std::abs(desired - actual) * 10 < desired;
}
ACTOR static Future<Void> mockClient(GrvTransactionRateInfo* rateInfo, double desiredRate, int64_t* counter) {
loop {
state double elapsed = (0.9 + 0.2 * deterministicRandom()->random01()) / desiredRate;
wait(delay(elapsed));
rateInfo->startReleaseWindow();
int started = rateInfo->canStart(0, 1) ? 1 : 0;
*counter += started;
rateInfo->endReleaseWindow(started, false, elapsed);
}
}
// Rate limit set at 10, but client attempts 20 transactions per second.
// Client should be throttled to only 10 transactions per second.
TEST_CASE("/GrvTransactionRateInfo/Simple") {
state GrvTransactionRateInfo rateInfo;
state int64_t counter = 0;
rateInfo.setRate(10.0);
wait(timeout(mockClient(&rateInfo, 20.0, &counter), 60.0, Void()));
TraceEvent("GrvTransactionRateInfoTest").detail("Counter", counter);
ASSERT(isNear(60.0 * 10.0, counter));
return Void();
}


@ -288,6 +288,8 @@ public:
void enableSnapshot() override { disableSnapshot = false; }
int uncommittedBytes() { return queue.totalSize(); }
private:
enum OpType {
OpSet,
@ -731,13 +733,16 @@ private:
.detail("Commits", dbgCommitCount) .detail("Commits", dbgCommitCount)
.detail("TimeTaken", now() - startt); .detail("TimeTaken", now() - startt);
self->semiCommit(); // Make sure cipher keys are ready before recovery finishes. The semiCommit below also require cipher
// keys.
// Make sure cipher keys are ready before recovery finishes.
if (self->enableEncryption) { if (self->enableEncryption) {
wait(updateCipherKeys(self)); wait(updateCipherKeys(self));
} }
CODE_PROBE(self->enableEncryption && self->uncommittedBytes() > 0,
"KeyValueStoreMemory recovered partial transaction while encryption-at-rest is enabled");
self->semiCommit();
return Void(); return Void();
} catch (Error& e) { } catch (Error& e) {
bool ok = e.code() == error_code_operation_cancelled || e.code() == error_code_file_not_found || bool ok = e.code() == error_code_operation_cancelled || e.code() == error_code_file_not_found ||


@ -81,9 +81,9 @@ class SharedRocksDBState {
public:
SharedRocksDBState(UID id);
LatencySample commitLatency;
LatencySample commitQueueLatency;
LatencySample dbWriteLatency;
void setClosing() { this->closing = true; }
bool isClosing() const { return this->closing; }
@ -107,19 +107,18 @@ private:
SharedRocksDBState::SharedRocksDBState(UID id)
: id(id), closing(false), dbOptions(initialDbOptions()), cfOptions(initialCfOptions()),
readOptions(initialReadOptions()), commitLatency(LatencySample("RocksDBCommitLatency",
id,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE)),
commitQueueLatency(LatencySample("RocksDBCommitQueueLatency",
id,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE)),
dbWriteLatency(LatencySample("RocksDBWriteLatency",
id,
SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL,
SERVER_KNOBS->LATENCY_SAMPLE_SIZE)) {}
rocksdb::ColumnFamilyOptions SharedRocksDBState::initialCfOptions() {
rocksdb::ColumnFamilyOptions options;
@ -1161,14 +1160,9 @@ struct RocksDBKeyValueStore : IKeyValueStore {
double startTime;
bool getHistograms;
double getTimeEstimate() const override { return SERVER_KNOBS->COMMIT_TIME_ESTIMATE; }
CommitAction()
: startTime(timer_monotonic()),
getHistograms(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_HISTOGRAMS_SAMPLE_RATE) {}
};
void action(CommitAction& a) {
bool doPerfContextMetrics =
@ -1178,7 +1172,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
perfContextMetrics->reset();
}
double commitBeginTime = timer_monotonic();
sharedState->commitQueueLatency.addMeasurement(commitBeginTime - a.startTime);
if (a.getHistograms) {
metricPromiseStream->send(
std::make_pair(ROCKSDB_COMMIT_QUEUEWAIT_HISTOGRAM.toString(), commitBeginTime - a.startTime));
@ -1200,7 +1194,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
options.sync = false;
}
double writeBeginTime = timer_monotonic();
if (rateLimiter) {
// Controls the total write rate of compaction and flush in bytes per second.
// Request for batchToCommit bytes. If this request cannot be satisfied, the call is blocked.
@ -1209,7 +1203,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
s = db->Write(options, a.batchToCommit.get());
readIterPool->update();
double currTime = timer_monotonic();
sharedState->dbWriteLatency.addMeasurement(currTime - writeBeginTime);
if (a.getHistograms) {
metricPromiseStream->send(
std::make_pair(ROCKSDB_WRITE_HISTOGRAM.toString(), currTime - writeBeginTime));
@ -1236,7 +1230,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
}
}
currTime = timer_monotonic();
sharedState->commitLatency.addMeasurement(currTime - a.startTime);
if (a.getHistograms) {
metricPromiseStream->send(
std::make_pair(ROCKSDB_COMMIT_ACTION_HISTOGRAM.toString(), currTime - commitBeginTime));
@ -1361,9 +1355,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
ThreadReturnPromise<Optional<Value>> result; ThreadReturnPromise<Optional<Value>> result;
ReadValueAction(KeyRef key, Optional<UID> debugID) ReadValueAction(KeyRef key, Optional<UID> debugID)
: key(key), debugID(debugID), startTime(timer_monotonic()), : key(key), debugID(debugID), startTime(timer_monotonic()),
getHistograms( getHistograms(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_HISTOGRAMS_SAMPLE_RATE) {}
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_HISTOGRAMS_SAMPLE_RATE) ? true : false) {
}
double getTimeEstimate() const override { return SERVER_KNOBS->READ_VALUE_TIME_ESTIMATE; } double getTimeEstimate() const override { return SERVER_KNOBS->READ_VALUE_TIME_ESTIMATE; }
}; };
void action(ReadValueAction& a) { void action(ReadValueAction& a) {
@ -1447,9 +1439,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
ThreadReturnPromise<Optional<Value>> result; ThreadReturnPromise<Optional<Value>> result;
ReadValuePrefixAction(Key key, int maxLength, Optional<UID> debugID) ReadValuePrefixAction(Key key, int maxLength, Optional<UID> debugID)
: key(key), maxLength(maxLength), debugID(debugID), startTime(timer_monotonic()), : key(key), maxLength(maxLength), debugID(debugID), startTime(timer_monotonic()),
getHistograms( getHistograms(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_HISTOGRAMS_SAMPLE_RATE) {}
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_HISTOGRAMS_SAMPLE_RATE) ? true : false) {
}
double getTimeEstimate() const override { return SERVER_KNOBS->READ_VALUE_TIME_ESTIMATE; } double getTimeEstimate() const override { return SERVER_KNOBS->READ_VALUE_TIME_ESTIMATE; }
}; };
void action(ReadValuePrefixAction& a) { void action(ReadValuePrefixAction& a) {
@ -1529,9 +1519,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
ThreadReturnPromise<RangeResult> result; ThreadReturnPromise<RangeResult> result;
ReadRangeAction(KeyRange keys, int rowLimit, int byteLimit) ReadRangeAction(KeyRange keys, int rowLimit, int byteLimit)
: keys(keys), rowLimit(rowLimit), byteLimit(byteLimit), startTime(timer_monotonic()), : keys(keys), rowLimit(rowLimit), byteLimit(byteLimit), startTime(timer_monotonic()),
getHistograms( getHistograms(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_HISTOGRAMS_SAMPLE_RATE) {}
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_HISTOGRAMS_SAMPLE_RATE) ? true : false) {
}
double getTimeEstimate() const override { return SERVER_KNOBS->READ_RANGE_TIME_ESTIMATE; } double getTimeEstimate() const override { return SERVER_KNOBS->READ_RANGE_TIME_ESTIMATE; }
}; };
void action(ReadRangeAction& a) { void action(ReadRangeAction& a) {
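
The hunks above move CommitAction and the Read*Action structs to initializer lists and switch the shared latency counters from Histogram pointers (->addMeasurement) to value-type LatencySample members (.addMeasurement) that record every operation, while the coin-flip gate now only controls the detailed histogram stream. A standalone sketch of that split, using invented stand-in types rather than the fdbserver classes:

    // Illustrative sketch only: an always-on latency sample alongside a
    // coin-flip-gated histogram send, mirroring the gating pattern above.
    // None of these names are fdbserver types.
    #include <chrono>
    #include <cstdio>
    #include <random>
    #include <vector>

    struct LatencySampleSketch {
        std::vector<double> measurements; // always recorded; logging samples elsewhere
        void addMeasurement(double seconds) { measurements.push_back(seconds); }
    };

    int main() {
        std::mt19937_64 rng(42);
        std::uniform_real_distribution<double> random01(0.0, 1.0);
        const double kHistogramSampleRate = 0.01; // stand-in for ROCKSDB_HISTOGRAMS_SAMPLE_RATE

        LatencySampleSketch commitLatency;
        for (int i = 0; i < 1000; ++i) {
            auto start = std::chrono::steady_clock::now();
            bool getHistograms = random01(rng) < kHistogramSampleRate; // decided up front, as in CommitAction()
            // ... do the commit work ...
            double elapsed = std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count();
            commitLatency.addMeasurement(elapsed); // unconditional, unlike the old histogram path
            if (getHistograms) {
                std::printf("histogram sample: %f\n", elapsed);
            }
        }
    }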


@@ -1023,6 +1023,10 @@ public:
             writeBatch->Put(metadataShard->cf,
                             getShardMappingKey(lastKey, shardMappingPrefix),
                             nextShard == nullptr ? "" : nextShard->physicalShard->id);
+            TraceEvent(SevDebug, "ShardedRocksDB", this->logId)
+                .detail("Action", "PersistRangeMappingEnd")
+                .detail("NextShardKey", lastKey)
+                .detail("Value", nextShard == nullptr ? "" : nextShard->physicalShard->id);
             dirtyShards->insert(metadataShard.get());
         }


@@ -278,8 +278,9 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self,
     // Detect conflicts
     double expire = now() + SERVER_KNOBS->SAMPLE_EXPIRATION_TIME;
     ConflictBatch conflictBatch(self->conflictSet, &reply.conflictingKeyRangeMap, &reply.arena);
+    const Version newOldestVersion = req.version - SERVER_KNOBS->MAX_WRITE_TRANSACTION_LIFE_VERSIONS;
     for (int t = 0; t < req.transactions.size(); t++) {
-        conflictBatch.addTransaction(req.transactions[t]);
+        conflictBatch.addTransaction(req.transactions[t], newOldestVersion);
         self->resolvedReadConflictRanges += req.transactions[t].read_conflict_ranges.size();
         self->resolvedWriteConflictRanges += req.transactions[t].write_conflict_ranges.size();
@@ -292,8 +293,7 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self,
                 it.begin, SERVER_KNOBS->SAMPLE_OFFSET_PER_KEY + it.begin.size(), expire);
         }
     }
-    conflictBatch.detectConflicts(
-        req.version, req.version - SERVER_KNOBS->MAX_WRITE_TRANSACTION_LIFE_VERSIONS, commitList, &tooOldList);
+    conflictBatch.detectConflicts(req.version, newOldestVersion, commitList, &tooOldList);
     reply.debugID = req.debugID;
     reply.committed.resize(reply.arena, req.transactions.size());
@@ -351,7 +351,7 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self,
         SpanContext spanContext =
             req.transactions[t].spanContext.present() ? req.transactions[t].spanContext.get() : SpanContext();
-        applyMetadataMutations(spanContext, *resolverData, req.transactions[t].mutations, db);
+        applyMetadataMutations(spanContext, *resolverData, req.transactions[t].mutations);
     }
     CODE_PROBE(self->forceRecovery, "Resolver detects forced recovery");
 }
@@ -574,7 +574,7 @@ ACTOR Future<Void> processCompleteTransactionStateRequest(TransactionStateResolv
         bool confChanges; // Ignore configuration changes for initial commits.
         ResolverData resolverData(
             pContext->pResolverData->dbgid, pContext->pTxnStateStore, &pContext->pResolverData->keyInfo, confChanges);
-        applyMetadataMutations(SpanContext(), resolverData, mutations, db);
+        applyMetadataMutations(SpanContext(), resolverData, mutations);
     } // loop
     auto lockedKey = pContext->pTxnStateStore->readValue(databaseLockedKey).get();
@@ -653,15 +653,14 @@ ACTOR Future<Void> resolverCore(ResolverInterface resolver,
     state TransactionStateResolveContext transactionStateResolveContext;
     if (SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS) {
         self->logAdapter = new LogSystemDiskQueueAdapter(self->logSystem, Reference<AsyncVar<PeekTxsInfo>>(), 1, false);
-        self->txnStateStore =
-            keyValueStoreLogSystem(self->logAdapter,
-                                   db,
-                                   resolver.id(),
-                                   2e9,
-                                   true,
-                                   true,
-                                   true,
-                                   isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION, db->get().client));
+        self->txnStateStore = keyValueStoreLogSystem(self->logAdapter,
+                                                     db,
+                                                     resolver.id(),
+                                                     2e9,
+                                                     true,
+                                                     true,
+                                                     true,
+                                                     isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION));
         // wait for txnStateStore recovery
         wait(success(self->txnStateStore->readValue(StringRef())));
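
With this change the resolver computes the too-old cutoff once per batch and passes the same value to both addTransaction() and detectConflicts(), so classification and conflict detection agree on the cutoff. A minimal standalone sketch of the check, with Version and the knob value as stand-ins for the fdbserver definitions:

    // Minimal sketch of the "too old" cutoff used above; Version and the
    // knob value are stand-ins, not fdbserver definitions.
    #include <cassert>
    #include <cstdint>

    using Version = int64_t;

    constexpr Version MAX_WRITE_TRANSACTION_LIFE_VERSIONS = 5'000'000;

    // Mirrors addTransaction(): a transaction with read conflict ranges is
    // flagged too old when its read snapshot predates the cutoff.
    bool isTooOld(Version readSnapshot, Version batchVersion, bool hasReadConflictRanges) {
        const Version newOldestVersion = batchVersion - MAX_WRITE_TRANSACTION_LIFE_VERSIONS;
        return hasReadConflictRanges && readSnapshot < newOldestVersion;
    }

    int main() {
        assert(!isTooOld(/*readSnapshot=*/9'000'000, /*batchVersion=*/10'000'000, true));
        assert(isTooOld(/*readSnapshot=*/4'000'000, /*batchVersion=*/10'000'000, true));
        assert(!isTooOld(/*readSnapshot=*/4'000'000, /*batchVersion=*/10'000'000, false));
    }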


@@ -38,6 +38,7 @@
 #include "flow/network.h"
 #include "flow/UnitTest.h"
+#include <limits>
 #include <memory>
 #include <unordered_map>
 #include <utility>
@@ -191,6 +192,7 @@ static Standalone<BlobMetadataDetailsRef> createBlobMetadata(BlobMetadataDomainI
                                                              BlobMetadataDomainName domainName) {
     Standalone<BlobMetadataDetailsRef> metadata;
     metadata.domainId = domainId;
+    metadata.arena().dependsOn(domainName.arena());
     metadata.domainName = domainName;
     // 0 == no partition, 1 == suffix partitioned, 2 == storage location partitioned
     int type = deterministicRandom()->randomInt(0, 3);
@@ -226,6 +228,17 @@ static Standalone<BlobMetadataDetailsRef> createBlobMetadata(BlobMetadataDomainI
             ev.detail("P" + std::to_string(i), metadata.partitions.back());
         }
     }
+    // set random refresh + expire time
+    if (deterministicRandom()->coinflip()) {
+        metadata.refreshAt = now() + deterministicRandom()->random01() * SERVER_KNOBS->BLOB_METADATA_REFRESH_INTERVAL;
+        metadata.expireAt =
+            metadata.refreshAt + deterministicRandom()->random01() * SERVER_KNOBS->BLOB_METADATA_REFRESH_INTERVAL;
+    } else {
+        metadata.refreshAt = std::numeric_limits<double>::max();
+        metadata.expireAt = metadata.refreshAt;
+    }
     return metadata;
 }
@@ -244,6 +257,10 @@ ACTOR Future<Void> blobMetadataLookup(KmsConnectorInterface interf, KmsConnBlobM
         it = simBlobMetadataStore
                  .insert({ domainInfo.domainId, createBlobMetadata(domainInfo.domainId, domainInfo.domainName) })
                  .first;
+    } else if (now() >= it->second.expireAt) {
+        // update random refresh and expire time
+        it->second.refreshAt = now() + deterministicRandom()->random01() * 30;
+        it->second.expireAt = it->second.refreshAt + deterministicRandom()->random01() * 10;
     }
     rep.metadataDetails.arena().dependsOn(it->second.arena());
     rep.metadataDetails.push_back(rep.metadataDetails.arena(), it->second);
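
The simulated KMS now stamps each blob metadata entry with refreshAt/expireAt times and regenerates them when a lookup arrives past expiry. The same expire-then-regenerate cache pattern in isolation, with invented types and a wall clock standing in for the simulator's now():

    // Sketch of the expire-then-regenerate pattern above; MetadataSketch and
    // the clock are stand-ins, not the SimKmsConnector types.
    #include <chrono>
    #include <cstdint>
    #include <map>
    #include <random>

    struct MetadataSketch {
        double refreshAt = 0;
        double expireAt = 0;
    };

    double nowSeconds() {
        using namespace std::chrono;
        return duration<double>(steady_clock::now().time_since_epoch()).count();
    }

    MetadataSketch& lookup(std::map<int64_t, MetadataSketch>& store, int64_t domainId, std::mt19937_64& rng) {
        std::uniform_real_distribution<double> random01(0.0, 1.0);
        auto [it, inserted] = store.try_emplace(domainId);
        if (inserted || nowSeconds() >= it->second.expireAt) {
            // Regenerate, mirroring the "else if (now() >= expireAt)" branch above.
            it->second.refreshAt = nowSeconds() + random01(rng) * 30;
            it->second.expireAt = it->second.refreshAt + random01(rng) * 10;
        }
        return it->second;
    }

    int main() {
        std::mt19937_64 rng(1);
        std::map<int64_t, MetadataSketch> store;
        lookup(store, 7, rng); // first call creates; later calls refresh on expiry
    }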


@@ -368,6 +368,7 @@ public:
     bool allowDisablingTenants = true;
     bool allowCreatingTenants = true;
     bool injectTargetedSSRestart = false;
+    bool tenantModeRequired = false;
     bool injectSSDelay = false;
     std::string testClass; // unused -- used in TestHarness
     float testPriority; // unused -- used in TestHarness
@@ -433,6 +434,7 @@ public:
         .add("allowDefaultTenant", &allowDefaultTenant)
         .add("allowDisablingTenants", &allowDisablingTenants)
         .add("allowCreatingTenants", &allowCreatingTenants)
+        .add("tenantModeRequired", &tenantModeRequired)
         .add("randomlyRenameZoneId", &randomlyRenameZoneId)
         .add("injectTargetedSSRestart", &injectTargetedSSRestart)
         .add("injectSSDelay", &injectSSDelay);
@@ -2451,6 +2453,7 @@ ACTOR void setupAndRun(std::string dataFolder,
     state bool allowDefaultTenant = testConfig.allowDefaultTenant;
     state bool allowDisablingTenants = testConfig.allowDisablingTenants;
     state bool allowCreatingTenants = testConfig.allowCreatingTenants;
+    state bool tenantModeRequired = testConfig.tenantModeRequired;
     if (!SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA) {
         testConfig.storageEngineExcludeTypes.push_back(5);
@@ -2467,6 +2470,7 @@ ACTOR void setupAndRun(std::string dataFolder,
         // TODO: persist the chosen default tenant in the restartInfo.ini file for the second test
         allowDefaultTenant = false;
         allowCreatingTenants = false;
+        tenantModeRequired = false;
     }
     // TODO: Currently backup and restore related simulation tests are failing when run with rocksDB storage engine
@@ -2516,10 +2520,10 @@ ACTOR void setupAndRun(std::string dataFolder,
     state Optional<TenantName> defaultTenant;
     state Standalone<VectorRef<TenantNameRef>> tenantsToCreate;
     state TenantMode tenantMode = TenantMode::DISABLED;
-    if (allowDefaultTenant && deterministicRandom()->random01() < 0.5) {
+    if (tenantModeRequired || (allowDefaultTenant && deterministicRandom()->random01() < 0.5)) {
         defaultTenant = "SimulatedDefaultTenant"_sr;
         tenantsToCreate.push_back_deep(tenantsToCreate.arena(), defaultTenant.get());
-        if (deterministicRandom()->random01() < 0.9) {
+        if (tenantModeRequired || deterministicRandom()->random01() < 0.9) {
             tenantMode = TenantMode::REQUIRED;
         } else {
             tenantMode = TenantMode::OPTIONAL_TENANT;
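
tenantModeRequired short-circuits both random choices, so a test that sets it always gets a default tenant and TenantMode::REQUIRED. The selection logic above, reduced to a standalone sketch (TenantMode here is a local stand-in, not the fdbclient enum):

    // Sketch of the tenant-mode selection above, with the simulator's RNG
    // replaced by a local one; these are not fdbserver types.
    #include <cstdio>
    #include <random>

    enum class TenantMode { DISABLED, OPTIONAL_TENANT, REQUIRED };

    TenantMode chooseTenantMode(bool tenantModeRequired, bool allowDefaultTenant, std::mt19937_64& rng) {
        std::uniform_real_distribution<double> random01(0.0, 1.0);
        if (tenantModeRequired || (allowDefaultTenant && random01(rng) < 0.5)) {
            // A default tenant is created on this path.
            return (tenantModeRequired || random01(rng) < 0.9) ? TenantMode::REQUIRED
                                                               : TenantMode::OPTIONAL_TENANT;
        }
        return TenantMode::DISABLED;
    }

    int main() {
        std::mt19937_64 rng(7);
        std::printf("%d\n", static_cast<int>(chooseTenantMode(true, false, rng))); // always REQUIRED
    }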


@@ -816,14 +816,14 @@ struct TransactionInfo {
     bool reportConflictingKeys;
 };
-void ConflictBatch::addTransaction(const CommitTransactionRef& tr) {
+void ConflictBatch::addTransaction(const CommitTransactionRef& tr, Version newOldestVersion) {
     const int t = transactionCount++;
     Arena& arena = transactionInfo.arena();
     TransactionInfo* info = new (arena) TransactionInfo;
     info->reportConflictingKeys = tr.report_conflicting_keys;
-    if (tr.read_snapshot < cs->oldestVersion && tr.read_conflict_ranges.size()) {
+    if (tr.read_snapshot < newOldestVersion && tr.read_conflict_ranges.size()) {
         info->tooOld = true;
     } else {
         info->tooOld = false;
@@ -1143,7 +1143,7 @@ void skipListTest() {
     t = timer();
     ConflictBatch batch(cs);
     for (const auto& tr : trs) {
-        batch.addTransaction(tr, version);
+        batch.addTransaction(tr, version);
     }
     g_add += timer() - t;


@@ -828,6 +828,10 @@ ACTOR static Future<JsonBuilderObject> processStatusFetcher(
         roles.addRole("blob_manager", db->get().blobManager.get());
     }
+    if (configuration.present() && configuration.get().blobGranulesEnabled && db->get().blobMigrator.present()) {
+        roles.addRole("blob_migrator", db->get().blobMigrator.get());
+    }
     if (db->get().consistencyScan.present()) {
         roles.addRole("consistency_scan", db->get().consistencyScan.get());
     }


@@ -117,7 +117,7 @@ enum {
     OPT_METRICSPREFIX, OPT_LOGGROUP, OPT_LOCALITY, OPT_IO_TRUST_SECONDS, OPT_IO_TRUST_WARN_ONLY, OPT_FILESYSTEM, OPT_PROFILER_RSS_SIZE, OPT_KVFILE,
     OPT_TRACE_FORMAT, OPT_WHITELIST_BINPATH, OPT_BLOB_CREDENTIAL_FILE, OPT_CONFIG_PATH, OPT_USE_TEST_CONFIG_DB, OPT_NO_CONFIG_DB, OPT_FAULT_INJECTION, OPT_PROFILER, OPT_PRINT_SIMTIME,
     OPT_FLOW_PROCESS_NAME, OPT_FLOW_PROCESS_ENDPOINT, OPT_IP_TRUSTED_MASK, OPT_KMS_CONN_DISCOVERY_URL_FILE, OPT_KMS_CONNECTOR_TYPE, OPT_KMS_CONN_VALIDATION_TOKEN_DETAILS,
-    OPT_KMS_CONN_GET_ENCRYPTION_KEYS_ENDPOINT, OPT_NEW_CLUSTER_KEY, OPT_AUTHZ_PUBLIC_KEY_FILE, OPT_USE_FUTURE_PROTOCOL_VERSION
+    OPT_KMS_CONN_GET_ENCRYPTION_KEYS_ENDPOINT, OPT_KMS_CONN_GET_BLOB_METADATA_ENDPOINT, OPT_NEW_CLUSTER_KEY, OPT_AUTHZ_PUBLIC_KEY_FILE, OPT_USE_FUTURE_PROTOCOL_VERSION
 };
 CSimpleOpt::SOption g_rgOptions[] = {
@@ -218,6 +218,7 @@ CSimpleOpt::SOption g_rgOptions[] = {
     { OPT_KMS_CONNECTOR_TYPE, "--kms-connector-type", SO_REQ_SEP },
     { OPT_KMS_CONN_VALIDATION_TOKEN_DETAILS, "--kms-conn-validation-token-details", SO_REQ_SEP },
     { OPT_KMS_CONN_GET_ENCRYPTION_KEYS_ENDPOINT, "--kms-conn-get-encryption-keys-endpoint", SO_REQ_SEP },
+    { OPT_KMS_CONN_GET_BLOB_METADATA_ENDPOINT, "--kms-conn-get-blob-metadata-endpoint", SO_REQ_SEP },
     { OPT_USE_FUTURE_PROTOCOL_VERSION, "--use-future-protocol-version", SO_REQ_SEP },
     TLS_OPTION_FLAGS,
     SO_END_OF_OPTIONS
@@ -1707,6 +1708,10 @@ private:
             knobs.emplace_back("rest_kms_connector_get_encryption_keys_endpoint", args.OptionArg());
             break;
         }
+        case OPT_KMS_CONN_GET_BLOB_METADATA_ENDPOINT: {
+            knobs.emplace_back("rest_kms_connector_get_blob_metadata_endpoint", args.OptionArg());
+            break;
+        }
         case OPT_NEW_CLUSTER_KEY: {
             newClusterKey = args.OptionArg();
             try {


@@ -103,8 +103,7 @@ void applyMetadataMutations(SpanContext const& spanContext,
                             const UID& dbgid,
                             Arena& arena,
                             const VectorRef<MutationRef>& mutations,
-                            IKeyValueStore* txnStateStore,
-                            Reference<AsyncVar<ServerDBInfo> const> dbInfo);
+                            IKeyValueStore* txnStateStore);
 inline bool isSystemKey(KeyRef key) {
     return key.size() && key[0] == systemKeys.begin[0];
@@ -145,7 +144,6 @@ inline bool containsMetadataMutation(const VectorRef<MutationRef>& mutations) {
 // Resolver's version
 void applyMetadataMutations(SpanContext const& spanContext,
                             ResolverData& resolverData,
-                            const VectorRef<MutationRef>& mutations,
-                            Reference<AsyncVar<ServerDBInfo> const> dbInfo);
+                            const VectorRef<MutationRef>& mutations);
 #endif


@@ -1,5 +1,5 @@
 /*
- * BlobGranuleServerCommon.h
+ * BlobGranuleServerCommon.actor.h
  *
  * This source file is part of the FoundationDB open source project
  *
@@ -105,10 +105,15 @@ struct GranuleTenantData : NonCopyable, ReferenceCounted<GranuleTenantData> {
     GranuleTenantData() {}
     GranuleTenantData(TenantName name, TenantMapEntry entry) : name(name), entry(entry) {}
-    void setBStore(Reference<BlobConnectionProvider> bs) {
-        ASSERT(bstoreLoaded.canBeSet());
-        bstore = bs;
-        bstoreLoaded.send(Void());
+    void updateBStore(const BlobMetadataDetailsRef& metadata) {
+        if (bstoreLoaded.canBeSet()) {
+            // new
+            bstore = BlobConnectionProvider::newBlobConnectionProvider(metadata);
+            bstoreLoaded.send(Void());
+        } else {
+            // update existing
+            bstore->update(metadata);
+        }
     }
 };
@@ -119,7 +124,7 @@ public:
     void removeTenants(std::vector<int64_t> tenantIds);
     Optional<TenantMapEntry> getTenantById(int64_t id);
-    Reference<GranuleTenantData> getDataForGranule(const KeyRangeRef& keyRange);
+    Future<Reference<GranuleTenantData>> getDataForGranule(const KeyRangeRef& keyRange);
     KeyRangeMap<Reference<GranuleTenantData>> tenantData;
     std::unordered_map<int64_t, TenantMapEntry> tenantInfoById;
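
updateBStore() folds creation and refresh into one entry point: the first metadata constructs the provider and fires bstoreLoaded, and later metadata updates the existing provider in place. The same create-or-update shape, with a plain bool standing in for the Promise and pared-down stand-in types:

    // Create-or-update sketch for the updateBStore() change above;
    // ProviderSketch stands in for BlobConnectionProvider and a bool for the
    // bstoreLoaded Promise. Not fdbserver code.
    #include <memory>
    #include <string>

    struct MetadataSketch { std::string location; };

    struct ProviderSketch {
        std::string location;
        explicit ProviderSketch(const MetadataSketch& m) : location(m.location) {}
        void update(const MetadataSketch& m) { location = m.location; }
    };

    struct TenantDataSketch {
        std::shared_ptr<ProviderSketch> bstore;
        bool loaded = false; // stands in for bstoreLoaded.canBeSet() being false

        void updateBStore(const MetadataSketch& metadata) {
            if (!loaded) {
                bstore = std::make_shared<ProviderSketch>(metadata); // new
                loaded = true; // readers waiting on the load can proceed
            } else {
                bstore->update(metadata); // update existing
            }
        }
    };

    int main() {
        TenantDataSketch data;
        data.updateBStore({ "s3://bucket-a" }); // creates
        data.updateBStore({ "s3://bucket-b" }); // updates in place
    }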


@@ -0,0 +1,67 @@
/*
* BlobMigratorInterface.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBSERVER_BLOBMIGRATORINTERFACE_H
#define FDBSERVER_BLOBMIGRATORINTERFACE_H
#pragma once
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/FDBTypes.h"
#include "fdbrpc/Locality.h"
#include "fdbrpc/fdbrpc.h"
struct BlobMigratorInterface {
constexpr static FileIdentifier file_identifier = 869199;
RequestStream<struct HaltBlobMigratorRequest> haltBlobMigrator;
RequestStream<ReplyPromise<Void>> waitFailure;
LocalityData locality;
UID uniqueID;
BlobMigratorInterface() {}
BlobMigratorInterface(const struct LocalityData& l, UID id) : uniqueID(id), locality(l) {}
void initEndpoints() {}
UID id() const { return uniqueID; }
NetworkAddress address() const { return waitFailure.getEndpoint().getPrimaryAddress(); }
bool operator==(const BlobMigratorInterface& r) const { return id() == r.id(); }
bool operator!=(const BlobMigratorInterface& r) const { return !(*this == r); }
template <class Archive>
void serialize(Archive& ar) {
// StorageServerInterface::serialize(ar);
serializer(ar, waitFailure, haltBlobMigrator, locality, uniqueID);
}
};
struct HaltBlobMigratorRequest {
constexpr static FileIdentifier file_identifier = 4980139;
UID requesterID;
ReplyPromise<Void> reply;
HaltBlobMigratorRequest() {}
explicit HaltBlobMigratorRequest(UID uid) : requesterID(uid) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, requesterID, reply);
}
};
#endif
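
Note that identity for this interface is the UID alone: operator== compares id() only, so two copies of the same registration compare equal regardless of locality or endpoints. A standalone sketch of those semantics, with UID reduced to a pair of integers:

    // Identity-by-UID sketch for the interface above; UID here is a stand-in,
    // not the flow type.
    #include <cassert>
    #include <cstdint>
    #include <utility>

    using UID = std::pair<uint64_t, uint64_t>;

    struct MigratorInterfaceSketch {
        UID uniqueID;
        UID id() const { return uniqueID; }
        bool operator==(const MigratorInterfaceSketch& r) const { return id() == r.id(); }
        bool operator!=(const MigratorInterfaceSketch& r) const { return !(*this == r); }
    };

    int main() {
        MigratorInterfaceSketch a{ { 1, 2 } }, b{ { 1, 2 } }, c{ { 3, 4 } };
        assert(a == b && a != c); // equality ignores everything but the UID
    }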


@@ -22,6 +22,8 @@
 // When actually compiled (NO_INTELLISENSE), include the generated version of this file. In intellisense use the source
 // version.
+#include "fdbclient/StorageServerInterface.h"
+#include "fdbserver/BlobMigratorInterface.h"
 #include <utility>
 #if defined(NO_INTELLISENSE) && !defined(FDBSERVER_CLUSTERCONTROLLER_ACTOR_G_H)
@@ -51,6 +53,7 @@ struct WorkerInfo : NonCopyable {
     Future<Void> haltRatekeeper;
     Future<Void> haltDistributor;
     Future<Void> haltBlobManager;
+    Future<Void> haltBlobMigrator;
     Future<Void> haltEncryptKeyProxy;
     Future<Void> haltConsistencyScan;
     Standalone<VectorRef<StringRef>> issues;
@@ -184,6 +187,14 @@ public:
         serverInfo->set(newInfo);
     }
+    void setBlobMigrator(const BlobMigratorInterface& interf) {
+        auto newInfo = serverInfo->get();
+        newInfo.id = deterministicRandom()->randomUniqueID();
+        newInfo.infoGeneration = ++dbInfoCount;
+        newInfo.blobMigrator = interf;
+        serverInfo->set(newInfo);
+    }
     void setEncryptKeyProxy(const EncryptKeyProxyInterface& interf) {
         auto newInfo = serverInfo->get();
         auto newClientInfo = clientInfo->get();
@@ -217,6 +228,8 @@ public:
             newInfo.ratekeeper = Optional<RatekeeperInterface>();
         } else if (t == ProcessClass::BlobManagerClass) {
             newInfo.blobManager = Optional<BlobManagerInterface>();
+        } else if (t == ProcessClass::BlobMigratorClass) {
+            newInfo.blobMigrator = Optional<BlobMigratorInterface>();
         } else if (t == ProcessClass::EncryptKeyProxyClass) {
             newInfo.encryptKeyProxy = Optional<EncryptKeyProxyInterface>();
             newInfo.client.encryptKeyProxy = Optional<EncryptKeyProxyInterface>();
@@ -317,6 +330,8 @@ public:
              db.serverInfo->get().ratekeeper.get().locality.processId() == processId) ||
             (db.serverInfo->get().blobManager.present() &&
              db.serverInfo->get().blobManager.get().locality.processId() == processId) ||
+            (db.serverInfo->get().blobMigrator.present() &&
+             db.serverInfo->get().blobMigrator.get().locality.processId() == processId) ||
             (db.serverInfo->get().encryptKeyProxy.present() &&
              db.serverInfo->get().encryptKeyProxy.get().locality.processId() == processId) ||
             (db.serverInfo->get().consistencyScan.present() &&
@@ -3360,6 +3375,8 @@ public:
     Optional<UID> recruitingRatekeeperID;
     AsyncVar<bool> recruitBlobManager;
     Optional<UID> recruitingBlobManagerID;
+    AsyncVar<bool> recruitBlobMigrator;
+    Optional<UID> recruitingBlobMigratorID;
     AsyncVar<bool> recruitEncryptKeyProxy;
     Optional<UID> recruitingEncryptKeyProxyID;
     AsyncVar<bool> recruitConsistencyScan;
@@ -3401,8 +3418,9 @@ public:
     ac(false), outstandingRequestChecker(Void()), outstandingRemoteRequestChecker(Void()), startTime(now()),
     goodRecruitmentTime(Never()), goodRemoteRecruitmentTime(Never()), datacenterVersionDifference(0),
     versionDifferenceUpdated(false), remoteDCMonitorStarted(false), remoteTransactionSystemDegraded(false),
-    recruitDistributor(false), recruitRatekeeper(false), recruitBlobManager(false), recruitEncryptKeyProxy(false),
-    recruitConsistencyScan(false), clusterControllerMetrics("ClusterController", id.toString()),
+    recruitDistributor(false), recruitRatekeeper(false), recruitBlobManager(false), recruitBlobMigrator(false),
+    recruitEncryptKeyProxy(false), recruitConsistencyScan(false),
+    clusterControllerMetrics("ClusterController", id.toString()),
     openDatabaseRequests("OpenDatabaseRequests", clusterControllerMetrics),
     registerWorkerRequests("RegisterWorkerRequests", clusterControllerMetrics),
     getWorkersRequests("GetWorkersRequests", clusterControllerMetrics),


@@ -45,7 +45,7 @@ struct ConflictBatch {
         TransactionCommitted,
     };
-    void addTransaction(const CommitTransactionRef& transaction);
+    void addTransaction(const CommitTransactionRef& transaction, Version newOldestVersion);
     void detectConflicts(Version now,
                          Version newOldestVersion,
                          std::vector<int>& nonConflicting,


@@ -44,11 +44,25 @@ public:
     struct SourceServers {
         std::vector<UID> srcServers, completeSources; // the same as RelocateData.src, RelocateData.completeSources;
     };
+    struct DDRangeLocations {
+        DDRangeLocations() = default;
+        DDRangeLocations(KeyRangeRef range) : range(range) {}
+        // A map of dcId : list of servers
+        std::map<std::string, std::vector<StorageServerInterface>> servers;
+        KeyRange range;
+    };
     virtual Database context() const = 0;
     virtual bool isMocked() const = 0;
     // get the source server list and complete source server list for range
     virtual Future<SourceServers> getSourceServersForRange(const KeyRangeRef range) { return SourceServers{}; };
+    virtual Future<std::vector<DDRangeLocations>> getSourceServerInterfacesForRange(const KeyRangeRef range) {
+        return std::vector<DDRangeLocations>();
+    }
     // get the storage server list and Process class, only throw transaction non-retryable exceptions
     virtual Future<ServerWorkerInfos> getServerListAndProcessClasses() = 0;
@@ -142,6 +156,9 @@ public:
     Future<SourceServers> getSourceServersForRange(const KeyRangeRef range) override;
+    Future<std::vector<IDDTxnProcessor::DDRangeLocations>> getSourceServerInterfacesForRange(
+        const KeyRangeRef range) override;
     // Call NativeAPI implementation directly
     Future<ServerWorkerInfos> getServerListAndProcessClasses() override;
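
DDRangeLocations pairs each returned range with a dcId-keyed map of storage server interfaces. One plausible consumer shape, with the FDB types pared down to standard-library stand-ins:

    // Sketch of consuming DDRangeLocations results: for each returned range,
    // walk the per-DC server lists. All types here are stand-ins.
    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    struct RangeLocationsSketch {
        std::string rangeBegin, rangeEnd;                        // stand-in for KeyRange
        std::map<std::string, std::vector<std::string>> servers; // dcId -> server ids
    };

    void report(const std::vector<RangeLocationsSketch>& locations) {
        for (const auto& loc : locations) {
            std::printf("range [%s, %s):\n", loc.rangeBegin.c_str(), loc.rangeEnd.c_str());
            for (const auto& [dcId, srvs] : loc.servers) {
                std::printf("  dc %s: %zu servers\n", dcId.c_str(), srvs.size());
            }
        }
    }

    int main() {
        report({ { "a", "m", { { "dc0", { "ss1", "ss2" } }, { "dc1", { "ss3" } } } } });
    }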


@@ -37,6 +37,7 @@ struct DataDistributorInterface {
     RequestStream<struct GetDataDistributorMetricsRequest> dataDistributorMetrics;
     RequestStream<struct DistributorSplitRangeRequest> distributorSplitRange;
     RequestStream<struct GetStorageWigglerStateRequest> storageWigglerState;
+    RequestStream<struct TriggerAuditRequest> triggerAudit;
     DataDistributorInterface() {}
     explicit DataDistributorInterface(const struct LocalityData& l, UID id) : locality(l), myId(id) {}
@@ -58,7 +59,8 @@ struct DataDistributorInterface {
                    distributorExclCheckReq,
                    dataDistributorMetrics,
                    distributorSplitRange,
-                   storageWigglerState);
+                   storageWigglerState,
+                   triggerAudit);
     }
 };


@@ -27,8 +27,11 @@
 typedef enum { TLOG_ENCRYPTION = 0, STORAGE_SERVER_ENCRYPTION = 1, BLOB_GRANULE_ENCRYPTION = 2 } EncryptOperationType;
-inline bool isEncryptionOpSupported(EncryptOperationType operation_type, const ClientDBInfo& dbInfo) {
-    if (!dbInfo.isEncryptionEnabled) {
+inline bool isEncryptionOpSupported(EncryptOperationType operation_type) {
+    // We would check against dbInfo.isEncryptionEnabled instead, but the dbInfo may not be available before the
+    // ClusterController broadcasts it to workers, and before that broadcast encryption may appear to be disabled
+    // when it should be enabled. Moving the encryption switch to the DB config could fix the issue.
+    if (!SERVER_KNOBS->ENABLE_ENCRYPTION) {
         return false;
     }


@@ -0,0 +1,80 @@
/*
* GrvProxyTransactionTagThrottler.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/TagThrottle.actor.h"
#include "fdbserver/GrvTransactionRateInfo.h"
// GrvProxyTransactionTagThrottler is used to throttle GetReadVersionRequests based on tag quotas
// before they're pushed into priority-partitioned queues.
//
// A GrvTransactionRateInfo object and a request queue are maintained for each tag.
// The GrvTransactionRateInfo object is used to determine when a request can be released.
//
// Between each set of waits, releaseTransactions is run, releasing queued transactions
// that have passed the tag throttling stage. Transactions that are not yet ready
// are requeued during releaseTransactions.
class GrvProxyTransactionTagThrottler {
class DelayedRequest {
static uint64_t lastSequenceNumber;
double startTime;
public:
GetReadVersionRequest req;
uint64_t sequenceNumber;
explicit DelayedRequest(GetReadVersionRequest const& req)
: req(req), startTime(now()), sequenceNumber(++lastSequenceNumber) {}
void updateProxyTagThrottledDuration();
};
struct TagQueue {
Optional<GrvTransactionRateInfo> rateInfo;
Deque<DelayedRequest> requests;
TagQueue() = default;
explicit TagQueue(double rate) : rateInfo(rate) {}
void setRate(double rate);
};
// Track the budgets for each tag
TransactionTagMap<TagQueue> queues;
public:
// Called with rates received from ratekeeper
void updateRates(TransactionTagMap<double> const& newRates);
// elapsed indicates the amount of time since the last epoch was run.
// If a request is ready to be executed, it is sent to the deque
// corresponding to its priority. If not, the request remains queued.
void releaseTransactions(double elapsed,
SpannedDeque<GetReadVersionRequest>& outBatchPriority,
SpannedDeque<GetReadVersionRequest>& outDefaultPriority);
void addRequest(GetReadVersionRequest const&);
public: // testing
// Returns number of tags tracked
uint32_t size();
};
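
The class comment describes the core loop: per tag, release queued requests that pass the rate check and requeue the rest for the next epoch. A reduced sketch of that release-or-requeue loop, with the rate info collapsed to a simple token budget and all types invented:

    // Release-or-requeue sketch for the per-tag queues described above; the
    // rate check is reduced to a token budget and none of these are fdbserver
    // types.
    #include <cstdio>
    #include <deque>
    #include <map>
    #include <string>
    #include <utility>

    struct RequestSketch { std::string tag; };

    struct TagQueueSketch {
        double budget = 0; // stands in for GrvTransactionRateInfo
        std::deque<RequestSketch> requests;
    };

    void releaseTransactions(std::map<std::string, TagQueueSketch>& queues,
                             std::deque<RequestSketch>& outReady) {
        for (auto& [tag, queue] : queues) {
            std::deque<RequestSketch> notReady;
            while (!queue.requests.empty()) {
                if (queue.budget >= 1.0) {
                    queue.budget -= 1.0;
                    outReady.push_back(queue.requests.front()); // passed the throttling stage
                } else {
                    notReady.push_back(queue.requests.front()); // requeued for the next epoch
                }
                queue.requests.pop_front();
            }
            queue.requests = std::move(notReady);
        }
    }

    int main() {
        std::map<std::string, TagQueueSketch> queues;
        queues["hot"].budget = 1.0;
        queues["hot"].requests = { { "hot" }, { "hot" } };
        std::deque<RequestSketch> ready;
        releaseTransactions(queues, ready);
        std::printf("released=%zu requeued=%zu\n", ready.size(), queues["hot"].requests.size());
    }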


@@ -0,0 +1,69 @@
/*
* GrvTransactionRateInfo.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "fdbrpc/Smoother.h"
// Used by GRV Proxy to enforce rate limits received from the Ratekeeper.
//
// Between waits, the GrvTransactionRateInfo executes a "release window" starting
// with a call to the startReleaseWindow method. Within this release window, transactions are
// released while canStart returns true. At the end of the release window, the
// endReleaseWindow method is called, and the budget is updated to add or
// remove capacity.
//
// Meanwhile, the desired rate is updated through the setRate method.
//
// Smoothers are used to avoid turbulent throttling behaviour.
class GrvTransactionRateInfo {
double rate = 0.0;
double limit{ 0.0 };
double budget{ 0.0 };
bool disabled{ true };
Smoother smoothRate;
Smoother smoothReleased;
public:
explicit GrvTransactionRateInfo(double rate = 0.0);
// Determines the number of transactions that this proxy is allowed to release
// in this release window.
void startReleaseWindow();
// Checks if a "count" new transactions can be released, given that
// "numAlreadyStarted" transactions have already been released in the
// current release window.
bool canStart(int64_t numAlreadyStarted, int64_t count) const;
// Updates the budget to accumulate any extra capacity available or remove any excess that was used.
// Call at the end of a release window.
void endReleaseWindow(int64_t numStartedAtPriority, bool queueEmptyAtPriority, double elapsed);
// Smoothly sets rate. If currently disabled, reenable
void setRate(double rate);
// Smoothly sets transaction rate to 0. Call disable when new rates have not been
// set for a sufficiently long period of time.
void disable();
double getRate() const { return rate; }
double getLimit() const { return limit; }
};
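
A sketch of the release-window lifecycle the comment describes: start the window, admit transactions while the limit allows, then settle the budget at the end. The arithmetic here is illustrative only, not the fdbserver implementation:

    // Release-window lifecycle sketch; the budget arithmetic is an assumption
    // for illustration, not GrvTransactionRateInfo's actual smoothing logic.
    #include <algorithm>
    #include <cstdio>
    #include <cstdint>

    struct RateInfoSketch {
        double rate = 100.0; // transactions per second
        double budget = 0.0;
        double limit = 0.0;

        void startReleaseWindow(double elapsed) { limit = rate * elapsed + budget; }
        bool canStart(int64_t numAlreadyStarted, int64_t count) const {
            return numAlreadyStarted + count <= limit;
        }
        void endReleaseWindow(int64_t numStarted, bool queueEmpty, double elapsed) {
            (void)elapsed; // unused in this simplified sketch
            // Carry unused capacity forward only while work remains queued.
            budget = queueEmpty ? 0.0 : std::max(0.0, limit - numStarted);
        }
    };

    int main() {
        RateInfoSketch info;
        info.startReleaseWindow(/*elapsed=*/0.1); // allows ~10 transactions
        int64_t started = 0;
        while (info.canStart(started, 1))
            ++started;
        info.endReleaseWindow(started, /*queueEmpty=*/false, 0.1);
        std::printf("started=%lld leftover budget=%f\n", (long long)started, info.budget);
    }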


@ -1,299 +0,0 @@
/*
* IEncryptionKeyProvider.actor.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/BlobCipher.h"
#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_IENCRYPTIONKEYPROVIDER_ACTOR_G_H)
#define FDBSERVER_IENCRYPTIONKEYPROVIDER_ACTOR_G_H
#include "fdbserver/IEncryptionKeyProvider.actor.g.h"
#elif !defined(FDBSERVER_IENCRYPTIONKEYPROVIDER_ACTOR_H)
#define FDBSERVER_IENCRYPTIONKEYPROVIDER_ACTOR_H
#include "fdbclient/GetEncryptCipherKeys.actor.h"
#include "fdbclient/Tenant.h"
#include "fdbserver/EncryptionOpsUtils.h"
#include "fdbserver/ServerDBInfo.h"
#include "flow/Arena.h"
#include "flow/EncryptUtils.h"
#define XXH_INLINE_ALL
#include "flow/xxhash.h"
#include "flow/actorcompiler.h" // This must be the last #include.
typedef uint64_t XOREncryptionKeyID;
// EncryptionKeyRef is somewhat multi-variant, it will contain members representing the union
// of all fields relevant to any implemented encryption scheme. They are generally of
// the form
// Page Fields - fields which come from or are stored in the Page
// Secret Fields - fields which are only known by the Key Provider
// but it is up to each encoding and provider which fields are which and which ones are used
//
// TODO(yiwu): Rename and/or refactor this struct. It doesn't sound like an encryption key should
// contain page fields like encryption header.
struct EncryptionKeyRef {
EncryptionKeyRef(){};
EncryptionKeyRef(Arena& arena, const EncryptionKeyRef& toCopy)
: cipherKeys(toCopy.cipherKeys), secret(arena, toCopy.secret), id(toCopy.id) {}
int expectedSize() const { return secret.size(); }
// Fields for AESEncryptionV1
TextAndHeaderCipherKeys cipherKeys;
Optional<BlobCipherEncryptHeader> cipherHeader;
// Fields for XOREncryption_TestOnly
StringRef secret;
Optional<XOREncryptionKeyID> id;
};
typedef Standalone<EncryptionKeyRef> EncryptionKey;
// Interface used by pager to get encryption keys reading pages from disk
// and by the BTree to get encryption keys to use for new pages
class IEncryptionKeyProvider : public ReferenceCounted<IEncryptionKeyProvider> {
public:
virtual ~IEncryptionKeyProvider() {}
// Get an EncryptionKey with Secret Fields populated based on the given Page Fields.
// It is up to the implementation which fields those are.
// The output Page Fields must match the input Page Fields.
virtual Future<EncryptionKey> getSecrets(const EncryptionKeyRef& key) = 0;
// Get encryption key that should be used for a given user Key-Value range
virtual Future<EncryptionKey> getByRange(const KeyRef& begin, const KeyRef& end) = 0;
// Setting tenant prefix to tenant name map.
virtual void setTenantPrefixIndex(Reference<TenantPrefixIndex> tenantPrefixIndex) {}
virtual bool shouldEnableEncryption() const = 0;
};
// The null key provider is useful to simplify page decoding.
// It throws an error for any key info requested.
class NullKeyProvider : public IEncryptionKeyProvider {
public:
virtual ~NullKeyProvider() {}
bool shouldEnableEncryption() const override { return true; }
Future<EncryptionKey> getSecrets(const EncryptionKeyRef& key) override { throw encryption_key_not_found(); }
Future<EncryptionKey> getByRange(const KeyRef& begin, const KeyRef& end) override {
throw encryption_key_not_found();
}
};
// Key provider for dummy XOR encryption scheme
class XOREncryptionKeyProvider_TestOnly : public IEncryptionKeyProvider {
public:
XOREncryptionKeyProvider_TestOnly(std::string filename) {
ASSERT(g_network->isSimulated());
// Choose a deterministic random filename (without path) byte for secret generation
// Remove any leading directory names
size_t lastSlash = filename.find_last_of("\\/");
if (lastSlash != filename.npos) {
filename.erase(0, lastSlash);
}
xorWith = filename.empty() ? 0x5e
: (uint8_t)filename[XXH3_64bits(filename.data(), filename.size()) % filename.size()];
}
virtual ~XOREncryptionKeyProvider_TestOnly() {}
bool shouldEnableEncryption() const override { return true; }
Future<EncryptionKey> getSecrets(const EncryptionKeyRef& key) override {
if (!key.id.present()) {
throw encryption_key_not_found();
}
EncryptionKey s = key;
uint8_t secret = ~(uint8_t)key.id.get() ^ xorWith;
s.secret = StringRef(s.arena(), &secret, 1);
return s;
}
Future<EncryptionKey> getByRange(const KeyRef& begin, const KeyRef& end) override {
EncryptionKeyRef k;
k.id = end.empty() ? 0 : *(end.end() - 1);
return getSecrets(k);
}
uint8_t xorWith;
};
// Key provider that provides cipher keys randomly from a pre-generated pool. Used for testing.
class RandomEncryptionKeyProvider : public IEncryptionKeyProvider {
public:
RandomEncryptionKeyProvider() {
for (unsigned i = 0; i < NUM_CIPHER; i++) {
BlobCipherDetails cipherDetails;
cipherDetails.encryptDomainId = i;
cipherDetails.baseCipherId = deterministicRandom()->randomUInt64();
cipherDetails.salt = deterministicRandom()->randomUInt64();
cipherKeys[i] = generateCipherKey(cipherDetails);
}
}
virtual ~RandomEncryptionKeyProvider() = default;
bool shouldEnableEncryption() const override { return true; }
Future<EncryptionKey> getSecrets(const EncryptionKeyRef& key) override {
ASSERT(key.cipherHeader.present());
EncryptionKey s = key;
s.cipherKeys.cipherTextKey = cipherKeys[key.cipherHeader.get().cipherTextDetails.encryptDomainId];
s.cipherKeys.cipherHeaderKey = cipherKeys[key.cipherHeader.get().cipherHeaderDetails.encryptDomainId];
return s;
}
Future<EncryptionKey> getByRange(const KeyRef& /*begin*/, const KeyRef& /*end*/) override {
EncryptionKey s;
s.cipherKeys.cipherTextKey = getRandomCipherKey();
s.cipherKeys.cipherHeaderKey = getRandomCipherKey();
return s;
}
private:
Reference<BlobCipherKey> generateCipherKey(const BlobCipherDetails& cipherDetails) {
static unsigned char SHA_KEY[] = "3ab9570b44b8315fdb261da6b1b6c13b";
uint8_t digest[AUTH_TOKEN_SIZE];
computeAuthToken(reinterpret_cast<const unsigned char*>(&cipherDetails.baseCipherId),
sizeof(EncryptCipherBaseKeyId),
SHA_KEY,
AES_256_KEY_LENGTH,
&digest[0],
AUTH_TOKEN_SIZE);
return makeReference<BlobCipherKey>(cipherDetails.encryptDomainId,
cipherDetails.baseCipherId,
&digest[0],
AES_256_KEY_LENGTH,
cipherDetails.salt,
std::numeric_limits<int64_t>::max() /* refreshAt */,
std::numeric_limits<int64_t>::max() /* expireAt */);
}
Reference<BlobCipherKey> getRandomCipherKey() {
return cipherKeys[deterministicRandom()->randomInt(0, NUM_CIPHER)];
}
static constexpr int NUM_CIPHER = 1000;
Reference<BlobCipherKey> cipherKeys[NUM_CIPHER];
};
// Key provider which extract tenant id from range key prefixes, and fetch tenant specific encryption keys from
// EncryptKeyProxy.
class TenantAwareEncryptionKeyProvider : public IEncryptionKeyProvider {
public:
TenantAwareEncryptionKeyProvider(Reference<AsyncVar<ServerDBInfo> const> db) : db(db) {}
virtual ~TenantAwareEncryptionKeyProvider() = default;
bool shouldEnableEncryption() const override {
return isEncryptionOpSupported(EncryptOperationType::STORAGE_SERVER_ENCRYPTION, db->get().client);
}
ACTOR static Future<EncryptionKey> getSecrets(TenantAwareEncryptionKeyProvider* self, EncryptionKeyRef key) {
if (!key.cipherHeader.present()) {
TraceEvent("TenantAwareEncryptionKeyProvider_CipherHeaderMissing");
throw encrypt_ops_error();
}
TextAndHeaderCipherKeys cipherKeys =
wait(getEncryptCipherKeys(self->db, key.cipherHeader.get(), BlobCipherMetrics::KV_REDWOOD));
EncryptionKey s = key;
s.cipherKeys = cipherKeys;
return s;
}
Future<EncryptionKey> getSecrets(const EncryptionKeyRef& key) override { return getSecrets(this, key); }
ACTOR static Future<EncryptionKey> getByRange(TenantAwareEncryptionKeyProvider* self, KeyRef begin, KeyRef end) {
EncryptCipherDomainNameRef domainName;
EncryptCipherDomainId domainId = self->getEncryptionDomainId(begin, end, &domainName);
TextAndHeaderCipherKeys cipherKeys =
wait(getLatestEncryptCipherKeysForDomain(self->db, domainId, domainName, BlobCipherMetrics::KV_REDWOOD));
EncryptionKey s;
s.cipherKeys = cipherKeys;
return s;
}
Future<EncryptionKey> getByRange(const KeyRef& begin, const KeyRef& end) override {
return getByRange(this, begin, end);
}
void setTenantPrefixIndex(Reference<TenantPrefixIndex> tenantPrefixIndex) override {
ASSERT(tenantPrefixIndex.isValid());
this->tenantPrefixIndex = tenantPrefixIndex;
}
private:
EncryptCipherDomainId getEncryptionDomainId(const KeyRef& begin,
const KeyRef& end,
EncryptCipherDomainNameRef* domainName) {
int64_t domainId = SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID;
int64_t beginTenantId = getTenantId(begin, true /*inclusive*/);
int64_t endTenantId = getTenantId(end, false /*inclusive*/);
if (beginTenantId == endTenantId && beginTenantId != SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) {
ASSERT(tenantPrefixIndex.isValid());
Key tenantPrefix = TenantMapEntry::idToPrefix(beginTenantId);
auto view = tenantPrefixIndex->atLatest();
auto itr = view.find(tenantPrefix);
if (itr != view.end()) {
*domainName = *itr;
domainId = beginTenantId;
} else {
// No tenant with the same tenant id. We could be in optional or disabled tenant mode.
}
}
if (domainId == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) {
*domainName = FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME;
}
return domainId;
}
int64_t getTenantId(const KeyRef& key, bool inclusive) {
// A valid tenant id is always a valid encrypt domain id.
static_assert(INVALID_ENCRYPT_DOMAIN_ID == -1);
if (key.size() && key >= systemKeys.begin) {
return SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID;
}
if (key.size() < TENANT_PREFIX_SIZE) {
// Encryption domain information not available, leverage 'default encryption domain'
return FDB_DEFAULT_ENCRYPT_DOMAIN_ID;
}
StringRef prefix = key.substr(0, TENANT_PREFIX_SIZE);
int64_t tenantId = TenantMapEntry::prefixToId(prefix, EnforceValidTenantId::False);
if (tenantId == TenantInfo::INVALID_TENANT) {
// Encryption domain information not available, leverage 'default encryption domain'
return FDB_DEFAULT_ENCRYPT_DOMAIN_ID;
}
if (!inclusive && key.size() == TENANT_PREFIX_SIZE) {
tenantId = tenantId - 1;
}
ASSERT(tenantId >= 0);
return tenantId;
}
Reference<AsyncVar<ServerDBInfo> const> db;
Reference<TenantPrefixIndex> tenantPrefixIndex;
};
#include "flow/unactorcompiler.h"
#endif


@@ -294,7 +294,7 @@ public:
     EncodingType expectedEncodingType() const override { return EncodingType::AESEncryptionV1; }
     bool enableEncryption() const override {
-        return isEncryptionOpSupported(EncryptOperationType::STORAGE_SERVER_ENCRYPTION, db->get().client);
+        return isEncryptionOpSupported(EncryptOperationType::STORAGE_SERVER_ENCRYPTION);
     }
     bool enableEncryptionDomain() const override { return SERVER_KNOBS->REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT; }


@@ -186,6 +186,8 @@ struct KmsConnLookupDomainIdsReqInfoRef {
     EncryptCipherDomainNameRef domainName;
     KmsConnLookupDomainIdsReqInfoRef() : domainId(INVALID_ENCRYPT_DOMAIN_ID) {}
+    explicit KmsConnLookupDomainIdsReqInfoRef(Arena& arena, const KmsConnLookupDomainIdsReqInfoRef& from)
+      : domainId(from.domainId), domainName(StringRef(arena, from.domainName)) {}
     explicit KmsConnLookupDomainIdsReqInfoRef(Arena& arena, const EncryptCipherDomainId dId, StringRef name)
       : domainId(dId), domainName(StringRef(arena, name)) {}
     explicit KmsConnLookupDomainIdsReqInfoRef(const EncryptCipherDomainId dId, StringRef name)


@@ -294,8 +294,7 @@ struct ProxyCommitData {
         cx(openDBOnServer(db, TaskPriority::DefaultEndpoint, LockAware::True)), db(db),
         singleKeyMutationEvent("SingleKeyMutation"_sr), lastTxsPop(0), popRemoteTxs(false), lastStartCommit(0),
         lastCommitLatency(SERVER_KNOBS->REQUIRED_MIN_RECOVERY_DURATION), lastCommitTime(0), lastMasterReset(now()),
-        lastResolverReset(now()),
-        isEncryptionEnabled(isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION, db->get().client)) {
+        lastResolverReset(now()), isEncryptionEnabled(isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION)) {
         commitComputePerOperation.resize(SERVER_KNOBS->PROXY_COMPUTE_BUCKETS, 0.0);
     }
 };


@@ -81,7 +81,7 @@ struct GetRateInfoReply {
     // Depending on the value of SERVER_KNOBS->ENFORCE_TAG_THROTTLING_ON_PROXIES,
     // one of these fields may be populated
     Optional<PrioritizedTransactionTagMap<ClientTagThrottleLimits>> clientThrottledTags;
-    Optional<PrioritizedTransactionTagMap<double>> proxyThrottledTags;
+    Optional<TransactionTagMap<double>> proxyThrottledTags;
     template <class Ar>
     void serialize(Ar& ar) {


@@ -26,12 +26,13 @@
 #define FDBSERVER_SERVERDBINFO_H
 #pragma once
+#include "fdbclient/ConsistencyScanInterface.actor.h"
 #include "fdbserver/DataDistributorInterface.h"
 #include "fdbserver/MasterInterface.h"
 #include "fdbserver/LogSystemConfig.h"
 #include "fdbserver/RatekeeperInterface.h"
 #include "fdbserver/BlobManagerInterface.h"
-#include "fdbclient/ConsistencyScanInterface.actor.h"
+#include "fdbserver/BlobMigratorInterface.h"
 #include "fdbserver/RecoveryState.h"
 #include "fdbserver/LatencyBandConfig.h"
 #include "fdbserver/WorkerInterface.actor.h"
@@ -50,6 +51,7 @@ struct ServerDBInfo {
     MasterInterface master; // The best guess as to the most recent master, which might still be recovering
     Optional<RatekeeperInterface> ratekeeper;
     Optional<BlobManagerInterface> blobManager;
+    Optional<BlobMigratorInterface> blobMigrator;
     Optional<EncryptKeyProxyInterface> encryptKeyProxy;
     Optional<ConsistencyScanInterface> consistencyScan;
     std::vector<ResolverInterface> resolvers;
@@ -84,6 +86,7 @@ struct ServerDBInfo {
                master,
                ratekeeper,
                blobManager,
+               blobMigrator,
                encryptKeyProxy,
                consistencyScan,
                resolvers,


@@ -42,7 +42,7 @@ public:
     // For each tag and priority combination, return the throughput limit for the cluster
     // (to be shared across all GRV proxies)
-    virtual PrioritizedTransactionTagMap<double> getProxyRates(int numProxies) = 0;
+    virtual TransactionTagMap<double> getProxyRates(int numProxies) = 0;
     virtual int64_t autoThrottleCount() const = 0;
     virtual uint32_t busyReadTagCount() const = 0;
@@ -66,7 +66,7 @@ public:
     void addRequests(TransactionTag tag, int count) override;
     uint64_t getThrottledTagChangeId() const override;
     PrioritizedTransactionTagMap<ClientTagThrottleLimits> getClientRates() override;
-    PrioritizedTransactionTagMap<double> getProxyRates(int numProxies) override { throw not_implemented(); }
+    TransactionTagMap<double> getProxyRates(int numProxies) override { throw not_implemented(); }
     int64_t autoThrottleCount() const override;
     uint32_t busyReadTagCount() const override;
     uint32_t busyWriteTagCount() const override;
@@ -94,7 +94,7 @@ public:
     Future<Void> tryUpdateAutoThrottling(StorageQueueInfo const&) override;
     PrioritizedTransactionTagMap<ClientTagThrottleLimits> getClientRates() override;
-    PrioritizedTransactionTagMap<double> getProxyRates(int numProxies) override;
+    TransactionTagMap<double> getProxyRates(int numProxies) override;
     // Testing only:
 public:
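
getProxyRates(int numProxies) now hands each GRV proxy a flat per-tag rate map rather than a priority-partitioned one. One plausible equal-split reading of the numProxies parameter (the cluster quota divided evenly across proxies; the map type here is a stand-in):

    // Equal-split sketch for getProxyRates(int numProxies); the division by
    // numProxies is an assumption for illustration, and TagRateMapSketch is
    // not the FDB TransactionTagMap.
    #include <cassert>
    #include <map>
    #include <string>

    using TagRateMapSketch = std::map<std::string, double>;

    TagRateMapSketch getProxyRates(const TagRateMapSketch& clusterRates, int numProxies) {
        assert(numProxies > 0);
        TagRateMapSketch perProxy;
        for (const auto& [tag, rate] : clusterRates) {
            perProxy[tag] = rate / numProxies; // each proxy enforces an equal share
        }
        return perProxy;
    }

    int main() {
        auto rates = getProxyRates({ { "hotTag", 300.0 } }, 3);
        assert(rates["hotTag"] == 100.0);
    }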


@@ -33,6 +33,7 @@
#include "fdbserver/RatekeeperInterface.h"
#include "fdbclient/ConsistencyScanInterface.actor.h"
#include "fdbserver/BlobManagerInterface.h"
+ #include "fdbserver/BlobMigratorInterface.h"
#include "fdbserver/ResolverInterface.h"
#include "fdbclient/BlobWorkerInterface.h"
#include "fdbclient/ClientBooleanParams.h"
@@ -59,6 +60,7 @@ struct WorkerInterface {
RequestStream<struct InitializeBlobManagerRequest> blobManager;
RequestStream<struct InitializeBlobWorkerRequest> blobWorker;
RequestStream<struct InitializeConsistencyScanRequest> consistencyScan;
+ RequestStream<struct InitializeBlobMigratorRequest> blobMigrator;
RequestStream<struct InitializeResolverRequest> resolver;
RequestStream<struct InitializeStorageRequest> storage;
RequestStream<struct InitializeLogRouterRequest> logRouter;
@@ -115,6 +117,7 @@ struct WorkerInterface {
blobManager,
blobWorker,
consistencyScan,
+ blobMigrator,
resolver,
storage,
logRouter,
@@ -430,6 +433,7 @@ struct RegisterWorkerRequest {
Optional<DataDistributorInterface> distributorInterf;
Optional<RatekeeperInterface> ratekeeperInterf;
Optional<BlobManagerInterface> blobManagerInterf;
+ Optional<BlobMigratorInterface> blobMigratorInterf;
Optional<EncryptKeyProxyInterface> encryptKeyProxyInterf;
Optional<ConsistencyScanInterface> consistencyScanInterf;
Standalone<VectorRef<StringRef>> issues;
@@ -452,6 +456,7 @@ struct RegisterWorkerRequest {
Optional<DataDistributorInterface> ddInterf,
Optional<RatekeeperInterface> rkInterf,
Optional<BlobManagerInterface> bmInterf,
+ Optional<BlobMigratorInterface> mgInterf,
Optional<EncryptKeyProxyInterface> ekpInterf,
Optional<ConsistencyScanInterface> csInterf,
bool degraded,
@@ -461,9 +466,10 @@ struct RegisterWorkerRequest {
ConfigBroadcastInterface configBroadcastInterface)
: wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo),
generation(generation), distributorInterf(ddInterf), ratekeeperInterf(rkInterf), blobManagerInterf(bmInterf),
- encryptKeyProxyInterf(ekpInterf), consistencyScanInterf(csInterf), degraded(degraded),
- lastSeenKnobVersion(lastSeenKnobVersion), knobConfigClassSet(knobConfigClassSet), requestDbInfo(false),
- recoveredDiskFiles(recoveredDiskFiles), configBroadcastInterface(configBroadcastInterface) {}
+ blobMigratorInterf(mgInterf), encryptKeyProxyInterf(ekpInterf), consistencyScanInterf(csInterf),
+ degraded(degraded), lastSeenKnobVersion(lastSeenKnobVersion), knobConfigClassSet(knobConfigClassSet),
+ requestDbInfo(false), recoveredDiskFiles(recoveredDiskFiles),
+ configBroadcastInterface(configBroadcastInterface) {}
template <class Ar>
void serialize(Ar& ar) {
@@ -476,6 +482,7 @@ struct RegisterWorkerRequest {
distributorInterf,
ratekeeperInterf,
blobManagerInterf,
+ blobMigratorInterf,
encryptKeyProxyInterf,
consistencyScanInterf,
issues,
@@ -762,6 +769,19 @@ struct InitializeBlobManagerRequest {
}
};
+ struct InitializeBlobMigratorRequest {
+ constexpr static FileIdentifier file_identifier = 7932681;
+ UID reqId;
+ ReplyPromise<BlobMigratorInterface> reply;
+ InitializeBlobMigratorRequest() {}
+ explicit InitializeBlobMigratorRequest(UID uid) : reqId(uid) {}
+ template <class Ar>
+ void serialize(Ar& ar) {
+ serializer(ar, reqId, reply);
+ }
+ };
struct InitializeResolverRequest {
constexpr static FileIdentifier file_identifier = 7413317;
LifetimeToken masterLifetime;
@@ -1006,6 +1026,7 @@ struct Role {
static const Role RATEKEEPER;
static const Role BLOB_MANAGER;
static const Role BLOB_WORKER;
+ static const Role BLOB_MIGRATOR;
static const Role STORAGE_CACHE;
static const Role COORDINATOR;
static const Role BACKUP;
@@ -1042,6 +1063,8 @@ struct Role {
return BLOB_MANAGER;
case ProcessClass::BlobWorker:
return BLOB_WORKER;
+ case ProcessClass::BlobMigrator:
+ return BLOB_MIGRATOR;
case ProcessClass::StorageCache:
return STORAGE_CACHE;
case ProcessClass::Backup:
@@ -1173,6 +1196,7 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface ddi, Reference<Async
ACTOR Future<Void> ratekeeper(RatekeeperInterface rki, Reference<AsyncVar<ServerDBInfo> const> db);
ACTOR Future<Void> consistencyScan(ConsistencyScanInterface csInterf, Reference<AsyncVar<ServerDBInfo> const> dbInfo);
ACTOR Future<Void> blobManager(BlobManagerInterface bmi, Reference<AsyncVar<ServerDBInfo> const> db, int64_t epoch);
+ ACTOR Future<Void> blobMigrator(BlobMigratorInterface mgi, Reference<AsyncVar<ServerDBInfo> const> db);
ACTOR Future<Void> storageCacheServer(StorageServerInterface interf,
uint16_t id,
Reference<AsyncVar<ServerDBInfo> const> db);
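InitializeBlobMigratorRequest above follows the usual worker-recruitment pattern: the recruiter sends the request over the worker's new blobMigrator stream and receives the migrator's interface in reply. A rough sketch of the recruiting side (illustrative only, not part of this diff; assumes the actor compiler and the interfaces declared in this file):

// Hypothetical sketch: ask a chosen worker to become the blob migrator.
ACTOR Future<Void> recruitBlobMigrator(WorkerInterface worker) {
    InitializeBlobMigratorRequest req(deterministicRandom()->randomUniqueID());
    BlobMigratorInterface migrator = wait(worker.blobMigrator.getReply(req));
    TraceEvent("BlobMigratorRecruited").detail("WorkerID", worker.id()).detail("MigratorID", migrator.id());
    return Void();
}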

View File

@@ -64,6 +64,8 @@ struct WorkloadContext {
};
struct TestWorkload : NonCopyable, WorkloadContext, ReferenceCounted<TestWorkload> {
+ // Implementations of TestWorkload need to provide their name by defining a static member variable called NAME:
+ // static constexpr const char* NAME = "WorkloadName";
int phases;
// Subclasses are expected to also have a constructor with this signature (to work with WorkloadFactory<>):
@@ -75,6 +77,8 @@ struct TestWorkload : NonCopyable, WorkloadContext, ReferenceCounted<TestWorkloa
}
virtual ~TestWorkload(){};
virtual Future<Void> initialized() { return Void(); }
+ // WARNING: this method must not be implemented by a workload directly; it is implemented by the workload
+ // factory. Provide a static member variable called NAME instead.
virtual std::string description() const = 0;
virtual void disableFailureInjectionWorkloads(std::set<std::string>& out) const;
virtual Future<Void> setup(Database const& cx) { return Void(); }
@@ -94,11 +98,26 @@ private:
virtual void getMetrics(std::vector<PerfMetric>& m) = 0;
};
+ struct NoOptions {};
+ template <class Workload, bool isFailureInjectionWorkload = false>
+ struct TestWorkloadImpl : Workload {
+ static_assert(std::is_convertible_v<Workload&, TestWorkload&>);
+ static_assert(std::is_convertible_v<decltype(Workload::NAME), std::string>,
+ "Workload must have a static member `NAME` which is convertible to string");
+ static_assert(std::is_same_v<decltype(&TestWorkload::description), decltype(&Workload::description)>,
+ "Workload must not override TestWorkload::description");
+ TestWorkloadImpl(WorkloadContext const& wcx) : Workload(wcx) {}
+ template <bool E = isFailureInjectionWorkload>
+ TestWorkloadImpl(WorkloadContext const& wcx, std::enable_if_t<E, NoOptions> o) : Workload(wcx, o) {}
+ std::string description() const override { return Workload::NAME; }
+ };
struct CompoundWorkload;
class DeterministicRandom;
- struct NoOptions {};
struct FailureInjectionWorkload : TestWorkload {
FailureInjectionWorkload(WorkloadContext const&);
virtual ~FailureInjectionWorkload() {}
@@ -126,12 +145,11 @@ struct FailureInjectorFactory : IFailureInjectorFactory {
IFailureInjectorFactory::factories().push_back(Reference<IFailureInjectorFactory>::addRef(this));
}
Reference<FailureInjectionWorkload> create(WorkloadContext const& wcx) override {
- return makeReference<W>(wcx, NoOptions());
+ return makeReference<TestWorkloadImpl<W, true>>(wcx, NoOptions());
}
};
struct CompoundWorkload : TestWorkload {
- bool runFailureWorkloads = true;
std::vector<Reference<TestWorkload>> workloads;
std::vector<Reference<FailureInjectionWorkload>> failureInjection;
@@ -213,14 +231,20 @@ struct IWorkloadFactory : ReferenceCounted<IWorkloadFactory> {
virtual Reference<TestWorkload> create(WorkloadContext const& wcx) = 0;
};
- template <class WorkloadType>
+ FDB_DECLARE_BOOLEAN_PARAM(UntrustedMode);
+ template <class Workload>
struct WorkloadFactory : IWorkloadFactory {
- bool asClient;
- WorkloadFactory(const char* name, bool asClient = false) : asClient(asClient) {
- factories()[name] = Reference<IWorkloadFactory>::addRef(this);
+ static_assert(std::is_convertible_v<decltype(Workload::NAME), std::string>,
+ "Each workload must have a Workload::NAME member");
+ using WorkloadType = TestWorkloadImpl<Workload>;
+ bool runInUntrustedClient;
+ WorkloadFactory(UntrustedMode runInUntrustedClient = UntrustedMode::False)
+ : runInUntrustedClient(runInUntrustedClient) {
+ factories()[WorkloadType::NAME] = Reference<IWorkloadFactory>::addRef(this);
}
Reference<TestWorkload> create(WorkloadContext const& wcx) override {
- if (g_network->isSimulated() && asClient) {
+ if (g_network->isSimulated() && runInUntrustedClient) {
return makeReference<ClientWorkload>(
[](WorkloadContext const& wcx) { return makeReference<WorkloadType>(wcx); }, wcx);
}
@@ -228,7 +252,7 @@ struct WorkloadFactory : IWorkloadFactory {
}
};
- #define REGISTER_WORKLOAD(classname) WorkloadFactory<classname> classname##WorkloadFactory(#classname)
+ #define REGISTER_WORKLOAD(classname) WorkloadFactory<classname> classname##WorkloadFactory
struct DistributedTestResults {
std::vector<PerfMetric> metrics;
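Taken together, the changes above mean a workload no longer implements description() or passes a name string to its factory: it declares a static NAME member, and TestWorkloadImpl supplies description() from it. A minimal sketch of the new registration pattern (hypothetical workload, not part of this diff):

// Hypothetical example: NAME replaces both description() and the old factory string argument.
struct DummyWorkload : TestWorkload {
    static constexpr auto NAME = "Dummy";
    DummyWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {}
    Future<Void> start(Database const& cx) override { return Void(); }
    Future<bool> check(Database const& cx) override { return true; }
    void getMetrics(std::vector<PerfMetric>& m) override {}
};
// Registers under DummyWorkload::NAME; the factory wraps the class in TestWorkloadImpl<DummyWorkload>,
// which provides description() and triggers the static_asserts at compile time if NAME is missing.
REGISTER_WORKLOAD(DummyWorkload);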

View File

@@ -28,6 +28,7 @@
#include "fdbrpc/TenantInfo.h"
#include "flow/ApiVersion.h"
#include "fmt/format.h"
+ #include "fdbclient/Audit.h"
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/FDBTypes.h"
#include "fdbrpc/fdbrpc.h"
@@ -1016,6 +1017,8 @@ public:
FlowLock serveFetchCheckpointParallelismLock;
+ FlowLock serveAuditStorageParallelismLock;
int64_t instanceID;
Promise<Void> otherError;
@@ -1225,6 +1228,12 @@ public:
specialCounter(cc, "ServeFetchCheckpointWaiting", [self]() {
return self->serveFetchCheckpointParallelismLock.waiters();
});
+ specialCounter(cc, "ServeValidateStorageActive", [self]() {
+ return self->serveAuditStorageParallelismLock.activePermits();
+ });
+ specialCounter(cc, "ServeValidateStorageWaiting", [self]() {
+ return self->serveAuditStorageParallelismLock.waiters();
+ });
specialCounter(
cc, "ChangeFeedDiskReadsActive", [self]() { return self->changeFeedDiskReadsLock.activePermits(); });
specialCounter(
@@ -1291,6 +1300,7 @@ public:
changeFeedDiskReadsLock(SERVER_KNOBS->CHANGE_FEED_DISK_READS_PARALLELISM),
fetchKeysBytesBudget(SERVER_KNOBS->STORAGE_FETCH_BYTES), fetchKeysBudgetUsed(false),
serveFetchCheckpointParallelismLock(SERVER_KNOBS->SERVE_FETCH_CHECKPOINT_PARALLELISM),
+ serveAuditStorageParallelismLock(SERVER_KNOBS->SERVE_AUDIT_STORAGE_PARALLELISM),
instanceID(deterministicRandom()->randomUniqueID().first()), shuttingDown(false), behind(false),
versionBehind(false), debug_inApplyUpdate(false), debug_lastValidateTime(0), lastBytesInputEBrake(0),
lastDurableVersionEBrake(0), maxQueryQueue(0), transactionTagCounter(ssi.id()),
@@ -2864,20 +2874,6 @@ ACTOR Future<std::pair<ChangeFeedStreamReply, bool>> getChangeFeedMutations(Stor
}
}
- if (DEBUG_CF_TRACE) {
- TraceEvent(SevDebug, "ChangeFeedMutationsDone", data->thisServerID)
- .detail("FeedID", req.rangeID)
- .detail("StreamUID", streamUID)
- .detail("Range", req.range)
- .detail("Begin", req.begin)
- .detail("End", req.end)
- .detail("FirstVersion", reply.mutations.empty() ? invalidVersion : reply.mutations.front().version)
- .detail("LastVersion", reply.mutations.empty() ? invalidVersion : reply.mutations.back().version)
- .detail("Count", reply.mutations.size())
- .detail("GotAll", gotAll)
- .detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress());
- }
if (DEBUG_CF_MISSING(req.rangeID, req.range, req.begin, reply.mutations.back().version) && !req.canReadPopped) {
bool foundVersion = false;
bool foundKey = false;
@@ -2929,6 +2925,21 @@
reply.popVersion = feedInfo->emptyVersion + 1;
+ if (DEBUG_CF_TRACE) {
+ TraceEvent(SevDebug, "ChangeFeedMutationsDone", data->thisServerID)
+ .detail("FeedID", req.rangeID)
+ .detail("StreamUID", streamUID)
+ .detail("Range", req.range)
+ .detail("Begin", req.begin)
+ .detail("End", req.end)
+ .detail("FirstVersion", reply.mutations.empty() ? invalidVersion : reply.mutations.front().version)
+ .detail("LastVersion", reply.mutations.empty() ? invalidVersion : reply.mutations.back().version)
+ .detail("PopVersion", reply.popVersion)
+ .detail("Count", reply.mutations.size())
+ .detail("GotAll", gotAll)
+ .detail("PeerAddr", req.reply.getEndpoint().getPrimaryAddress());
+ }
// If the SS's version advanced at all during any of the waits, the read from memory may have missed some
// mutations, so gotAll can only be true if data->version didn't change over the course of this actor
return std::make_pair(reply, gotAll);
@@ -4138,6 +4149,322 @@ Key constructMappedKey(KeyValueRef* keyValue, std::vector<Optional<Tuple>>& vec,
return mappedKeyTuple.pack();
}
+ ACTOR Future<Void> validateRangeAgainstServer(StorageServer* data,
+ KeyRange range,
+ Version version,
+ StorageServerInterface remoteServer) {
+ TraceEvent(SevInfo, "ValidateRangeAgainstServerBegin", data->thisServerID)
+ .detail("Range", range)
+ .detail("Version", version)
+ .detail("RemoteServer", remoteServer.toString());
+ state int validatedKeys = 0;
+ state std::string error;
+ loop {
+ try {
+ std::vector<Future<ErrorOr<GetKeyValuesReply>>> fs;
+ int limit = 1e4;
+ int limitBytes = CLIENT_KNOBS->REPLY_BYTE_LIMIT;
+ GetKeyValuesRequest req;
+ req.begin = firstGreaterOrEqual(range.begin);
+ req.end = firstGreaterOrEqual(range.end);
+ req.limit = limit;
+ req.limitBytes = limitBytes;
+ req.version = version;
+ req.tags = TagSet();
+ fs.push_back(remoteServer.getKeyValues.getReplyUnlessFailedFor(req, 2, 0));
+ GetKeyValuesRequest localReq;
+ localReq.begin = firstGreaterOrEqual(range.begin);
+ localReq.end = firstGreaterOrEqual(range.end);
+ localReq.limit = limit;
+ localReq.limitBytes = limitBytes;
+ localReq.version = version;
+ localReq.tags = TagSet();
+ data->actors.add(getKeyValuesQ(data, localReq));
+ fs.push_back(errorOr(localReq.reply.getFuture()));
+ std::vector<ErrorOr<GetKeyValuesReply>> reps = wait(getAll(fs));
+ for (int i = 0; i < reps.size(); ++i) {
+ if (reps[i].isError()) {
+ TraceEvent(SevWarn, "ValidateRangeGetKeyValuesError", data->thisServerID)
+ .errorUnsuppressed(reps[i].getError())
+ .detail("ReplyIndex", i)
+ .detail("Range", range);
+ throw reps[i].getError();
+ }
+ if (reps[i].get().error.present()) {
+ TraceEvent(SevWarn, "ValidateRangeGetKeyValuesError", data->thisServerID)
+ .errorUnsuppressed(reps[i].get().error.get())
+ .detail("ReplyIndex", i)
+ .detail("Range", range);
+ throw reps[i].get().error.get();
+ }
+ }
+ GetKeyValuesReply remote = reps[0].get(), local = reps[1].get();
+ Key lastKey = range.begin;
+ const int end = std::min(local.data.size(), remote.data.size());
+ int i = 0;
+ for (; i < end; ++i) {
+ KeyValueRef remoteKV = remote.data[i];
+ KeyValueRef localKV = local.data[i];
+ if (!range.contains(remoteKV.key) || !range.contains(localKV.key)) {
+ TraceEvent(SevDebug, "SSValidateRangeKeyOutOfRange", data->thisServerID)
+ .detail("Range", range)
+ .detail("RemoteServer", remoteServer.toString().c_str())
+ .detail("LocalKey", Traceable<StringRef>::toString(localKV.key).c_str())
+ .detail("RemoteKey", Traceable<StringRef>::toString(remoteKV.key).c_str());
+ throw wrong_shard_server();
+ }
+ if (remoteKV.key != localKV.key) {
+ error = format("Key Mismatch: local server (%016llx): %s, remote server(%016llx) %s",
+ data->thisServerID.first(),
+ Traceable<StringRef>::toString(localKV.key).c_str(),
+ remoteServer.uniqueID.first(),
+ Traceable<StringRef>::toString(remoteKV.key).c_str());
+ } else if (remoteKV.value != localKV.value) {
+ error = format("Value Mismatch for Key %s: local server (%016llx): %s, remote server(%016llx) %s",
+ Traceable<StringRef>::toString(localKV.key).c_str(),
+ data->thisServerID.first(),
+ Traceable<StringRef>::toString(localKV.value).c_str(),
+ remoteServer.uniqueID.first(),
+ Traceable<StringRef>::toString(remoteKV.value).c_str());
+ } else {
+ TraceEvent(SevVerbose, "ValidatedKey", data->thisServerID).detail("Key", localKV.key);
+ ++validatedKeys;
+ }
+ lastKey = localKV.key;
+ }
+ if (!error.empty()) {
+ break;
+ }
+ if (!local.more && !remote.more && local.data.size() == remote.data.size()) {
+ break;
+ } else if (i >= local.data.size() && !local.more && i < remote.data.size()) {
error = format("Missing key(s) form local server (%lld), next key: %s, remote server(%016llx) ",
data->thisServerID.first(),
Traceable<StringRef>::toString(remote.data[i].key).c_str(),
remoteServer.uniqueID.first());
break;
} else if (i >= remote.data.size() && !remote.more && i < local.data.size()) {
error = format("Missing key(s) form remote server (%lld), next local server(%016llx) key: %s",
remoteServer.uniqueID.first(),
data->thisServerID.first(),
Traceable<StringRef>::toString(local.data[i].key).c_str());
break;
+ }
+ range = KeyRangeRef(keyAfter(lastKey), range.end);
+ } catch (Error& e) {
+ TraceEvent(SevWarnAlways, "ValidateRangeAgainstServerError", data->thisServerID)
+ .errorUnsuppressed(e)
+ .detail("RemoteServer", remoteServer.toString())
+ .detail("Range", range)
+ .detail("Version", version);
+ throw e;
+ }
+ }
+ if (!error.empty()) {
+ TraceEvent(SevError, "ValidateRangeAgainstServerError", data->thisServerID)
+ .detail("Range", range)
+ .detail("Version", version)
+ .detail("ErrorMessage", error)
+ .detail("RemoteServer", remoteServer.toString());
+ }
+ TraceEvent(SevDebug, "ValidateRangeAgainstServerEnd", data->thisServerID)
+ .detail("Range", range)
+ .detail("Version", version)
+ .detail("ValidatedKeys", validatedKeys)
+ .detail("Servers", remoteServer.toString());
+ return Void();
+ }
+ ACTOR Future<Void> validateRangeShard(StorageServer* data, KeyRange range, std::vector<UID> candidates) {
+ TraceEvent(SevDebug, "ServeValidateRangeShardBegin", data->thisServerID)
+ .detail("Range", range)
+ .detail("Servers", describe(candidates));
+ state Version version;
+ state std::vector<Optional<Value>> serverListValues;
+ state Transaction tr(data->cx);
+ tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
+ tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
+ loop {
+ try {
+ std::vector<Future<Optional<Value>>> serverListEntries;
+ for (const UID& id : candidates) {
+ serverListEntries.push_back(tr.get(serverListKeyFor(id)));
+ }
+ std::vector<Optional<Value>> serverListValues_ = wait(getAll(serverListEntries));
+ serverListValues = serverListValues_;
+ Version version_ = wait(tr.getReadVersion());
+ version = version_;
+ break;
+ } catch (Error& e) {
+ wait(tr.onError(e));
+ }
+ }
+ std::unordered_map<std::string, std::vector<StorageServerInterface>> ssis;
+ std::string thisDcId;
+ for (const auto& v : serverListValues) {
+ if (!v.present()) {
+ continue;
+ }
+ const StorageServerInterface ssi = decodeServerListValue(v.get());
+ if (ssi.uniqueID == data->thisServerID) {
+ thisDcId = ssi.locality.describeDcId();
+ }
+ ssis[ssi.locality.describeDcId()].push_back(ssi);
+ }
+ if (ssis.size() < 2) {
+ TraceEvent(SevWarn, "ServeValidateRangeShardNotHAConfig", data->thisServerID)
+ .detail("Range", range)
+ .detail("Servers", describe(candidates));
+ return Void();
+ }
+ StorageServerInterface* remoteServer = nullptr;
+ for (auto& [dcId, ssiList] : ssis) {
+ if (dcId != thisDcId) {
+ if (ssiList.empty()) {
+ break;
+ }
+ const int idx = deterministicRandom()->randomInt(0, ssiList.size());
+ remoteServer = &ssiList[idx];
+ break;
+ }
+ }
+ if (remoteServer != nullptr) {
+ wait(validateRangeAgainstServer(data, range, version, *remoteServer));
+ } else {
+ TraceEvent(SevWarn, "ServeValidateRangeShardRemoteNotFound", data->thisServerID)
+ .detail("Range", range)
+ .detail("Servers", describe(candidates));
+ throw audit_storage_failed();
+ }
+ return Void();
+ }
+ ACTOR Future<Void> validateRangeAgainstServers(StorageServer* data, KeyRange range, std::vector<UID> targetServers) {
+ TraceEvent(SevDebug, "ValidateRangeAgainstServersBegin", data->thisServerID)
+ .detail("Range", range)
+ .detail("TargetServers", describe(targetServers));
+ state Version version;
+ state std::vector<Optional<Value>> serverListValues;
+ state Transaction tr(data->cx);
+ tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
+ tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
+ loop {
+ try {
+ std::vector<Future<Optional<Value>>> serverListEntries;
+ for (const UID& id : targetServers) {
+ if (id != data->thisServerID) {
+ serverListEntries.push_back(tr.get(serverListKeyFor(id)));
+ }
+ }
+ std::vector<Optional<Value>> serverListValues_ = wait(getAll(serverListEntries));
+ serverListValues = serverListValues_;
+ Version version_ = wait(tr.getReadVersion());
+ version = version_;
+ break;
+ } catch (Error& e) {
+ wait(tr.onError(e));
+ }
+ }
+ std::vector<Future<Void>> fs;
+ for (const auto& v : serverListValues) {
+ if (!v.present()) {
+ TraceEvent(SevWarn, "ValidateRangeRemoteServerNotFound", data->thisServerID).detail("Range", range);
+ throw audit_storage_failed();
+ }
+ fs.push_back(validateRangeAgainstServer(data, range, version, decodeServerListValue(v.get())));
+ }
+ wait(waitForAll(fs));
+ return Void();
+ }
+ ACTOR Future<Void> auditStorageQ(StorageServer* data, AuditStorageRequest req) {
+ wait(data->serveAuditStorageParallelismLock.take(TaskPriority::DefaultYield));
+ state FlowLock::Releaser holder(data->serveAuditStorageParallelismLock);
+ TraceEvent(SevInfo, "ServeAuditStorageBegin", data->thisServerID)
+ .detail("RequestID", req.id)
+ .detail("Range", req.range)
+ .detail("AuditType", req.type)
+ .detail("TargetServers", describe(req.targetServers));
+ state Key begin = req.range.begin;
+ state std::vector<Future<Void>> fs;
+ try {
+ if (req.targetServers.empty()) {
+ while (begin < req.range.end) {
+ state Transaction tr(data->cx);
+ tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
+ tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
+ try {
+ state RangeResult shards = wait(krmGetRanges(&tr,
+ keyServersPrefix,
+ req.range,
+ SERVER_KNOBS->MOVE_SHARD_KRM_ROW_LIMIT,
+ SERVER_KNOBS->MOVE_SHARD_KRM_BYTE_LIMIT));
+ ASSERT(!shards.empty());
+ state RangeResult UIDtoTagMap = wait(tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY));
+ ASSERT(!UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY);
+ for (int i = 0; i < shards.size() - 1; ++i) {
+ std::vector<UID> src;
+ std::vector<UID> dest;
+ UID srcId, destId;
+ decodeKeyServersValue(UIDtoTagMap, shards[i].value, src, dest, srcId, destId);
+ fs.push_back(validateRangeShard(data, KeyRangeRef(shards[i].key, shards[i + 1].key), src));
+ begin = shards[i + 1].key;
+ }
+ } catch (Error& e) {
+ wait(tr.onError(e));
+ }
+ }
+ } else {
+ fs.push_back(validateRangeAgainstServers(data, req.range, req.targetServers));
+ }
+ wait(waitForAll(fs));
+ AuditStorageState res(req.id, req.getType());
+ res.setPhase(AuditPhase::Complete);
+ req.reply.send(res);
+ } catch (Error& e) {
+ TraceEvent(SevWarn, "ServeAuditStorageError", data->thisServerID)
+ .errorUnsuppressed(e)
+ .detail("RequestID", req.id)
+ .detail("Range", req.range)
+ .detail("AuditType", req.type);
+ req.reply.sendError(audit_storage_failed());
+ }
+ return Void();
+ }
TEST_CASE("/fdbserver/storageserver/constructMappedKey") { TEST_CASE("/fdbserver/storageserver/constructMappedKey") {
Key key = Tuple::makeTuple("key-0"_sr, "key-1"_sr, "key-2"_sr).getDataAsStandalone(); Key key = Tuple::makeTuple("key-0"_sr, "key-1"_sr, "key-2"_sr).getDataAsStandalone();
Value value = Tuple::makeTuple("value-0"_sr, "value-1"_sr, "value-2"_sr).getDataAsStandalone(); Value value = Tuple::makeTuple("value-0"_sr, "value-1"_sr, "value-2"_sr).getDataAsStandalone();
@ -8780,6 +9107,16 @@ ACTOR Future<Void> updateStorage(StorageServer* data) {
if (info != data->uidChangeFeed.end()) { if (info != data->uidChangeFeed.end()) {
// Cannot yield in mutation updating loop because of race with fetchVersion // Cannot yield in mutation updating loop because of race with fetchVersion
Version alreadyFetched = std::max(info->second->fetchVersion, info->second->durableFetchVersion.get()); Version alreadyFetched = std::max(info->second->fetchVersion, info->second->durableFetchVersion.get());
if (info->second->removing) {
auto cleanupPending = data->changeFeedCleanupDurable.find(info->second->id);
if (cleanupPending != data->changeFeedCleanupDurable.end() &&
cleanupPending->second <= newOldestVersion) {
// due to a race, we just applied a cleanup mutation, but feed updates happen just after. Don't
// write any mutations for this feed.
curFeed++;
continue;
}
}
for (auto& it : info->second->mutations) { for (auto& it : info->second->mutations) {
if (it.version <= alreadyFetched) { if (it.version <= alreadyFetched) {
continue; continue;
@ -10366,6 +10703,9 @@ ACTOR Future<Void> storageServerCore(StorageServer* self, StorageServerInterface
when(FetchCheckpointKeyValuesRequest req = waitNext(ssi.fetchCheckpointKeyValues.getFuture())) { when(FetchCheckpointKeyValuesRequest req = waitNext(ssi.fetchCheckpointKeyValues.getFuture())) {
self->actors.add(fetchCheckpointKeyValuesQ(self, req)); self->actors.add(fetchCheckpointKeyValuesQ(self, req));
} }
when(AuditStorageRequest req = waitNext(ssi.auditStorage.getFuture())) {
self->actors.add(auditStorageQ(self, req));
}
when(wait(updateProcessStatsTimer)) { when(wait(updateProcessStatsTimer)) {
updateProcessStats(self); updateProcessStats(self);
updateProcessStatsTimer = delay(SERVER_KNOBS->FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL); updateProcessStatsTimer = delay(SERVER_KNOBS->FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL);
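auditStorageQ above is reached through the storage server's new auditStorage endpoint, registered in storageServerCore. A rough sketch of a caller (illustrative only; the request constructor and the AuditType value are assumptions inferred from the fields used above, not confirmed by this diff):

// Hypothetical sketch: ask one storage server to audit a range against other replicas.
ACTOR Future<Void> requestAudit(StorageServerInterface ssi, KeyRange range) {
    state AuditStorageRequest req(deterministicRandom()->randomUniqueID(), range, AuditType::ValidateHA);
    AuditStorageState result = wait(ssi.auditStorage.getReply(req));
    ASSERT(result.getPhase() == AuditPhase::Complete); // auditStorageQ replies Complete on success
    TraceEvent("AuditStorageDone").detail("AuditID", req.id).detail("Range", range);
    return Void();
}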

View File

@@ -46,6 +46,8 @@
#include "fdbserver/WorkerInterface.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
+ FDB_DEFINE_BOOLEAN_PARAM(UntrustedMode);
WorkloadContext::WorkloadContext() {}
WorkloadContext::WorkloadContext(const WorkloadContext& r)

View File

@@ -23,6 +23,7 @@
#include <boost/lexical_cast.hpp>
#include "fdbclient/FDBTypes.h"
+ #include "fdbserver/BlobMigratorInterface.h"
#include "flow/ApiVersion.h"
#include "flow/IAsyncFile.h"
#include "fdbrpc/Locality.h"
@@ -561,6 +562,7 @@ ACTOR Future<Void> registrationClient(
Reference<AsyncVar<Optional<DataDistributorInterface>> const> ddInterf,
Reference<AsyncVar<Optional<RatekeeperInterface>> const> rkInterf,
Reference<AsyncVar<Optional<std::pair<int64_t, BlobManagerInterface>>> const> bmInterf,
+ Reference<AsyncVar<Optional<BlobMigratorInterface>> const> blobMigratorInterf,
Reference<AsyncVar<Optional<EncryptKeyProxyInterface>> const> ekpInterf,
Reference<AsyncVar<Optional<ConsistencyScanInterface>> const> csInterf,
Reference<AsyncVar<bool> const> degraded,
@@ -602,6 +604,7 @@ ACTOR Future<Void> registrationClient(
ddInterf->get(),
rkInterf->get(),
bmInterf->get().present() ? bmInterf->get().get().second : Optional<BlobManagerInterface>(),
+ blobMigratorInterf->get(),
ekpInterf->get(),
csInterf->get(),
degraded->get(),
@@ -674,6 +677,7 @@ ACTOR Future<Void> registrationClient(
when(wait(rkInterf->onChange())) { break; }
when(wait(csInterf->onChange())) { break; }
when(wait(bmInterf->onChange())) { break; }
+ when(wait(blobMigratorInterf->onChange())) { break; }
when(wait(ekpInterf->onChange())) { break; }
when(wait(degraded->onChange())) { break; }
when(wait(FlowTransport::transport().onIncompatibleChanged())) { break; }
@@ -707,6 +711,10 @@ bool addressInDbAndPrimaryDc(const NetworkAddress& address, Reference<AsyncVar<S
return true;
}
+ if (dbi.blobMigrator.present() && dbi.blobMigrator.get().address() == address) {
+ return true;
+ }
if (dbi.encryptKeyProxy.present() && dbi.encryptKeyProxy.get().address() == address) {
return true;
}
@@ -1651,6 +1659,8 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
state Reference<AsyncVar<Optional<RatekeeperInterface>>> rkInterf(new AsyncVar<Optional<RatekeeperInterface>>());
state Reference<AsyncVar<Optional<std::pair<int64_t, BlobManagerInterface>>>> bmEpochAndInterf(
new AsyncVar<Optional<std::pair<int64_t, BlobManagerInterface>>>());
+ state Reference<AsyncVar<Optional<BlobMigratorInterface>>> blobMigratorInterf(
+ new AsyncVar<Optional<BlobMigratorInterface>>());
state UID lastBMRecruitRequestId;
state Reference<AsyncVar<Optional<EncryptKeyProxyInterface>>> ekpInterf(
new AsyncVar<Optional<EncryptKeyProxyInterface>>());
@@ -1977,6 +1987,7 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
ddInterf,
rkInterf,
bmEpochAndInterf,
+ blobMigratorInterf,
ekpInterf,
csInterf,
degraded,
@@ -2023,8 +2034,11 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
localInfo.distributor.present() ? localInfo.distributor.get().id() : UID())
.detail("BlobManagerID",
localInfo.blobManager.present() ? localInfo.blobManager.get().id() : UID())
+ .detail("BlobMigratorID",
+ localInfo.blobMigrator.present() ? localInfo.blobMigrator.get().id() : UID())
.detail("EncryptKeyProxyID",
- localInfo.encryptKeyProxy.present() ? localInfo.encryptKeyProxy.get().id() : UID());
+ localInfo.encryptKeyProxy.present() ? localInfo.encryptKeyProxy.get().id() : UID())
+ .detail("IsEncryptionEnabled", localInfo.client.isEncryptionEnabled);
dbInfo->set(localInfo);
}
@@ -2242,6 +2256,31 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
TraceEvent("BlobManagerReceived", req.reqId).detail("BlobManagerId", recruited.id());
req.reply.send(recruited);
}
+ when(InitializeBlobMigratorRequest req = waitNext(interf.blobMigrator.getFuture())) {
+ LocalLineage _;
+ getCurrentLineage()->modify(&RoleLineage::role) = ProcessClass::ClusterRole::BlobMigrator;
+ BlobMigratorInterface recruited(locality, req.reqId);
+ recruited.initEndpoints();
+ if (blobMigratorInterf->get().present()) {
+ recruited = blobMigratorInterf->get().get();
+ CODE_PROBE(true, "Recruited while already a blob migrator.");
+ } else {
+ startRole(Role::BLOB_MIGRATOR, recruited.id(), interf.id());
+ DUMPTOKEN(recruited.waitFailure);
+ Future<Void> blobMigratorProcess = blobMigrator(recruited, dbInfo);
+ errorForwarders.add(forwardError(errors,
+ Role::BLOB_MIGRATOR,
+ recruited.id(),
+ setWhenDoneOrError(blobMigratorProcess,
+ blobMigratorInterf,
+ Optional<BlobMigratorInterface>())));
+ blobMigratorInterf->set(Optional<BlobMigratorInterface>(recruited));
+ }
+ TraceEvent("BlobMigrator_InitRequest", req.reqId).detail("BlobMigratorId", recruited.id());
+ req.reply.send(recruited);
+ }
when(InitializeBackupRequest req = waitNext(interf.backup.getFuture())) {
if (!backupWorkerCache.exists(req.reqId)) {
LocalLineage _;
@@ -2727,8 +2766,7 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
f.cancel();
state Error e = err;
bool ok = e.code() == error_code_please_reboot || e.code() == error_code_actor_cancelled ||
- e.code() == error_code_please_reboot_delete;
+ e.code() == error_code_please_reboot_delete || e.code() == error_code_local_config_changed;
endRole(Role::WORKER, interf.id(), "WorkerError", ok, e);
errorForwarders.clear(false);
sharedLogs.clear();
@@ -3546,6 +3584,7 @@ const Role Role::DATA_DISTRIBUTOR("DataDistributor", "DD");
const Role Role::RATEKEEPER("Ratekeeper", "RK");
const Role Role::BLOB_MANAGER("BlobManager", "BM");
const Role Role::BLOB_WORKER("BlobWorker", "BW");
+ const Role Role::BLOB_MIGRATOR("BlobMigrator", "MG");
const Role Role::STORAGE_CACHE("StorageCache", "SC");
const Role Role::COORDINATOR("Coordinator", "CD");
const Role Role::BACKUP("Backup", "BK");

Some files were not shown because too many files have changed in this diff.