Merge branch 'main' of github.com:apple/foundationdb into jfu-grv-cache-multi-threaded

Jon Fu 2022-03-16 14:32:33 -04:00
commit 9731d96797
111 changed files with 8190 additions and 1113 deletions

View File

@@ -93,11 +93,35 @@ if(NOT WIN32)
set(UNIT_TEST_VERSION_510_SRCS test/unit/unit_tests_version_510.cpp)
set(TRACE_PARTIAL_FILE_SUFFIX_TEST_SRCS test/unit/trace_partial_file_suffix_test.cpp)
set(DISCONNECTED_TIMEOUT_UNIT_TEST_SRCS
test/unit/disconnected_timeout_tests.cpp
test/unit/fdb_api.cpp
test/unit/fdb_api.hpp)
set(API_TESTER_SRCS
test/apitester/fdb_c_api_tester.cpp
test/apitester/TesterApiWorkload.cpp
test/apitester/TesterApiWorkload.h
test/apitester/TesterApiWrapper.cpp
test/apitester/TesterApiWrapper.h
test/apitester/TesterTestSpec.cpp
test/apitester/TesterTestSpec.h
test/apitester/TesterCancelTransactionWorkload.cpp
test/apitester/TesterCorrectnessWorkload.cpp
test/apitester/TesterKeyValueStore.cpp
test/apitester/TesterKeyValueStore.h
test/apitester/TesterOptions.h
test/apitester/TesterScheduler.cpp
test/apitester/TesterScheduler.h
test/apitester/TesterTransactionExecutor.cpp
test/apitester/TesterTransactionExecutor.h
test/apitester/TesterUtil.cpp
test/apitester/TesterUtil.h
test/apitester/TesterWorkload.cpp
test/apitester/TesterWorkload.h
../../flow/SimpleOpt.h
)
if(OPEN_FOR_IDE)
add_library(fdb_c_performance_test OBJECT test/performance_test.c test/test.h)
add_library(fdb_c_ryw_benchmark OBJECT test/ryw_benchmark.c test/test.h)
@@ -108,6 +132,7 @@ if(NOT WIN32)
add_library(fdb_c_unit_tests_version_510 OBJECT ${UNIT_TEST_VERSION_510_SRCS})
add_library(trace_partial_file_suffix_test OBJECT ${TRACE_PARTIAL_FILE_SUFFIX_TEST_SRCS})
add_library(disconnected_timeout_unit_tests OBJECT ${DISCONNECTED_TIMEOUT_UNIT_TEST_SRCS})
add_library(fdb_c_api_tester OBJECT ${API_TESTER_SRCS})
else()
add_executable(fdb_c_performance_test test/performance_test.c test/test.h)
add_executable(fdb_c_ryw_benchmark test/ryw_benchmark.c test/test.h)
@@ -118,6 +143,7 @@ if(NOT WIN32)
add_executable(fdb_c_unit_tests_version_510 ${UNIT_TEST_VERSION_510_SRCS})
add_executable(trace_partial_file_suffix_test ${TRACE_PARTIAL_FILE_SUFFIX_TEST_SRCS})
add_executable(disconnected_timeout_unit_tests ${DISCONNECTED_TIMEOUT_UNIT_TEST_SRCS})
add_executable(fdb_c_api_tester ${API_TESTER_SRCS})
strip_debug_symbols(fdb_c_performance_test)
strip_debug_symbols(fdb_c_ryw_benchmark)
strip_debug_symbols(fdb_c_txn_size_test)
@@ -140,6 +166,12 @@ if(NOT WIN32)
target_link_libraries(trace_partial_file_suffix_test PRIVATE fdb_c Threads::Threads flow)
target_link_libraries(disconnected_timeout_unit_tests PRIVATE fdb_c Threads::Threads)
if(USE_SANITIZER)
target_link_libraries(fdb_c_api_tester PRIVATE fdb_c toml11_target Threads::Threads fmt::fmt boost_asan)
else()
target_link_libraries(fdb_c_api_tester PRIVATE fdb_c toml11_target Threads::Threads fmt::fmt boost_target)
endif()
# do not set RPATH for mako
set_property(TARGET mako PROPERTY SKIP_BUILD_RPATH TRUE)
target_link_libraries(mako PRIVATE fdb_c fdbclient)
@@ -165,6 +197,7 @@ if(NOT WIN32)
add_custom_target(external_client DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so)
add_dependencies(fdb_c_unit_tests external_client)
add_dependencies(disconnected_timeout_unit_tests external_client)
add_dependencies(fdb_c_api_tester external_client)
add_fdbclient_test(
NAME fdb_c_setup_tests
@@ -202,6 +235,19 @@ if(NOT WIN32)
@CLUSTER_FILE@
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
)
add_fdbclient_test(
NAME fdb_c_api_tests
DISABLE_LOG_DUMP
COMMAND ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/run_c_api_tests.py
--cluster-file
@CLUSTER_FILE@
--tester-binary
$<TARGET_FILE:fdb_c_api_tester>
--external-client-library
${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-dir
${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests
)
endif()
set(c_workloads_srcs

View File

@@ -38,12 +38,14 @@ int g_api_version = 0;
* FDBFuture -> ThreadSingleAssignmentVarBase
* FDBResult -> ThreadSingleAssignmentVarBase
* FDBDatabase -> IDatabase
* FDBTenant -> ITenant
* FDBTransaction -> ITransaction
*/
#define TSAVB(f) ((ThreadSingleAssignmentVarBase*)(f))
#define TSAV(T, f) ((ThreadSingleAssignmentVar<T>*)(f))
#define DB(d) ((IDatabase*)d)
#define TENANT(t) ((ITenant*)t)
#define TXN(t) ((ITransaction*)t)
// Legacy (pre API version 610)
@@ -387,6 +389,14 @@ extern "C" DLLEXPORT void fdb_database_destroy(FDBDatabase* d) {
CATCH_AND_DIE(DB(d)->delref(););
}
extern "C" DLLEXPORT fdb_error_t fdb_database_open_tenant(FDBDatabase* d,
uint8_t const* tenant_name,
int tenant_name_length,
FDBTenant** out_tenant) {
CATCH_AND_RETURN(*out_tenant =
(FDBTenant*)DB(d)->openTenant(TenantNameRef(tenant_name, tenant_name_length)).extractPtr(););
}
extern "C" DLLEXPORT fdb_error_t fdb_database_create_transaction(FDBDatabase* d, FDBTransaction** out_transaction) {
CATCH_AND_RETURN(Reference<ITransaction> tr = DB(d)->createTransaction();
if (g_api_version <= 15) tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
@@ -448,6 +458,17 @@ extern "C" DLLEXPORT FDBFuture* fdb_database_get_server_protocol(FDBDatabase* db
}).extractPtr());
}
extern "C" DLLEXPORT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant, FDBTransaction** out_transaction) {
CATCH_AND_RETURN(*out_transaction = (FDBTransaction*)TENANT(tenant)->createTransaction().extractPtr(););
}
extern "C" DLLEXPORT void fdb_tenant_destroy(FDBTenant* tenant) {
try {
TENANT(tenant)->delref();
} catch (...) {
}
}
extern "C" DLLEXPORT void fdb_transaction_destroy(FDBTransaction* tr) {
try {
TXN(tr)->delref();

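The new tenant entry points compose in the usual open/use/destroy pattern of the C API. A minimal sketch, assuming an initialized network and an open FDBDatabase* db; the tenant name is hypothetical and error handling is elided:

FDBTenant* tenant = nullptr;
const char* name = "my-tenant"; // hypothetical tenant name
fdb_error_t err = fdb_database_open_tenant(db, (const uint8_t*)name, (int)strlen(name), &tenant);
FDBTransaction* tr = nullptr;
if (!err) err = fdb_tenant_create_transaction(tenant, &tr);
// ... use tr like any other transaction; operations are scoped to the tenant ...
fdb_transaction_destroy(tr);
fdb_tenant_destroy(tenant);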
View File

@@ -263,6 +263,11 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_database_set_option(FDBDatabase* d,
uint8_t const* value,
int value_length);
DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_database_open_tenant(FDBDatabase* d,
uint8_t const* tenant_name,
int tenant_name_length,
FDBTenant** out_tenant);
DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_database_create_transaction(FDBDatabase* d,
FDBTransaction** out_transaction);
@@ -286,6 +291,11 @@ DLLEXPORT WARN_UNUSED_RESULT double fdb_database_get_main_thread_busyness(FDBDat
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_get_server_protocol(FDBDatabase* db, uint64_t expected_version);
DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant,
FDBTransaction** out_transaction);
DLLEXPORT void fdb_tenant_destroy(FDBTenant* tenant);
DLLEXPORT void fdb_transaction_destroy(FDBTransaction* tr);
DLLEXPORT void fdb_transaction_cancel(FDBTransaction* tr);

View File

@@ -33,7 +33,9 @@ extern "C" {
/* Pointers to these opaque types represent objects in the FDB API */
typedef struct FDB_future FDBFuture;
typedef struct FDB_result FDBResult;
typedef struct FDB_cluster FDBCluster;
typedef struct FDB_database FDBDatabase;
typedef struct FDB_tenant FDBTenant;
typedef struct FDB_transaction FDBTransaction;
typedef int fdb_error_t;

View File

@@ -0,0 +1,129 @@
/*
* TesterApiWorkload.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "TesterApiWorkload.h"
#include "TesterUtil.h"
#include <fmt/format.h>
namespace FdbApiTester {
ApiWorkload::ApiWorkload(const WorkloadConfig& config) : WorkloadBase(config) {
minKeyLength = config.getIntOption("minKeyLength", 1);
maxKeyLength = config.getIntOption("maxKeyLength", 64);
minValueLength = config.getIntOption("minValueLength", 1);
maxValueLength = config.getIntOption("maxValueLength", 1000);
maxKeysPerTransaction = config.getIntOption("maxKeysPerTransaction", 50);
initialSize = config.getIntOption("initialSize", 1000);
readExistingKeysRatio = config.getFloatOption("readExistingKeysRatio", 0.9);
keyPrefix = fmt::format("{}/", workloadId);
}
void ApiWorkload::start() {
schedule([this]() {
// 1. Clear data
clearData([this]() {
// 2. Populate initial data
populateData([this]() {
// 3. Generate random workload
runTests();
});
});
});
}
std::string ApiWorkload::randomKeyName() {
return keyPrefix + Random::get().randomStringLowerCase(minKeyLength, maxKeyLength);
}
std::string ApiWorkload::randomValue() {
return Random::get().randomStringLowerCase(minValueLength, maxValueLength);
}
std::string ApiWorkload::randomNotExistingKey() {
while (true) {
std::string key = randomKeyName();
if (!store.exists(key)) {
return key;
}
}
}
std::string ApiWorkload::randomExistingKey() {
std::string genKey = randomKeyName();
std::string key = store.getKey(genKey, true, 1);
if (key != store.endKey()) {
return key;
}
key = store.getKey(genKey, true, 0);
if (key != store.startKey()) {
return key;
}
info("No existing key found, using a new random key.");
return genKey;
}
std::string ApiWorkload::randomKey(double existingKeyRatio) {
if (Random::get().randomBool(existingKeyRatio)) {
return randomExistingKey();
} else {
return randomNotExistingKey();
}
}
void ApiWorkload::populateDataTx(TTaskFct cont) {
int numKeys = maxKeysPerTransaction;
auto kvPairs = std::make_shared<std::vector<KeyValue>>();
for (int i = 0; i < numKeys; i++) {
kvPairs->push_back(KeyValue{ randomNotExistingKey(), randomValue() });
}
execTransaction(
[kvPairs](auto ctx) {
for (const KeyValue& kv : *kvPairs) {
ctx->tx()->set(kv.key, kv.value);
}
ctx->commit();
},
[this, kvPairs, cont]() {
for (const KeyValue& kv : *kvPairs) {
store.set(kv.key, kv.value);
}
schedule(cont);
});
}
void ApiWorkload::clearData(TTaskFct cont) {
execTransaction(
[this](auto ctx) {
ctx->tx()->clearRange(keyPrefix, fmt::format("{}\xff", keyPrefix));
ctx->commit();
},
[this, cont]() { schedule(cont); });
}
void ApiWorkload::populateData(TTaskFct cont) {
if (store.size() < initialSize) {
populateDataTx([this, cont]() { populateData(cont); });
} else {
info("Data population completed");
schedule(cont);
}
}
} // namespace FdbApiTester
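
A concrete workload only needs to override runTests(); the base class first clears and populates the data as shown in start() above. A minimal sketch under that assumption (the NoopReadWorkload name and its registration are hypothetical; execTransaction comes from WorkloadBase, NO_OP_TASK from TesterScheduler.h):

class NoopReadWorkload : public ApiWorkload {
public:
	NoopReadWorkload(const WorkloadConfig& config) : ApiWorkload(config) {}
	void runTests() override {
		std::string key = randomExistingKey();
		execTransaction(
		    [key](auto ctx) {
			    // issue a single read, then complete without a commit
			    ValueFuture f = ctx->tx()->get(key, false);
			    ctx->continueAfter(f, [ctx]() { ctx->done(); });
		    },
		    NO_OP_TASK);
	}
};
WorkloadFactory<NoopReadWorkload> NoopReadWorkloadFactory("NoopRead"); // hypothetical name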

View File

@@ -0,0 +1,89 @@
/*
* TesterApiWorkload.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef APITESTER_API_WORKLOAD_H
#define APITESTER_API_WORKLOAD_H
#include "TesterWorkload.h"
#include "TesterKeyValueStore.h"
namespace FdbApiTester {
/**
* Base class for implementing API testing workloads.
* Provides various helper methods and reusable configuration parameters
*/
class ApiWorkload : public WorkloadBase {
public:
void start() override;
// Method to be overridden to run specific tests
virtual void runTests() = 0;
protected:
// The minimum length of a key
int minKeyLength;
// The maximum length of a key
int maxKeyLength;
// The minimum length of a value
int minValueLength;
// The maximum length of a value
int maxValueLength;
// Maximum number of keys to be accessed by a transaction
int maxKeysPerTransaction;
// Initial data size (number of key-value pairs)
int initialSize;
// The ratio of reading existing keys
double readExistingKeysRatio;
// Key prefix
std::string keyPrefix;
// In-memory store maintaining expected database state
KeyValueStore store;
ApiWorkload(const WorkloadConfig& config);
// Methods for generating random keys and values
std::string randomKeyName();
std::string randomValue();
std::string randomNotExistingKey();
std::string randomExistingKey();
std::string randomKey(double existingKeyRatio);
// Generate initial random data for the workload
void populateData(TTaskFct cont);
// Clear the data of the workload
void clearData(TTaskFct cont);
private:
void populateDataTx(TTaskFct cont);
};
} // namespace FdbApiTester
#endif

View File

@@ -0,0 +1,124 @@
/*
* TesterApiWrapper.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "TesterApiWrapper.h"
#include "TesterUtil.h"
#include <cstdint>
#include <fmt/format.h>
namespace FdbApiTester {
namespace {
void fdb_check(fdb_error_t e) {
if (e) {
fmt::print(stderr, "Unexpected error: {}\n", fdb_get_error(e));
std::abort();
}
}
} // namespace
Future::Future(FDBFuture* f) : future_(f, fdb_future_destroy) {}
void Future::reset() {
future_.reset();
}
void Future::cancel() {
ASSERT(future_);
fdb_future_cancel(future_.get());
}
fdb_error_t Future::getError() const {
ASSERT(future_);
return fdb_future_get_error(future_.get());
}
std::optional<std::string> ValueFuture::getValue() const {
ASSERT(future_);
int out_present;
const std::uint8_t* val;
int vallen;
fdb_check(fdb_future_get_value(future_.get(), &out_present, &val, &vallen));
return out_present ? std::make_optional(std::string((const char*)val, vallen)) : std::nullopt;
}
// Given an FDBDatabase, initializes a new transaction.
Transaction::Transaction(FDBTransaction* tx) : tx_(tx, fdb_transaction_destroy) {}
ValueFuture Transaction::get(std::string_view key, fdb_bool_t snapshot) {
ASSERT(tx_);
return ValueFuture(fdb_transaction_get(tx_.get(), (const uint8_t*)key.data(), key.size(), snapshot));
}
void Transaction::set(std::string_view key, std::string_view value) {
ASSERT(tx_);
fdb_transaction_set(tx_.get(), (const uint8_t*)key.data(), key.size(), (const uint8_t*)value.data(), value.size());
}
void Transaction::clear(std::string_view key) {
ASSERT(tx_);
fdb_transaction_clear(tx_.get(), (const uint8_t*)key.data(), key.size());
}
void Transaction::clearRange(std::string_view begin, std::string_view end) {
ASSERT(tx_);
fdb_transaction_clear_range(
tx_.get(), (const uint8_t*)begin.data(), begin.size(), (const uint8_t*)end.data(), end.size());
}
Future Transaction::commit() {
ASSERT(tx_);
return Future(fdb_transaction_commit(tx_.get()));
}
void Transaction::cancel() {
ASSERT(tx_);
fdb_transaction_cancel(tx_.get());
}
Future Transaction::onError(fdb_error_t err) {
ASSERT(tx_);
return Future(fdb_transaction_on_error(tx_.get(), err));
}
void Transaction::reset() {
ASSERT(tx_);
fdb_transaction_reset(tx_.get());
}
fdb_error_t Transaction::setOption(FDBTransactionOption option) {
ASSERT(tx_);
return fdb_transaction_set_option(tx_.get(), option, reinterpret_cast<const uint8_t*>(""), 0);
}
fdb_error_t FdbApi::setOption(FDBNetworkOption option, std::string_view value) {
return fdb_network_set_option(option, reinterpret_cast<const uint8_t*>(value.data()), value.size());
}
fdb_error_t FdbApi::setOption(FDBNetworkOption option, int64_t value) {
return fdb_network_set_option(option, reinterpret_cast<const uint8_t*>(&value), sizeof(value));
}
fdb_error_t FdbApi::setOption(FDBNetworkOption option) {
return fdb_network_set_option(option, reinterpret_cast<const uint8_t*>(""), 0);
}
} // namespace FdbApiTester
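
These wrappers tie the raw C handles to shared_ptr deleters, so cleanup is automatic. A usage sketch, assuming a live Transaction tx and a hypothetical key:

ValueFuture f = tx.get("some-key", /*snapshot=*/false);
fdb_check(fdb_future_block_until_ready(f.fdbFuture()));
if (f.getError() == error_code_success) {
	std::optional<std::string> value = f.getValue();
}
// when f and tx go out of scope, fdb_future_destroy and
// fdb_transaction_destroy run automatically via the shared_ptr deleters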

View File

@@ -0,0 +1,92 @@
/*
* TesterApiWrapper.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifndef APITESTER_API_WRAPPER_H
#define APITESTER_API_WRAPPER_H
#include <string_view>
#include <optional>
#include <memory>
#define FDB_API_VERSION 710
#include "bindings/c/foundationdb/fdb_c.h"
#undef ERROR
#define ERROR(name, number, description) enum { error_code_##name = number };
#include "flow/error_definitions.h"
namespace FdbApiTester {
// Wrapper parent class to manage memory of an FDBFuture pointer. Cleans up
// FDBFuture when this instance goes out of scope.
class Future {
public:
Future() = default;
Future(FDBFuture* f);
FDBFuture* fdbFuture() { return future_.get(); };
fdb_error_t getError() const;
explicit operator bool() const { return future_ != nullptr; };
void reset();
void cancel();
protected:
std::shared_ptr<FDBFuture> future_;
};
class ValueFuture : public Future {
public:
ValueFuture() = default;
ValueFuture(FDBFuture* f) : Future(f) {}
std::optional<std::string> getValue() const;
};
class Transaction {
public:
Transaction() = default;
Transaction(FDBTransaction* tx);
ValueFuture get(std::string_view key, fdb_bool_t snapshot);
void set(std::string_view key, std::string_view value);
void clear(std::string_view key);
void clearRange(std::string_view begin, std::string_view end);
Future commit();
void cancel();
Future onError(fdb_error_t err);
void reset();
fdb_error_t setOption(FDBTransactionOption option);
private:
std::shared_ptr<FDBTransaction> tx_;
};
class FdbApi {
public:
static fdb_error_t setOption(FDBNetworkOption option, std::string_view value);
static fdb_error_t setOption(FDBNetworkOption option, int64_t value);
static fdb_error_t setOption(FDBNetworkOption option);
};
} // namespace FdbApiTester
#endif
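
FdbApi::setOption forwards to fdb_network_set_option with the encoding appropriate for each value type. A sketch (FDB_NET_OPTION_TRACE_ENABLE is a standard network option; the path is hypothetical):

fdb_error_t err = FdbApi::setOption(FDB_NET_OPTION_TRACE_ENABLE, std::string_view("/tmp/fdb-traces"));
if (err) {
	fmt::print(stderr, "Failed to enable tracing: {}\n", fdb_get_error(err));
}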

View File

@@ -0,0 +1,113 @@
/*
* TesterCancelTransactionWorkload.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "TesterApiWorkload.h"
#include "TesterUtil.h"
namespace FdbApiTester {
class CancelTransactionWorkload : public ApiWorkload {
public:
CancelTransactionWorkload(const WorkloadConfig& config) : ApiWorkload(config) {
numRandomOperations = config.getIntOption("numRandomOperations", 1000);
numOpLeft = numRandomOperations;
}
void runTests() override { randomOperations(); }
private:
enum OpType { OP_CANCEL_GET, OP_CANCEL_AFTER_FIRST_GET, OP_LAST = OP_CANCEL_AFTER_FIRST_GET };
// The number of operations to be executed
int numRandomOperations;
// Operations counter
int numOpLeft;
// Start multiple concurrent gets and cancel the transaction
void randomCancelGetTx(TTaskFct cont) {
int numKeys = Random::get().randomInt(1, maxKeysPerTransaction);
auto keys = std::make_shared<std::vector<std::string>>();
for (int i = 0; i < numKeys; i++) {
keys->push_back(randomKey(readExistingKeysRatio));
}
execTransaction(
[keys](auto ctx) {
std::vector<Future> futures;
for (const auto& key : *keys) {
futures.push_back(ctx->tx()->get(key, false));
}
ctx->done();
},
[this, cont]() { schedule(cont); });
}
// Start multiple concurrent gets and cancel the transaction after the first get returns
void randomCancelAfterFirstResTx(TTaskFct cont) {
int numKeys = Random::get().randomInt(1, maxKeysPerTransaction);
auto keys = std::make_shared<std::vector<std::string>>();
for (int i = 0; i < numKeys; i++) {
keys->push_back(randomKey(readExistingKeysRatio));
}
execTransaction(
[this, keys](auto ctx) {
std::vector<ValueFuture> futures;
for (const auto& key : *keys) {
futures.push_back(ctx->tx()->get(key, false));
}
for (int i = 0; i < keys->size(); i++) {
ValueFuture f = futures[i];
auto expectedVal = store.get((*keys)[i]);
ctx->continueAfter(f, [expectedVal, f, this, ctx]() {
auto val = f.getValue();
if (expectedVal != val) {
error(fmt::format(
"cancelAfterFirstResTx mismatch. expected: {:.80} actual: {:.80}", expectedVal, val));
}
ctx->done();
});
}
},
[this, cont]() { schedule(cont); });
}
void randomOperation(TTaskFct cont) {
OpType txType = (OpType)Random::get().randomInt(0, OP_LAST);
switch (txType) {
case OP_CANCEL_GET:
randomCancelGetTx(cont);
break;
case OP_CANCEL_AFTER_FIRST_GET:
randomCancelAfterFirstResTx(cont);
break;
}
}
void randomOperations() {
if (numOpLeft == 0)
return;
numOpLeft--;
randomOperation([this]() { randomOperations(); });
}
};
WorkloadFactory<CancelTransactionWorkload> MiscTestWorkloadFactory("CancelTransaction");
} // namespace FdbApiTester

View File

@@ -0,0 +1,227 @@
/*
* TesterCorrectnessWorkload.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "TesterApiWorkload.h"
#include "TesterUtil.h"
#include <memory>
#include <fmt/format.h>
namespace FdbApiTester {
class ApiCorrectnessWorkload : public ApiWorkload {
public:
ApiCorrectnessWorkload(const WorkloadConfig& config) : ApiWorkload(config) {
numRandomOperations = config.getIntOption("numRandomOperations", 1000);
numOpLeft = numRandomOperations;
}
void runTests() override { randomOperations(); }
private:
enum OpType { OP_INSERT, OP_GET, OP_CLEAR, OP_CLEAR_RANGE, OP_COMMIT_READ, OP_LAST = OP_COMMIT_READ };
// The number of operations to be executed
int numRandomOperations;
// Operations counter
int numOpLeft;
void randomInsertOp(TTaskFct cont) {
int numKeys = Random::get().randomInt(1, maxKeysPerTransaction);
auto kvPairs = std::make_shared<std::vector<KeyValue>>();
for (int i = 0; i < numKeys; i++) {
kvPairs->push_back(KeyValue{ randomNotExistingKey(), randomValue() });
}
execTransaction(
[kvPairs](auto ctx) {
for (const KeyValue& kv : *kvPairs) {
ctx->tx()->set(kv.key, kv.value);
}
ctx->commit();
},
[this, kvPairs, cont]() {
for (const KeyValue& kv : *kvPairs) {
store.set(kv.key, kv.value);
}
schedule(cont);
});
}
void randomCommitReadOp(TTaskFct cont) {
int numKeys = Random::get().randomInt(1, maxKeysPerTransaction);
auto kvPairs = std::make_shared<std::vector<KeyValue>>();
for (int i = 0; i < numKeys; i++) {
kvPairs->push_back(KeyValue{ randomKey(readExistingKeysRatio), randomValue() });
}
execTransaction(
[kvPairs](auto ctx) {
for (const KeyValue& kv : *kvPairs) {
ctx->tx()->set(kv.key, kv.value);
}
ctx->commit();
},
[this, kvPairs, cont]() {
for (const KeyValue& kv : *kvPairs) {
store.set(kv.key, kv.value);
}
auto results = std::make_shared<std::vector<std::optional<std::string>>>();
execTransaction(
[kvPairs, results](auto ctx) {
// TODO: Enable after merging with GRV caching
// ctx->tx()->setOption(FDB_TR_OPTION_USE_GRV_CACHE);
auto futures = std::make_shared<std::vector<Future>>();
for (const auto& kv : *kvPairs) {
futures->push_back(ctx->tx()->get(kv.key, false));
}
ctx->continueAfterAll(*futures, [ctx, futures, results]() {
results->clear();
for (auto& f : *futures) {
results->push_back(((ValueFuture&)f).getValue());
}
ASSERT(results->size() == futures->size());
ctx->done();
});
},
[this, kvPairs, results, cont]() {
ASSERT(results->size() == kvPairs->size());
for (int i = 0; i < kvPairs->size(); i++) {
auto expected = store.get((*kvPairs)[i].key);
auto actual = (*results)[i];
if (actual != expected) {
error(
fmt::format("randomCommitReadOp mismatch. key: {} expected: {:.80} actual: {:.80}",
(*kvPairs)[i].key,
expected,
actual));
ASSERT(false);
}
}
schedule(cont);
});
});
}
void randomGetOp(TTaskFct cont) {
int numKeys = Random::get().randomInt(1, maxKeysPerTransaction);
auto keys = std::make_shared<std::vector<std::string>>();
auto results = std::make_shared<std::vector<std::optional<std::string>>>();
for (int i = 0; i < numKeys; i++) {
keys->push_back(randomKey(readExistingKeysRatio));
}
execTransaction(
[keys, results](auto ctx) {
auto futures = std::make_shared<std::vector<Future>>();
for (const auto& key : *keys) {
futures->push_back(ctx->tx()->get(key, false));
}
ctx->continueAfterAll(*futures, [ctx, futures, results]() {
results->clear();
for (auto& f : *futures) {
results->push_back(((ValueFuture&)f).getValue());
}
ASSERT(results->size() == futures->size());
ctx->done();
});
},
[this, keys, results, cont]() {
ASSERT(results->size() == keys->size());
for (int i = 0; i < keys->size(); i++) {
auto expected = store.get((*keys)[i]);
if ((*results)[i] != expected) {
error(fmt::format("randomGetOp mismatch. key: {} expected: {:.80} actual: {:.80}",
(*keys)[i],
expected,
(*results)[i]));
}
}
schedule(cont);
});
}
void randomClearOp(TTaskFct cont) {
int numKeys = Random::get().randomInt(1, maxKeysPerTransaction);
auto keys = std::make_shared<std::vector<std::string>>();
for (int i = 0; i < numKeys; i++) {
keys->push_back(randomExistingKey());
}
execTransaction(
[keys](auto ctx) {
for (const auto& key : *keys) {
ctx->tx()->clear(key);
}
ctx->commit();
},
[this, keys, cont]() {
for (const auto& key : *keys) {
store.clear(key);
}
schedule(cont);
});
}
void randomClearRangeOp(TTaskFct cont) {
std::string begin = randomKeyName();
std::string end = randomKeyName();
if (begin > end) {
std::swap(begin, end);
}
execTransaction(
[begin, end](auto ctx) {
ctx->tx()->clearRange(begin, end);
ctx->commit();
},
[this, begin, end, cont]() {
store.clear(begin, end);
schedule(cont);
});
}
void randomOperation(TTaskFct cont) {
OpType txType = (store.size() == 0) ? OP_INSERT : (OpType)Random::get().randomInt(0, OP_LAST);
switch (txType) {
case OP_INSERT:
randomInsertOp(cont);
break;
case OP_GET:
randomGetOp(cont);
break;
case OP_CLEAR:
randomClearOp(cont);
break;
case OP_CLEAR_RANGE:
randomClearRangeOp(cont);
break;
case OP_COMMIT_READ:
randomCommitReadOp(cont);
break;
}
}
void randomOperations() {
if (numOpLeft == 0)
return;
numOpLeft--;
randomOperation([this]() { randomOperations(); });
}
};
WorkloadFactory<ApiCorrectnessWorkload> ApiCorrectnessWorkloadFactory("ApiCorrectness");
} // namespace FdbApiTester

View File

@@ -0,0 +1,167 @@
/*
* TesterKeyValueStore.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "TesterKeyValueStore.h"
namespace FdbApiTester {
// Get the value associated with a key
std::optional<std::string> KeyValueStore::get(std::string_view key) const {
std::unique_lock<std::mutex> lock(mutex);
auto value = store.find(std::string(key));
if (value != store.end())
return value->second;
else
return std::optional<std::string>();
}
// Checks if the key exists
bool KeyValueStore::exists(std::string_view key) {
std::unique_lock<std::mutex> lock(mutex);
return (store.find(std::string(key)) != store.end());
}
// Returns the key designated by a key selector
std::string KeyValueStore::getKey(std::string_view keyName, bool orEqual, int offset) const {
std::unique_lock<std::mutex> lock(mutex);
// Begin by getting the start key referenced by the key selector
std::map<std::string, std::string>::const_iterator mapItr = store.lower_bound(keyName);
// Update the iterator position if necessary based on the value of orEqual
int count = 0;
if (offset <= 0) {
if (mapItr == store.end() || keyName != mapItr->first || !orEqual) {
if (mapItr == store.begin())
return startKey();
mapItr--;
}
} else {
if (mapItr == store.end())
return endKey();
if (keyName == mapItr->first && orEqual) {
mapItr++;
}
count++;
}
// Increment the map iterator until the desired offset is reached
for (; count < abs(offset); count++) {
if (offset < 0) {
if (mapItr == store.begin())
break;
mapItr--;
} else {
if (mapItr == store.end())
break;
mapItr++;
}
}
if (mapItr == store.end())
return endKey();
else if (count == abs(offset))
return mapItr->first;
else
return startKey();
}
// Gets a range of key-value pairs, returning a maximum of <limit> results
std::vector<KeyValue> KeyValueStore::getRange(std::string_view begin,
std::string_view end,
int limit,
bool reverse) const {
std::unique_lock<std::mutex> lock(mutex);
std::vector<KeyValue> results;
if (!reverse) {
std::map<std::string, std::string>::const_iterator mapItr = store.lower_bound(begin);
for (; mapItr != store.end() && mapItr->first < end && results.size() < limit; mapItr++)
results.push_back(KeyValue{ mapItr->first, mapItr->second });
}
// Reverse getRange queries are supported, but not tested at this time, because reverse range
// queries have been disallowed by the database at the API level
else {
std::map<std::string, std::string>::const_iterator mapItr = store.lower_bound(end);
if (mapItr == store.begin())
return results;
for (--mapItr; mapItr->first >= begin && results.size() < abs(limit); mapItr--) {
results.push_back(KeyValue{ mapItr->first, mapItr->second });
if (mapItr == store.begin())
break;
}
}
return results;
}
// Stores a key-value pair in the database
void KeyValueStore::set(std::string_view key, std::string_view value) {
std::unique_lock<std::mutex> lock(mutex);
store[std::string(key)] = value;
}
// Removes a key from the database
void KeyValueStore::clear(std::string_view key) {
std::unique_lock<std::mutex> lock(mutex);
auto iter = store.find(key);
if (iter != store.end()) {
store.erase(iter);
}
}
// Removes a range of keys from the database
void KeyValueStore::clear(std::string_view begin, std::string_view end) {
std::unique_lock<std::mutex> lock(mutex);
store.erase(store.lower_bound(begin), store.lower_bound(end));
}
// The number of keys in the database
uint64_t KeyValueStore::size() const {
std::unique_lock<std::mutex> lock(mutex);
return store.size();
}
// The first key in the database; returned by key selectors that choose a key off the front
std::string KeyValueStore::startKey() const {
return "";
}
// The last key in the database; returned by key selectors that choose a key off the back
std::string KeyValueStore::endKey() const {
return "\xff";
}
// Debugging function that prints all key-value pairs
void KeyValueStore::printContents() const {
std::unique_lock<std::mutex> lock(mutex);
printf("Contents:\n");
std::map<std::string, std::string>::const_iterator mapItr;
for (mapItr = store.begin(); mapItr != store.end(); mapItr++)
printf("%s\n", mapItr->first.c_str());
}
} // namespace FdbApiTester
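
getKey mimics FDB key-selector resolution. An illustration with hypothetical store contents, following the logic above:

KeyValueStore kv;
kv.set("a", "1");
kv.set("b", "2");
kv.set("c", "3");
kv.getKey("b", /*orEqual=*/true, /*offset=*/1);  // "c": the key right after "b"
kv.getKey("b", /*orEqual=*/true, /*offset=*/0);  // "b": the key itself
kv.getKey("b", /*orEqual=*/false, /*offset=*/0); // "a": the last key before "b"
kv.getKey("x", /*orEqual=*/true, /*offset=*/1);  // "\xff" (endKey): nothing at or after "x"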

View File

@@ -0,0 +1,83 @@
/*
* TesterKeyValueStore.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifndef APITESTER_KEY_VALUE_STORE_H
#define APITESTER_KEY_VALUE_STORE_H
#include <map>
#include <optional>
#include <string>
#include <string_view>
#include <vector>
#include <mutex>
namespace FdbApiTester {
struct KeyValue {
std::string key;
std::string value;
};
class KeyValueStore {
public:
// Get the value associated with a key
std::optional<std::string> get(std::string_view key) const;
// Checks if the key exists
bool exists(std::string_view key);
// Returns the key designated by a key selector
std::string getKey(std::string_view keyName, bool orEqual, int offset) const;
// Gets a range of key-value pairs, returning a maximum of <limit> results
std::vector<KeyValue> getRange(std::string_view begin, std::string_view end, int limit, bool reverse) const;
// Stores a key-value pair in the database
void set(std::string_view key, std::string_view value);
// Removes a key from the database
void clear(std::string_view key);
// Removes a range of keys from the database
void clear(std::string_view begin, std::string_view end);
// The number of keys in the database
uint64_t size() const;
// The first key in the database; returned by key selectors that choose a key off the front
std::string startKey() const;
// The last key in the database; returned by key selectors that choose a key off the back
std::string endKey() const;
// Debugging function that prints all key-value pairs
void printContents() const;
private:
// A map holding the key-value pairs
std::map<std::string, std::string, std::less<>> store;
mutable std::mutex mutex;
};
} // namespace FdbApiTester
#endif

View File

@@ -0,0 +1,49 @@
/*
* TesterOptions.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifndef APITESTER_TESTER_OPTIONS_H
#define APITESTER_TESTER_OPTIONS_H
#include "TesterTestSpec.h"
namespace FdbApiTester {
class TesterOptions {
public:
std::string clusterFile;
bool trace = false;
std::string traceDir;
std::string traceFormat;
std::string logGroup;
std::string externalClientLibrary;
std::string testFile;
int numFdbThreads;
int numClientThreads;
int numDatabases;
int numClients;
std::vector<std::pair<std::string, std::string>> knobs;
TestSpec testSpec;
};
} // namespace FdbApiTester
#endif

View File

@@ -0,0 +1,67 @@
/*
* TesterScheduler.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "TesterScheduler.h"
#include "TesterUtil.h"
#include <memory>
#include <thread>
#include <boost/asio.hpp>
using namespace boost::asio;
namespace FdbApiTester {
const TTaskFct NO_OP_TASK = []() {};
class AsioScheduler : public IScheduler {
public:
AsioScheduler(int numThreads) : numThreads(numThreads) {}
void start() override {
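// Keep a tracked work guard so io_ctx.run() does not return while tasks may still be scheduled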
work = require(io_ctx.get_executor(), execution::outstanding_work.tracked);
for (int i = 0; i < numThreads; i++) {
threads.emplace_back([this]() { io_ctx.run(); });
}
}
void schedule(TTaskFct task) override { post(io_ctx, task); }
void stop() override { work = any_io_executor(); }
void join() override {
for (auto& th : threads) {
th.join();
}
}
private:
int numThreads;
std::vector<std::thread> threads;
io_context io_ctx;
any_io_executor work;
};
std::unique_ptr<IScheduler> createScheduler(int numThreads) {
ASSERT(numThreads > 0 && numThreads <= 1000);
return std::make_unique<AsioScheduler>(numThreads);
}
} // namespace FdbApiTester
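
The typical scheduler lifecycle, as a sketch:

auto scheduler = createScheduler(4); // 4 worker threads
scheduler->start();
scheduler->schedule([]() { /* some task */ });
// ... later, during shutdown:
scheduler->stop(); // release the work guard; running tasks finish
scheduler->join(); // wait for the worker threads to exit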

View File

@@ -0,0 +1,60 @@
/*
* TesterScheduler.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifndef APITESTER_SCHEDULER_H
#define APITESTER_SCHEDULER_H
#include <functional>
#include <memory>
namespace FdbApiTester {
using TTaskFct = std::function<void(void)>;
extern const TTaskFct NO_OP_TASK;
/**
* Scheduler for asynchronous execution of tasks on a pool of threads
*/
class IScheduler {
public:
virtual ~IScheduler() {}
// Create scheduler threads and begin accepting tasks
virtual void start() = 0;
// Schedule a task for asynchronous execution
virtual void schedule(TTaskFct task) = 0;
// Gracefully stop the scheduler. Waits for already running tasks to finish
virtual void stop() = 0;
// Join with all threads of the scheduler
virtual void join() = 0;
};
// Create a scheduler with the given number of threads
std::unique_ptr<IScheduler> createScheduler(int numThreads);
} // namespace FdbApiTester
#endif

View File

@@ -0,0 +1,169 @@
/*
* TesterTestSpec.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "TesterTestSpec.h"
#include "TesterUtil.h"
#include <toml.hpp>
#include <fmt/format.h>
#include <functional>
namespace FdbApiTester {
namespace {
void processIntOption(const std::string& value, const std::string& optionName, int& res, int minVal, int maxVal) {
char* endptr;
res = strtol(value.c_str(), &endptr, 10);
if (*endptr != '\0') {
throw TesterError(fmt::format("Invalid test file. Invalid value {} for {}", value, optionName));
}
if (res < minVal || res > maxVal) {
throw TesterError(
fmt::format("Invalid test file. Value for {} must be between {} and {}", optionName, minVal, maxVal));
}
}
std::unordered_map<std::string, std::function<void(const std::string& value, TestSpec* spec)>> testSpecTestKeys = {
{ "title",
[](const std::string& value, TestSpec* spec) { //
spec->title = value;
} },
{ "apiVersion",
[](const std::string& value, TestSpec* spec) { //
processIntOption(value, "apiVersion", spec->apiVersion, 700, 710);
} },
{ "blockOnFutures",
[](const std::string& value, TestSpec* spec) { //
spec->blockOnFutures = (value == "true");
} },
{ "buggify",
[](const std::string& value, TestSpec* spec) { //
spec->buggify = (value == "true");
} },
{ "multiThreaded",
[](const std::string& value, TestSpec* spec) { //
spec->multiThreaded = (value == "true");
} },
{ "fdbCallbacksOnExternalThreads",
[](const std::string& value, TestSpec* spec) { //
spec->fdbCallbacksOnExternalThreads = (value == "true");
} },
{ "databasePerTransaction",
[](const std::string& value, TestSpec* spec) { //
spec->databasePerTransaction = (value == "true");
} },
{ "minFdbThreads",
[](const std::string& value, TestSpec* spec) { //
processIntOption(value, "minFdbThreads", spec->minFdbThreads, 1, 1000);
} },
{ "maxFdbThreads",
[](const std::string& value, TestSpec* spec) { //
processIntOption(value, "maxFdbThreads", spec->maxFdbThreads, 1, 1000);
} },
{ "minClientThreads",
[](const std::string& value, TestSpec* spec) { //
processIntOption(value, "minClientThreads", spec->minClientThreads, 1, 1000);
} },
{ "maxClientThreads",
[](const std::string& value, TestSpec* spec) { //
processIntOption(value, "maxClientThreads", spec->maxClientThreads, 1, 1000);
} },
{ "minDatabases",
[](const std::string& value, TestSpec* spec) { //
processIntOption(value, "minDatabases", spec->minDatabases, 1, 1000);
} },
{ "maxDatabases",
[](const std::string& value, TestSpec* spec) { //
processIntOption(value, "maxDatabases", spec->maxDatabases, 1, 1000);
} },
{ "minClients",
[](const std::string& value, TestSpec* spec) { //
processIntOption(value, "minClients", spec->minClients, 1, 1000);
} },
{ "maxClients",
[](const std::string& value, TestSpec* spec) { //
processIntOption(value, "maxClients", spec->maxClients, 1, 1000);
} }
};
template <typename T>
std::string toml_to_string(const T& value) {
// TOML formatting renders numbers exactly as they appear in the file, so the
// formatted value matches the test spec. Strings, however, are quoted, so we
// must remove the quotes.
if (value.type() == toml::value_t::string) {
const std::string& formatted = toml::format(value);
return formatted.substr(1, formatted.size() - 2);
} else {
return toml::format(value);
}
}
} // namespace
TestSpec readTomlTestSpec(std::string fileName) {
TestSpec spec;
WorkloadSpec workloadSpec;
const toml::value& conf = toml::parse(fileName);
// Then parse each test
const toml::array& tests = toml::find(conf, "test").as_array();
if (tests.size() == 0) {
throw TesterError("Invalid test file. No [test] section found");
} else if (tests.size() > 1) {
throw TesterError("Invalid test file. More than one [test] section found");
}
const toml::value& test = tests[0];
// First handle all test-level settings
for (const auto& [k, v] : test.as_table()) {
if (k == "workload") {
continue;
}
if (testSpecTestKeys.find(k) != testSpecTestKeys.end()) {
testSpecTestKeys[k](toml_to_string(v), &spec);
} else {
throw TesterError(fmt::format(
"Invalid test file. Unrecognized test parameter. Name: {}, value {}", k, toml_to_string(v)));
}
}
// And then copy the workload attributes to spec.options
const toml::array& workloads = toml::find(test, "workload").as_array();
for (const toml::value& workload : workloads) {
workloadSpec = WorkloadSpec();
auto& options = workloadSpec.options;
for (const auto& [attrib, v] : workload.as_table()) {
options[attrib] = toml_to_string(v);
}
auto itr = options.find("name");
if (itr == options.end()) {
throw TesterError("Invalid test file. Unspecified workload name.");
}
workloadSpec.name = itr->second;
spec.workloads.push_back(workloadSpec);
}
return spec;
}
} // namespace FdbApiTester
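
Loading a spec is a single call; the parser validates value ranges and rejects unknown keys. A usage sketch (the file name is hypothetical):

TestSpec spec = readTomlTestSpec("tests/sample.toml");
for (const WorkloadSpec& workload : spec.workloads) {
	fmt::print("workload: {}\n", workload.name);
}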

View File

@@ -0,0 +1,90 @@
/*
* TesterTestSpec.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifndef APITESTER_CONFIG_READER_H
#define APITESTER_CONFIG_READER_H
#include <string>
#include <unordered_map>
#include <vector>
#define FDB_API_VERSION 710
namespace FdbApiTester {
/// Workload specification
struct WorkloadSpec {
std::string name;
std::unordered_map<std::string, std::string> options;
};
// Test specification loaded from a *.toml file
struct TestSpec {
// Title of the test
std::string title;
// FDB API version, using the latest version by default
int apiVersion = FDB_API_VERSION;
// Use blocking waits on futures instead of scheduling callbacks
bool blockOnFutures = false;
// Use multi-threaded FDB client
bool multiThreaded = false;
// Enable injection of errors in FDB client
bool buggify = false;
// Execute future callbacks on the threads of the external FDB library
// rather than on the main thread of the local FDB client library
bool fdbCallbacksOnExternalThreads = false;
// Execute each transaction in a separate database instance
bool databasePerTransaction = false;
// Size of the FDB client thread pool (a random number in the [min,max] range)
int minFdbThreads = 1;
int maxFdbThreads = 1;
// Size of the thread pool for test workloads (a random number in the [min,max] range)
int minClientThreads = 1;
int maxClientThreads = 1;
// Size of the database instance pool (a random number in the [min,max] range)
// Each transaction is assigned randomly to one of the databases in the pool
int minDatabases = 1;
int maxDatabases = 1;
// Number of workload clients (a random number in the [min,max] range)
int minClients = 1;
int maxClients = 10;
// List of workloads with their options
std::vector<WorkloadSpec> workloads;
};
// Read the test specification from a *.toml file
TestSpec readTomlTestSpec(std::string fileName);
} // namespace FdbApiTester
#endif

View File

@@ -0,0 +1,471 @@
/*
* TesterTransactionExecutor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "TesterTransactionExecutor.h"
#include "TesterUtil.h"
#include "test/apitester/TesterScheduler.h"
#include <memory>
#include <unordered_map>
#include <mutex>
#include <atomic>
#include <chrono>
#include <thread>
#include <fmt/format.h>
namespace FdbApiTester {
void TransactionActorBase::complete(fdb_error_t err) {
error = err;
context = {};
}
void ITransactionContext::continueAfterAll(std::vector<Future> futures, TTaskFct cont) {
auto counter = std::make_shared<std::atomic<int>>(futures.size());
auto errorCode = std::make_shared<std::atomic<fdb_error_t>>(error_code_success);
auto thisPtr = shared_from_this();
for (auto& f : futures) {
continueAfter(
f,
[thisPtr, f, counter, errorCode, cont]() {
if (f.getError() != error_code_success) {
(*errorCode) = f.getError();
}
if (--(*counter) == 0) {
if (*errorCode == error_code_success) {
// all futures successful -> continue
cont();
} else {
// at least one future failed -> retry the transaction
thisPtr->onError(*errorCode);
}
}
},
false);
}
}
/**
* Transaction context base class, containing reusable functionality
*/
class TransactionContextBase : public ITransactionContext {
public:
TransactionContextBase(FDBTransaction* tx,
std::shared_ptr<ITransactionActor> txActor,
TTaskFct cont,
IScheduler* scheduler)
: fdbTx(tx), txActor(txActor), contAfterDone(cont), scheduler(scheduler), txState(TxState::IN_PROGRESS) {}
// A state machine:
// IN_PROGRESS -> (ON_ERROR -> IN_PROGRESS)* [-> ON_ERROR] -> DONE
enum class TxState { IN_PROGRESS, ON_ERROR, DONE };
Transaction* tx() override { return &fdbTx; }
// Set a continuation to be executed when a future gets ready
void continueAfter(Future f, TTaskFct cont, bool retryOnError) override { doContinueAfter(f, cont, retryOnError); }
// Complete the transaction with a commit
void commit() override {
std::unique_lock<std::mutex> lock(mutex);
if (txState != TxState::IN_PROGRESS) {
return;
}
lock.unlock();
Future f = fdbTx.commit();
auto thisRef = shared_from_this();
doContinueAfter(
f, [thisRef]() { thisRef->done(); }, true);
}
// Complete the transaction without a commit (for read transactions)
void done() override {
std::unique_lock<std::mutex> lock(mutex);
if (txState != TxState::IN_PROGRESS) {
return;
}
txState = TxState::DONE;
lock.unlock();
// cancel transaction so that any pending operations on it
// fail gracefully
fdbTx.cancel();
txActor->complete(error_code_success);
cleanUp();
contAfterDone();
}
protected:
virtual void doContinueAfter(Future f, TTaskFct cont, bool retryOnError) = 0;
// Clean up transaction state after completing the transaction
// Note that the object may live longer, because it is still referenced
// by callbacks that have not yet been triggered
virtual void cleanUp() {
ASSERT(txState == TxState::DONE);
ASSERT(!onErrorFuture);
txActor = {};
}
// Complete the transaction with an (unretriable) error
void transactionFailed(fdb_error_t err) {
ASSERT(err != error_code_success);
std::unique_lock<std::mutex> lock(mutex);
if (txState == TxState::DONE) {
return;
}
txState = TxState::DONE;
lock.unlock();
txActor->complete(err);
cleanUp();
contAfterDone();
}
// Handle the result of a transaction onError call
void handleOnErrorResult() {
ASSERT(txState == TxState::ON_ERROR);
fdb_error_t err = onErrorFuture.getError();
onErrorFuture = {};
if (err) {
transactionFailed(err);
} else {
std::unique_lock<std::mutex> lock(mutex);
txState = TxState::IN_PROGRESS;
lock.unlock();
txActor->start();
}
}
// FDB transaction
Transaction fdbTx;
// Actor implementing the transaction workflow
std::shared_ptr<ITransactionActor> txActor;
// Mutex protecting access to shared mutable state
std::mutex mutex;
// Continuation to be called after completion of the transaction
TTaskFct contAfterDone;
// Reference to the scheduler
IScheduler* scheduler;
// Transaction execution state
TxState txState;
// onError future used in ON_ERROR state
Future onErrorFuture;
};
/**
* Transaction context using blocking waits to implement continuations on futures
*/
class BlockingTransactionContext : public TransactionContextBase {
public:
BlockingTransactionContext(FDBTransaction* tx,
std::shared_ptr<ITransactionActor> txActor,
TTaskFct cont,
IScheduler* scheduler)
: TransactionContextBase(tx, txActor, cont, scheduler) {}
protected:
void doContinueAfter(Future f, TTaskFct cont, bool retryOnError) override {
auto thisRef = std::static_pointer_cast<BlockingTransactionContext>(shared_from_this());
scheduler->schedule(
[thisRef, f, cont, retryOnError]() mutable { thisRef->blockingContinueAfter(f, cont, retryOnError); });
}
void blockingContinueAfter(Future f, TTaskFct cont, bool retryOnError) {
std::unique_lock<std::mutex> lock(mutex);
if (txState != TxState::IN_PROGRESS) {
return;
}
lock.unlock();
fdb_error_t err = fdb_future_block_until_ready(f.fdbFuture());
if (err) {
transactionFailed(err);
return;
}
err = f.getError();
if (err == error_code_transaction_cancelled) {
return;
}
if (err == error_code_success || !retryOnError) {
scheduler->schedule([cont]() { cont(); });
return;
}
onError(err);
}
virtual void onError(fdb_error_t err) override {
std::unique_lock<std::mutex> lock(mutex);
if (txState != TxState::IN_PROGRESS) {
// Ignore further errors if the transaction is already in error handling mode or completed
return;
}
txState = TxState::ON_ERROR;
lock.unlock();
ASSERT(!onErrorFuture);
onErrorFuture = fdbTx.onError(err);
fdb_error_t err2 = fdb_future_block_until_ready(onErrorFuture.fdbFuture());
if (err2) {
transactionFailed(err2);
return;
}
auto thisRef = std::static_pointer_cast<BlockingTransactionContext>(shared_from_this());
scheduler->schedule([thisRef]() { thisRef->handleOnErrorResult(); });
}
};
/**
* Transaction context using callbacks to implement continuations on futures
*/
class AsyncTransactionContext : public TransactionContextBase {
public:
AsyncTransactionContext(FDBTransaction* tx,
std::shared_ptr<ITransactionActor> txActor,
TTaskFct cont,
IScheduler* scheduler)
: TransactionContextBase(tx, txActor, cont, scheduler) {}
protected:
void doContinueAfter(Future f, TTaskFct cont, bool retryOnError) override {
std::unique_lock<std::mutex> lock(mutex);
if (txState != TxState::IN_PROGRESS) {
return;
}
callbackMap[f.fdbFuture()] = CallbackInfo{ f, cont, shared_from_this(), retryOnError };
lock.unlock();
fdb_error_t err = fdb_future_set_callback(f.fdbFuture(), futureReadyCallback, this);
if (err) {
lock.lock();
callbackMap.erase(f.fdbFuture());
lock.unlock();
transactionFailed(err);
}
}
static void futureReadyCallback(FDBFuture* f, void* param) {
AsyncTransactionContext* txCtx = (AsyncTransactionContext*)param;
txCtx->onFutureReady(f);
}
void onFutureReady(FDBFuture* f) {
injectRandomSleep();
// Hold a reference to this object to prevent it from being
// destroyed before the mutex is released
auto thisRef = shared_from_this();
std::unique_lock<std::mutex> lock(mutex);
auto iter = callbackMap.find(f);
ASSERT(iter != callbackMap.end());
CallbackInfo cbInfo = iter->second;
callbackMap.erase(iter);
if (txState != TxState::IN_PROGRESS) {
return;
}
lock.unlock();
fdb_error_t err = fdb_future_get_error(f);
if (err == error_code_transaction_cancelled) {
return;
}
if (err == error_code_success || !cbInfo.retryOnError) {
scheduler->schedule(cbInfo.cont);
return;
}
onError(err);
}
virtual void onError(fdb_error_t err) override {
std::unique_lock<std::mutex> lock(mutex);
if (txState != TxState::IN_PROGRESS) {
// Ignore further errors if the transaction is already in error handling mode or completed
return;
}
txState = TxState::ON_ERROR;
lock.unlock();
ASSERT(!onErrorFuture);
onErrorFuture = tx()->onError(err);
onErrorThisRef = std::static_pointer_cast<AsyncTransactionContext>(shared_from_this());
fdb_error_t err2 = fdb_future_set_callback(onErrorFuture.fdbFuture(), onErrorReadyCallback, this);
if (err2) {
onErrorFuture = {};
transactionFailed(err2);
}
}
static void onErrorReadyCallback(FDBFuture* f, void* param) {
AsyncTransactionContext* txCtx = (AsyncTransactionContext*)param;
txCtx->onErrorReady(f);
}
void onErrorReady(FDBFuture* f) {
injectRandomSleep();
auto thisRef = onErrorThisRef;
onErrorThisRef = {};
scheduler->schedule([thisRef]() { thisRef->handleOnErrorResult(); });
}
void cleanUp() override {
TransactionContextBase::cleanUp();
// Cancel all pending operations
// Note that the callbacks of the cancelled futures will still be called
std::unique_lock<std::mutex> lock(mutex);
std::vector<Future> futures;
for (auto& iter : callbackMap) {
futures.push_back(iter.second.future);
}
lock.unlock();
for (auto& f : futures) {
f.cancel();
}
}
// Inject a random sleep with a low probability
void injectRandomSleep() {
if (Random::get().randomBool(0.01)) {
std::this_thread::sleep_for(std::chrono::milliseconds(Random::get().randomInt(1, 5)));
}
}
// Object references for a future callback
struct CallbackInfo {
Future future;
TTaskFct cont;
std::shared_ptr<ITransactionContext> thisRef;
bool retryOnError;
};
// Map for keeping track of future waits and holding necessary object references
std::unordered_map<FDBFuture*, CallbackInfo> callbackMap;
// Holds a reference to this context for the C callback of the onError future
std::shared_ptr<AsyncTransactionContext> onErrorThisRef;
};
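// Lifecycle note: every pending future keeps a CallbackInfo entry in
// callbackMap, including a shared_ptr to this context, so the context cannot
// be destroyed while an FDB callback may still fire; cleanUp() cancels any
// futures that are still pending when the transaction completes.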
/**
* Transaction executor base class, containing reusable functionality
*/
class TransactionExecutorBase : public ITransactionExecutor {
public:
TransactionExecutorBase(const TransactionExecutorOptions& options) : options(options), scheduler(nullptr) {}
void init(IScheduler* scheduler, const char* clusterFile) override {
this->scheduler = scheduler;
this->clusterFile = clusterFile;
}
protected:
// Execute the transaction on the given database instance
void executeOnDatabase(FDBDatabase* db, std::shared_ptr<ITransactionActor> txActor, TTaskFct cont) {
FDBTransaction* tx;
fdb_error_t err = fdb_database_create_transaction(db, &tx);
if (err != error_code_success) {
txActor->complete(err);
cont();
} else {
std::shared_ptr<ITransactionContext> ctx;
if (options.blockOnFutures) {
ctx = std::make_shared<BlockingTransactionContext>(tx, txActor, cont, scheduler);
} else {
ctx = std::make_shared<AsyncTransactionContext>(tx, txActor, cont, scheduler);
}
txActor->init(ctx);
txActor->start();
}
}
protected:
TransactionExecutorOptions options;
std::string clusterFile;
IScheduler* scheduler;
};
/**
* Transaction executor load balancing transactions over a fixed pool of databases
*/
class DBPoolTransactionExecutor : public TransactionExecutorBase {
public:
DBPoolTransactionExecutor(const TransactionExecutorOptions& options) : TransactionExecutorBase(options) {}
~DBPoolTransactionExecutor() override { release(); }
void init(IScheduler* scheduler, const char* clusterFile) override {
TransactionExecutorBase::init(scheduler, clusterFile);
for (int i = 0; i < options.numDatabases; i++) {
FDBDatabase* db;
fdb_error_t err = fdb_create_database(clusterFile, &db);
if (err != error_code_success) {
throw TesterError(fmt::format("Failed create database with the cluster file '{}'. Error: {}({})",
clusterFile,
err,
fdb_get_error(err)));
}
databases.push_back(db);
}
}
void execute(std::shared_ptr<ITransactionActor> txActor, TTaskFct cont) override {
int idx = Random::get().randomInt(0, options.numDatabases - 1);
executeOnDatabase(databases[idx], txActor, cont);
}
void release() {
for (FDBDatabase* db : databases) {
fdb_database_destroy(db);
}
}
private:
std::vector<FDBDatabase*> databases;
};
/**
* Transaction executor executing each transaction on a separate database
*/
class DBPerTransactionExecutor : public TransactionExecutorBase {
public:
DBPerTransactionExecutor(const TransactionExecutorOptions& options) : TransactionExecutorBase(options) {}
void execute(std::shared_ptr<ITransactionActor> txActor, TTaskFct cont) override {
FDBDatabase* db = nullptr;
fdb_error_t err = fdb_create_database(clusterFile.c_str(), &db);
if (err != error_code_success) {
txActor->complete(err);
cont();
// Without this return, executeOnDatabase would be called with a null database
return;
}
executeOnDatabase(db, txActor, [cont, db]() {
fdb_database_destroy(db);
cont();
});
}
};
std::unique_ptr<ITransactionExecutor> createTransactionExecutor(const TransactionExecutorOptions& options) {
if (options.databasePerTransaction) {
return std::make_unique<DBPerTransactionExecutor>(options);
} else {
return std::make_unique<DBPoolTransactionExecutor>(options);
}
}
} // namespace FdbApiTester

View File

@ -0,0 +1,145 @@
/*
* TesterTransactionExecutor.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifndef APITESTER_TRANSACTION_EXECUTOR_H
#define APITESTER_TRANSACTION_EXECUTOR_H
#include "TesterOptions.h"
#include "TesterApiWrapper.h"
#include "TesterScheduler.h"
#include <string_view>
#include <memory>
namespace FdbApiTester {
/**
* Interface to be used for implementation of a concrete transaction
*/
class ITransactionContext : public std::enable_shared_from_this<ITransactionContext> {
public:
virtual ~ITransactionContext() {}
// Current FDB transaction
virtual Transaction* tx() = 0;
// Schedule a continuation to be executed when the future gets ready
// retryOnError controls whether the transaction is retried in case of an error instead
// of calling the continuation
virtual void continueAfter(Future f, TTaskFct cont, bool retryOnError = true) = 0;
// Complete the transaction with a commit
virtual void commit() = 0;
// retry transaction on error
virtual void onError(fdb_error_t err) = 0;
// Mark the transaction as completed without committing it (for read transactions)
virtual void done() = 0;
// A continuation to be executed when all of the given futures get ready
virtual void continueAfterAll(std::vector<Future> futures, TTaskFct cont);
};
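// Illustrative sketch (not part of the tester): finishing a read-only
// transaction once several previously started futures are ready. f1 and f2
// are assumed to come from operations on ctx->tx().
//
//   ctx->continueAfterAll({ f1, f2 }, [ctx]() { ctx->done(); });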
/**
* Interface of an actor object implementing a concrete transaction
*/
class ITransactionActor {
public:
virtual ~ITransactionActor() {}
// Initialize with the given transaction context
virtual void init(std::shared_ptr<ITransactionContext> ctx) = 0;
// Start execution of the transaction, also called on retries
virtual void start() = 0;
// Transaction completion result (error_code_success in case of success)
virtual fdb_error_t getErrorCode() = 0;
// Notification about the completion of the transaction
virtual void complete(fdb_error_t err) = 0;
};
/**
* A helper base class for transaction actors
*/
class TransactionActorBase : public ITransactionActor {
public:
void init(std::shared_ptr<ITransactionContext> ctx) override { context = ctx; }
fdb_error_t getErrorCode() override { return error; }
void complete(fdb_error_t err) override;
protected:
std::shared_ptr<ITransactionContext> ctx() { return context; }
private:
std::shared_ptr<ITransactionContext> context;
fdb_error_t error = error_code_success;
};
// Type of the lambda functions implementing a transaction
using TTxStartFct = std::function<void(std::shared_ptr<ITransactionContext>)>;
/**
* A wrapper class for transactions implemented by lambda functions
*/
class TransactionFct : public TransactionActorBase {
public:
TransactionFct(TTxStartFct startFct) : startFct(startFct) {}
void start() override { startFct(this->ctx()); }
private:
TTxStartFct startFct;
};
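// Example (illustrative): the smallest possible transaction actor, defined as
// a lambda that immediately commits.
//
//   auto actor = std::make_shared<TransactionFct>(
//       [](std::shared_ptr<ITransactionContext> ctx) { ctx->commit(); });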
/**
* Configuration of transaction execution mode
*/
struct TransactionExecutorOptions {
// Use blocking waits on futures
bool blockOnFutures = false;
// Create each transaction in a separate database instance
bool databasePerTransaction = false;
// The size of the database instance pool
int numDatabases = 1;
};
/**
* Transaction executor provides an interface for executing transactions
* It is responsible for instantiating FDB databases and transactions and managing their lifecycle
* according to the provided options
*/
class ITransactionExecutor {
public:
virtual ~ITransactionExecutor() {}
virtual void init(IScheduler* sched, const char* clusterFile) = 0;
virtual void execute(std::shared_ptr<ITransactionActor> tx, TTaskFct cont) = 0;
};
// Create a transaction executor for the given options
std::unique_ptr<ITransactionExecutor> createTransactionExecutor(const TransactionExecutorOptions& options);
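// Illustrative usage (mirrors runWorkloads() in fdb_c_api_tester.cpp;
// 'scheduler' is an IScheduler* and "fdb.cluster" a placeholder path):
//
//   TransactionExecutorOptions opts;
//   opts.blockOnFutures = false; // use future callbacks instead of blocking
//   opts.numDatabases = 4;       // pool of 4 database instances
//   auto executor = createTransactionExecutor(opts);
//   executor->init(scheduler, "fdb.cluster");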
} // namespace FdbApiTester
#endif

View File

@ -0,0 +1,58 @@
/*
* TesterUtil.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "TesterUtil.h"
#include <cstdio>
namespace FdbApiTester {
Random::Random() {
std::random_device dev;
random.seed(dev());
}
int Random::randomInt(int min, int max) {
return std::uniform_int_distribution<int>(min, max)(random);
}
Random& Random::get() {
static thread_local Random random;
return random;
}
std::string Random::randomStringLowerCase(int minLength, int maxLength) {
int length = randomInt(minLength, maxLength);
std::string str;
str.reserve(length);
for (int i = 0; i < length; i++) {
str += (char)randomInt('a', 'z');
}
return str;
}
bool Random::randomBool(double trueRatio) {
return std::uniform_real_distribution<double>(0.0, 1.0)(random) <= trueRatio;
}
void print_internal_error(const char* msg, const char* file, int line) {
fprintf(stderr, "Assertion %s failed @ %s %d:\n", msg, file, line);
}
} // namespace FdbApiTester

View File

@ -0,0 +1,87 @@
/*
* TesterUtil.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifndef APITESTER_UTIL_H
#define APITESTER_UTIL_H
#include <random>
#include <ostream>
#include <optional>
#include <fmt/format.h>
namespace fmt {
template <typename T>
struct formatter<std::optional<T>> : fmt::formatter<T> {
template <typename FormatContext>
auto format(const std::optional<T>& opt, FormatContext& ctx) {
if (opt) {
fmt::formatter<T>::format(*opt, ctx);
return ctx.out();
}
return fmt::format_to(ctx.out(), "<empty>");
}
};
} // namespace fmt
namespace FdbApiTester {
class Random {
public:
Random();
static Random& get();
int randomInt(int min, int max);
std::string randomStringLowerCase(int minLength, int maxLength);
bool randomBool(double trueRatio);
std::mt19937 random;
};
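// Example: Random::get().randomInt(1, 5) returns a value in the inclusive
// range [1, 5]; randomBool(0.9) returns true with ~90% probability.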
class TesterError : public std::runtime_error {
public:
explicit TesterError(const char* message) : std::runtime_error(message) {}
explicit TesterError(const std::string& message) : std::runtime_error(message) {}
TesterError(const TesterError&) = default;
TesterError& operator=(const TesterError&) = default;
TesterError(TesterError&&) = default;
TesterError& operator=(TesterError&&) = default;
};
void print_internal_error(const char* msg, const char* file, int line);
#define ASSERT(condition) \
do { \
if (!(condition)) { \
print_internal_error(#condition, __FILE__, __LINE__); \
abort(); \
} \
} while (false) // For use in destructors, where throwing exceptions is extremely dangerous
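// Example: ASSERT(ptr != nullptr); on failure prints
// "Assertion ptr != nullptr failed @ <file> <line>:" and calls abort().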
} // namespace FdbApiTester
#endif

View File

@ -0,0 +1,184 @@
/*
* TesterWorkload.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "TesterWorkload.h"
#include "TesterUtil.h"
#include "test/apitester/TesterScheduler.h"
#include <cstdlib>
#include <memory>
#include <fmt/format.h>
#include <vector>
namespace FdbApiTester {
int WorkloadConfig::getIntOption(const std::string& name, int defaultVal) const {
auto iter = options.find(name);
if (iter == options.end()) {
return defaultVal;
} else {
char* endptr;
int intVal = strtol(iter->second.c_str(), &endptr, 10);
if (*endptr != '\0') {
throw TesterError(
fmt::format("Invalid workload configuration. Invalid value {} for {}", iter->second, name));
}
return intVal;
}
}
double WorkloadConfig::getFloatOption(const std::string& name, double defaultVal) const {
auto iter = options.find(name);
if (iter == options.end()) {
return defaultVal;
} else {
char* endptr;
double floatVal = strtod(iter->second.c_str(), &endptr);
if (*endptr != '\0') {
throw TesterError(
fmt::format("Invalid workload configuration. Invalid value {} for {}", iter->second, name));
}
return floatVal;
}
}
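// Example: with options = { { "maxErrors", "5" } }, getIntOption("maxErrors", 10)
// returns 5; getIntOption("missing", 10) returns 10; a value such as "5x"
// throws a TesterError.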
WorkloadBase::WorkloadBase(const WorkloadConfig& config)
: manager(nullptr), tasksScheduled(0), numErrors(0), clientId(config.clientId), numClients(config.numClients),
failed(false) {
maxErrors = config.getIntOption("maxErrors", 10);
workloadId = fmt::format("{}{}", config.name, clientId);
}
void WorkloadBase::init(WorkloadManager* manager) {
this->manager = manager;
}
void WorkloadBase::schedule(TTaskFct task) {
if (failed) {
return;
}
tasksScheduled++;
manager->scheduler->schedule([this, task]() {
task();
scheduledTaskDone();
});
}
void WorkloadBase::execTransaction(std::shared_ptr<ITransactionActor> tx, TTaskFct cont, bool failOnError) {
if (failed) {
return;
}
tasksScheduled++;
manager->txExecutor->execute(tx, [this, tx, cont, failOnError]() {
fdb_error_t err = tx->getErrorCode();
if (err == error_code_success) {
cont();
} else {
std::string msg = fmt::format("Transaction failed with error: {} ({})", err, fdb_get_error(err));
if (failOnError) {
error(msg);
failed = true;
} else {
info(msg);
cont();
}
}
scheduledTaskDone();
});
}
void WorkloadBase::info(const std::string& msg) {
fmt::print(stderr, "[{}] {}\n", workloadId, msg);
}
void WorkloadBase::error(const std::string& msg) {
fmt::print(stderr, "[{}] ERROR: {}\n", workloadId, msg);
numErrors++;
if (numErrors > maxErrors && !failed) {
fmt::print(stderr, "[{}] ERROR: Stopping workload after {} errors\n", workloadId, numErrors);
failed = true;
}
}
void WorkloadBase::scheduledTaskDone() {
if (--tasksScheduled == 0) {
if (numErrors > 0) {
error(fmt::format("Workload failed with {} errors", numErrors.load()));
} else {
info("Workload successfully completed");
}
manager->workloadDone(this, numErrors > 0);
}
}
void WorkloadManager::add(std::shared_ptr<IWorkload> workload, TTaskFct cont) {
std::unique_lock<std::mutex> lock(mutex);
workloads[workload.get()] = WorkloadInfo{ workload, cont };
}
void WorkloadManager::run() {
std::vector<std::shared_ptr<IWorkload>> initialWorkloads;
for (auto iter : workloads) {
initialWorkloads.push_back(iter.second.ref);
}
for (auto iter : initialWorkloads) {
iter->init(this);
}
for (auto iter : initialWorkloads) {
iter->start();
}
scheduler->join();
if (failed()) {
fmt::print(stderr, "{} workloads failed\n", numWorkloadsFailed);
} else {
fprintf(stderr, "All workloads succesfully completed\n");
}
}
void WorkloadManager::workloadDone(IWorkload* workload, bool failed) {
std::unique_lock<std::mutex> lock(mutex);
auto iter = workloads.find(workload);
ASSERT(iter != workloads.end());
lock.unlock();
iter->second.cont();
lock.lock();
workloads.erase(iter);
if (failed) {
numWorkloadsFailed++;
}
bool done = workloads.empty();
lock.unlock();
if (done) {
scheduler->stop();
}
}
std::shared_ptr<IWorkload> IWorkloadFactory::create(std::string const& name, const WorkloadConfig& config) {
auto it = factories().find(name);
if (it == factories().end())
return {}; // unknown workload name; the caller reports the error
return it->second->create(config);
}
std::unordered_map<std::string, IWorkloadFactory*>& IWorkloadFactory::factories() {
static std::unordered_map<std::string, IWorkloadFactory*> theFactories;
return theFactories;
}
} // namespace FdbApiTester

View File

@ -0,0 +1,205 @@
/*
* TesterWorkload.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <memory>
#ifndef APITESTER_WORKLOAD_H
#define APITESTER_WORKLOAD_H
#include "TesterTransactionExecutor.h"
#include "TesterUtil.h"
#include <atomic>
#include <unordered_map>
#include <mutex>
namespace FdbApiTester {
class WorkloadManager;
// Workload interface
class IWorkload {
public:
virtual ~IWorkload() {}
// Initialize the workload
virtual void init(WorkloadManager* manager) = 0;
// Start executing the workload
virtual void start() = 0;
};
// Workload configuration
struct WorkloadConfig {
// Workload name
std::string name;
// Client ID assigned to the workload (a number from 0 to numClients-1)
int clientId;
// Total number of clients
int numClients;
// Workload options: as key-value pairs
std::unordered_map<std::string, std::string> options;
// Get an option of a certain type by name. Throws an exception if the value is of the wrong type
int getIntOption(const std::string& name, int defaultVal) const;
double getFloatOption(const std::string& name, double defaultVal) const;
};
// A base class for test workloads
// Tracks if workload is active, notifies the workload manager when the workload completes
class WorkloadBase : public IWorkload {
public:
WorkloadBase(const WorkloadConfig& config);
// Initialize the workload
void init(WorkloadManager* manager) override;
protected:
// Schedule a task as part of the workload
void schedule(TTaskFct task);
// Execute a transaction within the workload
void execTransaction(std::shared_ptr<ITransactionActor> tx, TTaskFct cont, bool failOnError = true);
// Execute a transaction within the workload; a convenience method for a transaction defined by a lambda function
void execTransaction(TTxStartFct start, TTaskFct cont, bool failOnError = true) {
execTransaction(std::make_shared<TransactionFct>(start), cont, failOnError);
}
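// Example (illustrative): running a lambda transaction from a workload and
// scheduling a follow-up task once it completes:
//
//   execTransaction(
//       [](std::shared_ptr<ITransactionContext> ctx) { ctx->commit(); },
//       [this]() { schedule([this]() { /* next step */ }); });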
// Log an error message, increase error counter
void error(const std::string& msg);
// Log an info message
void info(const std::string& msg);
private:
WorkloadManager* manager;
// Decrease the scheduled task counter and notify the workload manager
// when no more tasks are scheduled
void scheduledTaskDone();
// Keep track of tasks scheduled by the workload
// End workload when this number falls to 0
std::atomic<int> tasksScheduled;
// Number of errors logged
std::atomic<int> numErrors;
protected:
// Client ID assigned to the workload (a number from 0 to numClients-1)
int clientId;
// Total number of clients
int numClients;
// The maximum number of errors before stopping the workload
int maxErrors;
// Workload identifier, consisting of workload name and client ID
std::string workloadId;
// Set when the workload has failed; no further transactions or continuations will be scheduled by the workload
std::atomic<bool> failed;
};
// Workload manager
// Keeps track of active workloads, stops the scheduler after all workloads complete
class WorkloadManager {
public:
WorkloadManager(ITransactionExecutor* txExecutor, IScheduler* scheduler)
: txExecutor(txExecutor), scheduler(scheduler), numWorkloadsFailed(0) {}
// Add a workload
// A continuation is to be specified for subworkloads
void add(std::shared_ptr<IWorkload> workload, TTaskFct cont = NO_OP_TASK);
// Run all workloads. Blocks until all workloads complete
void run();
// True if at least one workload has failed
bool failed() {
std::unique_lock<std::mutex> lock(mutex);
return numWorkloadsFailed > 0;
}
private:
friend WorkloadBase;
// Info about a running workload
struct WorkloadInfo {
// Reference to the workload for ownership
std::shared_ptr<IWorkload> ref;
// Continuation to be executed after completing the workload
TTaskFct cont;
};
// To be called by a workload to notify that it is done
void workloadDone(IWorkload* workload, bool failed);
// Transaction executor to be used by the workloads
ITransactionExecutor* txExecutor;
// A scheduler to be used by the workloads
IScheduler* scheduler;
// Mutex protects access to workloads & numWorkloadsFailed
std::mutex mutex;
// A map of currently running workloads
std::unordered_map<IWorkload*, WorkloadInfo> workloads;
// Number of workloads failed
int numWorkloadsFailed;
};
// A workload factory
struct IWorkloadFactory {
// create a workload by name
static std::shared_ptr<IWorkload> create(std::string const& name, const WorkloadConfig& config);
// a singleton registry of workload factories
static std::unordered_map<std::string, IWorkloadFactory*>& factories();
// Interface to be implemented by a workload factory
virtual ~IWorkloadFactory() = default;
virtual std::shared_ptr<IWorkload> create(const WorkloadConfig& config) = 0;
};
/**
* A template for a workload factory for creating workloads of a certain type
*
* Declare a global instance of the factory for a workload type as follows:
* WorkloadFactory<MyWorkload> MyWorkloadFactory("myWorkload");
*/
template <class WorkloadType>
struct WorkloadFactory : IWorkloadFactory {
WorkloadFactory(const char* name) { factories()[name] = this; }
std::shared_ptr<IWorkload> create(const WorkloadConfig& config) override {
return std::make_shared<WorkloadType>(config);
}
};
} // namespace FdbApiTester
#endif

View File

@ -0,0 +1,284 @@
/*
* fdb_c_api_tester.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "TesterOptions.h"
#include "TesterWorkload.h"
#include "TesterScheduler.h"
#include "TesterTransactionExecutor.h"
#include "TesterTestSpec.h"
#include "TesterUtil.h"
#include "flow/SimpleOpt.h"
#include "bindings/c/foundationdb/fdb_c.h"
#include <memory>
#include <stdexcept>
#include <thread>
#include <fmt/format.h>
namespace FdbApiTester {
namespace {
enum TesterOptionId {
OPT_CONNFILE,
OPT_HELP,
OPT_TRACE,
OPT_TRACE_DIR,
OPT_LOGGROUP,
OPT_TRACE_FORMAT,
OPT_KNOB,
OPT_EXTERNAL_CLIENT_LIBRARY,
OPT_TEST_FILE
};
CSimpleOpt::SOption TesterOptionDefs[] = //
{ { OPT_CONNFILE, "-C", SO_REQ_SEP },
{ OPT_CONNFILE, "--cluster-file", SO_REQ_SEP },
{ OPT_TRACE, "--log", SO_NONE },
{ OPT_TRACE_DIR, "--log-dir", SO_REQ_SEP },
{ OPT_LOGGROUP, "--log-group", SO_REQ_SEP },
{ OPT_HELP, "-h", SO_NONE },
{ OPT_HELP, "--help", SO_NONE },
{ OPT_TRACE_FORMAT, "--trace-format", SO_REQ_SEP },
{ OPT_KNOB, "--knob-", SO_REQ_SEP },
{ OPT_EXTERNAL_CLIENT_LIBRARY, "--external-client-library", SO_REQ_SEP },
{ OPT_TEST_FILE, "-f", SO_REQ_SEP },
{ OPT_TEST_FILE, "--test-file", SO_REQ_SEP },
SO_END_OF_OPTIONS };
void printProgramUsage(const char* execName) {
printf("usage: %s [OPTIONS]\n"
"\n",
execName);
printf(" -C, --cluster-file FILE\n"
" The path of a file containing the connection string for the\n"
" FoundationDB cluster. The default is `fdb.cluster'\n"
" --log Enables trace file logging for the CLI session.\n"
" --log-dir PATH Specifes the output directory for trace files. If\n"
" unspecified, defaults to the current directory. Has\n"
" no effect unless --log is specified.\n"
" --log-group LOG_GROUP\n"
" Sets the LogGroup field with the specified value for all\n"
" events in the trace output (defaults to `default').\n"
" --trace-format FORMAT\n"
" Select the format of the log files. xml (the default) and json\n"
" are supported. Has no effect unless --log is specified.\n"
" --knob-KNOBNAME KNOBVALUE\n"
" Changes a knob option. KNOBNAME should be lowercase.\n"
" --external-client-library FILE\n"
" Path to the external client library.\n"
" -f, --test-file FILE\n"
" Test file to run.\n"
" -h, --help Display this help and exit.\n");
}
// Extracts the key for command line arguments that are specified with a prefix (e.g. --knob-).
// This function converts any hyphens in the extracted key to underscores.
bool extractPrefixedArgument(std::string prefix, const std::string& arg, std::string& res) {
if (arg.size() <= prefix.size() || arg.find(prefix) != 0 ||
(arg[prefix.size()] != '-' && arg[prefix.size()] != '_')) {
return false;
}
res = arg.substr(prefix.size() + 1);
std::transform(res.begin(), res.end(), res.begin(), [](int c) { return c == '-' ? '_' : c; });
return true;
}
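// Example: extractPrefixedArgument("--knob", "--knob-max-batch-size", res)
// returns true and sets res to "max_batch_size".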
bool validateTraceFormat(std::string_view format) {
return format == "xml" || format == "json";
}
bool processArg(TesterOptions& options, const CSimpleOpt& args) {
switch (args.OptionId()) {
case OPT_CONNFILE:
options.clusterFile = args.OptionArg();
break;
case OPT_TRACE:
options.trace = true;
break;
case OPT_TRACE_DIR:
options.traceDir = args.OptionArg();
break;
case OPT_LOGGROUP:
options.logGroup = args.OptionArg();
break;
case OPT_TRACE_FORMAT:
if (!validateTraceFormat(args.OptionArg())) {
fmt::print(stderr, "ERROR: Unrecognized trace format `{}'\n", args.OptionArg());
return false;
}
options.traceFormat = args.OptionArg();
break;
case OPT_KNOB: {
std::string knobName;
if (!extractPrefixedArgument("--knob", args.OptionSyntax(), knobName)) {
fmt::print(stderr, "ERROR: unable to parse knob option '{}'\n", args.OptionSyntax());
return false;
}
options.knobs.emplace_back(knobName, args.OptionArg());
break;
}
case OPT_EXTERNAL_CLIENT_LIBRARY:
options.externalClientLibrary = args.OptionArg();
break;
case OPT_TEST_FILE:
options.testFile = args.OptionArg();
options.testSpec = readTomlTestSpec(options.testFile);
break;
}
return true;
}
bool parseArgs(TesterOptions& options, int argc, char** argv) {
// declare our options parser, pass in the arguments from main
// as well as our array of valid options.
CSimpleOpt args(argc, argv, TesterOptionDefs);
// while there are arguments left to process
while (args.Next()) {
if (args.LastError() == SO_SUCCESS) {
if (args.OptionId() == OPT_HELP) {
printProgramUsage(argv[0]);
return false;
}
if (!processArg(options, args)) {
return false;
}
} else {
fmt::print(stderr, "ERROR: Invalid argument: {}\n", args.OptionText());
printProgramUsage(argv[0]);
return false;
}
}
return true;
}
void fdb_check(fdb_error_t e) {
if (e) {
fmt::print(stderr, "Unexpected FDB error: {}({})\n", e, fdb_get_error(e));
std::abort();
}
}
void applyNetworkOptions(TesterOptions& options) {
if (!options.externalClientLibrary.empty()) {
fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_DISABLE_LOCAL_CLIENT));
fdb_check(
FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_EXTERNAL_CLIENT_LIBRARY, options.externalClientLibrary));
}
if (options.testSpec.multiThreaded) {
fdb_check(
FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_THREADS_PER_VERSION, options.numFdbThreads));
}
if (options.testSpec.fdbCallbacksOnExternalThreads) {
fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_CALLBACKS_ON_EXTERNAL_THREADS));
}
if (options.testSpec.buggify) {
fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_BUGGIFY_ENABLE));
}
if (options.trace) {
fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_TRACE_ENABLE, options.traceDir));
fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_TRACE_FORMAT, options.traceFormat));
fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_TRACE_LOG_GROUP, options.logGroup));
}
for (auto knob : options.knobs) {
fdb_check(FdbApi::setOption(FDBNetworkOption::FDB_NET_OPTION_KNOB,
fmt::format("{}={}", knob.first.c_str(), knob.second.c_str())));
}
}
void randomizeOptions(TesterOptions& options) {
Random& random = Random::get();
options.numFdbThreads = random.randomInt(options.testSpec.minFdbThreads, options.testSpec.maxFdbThreads);
options.numClientThreads = random.randomInt(options.testSpec.minClientThreads, options.testSpec.maxClientThreads);
options.numDatabases = random.randomInt(options.testSpec.minDatabases, options.testSpec.maxDatabases);
options.numClients = random.randomInt(options.testSpec.minClients, options.testSpec.maxClients);
}
bool runWorkloads(TesterOptions& options) {
TransactionExecutorOptions txExecOptions;
txExecOptions.blockOnFutures = options.testSpec.blockOnFutures;
txExecOptions.numDatabases = options.numDatabases;
txExecOptions.databasePerTransaction = options.testSpec.databasePerTransaction;
std::unique_ptr<IScheduler> scheduler = createScheduler(options.numClientThreads);
std::unique_ptr<ITransactionExecutor> txExecutor = createTransactionExecutor(txExecOptions);
scheduler->start();
txExecutor->init(scheduler.get(), options.clusterFile.c_str());
WorkloadManager workloadMgr(txExecutor.get(), scheduler.get());
for (const auto& workloadSpec : options.testSpec.workloads) {
for (int i = 0; i < options.numClients; i++) {
WorkloadConfig config;
config.name = workloadSpec.name;
config.options = workloadSpec.options;
config.clientId = i;
config.numClients = options.numClients;
std::shared_ptr<IWorkload> workload = IWorkloadFactory::create(workloadSpec.name, config);
if (!workload) {
throw TesterError(fmt::format("Unknown workload '{}'", workloadSpec.name));
}
workloadMgr.add(workload);
}
}
workloadMgr.run();
return !workloadMgr.failed();
}
} // namespace
} // namespace FdbApiTester
using namespace FdbApiTester;
int main(int argc, char** argv) {
int retCode = 0;
try {
TesterOptions options;
if (!parseArgs(options, argc, argv)) {
return 1;
}
randomizeOptions(options);
fdb_check(fdb_select_api_version(options.testSpec.apiVersion));
applyNetworkOptions(options);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
if (!runWorkloads(options)) {
retCode = 1;
}
fdb_check(fdb_stop_network());
network_thread.join();
} catch (const std::runtime_error& err) {
fmt::print(stderr, "ERROR: {}\n", err.what());
retCode = 1;
}
return retCode;
}

View File

@ -0,0 +1,125 @@
#!/usr/bin/env python3
#
# run_c_api_tests.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import sys
import subprocess
import argparse
import os
from subprocess import Popen, TimeoutExpired
import logging
import signal
def get_logger():
return logging.getLogger('foundationdb.run_c_api_tests')
def initialize_logger_level(logging_level):
logger = get_logger()
assert logging_level in ['DEBUG', 'INFO', 'WARNING', 'ERROR']
logging.basicConfig(format='%(message)s')
if logging_level == 'DEBUG':
logger.setLevel(logging.DEBUG)
elif logging_level == 'INFO':
logger.setLevel(logging.INFO)
elif logging_level == 'WARNING':
logger.setLevel(logging.WARNING)
elif logging_level == 'ERROR':
logger.setLevel(logging.ERROR)
def run_tester(args, test_file):
cmd = [args.tester_binary, "--cluster-file",
args.cluster_file, "--test-file", test_file]
if args.external_client_library is not None:
cmd += ["--external-client-library", args.external_client_library]
get_logger().info('\nRunning tester \'%s\'...' % ' '.join(cmd))
proc = Popen(cmd, stdout=sys.stdout, stderr=sys.stderr)
timed_out = False
try:
ret_code = proc.wait(args.timeout)
except TimeoutExpired:
proc.kill()
ret_code = 1  # ensure ret_code is defined; reported as a timeout below
timed_out = True
except Exception as e:
raise Exception('Unable to run tester (%s)' % e)
if ret_code != 0:
if ret_code < 0:
reason = signal.Signals(-ret_code).name
else:
reason = 'exit code: %d' % ret_code
if timed_out:
reason = 'timed out after %d seconds' % args.timeout
ret_code = 1
get_logger().error('\n\'%s\' did not complete successfully (%s)' %
(cmd[0], reason))
get_logger().info('')
return ret_code
def run_tests(args):
num_failed = 0
test_files = [f for f in os.listdir(args.test_dir)
if os.path.isfile(os.path.join(args.test_dir, f)) and f.endswith(".toml")]
for test_file in test_files:
get_logger().info('=========================================================')
get_logger().info('Running test %s' % test_file)
get_logger().info('=========================================================')
ret_code = run_tester(args, os.path.join(args.test_dir, test_file))
if ret_code != 0:
num_failed += 1
return num_failed
def parse_args(argv):
parser = argparse.ArgumentParser(description='FoundationDB C API Tester')
parser.add_argument('--cluster-file', type=str, default="fdb.cluster",
help='The cluster file for the cluster being connected to. (default: fdb.cluster)')
parser.add_argument('--tester-binary', type=str, default="fdb_c_api_tester",
help='Path to the fdb_c_api_tester executable. (default: fdb_c_api_tester)')
parser.add_argument('--external-client-library', type=str, default=None,
help='Path to the external client library. (default: None)')
parser.add_argument('--test-dir', type=str, default="./",
help='Path to a directory with test definitions. (default: ./)')
parser.add_argument('--timeout', type=int, default=300,
help='The timeout in seconds for running each individual test. (default 300)')
parser.add_argument('--logging-level', type=str, default='INFO',
choices=['ERROR', 'WARNING', 'INFO', 'DEBUG'], help='Specifies the level of detail in the tester output (default=\'INFO\').')
return parser.parse_args(argv)
def main(argv):
args = parse_args(argv)
initialize_logger_level(args.logging_level)
return run_tests(args)
if __name__ == '__main__':
sys.exit(main(sys.argv[1:]))

View File

@ -0,0 +1,24 @@
[[test]]
title = 'Cancel Transaction with Blocking Waits'
multiThreaded = true
buggify = true
blockOnFutures = true
minFdbThreads = 2
maxFdbThreads = 8
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8
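# The min/max pairs above are ranges; the tester picks a random value from
# each range at startup (see randomizeOptions() in fdb_c_api_tester.cpp).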
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -0,0 +1,23 @@
[[test]]
title = 'Cancel Transactions with Future Callbacks'
multiThreaded = true
buggify = true
minFdbThreads = 2
maxFdbThreads = 8
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -0,0 +1,24 @@
[[test]]
title = 'Cancel Transaction with Database per Transaction'
multiThreaded = true
buggify = true
databasePerTransaction = true
minFdbThreads = 2
maxFdbThreads = 8
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'CancelTransaction'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -0,0 +1,25 @@
[[test]]
title = 'API Correctness Blocking'
multiThreaded = true
buggify = true
blockOnFutures = true
minFdbThreads = 2
maxFdbThreads = 8
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -0,0 +1,24 @@
[[test]]
title = 'API Correctness Callbacks On External Threads'
multiThreaded = true
fdbCallbacksOnExternalThreads = true
buggify = true
minFdbThreads = 2
maxFdbThreads = 8
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -0,0 +1,24 @@
[[test]]
title = 'API Correctness Database Per Transaction'
multiThreaded = true
buggify = true
databasePerTransaction = true
minFdbThreads = 2
maxFdbThreads = 8
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -0,0 +1,23 @@
[[test]]
title = 'API Correctness Multi Threaded'
multiThreaded = true
buggify = true
minFdbThreads = 2
maxFdbThreads = 8
minDatabases = 2
maxDatabases = 8
minClientThreads = 2
maxClientThreads = 8
minClients = 2
maxClients = 8
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -0,0 +1,16 @@
[[test]]
title = 'API Correctness Single Threaded'
minClients = 1
maxClients = 3
multiThreaded = false
[[test.workload]]
name = 'ApiCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
initialSize = 100
numRandomOperations = 100
readExistingKeysRatio = 0.9

View File

@ -448,16 +448,21 @@ func (o TransactionOptions) SetInitializeNewDatabase() error {
return o.setOpt(300, nil)
}
// Allows this transaction to read and modify system keys (those that start with the byte 0xFF)
// Allows this transaction to read and modify system keys (those that start with the byte 0xFF). Implies raw_access.
func (o TransactionOptions) SetAccessSystemKeys() error {
return o.setOpt(301, nil)
}
// Allows this transaction to read system keys (those that start with the byte 0xFF)
// Allows this transaction to read system keys (those that start with the byte 0xFF). Implies raw_access.
func (o TransactionOptions) SetReadSystemKeys() error {
return o.setOpt(302, nil)
}
// Allows this transaction to access the raw key-space when tenant mode is on.
func (o TransactionOptions) SetRawAccess() error {
return o.setOpt(303, nil)
}
// Not yet implemented.
func (o TransactionOptions) SetDebugRetryLogging(param string) error {
return o.setOpt(401, []byte(param))

View File

@ -129,7 +129,7 @@ function(add_fdb_test)
-n ${test_name}
-b ${PROJECT_BINARY_DIR}
-t ${test_type}
-O ${OLD_FDBSERVER_BINARY}
--config "@CTEST_CONFIGURATION_TYPE@"
--crash
--aggregate-traces ${TEST_AGGREGATE_TRACES}
@ -404,7 +404,7 @@ endfunction()
# Creates a single cluster before running the specified command (usually a ctest test)
function(add_fdbclient_test)
set(options DISABLED ENABLED)
set(options DISABLED ENABLED DISABLE_LOG_DUMP)
set(oneValueArgs NAME PROCESS_NUMBER TEST_TIMEOUT WORKING_DIRECTORY)
set(multiValueArgs COMMAND)
cmake_parse_arguments(T "${options}" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}")
@ -423,23 +423,20 @@ function(add_fdbclient_test)
if(NOT T_COMMAND)
message(FATAL_ERROR "COMMAND is a required argument for add_fdbclient_test")
endif()
message(STATUS "Adding Client test ${T_NAME}")
if (T_PROCESS_NUMBER)
add_test(NAME "${T_NAME}"
WORKING_DIRECTORY ${T_WORKING_DIRECTORY}
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests/TestRunner/tmp_cluster.py
--build-dir ${CMAKE_BINARY_DIR}
--process-number ${T_PROCESS_NUMBER}
--
${T_COMMAND})
else()
add_test(NAME "${T_NAME}"
WORKING_DIRECTORY ${T_WORKING_DIRECTORY}
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests/TestRunner/tmp_cluster.py
--build-dir ${CMAKE_BINARY_DIR}
--
${T_COMMAND})
set(TMP_CLUSTER_CMD ${CMAKE_SOURCE_DIR}/tests/TestRunner/tmp_cluster.py
--build-dir ${CMAKE_BINARY_DIR})
if(T_PROCESS_NUMBER)
list(APPEND TMP_CLUSTER_CMD --process-number ${T_PROCESS_NUMBER})
endif()
if(T_DISABLE_LOG_DUMP)
list(APPEND TMP_CLUSTER_CMD --disable-log-dump)
endif()
message(STATUS "Adding Client test ${T_NAME}")
add_test(NAME "${T_NAME}"
WORKING_DIRECTORY ${T_WORKING_DIRECTORY}
COMMAND ${Python_EXECUTABLE} ${TMP_CLUSTER_CMD}
--
${T_COMMAND})
if (T_TEST_TIMEOUT)
set_tests_properties("${T_NAME}" PROPERTIES TIMEOUT ${T_TEST_TIMEOUT})
else()
@ -449,7 +446,7 @@ function(add_fdbclient_test)
set_tests_properties("${T_NAME}" PROPERTIES ENVIRONMENT UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=1)
endfunction()
# Creates a cluster file for a nonexistent cluster before running the specified command
# (usually a ctest test)
function(add_unavailable_fdbclient_test)
set(options DISABLED ENABLED)

View File

@ -41,10 +41,10 @@ def print_stacks(stack_count, sort_by_count):
sort_dict = counts if sort_by_count else sizes
ordered_list = [(val, backtrace) for (backtrace, val) in sort_dict.items()]
ordered_list.sort(reverse=True)
ordered_list.sort()
if stack_count:
ordered_list = ordered_list[:stack_count]
ordered_list = ordered_list[-stack_count:]
for size, backtrace in ordered_list:
print(str.format('bytes={0:<10} count={1:<8} {2}', sizes[backtrace], counts[backtrace], backtrace))

View File

@ -192,6 +192,8 @@ class BaseInfo(object):
self.start_timestamp = bb.get_double()
if protocol_version >= PROTOCOL_VERSION_6_3:
self.dc_id = bb.get_bytes_with_length()
if protocol_version >= PROTOCOL_VERSION_7_1:
self.tenant = bb.get_bytes_with_length()
class GetVersionInfo(BaseInfo):
def __init__(self, bb, protocol_version):

View File

@ -6,6 +6,7 @@
.. |database-type| replace:: ``FDBDatabase``
.. |database-class| replace:: :type:`FDBDatabase`
.. |database-auto| replace:: FIXME
.. |tenant-type| replace:: ``FDBTenant``
.. |transaction-class| replace:: FIXME
.. |get-key-func| replace:: :func:`fdb_transaction_get_key()`
.. |get-range-func| replace:: :func:`fdb_transaction_get_range()`
@ -419,9 +420,20 @@ An |database-blurb1| Modifications to a database are performed via transactions.
|option-doc|
.. function:: fdb_error_t fdb_database_open_tenant(FDBDatabase* database, uint8_t const* tenant_name, int tenant_name_length, FDBTenant** out_tenant)
Opens a tenant on the given database. All transactions created by this tenant will operate on the tenant's key-space. The caller assumes ownership of the :type:`FDBTenant` object and must destroy it with :func:`fdb_tenant_destroy()`.
``tenant_name``
The name of the tenant being accessed, as a byte string.
``tenant_name_length``
The length of the tenant name byte string.
``*out_tenant``
Set to point to the newly created :type:`FDBTenant`.
.. function:: fdb_error_t fdb_database_create_transaction(FDBDatabase* database, FDBTransaction** out_transaction)
Creates a new transaction on the given database. The caller assumes ownership of the :type:`FDBTransaction` object and must destroy it with :func:`fdb_transaction_destroy()`.
Creates a new transaction on the given database without using a tenant, meaning that it will operate on the entire database key-space. The caller assumes ownership of the :type:`FDBTransaction` object and must destroy it with :func:`fdb_transaction_destroy()`.
``*out_transaction``
Set to point to the newly created :type:`FDBTransaction`.
@ -486,6 +498,26 @@ An |database-blurb1| Modifications to a database are performed via transactions.
Returns a value where 0 indicates that the client is idle and 1 (or larger) indicates that the client is saturated. By default, this value is updated every second.
Tenant
======
|tenant-blurb1|
.. type:: FDBTenant
An opaque type that represents a tenant in the FoundationDB C API.
.. function:: void fdb_tenant_destroy(FDBTenant* tenant)
Destroys an :type:`FDBTenant` object. It must be called exactly once for each successful call to :func:`fdb_database_open_tenant()`. This function only destroys a handle to the tenant -- the tenant and its data will be fine!
.. function:: fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant, FDBTransaction** out_transaction)
Creates a new transaction on the given tenant. This transaction will operate within the tenant's key-space and cannot access data outside the tenant. The caller assumes ownership of the :type:`FDBTransaction` object and must destroy it with :func:`fdb_transaction_destroy()`.
``*out_transaction``
Set to point to the newly created :type:`FDBTransaction`.
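A minimal usage sketch (error handling elided; ``db`` is assumed to be an open :type:`FDBDatabase`)::

    FDBTenant* tenant = NULL;
    fdb_database_open_tenant(db, (const uint8_t*)"mytenant", 8, &tenant);
    FDBTransaction* tr = NULL;
    fdb_tenant_create_transaction(tenant, &tr);
    /* ... operate within the tenant's key-space ... */
    fdb_transaction_destroy(tr);
    fdb_tenant_destroy(tenant);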
Transaction
===========

View File

@ -74,6 +74,9 @@
.. |database-sync| replace::
The convenience methods provided by |database-type| have the same signature as the corresponding methods of ``Transaction``. However, most of the |database-type| methods are fully synchronous. (An exception is the methods for watches.) As a result, the |database-type| methods do not support the use of :ref:`implicit parallelism with futures <developer-guide-programming-with-futures>`.
.. |tenant-blurb1| replace::
|tenant-type| represents a FoundationDB tenant. Tenants are optional named transaction domains that can be used to provide multiple disjoint key-spaces to client applications. A transaction created in a tenant will be limited to the keys contained within that tenant, and transactions operating on different tenants can use the same key names without interfering with each other.
.. |keysel-blurb1| replace::
FoundationDB's lexicographically ordered data model permits finding keys based on their order (for example, finding the first key in the database greater than a given key). Key selectors represent a description of a key in the database that could be resolved to an actual key by |get-key-func| or used directly as the beginning or end of a range in |get-range-func|.
@ -627,4 +630,4 @@
.. |option-set-distributed-client-tracer| replace::
Sets a tracer to run on the client. Should be set to the same value as the tracer set on the server.

View File

@ -7,6 +7,7 @@
.. |database-type| replace:: ``Database``
.. |database-class| replace:: :class:`Database`
.. |database-auto| replace:: the :func:`@fdb.transactional <transactional>` decorator
.. |tenant-type| replace:: FIXME
.. |transaction-class| replace:: :class:`Transaction`
.. |get-key-func| replace:: :func:`Transaction.get_key`
.. |get-range-func| replace:: :func:`Transaction.get_range`

View File

@ -5,6 +5,7 @@
.. |database-type| replace:: ``Database``
.. |database-class| replace:: :class:`Database`
.. |database-auto| replace:: :meth:`Database.transact`
.. |tenant-type| replace:: FIXME
.. |transaction-class| replace:: :class:`Transaction`
.. |get-key-func| replace:: :meth:`Transaction.get_key`
.. |get-range-func| replace:: :meth:`Transaction.get_range`

View File

@ -8,6 +8,7 @@
.. |database-type| replace:: ``Database``
.. |database-class| replace:: ``Database``
.. |database-auto| replace:: FIXME
.. |tenant-type| replace:: FIXME
.. |transaction-class| replace:: ``Transaction``
.. |get-key-func| replace:: get_key()
.. |get-range-func| replace:: get_range()

View File

@ -8,6 +8,7 @@
.. |database-type| replace:: ``Database``
.. |database-class| replace:: ``Database``
.. |database-auto| replace:: FIXME
.. |tenant-type| replace:: FIXME
.. |transaction-class| replace:: ``Transaction``
.. |get-key-func| replace:: get_key()
.. |get-range-func| replace:: get_range()

View File

@ -205,6 +205,7 @@ that process, and wait for necessary data to be moved away.
#. ``\xff\xff/management/failed_locality/<locality>`` Read/write. Indicates that the cluster should consider matching processes as permanently failed. This allows the cluster to avoid maintaining extra state and doing extra work in the hope that these processes come back. See :ref:`removing machines from a cluster <removing-machines-from-a-cluster>` for documentation for the corresponding fdbcli command.
#. ``\xff\xff/management/options/excluded_locality/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/excluded_locality/<locality>``. Setting this key only has an effect in the current transaction and is not persisted on commit.
#. ``\xff\xff/management/options/failed_locality/force`` Read/write. Setting this key disables safety checks for writes to ``\xff\xff/management/failed_locality/<locality>``. Setting this key only has an effect in the current transaction and is not persisted on commit.
#. ``\xff\xff/management/tenant_map/<tenant>`` Read/write. Setting a key in this range to any value will result in a tenant being created with name ``<tenant>``. Clearing a key in this range will delete the tenant with name ``<tenant>``. Reading all or a portion of this range will return the list of tenants currently present in the cluster, excluding any changes in this transaction. Values read in this range will be JSON objects containing the metadata for the associated tenants.
An exclusion is syntactically either an ip address (e.g. ``127.0.0.1``), or
an ip address and port (e.g. ``127.0.0.1:4500``) or any locality (e.g. ``locality_dcid:primary-satellite`` or

View File

@ -128,6 +128,7 @@ set(FDBCLIENT_SRCS
StatusClient.h
StorageServerInterface.cpp
StorageServerInterface.h
StorageCheckpoint.h
Subspace.cpp
Subspace.h
StackLineage.h

View File

@ -61,6 +61,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( WRONG_SHARD_SERVER_DELAY, .01 ); if( randomize && BUGGIFY ) WRONG_SHARD_SERVER_DELAY = deterministicRandom()->random01(); // FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is mostly wrong (e.g. dumping the database after a test)
init( FUTURE_VERSION_RETRY_DELAY, .01 ); if( randomize && BUGGIFY ) FUTURE_VERSION_RETRY_DELAY = deterministicRandom()->random01();// FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY;
init( UNKNOWN_TENANT_RETRY_DELAY, 0.0 ); if( randomize && BUGGIFY ) UNKNOWN_TENANT_RETRY_DELAY = deterministicRandom()->random01();
init( REPLY_BYTE_LIMIT, 80000 );
init( DEFAULT_BACKOFF, .01 ); if( randomize && BUGGIFY ) DEFAULT_BACKOFF = deterministicRandom()->random01();
init( DEFAULT_MAX_BACKOFF, 1.0 );
@ -89,6 +90,8 @@ void ClientKnobs::initialize(Randomize randomize) {
init( LOCATION_CACHE_EVICTION_SIZE_SIM, 10 ); if( randomize && BUGGIFY ) LOCATION_CACHE_EVICTION_SIZE_SIM = 3;
init( LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD, 60 );
init( LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL, 60 );
init( TENANT_CACHE_EVICTION_SIZE, 100000 );
init( TENANT_CACHE_EVICTION_SIZE_SIM, 10 ); if( randomize && BUGGIFY ) TENANT_CACHE_EVICTION_SIZE_SIM = 3;
init( GET_RANGE_SHARD_LIMIT, 2 );
init( WARM_RANGE_SHARD_LIMIT, 100 );

View File

@ -60,6 +60,7 @@ public:
double WRONG_SHARD_SERVER_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is
// mostly wrong (e.g. dumping the database after a test)
double FUTURE_VERSION_RETRY_DELAY;
double UNKNOWN_TENANT_RETRY_DELAY;
int REPLY_BYTE_LIMIT;
double DEFAULT_BACKOFF;
double DEFAULT_MAX_BACKOFF;
@ -89,6 +90,8 @@ public:
int LOCATION_CACHE_EVICTION_SIZE_SIM;
double LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD;
double LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL;
int TENANT_CACHE_EVICTION_SIZE;
int TENANT_CACHE_EVICTION_SIZE_SIM;
int GET_RANGE_SHARD_LIMIT;
int WARM_RANGE_SHARD_LIMIT;

View File

@ -41,7 +41,8 @@ enum class TransactionPriorityType : int { PRIORITY_DEFAULT = 0, PRIORITY_BATCH
static_assert(sizeof(TransactionPriorityType) == 4, "transaction_profiling_analyzer.py assumes this field has size 4");
struct Event {
Event(EventType t, double ts, const Optional<Standalone<StringRef>>& dc) : type(t), startTs(ts) {
Event(EventType t, double ts, const Optional<Standalone<StringRef>>& dc, const Optional<TenantName>& tenant)
: type(t), startTs(ts), tenant(tenant) {
if (dc.present())
dcId = dc.get();
}
@ -49,7 +50,9 @@ struct Event {
template <typename Ar>
Ar& serialize(Ar& ar) {
if (ar.protocolVersion().version() >= (uint64_t)0x0FDB00B063010001LL) {
if (ar.protocolVersion().hasTenants()) {
return serializer(ar, type, startTs, dcId, tenant);
} else if (ar.protocolVersion().version() >= (uint64_t)0x0FDB00B063010001LL) {
return serializer(ar, type, startTs, dcId);
} else {
return serializer(ar, type, startTs);
@ -59,8 +62,10 @@ struct Event {
EventType type{ EventType::UNSET };
double startTs{ 0 };
Key dcId{};
Optional<TenantName> tenant{};
void logEvent(std::string id, int maxFieldLength) const {}
void augmentTraceEvent(TraceEvent& event) const { event.detail("Tenant", tenant); }
};
struct EventGetVersion : public Event {
@ -77,7 +82,9 @@ struct EventGetVersion : public Event {
double latency;
void logEvent(std::string id, int maxFieldLength) const {
TraceEvent("TransactionTrace_GetVersion").detail("TransactionID", id).detail("Latency", latency);
TraceEvent event("TransactionTrace_GetVersion");
event.detail("TransactionID", id).detail("Latency", latency);
augmentTraceEvent(event);
}
};
@ -97,10 +104,9 @@ struct EventGetVersion_V2 : public Event {
TransactionPriorityType priorityType{ TransactionPriorityType::UNSET };
void logEvent(std::string id, int maxFieldLength) const {
TraceEvent("TransactionTrace_GetVersion")
.detail("TransactionID", id)
.detail("Latency", latency)
.detail("PriorityType", priorityType);
TraceEvent event("TransactionTrace_GetVersion");
event.detail("TransactionID", id).detail("Latency", latency).detail("PriorityType", priorityType);
augmentTraceEvent(event);
}
};
@ -110,8 +116,9 @@ struct EventGetVersion_V3 : public Event {
const Optional<Standalone<StringRef>>& dcId,
double lat,
TransactionPriority priority,
Version version)
: Event(EventType::GET_VERSION_LATENCY, ts, dcId), latency(lat), readVersion(version) {
Version version,
const Optional<TenantName>& tenant)
: Event(EventType::GET_VERSION_LATENCY, ts, dcId, tenant), latency(lat), readVersion(version) {
switch (priority) {
// Unfortunately, the enum serialized here disagrees with the enum used elsewhere for the values used by each
// priority
@ -143,17 +150,23 @@ struct EventGetVersion_V3 : public Event {
Version readVersion;
void logEvent(std::string id, int maxFieldLength) const {
TraceEvent("TransactionTrace_GetVersion")
.detail("TransactionID", id)
TraceEvent event("TransactionTrace_GetVersion");
event.detail("TransactionID", id)
.detail("Latency", latency)
.detail("PriorityType", priorityType)
.detail("ReadVersion", readVersion);
augmentTraceEvent(event);
}
};
struct EventGet : public Event {
EventGet(double ts, const Optional<Standalone<StringRef>>& dcId, double lat, int size, const KeyRef& in_key)
: Event(EventType::GET_LATENCY, ts, dcId), latency(lat), valueSize(size), key(in_key) {}
EventGet(double ts,
const Optional<Standalone<StringRef>>& dcId,
double lat,
int size,
const KeyRef& in_key,
const Optional<TenantName>& tenant)
: Event(EventType::GET_LATENCY, ts, dcId, tenant), latency(lat), valueSize(size), key(in_key) {}
EventGet() {}
template <typename Ar>
@ -169,13 +182,14 @@ struct EventGet : public Event {
Key key;
void logEvent(std::string id, int maxFieldLength) const {
TraceEvent("TransactionTrace_Get")
.setMaxEventLength(-1)
TraceEvent event("TransactionTrace_Get");
event.setMaxEventLength(-1)
.detail("TransactionID", id)
.detail("Latency", latency)
.detail("ValueSizeBytes", valueSize)
.setMaxFieldLength(maxFieldLength)
.detail("Key", key);
augmentTraceEvent(event);
}
};
@ -185,8 +199,9 @@ struct EventGetRange : public Event {
double lat,
int size,
const KeyRef& start_key,
const KeyRef& end_key)
: Event(EventType::GET_RANGE_LATENCY, ts, dcId), latency(lat), rangeSize(size), startKey(start_key),
const KeyRef& end_key,
const Optional<TenantName>& tenant)
: Event(EventType::GET_RANGE_LATENCY, ts, dcId, tenant), latency(lat), rangeSize(size), startKey(start_key),
endKey(end_key) {}
EventGetRange() {}
@ -204,14 +219,15 @@ struct EventGetRange : public Event {
Key endKey;
void logEvent(std::string id, int maxFieldLength) const {
TraceEvent("TransactionTrace_GetRange")
.setMaxEventLength(-1)
TraceEvent event("TransactionTrace_GetRange");
event.setMaxEventLength(-1)
.detail("TransactionID", id)
.detail("Latency", latency)
.detail("RangeSizeBytes", rangeSize)
.setMaxFieldLength(maxFieldLength)
.detail("StartKey", startKey)
.detail("EndKey", endKey);
augmentTraceEvent(event);
}
};
@ -234,36 +250,40 @@ struct EventCommit : public Event {
void logEvent(std::string id, int maxFieldLength) const {
for (auto& read_range : req.transaction.read_conflict_ranges) {
TraceEvent("TransactionTrace_Commit_ReadConflictRange")
.setMaxEventLength(-1)
TraceEvent ev1("TransactionTrace_Commit_ReadConflictRange");
ev1.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Begin", read_range.begin)
.detail("End", read_range.end);
augmentTraceEvent(ev1);
}
for (auto& write_range : req.transaction.write_conflict_ranges) {
TraceEvent("TransactionTrace_Commit_WriteConflictRange")
.setMaxEventLength(-1)
TraceEvent ev2("TransactionTrace_Commit_WriteConflictRange");
ev2.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Begin", write_range.begin)
.detail("End", write_range.end);
augmentTraceEvent(ev2);
}
for (auto& mutation : req.transaction.mutations) {
TraceEvent("TransactionTrace_Commit_Mutation")
.setMaxEventLength(-1)
TraceEvent ev3("TransactionTrace_Commit_Mutation");
ev3.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Mutation", mutation);
augmentTraceEvent(ev3);
}
TraceEvent("TransactionTrace_Commit")
.detail("TransactionID", id)
TraceEvent ev4("TransactionTrace_Commit");
ev4.detail("TransactionID", id)
.detail("Latency", latency)
.detail("NumMutations", numMutations)
.detail("CommitSizeBytes", commitBytes);
augmentTraceEvent(ev4);
}
};
@ -275,8 +295,9 @@ struct EventCommit_V2 : public Event {
int mut,
int bytes,
Version version,
const CommitTransactionRequest& commit_req)
: Event(EventType::COMMIT_LATENCY, ts, dcId), latency(lat), numMutations(mut), commitBytes(bytes),
const CommitTransactionRequest& commit_req,
const Optional<TenantName>& tenant)
: Event(EventType::COMMIT_LATENCY, ts, dcId, tenant), latency(lat), numMutations(mut), commitBytes(bytes),
commitVersion(version), req(commit_req) {}
EventCommit_V2() {}
@ -298,43 +319,51 @@ struct EventCommit_V2 : public Event {
void logEvent(std::string id, int maxFieldLength) const {
for (auto& read_range : req.transaction.read_conflict_ranges) {
TraceEvent("TransactionTrace_Commit_ReadConflictRange")
.setMaxEventLength(-1)
TraceEvent ev1("TransactionTrace_Commit_ReadConflictRange");
ev1.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Begin", read_range.begin)
.detail("End", read_range.end);
augmentTraceEvent(ev1);
}
for (auto& write_range : req.transaction.write_conflict_ranges) {
TraceEvent("TransactionTrace_Commit_WriteConflictRange")
.setMaxEventLength(-1)
TraceEvent ev2("TransactionTrace_Commit_WriteConflictRange");
ev2.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Begin", write_range.begin)
.detail("End", write_range.end);
augmentTraceEvent(ev2);
}
for (auto& mutation : req.transaction.mutations) {
TraceEvent("TransactionTrace_Commit_Mutation")
.setMaxEventLength(-1)
TraceEvent ev3("TransactionTrace_Commit_Mutation");
ev3.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Mutation", mutation);
augmentTraceEvent(ev3);
}
TraceEvent("TransactionTrace_Commit")
.detail("TransactionID", id)
TraceEvent ev4("TransactionTrace_Commit");
ev4.detail("TransactionID", id)
.detail("CommitVersion", commitVersion)
.detail("Latency", latency)
.detail("NumMutations", numMutations)
.detail("CommitSizeBytes", commitBytes);
augmentTraceEvent(ev4);
}
};
struct EventGetError : public Event {
EventGetError(double ts, const Optional<Standalone<StringRef>>& dcId, int err_code, const KeyRef& in_key)
: Event(EventType::ERROR_GET, ts, dcId), errCode(err_code), key(in_key) {}
EventGetError(double ts,
const Optional<Standalone<StringRef>>& dcId,
int err_code,
const KeyRef& in_key,
const Optional<TenantName>& tenant)
: Event(EventType::ERROR_GET, ts, dcId, tenant), errCode(err_code), key(in_key) {}
EventGetError() {}
template <typename Ar>
@ -349,12 +378,13 @@ struct EventGetError : public Event {
Key key;
void logEvent(std::string id, int maxFieldLength) const {
TraceEvent("TransactionTrace_GetError")
.setMaxEventLength(-1)
TraceEvent event("TransactionTrace_GetError");
event.setMaxEventLength(-1)
.detail("TransactionID", id)
.detail("ErrCode", errCode)
.setMaxFieldLength(maxFieldLength)
.detail("Key", key);
augmentTraceEvent(event);
}
};
@ -363,8 +393,9 @@ struct EventGetRangeError : public Event {
const Optional<Standalone<StringRef>>& dcId,
int err_code,
const KeyRef& start_key,
const KeyRef& end_key)
: Event(EventType::ERROR_GET_RANGE, ts, dcId), errCode(err_code), startKey(start_key), endKey(end_key) {}
const KeyRef& end_key,
const Optional<TenantName>& tenant)
: Event(EventType::ERROR_GET_RANGE, ts, dcId, tenant), errCode(err_code), startKey(start_key), endKey(end_key) {}
EventGetRangeError() {}
template <typename Ar>
@ -380,13 +411,14 @@ struct EventGetRangeError : public Event {
Key endKey;
void logEvent(std::string id, int maxFieldLength) const {
TraceEvent("TransactionTrace_GetRangeError")
.setMaxEventLength(-1)
TraceEvent event("TransactionTrace_GetRangeError");
event.setMaxEventLength(-1)
.detail("TransactionID", id)
.detail("ErrCode", errCode)
.setMaxFieldLength(maxFieldLength)
.detail("StartKey", startKey)
.detail("EndKey", endKey);
augmentTraceEvent(event);
}
};
@ -394,8 +426,9 @@ struct EventCommitError : public Event {
EventCommitError(double ts,
const Optional<Standalone<StringRef>>& dcId,
int err_code,
const CommitTransactionRequest& commit_req)
: Event(EventType::ERROR_COMMIT, ts, dcId), errCode(err_code), req(commit_req) {}
const CommitTransactionRequest& commit_req,
const Optional<TenantName>& tenant)
: Event(EventType::ERROR_COMMIT, ts, dcId, tenant), errCode(err_code), req(commit_req) {}
EventCommitError() {}
template <typename Ar>
@ -412,32 +445,37 @@ struct EventCommitError : public Event {
void logEvent(std::string id, int maxFieldLength) const {
for (auto& read_range : req.transaction.read_conflict_ranges) {
TraceEvent("TransactionTrace_CommitError_ReadConflictRange")
.setMaxEventLength(-1)
TraceEvent ev1("TransactionTrace_CommitError_ReadConflictRange");
ev1.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Begin", read_range.begin)
.detail("End", read_range.end);
augmentTraceEvent(ev1);
}
for (auto& write_range : req.transaction.write_conflict_ranges) {
TraceEvent("TransactionTrace_CommitError_WriteConflictRange")
.setMaxEventLength(-1)
TraceEvent ev2("TransactionTrace_CommitError_WriteConflictRange");
ev2.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Begin", write_range.begin)
.detail("End", write_range.end);
augmentTraceEvent(ev2);
}
for (auto& mutation : req.transaction.mutations) {
TraceEvent("TransactionTrace_CommitError_Mutation")
.setMaxEventLength(-1)
TraceEvent ev3("TransactionTrace_CommitError_Mutation");
ev3.setMaxEventLength(-1)
.detail("TransactionID", id)
.setMaxFieldLength(maxFieldLength)
.detail("Mutation", mutation);
augmentTraceEvent(ev3);
}
TraceEvent("TransactionTrace_CommitError").detail("TransactionID", id).detail("ErrCode", errCode);
TraceEvent ev4("TransactionTrace_CommitError");
ev4.detail("TransactionID", id).detail("ErrCode", errCode);
augmentTraceEvent(ev4);
}
};
} // namespace FdbClientLogEvents

View File

@ -171,9 +171,8 @@ struct CommitTransactionRequest : TimedRequest {
TenantInfo tenantInfo;
CommitTransactionRequest() : CommitTransactionRequest(TenantInfo(), SpanID()) {}
CommitTransactionRequest(TenantInfo const& tenantInfo, SpanID const& context)
: spanContext(context), flags(0), tenantInfo(tenantInfo) {}
CommitTransactionRequest() : CommitTransactionRequest(SpanID()) {}
CommitTransactionRequest(SpanID const& context) : spanContext(context), flags(0) {}
template <class Ar>
void serialize(Ar& ar) {

View File

@ -134,6 +134,7 @@ public:
};
struct WatchParameters : public ReferenceCounted<WatchParameters> {
const TenantInfo tenant;
const Key key;
const Optional<Value> value;
@ -144,7 +145,8 @@ struct WatchParameters : public ReferenceCounted<WatchParameters> {
const Optional<UID> debugID;
const UseProvisionalProxies useProvisionalProxies;
WatchParameters(Key key,
WatchParameters(TenantInfo tenant,
Key key,
Optional<Value> value,
Version version,
TagSet tags,
@ -152,8 +154,8 @@ struct WatchParameters : public ReferenceCounted<WatchParameters> {
TaskPriority taskID,
Optional<UID> debugID,
UseProvisionalProxies useProvisionalProxies)
: key(key), value(value), version(version), tags(tags), spanID(spanID), taskID(taskID), debugID(debugID),
useProvisionalProxies(useProvisionalProxies) {}
: tenant(tenant), key(key), value(value), version(version), tags(tags), spanID(spanID), taskID(taskID),
debugID(debugID), useProvisionalProxies(useProvisionalProxies) {}
};
class WatchMetadata : public ReferenceCounted<WatchMetadata> {
@ -204,6 +206,16 @@ struct EndpointFailureInfo {
double lastRefreshTime = 0;
};
struct KeyRangeLocationInfo {
TenantMapEntry tenantEntry;
KeyRange range;
Reference<LocationInfo> locations;
KeyRangeLocationInfo() {}
KeyRangeLocationInfo(TenantMapEntry tenantEntry, KeyRange range, Reference<LocationInfo> locations)
: tenantEntry(tenantEntry), range(range), locations(locations) {}
};
class DatabaseContext : public ReferenceCounted<DatabaseContext>, public FastAllocated<DatabaseContext>, NonCopyable {
public:
static DatabaseContext* allocateOnForeignThread() {
@ -238,14 +250,22 @@ public:
switchable));
}
std::pair<KeyRange, Reference<LocationInfo>> getCachedLocation(const KeyRef&, Reverse isBackward = Reverse::False);
bool getCachedLocations(const KeyRangeRef&,
std::vector<std::pair<KeyRange, Reference<LocationInfo>>>&,
Optional<KeyRangeLocationInfo> getCachedLocation(const Optional<TenantName>& tenant,
const KeyRef&,
Reverse isBackward = Reverse::False);
bool getCachedLocations(const Optional<TenantName>& tenant,
const KeyRangeRef&,
std::vector<KeyRangeLocationInfo>&,
int limit,
Reverse reverse);
Reference<LocationInfo> setCachedLocation(const KeyRangeRef&, const std::vector<struct StorageServerInterface>&);
void invalidateCache(const KeyRef&, Reverse isBackward = Reverse::False);
void invalidateCache(const KeyRangeRef&);
void cacheTenant(const TenantName& tenant, const TenantMapEntry& tenantEntry);
Reference<LocationInfo> setCachedLocation(const Optional<TenantName>& tenant,
const TenantMapEntry& tenantEntry,
const KeyRangeRef&,
const std::vector<struct StorageServerInterface>&);
void invalidateCachedTenant(const TenantNameRef& tenant);
void invalidateCache(const KeyRef& tenantPrefix, const KeyRef& key, Reverse isBackward = Reverse::False);
void invalidateCache(const KeyRef& tenantPrefix, const KeyRangeRef& keys);
// Records that `endpoint` is failed on a healthy server.
void setFailedEndpointOnHealthyServer(const Endpoint& endpoint);
@ -288,9 +308,9 @@ public:
void removeWatch();
// watch map operations
Reference<WatchMetadata> getWatchMetadata(KeyRef key) const;
Key setWatchMetadata(Reference<WatchMetadata> metadata);
void deleteWatchMetadata(KeyRef key);
Reference<WatchMetadata> getWatchMetadata(int64_t tenantId, KeyRef key) const;
void setWatchMetadata(Reference<WatchMetadata> metadata);
void deleteWatchMetadata(int64_t tenant, KeyRef key);
void clearWatchMetadata();
void setOption(FDBDatabaseOptions::Option option, Optional<StringRef> value);
@ -408,8 +428,10 @@ public:
// Cache of location information
int locationCacheSize;
int tenantCacheSize;
CoalescedKeyRangeMap<Reference<LocationInfo>> locationCache;
std::unordered_map<Endpoint, EndpointFailureInfo> failedEndpointsOnHealthyServersInfo;
std::unordered_map<TenantName, TenantMapEntry> tenantCache;
std::map<UID, StorageServerInfo*> server_interf;
std::map<UID, BlobWorkerInterface> blobWorker_interf; // blob workers don't change endpoints for the same ID
@ -564,7 +586,8 @@ public:
EventCacheHolder connectToDatabaseEventCacheHolder;
private:
std::unordered_map<Key, Reference<WatchMetadata>> watchMap;
std::unordered_map<std::pair<int64_t, Key>, Reference<WatchMetadata>, boost::hash<std::pair<int64_t, Key>>>
watchMap;
};
#endif
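The watch map is now keyed by (tenant id, key); below is a minimal lookup sketch, assuming the declaration above. boost::hash is needed because std::unordered_map has no built-in hash for std::pair; the helper name is illustrative.
using WatchMap = std::unordered_map<std::pair<int64_t, Key>,
                                    Reference<WatchMetadata>,
                                    boost::hash<std::pair<int64_t, Key>>>;
Reference<WatchMetadata> lookupWatch(const WatchMap& watchMap, int64_t tenantId, KeyRef key) {
	// Watches are scoped per tenant, so the same key in two tenants gets two entries.
	auto it = watchMap.find(std::make_pair(tenantId, Key(key)));
	return it == watchMap.end() ? Reference<WatchMetadata>() : it->second;
}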

View File

@ -37,6 +37,9 @@ the contents of the system key space.
#include "fdbclient/ClientBooleanParams.h"
#include "fdbclient/DatabaseConfiguration.h"
#include "fdbclient/Status.h"
#include "fdbclient/Subspace.h"
#include "fdbclient/DatabaseConfiguration.h"
#include "fdbclient/Status.h"
#include "fdbclient/SystemData.h"
#include "flow/actorcompiler.h" // has to be last include
@ -626,6 +629,231 @@ Future<ConfigurationResult> changeConfig(Reference<DB> db,
// used by special keys and fdbcli
std::string generateErrorMessage(const CoordinatorsResult& res);
ACTOR template <class Transaction>
Future<Optional<TenantMapEntry>> tryGetTenantTransaction(Transaction tr, TenantName name) {
state Key tenantMapKey = name.withPrefix(tenantMapPrefix);
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
Optional<Value> val = wait(safeThreadFutureToFuture(tr->get(tenantMapKey)));
return val.map<TenantMapEntry>([](Optional<Value> v) { return decodeTenantEntry(v.get()); });
}
ACTOR template <class DB>
Future<Optional<TenantMapEntry>> tryGetTenant(Reference<DB> db, TenantName name) {
state Reference<typename DB::TransactionT> tr = db->createTransaction();
loop {
try {
Optional<TenantMapEntry> entry = wait(tryGetTenantTransaction(tr, name));
return entry;
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
ACTOR template <class Transaction>
Future<TenantMapEntry> getTenantTransaction(Transaction tr, TenantName name) {
Optional<TenantMapEntry> entry = wait(tryGetTenantTransaction(tr, name));
if (!entry.present()) {
throw tenant_not_found();
}
return entry.get();
}
ACTOR template <class DB>
Future<TenantMapEntry> getTenant(Reference<DB> db, TenantName name) {
Optional<TenantMapEntry> entry = wait(tryGetTenant(db, name));
if (!entry.present()) {
throw tenant_not_found();
}
return entry.get();
}
// Creates a tenant with the given name. If the tenant already exists, an empty optional will be returned.
ACTOR template <class Transaction>
Future<Optional<TenantMapEntry>> createTenantTransaction(Transaction tr, TenantNameRef name) {
state Key tenantMapKey = name.withPrefix(tenantMapPrefix);
if (name.startsWith("\xff"_sr)) {
throw invalid_tenant_name();
}
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
state Future<Optional<TenantMapEntry>> tenantEntryFuture = tryGetTenantTransaction(tr, name);
state Future<Optional<Value>> tenantDataPrefixFuture = safeThreadFutureToFuture(tr->get(tenantDataPrefixKey));
state Future<Optional<Value>> lastIdFuture = safeThreadFutureToFuture(tr->get(tenantLastIdKey));
Optional<Value> tenantMode = wait(safeThreadFutureToFuture(tr->get(configKeysPrefix.withSuffix("tenant_mode"_sr))));
if (!tenantMode.present() || tenantMode.get() == StringRef(format("%d", TenantMode::DISABLED))) {
throw tenants_disabled();
}
Optional<TenantMapEntry> tenantEntry = wait(tenantEntryFuture);
if (tenantEntry.present()) {
return Optional<TenantMapEntry>();
}
state Optional<Value> lastIdVal = wait(lastIdFuture);
Optional<Value> tenantDataPrefix = wait(tenantDataPrefixFuture);
state TenantMapEntry newTenant(lastIdVal.present() ? TenantMapEntry::prefixToId(lastIdVal.get()) + 1 : 0,
tenantDataPrefix.present() ? (KeyRef)tenantDataPrefix.get() : ""_sr);
RangeResult contents = wait(safeThreadFutureToFuture(tr->getRange(prefixRange(newTenant.prefix), 1)));
if (!contents.empty()) {
throw tenant_prefix_allocator_conflict();
}
tr->set(tenantLastIdKey, TenantMapEntry::idToPrefix(newTenant.id));
tr->set(tenantMapKey, encodeTenantEntry(newTenant));
return newTenant;
}
ACTOR template <class DB>
Future<Void> createTenant(Reference<DB> db, TenantName name) {
state Reference<typename DB::TransactionT> tr = db->createTransaction();
state bool firstTry = true;
loop {
try {
if (firstTry) {
Optional<TenantMapEntry> entry = wait(tryGetTenantTransaction(tr, name));
if (entry.present()) {
throw tenant_already_exists();
}
firstTry = false;
}
state Optional<TenantMapEntry> newTenant = wait(createTenantTransaction(tr, name));
if (BUGGIFY) {
throw commit_unknown_result();
}
wait(safeThreadFutureToFuture(tr->commit()));
if (BUGGIFY) {
throw commit_unknown_result();
}
TraceEvent("CreatedTenant")
.detail("Tenant", name)
.detail("TenantId", newTenant.present() ? newTenant.get().id : -1)
.detail("Prefix", newTenant.present() ? (StringRef)newTenant.get().prefix : "Unknown"_sr)
.detail("Version", tr->getCommittedVersion());
return Void();
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
ACTOR template <class Transaction>
Future<Void> deleteTenantTransaction(Transaction tr, TenantNameRef name) {
state Key tenantMapKey = name.withPrefix(tenantMapPrefix);
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
state Optional<TenantMapEntry> tenantEntry = wait(tryGetTenantTransaction(tr, name));
if (!tenantEntry.present()) {
return Void();
}
RangeResult contents = wait(safeThreadFutureToFuture(tr->getRange(prefixRange(tenantEntry.get().prefix), 1)));
if (!contents.empty()) {
throw tenant_not_empty();
}
tr->clear(tenantMapKey);
return Void();
}
ACTOR template <class DB>
Future<Void> deleteTenant(Reference<DB> db, TenantName name) {
state Reference<typename DB::TransactionT> tr = db->createTransaction();
state bool firstTry = true;
loop {
try {
if (firstTry) {
Optional<TenantMapEntry> entry = wait(tryGetTenantTransaction(tr, name));
if (!entry.present()) {
throw tenant_not_found();
}
firstTry = false;
}
wait(deleteTenantTransaction(tr, name));
if (BUGGIFY) {
throw commit_unknown_result();
}
wait(safeThreadFutureToFuture(tr->commit()));
if (BUGGIFY) {
throw commit_unknown_result();
}
TraceEvent("DeletedTenant").detail("Tenant", name).detail("Version", tr->getCommittedVersion());
return Void();
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
ACTOR template <class Transaction>
Future<std::map<TenantName, TenantMapEntry>> listTenantsTransaction(Transaction tr,
TenantNameRef begin,
TenantNameRef end,
int limit) {
state KeyRange range = KeyRangeRef(begin, end).withPrefix(tenantMapPrefix);
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
RangeResult results = wait(safeThreadFutureToFuture(
tr->getRange(firstGreaterOrEqual(range.begin), firstGreaterOrEqual(range.end), limit)));
std::map<TenantName, TenantMapEntry> tenants;
for (auto kv : results) {
tenants[kv.key.removePrefix(tenantMapPrefix)] = decodeTenantEntry(kv.value);
}
return tenants;
}
ACTOR template <class DB>
Future<std::map<TenantName, TenantMapEntry>> listTenants(Reference<DB> db,
TenantName begin,
TenantName end,
int limit) {
state Reference<typename DB::TransactionT> tr = db->createTransaction();
loop {
try {
std::map<TenantName, TenantMapEntry> tenants = wait(listTenantsTransaction(tr, begin, end, limit));
return tenants;
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
}
} // namespace ManagementAPI
#include "flow/unactorcompiler.h"

View File

@ -24,6 +24,7 @@
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/Tenant.h"
#include "flow/ThreadHelper.actor.h"
@ -109,6 +110,18 @@ public:
// Only if it's a MultiVersionTransaction and the underlying transaction handler is null,
// it will return false
virtual bool isValid() { return true; }
virtual Optional<TenantName> getTenant() = 0;
};
class ITenant {
public:
virtual ~ITenant() {}
virtual Reference<ITransaction> createTransaction() = 0;
virtual void addref() = 0;
virtual void delref() = 0;
};
// An interface that represents a connection to a cluster made by a client
@ -116,6 +129,7 @@ class IDatabase {
public:
virtual ~IDatabase() {}
virtual Reference<ITenant> openTenant(TenantNameRef tenantName) = 0;
virtual Reference<ITransaction> createTransaction() = 0;
virtual void setOption(FDBDatabaseOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) = 0;
virtual double getMainThreadBusyness() = 0;
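Taken together, a client layer can obtain a tenant-scoped transaction through these interfaces alone; a sketch with a made-up helper name, where `db` may be any IDatabase implementation:
Reference<ITransaction> makeTenantTransaction(Reference<IDatabase> db, TenantNameRef name) {
	Reference<ITenant> tenant = db->openTenant(name); // keyspace-scoped handle
	return tenant->createTransaction(); // operations stay inside the tenant's prefix
}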

View File

@ -48,6 +48,21 @@ Reference<ISingleThreadTransaction> ISingleThreadTransaction::create(Type type,
} else {
result = makeReference<PaxosConfigTransaction>();
}
result->setDatabase(cx);
result->construct(cx);
return result;
}
Reference<ISingleThreadTransaction> ISingleThreadTransaction::create(Type type,
Database const& cx,
TenantName const& tenant) {
Reference<ISingleThreadTransaction> result;
if (type == Type::RYW) {
result = makeReference<ReadYourWritesTransaction>();
} else if (type == Type::SIMPLE_CONFIG) {
result = makeReference<SimpleConfigTransaction>();
} else {
result = makeReference<PaxosConfigTransaction>();
}
result->construct(cx, tenant);
return result;
}

View File

@ -45,8 +45,15 @@ public:
};
static ISingleThreadTransaction* allocateOnForeignThread(Type);
static Reference<ISingleThreadTransaction> create(Type, Database const&);
virtual void setDatabase(Database const&) = 0;
static Reference<ISingleThreadTransaction> create(Type, Database const&, TenantName const&);
virtual void construct(Database const&) = 0;
virtual void construct(Database const&, TenantName const&) {
// By default, a transaction implementation does not support tenants.
ASSERT(false);
}
virtual void setVersion(Version v) = 0;
virtual Future<Version> getReadVersion() = 0;

View File

@ -18,7 +18,9 @@
* limitations under the License.
*/
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/GenericManagementAPI.actor.h"
#include "fdbclient/MultiVersionTransaction.h"
#include "fdbclient/MultiVersionAssignmentVars.h"
#include "fdbclient/ClientVersion.h"
@ -382,6 +384,15 @@ void DLTransaction::reset() {
api->transactionReset(tr);
}
// DLTenant
Reference<ITransaction> DLTenant::createTransaction() {
ASSERT(api->tenantCreateTransaction != nullptr);
FdbCApi::FDBTransaction* tr;
api->tenantCreateTransaction(tenant, &tr);
return Reference<ITransaction>(new DLTransaction(api, tr));
}
// DLDatabase
DLDatabase::DLDatabase(Reference<FdbCApi> api, ThreadFuture<FdbCApi::FDBDatabase*> dbFuture) : api(api), db(nullptr) {
addref();
@ -401,9 +412,19 @@ ThreadFuture<Void> DLDatabase::onReady() {
return ready;
}
Reference<ITenant> DLDatabase::openTenant(TenantNameRef tenantName) {
if (!api->databaseOpenTenant) {
throw unsupported_operation();
}
FdbCApi::FDBTenant* tenant;
throwIfError(api->databaseOpenTenant(db, tenantName.begin(), tenantName.size(), &tenant));
return makeReference<DLTenant>(api, tenant);
}
Reference<ITransaction> DLDatabase::createTransaction() {
FdbCApi::FDBTransaction* tr;
api->databaseCreateTransaction(db, &tr);
throwIfError(api->databaseCreateTransaction(db, &tr));
return Reference<ITransaction>(new DLTransaction(api, tr));
}
@ -535,6 +556,7 @@ void DLApi::init() {
loadClientFunction(&api->stopNetwork, lib, fdbCPath, "fdb_stop_network", headerVersion >= 0);
loadClientFunction(&api->createDatabase, lib, fdbCPath, "fdb_create_database", headerVersion >= 610);
loadClientFunction(&api->databaseOpenTenant, lib, fdbCPath, "fdb_database_open_tenant", headerVersion >= 710);
loadClientFunction(
&api->databaseCreateTransaction, lib, fdbCPath, "fdb_database_create_transaction", headerVersion >= 0);
loadClientFunction(&api->databaseSetOption, lib, fdbCPath, "fdb_database_set_option", headerVersion >= 0);
@ -555,6 +577,10 @@ void DLApi::init() {
loadClientFunction(
&api->databaseCreateSnapshot, lib, fdbCPath, "fdb_database_create_snapshot", headerVersion >= 700);
loadClientFunction(
&api->tenantCreateTransaction, lib, fdbCPath, "fdb_tenant_create_transaction", headerVersion >= 710);
loadClientFunction(&api->tenantDestroy, lib, fdbCPath, "fdb_tenant_destroy", headerVersion >= 710);
loadClientFunction(&api->transactionSetOption, lib, fdbCPath, "fdb_transaction_set_option", headerVersion >= 0);
loadClientFunction(&api->transactionDestroy, lib, fdbCPath, "fdb_transaction_destroy", headerVersion >= 0);
loadClientFunction(
@ -751,8 +777,9 @@ void DLApi::addNetworkThreadCompletionHook(void (*hook)(void*), void* hookParame
// MultiVersionTransaction
MultiVersionTransaction::MultiVersionTransaction(Reference<MultiVersionDatabase> db,
Optional<Reference<MultiVersionTenant>> tenant,
UniqueOrderedOptionList<FDBTransactionOptions> defaultOptions)
: db(db), startTime(timer_monotonic()), timeoutTsav(new ThreadSingleAssignmentVar<Void>()) {
: db(db), tenant(tenant), startTime(timer_monotonic()), timeoutTsav(new ThreadSingleAssignmentVar<Void>()) {
setDefaultOptions(defaultOptions);
updateTransaction();
}
@ -763,18 +790,29 @@ void MultiVersionTransaction::setDefaultOptions(UniqueOrderedOptionList<FDBTrans
}
void MultiVersionTransaction::updateTransaction() {
auto currentDb = db->dbState->dbVar->get();
TransactionInfo newTr;
if (currentDb.value) {
newTr.transaction = currentDb.value->createTransaction();
if (tenant.present()) {
ASSERT(tenant.get());
auto currentTenant = tenant.get()->tenantVar->get();
if (currentTenant.value) {
newTr.transaction = currentTenant.value->createTransaction();
}
newTr.onChange = currentTenant.onChange;
} else {
auto currentDb = db->dbState->dbVar->get();
if (currentDb.value) {
newTr.transaction = currentDb.value->createTransaction();
}
newTr.onChange = currentDb.onChange;
}
Optional<StringRef> timeout;
for (auto option : persistentOptions) {
if (option.first == FDBTransactionOptions::TIMEOUT) {
timeout = option.second.castTo<StringRef>();
} else if (currentDb.value) {
} else if (newTr.transaction) {
newTr.transaction->setOption(option.first, option.second.castTo<StringRef>());
}
}
@ -784,13 +822,11 @@ void MultiVersionTransaction::updateTransaction() {
// that might inadvertently fail the transaction.
if (timeout.present()) {
setTimeout(timeout);
if (currentDb.value) {
if (newTr.transaction) {
newTr.transaction->setOption(FDBTransactionOptions::TIMEOUT, timeout);
}
}
newTr.onChange = currentDb.onChange;
lock.enter();
transaction = newTr;
lock.leave();
@ -1055,6 +1091,14 @@ ThreadFuture<Void> MultiVersionTransaction::onError(Error const& e) {
}
}
Optional<TenantName> MultiVersionTransaction::getTenant() {
if (tenant.present()) {
return tenant.get()->tenantName;
} else {
return Optional<TenantName>();
}
}
// Waits for the specified duration and signals the assignment variable with a timed out error
// This will be canceled if a new timeout is set, in which case the tsav will not be signaled.
ACTOR Future<Void> timeoutImpl(Reference<ThreadSingleAssignmentVar<Void>> tsav, double duration) {
@ -1181,6 +1225,39 @@ bool MultiVersionTransaction::isValid() {
return tr.transaction.isValid();
}
// MultiVersionTenant
MultiVersionTenant::MultiVersionTenant(Reference<MultiVersionDatabase> db, StringRef tenantName)
: tenantVar(new ThreadSafeAsyncVar<Reference<ITenant>>(Reference<ITenant>(nullptr))), tenantName(tenantName), db(db) {
updateTenant();
}
MultiVersionTenant::~MultiVersionTenant() {}
Reference<ITransaction> MultiVersionTenant::createTransaction() {
return Reference<ITransaction>(new MultiVersionTransaction(
db, Reference<MultiVersionTenant>::addRef(this), db->dbState->transactionDefaultOptions));
}
// Creates a new underlying tenant object whenever the database connection changes. This change is signaled
// to open transactions via an AsyncVar.
void MultiVersionTenant::updateTenant() {
Reference<ITenant> tenant;
auto currentDb = db->dbState->dbVar->get();
if (currentDb.value) {
tenant = currentDb.value->openTenant(tenantName);
} else {
tenant = Reference<ITenant>(nullptr);
}
tenantVar->set(tenant);
MutexHolder holder(tenantLock);
tenantUpdater = mapThreadFuture<Void, Void>(currentDb.onChange, [this](ErrorOr<Void> result) {
updateTenant();
return Void();
});
}
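A sketch of how a consumer observes the tenant variable published above, mirroring the pattern in MultiVersionTransaction::updateTransaction; the function name is illustrative.
Reference<ITransaction> sketchCreateFromTenantVar(
    Reference<ThreadSafeAsyncVar<Reference<ITenant>>> tenantVar) {
	auto current = tenantVar->get(); // returns the current value plus an onChange future
	if (current.value) {
		return current.value->createTransaction();
	}
	// current.onChange becomes ready when updateTenant() publishes a new
	// ITenant; callers subscribe to it and rebuild their wrapped transaction.
	return Reference<ITransaction>();
}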
// MultiVersionDatabase
MultiVersionDatabase::MultiVersionDatabase(MultiVersionApi* api,
int threadIdx,
@ -1261,9 +1338,14 @@ Reference<IDatabase> MultiVersionDatabase::debugCreateFromExistingDatabase(Refer
return Reference<IDatabase>(new MultiVersionDatabase(MultiVersionApi::api, 0, "", db, db, false));
}
Reference<ITenant> MultiVersionDatabase::openTenant(TenantNameRef tenantName) {
return makeReference<MultiVersionTenant>(Reference<MultiVersionDatabase>::addRef(this), tenantName);
}
Reference<ITransaction> MultiVersionDatabase::createTransaction() {
return Reference<ITransaction>(
new MultiVersionTransaction(Reference<MultiVersionDatabase>::addRef(this), dbState->transactionDefaultOptions));
return Reference<ITransaction>(new MultiVersionTransaction(Reference<MultiVersionDatabase>::addRef(this),
Optional<Reference<MultiVersionTenant>>(),
dbState->transactionDefaultOptions));
}
void MultiVersionDatabase::setOption(FDBDatabaseOptions::Option option, Optional<StringRef> value) {

View File

@ -36,6 +36,7 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
typedef struct FDB_result FDBResult;
typedef struct FDB_cluster FDBCluster;
typedef struct FDB_database FDBDatabase;
typedef struct FDB_tenant FDBTenant;
typedef struct FDB_transaction FDBTransaction;
typedef int fdb_error_t;
@ -120,6 +121,10 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
fdb_error_t (*createDatabase)(const char* clusterFilePath, FDBDatabase** db);
// Database
fdb_error_t (*databaseOpenTenant)(FDBDatabase* database,
uint8_t const* tenantName,
int tenantNameLength,
FDBTenant** outTenant);
fdb_error_t (*databaseCreateTransaction)(FDBDatabase* database, FDBTransaction** tr);
fdb_error_t (*databaseSetOption)(FDBDatabase* database,
FDBDatabaseOption option,
@ -143,6 +148,10 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
double (*databaseGetMainThreadBusyness)(FDBDatabase* database);
FDBFuture* (*databaseGetServerProtocol)(FDBDatabase* database, uint64_t expectedVersion);
// Tenant
fdb_error_t (*tenantCreateTransaction)(FDBTenant* tenant, FDBTransaction** outTransaction);
void (*tenantDestroy)(FDBTenant* tenant);
// Transaction
fdb_error_t (*transactionSetOption)(FDBTransaction* tr,
FDBTransactionOption option,
@ -356,6 +365,11 @@ public:
ThreadFuture<Void> onError(Error const& e) override;
void reset() override;
Optional<TenantName> getTenant() override {
ASSERT(false);
throw internal_error();
}
void addref() override { ThreadSafeReferenceCounted<DLTransaction>::addref(); }
void delref() override { ThreadSafeReferenceCounted<DLTransaction>::delref(); }
@ -364,6 +378,25 @@ private:
FdbCApi::FDBTransaction* const tr;
};
class DLTenant : public ITenant, ThreadSafeReferenceCounted<DLTenant> {
public:
DLTenant(Reference<FdbCApi> api, FdbCApi::FDBTenant* tenant) : api(api), tenant(tenant) {}
~DLTenant() override {
if (tenant) {
api->tenantDestroy(tenant);
}
}
Reference<ITransaction> createTransaction() override;
void addref() override { ThreadSafeReferenceCounted<DLTenant>::addref(); }
void delref() override { ThreadSafeReferenceCounted<DLTenant>::delref(); }
private:
const Reference<FdbCApi> api;
FdbCApi::FDBTenant* tenant;
};
// An implementation of IDatabase that wraps a database object created on an externally loaded client library.
// All API calls to that database are routed through the external library.
class DLDatabase : public IDatabase, ThreadSafeReferenceCounted<DLDatabase> {
@ -378,6 +411,7 @@ public:
ThreadFuture<Void> onReady();
Reference<ITenant> openTenant(TenantNameRef tenantName) override;
Reference<ITransaction> createTransaction() override;
void setOption(FDBDatabaseOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
double getMainThreadBusyness() override;
@ -438,6 +472,7 @@ private:
};
class MultiVersionDatabase;
class MultiVersionTenant;
// An implementation of ITransaction that wraps a transaction created either locally or through a dynamically loaded
external client. When needed (e.g., on cluster version change), the MultiVersionTransaction can automatically replace
@ -445,6 +480,7 @@ class MultiVersionDatabase;
class MultiVersionTransaction : public ITransaction, ThreadSafeReferenceCounted<MultiVersionTransaction> {
public:
MultiVersionTransaction(Reference<MultiVersionDatabase> db,
Optional<Reference<MultiVersionTenant>> tenant,
UniqueOrderedOptionList<FDBTransactionOptions> defaultOptions);
~MultiVersionTransaction() override;
@ -513,6 +549,8 @@ public:
ThreadFuture<Void> onError(Error const& e) override;
void reset() override;
Optional<TenantName> getTenant() override;
void addref() override { ThreadSafeReferenceCounted<MultiVersionTransaction>::addref(); }
void delref() override { ThreadSafeReferenceCounted<MultiVersionTransaction>::delref(); }
@ -521,6 +559,7 @@ public:
private:
const Reference<MultiVersionDatabase> db;
const Optional<Reference<MultiVersionTenant>> tenant;
ThreadSpinLock lock;
struct TransactionInfo {
@ -561,6 +600,8 @@ private:
void setDefaultOptions(UniqueOrderedOptionList<FDBTransactionOptions> options);
std::vector<std::pair<FDBTransactionOptions::Option, Optional<Standalone<StringRef>>>> persistentOptions;
const Optional<TenantName> tenantName;
};
struct ClientDesc {
@ -591,6 +632,33 @@ struct ClientInfo : ClientDesc, ThreadSafeReferenceCounted<ClientInfo> {
class MultiVersionApi;
// An implementation of ITenant that wraps a tenant created either locally or through a dynamically loaded
// external client. The wrapped ITenant is automatically changed when the MultiVersionDatabase used to create
// it connects with a different version.
class MultiVersionTenant final : public ITenant, ThreadSafeReferenceCounted<MultiVersionTenant> {
public:
MultiVersionTenant(Reference<MultiVersionDatabase> db, StringRef tenantName);
~MultiVersionTenant() override;
Reference<ITransaction> createTransaction() override;
void addref() override { ThreadSafeReferenceCounted<MultiVersionTenant>::addref(); }
void delref() override { ThreadSafeReferenceCounted<MultiVersionTenant>::delref(); }
Reference<ThreadSafeAsyncVar<Reference<ITenant>>> tenantVar;
const Standalone<StringRef> tenantName;
private:
Reference<MultiVersionDatabase> db;
Mutex tenantLock;
ThreadFuture<Void> tenantUpdater;
// Creates a new underlying tenant object whenever the database connection changes. This change is signaled
// to open transactions via an AsyncVar.
void updateTenant();
};
// An implementation of IDatabase that wraps a database created either locally or through a dynamically loaded
// external client. The MultiVersionDatabase monitors the protocol version of the cluster and automatically
// replaces the wrapped database when the protocol version changes.
@ -605,6 +673,7 @@ public:
~MultiVersionDatabase() override;
Reference<ITenant> openTenant(TenantNameRef tenantName) override;
Reference<ITransaction> createTransaction() override;
void setOption(FDBDatabaseOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
double getMainThreadBusyness() override;

File diff suppressed because it is too large

View File

@ -159,6 +159,7 @@ struct TransactionOptions {
bool expensiveClearCostEstimation : 1;
bool useGrvCache : 1;
bool skipGrvCache : 1;
bool rawAccess : 1;
TransactionPriority priority;
@ -236,6 +237,8 @@ struct Watch : public ReferenceCounted<Watch>, NonCopyable {
struct TransactionState : ReferenceCounted<TransactionState> {
Database cx;
Optional<TenantName> tenant;
int64_t tenantId = TenantInfo::INVALID_TENANT;
Reference<TransactionLogInfo> trLogInfo;
TransactionOptions options;
@ -258,15 +261,19 @@ struct TransactionState : ReferenceCounted<TransactionState> {
// Only available so that Transaction can have a default constructor, for use in state variables
TransactionState(TaskPriority taskID, SpanID spanID) : taskID(taskID), spanID(spanID) {}
TransactionState(Database cx, TaskPriority taskID, SpanID spanID, Reference<TransactionLogInfo> trLogInfo)
: cx(cx), trLogInfo(trLogInfo), options(cx), taskID(taskID), spanID(spanID) {}
TransactionState(Database cx,
Optional<TenantName> tenant,
TaskPriority taskID,
SpanID spanID,
Reference<TransactionLogInfo> trLogInfo);
Reference<TransactionState> cloneAndReset(Reference<TransactionLogInfo> newTrLogInfo, bool generateNewSpan) const;
TenantInfo getTenantInfo() const;
};
class Transaction : NonCopyable {
public:
explicit Transaction(Database const& cx);
explicit Transaction(Database const& cx, Optional<TenantName> const& tenant = Optional<TenantName>());
~Transaction();
void setVersion(Version v);
@ -440,6 +447,8 @@ public:
return Standalone<VectorRef<KeyRangeRef>>(tr.transaction.write_conflict_ranges, tr.arena);
}
Optional<TenantName> getTenant() { return trState->tenant; }
Reference<TransactionState> trState;
std::vector<Reference<Watch>> watches;
Span span;
@ -481,6 +490,25 @@ int64_t extractIntOption(Optional<StringRef> value,
// states: coordinator, TLog and storage state
ACTOR Future<Void> snapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID);
// Adds necessary mutation(s) to the transaction so that *one* checkpoint will be created for
// each shard overlapping with `range`. Each checkpoint will be created at a random
// storage server for each shard.
// All checkpoint(s) will be created at the transaction's commit version.
Future<Void> createCheckpoint(Transaction* tr, KeyRangeRef range, CheckpointFormat format);
// Same as above.
Future<Void> createCheckpoint(Reference<ReadYourWritesTransaction> tr, KeyRangeRef range, CheckpointFormat format);
// Gets checkpoint metadata for `keys` at the specified version, in the specified format.
// One CheckpointMetaData will be returned for each distinct shard.
// The collective keyrange of the returned checkpoint(s) is a super-set of `keys`.
// A checkpoint_not_found() error will be returned if the requested checkpoint(s) cannot be found.
ACTOR Future<std::vector<CheckpointMetaData>> getCheckpointMetaData(Database cx,
KeyRange keys,
Version version,
CheckpointFormat format,
double timeout = 5.0);
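A sketch of driving this checkpoint API end to end, assuming the standard retry loop; the actor name is made up and error handling beyond onError() is elided.
ACTOR Future<std::vector<CheckpointMetaData>> checkpointExample(Database cx, KeyRange range) {
	state Transaction tr(cx);
	loop {
		try {
			// Request one checkpoint per shard overlapping `range`, taken at
			// this transaction's commit version.
			wait(createCheckpoint(&tr, range, RocksDBColumnFamily));
			wait(tr.commit());
			break;
		} catch (Error& e) {
			wait(tr.onError(e));
		}
	}
	// Fetch the metadata back once the checkpoints exist.
	std::vector<CheckpointMetaData> records =
	    wait(getCheckpointMetaData(cx, range, tr.getCommittedVersion(), RocksDBColumnFamily));
	return records;
}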
// Checks with Data Distributor that it is safe to mark all servers in exclusions as failed
ACTOR Future<bool> checkSafeExclusions(Database cx, std::vector<AddressExclusion> exclusions);

View File

@ -22,6 +22,8 @@
#include "fdbclient/PaxosConfigTransaction.h"
#include "flow/actorcompiler.h" // must be last include
using ConfigTransactionInfo = ModelInterface<ConfigTransactionInterface>;
class CommitQuorum {
ActorCollection actors{ false };
std::vector<ConfigTransactionInterface> ctis;
@ -224,10 +226,12 @@ class PaxosConfigTransactionImpl {
loop {
try {
ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration());
// TODO: Load balance
state Reference<ConfigTransactionInfo> configNodes(
new ConfigTransactionInfo(self->getGenerationQuorum.getReadReplicas(), false));
ConfigTransactionGetReply reply =
wait(timeoutError(self->getGenerationQuorum.getReadReplicas()[0].get.getReply(
ConfigTransactionGetRequest{ generation, configKey }),
wait(timeoutError(basicLoadBalance(configNodes,
&ConfigTransactionInterface::get,
ConfigTransactionGetRequest{ generation, configKey }),
CLIENT_KNOBS->GET_KNOB_TIMEOUT));
if (reply.value.present()) {
return reply.value.get().toValue();
@ -245,10 +249,12 @@ class PaxosConfigTransactionImpl {
ACTOR static Future<RangeResult> getConfigClasses(PaxosConfigTransactionImpl* self) {
ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration());
// TODO: Load balance
state Reference<ConfigTransactionInfo> configNodes(
new ConfigTransactionInfo(self->getGenerationQuorum.getReadReplicas(), false));
ConfigTransactionGetConfigClassesReply reply =
wait(retryBrokenPromise(self->getGenerationQuorum.getReadReplicas()[0].getClasses,
ConfigTransactionGetConfigClassesRequest{ generation }));
wait(basicLoadBalance(configNodes,
&ConfigTransactionInterface::getClasses,
ConfigTransactionGetConfigClassesRequest{ generation }));
RangeResult result;
result.reserve(result.arena(), reply.configClasses.size());
for (const auto& configClass : reply.configClasses) {
@ -259,10 +265,12 @@ class PaxosConfigTransactionImpl {
ACTOR static Future<RangeResult> getKnobs(PaxosConfigTransactionImpl* self, Optional<Key> configClass) {
ConfigGeneration generation = wait(self->getGenerationQuorum.getGeneration());
// TODO: Load balance
state Reference<ConfigTransactionInfo> configNodes(
new ConfigTransactionInfo(self->getGenerationQuorum.getReadReplicas(), false));
ConfigTransactionGetKnobsReply reply =
wait(retryBrokenPromise(self->getGenerationQuorum.getReadReplicas()[0].getKnobs,
ConfigTransactionGetKnobsRequest{ generation, configClass }));
wait(basicLoadBalance(configNodes,
&ConfigTransactionInterface::getKnobs,
ConfigTransactionGetKnobsRequest{ generation, configClass }));
RangeResult result;
result.reserve(result.arena(), reply.knobNames.size());
for (const auto& knobName : reply.knobNames) {
@ -461,6 +469,6 @@ PaxosConfigTransaction::PaxosConfigTransaction() = default;
PaxosConfigTransaction::~PaxosConfigTransaction() = default;
void PaxosConfigTransaction::setDatabase(Database const& cx) {
void PaxosConfigTransaction::construct(Database const& cx) {
impl = PImpl<PaxosConfigTransactionImpl>::create(cx);
}

View File

@ -35,7 +35,7 @@ public:
PaxosConfigTransaction(std::vector<ConfigTransactionInterface> const&);
PaxosConfigTransaction();
~PaxosConfigTransaction();
void setDatabase(Database const&) override;
void construct(Database const&) override;
Future<Version> getReadVersion() override;
Optional<Version> getCachedReadVersion() const override;

View File

@ -1443,17 +1443,21 @@ public:
}
};
ReadYourWritesTransaction::ReadYourWritesTransaction(Database const& cx)
: ISingleThreadTransaction(cx->deferredError), tr(cx), cache(&arena), writes(&arena), retries(0), approximateSize(0),
creationTime(now()), commitStarted(false), versionStampFuture(tr.getVersionstamp()),
ReadYourWritesTransaction::ReadYourWritesTransaction(Database const& cx, Optional<TenantName> tenantName)
: ISingleThreadTransaction(cx->deferredError), tr(cx, tenantName), cache(&arena), writes(&arena), retries(0),
approximateSize(0), creationTime(now()), commitStarted(false), versionStampFuture(tr.getVersionstamp()),
specialKeySpaceWriteMap(std::make_pair(false, Optional<Value>()), specialKeys.end), options(tr) {
std::copy(
cx.getTransactionDefaults().begin(), cx.getTransactionDefaults().end(), std::back_inserter(persistentOptions));
applyPersistentOptions();
}
void ReadYourWritesTransaction::setDatabase(Database const& cx) {
*this = ReadYourWritesTransaction(cx);
void ReadYourWritesTransaction::construct(Database const& cx) {
*this = ReadYourWritesTransaction(cx, Optional<TenantName>());
}
void ReadYourWritesTransaction::construct(Database const& cx, TenantName const& tenantName) {
*this = ReadYourWritesTransaction(cx, tenantName);
}
ACTOR Future<Void> timebomb(double endTime, Promise<Void> resetPromise) {

View File

@ -68,10 +68,11 @@ class ReadYourWritesTransaction final : NonCopyable,
public ISingleThreadTransaction,
public FastAllocated<ReadYourWritesTransaction> {
public:
explicit ReadYourWritesTransaction(Database const& cx);
explicit ReadYourWritesTransaction(Database const& cx, Optional<TenantName> tenant = Optional<TenantName>());
~ReadYourWritesTransaction();
void setDatabase(Database const&) override;
void construct(Database const&) override;
void construct(Database const&, TenantName const& tenant) override;
void setVersion(Version v) override { tr.setVersion(v); }
Future<Version> getReadVersion() override;
Optional<Version> getCachedReadVersion() const override { return tr.getCachedReadVersion(); }
@ -190,6 +191,8 @@ public:
void setSpecialKeySpaceErrorMsg(const std::string& msg) { specialKeySpaceErrorMsg = msg; }
Transaction& getTransaction() { return tr; }
Optional<TenantName> getTenant() { return tr.getTenant(); }
// used in template functions as returned Future type
template <typename Type>
using FutureT = Future<Type>;

View File

@ -113,10 +113,12 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// Data distribution queue
init( HEALTH_POLL_TIME, 1.0 );
init( BEST_TEAM_STUCK_DELAY, 1.0 );
init( DEST_OVERLOADED_DELAY, 0.2 );
init( BG_REBALANCE_POLLING_INTERVAL, 10.0 );
init( BG_REBALANCE_SWITCH_CHECK_INTERVAL, 5.0 ); if (randomize && BUGGIFY) BG_REBALANCE_SWITCH_CHECK_INTERVAL = 1.0;
init( DD_QUEUE_LOGGING_INTERVAL, 5.0 );
init( RELOCATION_PARALLELISM_PER_SOURCE_SERVER, 2 ); if( randomize && BUGGIFY ) RELOCATION_PARALLELISM_PER_SOURCE_SERVER = 1;
init( RELOCATION_PARALLELISM_PER_DEST_SERVER, 10 ); if( randomize && BUGGIFY ) RELOCATION_PARALLELISM_PER_DEST_SERVER = 1; // Note: if this is smaller than FETCH_KEYS_PARALLELISM, this will artificially reduce performance. The current default of 10 is probably too high but is set conservatively for now.
init( DD_QUEUE_MAX_KEY_SERVERS, 100 ); if( randomize && BUGGIFY ) DD_QUEUE_MAX_KEY_SERVERS = 1;
init( DD_REBALANCE_PARALLELISM, 50 );
init( DD_REBALANCE_RESET_AMOUNT, 30 );
@ -365,6 +367,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC, 0 );
// If true, enables dynamic adjustment of ROCKSDB_WRITE_RATE_LIMITER_BYTES according to the recent demand of background IO.
init( ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE, true );
init( DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY, "fdb");
init( ROCKSDB_PERFCONTEXT_ENABLE, false ); if( randomize && BUGGIFY ) ROCKSDB_PERFCONTEXT_ENABLE = deterministicRandom()->coinflip() ? false : true;
init( ROCKSDB_PERFCONTEXT_SAMPLE_RATE, 0.0001 );
init( ROCKSDB_MAX_SUBCOMPACTIONS, 2 );
@ -676,6 +680,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( MAX_STORAGE_COMMIT_TIME, 120.0 ); //The max fsync stall time on the storage server and tlog before marking a disk as failed
init( RANGESTREAM_LIMIT_BYTES, 2e6 ); if( randomize && BUGGIFY ) RANGESTREAM_LIMIT_BYTES = 1;
init( ENABLE_CLEAR_RANGE_EAGER_READS, true );
init( CHECKPOINT_TRANSFER_BLOCK_BYTES, 40e6 );
init( QUICK_GET_VALUE_FALLBACK, true );
init( QUICK_GET_KEY_VALUES_FALLBACK, true );
init( QUICK_GET_KEY_VALUES_LIMIT, 2000 );
@ -714,6 +719,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( COORDINATOR_LEADER_CONNECTION_TIMEOUT, 20.0 );
// Dynamic Knobs (implementation)
init( COMPACTION_INTERVAL, isSimulated ? 5.0 : 300.0 );
init( UPDATE_NODE_TIMEOUT, 3.0 );
init( GET_COMMITTED_VERSION_TIMEOUT, 3.0 );
init( GET_SNAPSHOT_AND_CHANGES_TIMEOUT, 3.0 );

View File

@ -112,10 +112,12 @@ public:
// Data distribution queue
double HEALTH_POLL_TIME;
double BEST_TEAM_STUCK_DELAY;
double DEST_OVERLOADED_DELAY;
double BG_REBALANCE_POLLING_INTERVAL;
double BG_REBALANCE_SWITCH_CHECK_INTERVAL;
double DD_QUEUE_LOGGING_INTERVAL;
double RELOCATION_PARALLELISM_PER_SOURCE_SERVER;
double RELOCATION_PARALLELISM_PER_DEST_SERVER;
int DD_QUEUE_MAX_KEY_SERVERS;
int DD_REBALANCE_PARALLELISM;
int DD_REBALANCE_RESET_AMOUNT;
@ -296,6 +298,7 @@ public:
bool ROCKSDB_READ_RANGE_REUSE_ITERATORS;
int64_t ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC;
bool ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE;
std::string DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY;
bool ROCKSDB_PERFCONTEXT_ENABLE; // Enable rocks perf context metrics. May cause performance overhead
double ROCKSDB_PERFCONTEXT_SAMPLE_RATE;
int ROCKSDB_MAX_SUBCOMPACTIONS;
@ -615,6 +618,7 @@ public:
bool ENABLE_CLEAR_RANGE_EAGER_READS;
bool QUICK_GET_VALUE_FALLBACK;
bool QUICK_GET_KEY_VALUES_FALLBACK;
int CHECKPOINT_TRANSFER_BLOCK_BYTES;
int QUICK_GET_KEY_VALUES_LIMIT;
int QUICK_GET_KEY_VALUES_LIMIT_BYTES;
@ -653,6 +657,7 @@ public:
double COORDINATOR_LEADER_CONNECTION_TIMEOUT;
// Dynamic Knobs (implementation)
double COMPACTION_INTERVAL;
double UPDATE_NODE_TIMEOUT;
double GET_COMMITTED_VERSION_TIMEOUT;
double GET_SNAPSHOT_AND_CHANGES_TIMEOUT;

View File

@ -286,7 +286,7 @@ void SimpleConfigTransaction::checkDeferredError() const {
impl->checkDeferredError(deferredError);
}
void SimpleConfigTransaction::setDatabase(Database const& cx) {
void SimpleConfigTransaction::construct(Database const& cx) {
impl = PImpl<SimpleConfigTransactionImpl>::create(cx);
}

View File

@ -43,7 +43,7 @@ public:
SimpleConfigTransaction(ConfigTransactionInterface const&);
SimpleConfigTransaction(Database const&);
SimpleConfigTransaction();
void setDatabase(Database const&) override;
void construct(Database const&) override;
~SimpleConfigTransaction();
Future<Version> getReadVersion() override;
Optional<Version> getCachedReadVersion() const override;

View File

@ -28,6 +28,7 @@
#include "fdbclient/ActorLineageProfiler.h"
#include "fdbclient/ClusterConnectionMemoryRecord.h"
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/ProcessInterface.h"
#include "fdbclient/GlobalConfig.actor.h"
@ -54,6 +55,8 @@ static bool isAlphaNumeric(const std::string& key) {
}
} // namespace
const KeyRangeRef TenantMapRangeImpl::submoduleRange = KeyRangeRef("tenant_map/"_sr, "tenant_map0"_sr);
std::unordered_map<SpecialKeySpace::MODULE, KeyRange> SpecialKeySpace::moduleToBoundary = {
{ SpecialKeySpace::MODULE::TRANSACTION,
KeyRangeRef(LiteralStringRef("\xff\xff/transaction/"), LiteralStringRef("\xff\xff/transaction0")) },
@ -111,7 +114,8 @@ std::unordered_map<std::string, KeyRange> SpecialKeySpace::managementApiCommandT
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
{ "datadistribution",
KeyRangeRef(LiteralStringRef("data_distribution/"), LiteralStringRef("data_distribution0"))
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
{ "tenantmap", TenantMapRangeImpl::submoduleRange.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }
};
std::unordered_map<std::string, KeyRange> SpecialKeySpace::actorLineageApiCommandToRange = {
@ -1291,6 +1295,7 @@ void ProcessClassRangeImpl::clear(ReadYourWritesTransaction* ryw, const KeyRef&
}
ACTOR Future<RangeResult> getProcessClassSourceActor(ReadYourWritesTransaction* ryw, KeyRef prefix, KeyRangeRef kr) {
ryw->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
std::vector<ProcessData> _workers = wait(getWorkers(&ryw->getTransaction()));
auto workers = _workers; // strip const
// Note: sorting by string is counter-intuitive, e.g. 1.1.1.1:11 < 1.1.1.1:5
@ -2697,3 +2702,95 @@ Future<Optional<std::string>> FailedLocalitiesRangeImpl::commit(ReadYourWritesTr
// exclude locality with failed option as true.
return excludeLocalityCommitActor(ryw, true);
}
ACTOR Future<RangeResult> getTenantList(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) {
KeyRangeRef tenantRange =
kr.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)
.removePrefix(TenantMapRangeImpl::submoduleRange.begin);
state KeyRef managementPrefix =
kr.begin.substr(0,
SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin.size() +
TenantMapRangeImpl::submoduleRange.begin.size());
std::map<TenantName, TenantMapEntry> tenants = wait(ManagementAPI::listTenantsTransaction(
Reference<ReadYourWritesTransaction>::addRef(ryw), tenantRange.begin, tenantRange.end, limitsHint.rows));
RangeResult results;
for (auto tenant : tenants) {
json_spirit::mObject tenantEntry;
tenantEntry["id"] = tenant.second.id;
tenantEntry["prefix"] = tenant.second.prefix.toString();
std::string tenantEntryString = json_spirit::write_string(json_spirit::mValue(tenantEntry));
ValueRef tenantEntryBytes(results.arena(), tenantEntryString);
results.push_back(results.arena(),
KeyValueRef(tenant.first.withPrefix(managementPrefix, results.arena()), tenantEntryBytes));
}
return results;
}
TenantMapRangeImpl::TenantMapRangeImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
Future<RangeResult> TenantMapRangeImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
return getTenantList(ryw, kr, limitsHint);
}
ACTOR Future<Void> deleteTenantRange(ReadYourWritesTransaction* ryw, TenantName beginTenant, TenantName endTenant) {
std::map<TenantName, TenantMapEntry> tenants = wait(
ManagementAPI::listTenantsTransaction(&ryw->getTransaction(), beginTenant, endTenant, CLIENT_KNOBS->TOO_MANY));
if (tenants.size() == CLIENT_KNOBS->TOO_MANY) {
TraceEvent(SevWarn, "DeleteTenantRangeTooLarge")
.detail("BeginTenant", beginTenant)
.detail("EndTenant", endTenant);
ryw->setSpecialKeySpaceErrorMsg("too many tenants to range delete");
throw special_keys_api_failure();
}
std::vector<Future<Void>> deleteFutures;
for (auto tenant : tenants) {
deleteFutures.push_back(ManagementAPI::deleteTenantTransaction(&ryw->getTransaction(), tenant.first));
}
wait(waitForAll(deleteFutures));
return Void();
}
Future<Optional<std::string>> TenantMapRangeImpl::commit(ReadYourWritesTransaction* ryw) {
auto ranges = ryw->getSpecialKeySpaceWriteMap().containedRanges(range);
std::vector<Future<Void>> tenantManagementFutures;
for (auto range : ranges) {
if (!range.value().first) {
continue;
}
TenantNameRef tenantName =
range.begin()
.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)
.removePrefix(TenantMapRangeImpl::submoduleRange.begin);
if (range.value().second.present()) {
tenantManagementFutures.push_back(
success(ManagementAPI::createTenantTransaction(&ryw->getTransaction(), tenantName)));
} else {
// For a single key clear, just issue the delete
if (KeyRangeRef(range.begin(), range.end()).singleKeyRange()) {
tenantManagementFutures.push_back(
ManagementAPI::deleteTenantTransaction(&ryw->getTransaction(), tenantName));
} else {
TenantNameRef endTenant = range.end().removePrefix(
SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin);
if (endTenant.startsWith(submoduleRange.begin)) {
endTenant = endTenant.removePrefix(submoduleRange.begin);
} else {
endTenant = "\xff"_sr;
}
tenantManagementFutures.push_back(deleteTenantRange(ryw, tenantName, endTenant));
}
}
}
return tag(waitForAll(tenantManagementFutures), Optional<std::string>());
}
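For comparison, the same tenant operations can be expressed through this special-key submodule; a sketch assuming special-key-space writes are enabled on the transaction, with a made-up actor name.
ACTOR Future<Void> tenantMapSpecialKeysExample(Database cx) {
	state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
	tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
	// Setting a key under the submodule creates the tenant at commit time...
	tr->set("\xff\xff/management/tenant_map/app1"_sr, ""_sr);
	wait(tr->commit());
	// ...and clearing a key (or a range of keys) deletes the tenant(s).
	tr->reset();
	tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
	tr->clear("\xff\xff/management/tenant_map/app1"_sr);
	wait(tr->commit());
	return Void();
}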

View File

@ -528,5 +528,16 @@ public:
Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override;
};
class TenantMapRangeImpl : public SpecialKeyRangeRWImpl {
public:
const static KeyRangeRef submoduleRange;
explicit TenantMapRangeImpl(KeyRangeRef kr);
Future<RangeResult> getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const override;
Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override;
};
#include "flow/unactorcompiler.h"
#endif

View File

@ -0,0 +1,88 @@
/*
* StorageCheckpoint.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBCLIENT_STORAGECHECKPOINT_H
#define FDBCLIENT_STORAGECHECKPOINT_H
#pragma once
#include "fdbclient/FDBTypes.h"
// FDB storage checkpoint format.
enum CheckpointFormat {
InvalidFormat = 0,
// For RocksDB, checkpoint generated via rocksdb::Checkpoint::ExportColumnFamily().
RocksDBColumnFamily = 1,
// For RocksDB, checkpoint generated via rocksdb::Checkpoint::CreateCheckpoint().
RocksDB = 2,
};
// Metadata of a FDB checkpoint.
struct CheckpointMetaData {
enum CheckpointState {
InvalidState = 0,
Pending = 1, // Checkpoint creation pending.
Complete = 2, // Checkpoint is created and ready to be read.
Deleting = 3, // Checkpoint deletion requested.
Fail = 4,
};
constexpr static FileIdentifier file_identifier = 13804342;
Version version;
KeyRange range;
int16_t format; // CheckpointFormat.
UID ssID; // Storage server ID on which this checkpoint is created.
UID checkpointID; // A unique id for this checkpoint.
int16_t state; // CheckpointState.
int referenceCount; // A reference count on the checkpoint; it can only be deleted once this reaches 0.
int64_t gcTime; // Time to delete this checkpoint, as a Unix timestamp in seconds.
// Serialized metadata associated with `format`; this data can be understood by the corresponding KVS.
Standalone<StringRef> serializedCheckpoint;
CheckpointMetaData() : version(invalidVersion), format(InvalidFormat), state(InvalidState), referenceCount(0), gcTime(0) {}
CheckpointMetaData(KeyRange const& range, CheckpointFormat format, UID const& ssID, UID const& checkpointID)
: version(invalidVersion), range(range), format(format), ssID(ssID), checkpointID(checkpointID), state(Pending),
referenceCount(0), gcTime(0) {}
CheckpointMetaData(Version version, KeyRange const& range, CheckpointFormat format, UID checkpointID)
: version(version), range(range), format(format), checkpointID(checkpointID), state(InvalidState),
referenceCount(0), gcTime(0) {}
CheckpointState getState() const { return static_cast<CheckpointState>(state); }
void setState(CheckpointState state) { this->state = static_cast<int16_t>(state); }
CheckpointFormat getFormat() const { return static_cast<CheckpointFormat>(format); }
void setFormat(CheckpointFormat format) { this->format = static_cast<int16_t>(format); }
std::string toString() const {
std::string res = "Checkpoint MetaData:\nRange: " + range.toString() + "\nVersion: " + std::to_string(version) +
"\nFormat: " + std::to_string(format) + "\nServer: " + ssID.toString() +
"\nID: " + checkpointID.toString() + "\nState: " + std::to_string(static_cast<int>(state)) +
"\n";
return res;
}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, version, range, format, state, checkpointID, ssID, gcTime, serializedCheckpoint);
}
};
#endif
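// A brief usage sketch of CheckpointMetaData (values are illustrative):
//
//     CheckpointMetaData metaData(KeyRangeRef("a"_sr, "b"_sr), RocksDBColumnFamily,
//                                 UID(1, 2) /* ssID */, deterministicRandom()->randomUniqueID());
//     ASSERT(metaData.getState() == CheckpointMetaData::Pending);
//     metaData.setState(CheckpointMetaData::Complete); // once the KVS has produced the files
//     ASSERT(metaData.getFormat() == RocksDBColumnFamily);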

View File

@ -24,6 +24,7 @@
#include <ostream>
#include "fdbclient/FDBTypes.h"
#include "fdbclient/StorageCheckpoint.h"
#include "fdbrpc/Locality.h"
#include "fdbrpc/QueueModel.h"
#include "fdbrpc/fdbrpc.h"
@ -85,6 +86,8 @@ struct StorageServerInterface {
RequestStream<struct OverlappingChangeFeedsRequest> overlappingChangeFeeds;
RequestStream<struct ChangeFeedPopRequest> changeFeedPop;
RequestStream<struct ChangeFeedVersionUpdateRequest> changeFeedVersionUpdate;
RequestStream<struct GetCheckpointRequest> checkpoint;
RequestStream<struct FetchCheckpointRequest> fetchCheckpoint;
explicit StorageServerInterface(UID uid) : uniqueID(uid) {}
StorageServerInterface() : uniqueID(deterministicRandom()->randomUniqueID()) {}
@ -137,6 +140,9 @@ struct StorageServerInterface {
RequestStream<struct ChangeFeedPopRequest>(getValue.getEndpoint().getAdjustedEndpoint(17));
changeFeedVersionUpdate = RequestStream<struct ChangeFeedVersionUpdateRequest>(
getValue.getEndpoint().getAdjustedEndpoint(18));
checkpoint = RequestStream<struct GetCheckpointRequest>(getValue.getEndpoint().getAdjustedEndpoint(19));
fetchCheckpoint =
RequestStream<struct FetchCheckpointRequest>(getValue.getEndpoint().getAdjustedEndpoint(20));
}
} else {
ASSERT(Ar::isDeserializing);
@ -184,6 +190,8 @@ struct StorageServerInterface {
streams.push_back(overlappingChangeFeeds.getReceiver());
streams.push_back(changeFeedPop.getReceiver());
streams.push_back(changeFeedVersionUpdate.getReceiver());
streams.push_back(checkpoint.getReceiver());
streams.push_back(fetchCheckpoint.getReceiver());
FlowTransport::transport().addEndpoints(streams);
}
};
@ -816,6 +824,60 @@ struct ChangeFeedPopRequest {
}
};
// Request to search for a checkpoint that covers at least the key range `range`, at the
// specified version and in the specified format.
// A CheckpointMetaData is returned if such a checkpoint is found.
struct GetCheckpointRequest {
constexpr static FileIdentifier file_identifier = 13804343;
Version version; // The FDB version at which the checkpoint is created.
KeyRange range;
int16_t format; // CheckpointFormat.
Optional<UID> checkpointID; // When present, look for the checkpoint with the exact UID.
ReplyPromise<CheckpointMetaData> reply;
GetCheckpointRequest() {}
GetCheckpointRequest(Version version, KeyRange const& range, CheckpointFormat format)
: version(version), range(range), format(format) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, version, range, format, checkpointID, reply);
}
};
// Reply to FetchCheckpointRequest, transfers checkpoint back to client.
struct FetchCheckpointReply : public ReplyPromiseStreamReply {
constexpr static FileIdentifier file_identifier = 13804345;
Standalone<StringRef> token; // Serialized data specific to a particular checkpoint format.
Standalone<StringRef> data;
FetchCheckpointReply() {}
FetchCheckpointReply(StringRef token) : token(token) {}
int expectedSize() const { return data.expectedSize(); }
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, ReplyPromiseStreamReply::acknowledgeToken, ReplyPromiseStreamReply::sequence, token, data);
}
};
// Request to fetch checkpoint from a storage server.
struct FetchCheckpointRequest {
constexpr static FileIdentifier file_identifier = 13804344;
UID checkpointID;
Standalone<StringRef> token; // Serialized data specific to a particular checkpoint format.
ReplyPromiseStream<FetchCheckpointReply> reply;
FetchCheckpointRequest() = default;
FetchCheckpointRequest(UID checkpointID, StringRef token) : checkpointID(checkpointID), token(token) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, checkpointID, token, reply);
}
};
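// A hypothetical end-to-end sketch (not part of this commit) of how a client could use the
// two requests above: locate a checkpoint with GetCheckpointRequest, then stream its
// contents until end_of_stream. The token passed to FetchCheckpointRequest is assumed to be
// derived from the returned metadata.
//
//     ACTOR Future<Void> fetchCheckpointSketch(StorageServerInterface ssi, KeyRange range, Version version) {
//         state CheckpointMetaData metaData =
//             wait(ssi.checkpoint.getReply(GetCheckpointRequest(version, range, RocksDBColumnFamily)));
//         state ReplyPromiseStream<FetchCheckpointReply> stream = ssi.fetchCheckpoint.getReplyStream(
//             FetchCheckpointRequest(metaData.checkpointID, metaData.serializedCheckpoint));
//         try {
//             loop {
//                 FetchCheckpointReply rep = waitNext(stream.getFuture());
//                 // persist rep.data; rep.token identifies the file/offset for the format
//             }
//         } catch (Error& e) {
//             if (e.code() != error_code_end_of_stream) {
//                 throw;
//             }
//         }
//         return Void();
//     }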
struct OverlappingChangeFeedEntry {
Key rangeId;
KeyRange range;

View File

@ -215,6 +215,33 @@ const KeyRangeRef writeConflictRangeKeysRange =
const KeyRef clusterIdKey = LiteralStringRef("\xff/clusterId");
const KeyRef checkpointPrefix = "\xff/checkpoint/"_sr;
const Key checkpointKeyFor(UID checkpointID) {
BinaryWriter wr(Unversioned());
wr.serializeBytes(checkpointPrefix);
wr << checkpointID;
return wr.toValue();
}
const Value checkpointValue(const CheckpointMetaData& checkpoint) {
return ObjectWriter::toValue(checkpoint, IncludeVersion());
}
UID decodeCheckpointKey(const KeyRef& key) {
UID checkpointID;
BinaryReader rd(key.removePrefix(checkpointPrefix), Unversioned());
rd >> checkpointID;
return checkpointID;
}
CheckpointMetaData decodeCheckpointValue(const ValueRef& value) {
CheckpointMetaData checkpoint;
ObjectReader reader(value.begin(), IncludeVersion());
reader.deserialize(checkpoint);
return checkpoint;
}
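// A sanity sketch for the codec pair above (illustrative values):
//
//     UID id = deterministicRandom()->randomUniqueID();
//     ASSERT(decodeCheckpointKey(checkpointKeyFor(id)) == id);
//     CheckpointMetaData metaData(KeyRangeRef("a"_sr, "b"_sr), RocksDB, UID(1, 2), id);
//     ASSERT(decodeCheckpointValue(checkpointValue(metaData)).checkpointID == id);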
// "\xff/cacheServer/[[UID]] := StorageServerInterface"
const KeyRangeRef storageCacheServerKeys(LiteralStringRef("\xff/cacheServer/"), LiteralStringRef("\xff/cacheServer0"));
const KeyRef storageCacheServersPrefix = storageCacheServerKeys.begin;
@ -1336,6 +1363,8 @@ TenantMapEntry decodeTenantEntry(ValueRef const& value) {
const KeyRangeRef tenantMapKeys("\xff/tenantMap/"_sr, "\xff/tenantMap0"_sr);
const KeyRef tenantMapPrefix = tenantMapKeys.begin;
const KeyRef tenantMapPrivatePrefix = "\xff\xff/tenantMap/"_sr;
const KeyRef tenantLastIdKey = "\xff/tenantLastId/"_sr;
const KeyRef tenantDataPrefixKey = "\xff/tenantDataPrefix"_sr;
// for tests
void testSSISerdes(StorageServerInterface const& ssi, bool useFB) {

View File

@ -70,6 +70,13 @@ void decodeKeyServersValue(std::map<Tag, UID> const& tag_uid,
extern const KeyRef clusterIdKey;
// "\xff/checkpoint/[[UID]] := [[CheckpointMetaData]]"
extern const KeyRef checkpointPrefix;
const Key checkpointKeyFor(UID checkpointID);
const Value checkpointValue(const CheckpointMetaData& checkpoint);
UID decodeCheckpointKey(const KeyRef& key);
CheckpointMetaData decodeCheckpointValue(const ValueRef& value);
// "\xff/storageCacheServer/[[UID]] := StorageServerInterface"
// This will be added by the cache server on initialization and removed by DD
// TODO[mpilman]: We will need a way to map uint16_t ids to UIDs in a future
@ -598,6 +605,8 @@ BlobWorkerInterface decodeBlobWorkerListValue(ValueRef const& value);
extern const KeyRangeRef tenantMapKeys;
extern const KeyRef tenantMapPrefix;
extern const KeyRef tenantMapPrivatePrefix;
extern const KeyRef tenantLastIdKey;
extern const KeyRef tenantDataPrefixKey;
Value encodeTenantEntry(TenantMapEntry const& tenantEntry);
TenantMapEntry decodeTenantEntry(ValueRef const& value);

View File

@ -23,6 +23,7 @@
#include "fdbclient/ThreadSafeTransaction.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/versions.h"
#include "fdbclient/GenericManagementAPI.actor.h"
#include "fdbclient/NativeAPI.actor.h"
// Users of ThreadSafeTransaction might share Reference<ThreadSafe...> between different threads as long as they don't
@ -46,9 +47,13 @@ ThreadFuture<Reference<IDatabase>> ThreadSafeDatabase::createFromExistingDatabas
});
}
Reference<ITenant> ThreadSafeDatabase::openTenant(TenantNameRef tenantName) {
return makeReference<ThreadSafeTenant>(Reference<ThreadSafeDatabase>::addRef(this), tenantName);
}
Reference<ITransaction> ThreadSafeDatabase::createTransaction() {
auto type = isConfigDB ? ISingleThreadTransaction::Type::SIMPLE_CONFIG : ISingleThreadTransaction::Type::RYW;
return Reference<ITransaction>(new ThreadSafeTransaction(db, type));
return Reference<ITransaction>(new ThreadSafeTransaction(db, type, Optional<TenantName>()));
}
void ThreadSafeDatabase::setOption(FDBDatabaseOptions::Option option, Optional<StringRef> value) {
@ -147,7 +152,17 @@ ThreadSafeDatabase::~ThreadSafeDatabase() {
onMainThreadVoid([db]() { db->delref(); }, nullptr);
}
ThreadSafeTransaction::ThreadSafeTransaction(DatabaseContext* cx, ISingleThreadTransaction::Type type) {
Reference<ITransaction> ThreadSafeTenant::createTransaction() {
auto type = db->isConfigDB ? ISingleThreadTransaction::Type::SIMPLE_CONFIG : ISingleThreadTransaction::Type::RYW;
return Reference<ITransaction>(new ThreadSafeTransaction(db->db, type, name));
}
ThreadSafeTenant::~ThreadSafeTenant() {}
ThreadSafeTransaction::ThreadSafeTransaction(DatabaseContext* cx,
ISingleThreadTransaction::Type type,
Optional<TenantName> tenant)
: tenantName(tenant) {
// Allocate memory for the transaction from this thread (so the pointer is known for subsequent method calls)
// but run its constructor on the main thread
@ -158,9 +173,13 @@ ThreadSafeTransaction::ThreadSafeTransaction(DatabaseContext* cx, ISingleThreadT
auto tr = this->tr = ISingleThreadTransaction::allocateOnForeignThread(type);
// No deferred error -- if the construction of the RYW transaction fails, we have nowhere to put it
onMainThreadVoid(
[tr, cx]() {
[tr, cx, tenant]() {
cx->addref();
tr->setDatabase(Database(cx));
if (tenant.present()) {
tr->construct(Database(cx), tenant.get());
} else {
tr->construct(Database(cx));
}
},
nullptr);
}
@ -469,6 +488,10 @@ ThreadFuture<Void> ThreadSafeTransaction::onError(Error const& e) {
return onMainThread([tr, e]() { return tr->onError(e); });
}
Optional<TenantName> ThreadSafeTransaction::getTenant() {
return tenantName;
}
void ThreadSafeTransaction::operator=(ThreadSafeTransaction&& r) noexcept {
tr = r.tr;
r.tr = nullptr;

View File

@ -35,6 +35,7 @@ public:
~ThreadSafeDatabase() override;
static ThreadFuture<Reference<IDatabase>> createFromExistingDatabase(Database cx);
Reference<ITenant> openTenant(TenantNameRef tenantName) override;
Reference<ITransaction> createTransaction() override;
void setOption(FDBDatabaseOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
@ -61,6 +62,7 @@ public:
void setSharedState(DatabaseSharedState* p) override;
private:
friend class ThreadSafeTenant;
friend class ThreadSafeTransaction;
bool isConfigDB{ false };
DatabaseContext* db;
@ -71,11 +73,28 @@ public: // Internal use only
DatabaseContext* unsafeGetPtr() const { return db; }
};
class ThreadSafeTenant : public ITenant, ThreadSafeReferenceCounted<ThreadSafeTenant>, NonCopyable {
public:
ThreadSafeTenant(Reference<ThreadSafeDatabase> db, StringRef name) : db(db), name(name) {}
~ThreadSafeTenant() override;
Reference<ITransaction> createTransaction() override;
void addref() override { ThreadSafeReferenceCounted<ThreadSafeTenant>::addref(); }
void delref() override { ThreadSafeReferenceCounted<ThreadSafeTenant>::delref(); }
private:
Reference<ThreadSafeDatabase> db;
Standalone<StringRef> name;
};
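// Minimal usage sketch (assumes an open Reference<IDatabase> db and an existing tenant;
// the names are illustrative). Transactions created through the tenant are scoped to its keyspace:
//
//     Reference<ITenant> tenant = db->openTenant("myTenant"_sr);
//     Reference<ITransaction> tr = tenant->createTransaction();
//     tr->set("someKey"_sr, "someValue"_sr); // lands under the tenant's prefix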
// An implementation of ITransaction that serializes operations onto the network thread and interacts with the
// lower-level client APIs exposed by ISingleThreadTransaction
class ThreadSafeTransaction : public ITransaction, ThreadSafeReferenceCounted<ThreadSafeTransaction>, NonCopyable {
public:
explicit ThreadSafeTransaction(DatabaseContext* cx, ISingleThreadTransaction::Type type);
explicit ThreadSafeTransaction(DatabaseContext* cx,
ISingleThreadTransaction::Type type,
Optional<TenantName> tenant);
~ThreadSafeTransaction() override;
// Note: used while refactoring fdbcli; needs to be removed later
@ -152,6 +171,8 @@ public:
ThreadFuture<Void> checkDeferredError();
ThreadFuture<Void> onError(Error const& e) override;
Optional<TenantName> getTenant() override;
// These are to permit use as state variables in actors:
ThreadSafeTransaction() : tr(nullptr) {}
void operator=(ThreadSafeTransaction&& r) noexcept;
@ -164,6 +185,7 @@ public:
private:
ISingleThreadTransaction* tr;
const Optional<TenantName> tenantName;
};
// An implementation of IClientApi that serializes operations onto the network thread and interacts with the lower-level

View File

@ -230,9 +230,11 @@ description is not currently required but encouraged.
<Option name="initialize_new_database" code="300"
description="This is a write-only transaction which sets the initial configuration. This option is designed for use by database system tools only." />
<Option name="access_system_keys" code="301"
description="Allows this transaction to read and modify system keys (those that start with the byte 0xFF)"/>
description="Allows this transaction to read and modify system keys (those that start with the byte 0xFF). Implies raw_access."/>
<Option name="read_system_keys" code="302"
description="Allows this transaction to read system keys (those that start with the byte 0xFF)"/>
description="Allows this transaction to read system keys (those that start with the byte 0xFF). Implies raw_access."/>
<Option name="raw_access" code="303"
description="Allows this transaction to access the raw key-space when tenant mode is on."/>
<Option name="debug_dump" code="400"
hidden="true" />
<Option name="debug_retry_logging" code="401" paramType="String" paramDescription="Optional transaction name" />

View File

@ -427,6 +427,7 @@ public:
bool speedUpSimulation;
BackupAgentType backupAgents;
BackupAgentType drAgents;
bool restarted = false;
bool hasDiffProtocolProcess; // true if simulator is testing a process with a different version
bool setDiffProtocol; // true if a process with a different protocol version has been started

View File

@ -541,6 +541,29 @@ private:
toCommit->writeTypedMessage(privatized);
}
// Generates private mutations for the target storage server, instructing it to create a checkpoint.
void checkSetCheckpointKeys(MutationRef m) {
if (!m.param1.startsWith(checkpointPrefix)) {
return;
}
if (toCommit) {
CheckpointMetaData checkpoint = decodeCheckpointValue(m.param2);
Tag tag = decodeServerTagValue(txnStateStore->readValue(serverTagKeyFor(checkpoint.ssID)).get().get());
MutationRef privatized = m;
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
TraceEvent("SendingPrivateMutationCheckpoint", dbgid)
.detail("Original", m)
.detail("Privatized", privatized)
.detail("Server", checkpoint.ssID)
.detail("TagKey", serverTagKeyFor(checkpoint.ssID))
.detail("Tag", tag.toString())
.detail("Checkpoint", checkpoint.toString());
toCommit->addTag(tag);
toCommit->writeTypedMessage(privatized);
}
}
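// Illustrative effect (assuming systemKeys.begin is "\xff"_sr): a commit of
//   \xff/checkpoint/<checkpointID> := checkpointValue(metaData)
// is privatized to
//   \xff\xff/checkpoint/<checkpointID>
// and tagged for checkpoint.ssID, so only that storage server applies the mutation
// and begins creating the checkpoint locally.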
void checkSetOtherKeys(MutationRef m) {
if (initialCommit)
return;
@ -1081,6 +1104,7 @@ public:
if (m.type == MutationRef::SetValue && isSystemKey(m.param1)) {
checkSetKeyServersPrefix(m);
checkSetServerKeysPrefix(m);
checkSetCheckpointKeys(m);
checkSetServerTagsPrefix(m);
checkSetStorageCachePrefix(m);
checkSetCacheKeysPrefix(m);

View File

@ -50,6 +50,10 @@ set(FDBSERVER_SRCS
KeyValueStoreMemory.actor.cpp
KeyValueStoreRocksDB.actor.cpp
KeyValueStoreSQLite.actor.cpp
ServerCheckpoint.actor.cpp
ServerCheckpoint.actor.h
RocksDBCheckpointUtils.actor.cpp
RocksDBCheckpointUtils.actor.h
Knobs.h
LatencyBandConfig.cpp
LatencyBandConfig.h
@ -191,6 +195,7 @@ set(FDBSERVER_SRCS
workloads/ChangeFeeds.actor.cpp
workloads/DataDistributionMetrics.actor.cpp
workloads/DataLossRecovery.actor.cpp
workloads/PhysicalShardMove.actor.cpp
workloads/DDBalance.actor.cpp
workloads/DDMetrics.actor.cpp
workloads/DDMetricsExclude.actor.cpp

View File

@ -94,6 +94,7 @@ class ConfigBroadcasterImpl {
int coordinators = 0;
std::unordered_set<NetworkAddress> activeConfigNodes;
std::unordered_set<NetworkAddress> registrationResponses;
bool disallowUnregistered = false;
Promise<Void> newConfigNodesAllowed;
@ -217,6 +218,7 @@ class ConfigBroadcasterImpl {
self->clients.erase(clientUID);
self->clientFailures.erase(clientUID);
self->activeConfigNodes.erase(clientAddress);
self->registrationResponses.erase(clientAddress);
// See comment where this promise is reset below.
if (self->newConfigNodesAllowed.isSet()) {
self->newConfigNodesAllowed.reset();
@ -258,6 +260,7 @@ class ConfigBroadcasterImpl {
self->newConfigNodesAllowed.reset();
}
}
self->registrationResponses.insert(address);
if (registered) {
if (!self->disallowUnregistered) {
@ -265,9 +268,18 @@ class ConfigBroadcasterImpl {
}
self->activeConfigNodes.insert(address);
self->disallowUnregistered = true;
} else if (self->activeConfigNodes.size() < self->coordinators / 2 + 1 && !self->disallowUnregistered) {
// Need to allow registration of previously unregistered nodes when
// the cluster first starts up.
} else if ((self->activeConfigNodes.size() < self->coordinators / 2 + 1 && !self->disallowUnregistered) ||
self->coordinators - self->registrationResponses.size() <=
self->coordinators / 2 + 1 - self->activeConfigNodes.size()) {
// Received a registration request from an unregistered node. There
// are two cases where we want to allow unregistered nodes to
// register:
// * the cluster is just starting and no nodes are registered
// * a minority of nodes are registered and a majority are
// unregistered. This situation should only occur in rare
// circumstances where the cluster controller dies with only a
// minority of config nodes having received a
// ConfigBroadcastReadyRequest
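// Worked example (numbers assumed for illustration): with coordinators = 5 the
// quorum is 5 / 2 + 1 = 3. If only 1 node is registered and 3 of the 5 have
// responded, then coordinators - registrationResponses = 2 and
// quorum - activeConfigNodes = 2, so 2 <= 2 holds and the unregistered node is
// allowed to register; otherwise a registered majority could never be assembled.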
self->activeConfigNodes.insert(address);
if (self->activeConfigNodes.size() >= self->coordinators / 2 + 1 &&
self->newConfigNodesAllowed.canBeSet()) {
@ -390,9 +402,9 @@ public:
this->coordinators = coordinators.configServers.size();
if (configDBType != ConfigDBType::DISABLED) {
if (configDBType == ConfigDBType::SIMPLE) {
consumer = IConfigConsumer::createSimple(coordinators, 0.5, Optional<double>{});
consumer = IConfigConsumer::createSimple(coordinators, 0.5, SERVER_KNOBS->COMPACTION_INTERVAL);
} else {
consumer = IConfigConsumer::createPaxos(coordinators, 0.5, Optional<double>{});
consumer = IConfigConsumer::createPaxos(coordinators, 0.5, SERVER_KNOBS->COMPACTION_INTERVAL);
}
TraceEvent(SevDebug, "ConfigBroadcasterStartingConsumer", id)
.detail("Consumer", consumer->getID())

View File

@ -176,14 +176,16 @@ struct ConfigFollowerRollforwardRequest {
struct ConfigFollowerGetCommittedVersionReply {
static constexpr FileIdentifier file_identifier = 9214735;
Version lastCompacted;
Version lastCommitted;
ConfigFollowerGetCommittedVersionReply() = default;
explicit ConfigFollowerGetCommittedVersionReply(Version lastCommitted) : lastCommitted(lastCommitted) {}
explicit ConfigFollowerGetCommittedVersionReply(Version lastCompacted, Version lastCommitted)
: lastCompacted(lastCompacted), lastCommitted(lastCommitted) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, lastCommitted);
serializer(ar, lastCompacted, lastCommitted);
}
};

View File

@ -495,7 +495,7 @@ class ConfigNodeImpl {
}
ACTOR static Future<Void> rollforward(ConfigNodeImpl* self, ConfigFollowerRollforwardRequest req) {
Version lastCompactedVersion = wait(getLastCompactedVersion(self));
state Version lastCompactedVersion = wait(getLastCompactedVersion(self));
if (req.lastKnownCommitted < lastCompactedVersion) {
req.reply.sendError(version_already_compacted());
return Void();
@ -529,6 +529,10 @@ class ConfigNodeImpl {
versionedAnnotationKey(currentGeneration.committedVersion + 1)));
currentGeneration.committedVersion = req.rollback.get();
if (req.rollback.get() < lastCompactedVersion) {
self->kvStore->set(
KeyValueRef(lastCompactedVersionKey, BinaryWriter::toValue(req.rollback.get(), IncludeVersion())));
}
// The mutation commit loop below should persist the new generation
// to disk, so we don't need to do it here.
}
@ -536,13 +540,15 @@ class ConfigNodeImpl {
// committed version and rollforward version.
ASSERT_GT(req.mutations[0].version, currentGeneration.committedVersion);
wait(commitMutations(self, req.mutations, req.annotations, req.target));
req.reply.send(Void());
return Void();
}
ACTOR static Future<Void> getCommittedVersion(ConfigNodeImpl* self, ConfigFollowerGetCommittedVersionRequest req) {
state Version lastCompacted = wait(getLastCompactedVersion(self));
ConfigGeneration generation = wait(getGeneration(self));
req.reply.send(ConfigFollowerGetCommittedVersionReply{ generation.committedVersion });
req.reply.send(ConfigFollowerGetCommittedVersionReply{ lastCompacted, generation.committedVersion });
return Void();
}

View File

@ -200,8 +200,9 @@ public:
}
int64_t bestLoadBytes = 0;
bool wigglingBestOption = false; // best option contains server in paused wiggle state
Optional<Reference<IDataDistributionTeam>> bestOption;
std::vector<Reference<IDataDistributionTeam>> randomTeams;
std::vector<Reference<TCTeamInfo>> randomTeams;
const std::set<UID> completeSources(req.completeSources.begin(), req.completeSources.end());
// Note: this block does not apply any filters from the request
@ -249,9 +250,18 @@ public:
(!req.teamMustHaveShards ||
self->shardsAffectedByTeamFailure->hasShards(ShardsAffectedByTeamFailure::Team(
self->teams[currentIndex]->getServerIDs(), self->primary)))) {
// bestOption doesn't contain a wiggling SS but the current team does; don't replace
// bestOption in this case
if (bestOption.present() && !wigglingBestOption &&
self->teams[currentIndex]->hasWigglePausedServer()) {
continue;
}
bestLoadBytes = loadBytes;
bestOption = self->teams[currentIndex];
bestIndex = currentIndex;
wigglingBestOption = self->teams[bestIndex]->hasWigglePausedServer();
}
}
}
@ -262,7 +272,7 @@ public:
while (randomTeams.size() < SERVER_KNOBS->BEST_TEAM_OPTION_COUNT &&
nTries < SERVER_KNOBS->BEST_TEAM_MAX_TEAM_TRIES) {
// If unhealthy team is majority, we may not find an ok dest in this while loop
Reference<IDataDistributionTeam> dest = deterministicRandom()->randomChoice(self->teams);
Reference<TCTeamInfo> dest = deterministicRandom()->randomChoice(self->teams);
bool ok = dest->isHealthy() && (!req.preferLowerUtilization ||
dest->hasHealthyAvailableSpace(self->medianAvailableSpace));
@ -298,8 +308,16 @@ public:
int64_t loadBytes = randomTeams[i]->getLoadBytes(true, req.inflightPenalty);
if (!bestOption.present() || (req.preferLowerUtilization && loadBytes < bestLoadBytes) ||
(!req.preferLowerUtilization && loadBytes > bestLoadBytes)) {
// bestOption doesn't contain a wiggling SS but the current team does; don't replace
// bestOption in this case
if (bestOption.present() && !wigglingBestOption && randomTeams[i]->hasWigglePausedServer()) {
continue;
}
bestLoadBytes = loadBytes;
bestOption = randomTeams[i];
wigglingBestOption = randomTeams[i]->hasWigglePausedServer();
}
}
}
@ -3611,6 +3629,10 @@ void DDTeamCollection::removeLaggingStorageServer(Key zoneId) {
disableFailingLaggingServers.set(false);
}
bool DDTeamCollection::isWigglePausedServer(const UID& server) const {
return pauseWiggle && pauseWiggle->get() && wigglingId == server;
}
std::vector<UID> DDTeamCollection::getRandomHealthyTeam(const UID& excludeServer) {
std::vector<int> candidates, backup;
for (int i = 0; i < teams.size(); ++i) {
@ -5629,6 +5651,62 @@ public:
return Void();
}
ACTOR static Future<Void> GetTeam_DeprioritizeWigglePausedTeam() {
Reference<IReplicationPolicy> policy = Reference<IReplicationPolicy>(
new PolicyAcross(3, "zoneid", Reference<IReplicationPolicy>(new PolicyOne())));
state int processSize = 5;
state int teamSize = 3;
state std::unique_ptr<DDTeamCollection> collection = testTeamCollection(teamSize, policy, processSize);
GetStorageMetricsReply mid_avail;
mid_avail.capacity.bytes = 1000 * 1024 * 1024;
mid_avail.available.bytes = 400 * 1024 * 1024;
mid_avail.load.bytes = 100 * 1024 * 1024;
GetStorageMetricsReply high_avail;
high_avail.capacity.bytes = 1000 * 1024 * 1024;
high_avail.available.bytes = 800 * 1024 * 1024;
high_avail.load.bytes = 90 * 1024 * 1024;
collection->addTeam(std::set<UID>({ UID(1, 0), UID(2, 0), UID(3, 0) }), true);
collection->addTeam(std::set<UID>({ UID(2, 0), UID(3, 0), UID(4, 0) }), true);
collection->disableBuildingTeams();
collection->setCheckTeamDelay();
/*
* Among server teams that have healthy space available, pick the team that is
* least utilized, if the caller says they preferLowerUtilization.
*/
collection->server_info[UID(1, 0)]->setMetrics(mid_avail);
collection->server_info[UID(2, 0)]->setMetrics(high_avail);
collection->server_info[UID(3, 0)]->setMetrics(high_avail);
collection->server_info[UID(4, 0)]->setMetrics(high_avail);
collection->wigglingId = UID(4, 0);
collection->pauseWiggle = makeReference<AsyncVar<bool>>(true);
bool wantsNewServers = true;
bool wantsTrueBest = true;
bool preferLowerUtilization = true;
bool teamMustHaveShards = false;
std::vector<UID> completeSources{ UID(1, 0), UID(2, 0), UID(3, 0) };
state GetTeamRequest req(wantsNewServers, wantsTrueBest, preferLowerUtilization, teamMustHaveShards);
req.completeSources = completeSources;
wait(collection->getTeam(req));
std::pair<Optional<Reference<IDataDistributionTeam>>, bool> resTeam = req.reply.getFuture().get();
std::set<UID> expectedServers{ UID(1, 0), UID(2, 0), UID(3, 0) };
ASSERT(resTeam.first.present());
auto servers = resTeam.first.get()->getServerIDs();
const std::set<UID> selectedServers(servers.begin(), servers.end());
ASSERT(expectedServers == selectedServers);
return Void();
}
};
TEST_CASE("DataDistribution/AddTeamsBestOf/UseMachineID") {
@ -5690,3 +5768,8 @@ TEST_CASE("/DataDistribution/GetTeam/ServerUtilizationNearCutoff") {
wait(DDTeamCollectionUnitTest::GetTeam_ServerUtilizationNearCutoff());
return Void();
}
TEST_CASE("/DataDistribution/GetTeam/DeprioritizeWigglePausedTeam") {
wait(DDTeamCollectionUnitTest::GetTeam_DeprioritizeWigglePausedTeam());
return Void();
}

View File

@ -594,6 +594,9 @@ public:
void removeLaggingStorageServer(Key zoneId);
// Whether the server is being wiggled but the wiggle is currently paused to keep teams healthy.
bool isWigglePausedServer(const UID& server) const;
// Returns a random healthy team, which does not contain excludeServer.
std::vector<UID> getRandomHealthyTeam(const UID& excludeServer);

View File

@ -19,17 +19,18 @@
*/
#include <set>
#include <sstream>
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/RunTransaction.actor.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/SystemData.h"
#include "fdbrpc/Replication.h"
#include "fdbserver/DataDistribution.actor.h"
#include "fdbserver/DDTeamCollection.h"
#include "fdbserver/FDBExecHelper.actor.h"
#include "fdbserver/IKeyValueStore.h"
#include "fdbserver/Knobs.h"
@ -38,14 +39,14 @@
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/TLogInterface.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/DDTeamCollection.h"
#include "flow/ActorCollection.h"
#include "flow/Arena.h"
#include "flow/BooleanParam.h"
#include "flow/serialize.h"
#include "flow/Trace.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // This must be the last #include.
#include "flow/serialize.h"
// Read keyservers, return unique set of teams
ACTOR Future<Reference<InitialDataDistribution>> getInitialDataDistribution(Database cx,

View File

@ -48,6 +48,7 @@ struct RelocateData {
int workFactor;
std::vector<UID> src;
std::vector<UID> completeSources;
std::vector<UID> completeDests;
bool wantsNewServers;
TraceInterval interval;
@ -87,7 +88,7 @@ struct RelocateData {
return priority == rhs.priority && boundaryPriority == rhs.boundaryPriority &&
healthPriority == rhs.healthPriority && keys == rhs.keys && startTime == rhs.startTime &&
workFactor == rhs.workFactor && src == rhs.src && completeSources == rhs.completeSources &&
wantsNewServers == rhs.wantsNewServers && randomId == rhs.randomId;
completeDests == rhs.completeDests && wantsNewServers == rhs.wantsNewServers && randomId == rhs.randomId;
}
bool operator!=(const RelocateData& rhs) const { return !(*this == rhs); }
};
@ -262,7 +263,7 @@ struct Busyness {
Busyness() : ledger(10, 0) {}
bool canLaunch(int prio, int work) {
bool canLaunch(int prio, int work) const {
ASSERT(prio > 0 && prio < 1000);
return ledger[prio / 100] <= WORK_FULL_UTILIZATION - work; // allow for rounding errors in double division
}
@ -281,7 +282,8 @@ struct Busyness {
if (i != 1)
result += ", ";
result += i + 1 == j ? format("%03d", i * 100) : format("%03d/%03d", i * 100, (j - 1) * 100);
result += format("=%1.02f", (float)ledger[i] / WORK_FULL_UTILIZATION);
result +=
format("=%1.02f (%d/%d)", (float)ledger[i] / WORK_FULL_UTILIZATION, ledger[i], WORK_FULL_UTILIZATION);
i = j;
}
return result;
@ -289,7 +291,7 @@ struct Busyness {
};
// find the "workFactor" for this, were it launched now
int getWorkFactor(RelocateData const& relocation, int singleRegionTeamSize) {
int getSrcWorkFactor(RelocateData const& relocation, int singleRegionTeamSize) {
if (relocation.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT ||
relocation.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT)
return WORK_FULL_UTILIZATION / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER;
@ -299,21 +301,26 @@ int getWorkFactor(RelocateData const& relocation, int singleRegionTeamSize) {
return WORK_FULL_UTILIZATION / singleRegionTeamSize / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER;
}
// Data movement's resource control: Do not overload source servers used for the RelocateData
int getDestWorkFactor() {
// Work of moving a shard is even across destination servers
return WORK_FULL_UTILIZATION / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_DEST_SERVER;
}
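// Worked example (knob values assumed for illustration): with WORK_FULL_UTILIZATION = 10000
// and RELOCATION_PARALLELISM_PER_DEST_SERVER = 4, each launched move charges
// 10000 / 4 = 2500 of work to every destination server it lands on, so a fifth
// concurrent move targeting the same server fails canLaunch() in its priority band.
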
// Data movement's resource control: Do not overload servers used for the RelocateData
// return true if servers are not too busy to launch the relocation
// This ensures source servers will not be overloaded.
bool canLaunch(RelocateData& relocation,
int teamSize,
int singleRegionTeamSize,
std::map<UID, Busyness>& busymap,
std::vector<RelocateData> cancellableRelocations) {
bool canLaunchSrc(RelocateData& relocation,
int teamSize,
int singleRegionTeamSize,
std::map<UID, Busyness>& busymap,
std::vector<RelocateData> cancellableRelocations) {
// assert this has not already been launched
ASSERT(relocation.workFactor == 0);
ASSERT(relocation.src.size() != 0);
ASSERT(teamSize >= singleRegionTeamSize);
// find the "workFactor" for this, were it launched now
int workFactor = getWorkFactor(relocation, singleRegionTeamSize);
int workFactor = getSrcWorkFactor(relocation, singleRegionTeamSize);
int neededServers = std::min<int>(relocation.src.size(), teamSize - singleRegionTeamSize + 1);
if (SERVER_KNOBS->USE_OLD_NEEDED_SERVERS) {
neededServers = std::max(1, (int)relocation.src.size() - teamSize + 1);
@ -338,18 +345,55 @@ bool canLaunch(RelocateData& relocation,
return false;
}
// candidateTeams contains one team per datacenter: the team(s) DD is planning to move the shard to.
bool canLaunchDest(const std::vector<std::pair<Reference<IDataDistributionTeam>, bool>>& candidateTeams,
int priority,
std::map<UID, Busyness>& busymapDest) {
// Kill switch: a non-positive knob disables destination-side throttling in case it causes issues.
if (SERVER_KNOBS->RELOCATION_PARALLELISM_PER_DEST_SERVER <= 0) {
return true;
}
int workFactor = getDestWorkFactor();
for (auto& team : candidateTeams) {
for (UID id : team.first->getServerIDs()) {
if (!busymapDest[id].canLaunch(priority, workFactor)) {
return false;
}
}
}
return true;
}
// update busyness for each server
void launch(RelocateData& relocation, std::map<UID, Busyness>& busymap, int singleRegionTeamSize) {
// if we are here this means that we can launch and should adjust all the work the servers can do
relocation.workFactor = getWorkFactor(relocation, singleRegionTeamSize);
relocation.workFactor = getSrcWorkFactor(relocation, singleRegionTeamSize);
for (int i = 0; i < relocation.src.size(); i++)
busymap[relocation.src[i]].addWork(relocation.priority, relocation.workFactor);
}
void complete(RelocateData const& relocation, std::map<UID, Busyness>& busymap) {
void launchDest(RelocateData& relocation,
const std::vector<std::pair<Reference<IDataDistributionTeam>, bool>>& candidateTeams,
std::map<UID, Busyness>& destBusymap) {
ASSERT(relocation.completeDests.empty());
int destWorkFactor = getDestWorkFactor();
for (auto& team : candidateTeams) {
for (UID id : team.first->getServerIDs()) {
relocation.completeDests.push_back(id);
destBusymap[id].addWork(relocation.priority, destWorkFactor);
}
}
}
void complete(RelocateData const& relocation, std::map<UID, Busyness>& busymap, std::map<UID, Busyness>& destBusymap) {
ASSERT(relocation.workFactor > 0);
for (int i = 0; i < relocation.src.size(); i++)
busymap[relocation.src[i]].removeWork(relocation.priority, relocation.workFactor);
int destWorkFactor = getDestWorkFactor();
for (UID id : relocation.completeDests) {
destBusymap[id].removeWork(relocation.priority, destWorkFactor);
}
}
ACTOR Future<Void> dataDistributionRelocator(struct DDQueueData* self,
@ -376,6 +420,7 @@ struct DDQueueData {
int singleRegionTeamSize;
std::map<UID, Busyness> busymap; // UID is serverID
std::map<UID, Busyness> destBusymap; // UID is serverID
KeyRangeMap<RelocateData> queueMap;
std::set<RelocateData, std::greater<RelocateData>> fetchingSourcesQueue;
@ -546,15 +591,22 @@ struct DDQueueData {
.detail("Problem", "relocate data that is inFlight is not also in the queue");
}
for (int i = 0; i < it->value().completeDests.size(); i++) {
// each server in the inFlight map is in the dest busymap
if (!destBusymap.count(it->value().completeDests[i]))
TraceEvent(SevError, "DDQueueValidateError10")
.detail("Problem", "each server in the inFlight map is in the destBusymap");
}
// in flight relocates have source servers
if (it->value().startTime != -1 && !it->value().src.size())
TraceEvent(SevError, "DDQueueValidateError10")
TraceEvent(SevError, "DDQueueValidateError11")
.detail("Problem", "in flight relocates have source servers");
if (inFlightActors.liveActorAt(it->range().begin)) {
// the key range in the inFlight map matches the key range in the RelocateData message
if (it->value().keys != it->range())
TraceEvent(SevError, "DDQueueValidateError11")
TraceEvent(SevError, "DDQueueValidateError12")
.detail(
"Problem",
"the key range in the inFlight map matches the key range in the RelocateData message");
@ -564,13 +616,29 @@ struct DDQueueData {
for (auto it = busymap.begin(); it != busymap.end(); ++it) {
for (int i = 0; i < it->second.ledger.size() - 1; i++) {
if (it->second.ledger[i] < it->second.ledger[i + 1])
TraceEvent(SevError, "DDQueueValidateError12")
TraceEvent(SevError, "DDQueueValidateError13")
.detail("Problem", "ascending ledger problem")
.detail("LedgerLevel", i)
.detail("LedgerValueA", it->second.ledger[i])
.detail("LedgerValueB", it->second.ledger[i + 1]);
if (it->second.ledger[i] < 0.0)
TraceEvent(SevError, "DDQueueValidateError13")
TraceEvent(SevError, "DDQueueValidateError14")
.detail("Problem", "negative ascending problem")
.detail("LedgerLevel", i)
.detail("LedgerValue", it->second.ledger[i]);
}
}
for (auto it = destBusymap.begin(); it != destBusymap.end(); ++it) {
for (int i = 0; i < it->second.ledger.size() - 1; i++) {
if (it->second.ledger[i] < it->second.ledger[i + 1])
TraceEvent(SevError, "DDQueueValidateError15")
.detail("Problem", "ascending ledger problem")
.detail("LedgerLevel", i)
.detail("LedgerValueA", it->second.ledger[i])
.detail("LedgerValueB", it->second.ledger[i + 1]);
if (it->second.ledger[i] < 0.0)
TraceEvent(SevError, "DDQueueValidateError16")
.detail("Problem", "negative ascending problem")
.detail("LedgerLevel", i)
.detail("LedgerValue", it->second.ledger[i]);
@ -895,7 +963,7 @@ struct DDQueueData {
// SOMEDAY: the list of source servers may be outdated since they were fetched when the work was put in the
// queue
// FIXME: we need spare capacity even when we're just going to be cancelling work via TEAM_HEALTHY
if (!canLaunch(rd, teamSize, singleRegionTeamSize, busymap, cancellableRelocations)) {
if (!canLaunchSrc(rd, teamSize, singleRegionTeamSize, busymap, cancellableRelocations)) {
// logRelocation( rd, "SkippingQueuedRelocation" );
continue;
}
@ -956,6 +1024,18 @@ struct DDQueueData {
}
};
static std::string destServersString(std::vector<std::pair<Reference<IDataDistributionTeam>, bool>> const& bestTeams) {
std::stringstream ss;
for (auto& tc : bestTeams) {
for (const auto& id : tc.first->getServerIDs()) {
ss << id.toString() << " ";
}
}
return std::move(ss).str();
}
// This actor relocates the specified keys to a good place.
// The inFlightActor key range map stores the actor for each RelocateData
ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd, const DDEnabledState* ddEnabledState) {
@ -970,6 +1050,9 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd,
state bool anyHealthy = false;
state bool allHealthy = true;
state bool anyWithSource = false;
state bool anyDestOverloaded = false;
state int destOverloadedCount = 0;
state int stuckCount = 0;
state std::vector<std::pair<Reference<IDataDistributionTeam>, bool>> bestTeams;
state double startTime = now();
state std::vector<UID> destIds;
@ -997,7 +1080,8 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd,
ASSERT(rd.src.size());
loop {
state int stuckCount = 0;
destOverloadedCount = 0;
stuckCount = 0;
// state int bestTeamStuckThreshold = 50;
loop {
state int tciIndex = 0;
@ -1005,6 +1089,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd,
anyHealthy = false;
allHealthy = true;
anyWithSource = false;
anyDestOverloaded = false;
bestTeams.clear();
// Get team from teamCollections in different DCs and find the best one
while (tciIndex < self->teamCollections.size()) {
@ -1058,18 +1143,41 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd,
bestTeams.emplace_back(bestTeam.first.get(), bestTeam.second);
tciIndex++;
}
if (foundTeams && anyHealthy) {
// once we've found healthy candidate teams, make sure they're not overloaded with outstanding moves
// already
anyDestOverloaded = !canLaunchDest(bestTeams, rd.priority, self->destBusymap);
if (foundTeams && anyHealthy && !anyDestOverloaded) {
ASSERT(rd.completeDests.empty());
break;
}
TEST(true); // did not find a healthy destination team on the first attempt
stuckCount++;
TraceEvent(stuckCount > 50 ? SevWarnAlways : SevWarn, "BestTeamStuck", distributorId)
.suppressFor(1.0)
.detail("Count", stuckCount)
.detail("TeamCollectionId", tciIndex)
.detail("NumOfTeamCollections", self->teamCollections.size());
wait(delay(SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskPriority::DataDistributionLaunch));
if (anyDestOverloaded) {
TEST(true); // Destination overloaded throttled move
destOverloadedCount++;
TraceEvent(destOverloadedCount > 50 ? SevInfo : SevDebug, "DestSSBusy", distributorId)
.suppressFor(1.0)
.detail("StuckCount", stuckCount)
.detail("DestOverloadedCount", destOverloadedCount)
.detail("TeamCollectionId", tciIndex)
.detail("AnyDestOverloaded", anyDestOverloaded)
.detail("NumOfTeamCollections", self->teamCollections.size())
.detail("Servers", destServersString(bestTeams));
wait(delay(SERVER_KNOBS->DEST_OVERLOADED_DELAY, TaskPriority::DataDistributionLaunch));
} else {
TEST(true); // did not find a healthy destination team on the first attempt
stuckCount++;
TraceEvent(stuckCount > 50 ? SevWarnAlways : SevWarn, "BestTeamStuck", distributorId)
.suppressFor(1.0)
.detail("StuckCount", stuckCount)
.detail("DestOverloadedCount", destOverloadedCount)
.detail("TeamCollectionId", tciIndex)
.detail("AnyDestOverloaded", anyDestOverloaded)
.detail("NumOfTeamCollections", self->teamCollections.size());
wait(delay(SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskPriority::DataDistributionLaunch));
}
// TODO different trace event + knob for overloaded? Could wait on an async var for done moves
}
destIds.clear();
@ -1123,6 +1231,8 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd,
// FIXME: do not add data in flight to servers that were already in the src.
healthyDestinations.addDataInFlightToTeam(+metrics.bytes);
launchDest(rd, bestTeams, self->destBusymap);
if (SERVER_KNOBS->DD_ENABLE_VERBOSE_TRACING) {
// StorageMetrics is the rd shard's metrics, e.g., bytes and write bandwidth
TraceEvent(SevInfo, "RelocateShardDecision", distributorId)
@ -1646,7 +1756,7 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
launchData = results;
}
when(RelocateData done = waitNext(self.dataTransferComplete.getFuture())) {
complete(done, self.busymap);
complete(done, self.busymap, self.destBusymap);
if (serversToLaunchFrom.empty() && !done.src.empty())
launchQueuedWorkTimeout = delay(0, TaskPriority::DataDistributionLaunch);
serversToLaunchFrom.insert(done.src.begin(), done.src.end());

View File

@ -1095,21 +1095,48 @@ public:
// DecodedNodes are stored in a contiguous vector, which sometimes must be expanded, so care
// must be taken to resolve DecodedNode pointers again after the DecodeCache has new entries added.
struct DecodeCache : FastAllocated<DecodeCache>, ReferenceCounted<DecodeCache> {
DecodeCache(const T& lowerBound = T(), const T& upperBound = T())
: lowerBound(arena, lowerBound), upperBound(arena, upperBound) {
DecodeCache(const T& lowerBound = T(), const T& upperBound = T(), int64_t* pMemoryTracker = nullptr)
: lowerBound(arena, lowerBound), upperBound(arena, upperBound), lastKnownUsedMemory(0),
pMemoryTracker(pMemoryTracker) {
decodedNodes.reserve(10);
deltatree_printf("DecodedNode size: %d\n", sizeof(DecodedNode));
}
~DecodeCache() {
if (pMemoryTracker != nullptr) {
// Do not update, only subtract the last known amount which would have been
// published to the counter
*pMemoryTracker -= lastKnownUsedMemory;
}
}
Arena arena;
T lowerBound;
T upperBound;
// Track the amount of memory used by the vector and arena and publish updates to some counter.
// Note that no update is pushed on construction because a Cursor will surely soon follow.
// Updates are pushed to the counter on
// DecodeCache clear
// DecodeCache destruction
// Cursor destruction
// as those are the most efficient times to publish an update.
int lastKnownUsedMemory;
int64_t* pMemoryTracker;
// Index 0 is always the root
std::vector<DecodedNode> decodedNodes;
DecodedNode& get(int index) { return decodedNodes[index]; }
void updateUsedMemory() {
int usedNow = sizeof(DeltaTree2) + arena.getSize(true) + (decodedNodes.capacity() * sizeof(DecodedNode));
if (pMemoryTracker != nullptr) {
*pMemoryTracker += (usedNow - lastKnownUsedMemory);
}
lastKnownUsedMemory = usedNow;
}
template <class... Args>
int emplace_new(Args&&... args) {
int index = decodedNodes.size();
@ -1125,6 +1152,7 @@ public:
lowerBound = T(a, lowerBound);
upperBound = T(a, upperBound);
arena = a;
updateUsedMemory();
}
};
@ -1142,6 +1170,12 @@ public:
// Copy constructor does not copy item because normally a copied cursor will be immediately moved.
Cursor(const Cursor& c) : tree(c.tree), cache(c.cache), nodeIndex(c.nodeIndex) {}
~Cursor() {
if (cache != nullptr) {
cache->updateUsedMemory();
}
}
Cursor next() const {
Cursor c = *this;
c.moveNext();
@ -1545,7 +1579,17 @@ public:
T leftBase = leftBaseIndex == -1 ? cache->lowerBound : get(cache->get(leftBaseIndex));
T rightBase = rightBaseIndex == -1 ? cache->upperBound : get(cache->get(rightBaseIndex));
int common = leftBase.getCommonPrefixLen(rightBase, skipLen);
// If seek has reached a non-edge node then whatever bytes the left and right bases
// have in common are definitely in common with k. However, for an edge node there
// is no guarantee, as one of the bases will be the lower or upper decode boundary
// and it is possible to add elements to the DeltaTree beyond those boundaries.
int common;
if (leftBaseIndex == -1 || rightBaseIndex == -1) {
common = 0;
} else {
common = leftBase.getCommonPrefixLen(rightBase, skipLen);
}
int commonWithLeftParent = k.getCommonPrefixLen(leftBase, common);
int commonWithRightParent = k.getCommonPrefixLen(rightBase, common);
bool borrowFromLeft = commonWithLeftParent >= commonWithRightParent;

View File

@ -24,6 +24,22 @@
#include "fdbclient/FDBTypes.h"
#include "fdbserver/Knobs.h"
#include "fdbclient/StorageCheckpoint.h"
struct CheckpointRequest {
const Version version; // The FDB version at which the checkpoint is created.
const KeyRange range; // Keyrange this checkpoint must contain.
const CheckpointFormat format;
const UID checkpointID;
const std::string checkpointDir; // The local directory where the checkpoint file will be created.
CheckpointRequest(const Version version,
const KeyRange& range,
const CheckpointFormat format,
const UID& id,
const std::string& checkpointDir)
: version(version), range(range), format(format), checkpointID(id), checkpointDir(checkpointDir) {}
};
class IClosable {
public:
@ -87,6 +103,15 @@ public:
virtual void enableSnapshot() {}
// Create a checkpoint.
virtual Future<CheckpointMetaData> checkpoint(const CheckpointRequest& request) { throw not_implemented(); }
// Restore from a checkpoint.
virtual Future<Void> restore(const std::vector<CheckpointMetaData>& checkpoints) { throw not_implemented(); }
// Delete a checkpoint.
virtual Future<Void> deleteCheckpoint(const CheckpointMetaData& checkpoint) { throw not_implemented(); }
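// A usage sketch of the checkpoint API above (the directory and version are assumptions;
// real callers live in the storage server, inside a flow actor with an open kvStore):
//
//     CheckpointRequest req(readVersion, allKeys, RocksDBColumnFamily,
//                           deterministicRandom()->randomUniqueID(), "/data/checkpoints");
//     CheckpointMetaData metaData = wait(kvStore->checkpoint(req));
//     // later, when the reference count reaches zero:
//     wait(kvStore->deleteCheckpoint(metaData));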
/*
Concurrency contract
Causal consistency:

View File

@ -309,6 +309,11 @@ public:
// Advance the commit version and the oldest readable version and commit until the remap queue is empty.
virtual Future<Void> clearRemapQueue() = 0;
// Get a pointer to an integer representing a byte count penalty the pager should apply against usable page cache
// memory. This is used to track significant memory usage external to the pager. Such usages should
// increment/decrement the value at this pointer based on their memory footprint.
virtual int64_t* getPageCachePenaltySource() = 0;
protected:
~IPager2() {} // Destruction should be done using close()/dispose() from the IClosable interface
};
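// A wiring sketch for this counter (the pager and cache types are assumptions based on
// the declarations above and DeltaTree2::DecodeCache):
//
//     int64_t* penalty = pager->getPageCachePenaltySource();
//     auto cache = makeReference<DeltaTree2<RedwoodRecordRef>::DecodeCache>(lower, upper, penalty);
//     // DecodeCache::updateUsedMemory() and Cursor/cache destruction keep *penalty current,
//     // charging decoded-node memory against the page cache budget.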

View File

@ -5,11 +5,21 @@
#include <rocksdb/filter_policy.h>
#include <rocksdb/listener.h>
#include <rocksdb/options.h>
#include <rocksdb/metadata.h>
#include <rocksdb/slice_transform.h>
#include <rocksdb/sst_file_reader.h>
#include <rocksdb/sst_file_writer.h>
#include <rocksdb/slice.h>
#include <rocksdb/env.h>
#include <rocksdb/statistics.h>
#include <rocksdb/table.h>
#include <rocksdb/types.h>
#include <rocksdb/utilities/checkpoint.h>
#include <rocksdb/utilities/table_properties_collectors.h>
#include <rocksdb/version.h>
#include <rocksdb/rate_limiter.h>
#include <rocksdb/perf_context.h>
#include <rocksdb/c.h>
@ -32,6 +42,8 @@
#endif // SSD_ROCKSDB_EXPERIMENTAL
#include "fdbserver/IKeyValueStore.h"
#include "fdbserver/RocksDBCheckpointUtils.actor.h"
#include "flow/actorcompiler.h" // has to be last include
#ifdef SSD_ROCKSDB_EXPERIMENTAL
@ -114,7 +126,10 @@ private:
std::mutex mutex;
};
using DB = rocksdb::DB*;
using CF = rocksdb::ColumnFamilyHandle*;
#define PERSIST_PREFIX "\xff\xff"
const KeyRef persistVersion = LiteralStringRef(PERSIST_PREFIX "Version");
const StringRef ROCKSDBSTORAGE_HISTOGRAM_GROUP = LiteralStringRef("RocksDBStorage");
const StringRef ROCKSDB_COMMIT_LATENCY_HISTOGRAM = LiteralStringRef("RocksDBCommitLatency");
const StringRef ROCKSDB_COMMIT_ACTION_HISTOGRAM = LiteralStringRef("RocksDBCommitAction");
@ -134,6 +149,74 @@ const StringRef ROCKSDB_READRANGE_NEWITERATOR_HISTOGRAM = LiteralStringRef("Rock
const StringRef ROCKSDB_READVALUE_GET_HISTOGRAM = LiteralStringRef("RocksDBReadValueGet");
const StringRef ROCKSDB_READPREFIX_GET_HISTOGRAM = LiteralStringRef("RocksDBReadPrefixGet");
rocksdb::ExportImportFilesMetaData getMetaData(const CheckpointMetaData& checkpoint) {
rocksdb::ExportImportFilesMetaData metaData;
if (checkpoint.getFormat() != RocksDBColumnFamily) {
return metaData;
}
RocksDBColumnFamilyCheckpoint rocksCF = getRocksCF(checkpoint);
metaData.db_comparator_name = rocksCF.dbComparatorName;
for (const LiveFileMetaData& fileMetaData : rocksCF.sstFiles) {
rocksdb::LiveFileMetaData liveFileMetaData;
liveFileMetaData.size = fileMetaData.size;
liveFileMetaData.name = fileMetaData.name;
liveFileMetaData.file_number = fileMetaData.file_number;
liveFileMetaData.db_path = fileMetaData.db_path;
liveFileMetaData.smallest_seqno = fileMetaData.smallest_seqno;
liveFileMetaData.largest_seqno = fileMetaData.largest_seqno;
liveFileMetaData.smallestkey = fileMetaData.smallestkey;
liveFileMetaData.largestkey = fileMetaData.largestkey;
liveFileMetaData.num_reads_sampled = fileMetaData.num_reads_sampled;
liveFileMetaData.being_compacted = fileMetaData.being_compacted;
liveFileMetaData.num_entries = fileMetaData.num_entries;
liveFileMetaData.num_deletions = fileMetaData.num_deletions;
liveFileMetaData.temperature = static_cast<rocksdb::Temperature>(fileMetaData.temperature);
liveFileMetaData.oldest_blob_file_number = fileMetaData.oldest_blob_file_number;
liveFileMetaData.oldest_ancester_time = fileMetaData.oldest_ancester_time;
liveFileMetaData.file_creation_time = fileMetaData.file_creation_time;
liveFileMetaData.file_checksum = fileMetaData.file_checksum;
liveFileMetaData.file_checksum_func_name = fileMetaData.file_checksum_func_name;
liveFileMetaData.column_family_name = fileMetaData.column_family_name;
liveFileMetaData.level = fileMetaData.level;
metaData.files.push_back(liveFileMetaData);
}
return metaData;
}
void populateMetaData(CheckpointMetaData* checkpoint, const rocksdb::ExportImportFilesMetaData& metaData) {
RocksDBColumnFamilyCheckpoint rocksCF;
rocksCF.dbComparatorName = metaData.db_comparator_name;
for (const rocksdb::LiveFileMetaData& fileMetaData : metaData.files) {
LiveFileMetaData liveFileMetaData;
liveFileMetaData.size = fileMetaData.size;
liveFileMetaData.name = fileMetaData.name;
liveFileMetaData.file_number = fileMetaData.file_number;
liveFileMetaData.db_path = fileMetaData.db_path;
liveFileMetaData.smallest_seqno = fileMetaData.smallest_seqno;
liveFileMetaData.largest_seqno = fileMetaData.largest_seqno;
liveFileMetaData.smallestkey = fileMetaData.smallestkey;
liveFileMetaData.largestkey = fileMetaData.largestkey;
liveFileMetaData.num_reads_sampled = fileMetaData.num_reads_sampled;
liveFileMetaData.being_compacted = fileMetaData.being_compacted;
liveFileMetaData.num_entries = fileMetaData.num_entries;
liveFileMetaData.num_deletions = fileMetaData.num_deletions;
liveFileMetaData.temperature = static_cast<uint8_t>(fileMetaData.temperature);
liveFileMetaData.oldest_blob_file_number = fileMetaData.oldest_blob_file_number;
liveFileMetaData.oldest_ancester_time = fileMetaData.oldest_ancester_time;
liveFileMetaData.file_creation_time = fileMetaData.file_creation_time;
liveFileMetaData.file_checksum = fileMetaData.file_checksum;
liveFileMetaData.file_checksum_func_name = fileMetaData.file_checksum_func_name;
liveFileMetaData.column_family_name = fileMetaData.column_family_name;
liveFileMetaData.level = fileMetaData.level;
rocksCF.sstFiles.push_back(liveFileMetaData);
}
checkpoint->setFormat(RocksDBColumnFamily);
checkpoint->serializedCheckpoint = ObjectWriter::toValue(rocksCF, IncludeVersion());
}
rocksdb::Slice toSlice(StringRef s) {
return rocksdb::Slice(reinterpret_cast<const char*>(s.begin()), s.size());
}
@ -219,12 +302,13 @@ rocksdb::ReadOptions getReadOptions() {
}
struct ReadIterator {
CF& cf;
uint64_t index; // incrementing counter to uniquely identify read iterator.
bool inUse;
std::shared_ptr<rocksdb::Iterator> iter;
double creationTime;
ReadIterator(uint64_t index, DB& db, rocksdb::ReadOptions& options)
: index(index), inUse(true), creationTime(now()), iter(db->NewIterator(options)) {}
ReadIterator(CF& cf, uint64_t index, DB& db, rocksdb::ReadOptions& options)
: cf(cf), index(index), inUse(true), creationTime(now()), iter(db->NewIterator(options, cf)) {}
};
/*
@ -241,8 +325,8 @@ gets deleted as the ref count becomes 0.
*/
class ReadIteratorPool {
public:
ReadIteratorPool(DB& db, const std::string& path)
: db(db), index(0), iteratorsReuseCount(0), readRangeOptions(getReadOptions()) {
ReadIteratorPool(DB& db, CF& cf, const std::string& path)
: db(db), cf(cf), index(0), iteratorsReuseCount(0), readRangeOptions(getReadOptions()) {
readRangeOptions.background_purge_on_iterator_cleanup = true;
readRangeOptions.auto_prefix_mode = (SERVER_KNOBS->ROCKSDB_PREFIX_LEN > 0);
TraceEvent("ReadIteratorPool")
@ -271,12 +355,12 @@ public:
}
}
index++;
ReadIterator iter(index, db, readRangeOptions);
ReadIterator iter(cf, index, db, readRangeOptions);
iteratorsMap.insert({ index, iter });
return iter;
} else {
index++;
ReadIterator iter(index, db, readRangeOptions);
ReadIterator iter(cf, index, db, readRangeOptions);
return iter;
}
}
@ -316,6 +400,7 @@ private:
std::unordered_map<int, ReadIterator> iteratorsMap;
std::unordered_map<int, ReadIterator>::iterator it;
DB& db;
CF& cf;
rocksdb::ReadOptions readRangeOptions;
std::mutex mutex;
// incrementing counter for every new iterator creation, to uniquely identify the iterator in returnIterator().
@ -735,10 +820,9 @@ Error statusToError(const rocksdb::Status& s) {
}
struct RocksDBKeyValueStore : IKeyValueStore {
using CF = rocksdb::ColumnFamilyHandle*;
struct Writer : IThreadPoolReceiver {
DB& db;
CF& cf;
UID id;
std::shared_ptr<rocksdb::RateLimiter> rateLimiter;
@ -752,11 +836,12 @@ struct RocksDBKeyValueStore : IKeyValueStore {
int threadIndex;
explicit Writer(DB& db,
CF& cf,
UID id,
std::shared_ptr<ReadIteratorPool> readIterPool,
std::shared_ptr<PerfContextMetrics> perfContextMetrics,
int threadIndex)
: db(db), id(id), readIterPool(readIterPool), perfContextMetrics(perfContextMetrics),
: db(db), cf(cf), id(id), readIterPool(readIterPool), perfContextMetrics(perfContextMetrics),
threadIndex(threadIndex),
rateLimiter(SERVER_KNOBS->ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC > 0
? rocksdb::NewGenericRateLimiter(
@ -814,40 +899,71 @@ struct RocksDBKeyValueStore : IKeyValueStore {
double getTimeEstimate() const override { return SERVER_KNOBS->COMMIT_TIME_ESTIMATE; }
};
void action(OpenAction& a) {
std::vector<rocksdb::ColumnFamilyDescriptor> defaultCF = { rocksdb::ColumnFamilyDescriptor{
"default", getCFOptions() } };
std::vector<rocksdb::ColumnFamilyHandle*> handle;
auto options = getOptions();
ASSERT(cf == nullptr);
std::vector<std::string> columnFamilies;
rocksdb::Options options = getOptions();
rocksdb::Status status = rocksdb::DB::ListColumnFamilies(options, a.path, &columnFamilies);
if (std::find(columnFamilies.begin(), columnFamilies.end(), "default") == columnFamilies.end()) {
columnFamilies.push_back("default");
}
rocksdb::ColumnFamilyOptions cfOptions = getCFOptions();
std::vector<rocksdb::ColumnFamilyDescriptor> descriptors;
for (const std::string& name : columnFamilies) {
descriptors.push_back(rocksdb::ColumnFamilyDescriptor{ name, cfOptions });
}
options.listeners.push_back(a.errorListener);
if (SERVER_KNOBS->ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC > 0) {
options.rate_limiter = rateLimiter;
}
auto status = rocksdb::DB::Open(options, a.path, defaultCF, &handle, &db);
std::vector<rocksdb::ColumnFamilyHandle*> handles;
status = rocksdb::DB::Open(options, a.path, descriptors, &handles, &db);
if (!status.ok()) {
logRocksDBError(status, "Open");
a.done.sendError(statusToError(status));
return;
}
for (rocksdb::ColumnFamilyHandle* handle : handles) {
if (handle->GetName() == SERVER_KNOBS->DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY) {
cf = handle;
break;
}
}
if (cf == nullptr) {
status = db->CreateColumnFamily(cfOptions, SERVER_KNOBS->DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY, &cf);
if (!status.ok()) {
logRocksDBError(status, "Open");
a.done.sendError(statusToError(status));
return;
}
}
TraceEvent(SevInfo, "RocksDB")
.detail("Path", a.path)
.detail("Method", "Open")
.detail("KnobRocksDBWriteRateLimiterBytesPerSec",
SERVER_KNOBS->ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC)
.detail("KnobRocksDBWriteRateLimiterAutoTune", SERVER_KNOBS->ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE)
.detail("ColumnFamily", cf->GetName());
if (g_network->isSimulated()) {
// The current thread and the main thread are the same when the code runs in simulation.
// blockUntilReady() would deadlock in that case, so call the metricsLogger directly.
a.metrics = rocksDBMetricLogger(options.statistics, perfContextMetrics, db, readIterPool) &&
flowLockLogger(a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool);
} else {
TraceEvent(SevInfo, "RocksDB")
.detail("Path", a.path)
.detail("Method", "Open")
.detail("KnobRocksDBWriteRateLimiterBytesPerSec",
SERVER_KNOBS->ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC)
.detail("KnobRocksDBWriteRateLimiterAutoTune", SERVER_KNOBS->ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE);
if (g_network->isSimulated()) {
// The current thread and main thread are same when the code runs in simulation.
// blockUntilReady() is getting the thread into deadlock state, so directly calling
// the metricsLogger.
onMainThread([&] {
a.metrics = rocksDBMetricLogger(options.statistics, perfContextMetrics, db, readIterPool) &&
flowLockLogger(a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool);
} else {
onMainThread([&] {
a.metrics = rocksDBMetricLogger(options.statistics, perfContextMetrics, db, readIterPool) &&
flowLockLogger(a.readLock, a.fetchLock) && refreshReadIteratorPool(readIterPool);
return Future<bool>(true);
}).blockUntilReady();
}
a.done.send(Void());
return Future<bool>(true);
}).blockUntilReady();
}
a.done.send(Void());
}
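For reference, a minimal standalone sketch of the open-or-create column family pattern that OpenAction implements above, written against plain RocksDB (the function name openWithColumnFamily and the default options are illustrative assumptions, not part of this patch):

#include <rocksdb/db.h>
#include <algorithm>
#include <string>
#include <vector>

rocksdb::Status openWithColumnFamily(const std::string& path,
                                     const std::string& cfName,
                                     rocksdb::DB** db,
                                     rocksdb::ColumnFamilyHandle** cf) {
    rocksdb::Options options;
    options.create_if_missing = true;
    std::vector<std::string> names;
    // May fail for a brand-new database; in that case names stays empty.
    rocksdb::Status listStatus = rocksdb::DB::ListColumnFamilies(options, path, &names);
    (void)listStatus;
    if (std::find(names.begin(), names.end(), "default") == names.end()) {
        names.push_back("default"); // RocksDB requires the default CF to be opened
    }
    std::vector<rocksdb::ColumnFamilyDescriptor> descriptors;
    for (const std::string& name : names) {
        descriptors.push_back(rocksdb::ColumnFamilyDescriptor{ name, rocksdb::ColumnFamilyOptions() });
    }
    std::vector<rocksdb::ColumnFamilyHandle*> handles;
    rocksdb::Status status = rocksdb::DB::Open(options, path, descriptors, &handles, db);
    if (!status.ok()) {
        return status;
    }
    *cf = nullptr;
    for (rocksdb::ColumnFamilyHandle* handle : handles) {
        if (handle->GetName() == cfName) {
            *cf = handle;
        }
    }
    if (*cf == nullptr) {
        // First open: the column family does not exist yet, so create it.
        status = (*db)->CreateColumnFamily(rocksdb::ColumnFamilyOptions(), cfName, cf);
    }
    return status;
}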
struct DeleteVisitor : public rocksdb::WriteBatch::Handler {
@ -863,6 +979,26 @@ struct RocksDBKeyValueStore : IKeyValueStore {
deletes.push_back_deep(arena, kr);
return rocksdb::Status::OK();
}
rocksdb::Status PutCF(uint32_t column_family_id,
const rocksdb::Slice& key,
const rocksdb::Slice& value) override {
return rocksdb::Status::OK();
}
rocksdb::Status DeleteCF(uint32_t column_family_id, const rocksdb::Slice& key) override {
return rocksdb::Status::OK();
}
rocksdb::Status SingleDeleteCF(uint32_t column_family_id, const rocksdb::Slice& key) override {
return rocksdb::Status::OK();
}
rocksdb::Status MergeCF(uint32_t column_family_id,
const rocksdb::Slice& key,
const rocksdb::Slice& value) override {
return rocksdb::Status::OK();
}
};
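The no-op *CF overrides above exist so that WriteBatch::Iterate() succeeds on batches that mix puts and point deletes with the range deletes the visitor actually cares about. A self-contained sketch of the same pattern (RangeDeleteCollector is an illustrative name):

#include <rocksdb/write_batch.h>
#include <string>
#include <utility>
#include <vector>

struct RangeDeleteCollector : public rocksdb::WriteBatch::Handler {
    std::vector<std::pair<std::string, std::string>> ranges;
    rocksdb::Status DeleteRangeCF(uint32_t /*cfId*/,
                                  const rocksdb::Slice& begin,
                                  const rocksdb::Slice& end) override {
        ranges.emplace_back(begin.ToString(), end.ToString()); // record the cleared range
        return rocksdb::Status::OK();
    }
    // No-ops so that Iterate() does not fail on the other mutation types.
    rocksdb::Status PutCF(uint32_t, const rocksdb::Slice&, const rocksdb::Slice&) override {
        return rocksdb::Status::OK();
    }
    rocksdb::Status DeleteCF(uint32_t, const rocksdb::Slice&) override { return rocksdb::Status::OK(); }
    rocksdb::Status SingleDeleteCF(uint32_t, const rocksdb::Slice&) override { return rocksdb::Status::OK(); }
    rocksdb::Status MergeCF(uint32_t, const rocksdb::Slice&, const rocksdb::Slice&) override {
        return rocksdb::Status::OK();
    }
};

// Usage: RangeDeleteCollector collector; rocksdb::Status s = batch.Iterate(&collector);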
struct CommitAction : TypedAction<Writer, CommitAction> {
@ -894,7 +1030,12 @@ struct RocksDBKeyValueStore : IKeyValueStore {
}
Standalone<VectorRef<KeyRangeRef>> deletes;
DeleteVisitor dv(deletes, deletes.arena());
ASSERT(a.batchToCommit->Iterate(&dv).ok());
rocksdb::Status s = a.batchToCommit->Iterate(&dv);
if (!s.ok()) {
logRocksDBError(s, "CommitDeleteVisitor");
a.done.sendError(statusToError(s));
return;
}
// If there are any range deletes, we should have added them to be deleted.
ASSERT(!deletes.empty() || !a.batchToCommit->HasDeleteRange());
rocksdb::WriteOptions options;
@ -906,7 +1047,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
// Request for batchToCommit bytes. If this request cannot be satisfied, the call is blocked.
rateLimiter->Request(a.batchToCommit->GetDataSize() /* bytes */, rocksdb::Env::IO_HIGH);
}
auto s = db->Write(options, a.batchToCommit.get());
s = db->Write(options, a.batchToCommit.get());
readIterPool->update();
if (a.getHistograms) {
writeHistogram->sampleSeconds(timer_monotonic() - writeBeginTime);
@ -922,7 +1063,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
for (const auto& keyRange : deletes) {
auto begin = toSlice(keyRange.begin);
auto end = toSlice(keyRange.end);
ASSERT(db->SuggestCompactRange(db->DefaultColumnFamily(), &begin, &end).ok());
ASSERT(db->SuggestCompactRange(cf, &begin, &end).ok());
}
if (a.getHistograms) {
deleteCompactRangeHistogram->sampleSeconds(timer_monotonic() - compactRangeBeginTime);
@ -956,9 +1097,13 @@ struct RocksDBKeyValueStore : IKeyValueStore {
logRocksDBError(s, "Close");
}
if (a.deleteOnClose) {
std::vector<rocksdb::ColumnFamilyDescriptor> defaultCF = { rocksdb::ColumnFamilyDescriptor{
"default", getCFOptions() } };
s = rocksdb::DestroyDB(a.path, getOptions(), defaultCF);
std::set<std::string> columnFamilies{ "default" };
columnFamilies.insert(SERVER_KNOBS->DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY);
std::vector<rocksdb::ColumnFamilyDescriptor> descriptors;
for (const std::string& name : columnFamilies) {
descriptors.push_back(rocksdb::ColumnFamilyDescriptor{ name, getCFOptions() });
}
s = rocksdb::DestroyDB(a.path, getOptions(), descriptors);
if (!s.ok()) {
logRocksDBError(s, "Destroy");
} else {
@ -968,10 +1113,133 @@ struct RocksDBKeyValueStore : IKeyValueStore {
TraceEvent("RocksDB").detail("Path", a.path).detail("Method", "Close");
a.done.send(Void());
}
struct CheckpointAction : TypedAction<Writer, CheckpointAction> {
CheckpointAction(const CheckpointRequest& request) : request(request) {}
double getTimeEstimate() const override { return SERVER_KNOBS->COMMIT_TIME_ESTIMATE; }
const CheckpointRequest request;
ThreadReturnPromise<CheckpointMetaData> reply;
};
void action(CheckpointAction& a) {
TraceEvent("RocksDBServeCheckpointBegin", id)
.detail("MinVersion", a.request.version)
.detail("Range", a.request.range.toString())
.detail("Format", static_cast<int>(a.request.format))
.detail("CheckpointDir", a.request.checkpointDir);
rocksdb::Checkpoint* checkpoint;
rocksdb::Status s = rocksdb::Checkpoint::Create(db, &checkpoint);
if (!s.ok()) {
logRocksDBError(s, "Checkpoint");
a.reply.sendError(statusToError(s));
return;
}
rocksdb::PinnableSlice value;
rocksdb::ReadOptions readOptions = getReadOptions();
s = db->Get(readOptions, cf, toSlice(persistVersion), &value);
if (!s.ok() && !s.IsNotFound()) {
logRocksDBError(s, "Checkpoint");
a.reply.sendError(statusToError(s));
return;
}
const Version version = s.IsNotFound()
? latestVersion
: BinaryReader::fromStringRef<Version>(toStringRef(value), Unversioned());
TraceEvent("RocksDBServeCheckpointVersion", id)
.detail("CheckpointVersion", a.request.version)
.detail("PersistVersion", version);
// TODO: set the range as the actual shard range.
CheckpointMetaData res(version, a.request.range, a.request.format, a.request.checkpointID);
const std::string& checkpointDir = a.request.checkpointDir;
if (a.request.format == RocksDBColumnFamily) {
rocksdb::ExportImportFilesMetaData* pMetadata;
platform::eraseDirectoryRecursive(checkpointDir);
const std::string cwd = platform::getWorkingDirectory() + "/";
s = checkpoint->ExportColumnFamily(cf, checkpointDir, &pMetadata);
if (!s.ok()) {
logRocksDBError(s, "Checkpoint");
a.reply.sendError(statusToError(s));
return;
}
populateMetaData(&res, *pMetadata);
delete pMetadata;
TraceEvent("RocksDBServeCheckpointSuccess", id)
.detail("CheckpointMetaData", res.toString())
.detail("RocksDBCF", getRocksCF(res).toString());
} else {
throw not_implemented();
}
res.setState(CheckpointMetaData::Complete);
a.reply.send(res);
}
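A condensed sketch of the export path CheckpointAction drives above (exportColumnFamily is an illustrative name; error handling is trimmed):

#include <rocksdb/db.h>
#include <rocksdb/metadata.h>
#include <rocksdb/utilities/checkpoint.h>
#include <string>

rocksdb::Status exportColumnFamily(rocksdb::DB* db,
                                   rocksdb::ColumnFamilyHandle* cf,
                                   const std::string& exportDir,
                                   rocksdb::ExportImportFilesMetaData** metadata) {
    rocksdb::Checkpoint* checkpoint = nullptr;
    rocksdb::Status s = rocksdb::Checkpoint::Create(db, &checkpoint);
    if (!s.ok()) {
        return s;
    }
    // exportDir must not exist yet; the SST files are hard-linked (same
    // filesystem) or copied into it, and *metadata records how to re-import them.
    s = checkpoint->ExportColumnFamily(cf, exportDir, metadata);
    delete checkpoint;
    return s;
}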
struct RestoreAction : TypedAction<Writer, RestoreAction> {
RestoreAction(const std::string& path, const std::vector<CheckpointMetaData>& checkpoints)
: path(path), checkpoints(checkpoints) {}
double getTimeEstimate() const override { return SERVER_KNOBS->COMMIT_TIME_ESTIMATE; }
const std::string path;
const std::vector<CheckpointMetaData> checkpoints;
ThreadReturnPromise<Void> done;
};
void action(RestoreAction& a) {
TraceEvent("RocksDBServeRestoreBegin", id).detail("Path", a.path);
// TODO: Fail gracefully.
ASSERT(!a.checkpoints.empty());
if (a.checkpoints[0].format == RocksDBColumnFamily) {
ASSERT_EQ(a.checkpoints.size(), 1);
TraceEvent("RocksDBServeRestoreCF", id)
.detail("Path", a.path)
.detail("Checkpoint", a.checkpoints[0].toString())
.detail("RocksDBCF", getRocksCF(a.checkpoints[0]).toString());
auto options = getOptions();
rocksdb::Status status = rocksdb::DB::Open(options, a.path, &db);
if (!status.ok()) {
logRocksDBError(status, "Restore");
a.done.sendError(statusToError(status));
return;
}
rocksdb::ExportImportFilesMetaData metaData = getMetaData(a.checkpoints[0]);
rocksdb::ImportColumnFamilyOptions importOptions;
importOptions.move_files = true;
status = db->CreateColumnFamilyWithImport(
getCFOptions(), SERVER_KNOBS->DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY, importOptions, metaData, &cf);
if (!status.ok()) {
logRocksDBError(status, "Restore");
a.done.sendError(statusToError(status));
} else {
TraceEvent(SevInfo, "RocksDB").detail("Path", a.path).detail("Method", "Restore");
a.done.send(Void());
}
} else {
throw not_implemented();
}
}
};
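And the matching import-side sketch that RestoreAction performs (importColumnFamily is an illustrative name):

#include <rocksdb/db.h>
#include <rocksdb/metadata.h>
#include <string>

rocksdb::Status importColumnFamily(rocksdb::DB* db,
                                   const std::string& cfName,
                                   const rocksdb::ExportImportFilesMetaData& metadata,
                                   rocksdb::ColumnFamilyHandle** cf) {
    rocksdb::ImportColumnFamilyOptions importOptions;
    // move_files transfers the exported SSTs into the new CF instead of copying.
    importOptions.move_files = true;
    return db->CreateColumnFamilyWithImport(
        rocksdb::ColumnFamilyOptions(), cfName, importOptions, metadata, cf);
}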
struct Reader : IThreadPoolReceiver {
DB& db;
CF& cf;
double readValueTimeout;
double readValuePrefixTimeout;
double readRangeTimeout;
@ -992,10 +1260,12 @@ struct RocksDBKeyValueStore : IKeyValueStore {
int threadIndex;
explicit Reader(DB& db,
CF& cf,
std::shared_ptr<ReadIteratorPool> readIterPool,
std::shared_ptr<PerfContextMetrics> perfContextMetrics,
int threadIndex)
: db(db), readIterPool(readIterPool), perfContextMetrics(perfContextMetrics), threadIndex(threadIndex),
: db(db), cf(cf), readIterPool(readIterPool), perfContextMetrics(perfContextMetrics),
threadIndex(threadIndex),
readRangeLatencyHistogram(Histogram::getHistogram(ROCKSDBSTORAGE_HISTOGRAM_GROUP,
ROCKSDB_READRANGE_LATENCY_HISTOGRAM,
Histogram::Unit::microseconds)),
@ -1066,6 +1336,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
double getTimeEstimate() const override { return SERVER_KNOBS->READ_VALUE_TIME_ESTIMATE; }
};
void action(ReadValueAction& a) {
ASSERT(cf != nullptr);
bool doPerfContextMetrics =
SERVER_KNOBS->ROCKSDB_PERFCONTEXT_ENABLE &&
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_PERFCONTEXT_SAMPLE_RATE);
@ -1098,7 +1369,13 @@ struct RocksDBKeyValueStore : IKeyValueStore {
options.deadline = std::chrono::duration_cast<std::chrono::microseconds>(deadlineSeconds);
double dbGetBeginTime = a.getHistograms ? timer_monotonic() : 0;
auto s = db->Get(options, db->DefaultColumnFamily(), toSlice(a.key), &value);
auto s = db->Get(options, cf, toSlice(a.key), &value);
if (!s.ok() && !s.IsNotFound()) {
logRocksDBError(s, "ReadValue");
a.result.sendError(statusToError(s));
return;
}
if (a.getHistograms) {
readValueGetHistogram->sampleSeconds(timer_monotonic() - dbGetBeginTime);
}
@ -1175,7 +1452,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
options.deadline = std::chrono::duration_cast<std::chrono::microseconds>(deadlineSeconds);
double dbGetBeginTime = a.getHistograms ? timer_monotonic() : 0;
auto s = db->Get(options, db->DefaultColumnFamily(), toSlice(a.key), &value);
auto s = db->Get(options, cf, toSlice(a.key), &value);
if (a.getHistograms) {
readPrefixGetHistogram->sampleSeconds(timer_monotonic() - dbGetBeginTime);
}
@ -1330,6 +1607,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
DB db = nullptr;
std::shared_ptr<PerfContextMetrics> perfContextMetrics;
std::string path;
rocksdb::ColumnFamilyHandle* defaultFdbCF = nullptr;
UID id;
Reference<IThreadPool> writeThread;
Reference<IThreadPool> readThreads;
@ -1357,7 +1635,8 @@ struct RocksDBKeyValueStore : IKeyValueStore {
Counters counters;
explicit RocksDBKeyValueStore(const std::string& path, UID id)
: path(path), id(id), perfContextMetrics(new PerfContextMetrics()), readIterPool(new ReadIteratorPool(db, path)),
: path(path), id(id), perfContextMetrics(new PerfContextMetrics()),
readIterPool(new ReadIteratorPool(db, defaultFdbCF, path)),
readSemaphore(SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX),
fetchSemaphore(SERVER_KNOBS->ROCKSDB_FETCH_QUEUE_SOFT_MAX),
numReadWaiters(SERVER_KNOBS->ROCKSDB_READ_QUEUE_HARD_MAX - SERVER_KNOBS->ROCKSDB_READ_QUEUE_SOFT_MAX),
@ -1381,11 +1660,11 @@ struct RocksDBKeyValueStore : IKeyValueStore {
readThreads = createGenericThreadPool();
}
writeThread->addThread(
new Writer(db, id, readIterPool, perfContextMetrics, SERVER_KNOBS->ROCKSDB_READ_PARALLELISM),
new Writer(db, defaultFdbCF, id, readIterPool, perfContextMetrics, SERVER_KNOBS->ROCKSDB_READ_PARALLELISM),
"fdb-rocksdb-wr");
TraceEvent("RocksDBReadThreads").detail("KnobRocksDBReadParallelism", SERVER_KNOBS->ROCKSDB_READ_PARALLELISM);
for (unsigned i = 0; i < SERVER_KNOBS->ROCKSDB_READ_PARALLELISM; ++i) {
readThreads->addThread(new Reader(db, readIterPool, perfContextMetrics, i), "fdb-rocksdb-re");
readThreads->addThread(new Reader(db, defaultFdbCF, readIterPool, perfContextMetrics, i), "fdb-rocksdb-re");
}
}
@ -1429,7 +1708,8 @@ struct RocksDBKeyValueStore : IKeyValueStore {
if (writeBatch == nullptr) {
writeBatch.reset(new rocksdb::WriteBatch());
}
writeBatch->Put(toSlice(kv.key), toSlice(kv.value));
ASSERT(defaultFdbCF != nullptr);
writeBatch->Put(defaultFdbCF, toSlice(kv.key), toSlice(kv.value));
}
void clear(KeyRangeRef keyRange, const Arena*) override {
@ -1437,10 +1717,12 @@ struct RocksDBKeyValueStore : IKeyValueStore {
writeBatch.reset(new rocksdb::WriteBatch());
}
ASSERT(defaultFdbCF != nullptr);
if (keyRange.singleKeyRange()) {
writeBatch->Delete(toSlice(keyRange.begin));
writeBatch->Delete(defaultFdbCF, toSlice(keyRange.begin));
} else {
writeBatch->DeleteRange(toSlice(keyRange.begin), toSlice(keyRange.end));
writeBatch->DeleteRange(defaultFdbCF, toSlice(keyRange.begin), toSlice(keyRange.end));
}
}
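A tiny sketch of the explicit column-family mutation calls that set() and clear() now issue (applyMutations and the literal keys are illustrative):

#include <rocksdb/db.h>
#include <rocksdb/write_batch.h>

rocksdb::Status applyMutations(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* cf) {
    rocksdb::WriteBatch batch;
    batch.Put(cf, "foo", "bar"); // set()
    batch.Delete(cf, "foo"); // clear() of a single key
    batch.DeleteRange(cf, "a", "z"); // clear() of the range ["a", "z")
    return db->Write(rocksdb::WriteOptions(), &batch);
}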
@ -1587,6 +1869,46 @@ struct RocksDBKeyValueStore : IKeyValueStore {
return StorageBytes(free, total, live, free);
}
Future<CheckpointMetaData> checkpoint(const CheckpointRequest& request) override {
auto a = new Writer::CheckpointAction(request);
auto res = a->reply.getFuture();
writeThread->post(a);
return res;
}
Future<Void> restore(const std::vector<CheckpointMetaData>& checkpoints) override {
auto a = new Writer::RestoreAction(path, checkpoints);
auto res = a->done.getFuture();
writeThread->post(a);
return res;
}
// Delete a checkpoint.
Future<Void> deleteCheckpoint(const CheckpointMetaData& checkpoint) override {
if (checkpoint.format == RocksDBColumnFamily) {
RocksDBColumnFamilyCheckpoint rocksCF;
ObjectReader reader(checkpoint.serializedCheckpoint.begin(), IncludeVersion());
reader.deserialize(rocksCF);
std::unordered_set<std::string> dirs;
for (const LiveFileMetaData& file : rocksCF.sstFiles) {
dirs.insert(file.db_path);
}
for (const std::string& dir : dirs) {
platform::eraseDirectoryRecursive(dir);
TraceEvent("DeleteCheckpointRemovedDir", id)
.detail("CheckpointID", checkpoint.checkpointID)
.detail("Dir", dir);
}
} else if (checkpoint.format == RocksDB) {
throw not_implemented();
} else {
throw internal_error();
}
return Void();
}
};
} // namespace
@ -1701,6 +2023,61 @@ TEST_CASE("noSim/fdbserver/KeyValueStoreRocksDB/RocksDBReopen") {
return Void();
}
TEST_CASE("noSim/fdbserver/KeyValueStoreRocksDB/CheckpointRestore") {
state std::string cwd = platform::getWorkingDirectory() + "/";
state std::string rocksDBTestDir = "rocksdb-kvstore-br-test-db";
platform::eraseDirectoryRecursive(rocksDBTestDir);
state IKeyValueStore* kvStore = new RocksDBKeyValueStore(rocksDBTestDir, deterministicRandom()->randomUniqueID());
wait(kvStore->init());
kvStore->set({ LiteralStringRef("foo"), LiteralStringRef("bar") });
wait(kvStore->commit(false));
Optional<Value> val = wait(kvStore->readValue(LiteralStringRef("foo")));
ASSERT(Optional<Value>(LiteralStringRef("bar")) == val);
platform::eraseDirectoryRecursive("checkpoint");
state std::string checkpointDir = cwd + "checkpoint";
CheckpointRequest request(
latestVersion, allKeys, RocksDBColumnFamily, deterministicRandom()->randomUniqueID(), checkpointDir);
CheckpointMetaData metaData = wait(kvStore->checkpoint(request));
state std::string rocksDBRestoreDir = "rocksdb-kvstore-br-restore-db";
platform::eraseDirectoryRecursive(rocksDBRestoreDir);
state IKeyValueStore* kvStoreCopy =
new RocksDBKeyValueStore(rocksDBRestoreDir, deterministicRandom()->randomUniqueID());
std::vector<CheckpointMetaData> checkpoints;
checkpoints.push_back(metaData);
wait(kvStoreCopy->restore(checkpoints));
Optional<Value> val = wait(kvStoreCopy->readValue(LiteralStringRef("foo")));
ASSERT(Optional<Value>(LiteralStringRef("bar")) == val);
std::vector<Future<Void>> closes;
closes.push_back(kvStore->onClosed());
closes.push_back(kvStoreCopy->onClosed());
kvStore->close();
kvStoreCopy->close();
wait(waitForAll(closes));
platform::eraseDirectoryRecursive(rocksDBTestDir);
platform::eraseDirectoryRecursive(rocksDBRestoreDir);
return Void();
}
TEST_CASE("noSim/fdbserver/KeyValueStoreRocksDB/RocksDBTypes") {
// If the following assertion fails, update SstFileMetaData and LiveFileMetaData in RocksDBCheckpointUtils.actor.h
// to be the same as rocksdb::SstFileMetaData and rocksdb::LiveFileMetaData.
ASSERT_EQ(sizeof(rocksdb::LiveFileMetaData), 184);
ASSERT_EQ(sizeof(rocksdb::ExportImportFilesMetaData), 32);
return Void();
}
} // namespace
#endif // SSD_ROCKSDB_EXPERIMENTAL

View File

@ -20,6 +20,7 @@
#include <vector>
#include "fdbclient/FDBOptions.g.h"
#include "flow/Util.h"
#include "fdbrpc/FailureMonitor.h"
#include "fdbclient/KeyBackedTypes.h"
@ -65,6 +66,7 @@ ACTOR Future<MoveKeysLock> takeMoveKeysLock(Database cx, UID ddId) {
state MoveKeysLock lock;
state UID txnId;
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
if (!g_network->isSimulated()) {
txnId = deterministicRandom()->randomUniqueID();
tr.debugTransaction(txnId);
@ -99,6 +101,7 @@ ACTOR static Future<Void> checkMoveKeysLock(Transaction* tr,
MoveKeysLock lock,
const DDEnabledState* ddEnabledState,
bool isWrite = true) {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
if (!ddEnabledState->isDDEnabled()) {
TraceEvent(SevDebug, "DDDisabledByInMemoryCheck").log();
throw movekeys_conflict();
@ -605,6 +608,7 @@ ACTOR Future<Void> checkFetchingState(Database cx,
tr.trState->taskID = TaskPriority::MoveKeys;
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
std::vector<Future<Optional<Value>>> serverListEntries;
serverListEntries.reserve(dest.size());
@ -698,6 +702,7 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
tr.trState->taskID = TaskPriority::MoveKeys;
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
releaser.release();
wait(finishMoveKeysParallelismLock->take(TaskPriority::DataDistributionLaunch));
@ -1332,6 +1337,7 @@ ACTOR Future<Void> removeKeysFromFailedServer(Database cx,
try {
tr.trState->taskID = TaskPriority::MoveKeys;
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
wait(checkMoveKeysLock(&tr, lock, ddEnabledState));
TraceEvent("RemoveKeysFromFailedServerLocked")
.detail("ServerID", serverID)

View File

@ -27,6 +27,8 @@
#include "fdbserver/Knobs.h"
#include "flow/actorcompiler.h" // This must be the last #include.
using ConfigFollowerInfo = ModelInterface<ConfigFollowerInterface>;
struct CommittedVersions {
Version secondToLastCommitted;
Version lastCommitted;
@ -42,6 +44,10 @@ class GetCommittedVersionQuorum {
std::vector<ConfigFollowerInterface> cfis;
std::map<Version, std::vector<ConfigFollowerInterface>> replies;
std::map<Version, Version> priorVersions;
std::map<NetworkAddress, Version> committed;
// Need to know the largest compacted version on any node to avoid asking
// for changes that have already been compacted.
Version largestCompactedResponse{ 0 };
// Last durably committed version.
Version lastSeenVersion;
size_t totalRepliesReceived{ 0 };
@ -58,6 +64,7 @@ class GetCommittedVersionQuorum {
ACTOR static Future<Void> updateNode(GetCommittedVersionQuorum* self,
CommittedVersions nodeVersion,
CommittedVersions quorumVersion,
Version lastCompacted,
ConfigFollowerInterface cfi) {
state Version target = quorumVersion.lastCommitted;
if (nodeVersion.lastCommitted == target) {
@ -79,37 +86,41 @@ class GetCommittedVersionQuorum {
rollback = std::max(nodeVersion.lastCommitted - 1, Version{ 0 });
}
if (rollback.present()) {
// When a new ConfigBroadcaster is created, it may not know
// about the last committed version on the ConfigNodes. If
// compaction has occurred, this can cause change requests to
// be sent to nodes asking for version 0 when the node has
// already compacted that version, causing an error. Make sure
// the rollback version is at least set to the last compacted
// version to prevent this issue.
rollback = std::max(rollback.get(), lastCompacted);
}
// Now roll node forward to match the largest committed version of
// the replies.
// TODO: Load balance over quorum. Also need to catch
// error_code_process_behind and retry with the next ConfigNode in
// the quorum.
state ConfigFollowerInterface quorumCfi = self->replies[target][0];
state Reference<ConfigFollowerInfo> quorumCfi(new ConfigFollowerInfo(self->replies[target], false));
try {
state Version lastSeenVersion = rollback.present() ? rollback.get() : nodeVersion.lastCommitted;
ConfigFollowerGetChangesReply reply = wait(timeoutError(
quorumCfi.getChanges.getReply(ConfigFollowerGetChangesRequest{ lastSeenVersion, target }),
SERVER_KNOBS->GET_COMMITTED_VERSION_TIMEOUT));
state Version lastSeenVersion = std::max(
rollback.present() ? rollback.get() : nodeVersion.lastCommitted, self->largestCompactedResponse);
ConfigFollowerGetChangesReply reply =
wait(timeoutError(basicLoadBalance(quorumCfi,
&ConfigFollowerInterface::getChanges,
ConfigFollowerGetChangesRequest{ lastSeenVersion, target }),
SERVER_KNOBS->GET_COMMITTED_VERSION_TIMEOUT));
wait(timeoutError(cfi.rollforward.getReply(ConfigFollowerRollforwardRequest{
rollback, nodeVersion.lastCommitted, target, reply.changes, reply.annotations }),
SERVER_KNOBS->GET_COMMITTED_VERSION_TIMEOUT));
} catch (Error& e) {
if (e.code() == error_code_version_already_compacted) {
TEST(true); // PaxosConfigConsumer rollforward compacted ConfigNode
ConfigFollowerGetSnapshotAndChangesReply reply = wait(retryBrokenPromise(
quorumCfi.getSnapshotAndChanges, ConfigFollowerGetSnapshotAndChangesRequest{ target }));
wait(retryBrokenPromise(
cfi.rollforward,
ConfigFollowerRollforwardRequest{
rollback, nodeVersion.lastCommitted, target, reply.changes, reply.annotations }));
} else if (e.code() == error_code_transaction_too_old) {
if (e.code() == error_code_transaction_too_old) {
// Seeing this trace is not necessarily a problem. There
// are legitimate scenarios where a ConfigNode could return
// one of these errors in response to a get changes or
// rollforward request. The retry loop should handle this
// case.
TraceEvent(SevInfo, "ConfigNodeRollforwardError").error(e);
} else {
throw e;
throw;
}
}
}
@ -123,6 +134,8 @@ class GetCommittedVersionQuorum {
SERVER_KNOBS->GET_COMMITTED_VERSION_TIMEOUT));
++self->totalRepliesReceived;
self->largestCompactedResponse = std::max(self->largestCompactedResponse, reply.lastCompacted);
state Version lastCompacted = reply.lastCompacted;
self->largestCommitted = std::max(self->largestCommitted, reply.lastCommitted);
state CommittedVersions committedVersions = CommittedVersions{ self->lastSeenVersion, reply.lastCommitted };
if (self->priorVersions.find(committedVersions.lastCommitted) == self->priorVersions.end()) {
@ -136,14 +149,15 @@ class GetCommittedVersionQuorum {
if (self->quorumVersion.canBeSet()) {
self->quorumVersion.send(QuorumVersion{ committedVersions, true });
}
wait(self->updateNode(self, committedVersions, self->quorumVersion.getFuture().get().versions, cfi));
wait(self->updateNode(
self, committedVersions, self->quorumVersion.getFuture().get().versions, lastCompacted, cfi));
} else if (self->maxAgreement >= self->cfis.size() / 2 + 1) {
// A quorum of ConfigNodes agree on the latest committed version,
// but the node we just got a reply from is not one of them. We may
// need to roll it forward or back.
QuorumVersion quorumVersion = wait(self->quorumVersion.getFuture());
ASSERT(committedVersions.lastCommitted != quorumVersion.versions.lastCommitted);
wait(self->updateNode(self, committedVersions, quorumVersion.versions, cfi));
wait(self->updateNode(self, committedVersions, quorumVersion.versions, lastCompacted, cfi));
} else if (self->maxAgreement + (self->cfis.size() - self->totalRepliesReceived) <
(self->cfis.size() / 2 + 1)) {
// It is impossible to reach a quorum of ConfigNodes that agree
@ -158,18 +172,25 @@ class GetCommittedVersionQuorum {
self->quorumVersion.send(
QuorumVersion{ CommittedVersions{ largestCommittedPrior, largestCommitted }, false });
}
wait(self->updateNode(self, committedVersions, self->quorumVersion.getFuture().get().versions, cfi));
wait(self->updateNode(
self, committedVersions, self->quorumVersion.getFuture().get().versions, lastCompacted, cfi));
} else {
// Still building up responses; don't have enough data to act on
// yet, so wait until we do.
QuorumVersion quorumVersion = wait(self->quorumVersion.getFuture());
wait(self->updateNode(self, committedVersions, quorumVersion.versions, cfi));
wait(self->updateNode(self, committedVersions, quorumVersion.versions, lastCompacted, cfi));
}
} catch (Error& e) {
// Count a timeout as a reply.
++self->totalRepliesReceived;
if (e.code() != error_code_timed_out) {
throw;
if (e.code() == error_code_version_already_compacted) {
if (self->quorumVersion.canBeSet()) {
self->quorumVersion.sendError(e);
}
} else if (e.code() != error_code_timed_out && e.code() != error_code_broken_promise) {
if (self->quorumVersion.canBeSet()) {
self->quorumVersion.sendError(e);
}
} else if (self->totalRepliesReceived == self->cfis.size() && self->quorumVersion.canBeSet() &&
!self->quorumVersion.isError()) {
size_t nonTimeoutReplies =
@ -178,14 +199,10 @@ class GetCommittedVersionQuorum {
});
if (nonTimeoutReplies >= self->cfis.size() / 2 + 1) {
// Make sure to trigger the quorumVersion if a timeout
// occurred, a quorum disagrees on the committed version,
// and there are no more incoming responses. Note that this
// means that it is impossible to reach a quorum, so send
// back the largest committed version seen.
self->quorumVersion.send(
QuorumVersion{ CommittedVersions{ self->lastSeenVersion, self->largestCommitted }, false });
} else if (!self->quorumVersion.isSet()) {
@ -219,6 +236,16 @@ public:
ASSERT(isReady());
return replies.at(quorumVersion.getFuture().get().versions.lastCommitted);
}
Version getSmallestCommitted() const {
if (committed.size() == cfis.size()) {
Version smallest = MAX_VERSION;
for (const auto& [key, value] : committed) {
smallest = std::min(smallest, value);
}
return smallest;
}
return ::invalidVersion;
}
Future<Void> complete() const { return waitForAll(actors); }
};
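A toy illustration of the quorum arithmetic driving the branches above, with five ConfigNodes (the concrete versions are made up for the example):

#include <cassert>
#include <cstddef>

int main() {
    const size_t n = 5;
    const size_t majority = n / 2 + 1; // 3
    // Replies so far carry versions {10, 10, 7, 9}: two nodes agree on 10.
    size_t maxAgreement = 2;
    size_t repliesReceived = 4;
    // One reply is still outstanding, so agreement on 10 could still reach 3.
    assert(maxAgreement + (n - repliesReceived) >= majority);
    // The last node answers 8, giving {10, 10, 7, 9, 8}. No version can reach
    // three votes, so the consumer sends back the largest committed version
    // seen instead of waiting for an impossible quorum.
    repliesReceived = 5;
    assert(maxAgreement + (n - repliesReceived) < majority);
    return 0;
}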
@ -226,6 +253,7 @@ class PaxosConfigConsumerImpl {
std::vector<ConfigFollowerInterface> cfis;
GetCommittedVersionQuorum getCommittedVersionQuorum;
Version lastSeenVersion{ 0 };
Version compactionVersion{ 0 };
double pollingInterval;
Optional<double> compactionInterval;
UID id;
@ -238,13 +266,15 @@ class PaxosConfigConsumerImpl {
return quorumVersion.versions.lastCommitted;
}
// Periodically compact knob changes on the configuration nodes. All nodes
// must have received a version before it can be compacted.
ACTOR static Future<Void> compactor(PaxosConfigConsumerImpl* self, ConfigBroadcaster* broadcaster) {
if (!self->compactionInterval.present()) {
wait(Never());
return Void();
}
loop {
state Version compactionVersion = self->lastSeenVersion;
state Version compactionVersion = self->compactionVersion;
wait(delayJittered(self->compactionInterval.get()));
std::vector<Future<Void>> compactionRequests;
compactionRequests.reserve(self->cfis.size());
@ -263,12 +293,14 @@ class PaxosConfigConsumerImpl {
loop {
self->resetCommittedVersionQuorum(); // TODO: This seems to fix a segfault, investigate more
try {
// TODO: Load balance
state Version committedVersion = wait(getCommittedVersion(self));
ConfigFollowerGetSnapshotAndChangesReply reply = wait(
timeoutError(self->getCommittedVersionQuorum.getReadReplicas()[0].getSnapshotAndChanges.getReply(
ConfigFollowerGetSnapshotAndChangesRequest{ committedVersion }),
SERVER_KNOBS->GET_SNAPSHOT_AND_CHANGES_TIMEOUT));
state Reference<ConfigFollowerInfo> configNodes(
new ConfigFollowerInfo(self->getCommittedVersionQuorum.getReadReplicas(), false));
ConfigFollowerGetSnapshotAndChangesReply reply =
wait(timeoutError(basicLoadBalance(configNodes,
&ConfigFollowerInterface::getSnapshotAndChanges,
ConfigFollowerGetSnapshotAndChangesRequest{ committedVersion }),
SERVER_KNOBS->GET_SNAPSHOT_AND_CHANGES_TIMEOUT));
TraceEvent(SevDebug, "ConfigConsumerGotSnapshotAndChanges", self->id)
.detail("SnapshotVersion", reply.snapshotVersion)
.detail("SnapshotSize", reply.snapshot.size())
@ -277,6 +309,8 @@ class PaxosConfigConsumerImpl {
.detail("AnnotationsSize", reply.annotations.size());
ASSERT_GE(committedVersion, self->lastSeenVersion);
self->lastSeenVersion = committedVersion;
Version smallestCommitted = self->getCommittedVersionQuorum.getSmallestCommitted();
self->compactionVersion = std::max(self->compactionVersion, smallestCommitted);
broadcaster->applySnapshotAndChanges(std::move(reply.snapshot),
reply.snapshotVersion,
reply.changes,
@ -288,7 +322,8 @@ class PaxosConfigConsumerImpl {
} catch (Error& e) {
if (e.code() == error_code_failed_to_reach_quorum) {
wait(self->getCommittedVersionQuorum.complete());
} else if (e.code() != error_code_timed_out && e.code() != error_code_broken_promise) {
} else if (e.code() != error_code_timed_out && e.code() != error_code_broken_promise &&
e.code() != error_code_version_already_compacted && e.code() != error_code_process_behind) {
throw;
}
wait(delayJittered(0.1));
@ -313,13 +348,14 @@ class PaxosConfigConsumerImpl {
// ConfigNodes changes to 1, 1, 2, the committed version
// returned would be 1.
if (committedVersion > self->lastSeenVersion) {
// TODO: Load balance to avoid always hitting the
// node at index 0 first
ASSERT(self->getCommittedVersionQuorum.getReadReplicas().size() >= self->cfis.size() / 2 + 1);
ConfigFollowerGetChangesReply reply = wait(
timeoutError(self->getCommittedVersionQuorum.getReadReplicas()[0].getChanges.getReply(
state Reference<ConfigFollowerInfo> configNodes(
new ConfigFollowerInfo(self->getCommittedVersionQuorum.getReadReplicas(), false));
ConfigFollowerGetChangesReply reply = wait(timeoutError(
basicLoadBalance(configNodes,
&ConfigFollowerInterface::getChanges,
ConfigFollowerGetChangesRequest{ self->lastSeenVersion, committedVersion }),
SERVER_KNOBS->FETCH_CHANGES_TIMEOUT));
SERVER_KNOBS->FETCH_CHANGES_TIMEOUT));
for (const auto& versionedMutation : reply.changes) {
TraceEvent te(SevDebug, "ConsumerFetchedMutation", self->id);
te.detail("Version", versionedMutation.version)
@ -333,19 +369,20 @@ class PaxosConfigConsumerImpl {
}
}
self->lastSeenVersion = committedVersion;
Version smallestCommitted = self->getCommittedVersionQuorum.getSmallestCommitted();
self->compactionVersion = std::max(self->compactionVersion, smallestCommitted);
broadcaster->applyChanges(reply.changes,
committedVersion,
reply.annotations,
self->getCommittedVersionQuorum.getReadReplicas());
// TODO: Catch error_code_process_behind and retry with
// the next ConfigNode in the quorum.
} else if (committedVersion == self->lastSeenVersion) {
broadcaster->applyChanges({}, -1, {}, self->getCommittedVersionQuorum.getReadReplicas());
}
wait(delayJittered(self->pollingInterval));
} catch (Error& e) {
if (e.code() == error_code_version_already_compacted || e.code() == error_code_timed_out ||
e.code() == error_code_failed_to_reach_quorum) {
e.code() == error_code_failed_to_reach_quorum || e.code() == error_code_version_already_compacted ||
e.code() == error_code_process_behind) {
TEST(true); // PaxosConfigConsumer get version_already_compacted error
if (e.code() == error_code_failed_to_reach_quorum) {
try {
@ -365,7 +402,7 @@ class PaxosConfigConsumerImpl {
self->resetCommittedVersionQuorum();
continue;
} else {
throw e;
throw;
}
}
try {

View File

@ -19,6 +19,7 @@
*/
#include <cinttypes>
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/SystemData.h"
#include "flow/ActorCollection.h"
#include "fdbrpc/simulator.h"
@ -233,6 +234,7 @@ ACTOR Future<std::vector<BlobWorkerInterface>> getBlobWorkers(Database cx, bool
if (use_system_priority) {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
}
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
RangeResult blobWorkersList = wait(tr.getRange(blobWorkerListKeys, CLIENT_KNOBS->TOO_MANY));
@ -256,6 +258,7 @@ ACTOR Future<std::vector<StorageServerInterface>> getStorageServers(Database cx,
if (use_system_priority) {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
}
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
RangeResult serverList = wait(tr.getRange(serverListKeys, CLIENT_KNOBS->TOO_MANY));

View File

@ -0,0 +1,283 @@
/*
* RocksDBCheckpointUtils.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbserver/RocksDBCheckpointUtils.actor.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/StorageCheckpoint.h"
#include "flow/Trace.h"
#include "flow/flow.h"
#include "flow/actorcompiler.h" // has to be last include
namespace {
class RocksDBCheckpointReader : public ICheckpointReader {
public:
RocksDBCheckpointReader(const CheckpointMetaData& checkpoint, UID logID)
: checkpoint_(checkpoint), id_(logID), file_(Reference<IAsyncFile>()), offset_(0) {}
Future<Void> init(StringRef token) override;
Future<RangeResult> nextKeyValues(const int rowLimit, const int byteLimit) override { throw not_implemented(); }
// Returns the next chunk of the serialized checkpoint.
Future<Standalone<StringRef>> nextChunk(const int byteLimit) override;
Future<Void> close() override;
private:
ACTOR static Future<Void> doInit(RocksDBCheckpointReader* self) {
ASSERT(self != nullptr);
try {
state Reference<IAsyncFile> _file = wait(IAsyncFileSystem::filesystem()->open(
self->path_, IAsyncFile::OPEN_READONLY | IAsyncFile::OPEN_UNCACHED | IAsyncFile::OPEN_NO_AIO, 0));
self->file_ = _file;
TraceEvent("RocksDBCheckpointReaderOpenFile").detail("File", self->path_);
} catch (Error& e) {
TraceEvent(SevWarnAlways, "ServerGetCheckpointFileFailure")
.errorUnsuppressed(e)
.detail("File", self->path_);
throw e;
}
return Void();
}
ACTOR static Future<Standalone<StringRef>> getNextChunk(RocksDBCheckpointReader* self, int byteLimit) {
int blockSize = std::min(64 * 1024, byteLimit); // Block size read from disk.
state Standalone<StringRef> buf = makeAlignedString(_PAGE_SIZE, blockSize);
int bytesRead = wait(self->file_->read(mutateString(buf), blockSize, self->offset_));
if (bytesRead == 0) {
throw end_of_stream();
}
self->offset_ += bytesRead;
return buf.substr(0, bytesRead);
}
ACTOR static Future<Void> doClose(RocksDBCheckpointReader* self) {
wait(delay(0, TaskPriority::FetchKeys));
delete self;
return Void();
}
CheckpointMetaData checkpoint_;
UID id_;
Reference<IAsyncFile> file_;
int64_t offset_;
std::string path_;
};
Future<Void> RocksDBCheckpointReader::init(StringRef token) {
ASSERT_EQ(this->checkpoint_.getFormat(), RocksDBColumnFamily);
const std::string name = token.toString();
this->offset_ = 0;
this->path_.clear();
const RocksDBColumnFamilyCheckpoint rocksCF = getRocksCF(this->checkpoint_);
for (const auto& sstFile : rocksCF.sstFiles) {
if (sstFile.name == name) {
this->path_ = sstFile.db_path + sstFile.name;
break;
}
}
if (this->path_.empty()) {
TraceEvent("RocksDBCheckpointReaderInitFileNotFound").detail("File", this->path_);
return checkpoint_not_found();
}
return doInit(this);
}
Future<Standalone<StringRef>> RocksDBCheckpointReader::nextChunk(const int byteLimit) {
return getNextChunk(this, byteLimit);
}
Future<Void> RocksDBCheckpointReader::close() {
return doClose(this);
}
// Fetch a single SST file from a storage server. If the file is fetched successfully, it is recorded via cFun.
ACTOR Future<Void> fetchCheckpointFile(Database cx,
std::shared_ptr<CheckpointMetaData> metaData,
int idx,
std::string dir,
std::function<Future<Void>(const CheckpointMetaData&)> cFun,
int maxRetries = 3) {
state RocksDBColumnFamilyCheckpoint rocksCF;
ObjectReader reader(metaData->serializedCheckpoint.begin(), IncludeVersion());
reader.deserialize(rocksCF);
// Skip fetched file.
if (rocksCF.sstFiles[idx].fetched && rocksCF.sstFiles[idx].db_path == dir) {
return Void();
}
state std::string remoteFile = rocksCF.sstFiles[idx].name;
state std::string localFile = dir + rocksCF.sstFiles[idx].name;
state UID ssID = metaData->ssID;
state Transaction tr(cx);
state StorageServerInterface ssi;
loop {
try {
Optional<Value> ss = wait(tr.get(serverListKeyFor(ssID)));
if (!ss.present()) {
throw checkpoint_not_found();
}
ssi = decodeServerListValue(ss.get());
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
state int attempt = 0;
loop {
try {
++attempt;
TraceEvent("FetchCheckpointFileBegin")
.detail("RemoteFile", remoteFile)
.detail("TargetUID", ssID.toString())
.detail("StorageServer", ssi.id().toString())
.detail("LocalFile", localFile)
.detail("Attempt", attempt);
wait(IAsyncFileSystem::filesystem()->deleteFile(localFile, true));
const int64_t flags = IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE | IAsyncFile::OPEN_READWRITE |
IAsyncFile::OPEN_CREATE | IAsyncFile::OPEN_UNCACHED | IAsyncFile::OPEN_NO_AIO;
state int64_t offset = 0;
state Reference<IAsyncFile> asyncFile = wait(IAsyncFileSystem::filesystem()->open(localFile, flags, 0666));
state ReplyPromiseStream<FetchCheckpointReply> stream =
ssi.fetchCheckpoint.getReplyStream(FetchCheckpointRequest(metaData->checkpointID, remoteFile));
TraceEvent("FetchCheckpointFileReceivingData")
.detail("RemoteFile", remoteFile)
.detail("TargetUID", ssID.toString())
.detail("StorageServer", ssi.id().toString())
.detail("LocalFile", localFile)
.detail("Attempt", attempt);
loop {
state FetchCheckpointReply rep = waitNext(stream.getFuture());
wait(asyncFile->write(rep.data.begin(), rep.data.size(), offset));
wait(asyncFile->flush());
offset += rep.data.size();
}
} catch (Error& e) {
if (e.code() != error_code_end_of_stream) {
TraceEvent("FetchCheckpointFileError")
.errorUnsuppressed(e)
.detail("RemoteFile", remoteFile)
.detail("StorageServer", ssi.toString())
.detail("LocalFile", localFile)
.detail("Attempt", attempt);
if (attempt >= maxRetries) {
throw e;
}
} else {
wait(asyncFile->sync());
int64_t fileSize = wait(asyncFile->size());
TraceEvent("FetchCheckpointFileEnd")
.detail("RemoteFile", remoteFile)
.detail("StorageServer", ssi.toString())
.detail("LocalFile", localFile)
.detail("Attempt", attempt)
.detail("DataSize", offset)
.detail("FileSize", fileSize);
rocksCF.sstFiles[idx].db_path = dir;
rocksCF.sstFiles[idx].fetched = true;
metaData->serializedCheckpoint = ObjectWriter::toValue(rocksCF, IncludeVersion());
if (cFun) {
wait(cFun(*metaData));
}
return Void();
}
}
}
}
} // namespace
ACTOR Future<CheckpointMetaData> fetchRocksDBCheckpoint(Database cx,
CheckpointMetaData initialState,
std::string dir,
std::function<Future<Void>(const CheckpointMetaData&)> cFun) {
TraceEvent("FetchRocksCheckpointBegin")
.detail("InitialState", initialState.toString())
.detail("CheckpointDir", dir);
state std::shared_ptr<CheckpointMetaData> metaData = std::make_shared<CheckpointMetaData>(initialState);
if (metaData->format == RocksDBColumnFamily) {
state RocksDBColumnFamilyCheckpoint rocksCF = getRocksCF(initialState);
TraceEvent("RocksDBCheckpointMetaData").detail("RocksCF", rocksCF.toString());
state int i = 0;
state std::vector<Future<Void>> fs;
for (; i < rocksCF.sstFiles.size(); ++i) {
fs.push_back(fetchCheckpointFile(cx, metaData, i, dir, cFun));
TraceEvent("GetCheckpointFetchingFile")
.detail("FileName", rocksCF.sstFiles[i].name)
.detail("Server", metaData->ssID.toString());
}
wait(waitForAll(fs));
} else {
throw not_implemented();
}
return *metaData;
}
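A hypothetical call-site sketch for this actor (the directory and the persistence callback are illustrative):

// Inside some ACTOR that has a Database cx and a CheckpointMetaData initialState:
CheckpointMetaData fetched = wait(fetchRocksDBCheckpoint(
    cx, initialState, "/data/checkpoint/", [](const CheckpointMetaData& progress) {
        // e.g. persist `progress` durably so that an interrupted fetch can resume
        return Future<Void>(Void());
    }));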
ACTOR Future<Void> deleteRocksCFCheckpoint(CheckpointMetaData checkpoint) {
ASSERT_EQ(checkpoint.getFormat(), RocksDBColumnFamily);
RocksDBColumnFamilyCheckpoint rocksCF = getRocksCF(checkpoint);
TraceEvent("DeleteRocksColumnFamilyCheckpoint", checkpoint.checkpointID)
.detail("CheckpointID", checkpoint.checkpointID)
.detail("RocksCF", rocksCF.toString());
state std::unordered_set<std::string> dirs;
for (const LiveFileMetaData& file : rocksCF.sstFiles) {
dirs.insert(file.db_path);
}
state std::unordered_set<std::string>::iterator it = dirs.begin();
for (; it != dirs.end(); ++it) {
const std::string dir = *it;
platform::eraseDirectoryRecursive(dir);
TraceEvent("DeleteCheckpointRemovedDir", checkpoint.checkpointID)
.detail("CheckpointID", checkpoint.checkpointID)
.detail("Dir", dir);
wait(delay(0, TaskPriority::FetchKeys));
}
return Void();
}
ICheckpointReader* newRocksDBCheckpointReader(const CheckpointMetaData& checkpoint, UID logID) {
return new RocksDBCheckpointReader(checkpoint, logID);
}
RocksDBColumnFamilyCheckpoint getRocksCF(const CheckpointMetaData& checkpoint) {
RocksDBColumnFamilyCheckpoint rocksCF;
ObjectReader reader(checkpoint.serializedCheckpoint.begin(), IncludeVersion());
reader.deserialize(rocksCF);
return rocksCF;
}

View File

@ -0,0 +1,209 @@
/*
* RocksDBCheckpointUtils.actor.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_ROCKSDB_CHECKPOINT_UTILS_ACTOR_G_H)
#define FDBSERVER_ROCKSDB_CHECKPOINT_UTILS_ACTOR_G_H
#include "fdbserver/RocksDBCheckpointUtils.actor.g.h"
#elif !defined(FDBSERVER_ROCKSDB_CHECKPOINT_UTILS_ACTOR_H)
#define FDBSERVER_ROCKSDB_CHECKPOINT_UTILS_ACTOR_H
#include "fdbclient/NativeAPI.actor.h"
#include "fdbserver/ServerCheckpoint.actor.h"
#include "flow/flow.h"
#include "flow/actorcompiler.h" // has to be last include
// Copied from rocksdb/metadata.h, so that we can add a serializer.
struct SstFileMetaData {
constexpr static FileIdentifier file_identifier = 3804347;
SstFileMetaData()
: size(0), file_number(0), smallest_seqno(0), largest_seqno(0), num_reads_sampled(0), being_compacted(false),
num_entries(0), num_deletions(0), temperature(0), oldest_blob_file_number(0), oldest_ancester_time(0),
file_creation_time(0) {}
SstFileMetaData(const std::string& _file_name,
uint64_t _file_number,
const std::string& _path,
size_t _size,
uint64_t _smallest_seqno,
uint64_t _largest_seqno,
const std::string& _smallestkey,
const std::string& _largestkey,
uint64_t _num_reads_sampled,
bool _being_compacted,
int _temperature,
uint64_t _oldest_blob_file_number,
uint64_t _oldest_ancester_time,
uint64_t _file_creation_time,
std::string& _file_checksum,
std::string& _file_checksum_func_name)
: size(_size), name(_file_name), file_number(_file_number), db_path(_path), smallest_seqno(_smallest_seqno),
largest_seqno(_largest_seqno), smallestkey(_smallestkey), largestkey(_largestkey),
num_reads_sampled(_num_reads_sampled), being_compacted(_being_compacted), num_entries(0), num_deletions(0),
temperature(_temperature), oldest_blob_file_number(_oldest_blob_file_number),
oldest_ancester_time(_oldest_ancester_time), file_creation_time(_file_creation_time),
file_checksum(_file_checksum), file_checksum_func_name(_file_checksum_func_name) {}
// File size in bytes.
size_t size;
// The name of the file.
std::string name;
// The id of the file.
uint64_t file_number;
// The full path where the file locates.
std::string db_path;
uint64_t smallest_seqno; // Smallest sequence number in file.
uint64_t largest_seqno; // Largest sequence number in file.
std::string smallestkey; // Smallest user defined key in the file.
std::string largestkey; // Largest user defined key in the file.
uint64_t num_reads_sampled; // How many times the file is read.
bool being_compacted; // true if the file is currently being compacted.
uint64_t num_entries;
uint64_t num_deletions;
// This feature is experimental and subject to change.
int temperature;
uint64_t oldest_blob_file_number; // The id of the oldest blob file
// referenced by the file.
// An SST file may be generated by compactions whose input files may
// in turn be generated by earlier compactions. The creation time of the
// oldest SST file that is the compaction ancestor of this file.
// The timestamp is provided by SystemClock::GetCurrentTime().
// 0 if the information is not available.
//
// Note: for TTL blob files, it contains the start of the expiration range.
uint64_t oldest_ancester_time;
// Timestamp when the SST file is created, provided by
// SystemClock::GetCurrentTime(). 0 if the information is not available.
uint64_t file_creation_time;
// The checksum of a SST file, the value is decided by the file content and
// the checksum algorithm used for this SST file. The checksum function is
// identified by the file_checksum_func_name. If the checksum function is
// not specified, file_checksum is "0" by default.
std::string file_checksum;
// The name of the checksum function used to generate the file checksum
// value. If file checksum is not enabled (e.g., sst_file_checksum_func is
// null), file_checksum_func_name is UnknownFileChecksumFuncName, which is
// "Unknown".
std::string file_checksum_func_name;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar,
size,
name,
file_number,
db_path,
smallest_seqno,
largest_seqno,
smallestkey,
largestkey,
num_reads_sampled,
being_compacted,
num_entries,
num_deletions,
temperature,
oldest_blob_file_number,
oldest_ancester_time,
file_creation_time,
file_checksum,
file_checksum_func_name);
}
};
// Copied from rocksdb::LiveFileMetaData.
struct LiveFileMetaData : public SstFileMetaData {
constexpr static FileIdentifier file_identifier = 3804346;
std::string column_family_name; // Name of the column family
int level; // Level at which this file resides.
bool fetched;
LiveFileMetaData() : column_family_name(), level(0), fetched(false) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar,
SstFileMetaData::size,
SstFileMetaData::name,
SstFileMetaData::file_number,
SstFileMetaData::db_path,
SstFileMetaData::smallest_seqno,
SstFileMetaData::largest_seqno,
SstFileMetaData::smallestkey,
SstFileMetaData::largestkey,
SstFileMetaData::num_reads_sampled,
SstFileMetaData::being_compacted,
SstFileMetaData::num_entries,
SstFileMetaData::num_deletions,
SstFileMetaData::temperature,
SstFileMetaData::oldest_blob_file_number,
SstFileMetaData::oldest_ancester_time,
SstFileMetaData::file_creation_time,
SstFileMetaData::file_checksum,
SstFileMetaData::file_checksum_func_name,
column_family_name,
level,
fetched);
}
};
// Checkpoint metadata associated with the RocksDBColumnFamily format.
// Based on rocksdb::ExportImportFilesMetaData.
struct RocksDBColumnFamilyCheckpoint {
constexpr static FileIdentifier file_identifier = 13804346;
std::string dbComparatorName;
std::vector<LiveFileMetaData> sstFiles;
CheckpointFormat format() const { return RocksDBColumnFamily; }
std::string toString() const {
std::string res = "RocksDBColumnFamilyCheckpoint:\nSST Files:\n";
for (const auto& file : sstFiles) {
res += file.db_path + file.name + "\n";
}
return res;
}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, dbComparatorName, sstFiles);
}
};
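A quick round-trip sketch using the flow serializer calls that appear elsewhere in this patch (the comparator name is an illustrative value):

RocksDBColumnFamilyCheckpoint original;
original.dbComparatorName = "leveldb.BytewiseComparator";
Value serialized = ObjectWriter::toValue(original, IncludeVersion());
RocksDBColumnFamilyCheckpoint decoded;
ObjectReader reader(serialized.begin(), IncludeVersion());
reader.deserialize(decoded);
ASSERT(decoded.dbComparatorName == original.dbComparatorName);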
// Fetch the checkpoint file(s) to a local dir; the checkpoint is specified by initialState.
// If cFun is provided, the fetch progress is checkpointed, so that the fetch process
// can be resumed after a crash.
ACTOR Future<CheckpointMetaData> fetchRocksDBCheckpoint(Database cx,
CheckpointMetaData initialState,
std::string dir,
std::function<Future<Void>(const CheckpointMetaData&)> cFun);
ACTOR Future<Void> deleteRocksCFCheckpoint(CheckpointMetaData checkpoint);
ICheckpointReader* newRocksDBCheckpointReader(const CheckpointMetaData& checkpoint, UID logID);
RocksDBColumnFamilyCheckpoint getRocksCF(const CheckpointMetaData& checkpoint);
#endif

View File

@ -0,0 +1,67 @@
/*
* ServerCheckpoint.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbserver/ServerCheckpoint.actor.h"
#include "fdbserver/RocksDBCheckpointUtils.actor.h"
#include "flow/actorcompiler.h" // has to be last include
ICheckpointReader* newCheckpointReader(const CheckpointMetaData& checkpoint, UID logID) {
if (checkpoint.getFormat() == RocksDBColumnFamily) {
return newRocksDBCheckpointReader(checkpoint, logID);
} else if (checkpoint.getFormat() == RocksDB) {
throw not_implemented();
} else {
ASSERT(false);
}
return nullptr;
}
ACTOR Future<Void> deleteCheckpoint(CheckpointMetaData checkpoint) {
wait(delay(0, TaskPriority::FetchKeys));
if (checkpoint.getFormat() == RocksDBColumnFamily) {
wait(deleteRocksCFCheckpoint(checkpoint));
} else if (checkpoint.getFormat() == RocksDB) {
throw not_implemented();
} else {
ASSERT(false);
}
return Void();
}
ACTOR Future<CheckpointMetaData> fetchCheckpoint(Database cx,
CheckpointMetaData initialState,
std::string dir,
std::function<Future<Void>(const CheckpointMetaData&)> cFun) {
state CheckpointMetaData result;
if (initialState.getFormat() == RocksDBColumnFamily) {
CheckpointMetaData _result = wait(fetchRocksDBCheckpoint(cx, initialState, dir, cFun));
result = _result;
} else if (initialState.getFormat() == RocksDB) {
throw not_implemented();
} else {
ASSERT(false);
}
return result;
}

View File

@ -0,0 +1,66 @@
/*
* ServerCheckpoint.actor.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_SERVER_CHECKPOINT_ACTOR_G_H)
#define FDBSERVER_SERVER_CHECKPOINT_ACTOR_G_H
#include "fdbserver/ServerCheckpoint.actor.g.h"
#elif !defined(FDBSERVER_SERVER_CHECKPOINT_ACTOR_H)
#define FDBSERVER_SERVER_CHECKPOINT_ACTOR_H
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/StorageCheckpoint.h"
#include "flow/flow.h"
#include "flow/actorcompiler.h" // has to be last include
// An ICheckpointReader can read the contents of a checkpoint created from a KV store,
// i.e., by IKeyValueStore::checkpoint().
class ICheckpointReader {
public:
// `token` is a serialized object defined by each derived ICheckpointReader class, to specify the
// starting point for the underlying checkpoint.
virtual Future<Void> init(StringRef token) = 0;
// Scans the checkpoint, and returns the key-value pairs.
virtual Future<RangeResult> nextKeyValues(const int rowLimit, const int byteLimit) = 0;
// Returns the next chunk of the serialized checkpoint.
virtual Future<Standalone<StringRef>> nextChunk(const int byteLimit) = 0;
virtual Future<Void> close() = 0;
protected:
virtual ~ICheckpointReader() {}
};
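A hedged usage sketch of this contract (drainCheckpoint is a hypothetical actor, not part of this patch):

ACTOR Future<Void> drainCheckpoint(ICheckpointReader* reader, Standalone<StringRef> token) {
    wait(reader->init(token));
    try {
        loop {
            Standalone<StringRef> chunk = wait(reader->nextChunk(64 * 1024));
            // ... append `chunk` to the destination, as fetchCheckpointFile does ...
        }
    } catch (Error& e) {
        if (e.code() != error_code_end_of_stream) {
            throw;
        }
    }
    wait(reader->close());
    return Void();
}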
ICheckpointReader* newCheckpointReader(const CheckpointMetaData& checkpoint, UID logID);
// Delete a checkpoint.
ACTOR Future<Void> deleteCheckpoint(CheckpointMetaData checkpoint);
// Fetches a checkpoint to a local `dir`; `initialState` provides the checkpoint format, location, restart point, etc.
// If cFun is provided, the progress can be checkpointed.
// Returns a CheckpointMetaData, which could contain KVS-specific results, e.g., the list of fetched checkpoint files.
ACTOR Future<CheckpointMetaData> fetchCheckpoint(Database cx,
CheckpointMetaData initialState,
std::string dir,
std::function<Future<Void>(const CheckpointMetaData&)> cFun = nullptr);
#endif

View File

@ -1179,6 +1179,8 @@ ACTOR Future<Void> restartSimulatedSystem(std::vector<Future<Void>>* systemActor
json_spirit::write_string(json_spirit::mValue(regionArr), json_spirit::Output_options::none);
}
g_simulator.restarted = true;
TraceEvent("RestartSimulatorSettings")
.detail("DesiredCoordinators", g_simulator.desiredCoordinators)
.detail("ProcessesPerMachine", g_simulator.processesPerMachine)

View File

@ -154,6 +154,10 @@ bool TCServerInfo::hasHealthyAvailableSpace(double minAvailableSpaceRatio) const
return availableSpaceRatio >= minAvailableSpaceRatio;
}
bool TCServerInfo::isWigglePausedServer() const {
return collection && collection->isWigglePausedServer(id);
}
Future<Void> TCServerInfo::updateServerMetrics() {
return TCServerInfoImpl::updateServerMetrics(this);
}
@ -431,6 +435,14 @@ bool TCTeamInfo::hasServer(const UID& server) const {
return std::find(serverIDs.begin(), serverIDs.end(), server) != serverIDs.end();
}
bool TCTeamInfo::hasWigglePausedServer() const {
for (const auto& server : servers) {
if (server->isWigglePausedServer())
return true;
}
return false;
}
void TCTeamInfo::addServers(const std::vector<UID>& servers) {
serverIDs.reserve(servers.size());
for (int i = 0; i < servers.size(); i++) {

View File

@ -97,6 +97,7 @@ public:
// If a storage server does not report its storeType, it will be tracked by the failure monitor and removed.
return (storeType == configStoreType || storeType == KeyValueStoreType::END);
}
bool isWigglePausedServer() const;
std::pair<int64_t, int64_t> spaceBytes(bool includeInFlight = true) const;
int64_t loadBytes() const;
@ -214,6 +215,7 @@ public:
void delref() override { ReferenceCounted<TCTeamInfo>::delref(); }
bool hasServer(const UID& server) const;
bool hasWigglePausedServer() const;
void addServers(const std::vector<UID>& servers) override;

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff