Merge remote-tracking branch 'apple/main' into vgasiunas-upgrade-test

Vaidas Gasiunas 2022-04-06 12:01:52 +02:00
commit ef0ec3d893
230 changed files with 18186 additions and 4994 deletions


@@ -35,7 +35,7 @@ The official docker image for building is [`foundationdb/build`](https://hub.doc
To build outside the official docker image you'll need at least these dependencies:
1. Install [CMake](https://cmake.org/) version 3.13 or higher
1. Install [Mono](http://www.mono-project.com/download/stable/)
1. Install [Mono](https://www.mono-project.com/download/stable/)
1. Install [Ninja](https://ninja-build.org/) (optional, but recommended)
If compiling for local development, please set `-DUSE_WERROR=ON` in
@@ -177,7 +177,7 @@ Under Windows, only Visual Studio with ClangCl is supported
1. Install [Python](https://www.python.org/downloads/) if it is not already installed by Visual Studio
1. (Optional) Install [OpenJDK 11](https://developers.redhat.com/products/openjdk/download) to build Java bindings
1. (Optional) Install [OpenSSL 3.x](https://slproweb.com/products/Win32OpenSSL.html) to build with TLS support
1. (Optional) Install [WIX Toolset](http://wixtoolset.org/) to build Windows installer
1. (Optional) Install [WIX Toolset](https://wixtoolset.org/) to build Windows installer
1. `mkdir build && cd build`
1. `cmake -G "Visual Studio 16 2019" -A x64 -T ClangCl <PATH_TO_FOUNDATIONDB_SOURCE>`
1. `msbuild /p:Configuration=Release foundationdb.sln`


@@ -202,6 +202,7 @@ class TestRunner(object):
self.args.types = list(reduce(lambda t1, t2: filter(t1.__contains__, t2), map(lambda tester: tester.types, self.testers)))
self.args.no_directory_snapshot_ops = self.args.no_directory_snapshot_ops or any([not tester.directory_snapshot_ops_enabled for tester in self.testers])
self.args.no_tenants = self.args.no_tenants or any([not tester.tenants_enabled for tester in self.testers]) or self.args.api_version < 710
def print_test(self):
test_instructions = self._generate_test()
@@ -282,6 +283,17 @@ class TestRunner(object):
def _insert_instructions(self, test_instructions):
util.get_logger().info('\nInserting test into database...')
del self.db[:]
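# Clear any tenants left over from a previous run through the special key
# space, using the standard commit / on_error retry loop.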
while True:
tr = self.db.create_transaction()
try:
tr.options.set_special_key_space_enable_writes()
del tr[b'\xff\xff/management/tenant_map/' : b'\xff\xff/management/tenant_map0']
tr.commit().wait()
break
except fdb.FDBError as e:
tr.on_error(e).wait()
for subspace, thread in test_instructions.items():
thread.insert_operations(self.db, subspace)
@@ -445,6 +457,8 @@ def parse_args(argv):
parser.add_argument('--no-directory-snapshot-ops', action='store_true', help='Disables snapshot operations for directory instructions.')
parser.add_argument('--no-tenants', action='store_true', help='Disables tenant operations.')
return parser.parse_args(argv)


@@ -26,7 +26,7 @@ ALL_TYPES = COMMON_TYPES + ['versionstamp']
class Tester:
def __init__(self, name, cmd, max_int_bits=64, min_api_version=0, max_api_version=MAX_API_VERSION, threads_enabled=True, types=COMMON_TYPES, directory_snapshot_ops_enabled=True):
def __init__(self, name, cmd, max_int_bits=64, min_api_version=0, max_api_version=MAX_API_VERSION, threads_enabled=True, types=COMMON_TYPES, directory_snapshot_ops_enabled=True, tenants_enabled=False):
self.name = name
self.cmd = cmd
self.max_int_bits = max_int_bits
@@ -35,6 +35,7 @@ class Tester:
self.threads_enabled = threads_enabled
self.types = types
self.directory_snapshot_ops_enabled = directory_snapshot_ops_enabled
self.tenants_enabled = tenants_enabled
def supports_api_version(self, api_version):
return api_version >= self.min_api_version and api_version <= self.max_api_version
@@ -57,11 +58,11 @@ _java_cmd = 'java -ea -cp %s:%s com.apple.foundationdb.test.' % (
# We could set min_api_version lower on some of these if the testers were updated to support them
testers = {
'python': Tester('python', 'python ' + _absolute_path('python/tests/tester.py'), 2040, 23, MAX_API_VERSION, types=ALL_TYPES),
'python3': Tester('python3', 'python3 ' + _absolute_path('python/tests/tester.py'), 2040, 23, MAX_API_VERSION, types=ALL_TYPES),
'python': Tester('python', 'python ' + _absolute_path('python/tests/tester.py'), 2040, 23, MAX_API_VERSION, types=ALL_TYPES, tenants_enabled=True),
'python3': Tester('python3', 'python3 ' + _absolute_path('python/tests/tester.py'), 2040, 23, MAX_API_VERSION, types=ALL_TYPES, tenants_enabled=True),
'ruby': Tester('ruby', _absolute_path('ruby/tests/tester.rb'), 2040, 23, MAX_API_VERSION),
'java': Tester('java', _java_cmd + 'StackTester', 2040, 510, MAX_API_VERSION, types=ALL_TYPES),
'java_async': Tester('java', _java_cmd + 'AsyncStackTester', 2040, 510, MAX_API_VERSION, types=ALL_TYPES),
'java': Tester('java', _java_cmd + 'StackTester', 2040, 510, MAX_API_VERSION, types=ALL_TYPES, tenants_enabled=True),
'java_async': Tester('java', _java_cmd + 'AsyncStackTester', 2040, 510, MAX_API_VERSION, types=ALL_TYPES, tenants_enabled=True),
'go': Tester('go', _absolute_path('go/build/bin/_stacktester'), 2040, 200, MAX_API_VERSION, types=ALL_TYPES),
'flow': Tester('flow', _absolute_path('flow/bin/fdb_flow_tester'), 63, 500, MAX_API_VERSION, directory_snapshot_ops_enabled=False),
}


@@ -0,0 +1,77 @@
Overview
--------
Tenant testing is an optional extension to the core binding tester that enables
testing of the tenant API. This testing is enabled by adding some additional
instructions and modifying the behavior of some existing instructions.
Additional State and Initialization
-----------------------------------
Your tester should store an additional piece of state tracking the active tenant
that is to be used to create transactions. This tenant must support an unset
state, in which case transactions will be created directly on the database.
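For example, in Python this state and the way it feeds into transaction creation
might look like the following sketch (the `open_tenant` and tenant-level
`create_transaction` calls are assumed binding APIs, not part of this spec):

```python
class TenantState:
    def __init__(self, db):
        self.db = db
        self.active_tenant = None  # unset: transactions go directly to the database

    def set_active(self, tenant_name):
        # TENANT_SET_ACTIVE (below): open, but do not create, the named tenant
        self.active_tenant = self.db.open_tenant(tenant_name)  # assumed API

    def clear_active(self):
        # TENANT_CLEAR_ACTIVE (below)
        self.active_tenant = None

    def new_transaction(self):
        # NEW_TRANSACTION (below): route through the active tenant when set
        if self.active_tenant is not None:
            return self.active_tenant.create_transaction()
        return self.db.create_transaction()
```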
New Instructions
----------------
The tenant API introduces some new operations:
#### TENANT_CREATE
Pops the top item off of the stack as TENANT_NAME. Creates a new tenant
in the database with the name TENANT_NAME. May optionally push a future
onto the stack.
#### TENANT_DELETE
Pops the top item off of the stack as TENANT_NAME. Deletes the tenant with
the name TENANT_NAME from the database. May optionally push a future onto
the stack.
#### TENANT_SET_ACTIVE
Pops the top item off of the stack as TENANT_NAME. Opens the tenant with
name TENANT_NAME and stores it as the active tenant.
#### TENANT_CLEAR_ACTIVE
Unsets the active tenant.
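TENANT_CREATE and TENANT_DELETE can be implemented as writes to the
`\xff\xff/management/tenant_map/` special key space, the same protocol the test
runner uses to clear tenants. A hedged Python sketch, with retry handling left
to whoever consumes the returned commit future:

```python
TENANT_MAP_PREFIX = b'\xff\xff/management/tenant_map/'

def tenant_create(db, tenant_name):
    tr = db.create_transaction()
    tr.options.set_special_key_space_enable_writes()
    tr[TENANT_MAP_PREFIX + tenant_name] = b''
    return tr.commit()  # the future a tester may push onto the stack

def tenant_delete(db, tenant_name):
    tr = db.create_transaction()
    tr.options.set_special_key_space_enable_writes()
    del tr[TENANT_MAP_PREFIX + tenant_name]
    return tr.commit()
```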
Updates to Existing Instructions
--------------------------------
Some existing operations in the binding tester will have slightly modified
behavior when tenants are enabled.
#### NEW_TRANSACTION
When creating a new transaction, the active tenant should be used. If no active
tenant is set, then the transaction should be created as normal using the
database.
#### _TENANT suffix
Similar to the _DATABASE suffix, an operation with the _TENANT suffix indicates
that the operation should be performed on the current active tenant object. If
there is no active tenant, then the operation should be performed on the database
as if _DATABASE was specified. In any case where the operation suffixed with
_DATABASE is allowed to push a future onto the stack, the same operation suffixed
with _TENANT is also allowed to push a future onto the stack.
If your binding does not support operations directly on a tenant object, you should
simulate them using an anonymous transaction, as sketched after the list below.
Remember that set and clear operations must immediately commit (with appropriate
retry behavior!).
Operations that can include the _TENANT suffix are:
GET_TENANT
GET_KEY_TENANT
GET_RANGE_TENANT
GET_RANGE_STARTS_WITH_TENANT
GET_RANGE_SELECTOR_TENANT
SET_TENANT
CLEAR_TENANT
CLEAR_RANGE_TENANT
CLEAR_RANGE_STARTS_WITH_TENANT
ATOMIC_OP_TENANT
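For bindings without tenant-level operations, the anonymous-transaction
simulation described above could look like this sketch, where `apply_op` stands
in for the actual mutation:

```python
import fdb

def perform_tenant_op(db, active_tenant, apply_op):
    # Fall back to the database when no tenant is active, as with _DATABASE.
    target = active_tenant if active_tenant is not None else db
    while True:
        tr = target.create_transaction()  # anonymous transaction
        try:
            apply_op(tr)            # e.g. SET_TENANT: tr[key] = value
            tr.commit().wait()      # set/clear ops must commit immediately
            return
        except fdb.FDBError as e:
            tr.on_error(e).wait()   # standard retry idiom
```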


@@ -58,6 +58,7 @@ class ApiTest(Test):
self.outstanding_ops = []
self.random = test_util.RandomGenerator(args.max_int_bits, args.api_version, args.types)
self.api_version = args.api_version
self.allocated_tenants = set()
def add_stack_items(self, num):
self.stack_size += num
@@ -137,6 +138,12 @@ class ApiTest(Test):
test_util.to_front(instructions, self.stack_size - read[0])
instructions.append('WAIT_FUTURE')
def choose_tenant(self, new_tenant_probability):
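# With probability new_tenant_probability (or when no tenants exist yet),
# pick a fresh random name; otherwise reuse an allocated tenant.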
if len(self.allocated_tenants) == 0 or random.random() < new_tenant_probability:
return self.random.random_string(random.randint(0, 30))
else:
return random.choice(list(self.allocated_tenants))
def generate(self, args, thread_number):
instructions = InstructionSet()
@@ -158,6 +165,7 @@
write_conflicts = ['WRITE_CONFLICT_RANGE', 'WRITE_CONFLICT_KEY', 'DISABLE_WRITE_CONFLICT']
txn_sizes = ['GET_APPROXIMATE_SIZE']
storage_metrics = ['GET_ESTIMATED_RANGE_SIZE', 'GET_RANGE_SPLIT_POINTS']
tenants = ['TENANT_CREATE', 'TENANT_DELETE', 'TENANT_SET_ACTIVE', 'TENANT_CLEAR_ACTIVE']
op_choices += reads
op_choices += mutations
@@ -173,6 +181,9 @@
op_choices += txn_sizes
op_choices += storage_metrics
if not args.no_tenants:
op_choices += tenants
idempotent_atomic_ops = ['BIT_AND', 'BIT_OR', 'MAX', 'MIN', 'BYTE_MIN', 'BYTE_MAX']
atomic_ops = idempotent_atomic_ops + ['ADD', 'BIT_XOR', 'APPEND_IF_FITS']
@@ -195,7 +206,7 @@
# print 'Adding instruction %s at %d' % (op, index)
if args.concurrency == 1 and (op in database_mutations):
if args.concurrency == 1 and (op in database_mutations or op in ['TENANT_CREATE', 'TENANT_DELETE']):
self.wait_for_reads(instructions)
test_util.blocking_commit(instructions)
self.can_get_commit_version = False
@@ -570,18 +581,39 @@ class ApiTest(Test):
instructions.push_args(key1, key2, chunkSize)
instructions.append(op)
self.add_strings(1)
elif op == 'TENANT_CREATE':
tenant_name = self.choose_tenant(0.8)
self.allocated_tenants.add(tenant_name)
instructions.push_args(tenant_name)
instructions.append(op)
self.add_strings(1)
elif op == 'TENANT_DELETE':
tenant_name = self.choose_tenant(0.2)
if tenant_name in self.allocated_tenants:
self.allocated_tenants.remove(tenant_name)
instructions.push_args(tenant_name)
instructions.append(op)
self.add_strings(1)
elif op == 'TENANT_SET_ACTIVE':
tenant_name = self.choose_tenant(0.8)
instructions.push_args(tenant_name)
instructions.append(op)
elif op == 'TENANT_CLEAR_ACTIVE':
instructions.append(op)
else:
assert False, 'Unknown operation: ' + op
if read_performed and op not in database_reads:
self.outstanding_ops.append((self.stack_size, len(instructions) - 1))
if args.concurrency == 1 and (op in database_reads or op in database_mutations):
if args.concurrency == 1 and (op in database_reads or op in database_mutations or op in ['TENANT_CREATE', 'TENANT_DELETE']):
instructions.append('WAIT_FUTURE')
instructions.begin_finalization()
if not args.no_tenants:
instructions.append('TENANT_CLEAR_ACTIVE')
if args.concurrency == 1:
self.wait_for_reads(instructions)
test_util.blocking_commit(instructions)


@@ -124,6 +124,7 @@ if(NOT WIN32)
add_library(fdb_c_performance_test OBJECT test/performance_test.c test/test.h)
add_library(fdb_c_ryw_benchmark OBJECT test/ryw_benchmark.c test/test.h)
add_library(fdb_c_txn_size_test OBJECT test/txn_size_test.c test/test.h)
add_library(fdb_c_client_memory_test OBJECT test/client_memory_test.cpp test/unit/fdb_api.cpp test/unit/fdb_api.hpp)
add_library(mako OBJECT ${MAKO_SRCS})
add_library(fdb_c_setup_tests OBJECT test/unit/setup_tests.cpp)
add_library(fdb_c_unit_tests OBJECT ${UNIT_TEST_SRCS})
@@ -135,6 +136,7 @@ if(NOT WIN32)
add_executable(fdb_c_performance_test test/performance_test.c test/test.h)
add_executable(fdb_c_ryw_benchmark test/ryw_benchmark.c test/test.h)
add_executable(fdb_c_txn_size_test test/txn_size_test.c test/test.h)
add_executable(fdb_c_client_memory_test test/client_memory_test.cpp test/unit/fdb_api.cpp test/unit/fdb_api.hpp)
add_executable(mako ${MAKO_SRCS})
add_executable(fdb_c_setup_tests test/unit/setup_tests.cpp)
add_executable(fdb_c_unit_tests ${UNIT_TEST_SRCS})
@@ -145,10 +147,12 @@ if(NOT WIN32)
strip_debug_symbols(fdb_c_performance_test)
strip_debug_symbols(fdb_c_ryw_benchmark)
strip_debug_symbols(fdb_c_txn_size_test)
strip_debug_symbols(fdb_c_client_memory_test)
endif()
target_link_libraries(fdb_c_performance_test PRIVATE fdb_c Threads::Threads)
target_link_libraries(fdb_c_ryw_benchmark PRIVATE fdb_c Threads::Threads)
target_link_libraries(fdb_c_txn_size_test PRIVATE fdb_c Threads::Threads)
target_link_libraries(fdb_c_client_memory_test PRIVATE fdb_c Threads::Threads)
add_dependencies(fdb_c_setup_tests doctest)
add_dependencies(fdb_c_unit_tests doctest)


@@ -835,9 +835,10 @@ extern "C" DLLEXPORT FDBResult* fdb_transaction_read_blob_granules(FDBTransactio
context.get_load_f = granule_context.get_load_f;
context.free_load_f = granule_context.free_load_f;
context.debugNoMaterialize = granule_context.debugNoMaterialize;
context.granuleParallelism = granule_context.granuleParallelism;
Optional<Version> rv;
if (readVersion != invalidVersion) { rv = readVersion; }
if (readVersion != latestVersion) { rv = readVersion; }
return (FDBResult*)(TXN(tr)->readBlobGranules(range, beginVersion, rv, context).extractPtr()););
}


@@ -185,7 +185,12 @@ typedef struct readgranulecontext {
void* userContext;
/* Returns a unique id for the load. Asynchronous to support queueing multiple in parallel. */
int64_t (*start_load_f)(const char* filename, int filenameLength, int64_t offset, int64_t length, void* context);
int64_t (*start_load_f)(const char* filename,
int filenameLength,
int64_t offset,
int64_t length,
int64_t fullFileLength,
void* context);
/* Returns data for the load. Pass the loadId returned by start_load_f */
uint8_t* (*get_load_f)(int64_t loadId, void* context);
@@ -196,6 +201,9 @@ typedef struct readgranulecontext {
/* Set this to true for testing if you don't want to read the granule files,
just do the request to the blob workers */
fdb_bool_t debugNoMaterialize;
/* Number of granules to load in parallel */
int granuleParallelism;
} FDBReadBlobGranuleContext;
DLLEXPORT void fdb_future_cancel(FDBFuture* f);
@@ -441,15 +449,15 @@ DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_range_split_points(F
int end_key_name_length,
int64_t chunk_size);
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_blob_granule_ranges(FDBTransaction* db,
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_blob_granule_ranges(FDBTransaction* tr,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length);
/* InvalidVersion (-1) for readVersion means get read version from transaction
/* LatestVersion (-2) for readVersion means get read version from transaction
Separated out as optional because BG reads can support longer-lived reads than normal FDB transactions */
DLLEXPORT WARN_UNUSED_RESULT FDBResult* fdb_transaction_read_blob_granules(FDBTransaction* db,
DLLEXPORT WARN_UNUSED_RESULT FDBResult* fdb_transaction_read_blob_granules(FDBTransaction* tr,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,


@@ -0,0 +1,83 @@
/*
* client_memory_test.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define FDB_API_VERSION 710
#include <foundationdb/fdb_c.h>
#include "unit/fdb_api.hpp"
#include <thread>
#include <iostream>
#include <vector>
void fdb_check(fdb_error_t e) {
if (e) {
std::cerr << fdb_get_error(e) << std::endl;
std::abort();
}
}
FDBDatabase* fdb_open_database(const char* clusterFile) {
FDBDatabase* db;
fdb_check(fdb_create_database(clusterFile, &db));
return db;
}
int main(int argc, char** argv) {
if (argc != 2) {
printf("Usage: %s <cluster_file>\n", argv[0]);
return 1;
}
fdb_check(fdb_select_api_version(710));
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
fdb_check(
fdb_network_set_option(FDBNetworkOption::FDB_NET_OPTION_TRACE_ENABLE, reinterpret_cast<const uint8_t*>(""), 0));
fdb_check(fdb_network_set_option(
FDBNetworkOption::FDB_NET_OPTION_TRACE_FORMAT, reinterpret_cast<const uint8_t*>("json"), 4));
// Use a bunch of memory from different client threads
FDBDatabase* db = fdb_open_database(argv[1]);
auto thread_func = [&]() {
fdb::Transaction tr(db);
for (int i = 0; i < 10000; ++i) {
tr.set(std::to_string(i), std::string(i, '\x00'));
}
tr.cancel();
};
std::vector<std::thread> threads;
constexpr auto kThreadCount = 64;
for (int i = 0; i < kThreadCount; ++i) {
threads.emplace_back(thread_func);
}
for (auto& thread : threads) {
thread.join();
}
fdb_database_destroy(db);
db = nullptr;
// Memory usage should go down now if the allocator is returning memory to the OS. It's expected that something is
// externally monitoring the memory usage of this process during this sleep.
using namespace std::chrono_literals;
std::this_thread::sleep_for(10s);
fdb_check(fdb_stop_network());
network_thread.join();
}


@@ -585,6 +585,7 @@ int64_t granule_start_load(const char* filename,
int filenameLength,
int64_t offset,
int64_t length,
int64_t fullFileLength,
void* userContext) {
FILE* fp;
char full_fname[PATH_MAX];
@@ -682,6 +683,7 @@ int run_op_read_blob_granules(FDBTransaction* transaction,
granuleContext.get_load_f = &granule_get_load;
granuleContext.free_load_f = &granule_free_load;
granuleContext.debugNoMaterialize = !doMaterialize;
granuleContext.granuleParallelism = 2; // TODO make knob or setting for changing this?
r = fdb_transaction_read_blob_granules(transaction,
(uint8_t*)keystr,
@@ -689,7 +691,7 @@
(uint8_t*)keystr2,
strlen(keystr2),
0 /* beginVersion*/,
-1, /* endVersion. -1 is use txn read version */
-2, /* endVersion. -2 (latestVersion) is use txn read version */
granuleContext);
free(fileContext.data_by_id);


@@ -138,6 +138,12 @@ Tenant::Tenant(FDBDatabase* db, const uint8_t* name, int name_length) {
}
}
Tenant::~Tenant() {
if (tenant != nullptr) {
fdb_tenant_destroy(tenant);
}
}
// Transaction
Transaction::Transaction(FDBDatabase* db) {
if (fdb_error_t err = fdb_database_create_transaction(db, &tr_)) {
@@ -146,7 +152,7 @@ Transaction::Transaction(FDBDatabase* db) {
}
}
Transaction::Transaction(Tenant tenant) {
Transaction::Transaction(Tenant& tenant) {
if (fdb_error_t err = fdb_tenant_create_transaction(tenant.tenant, &tr_)) {
std::cerr << fdb_get_error(err) << std::endl;
std::abort();


@@ -206,6 +206,11 @@ public:
class Tenant final {
public:
Tenant(FDBDatabase* db, const uint8_t* name, int name_length);
~Tenant();
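// Owns the FDBTenant handle released in the destructor, so the type is
// deliberately neither copyable nor movable; Transaction takes it by reference.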
Tenant(const Tenant&) = delete;
Tenant& operator=(const Tenant&) = delete;
Tenant(Tenant&&) = delete;
Tenant& operator=(Tenant&&) = delete;
private:
friend class Transaction;
@@ -219,7 +224,7 @@ class Transaction final {
public:
// Given an FDBDatabase, initializes a new transaction.
Transaction(FDBDatabase* db);
Transaction(Tenant tenant);
Transaction(Tenant& tenant);
~Transaction();
// Wrapper around fdb_transaction_reset.


@@ -20,6 +20,7 @@
// Unit tests for the FoundationDB C API.
#include "fdb_c_options.g.h"
#define FDB_API_VERSION 710
#include <foundationdb/fdb_c.h>
#include <assert.h>
@@ -948,12 +949,10 @@ std::map<std::string, std::string> fillInRecords(int n) {
return data;
}
GetMappedRangeResult getMappedIndexEntries(int beginId, int endId, fdb::Transaction& tr) {
GetMappedRangeResult getMappedIndexEntries(int beginId, int endId, fdb::Transaction& tr, std::string mapper) {
std::string indexEntryKeyBegin = indexEntryKey(beginId);
std::string indexEntryKeyEnd = indexEntryKey(endId);
std::string mapper = Tuple().append(prefix).append(RECORD).append("{K[3]}"_sr).append("{...}"_sr).pack().toString();
return get_mapped_range(
tr,
FDB_KEYSEL_FIRST_GREATER_OR_EQUAL((const uint8_t*)indexEntryKeyBegin.c_str(), indexEntryKeyBegin.size()),
@@ -968,6 +967,11 @@ GetMappedRangeResult getMappedIndexEntries(int beginId, int endId, fdb::Transact
/* reverse */ 0);
}
GetMappedRangeResult getMappedIndexEntries(int beginId, int endId, fdb::Transaction& tr) {
std::string mapper = Tuple().append(prefix).append(RECORD).append("{K[3]}"_sr).append("{...}"_sr).pack().toString();
return getMappedIndexEntries(beginId, endId, tr, mapper);
}
TEST_CASE("fdb_transaction_get_mapped_range") {
const int TOTAL_RECORDS = 20;
fillInRecords(TOTAL_RECORDS);
@@ -1008,7 +1012,6 @@ TEST_CASE("fdb_transaction_get_mapped_range") {
TEST_CASE("fdb_transaction_get_mapped_range_restricted_to_serializable") {
std::string mapper = Tuple().append(prefix).append(RECORD).append("{K[3]}"_sr).pack().toString();
fdb::Transaction tr(db);
fdb_check(tr.set_option(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE, nullptr, 0));
auto result = get_mapped_range(
tr,
FDB_KEYSEL_FIRST_GREATER_OR_EQUAL((const uint8_t*)indexEntryKey(0).c_str(), indexEntryKey(0).size()),
@@ -1038,11 +1041,36 @@ TEST_CASE("fdb_transaction_get_mapped_range_restricted_to_ryw_enable") {
/* target_bytes */ 0,
/* FDBStreamingMode */ FDB_STREAMING_MODE_WANT_ALL,
/* iteration */ 0,
/* snapshot */ true,
/* snapshot */ false,
/* reverse */ 0);
ASSERT(result.err == error_code_unsupported_operation);
}
void assertNotTuple(std::string str) {
try {
Tuple::unpack(str);
} catch (Error& e) {
return;
}
UNREACHABLE();
}
TEST_CASE("fdb_transaction_get_mapped_range_fail_on_mapper_not_tuple") {
// A string that cannot be parsed as tuple.
// "\x15:\x152\x15E\x15\x09\x15\x02\x02MySimpleRecord$repeater-version\x00\x15\x013\x00\x00\x00\x00\x1aU\x90\xba\x00\x00\x00\x02\x15\x04"
std::string mapper = {
'\x15', ':', '\x15', '2', '\x15', 'E', '\x15', '\t', '\x15', '\x02', '\x02', 'M',
'y', 'S', 'i', 'm', 'p', 'l', 'e', 'R', 'e', 'c', 'o', 'r',
'd', '$', 'r', 'e', 'p', 'e', 'a', 't', 'e', 'r', '-', 'v',
'e', 'r', 's', 'i', 'o', 'n', '\x00', '\x15', '\x01', '3', '\x00', '\x00',
'\x00', '\x00', '\x1a', 'U', '\x90', '\xba', '\x00', '\x00', '\x00', '\x02', '\x15', '\x04'
};
assertNotTuple(mapper);
fdb::Transaction tr(db);
auto result = getMappedIndexEntries(1, 3, tr, mapper);
ASSERT(result.err == error_code_mapper_not_tuple);
}
TEST_CASE("fdb_transaction_get_range reverse") {
std::map<std::string, std::string> data = create_data({ { "a", "1" }, { "b", "2" }, { "c", "3" }, { "d", "4" } });
insert_data(db, data);
@@ -2430,6 +2458,38 @@ TEST_CASE("Tenant create, access, and delete") {
break;
}
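// Verify the new tenant appears in the management tenant map, retrying
// with on_error on transient failures.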
while (1) {
StringRef begin = "\xff\xff/management/tenant_map/"_sr;
StringRef end = "\xff\xff/management/tenant_map0"_sr;
fdb_check(tr.set_option(FDB_TR_OPTION_SPECIAL_KEY_SPACE_ENABLE_WRITES, nullptr, 0));
fdb::KeyValueArrayFuture f = tr.get_range(FDB_KEYSEL_FIRST_GREATER_OR_EQUAL(begin.begin(), begin.size()),
FDB_KEYSEL_FIRST_GREATER_OR_EQUAL(end.begin(), end.size()),
/* limit */ 0,
/* target_bytes */ 0,
/* FDBStreamingMode */ FDB_STREAMING_MODE_WANT_ALL,
/* iteration */ 0,
/* snapshot */ false,
/* reverse */ 0);
fdb_error_t err = wait_future(f);
if (err) {
fdb::EmptyFuture f2 = tr.on_error(err);
fdb_check(wait_future(f2));
continue;
}
FDBKeyValue const* outKv;
int outCount;
int outMore;
fdb_check(f.get(&outKv, &outCount, &outMore));
CHECK(outCount == 1);
CHECK(StringRef(outKv->key, outKv->key_length) == StringRef(tenantName).withPrefix(begin));
tr.reset();
break;
}
fdb::Tenant tenant(db, reinterpret_cast<const uint8_t*>(tenantName.c_str()), tenantName.size());
fdb::Transaction tr2(tenant);
@@ -2505,6 +2565,152 @@ TEST_CASE("Tenant create, access, and delete") {
}
}
int64_t granule_start_load_fail(const char* filename,
int filenameLength,
int64_t offset,
int64_t length,
int64_t fullFileLength,
void* userContext) {
CHECK(false);
return -1;
}
uint8_t* granule_get_load_fail(int64_t loadId, void* userContext) {
CHECK(false);
return nullptr;
}
void granule_free_load_fail(int64_t loadId, void* userContext) {
CHECK(false);
}
TEST_CASE("Blob Granule Functions") {
auto confValue =
get_value("\xff/conf/blob_granules_enabled", /* snapshot */ false, { FDB_TR_OPTION_READ_SYSTEM_KEYS });
if (!confValue.has_value() || confValue.value() != "1") {
return;
}
// write some data
insert_data(db, create_data({ { "bg1", "a" }, { "bg2", "b" }, { "bg3", "c" } }));
// because wiring up files is non-trivial, just test the calls complete with the expected no_materialize error
FDBReadBlobGranuleContext granuleContext;
granuleContext.userContext = nullptr;
granuleContext.start_load_f = &granule_start_load_fail;
granuleContext.get_load_f = &granule_get_load_fail;
granuleContext.free_load_f = &granule_free_load_fail;
granuleContext.debugNoMaterialize = true;
granuleContext.granuleParallelism = 1;
// dummy values
FDBKeyValue const* out_kv;
int out_count;
int out_more;
fdb::Transaction tr(db);
int64_t originalReadVersion = -1;
// test no materialize gets error but completes, save read version
while (1) {
fdb_check(tr.set_option(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE, nullptr, 0));
// -2 is latest version
fdb::KeyValueArrayResult r = tr.read_blob_granules(key("bg"), key("bh"), 0, -2, granuleContext);
fdb_error_t err = r.get(&out_kv, &out_count, &out_more);
if (err && err != 2037 /* blob_granule_not_materialized */) {
fdb::EmptyFuture f2 = tr.on_error(err);
fdb_check(wait_future(f2));
continue;
}
CHECK(err == 2037 /* blob_granule_not_materialized */);
// If read done, save read version. Should have already used read version so this shouldn't error
fdb::Int64Future grvFuture = tr.get_read_version();
fdb_error_t grvErr = wait_future(grvFuture);
CHECK(!grvErr);
CHECK(!grvFuture.get(&originalReadVersion));
CHECK(originalReadVersion > 0);
tr.reset();
break;
}
// test with begin version > 0
while (1) {
fdb_check(tr.set_option(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE, nullptr, 0));
// -2 is latest version, read version should be >= originalReadVersion
fdb::KeyValueArrayResult r =
tr.read_blob_granules(key("bg"), key("bh"), originalReadVersion, -2, granuleContext);
fdb_error_t err = r.get(&out_kv, &out_count, &out_more);
if (err && err != 2037 /* blob_granule_not_materialized */) {
fdb::EmptyFuture f2 = tr.on_error(err);
fdb_check(wait_future(f2));
continue;
}
CHECK(err == 2037 /* blob_granule_not_materialized */);
tr.reset();
break;
}
// test with prior read version completes after delay larger than normal MVC window
// TODO: should we not do this?
std::this_thread::sleep_for(std::chrono::milliseconds(6000));
while (1) {
fdb_check(tr.set_option(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE, nullptr, 0));
fdb::KeyValueArrayResult r =
tr.read_blob_granules(key("bg"), key("bh"), 0, originalReadVersion, granuleContext);
fdb_error_t err = r.get(&out_kv, &out_count, &out_more);
if (err && err != 2037 /* blob_granule_not_materialized */) {
fdb::EmptyFuture f2 = tr.on_error(err);
fdb_check(wait_future(f2));
continue;
}
CHECK(err == 2037 /* blob_granule_not_materialized */);
tr.reset();
break;
}
// test ranges
while (1) {
fdb::KeyRangeArrayFuture f = tr.get_blob_granule_ranges(key("bg"), key("bh"));
fdb_error_t err = wait_future(f);
if (err) {
fdb::EmptyFuture f2 = tr.on_error(err);
fdb_check(wait_future(f2));
continue;
}
const FDBKeyRange* out_kr;
int out_count;
fdb_check(f.get(&out_kr, &out_count));
CHECK(out_count >= 1);
// check key ranges are in order
for (int i = 0; i < out_count; i++) {
// key range start < end
CHECK(std::string((const char*)out_kr[i].begin_key, out_kr[i].begin_key_length) <
std::string((const char*)out_kr[i].end_key, out_kr[i].end_key_length));
}
// Ranges themselves are sorted
for (int i = 0; i < out_count - 1; i++) {
CHECK(std::string((const char*)out_kr[i].end_key, out_kr[i].end_key_length) <=
std::string((const char*)out_kr[i + 1].begin_key, out_kr[i + 1].begin_key_length));
}
tr.reset();
break;
}
}
int main(int argc, char** argv) {
if (argc < 3) {
std::cout << "Unit tests for the FoundationDB C API.\n"


@@ -32,6 +32,7 @@ set(JAVA_BINDING_SRCS
src/main/com/apple/foundationdb/DirectBufferPool.java
src/main/com/apple/foundationdb/FDB.java
src/main/com/apple/foundationdb/FDBDatabase.java
src/main/com/apple/foundationdb/FDBTenant.java
src/main/com/apple/foundationdb/FDBTransaction.java
src/main/com/apple/foundationdb/FutureInt64.java
src/main/com/apple/foundationdb/FutureKey.java
@@ -64,6 +65,8 @@ set(JAVA_BINDING_SRCS
src/main/com/apple/foundationdb/ReadTransactionContext.java
src/main/com/apple/foundationdb/subspace/package-info.java
src/main/com/apple/foundationdb/subspace/Subspace.java
src/main/com/apple/foundationdb/Tenant.java
src/main/com/apple/foundationdb/TenantManagement.java
src/main/com/apple/foundationdb/Transaction.java
src/main/com/apple/foundationdb/TransactionContext.java
src/main/com/apple/foundationdb/EventKeeper.java


@@ -663,6 +663,34 @@ JNIEXPORT jbyteArray JNICALL Java_com_apple_foundationdb_FutureKey_FutureKey_1ge
return result;
}
JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1openTenant(JNIEnv* jenv,
jobject,
jlong dbPtr,
jbyteArray tenantNameBytes) {
if (!dbPtr || !tenantNameBytes) {
throwParamNotNull(jenv);
return 0;
}
FDBDatabase* database = (FDBDatabase*)dbPtr;
FDBTenant* tenant;
uint8_t* barr = (uint8_t*)jenv->GetByteArrayElements(tenantNameBytes, JNI_NULL);
if (!barr) {
if (!jenv->ExceptionOccurred())
throwRuntimeEx(jenv, "Error getting handle to native resources");
return 0;
}
fdb_error_t err = fdb_database_open_tenant(database, barr, jenv->GetArrayLength(tenantNameBytes), &tenant);
// Release the pinned name bytes on every path, not only on success.
jenv->ReleaseByteArrayElements(tenantNameBytes, (jbyte*)barr, JNI_ABORT);
if (err) {
safeThrow(jenv, getThrowable(jenv, err));
return 0;
}
return (jlong)tenant;
}
JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1createTransaction(JNIEnv* jenv,
jobject,
jlong dbPtr) {
@@ -764,6 +792,31 @@ JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDB_Database_1create(JNIEnv*
return (jlong)db;
}
JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBTenant_Tenant_1createTransaction(JNIEnv* jenv,
jobject,
jlong tPtr) {
if (!tPtr) {
throwParamNotNull(jenv);
return 0;
}
FDBTenant* tenant = (FDBTenant*)tPtr;
FDBTransaction* tr;
fdb_error_t err = fdb_tenant_create_transaction(tenant, &tr);
if (err) {
safeThrow(jenv, getThrowable(jenv, err));
return 0;
}
return (jlong)tr;
}
JNIEXPORT void JNICALL Java_com_apple_foundationdb_FDBTenant_Tenant_1dispose(JNIEnv* jenv, jobject, jlong tPtr) {
if (!tPtr) {
throwParamNotNull(jenv);
return;
}
fdb_tenant_destroy((FDBTenant*)tPtr);
}
JNIEXPORT void JNICALL Java_com_apple_foundationdb_FDBTransaction_Transaction_1setVersion(JNIEnv* jenv,
jobject,
jlong tPtr,


@@ -23,6 +23,7 @@ package com.apple.foundationdb;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
import java.util.function.Function;
import com.apple.foundationdb.tuple.Tuple;
/**
* A mutable, lexicographically ordered mapping from binary keys to binary values.
@@ -41,11 +42,82 @@ import java.util.function.Function;
*/
public interface Database extends AutoCloseable, TransactionContext {
/**
* Creates a {@link Transaction} that operates on this {@code Database}.<br>
* Opens an existing tenant to be used for running transactions.<br>
* <br>
* <b>Note:</b> opening a tenant does not check its existence in the cluster. If the tenant does not exist,
* attempts to read or write data with it will fail.
*
* @param tenantName The name of the tenant to open.
* @return a {@link Tenant} that can be used to create transactions that will operate in the tenant's key-space.
*/
default Tenant openTenant(byte[] tenantName) {
return openTenant(tenantName, getExecutor());
}
/**
* Opens an existing tenant to be used for running transactions. This is a convenience method that generates the
* tenant name by packing a {@code Tuple}.<br>
* <br>
* <b>Note:</b> opening a tenant does not check its existence in the cluster. If the tenant does not exist,
* attempts to read or write data with it will fail.
*
* @param tenantName The name of the tenant to open, as a Tuple.
* @return a {@link Tenant} that can be used to create transactions that will operate in the tenant's key-space.
*/
Tenant openTenant(Tuple tenantName);
/**
* Opens an existing tenant to be used for running transactions.
*
* @param tenantName The name of the tenant to open.
* @param e the {@link Executor} to use when executing asynchronous callbacks.
* @return a {@link Tenant} that can be used to create transactions that will operate in the tenant's key-space.
*/
Tenant openTenant(byte[] tenantName, Executor e);
/**
* Opens an existing tenant to be used for running transactions. This is a convenience method that generates the
* tenant name by packing a {@code Tuple}.
*
* @param tenantName The name of the tenant to open, as a Tuple.
* @param e the {@link Executor} to use when executing asynchronous callbacks.
* @return a {@link Tenant} that can be used to create transactions that will operate in the tenant's key-space.
*/
Tenant openTenant(Tuple tenantName, Executor e);
/**
* Opens an existing tenant to be used for running transactions.
*
* @param tenantName The name of the tenant to open.
* @param e the {@link Executor} to use when executing asynchronous callbacks.
* @param eventKeeper the {@link EventKeeper} to use when tracking instrumented calls for the tenant's transactions.
* @return a {@link Tenant} that can be used to create transactions that will operate in the tenant's key-space.
*/
Tenant openTenant(byte[] tenantName, Executor e, EventKeeper eventKeeper);
/**
* Opens an existing tenant to be used for running transactions. This is a convenience method that generates the
* tenant name by packing a {@code Tuple}.
*
* @param tenantName The name of the tenant to open, as a Tuple.
* @param e the {@link Executor} to use when executing asynchronous callbacks.
* @param eventKeeper the {@link EventKeeper} to use when tracking instrumented calls for the tenant's transactions.
* @return a {@link Tenant} that can be used to create transactions that will operate in the tenant's key-space.
*/
Tenant openTenant(Tuple tenantName, Executor e, EventKeeper eventKeeper);
/**
* Creates a {@link Transaction} that operates on this {@code Database}. Creating a transaction
* in this way does not associate it with a {@code Tenant}, and as a result the transaction will
* operate on the entire key-space for the database.<br>
* <br>
* <b>Note:</b> Java transactions automatically set the {@link TransactionOptions#setUsedDuringCommitProtectionDisable}
* option. This is because the Java bindings disallow use of {@code Transaction} objects after
* {@link Transaction#onError} is called.
* {@link Transaction#onError} is called.<br>
* <br>
* <b>Note:</b> Transactions created directly on a {@code Database} object cannot be used in a cluster
* that requires tenant-based access. To run transactions in those clusters, you must first open a tenant
* with {@link #openTenant(byte[])}.
*
* @return a newly created {@code Transaction} that reads from and writes to this {@code Database}.
*/


@@ -27,6 +27,8 @@ import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Function;
import com.apple.foundationdb.async.AsyncUtil;
import com.apple.foundationdb.tuple.ByteArrayUtil;
import com.apple.foundationdb.tuple.Tuple;
class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsumer {
private DatabaseOptions options;
@@ -116,6 +118,44 @@ class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsume
}
}
@Override
public Tenant openTenant(byte[] tenantName, Executor e) {
return openTenant(tenantName, e, eventKeeper);
}
@Override
public Tenant openTenant(Tuple tenantName) {
return openTenant(tenantName.pack());
}
@Override
public Tenant openTenant(Tuple tenantName, Executor e) {
return openTenant(tenantName.pack(), e);
}
@Override
public Tenant openTenant(byte[] tenantName, Executor e, EventKeeper eventKeeper) {
pointerReadLock.lock();
Tenant tenant = null;
try {
tenant = new FDBTenant(Database_openTenant(getPtr(), tenantName), this, tenantName, e, eventKeeper);
return tenant;
} catch (RuntimeException err) {
if (tenant != null) {
tenant.close();
}
throw err;
} finally {
pointerReadLock.unlock();
}
}
@Override
public Tenant openTenant(Tuple tenantName, Executor e, EventKeeper eventKeeper) {
return openTenant(tenantName.pack(), e, eventKeeper);
}
@Override
public Transaction createTransaction(Executor e) {
return createTransaction(e, eventKeeper);
@@ -170,6 +210,7 @@ class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsume
Database_dispose(cPtr);
}
private native long Database_openTenant(long cPtr, byte[] tenantName);
private native long Database_createTransaction(long cPtr);
private native void Database_dispose(long cPtr);
private native void Database_setOption(long cPtr, int code, byte[] value) throws FDBException;


@@ -0,0 +1,158 @@
/*
* FDBTenant.java
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.apple.foundationdb;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.concurrent.Executor;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Function;
import com.apple.foundationdb.async.AsyncUtil;
import com.apple.foundationdb.tuple.ByteArrayUtil;
class FDBTenant extends NativeObjectWrapper implements Tenant {
private final Database database;
private final byte[] name;
private final Executor executor;
private final EventKeeper eventKeeper;
protected FDBTenant(long cPtr, Database database, byte[] name, Executor executor) {
this(cPtr, database, name, executor, null);
}
protected FDBTenant(long cPtr, Database database, byte[] name, Executor executor, EventKeeper eventKeeper) {
super(cPtr);
this.database = database;
this.name = name;
this.executor = executor;
this.eventKeeper = eventKeeper;
}
@Override
public <T> T run(Function<? super Transaction, T> retryable, Executor e) {
Transaction t = this.createTransaction(e);
try {
while (true) {
try {
T returnVal = retryable.apply(t);
t.commit().join();
return returnVal;
} catch (RuntimeException err) {
t = t.onError(err).join();
}
}
} finally {
t.close();
}
}
@Override
public <T> T read(Function<? super ReadTransaction, T> retryable, Executor e) {
return this.run(retryable, e);
}
@Override
public <T> CompletableFuture<T> runAsync(final Function<? super Transaction, ? extends CompletableFuture<T>> retryable, Executor e) {
final AtomicReference<Transaction> trRef = new AtomicReference<>(createTransaction(e));
final AtomicReference<T> returnValue = new AtomicReference<>();
return AsyncUtil.whileTrue(() -> {
CompletableFuture<T> process = AsyncUtil.applySafely(retryable, trRef.get());
return AsyncUtil.composeHandleAsync(process.thenComposeAsync(returnVal ->
trRef.get().commit().thenApply(o -> {
returnValue.set(returnVal);
return false;
}), e),
(value, t) -> {
if(t == null)
return CompletableFuture.completedFuture(value);
if(!(t instanceof RuntimeException))
throw new CompletionException(t);
return trRef.get().onError(t).thenApply(newTr -> {
trRef.set(newTr);
return true;
});
}, e);
}, e)
.thenApply(o -> returnValue.get())
.whenComplete((v, t) -> trRef.get().close());
}
@Override
public <T> CompletableFuture<T> readAsync(
Function<? super ReadTransaction, ? extends CompletableFuture<T>> retryable, Executor e) {
return this.runAsync(retryable, e);
}
@Override
protected void finalize() throws Throwable {
try {
checkUnclosed("Tenant");
close();
}
finally {
super.finalize();
}
}
@Override
public Transaction createTransaction(Executor e) {
return createTransaction(e, eventKeeper);
}
@Override
public Transaction createTransaction(Executor e, EventKeeper eventKeeper) {
pointerReadLock.lock();
Transaction tr = null;
try {
tr = new FDBTransaction(Tenant_createTransaction(getPtr()), database, e, eventKeeper);
tr.options().setUsedDuringCommitProtectionDisable();
return tr;
} catch (RuntimeException err) {
if (tr != null) {
tr.close();
}
throw err;
} finally {
pointerReadLock.unlock();
}
}
@Override
public byte[] getName() {
return name;
}
@Override
public Executor getExecutor() {
return executor;
}
@Override
protected void closeInternal(long cPtr) {
Tenant_dispose(cPtr);
}
private native long Tenant_createTransaction(long cPtr);
private native void Tenant_dispose(long cPtr);
}


@@ -0,0 +1,257 @@
/*
* Tenant.java
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.apple.foundationdb;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
import java.util.function.Function;
/**
* A tenant represents a named key-space within a database that can be interacted with
* transactionally.<br>
* <br>
* The simplest correct programs using tenants will make use of the methods defined
* in the {@link TransactionContext} interface. When used on a {@code Tenant} these
* methods will call {@code Transaction#commit()} after user code has been
* executed. These methods will not return successfully until {@code commit()} has
* returned successfully.<br>
* <br>
* <b>Note:</b> {@code Tenant} objects must be {@link #close closed} when no longer
* in use in order to free any associated resources.
*/
public interface Tenant extends AutoCloseable, TransactionContext {
/**
* Creates a {@link Transaction} that operates on this {@code Tenant}.<br>
* <br>
* <b>Note:</b> Java transactions automatically set the {@link TransactionOptions#setUsedDuringCommitProtectionDisable}
* option. This is because the Java bindings disallow use of {@code Transaction} objects after
* {@link Transaction#onError} is called.
*
* @return a newly created {@code Transaction} that reads from and writes to this {@code Tenant}.
*/
default Transaction createTransaction() {
return createTransaction(getExecutor());
}
/**
* Creates a {@link Transaction} that operates on this {@code Tenant} with the given {@link Executor}
* for asynchronous callbacks.
*
* @param e the {@link Executor} to use when executing asynchronous callbacks.
* @return a newly created {@code Transaction} that reads from and writes to this {@code Tenant}.
*/
Transaction createTransaction(Executor e);
/**
* Creates a {@link Transaction} that operates on this {@code Tenant} with the given {@link Executor}
* for asynchronous callbacks.
*
* @param e the {@link Executor} to use when executing asynchronous callbacks.
* @param eventKeeper the {@link EventKeeper} to use when tracking instrumented calls for the transaction.
*
* @return a newly created {@code Transaction} that reads from and writes to this {@code Tenant}.
*/
Transaction createTransaction(Executor e, EventKeeper eventKeeper);
/**
* Returns the name of this {@code Tenant}.
*
* @return the name of this {@code Tenant} as a byte string.
*/
byte[] getName();
/**
* Runs a read-only transactional function against this {@code Tenant} with retry logic.
* {@link Function#apply(Object) apply(ReadTransaction)} will be called on the
* supplied {@link Function} until a non-retryable
* FDBException (or any {@code Throwable} other than an {@code FDBException})
* is thrown. This call is blocking -- this
* method will not return until the {@code Function} has been called and completed without error.<br>
*
* @param retryable the block of logic to execute in a {@link Transaction} against
* this tenant
* @param <T> the return type of {@code retryable}
*
* @return the result of the last run of {@code retryable}
*/
@Override
default <T> T read(Function<? super ReadTransaction, T> retryable) {
return read(retryable, getExecutor());
}
/**
* Runs a read-only transactional function against this {@code Tenant} with retry logic. Use
* this formulation of {@link #read(Function)} if one wants to set a custom {@link Executor}
* for the transaction when run.
*
* @param retryable the block of logic to execute in a {@link Transaction} against
* this tenant
* @param e the {@link Executor} to use for asynchronous callbacks
* @param <T> the return type of {@code retryable}
* @return the result of the last run of {@code retryable}
*
* @see #read(Function)
*/
<T> T read(Function<? super ReadTransaction, T> retryable, Executor e);
/**
* Runs a read-only transactional function against this {@code Tenant} with retry logic.
* {@link Function#apply(Object) apply(ReadTransaction)} will be called on the
* supplied {@link Function} until a non-retryable
* FDBException (or any {@code Throwable} other than an {@code FDBException})
* is thrown. This call is non-blocking -- this
* method will return immediately and with a {@link CompletableFuture} that will be
* set when the {@code Function} has been called and completed without error.<br>
* <br>
* Any errors encountered executing {@code retryable}, or received from the
* database, will be set on the returned {@code CompletableFuture}.
*
* @param retryable the block of logic to execute in a {@link ReadTransaction} against
* this tenant
* @param <T> the return type of {@code retryable}
*
* @return a {@code CompletableFuture} that will be set to the value returned by the last call
* to {@code retryable}
*/
@Override
default <T> CompletableFuture<T> readAsync(
Function<? super ReadTransaction, ? extends CompletableFuture<T>> retryable) {
return readAsync(retryable, getExecutor());
}
/**
* Runs a read-only transactional function against this {@code Tenant} with retry logic.
* Use this version of {@link #readAsync(Function)} if one wants to set a custom
* {@link Executor} for the transaction when run.
*
* @param retryable the block of logic to execute in a {@link ReadTransaction} against
* this tenant
* @param e the {@link Executor} to use for asynchronous callbacks
* @param <T> the return type of {@code retryable}
*
* @return a {@code CompletableFuture} that will be set to the value returned by the last call
* to {@code retryable}
*
* @see #readAsync(Function)
*/
<T> CompletableFuture<T> readAsync(
Function<? super ReadTransaction, ? extends CompletableFuture<T>> retryable, Executor e);
/**
* Runs a transactional function against this {@code Tenant} with retry logic.
* {@link Function#apply(Object) apply(Transaction)} will be called on the
* supplied {@link Function} until a non-retryable
* FDBException (or any {@code Throwable} other than an {@code FDBException})
* is thrown or {@link Transaction#commit() commit()},
* when called after {@code apply()}, returns success. This call is blocking -- this
* method will not return until {@code commit()} has been called and returned success.<br>
* <br>
* As with other client/server databases, in some failure scenarios a client may
* be unable to determine whether a transaction succeeded. In these cases, your
* transaction may be executed twice. For more information about how to reason
* about these situations see
* <a href="/foundationdb/developer-guide.html#transactions-with-unknown-results"
* target="_blank">the FoundationDB Developer Guide</a>
*
* @param retryable the block of logic to execute in a {@link Transaction} against
* this tenant
* @param <T> the return type of {@code retryable}
*
* @return the result of the last run of {@code retryable}
*/
@Override
default <T> T run(Function<? super Transaction, T> retryable) {
return run(retryable, getExecutor());
}
/**
* Runs a transactional function against this {@code Tenant} with retry logic.
* Use this formulation of {@link #run(Function)} if one would like to set a
* custom {@link Executor} for the transaction when run.
*
* @param retryable the block of logic to execute in a {@link Transaction} against
* this tenant
* @param e the {@link Executor} to use for asynchronous callbacks
* @param <T> the return type of {@code retryable}
*
* @return the result of the last run of {@code retryable}
*/
<T> T run(Function<? super Transaction, T> retryable, Executor e);
/**
* Runs a transactional function against this {@code Tenant} with retry logic.
* {@link Function#apply(Object) apply(Transaction)} will be called on the
* supplied {@link Function} until a non-retryable
* FDBException (or any {@code Throwable} other than an {@code FDBException})
* is thrown or {@link Transaction#commit() commit()},
* when called after {@code apply()}, returns success. This call is non-blocking -- this
* method will return immediately and with a {@link CompletableFuture} that will be
* set when {@code commit()} has been called and returned success.<br>
* <br>
* As with other client/server databases, in some failure scenarios a client may
* be unable to determine whether a transaction succeeded. In these cases, your
* transaction may be executed twice. For more information about how to reason
* about these situations see
* <a href="/foundationdb/developer-guide.html#transactions-with-unknown-results"
* target="_blank">the FoundationDB Developer Guide</a><br>
* <br>
* Any errors encountered executing {@code retryable}, or received from the
* database, will be set on the returned {@code CompletableFuture}.
*
* @param retryable the block of logic to execute in a {@link Transaction} against
* this tenant
* @param <T> the return type of {@code retryable}
*
* @return a {@code CompletableFuture} that will be set to the value returned by the last call
* to {@code retryable}
*/
@Override
default <T> CompletableFuture<T> runAsync(
Function<? super Transaction, ? extends CompletableFuture<T>> retryable) {
return runAsync(retryable, getExecutor());
}
/**
* Runs a transactional function against this {@code Tenant} with retry logic. Use
* this formulation of the non-blocking {@link #runAsync(Function)} if one wants
* to set a custom {@link Executor} for the transaction when run.
*
* @param retryable the block of logic to execute in a {@link Transaction} against
* this tenant
* @param e the {@link Executor} to use for asynchronous callbacks
* @param <T> the return type of {@code retryable}
*
* @return a {@code CompletableFuture} that will be set to the value returned by the last call
* to {@code retryable}
*
* @see #run(Function)
*/
<T> CompletableFuture<T> runAsync(
Function<? super Transaction, ? extends CompletableFuture<T>> retryable, Executor e);
/**
* Close the {@code Tenant} object and release any associated resources. This must be called at
* least once after the {@code Tenant} object is no longer in use. This can be called multiple
* times, but care should be taken that it is not in use in another thread at the time of the call.
*/
@Override
void close();
}


@@ -0,0 +1,214 @@
/*
* TenantManagement.java
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.apple.foundationdb;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.concurrent.Executor;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.BiFunction;
import com.apple.foundationdb.async.AsyncIterable;
import com.apple.foundationdb.async.AsyncIterator;
import com.apple.foundationdb.async.AsyncUtil;
import com.apple.foundationdb.async.CloseableAsyncIterator;
import com.apple.foundationdb.tuple.ByteArrayUtil;
import com.apple.foundationdb.tuple.Tuple;
/**
* The FoundationDB API includes functions to manage the set of tenants in a cluster.
*/
public class TenantManagement {
static final byte[] TENANT_MAP_PREFIX = ByteArrayUtil.join(new byte[] { (byte)255, (byte)255 },
"/management/tenant_map/".getBytes());
/**
* Creates a new tenant in the cluster. If the tenant already exists, this operation will complete
* successfully without changing anything. The transaction must be committed for the creation to take
* effect or to observe any errors.
*
* @param tr The transaction used to create the tenant.
* @param tenantName The name of the tenant. Can be any byte string that does not begin with a 0xFF byte.
*/
public static void createTenant(Transaction tr, byte[] tenantName) {
tr.options().setSpecialKeySpaceEnableWrites();
tr.set(ByteArrayUtil.join(TENANT_MAP_PREFIX, tenantName), new byte[0]);
}
/**
* Creates a new tenant in the cluster. If the tenant already exists, this operation will complete
* successfully without changing anything. The transaction must be committed for the creation to take
* effect or to observe any errors.<br>
* <br>
* This is a convenience method that generates the tenant name by packing a {@code Tuple}.
*
* @param tr The transaction used to create the tenant.
* @param tenantName The name of the tenant, as a Tuple.
*/
public static void createTenant(Transaction tr, Tuple tenantName) {
createTenant(tr, tenantName.pack());
}
/**
* Creates a new tenant in the cluster using a transaction created on the specified {@code Database}.
* This operation will first check whether the tenant exists, and if it does it will set the
* {@code CompletableFuture} to a tenant_already_exists error. Otherwise, it will attempt to create
* the tenant in a retry loop. If the tenant is created concurrently by another transaction, this
* function may still return successfully.
*
* @param db The database used to create a transaction for creating the tenant.
* @param tenantName The name of the tenant. Can be any byte string that does not begin with a 0xFF byte.
* @return a {@code CompletableFuture} that when set without error will indicate that the tenant has
* been created.
*/
public static CompletableFuture<Void> createTenant(Database db, byte[] tenantName) {
final AtomicBoolean checkedExistence = new AtomicBoolean(false);
final byte[] key = ByteArrayUtil.join(TENANT_MAP_PREFIX, tenantName);
return db.runAsync(tr -> {
tr.options().setSpecialKeySpaceEnableWrites();
if(checkedExistence.get()) {
tr.set(key, new byte[0]);
return CompletableFuture.completedFuture(null);
}
else {
return tr.get(key).thenAcceptAsync(result -> {
checkedExistence.set(true);
if(result != null) {
throw new FDBException("A tenant with the given name already exists", 2132);
}
tr.set(key, new byte[0]);
});
}
});
}
/**
* Creates a new tenant in the cluster using a transaction created on the specified {@code Database}.
* This operation will first check whether the tenant exists, and if it does it will set the
* {@code CompletableFuture} to a tenant_already_exists error. Otherwise, it will attempt to create
* the tenant in a retry loop. If the tenant is created concurrently by another transaction, this
* function may still return successfully.<br>
* <br>
* This is a convenience method that generates the tenant name by packing a {@code Tuple}.
*
* @param db The database used to create a transaction for creating the tenant.
* @param tenantName The name of the tenant, as a Tuple.
* @return a {@code CompletableFuture} that when set without error will indicate that the tenant has
* been created.
*/
public static CompletableFuture<Void> createTenant(Database db, Tuple tenantName) {
return createTenant(db, tenantName.pack());
}
/**
* Deletes a tenant from the cluster. If the tenant does not exist, this operation will complete
* successfully without changing anything. The transaction must be committed for the deletion to take
* effect or to observe any errors.<br>
* <br>
* <b>Note:</b> A tenant cannot be deleted if it has any data in it. To delete a non-empty tenant, you must
* first use a clear operation to delete all of its keys.
*
* @param tr The transaction used to delete the tenant.
* @param tenantName The name of the tenant being deleted.
*/
public static void deleteTenant(Transaction tr, byte[] tenantName) {
tr.options().setSpecialKeySpaceEnableWrites();
tr.clear(ByteArrayUtil.join(TENANT_MAP_PREFIX, tenantName));
}
/**
* Deletes a tenant from the cluster. If the tenant does not exist, this operation will complete
* successfully without changing anything. The transaction must be committed for the deletion to take
* effect or to observe any errors.<br>
* <br>
* <b>Note:</b> A tenant cannot be deleted if it has any data in it. To delete a non-empty tenant, you must
* first use a clear operation to delete all of its keys.<br>
* <br>
* This is a convenience method that generates the tenant name by packing a {@code Tuple}.
*
* @param tr The transaction used to delete the tenant.
* @param tenantName The name of the tenant being deleted, as a Tuple.
*/
public static void deleteTenant(Transaction tr, Tuple tenantName) {
deleteTenant(tr, tenantName.pack());
}
/**
* Deletes a tenant from the cluster using a transaction created on the specified {@code Database}. This
* operation will first check whether the tenant exists, and if it does not it will set the
* {@code CompletableFuture} to a tenant_not_found error. Otherwise, it will attempt to delete the
* tenant in a retry loop. If the tenant is deleted concurrently by another transaction, this function may
* still return successfully.<br>
* <br>
* <b>Note:</b> A tenant cannot be deleted if it has any data in it. To delete a non-empty tenant, you must
* first use a clear operation to delete all of its keys.
*
* @param db The database used to create a transaction for deleting the tenant.
* @param tenantName The name of the tenant being deleted.
* @return a {@code CompletableFuture} that when set without error will indicate that the tenant has
* been deleted.
*/
public static CompletableFuture<Void> deleteTenant(Database db, byte[] tenantName) {
final AtomicBoolean checkedExistence = new AtomicBoolean(false);
final byte[] key = ByteArrayUtil.join(TENANT_MAP_PREFIX, tenantName);
return db.runAsync(tr -> {
tr.options().setSpecialKeySpaceEnableWrites();
if(checkedExistence.get()) {
tr.clear(key);
return CompletableFuture.completedFuture(null);
}
else {
return tr.get(key).thenAcceptAsync(result -> {
checkedExistence.set(true);
if(result == null) {
throw new FDBException("Tenant does not exist", 2131);
}
tr.clear(key);
});
}
});
}
/**
* Deletes a tenant from the cluster using a transaction created on the specified {@code Database}. This
* operation will first check whether the tenant exists, and if it does not it will set the
* {@code CompletableFuture} to a tenant_not_found error. Otherwise, it will attempt to delete the
* tenant in a retry loop. If the tenant is deleted concurrently by another transaction, this function may
* still return successfully.<br>
* <br>
* <b>Note:</b> A tenant cannot be deleted if it has any data in it. To delete a non-empty tenant, you must
* first use a clear operation to delete all of its keys.<br>
* <br>
* This is a convenience method that generates the tenant name by packing a {@code Tuple}.
*
* @param db The database used to create a transaction for deleting the tenant.
* @param tenantName The name of the tenant being deleted, as a Tuple.
* @return a {@code CompletableFuture} that when set without error will indicate that the tenant has
* been deleted.
*/
public static CompletableFuture<Void> deleteTenant(Database db, Tuple tenantName) {
return deleteTenant(db, tenantName.pack());
}
private TenantManagement() {}
}
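
Both bindings implement tenant creation the same way under the hood: a write into the tenant map range of the special key space. For illustration, a minimal Python sketch of the equivalent operation (the helper name `create_tenant_raw` is hypothetical; the shipped Python helper appears later in `fdb/tenant_management.py`):

```python
import fdb
fdb.api_version(710)

TENANT_MAP_PREFIX = b'\xff\xff/management/tenant_map/'

@fdb.transactional
def create_tenant_raw(tr, tenant_name):
    # Writes to the special key space must be explicitly enabled
    # on the transaction before mutating the tenant map.
    tr.options.set_special_key_space_enable_writes()
    tr[TENANT_MAP_PREFIX + tenant_name] = b''
```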

View File

@ -30,6 +30,7 @@ import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.function.Function;
@ -42,6 +43,7 @@ import com.apple.foundationdb.KeyArrayResult;
import com.apple.foundationdb.MutationType;
import com.apple.foundationdb.Range;
import com.apple.foundationdb.StreamingMode;
import com.apple.foundationdb.TenantManagement;
import com.apple.foundationdb.Transaction;
import com.apple.foundationdb.async.AsyncUtil;
import com.apple.foundationdb.tuple.ByteArrayUtil;
@ -184,7 +186,7 @@ public class AsyncStackTester {
return AsyncUtil.DONE;
}
else if(op == StackOperation.RESET) {
inst.context.newTransaction();
inst.context.resetTransaction();
return AsyncUtil.DONE;
}
else if(op == StackOperation.CANCEL) {
@ -332,9 +334,9 @@ public class AsyncStackTester {
final Transaction oldTr = inst.tr;
CompletableFuture<Void> f = oldTr.onError(err).whenComplete((tr, t) -> {
if(t != null) {
inst.context.newTransaction(oldTr); // Other bindings allow reuse of non-retryable transactions, so we need to emulate that behavior.
inst.context.resetTransaction(oldTr); // Other bindings allow reuse of non-retryable transactions, so we need to emulate that behavior.
}
else if(!inst.setTransaction(oldTr, tr)) {
else if(!inst.replaceTransaction(oldTr, tr)) {
tr.close();
}
}).thenApply(v -> null);
@ -469,6 +471,28 @@ public class AsyncStackTester {
inst.push(ByteBuffer.allocate(8).order(ByteOrder.BIG_ENDIAN).putDouble(value).array());
}, FDB.DEFAULT_EXECUTOR);
}
else if (op == StackOperation.TENANT_CREATE) {
return inst.popParam().thenAcceptAsync(param -> {
byte[] tenantName = (byte[])param;
inst.push(TenantManagement.createTenant(inst.context.db, tenantName));
}, FDB.DEFAULT_EXECUTOR);
}
else if (op == StackOperation.TENANT_DELETE) {
return inst.popParam().thenAcceptAsync(param -> {
byte[] tenantName = (byte[])param;
inst.push(TenantManagement.deleteTenant(inst.context.db, tenantName));
}, FDB.DEFAULT_EXECUTOR);
}
else if (op == StackOperation.TENANT_SET_ACTIVE) {
return inst.popParam().thenAcceptAsync(param -> {
byte[] tenantName = (byte[])param;
inst.context.setTenant(Optional.of(tenantName));
}, FDB.DEFAULT_EXECUTOR);
}
else if (op == StackOperation.TENANT_CLEAR_ACTIVE) {
inst.context.setTenant(Optional.empty());
return AsyncUtil.DONE;
}
else if(op == StackOperation.UNIT_TESTS) {
inst.context.db.options().setLocationCacheSize(100001);
return inst.context.db.runAsync(tr -> {
@ -554,7 +578,7 @@ public class AsyncStackTester {
private static CompletableFuture<Void> executeMutation(final Instruction inst, Function<Transaction, CompletableFuture<Void>> r) {
// run this with a retry loop
return inst.tcx.runAsync(r).thenRunAsync(() -> {
if(inst.isDatabase)
if(inst.isDatabase || inst.isTenant)
inst.push("RESULT_NOT_PRESENT".getBytes());
}, FDB.DEFAULT_EXECUTOR);
}

View File

@ -25,6 +25,7 @@ import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.CompletableFuture;
@ -35,6 +36,7 @@ import com.apple.foundationdb.FDBException;
import com.apple.foundationdb.KeySelector;
import com.apple.foundationdb.Range;
import com.apple.foundationdb.StreamingMode;
import com.apple.foundationdb.Tenant;
import com.apple.foundationdb.Transaction;
import com.apple.foundationdb.tuple.ByteArrayUtil;
import com.apple.foundationdb.tuple.Tuple;
@ -42,15 +44,27 @@ import com.apple.foundationdb.tuple.Tuple;
abstract class Context implements Runnable, AutoCloseable {
final Stack stack = new Stack();
final Database db;
Optional<Tenant> tenant = Optional.empty();
final String preStr;
int instructionIndex = 0;
KeySelector nextKey, endKey;
Long lastVersion = null;
private static class TransactionState {
public Transaction transaction;
public Optional<Tenant> tenant;
public TransactionState(Transaction transaction, Optional<Tenant> tenant) {
this.transaction = transaction;
this.tenant = tenant;
}
}
private String trName;
private List<Thread> children = new LinkedList<>();
private static Map<String, Transaction> transactionMap = new HashMap<>();
private static Map<String, TransactionState> transactionMap = new HashMap<>();
private static Map<Transaction, AtomicInteger> transactionRefCounts = new HashMap<>();
private static Map<byte[], Tenant> tenantMap = new HashMap<>();
Context(Database db, byte[] prefix) {
this.db = db;
@ -86,15 +100,24 @@ abstract class Context implements Runnable, AutoCloseable {
}
}
public synchronized void setTenant(Optional<byte[]> tenantName) {
if (tenantName.isPresent()) {
tenant = Optional.of(tenantMap.computeIfAbsent(tenantName.get(), tn -> db.openTenant(tenantName.get())));
}
else {
tenant = Optional.empty();
}
}
public static synchronized void addTransactionReference(Transaction tr) {
transactionRefCounts.computeIfAbsent(tr, x -> new AtomicInteger(0)).incrementAndGet();
}
private static synchronized Transaction getTransaction(String trName) {
Transaction tr = transactionMap.get(trName);
assert tr != null : "Null transaction";
addTransactionReference(tr);
return tr;
TransactionState state = transactionMap.get(trName);
assert state != null : "Null transaction";
addTransactionReference(state.transaction);
return state.transaction;
}
public Transaction getCurrentTransaction() {
@ -105,59 +128,78 @@ abstract class Context implements Runnable, AutoCloseable {
if(tr != null) {
AtomicInteger count = transactionRefCounts.get(tr);
if(count.decrementAndGet() == 0) {
assert !transactionMap.containsValue(tr);
transactionRefCounts.remove(tr);
tr.close();
}
}
}
private static synchronized void updateTransaction(String trName, Transaction tr) {
releaseTransaction(transactionMap.put(trName, tr));
addTransactionReference(tr);
}
private static synchronized boolean updateTransaction(String trName, Transaction oldTr, Transaction newTr) {
boolean added;
if(oldTr == null) {
added = (transactionMap.putIfAbsent(trName, newTr) == null);
private static Transaction createTransaction(Database db, Optional<Tenant> creatingTenant) {
if (creatingTenant.isPresent()) {
return creatingTenant.get().createTransaction();
}
else {
added = transactionMap.replace(trName, oldTr, newTr);
return db.createTransaction();
}
}
private static synchronized boolean newTransaction(Database db, Optional<Tenant> tenant, String trName, boolean allowReplace) {
TransactionState oldState = transactionMap.get(trName);
if (oldState != null) {
releaseTransaction(oldState.transaction);
}
else if (!allowReplace) {
return false;
}
if(added) {
TransactionState newState = new TransactionState(createTransaction(db, tenant), tenant);
transactionMap.put(trName, newState);
addTransactionReference(newState.transaction);
return true;
}
private static synchronized boolean replaceTransaction(Database db, String trName, Transaction oldTr, Transaction newTr) {
TransactionState trState = transactionMap.get(trName);
assert trState != null : "Null transaction";
if(oldTr == null || trState.transaction == oldTr) {
if(newTr == null) {
newTr = createTransaction(db, trState.tenant);
}
releaseTransaction(trState.transaction);
addTransactionReference(newTr);
releaseTransaction(oldTr);
trState.transaction = newTr;
return true;
}
return false;
}
public void updateCurrentTransaction(Transaction tr) {
updateTransaction(trName, tr);
}
public boolean updateCurrentTransaction(Transaction oldTr, Transaction newTr) {
return updateTransaction(trName, oldTr, newTr);
}
public void newTransaction() {
Transaction tr = db.createTransaction();
updateCurrentTransaction(tr);
newTransaction(db, tenant, trName, true);
}
public void newTransaction(Transaction oldTr) {
Transaction newTr = db.createTransaction();
if(!updateCurrentTransaction(oldTr, newTr)) {
newTr.close();
}
public void replaceTransaction(Transaction tr) {
replaceTransaction(db, trName, null, tr);
}
public boolean replaceTransaction(Transaction oldTr, Transaction newTr) {
return replaceTransaction(db, trName, oldTr, newTr);
}
public void resetTransaction() {
replaceTransaction(db, trName, null, null);
}
public boolean resetTransaction(Transaction oldTr) {
return replaceTransaction(db, trName, oldTr, null);
}
public void switchTransaction(byte[] rawTrName) {
trName = ByteArrayUtil.printable(rawTrName);
newTransaction(null);
newTransaction(db, tenant, trName, false);
}
abstract void executeOperations() throws Throwable;
@ -224,8 +266,12 @@ abstract class Context implements Runnable, AutoCloseable {
@Override
public void close() {
for(Transaction tr : transactionMap.values()) {
tr.close();
for(TransactionState tr : transactionMap.values()) {
tr.transaction.close();
}
for(Tenant tenant : tenantMap.values()) {
tenant.close();
}
}
}

View File

@ -33,11 +33,13 @@ import com.apple.foundationdb.tuple.Tuple;
class Instruction extends Stack {
private static final String SUFFIX_SNAPSHOT = "_SNAPSHOT";
private static final String SUFFIX_DATABASE = "_DATABASE";
private static final String SUFFIX_TENANT = "_TENANT";
final String op;
final Tuple tokens;
final Context context;
final boolean isDatabase;
final boolean isTenant;
final boolean isSnapshot;
final Transaction tr;
final ReadTransaction readTr;
@ -49,14 +51,23 @@ class Instruction extends Stack {
this.tokens = tokens;
String fullOp = tokens.getString(0);
isDatabase = fullOp.endsWith(SUFFIX_DATABASE);
boolean isDatabaseLocal = fullOp.endsWith(SUFFIX_DATABASE);
isTenant = fullOp.endsWith(SUFFIX_TENANT);
isSnapshot = fullOp.endsWith(SUFFIX_SNAPSHOT);
if(isDatabase) {
if(isDatabaseLocal) {
tr = null;
readTr = null;
op = fullOp.substring(0, fullOp.length() - SUFFIX_DATABASE.length());
}
else if(isTenant) {
tr = null;
readTr = null;
op = fullOp.substring(0, fullOp.length() - SUFFIX_TENANT.length());
if (!context.tenant.isPresent()) {
isDatabaseLocal = true;
}
}
else if(isSnapshot) {
tr = context.getCurrentTransaction();
readTr = tr.snapshot();
@ -68,22 +79,24 @@ class Instruction extends Stack {
op = fullOp;
}
tcx = isDatabase ? context.db : tr;
readTcx = isDatabase ? context.db : readTr;
isDatabase = isDatabaseLocal;
tcx = isDatabase ? context.db : isTenant ? context.tenant.get() : tr;
readTcx = isDatabase ? context.db : isTenant ? context.tenant.get() : readTr;
}
boolean setTransaction(Transaction newTr) {
if(!isDatabase) {
context.updateCurrentTransaction(newTr);
boolean replaceTransaction(Transaction newTr) {
if(!isDatabase && !isTenant) {
context.replaceTransaction(newTr);
return true;
}
return false;
}
boolean setTransaction(Transaction oldTr, Transaction newTr) {
if(!isDatabase) {
return context.updateCurrentTransaction(oldTr, newTr);
boolean replaceTransaction(Transaction oldTr, Transaction newTr) {
if(!isDatabase && !isTenant) {
return context.replaceTransaction(oldTr, newTr);
}
return false;

View File

@ -73,5 +73,11 @@ enum StackOperation {
DECODE_DOUBLE,
UNIT_TESTS, /* Possibly unimplemented */
// Tenants
TENANT_CREATE,
TENANT_DELETE,
TENANT_SET_ACTIVE,
TENANT_CLEAR_ACTIVE,
LOG_STACK
}

View File

@ -30,6 +30,7 @@ import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.function.Function;
@ -44,11 +45,13 @@ import com.apple.foundationdb.LocalityUtil;
import com.apple.foundationdb.MutationType;
import com.apple.foundationdb.Range;
import com.apple.foundationdb.StreamingMode;
import com.apple.foundationdb.TenantManagement;
import com.apple.foundationdb.Transaction;
import com.apple.foundationdb.async.AsyncIterable;
import com.apple.foundationdb.async.AsyncUtil;
import com.apple.foundationdb.async.CloseableAsyncIterator;
import com.apple.foundationdb.tuple.ByteArrayUtil;
import com.apple.foundationdb.Tenant;
import com.apple.foundationdb.tuple.Tuple;
/**
@ -197,7 +200,7 @@ public class StackTester {
inst.tr.options().setNextWriteNoWriteConflictRange();
}
else if(op == StackOperation.RESET) {
inst.context.newTransaction();
inst.context.resetTransaction();
}
else if(op == StackOperation.CANCEL) {
inst.tr.cancel();
@ -300,12 +303,12 @@ public class StackTester {
try {
Transaction tr = inst.tr.onError(err).join();
if(!inst.setTransaction(tr)) {
if(!inst.replaceTransaction(tr)) {
tr.close();
}
}
catch(Throwable t) {
inst.context.newTransaction(); // Other bindings allow reuse of non-retryable transactions, so we need to emulate that behavior.
inst.context.resetTransaction(); // Other bindings allow reuse of non-retryable transactions, so we need to emulate that behavior.
throw t;
}
@ -418,6 +421,21 @@ public class StackTester {
double value = ((Number)param).doubleValue();
inst.push(ByteBuffer.allocate(8).order(ByteOrder.BIG_ENDIAN).putDouble(value).array());
}
else if (op == StackOperation.TENANT_CREATE) {
byte[] tenantName = (byte[])inst.popParam().join();
inst.push(TenantManagement.createTenant(inst.context.db, tenantName));
}
else if (op == StackOperation.TENANT_DELETE) {
byte[] tenantName = (byte[])inst.popParam().join();
inst.push(TenantManagement.deleteTenant(inst.context.db, tenantName));
}
else if (op == StackOperation.TENANT_SET_ACTIVE) {
byte[] tenantName = (byte[])inst.popParam().join();
inst.context.setTenant(Optional.of(tenantName));
}
else if (op == StackOperation.TENANT_CLEAR_ACTIVE) {
inst.context.setTenant(Optional.empty());
}
else if(op == StackOperation.UNIT_TESTS) {
try {
inst.context.db.options().setLocationCacheSize(100001);
@ -490,6 +508,7 @@ public class StackTester {
testWatches(inst.context.db);
testLocality(inst.context.db);
testTenantTupleNames(inst.context.db);
}
catch(Exception e) {
throw new RuntimeException("Unit tests failed: " + e.getMessage());
@ -579,7 +598,7 @@ public class StackTester {
private static void executeMutation(Instruction inst, Function<Transaction, Void> r) {
// run this with a retry loop (and commit)
inst.tcx.run(r);
if(inst.isDatabase)
if(inst.isDatabase || inst.isTenant)
inst.push("RESULT_NOT_PRESENT".getBytes());
}
@ -741,6 +760,35 @@ public class StackTester {
});
}
private static void testTenantTupleNames(Database db) {
try {
TenantManagement.createTenant(db, Tuple.from("tenant")).join();
Tenant tenant = db.openTenant(Tuple.from("tenant"));
tenant.run(tr -> {
tr.set(Tuple.from("hello").pack(), Tuple.from("world").pack());
return null;
});
String output = tenant.read(tr -> {
byte[] result = tr.get(Tuple.from("hello").pack()).join();
return Tuple.fromBytes(result).getString(0);
});
assert output.equals("world");
tenant.run(tr -> {
tr.clear(Tuple.from("hello").pack());
return null;
});
TenantManagement.deleteTenant(db, Tuple.from("tenant")).join();
}
catch(Exception e) {
e.printStackTrace();
}
}
/**
* Run a stack-machine based test.
*

View File

@ -5,6 +5,7 @@ set(SRCS
fdb/locality.py
fdb/six.py
fdb/subspace_impl.py
fdb/tenant_management.py
fdb/tuple.py
README.rst
MANIFEST.in)

View File

@ -88,6 +88,7 @@ def api_version(ver):
'predicates',
'Future',
'Database',
'Tenant',
'Transaction',
'KeyValue',
'KeySelector',
@ -99,6 +100,9 @@ def api_version(ver):
_add_symbols(fdb.impl, list)
if ver >= 710:
import fdb.tenant_management
if ver < 610:
globals()["init"] = getattr(fdb.impl, "init")
globals()["open"] = getattr(fdb.impl, "open_v609")

View File

@ -34,6 +34,7 @@ import traceback
import fdb
from fdb import six
from fdb.tuple import pack, unpack
_network_thread = None
_network_thread_reentrant_lock = threading.RLock()
@ -198,9 +199,10 @@ def transactional(*tr_args, **tr_kwargs):
one of two actions, depending on the type of the parameter passed
to the function at call time.
If given a Database, a Transaction will be created and passed into
the wrapped code in place of the Database. After the function is
complete, the newly created transaction will be committed.
If given a Database or Tenant, a Transaction will be created and
passed into the wrapped code in place of the Database or Tenant.
After the function is complete, the newly created transaction
will be committed.
It is important to note that the wrapped method may be called
multiple times in the event of a commit failure, until the commit
@ -943,128 +945,114 @@ class FormerFuture(_FDBBase):
except:
pass
class Database(_FDBBase):
def __init__(self, dpointer):
self.dpointer = dpointer
self.options = _DatabaseOptions(self)
def __del__(self):
# print('Destroying database 0x%x' % self.dpointer)
self.capi.fdb_database_destroy(self.dpointer)
class _TransactionCreator(_FDBBase):
def get(self, key):
return Database.__database_getitem(self, key)
return _TransactionCreator.__creator_getitem(self, key)
def __getitem__(self, key):
if isinstance(key, slice):
return self.get_range(key.start, key.stop, reverse=(key.step == -1))
return Database.__database_getitem(self, key)
return _TransactionCreator.__creator_getitem(self, key)
def get_key(self, key_selector):
return Database.__database_get_key(self, key_selector)
return _TransactionCreator.__creator_get_key(self, key_selector)
def get_range(self, begin, end, limit=0, reverse=False, streaming_mode=StreamingMode.want_all):
return Database.__database_get_range(self, begin, end, limit, reverse, streaming_mode)
return _TransactionCreator.__creator_get_range(self, begin, end, limit, reverse, streaming_mode)
def get_range_startswith(self, prefix, *args, **kwargs):
return Database.__database_get_range_startswith(self, prefix, *args, **kwargs)
return _TransactionCreator.__creator_get_range_startswith(self, prefix, *args, **kwargs)
def set(self, key, value):
Database.__database_setitem(self, key, value)
_TransactionCreator.__creator_setitem(self, key, value)
def __setitem__(self, key, value):
Database.__database_setitem(self, key, value)
_TransactionCreator.__creator_setitem(self, key, value)
def clear(self, key):
Database.__database_delitem(self, key)
_TransactionCreator.__creator_delitem(self, key)
def clear_range(self, begin, end):
Database.__database_delitem(self, slice(begin, end))
_TransactionCreator.__creator_delitem(self, slice(begin, end))
def __delitem__(self, key_or_slice):
Database.__database_delitem(self, key_or_slice)
_TransactionCreator.__creator_delitem(self, key_or_slice)
def clear_range_startswith(self, prefix):
Database.__database_clear_range_startswith(self, prefix)
_TransactionCreator.__creator_clear_range_startswith(self, prefix)
def get_and_watch(self, key):
return Database.__database_get_and_watch(self, key)
return _TransactionCreator.__creator_get_and_watch(self, key)
def set_and_watch(self, key, value):
return Database.__database_set_and_watch(self, key, value)
return _TransactionCreator.__creator_set_and_watch(self, key, value)
def clear_and_watch(self, key):
return Database.__database_clear_and_watch(self, key)
return _TransactionCreator.__creator_clear_and_watch(self, key)
def create_transaction(self):
pointer = ctypes.c_void_p()
self.capi.fdb_database_create_transaction(self.dpointer, ctypes.byref(pointer))
return Transaction(pointer.value, self)
def _set_option(self, option, param, length):
self.capi.fdb_database_set_option(self.dpointer, option, param, length)
pass
def _atomic_operation(self, opcode, key, param):
Database.__database_atomic_operation(self, opcode, key, param)
_TransactionCreator.__creator_atomic_operation(self, opcode, key, param)
#### Transaction implementations ####
@staticmethod
@transactional
def __database_getitem(tr, key):
def __creator_getitem(tr, key):
return tr[key].value
@staticmethod
@transactional
def __database_get_key(tr, key_selector):
def __creator_get_key(tr, key_selector):
return tr.get_key(key_selector).value
@staticmethod
@transactional
def __database_get_range(tr, begin, end, limit, reverse, streaming_mode):
def __creator_get_range(tr, begin, end, limit, reverse, streaming_mode):
return tr.get_range(begin, end, limit, reverse, streaming_mode).to_list()
@staticmethod
@transactional
def __database_get_range_startswith(tr, prefix, *args, **kwargs):
def __creator_get_range_startswith(tr, prefix, *args, **kwargs):
return tr.get_range_startswith(prefix, *args, **kwargs).to_list()
@staticmethod
@transactional
def __database_setitem(tr, key, value):
def __creator_setitem(tr, key, value):
tr[key] = value
@staticmethod
@transactional
def __database_clear_range_startswith(tr, prefix):
def __creator_clear_range_startswith(tr, prefix):
tr.clear_range_startswith(prefix)
@staticmethod
@transactional
def __database_get_and_watch(tr, key):
def __creator_get_and_watch(tr, key):
v = tr.get(key)
return v, tr.watch(key)
@staticmethod
@transactional
def __database_set_and_watch(tr, key, value):
def __creator_set_and_watch(tr, key, value):
tr.set(key, value)
return tr.watch(key)
@staticmethod
@transactional
def __database_clear_and_watch(tr, key):
def __creator_clear_and_watch(tr, key):
del tr[key]
return tr.watch(key)
@staticmethod
@transactional
def __database_delitem(tr, key_or_slice):
def __creator_delitem(tr, key_or_slice):
del tr[key_or_slice]
@staticmethod
@transactional
def __database_atomic_operation(tr, opcode, key, param):
def __creator_atomic_operation(tr, opcode, key, param):
tr._atomic_operation(opcode, key, param)
# Asynchronous transactions
@ -1074,11 +1062,11 @@ class Database(_FDBBase):
From = asyncio.From
coroutine = asyncio.coroutine
class Database:
class TransactionCreator:
@staticmethod
@transactional
@coroutine
def __database_getitem(tr, key):
def __creator_getitem(tr, key):
# raise Return(( yield From( tr[key] ) ))
raise Return(tr[key])
yield None
@ -1086,26 +1074,26 @@ class Database(_FDBBase):
@staticmethod
@transactional
@coroutine
def __database_get_key(tr, key_selector):
def __creator_get_key(tr, key_selector):
raise Return(tr.get_key(key_selector))
yield None
@staticmethod
@transactional
@coroutine
def __database_get_range(tr, begin, end, limit, reverse, streaming_mode):
def __creator_get_range(tr, begin, end, limit, reverse, streaming_mode):
raise Return((yield From(tr.get_range(begin, end, limit, reverse, streaming_mode).to_list())))
@staticmethod
@transactional
@coroutine
def __database_get_range_startswith(tr, prefix, *args, **kwargs):
def __creator_get_range_startswith(tr, prefix, *args, **kwargs):
raise Return((yield From(tr.get_range_startswith(prefix, *args, **kwargs).to_list())))
@staticmethod
@transactional
@coroutine
def __database_setitem(tr, key, value):
def __creator_setitem(tr, key, value):
tr[key] = value
raise Return()
yield None
@ -1113,7 +1101,7 @@ class Database(_FDBBase):
@staticmethod
@transactional
@coroutine
def __database_clear_range_startswith(tr, prefix):
def __creator_clear_range_startswith(tr, prefix):
tr.clear_range_startswith(prefix)
raise Return()
yield None
@ -1121,7 +1109,7 @@ class Database(_FDBBase):
@staticmethod
@transactional
@coroutine
def __database_get_and_watch(tr, key):
def __creator_get_and_watch(tr, key):
v = tr.get(key)
raise Return(v, tr.watch(key))
yield None
@ -1129,7 +1117,7 @@ class Database(_FDBBase):
@staticmethod
@transactional
@coroutine
def __database_set_and_watch(tr, key, value):
def __creator_set_and_watch(tr, key, value):
tr.set(key, value)
raise Return(tr.watch(key))
yield None
@ -1137,7 +1125,7 @@ class Database(_FDBBase):
@staticmethod
@transactional
@coroutine
def __database_clear_and_watch(tr, key):
def __creator_clear_and_watch(tr, key):
del tr[key]
raise Return(tr.watch(key))
yield None
@ -1145,7 +1133,7 @@ class Database(_FDBBase):
@staticmethod
@transactional
@coroutine
def __database_delitem(tr, key_or_slice):
def __creator_delitem(tr, key_or_slice):
del tr[key_or_slice]
raise Return()
yield None
@ -1153,11 +1141,55 @@ class Database(_FDBBase):
@staticmethod
@transactional
@coroutine
def __database_atomic_operation(tr, opcode, key, param):
def __creator_atomic_operation(tr, opcode, key, param):
tr._atomic_operation(opcode, key, param)
raise Return()
yield None
return Database
return TransactionCreator
def process_tenant_name(name):
if isinstance(name, tuple):
return pack(name)
elif isinstance(name, bytes):
return name
else:
raise TypeError('Tenant name must be of type ' + bytes.__name__ + ' or of type ' + tuple.__name__)
class Database(_TransactionCreator):
def __init__(self, dpointer):
self.dpointer = dpointer
self.options = _DatabaseOptions(self)
def __del__(self):
# print('Destroying database 0x%x' % self.dpointer)
self.capi.fdb_database_destroy(self.dpointer)
def _set_option(self, option, param, length):
self.capi.fdb_database_set_option(self.dpointer, option, param, length)
def open_tenant(self, name):
tname = process_tenant_name(name)
pointer = ctypes.c_void_p()
self.capi.fdb_database_open_tenant(self.dpointer, tname, len(tname), ctypes.byref(pointer))
return Tenant(pointer.value)
def create_transaction(self):
pointer = ctypes.c_void_p()
self.capi.fdb_database_create_transaction(self.dpointer, ctypes.byref(pointer))
return Transaction(pointer.value, self)
class Tenant(_TransactionCreator):
def __init__(self, tpointer):
self.tpointer = tpointer
def __del__(self):
self.capi.fdb_tenant_destroy(self.tpointer)
def create_transaction(self):
pointer = ctypes.c_void_p()
self.capi.fdb_tenant_create_transaction(self.tpointer, ctypes.byref(pointer))
return Transaction(pointer.value, self)
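
Since `Database` and `Tenant` both derive from `_TransactionCreator`, the implicit-retry convenience operations behave identically on either object; only the scope of the resulting transaction differs. A brief sketch, assuming API version 710 and a tenant that already exists:

```python
import fdb
fdb.api_version(710)

db = fdb.open()
tenant = db.open_tenant(b'my_tenant')  # assumed to have been created earlier

# Each statement below runs inside its own implicit retry loop:
db[b'k'] = b'database value'      # transaction created on the database
tenant[b'k'] = b'tenant value'    # transaction scoped to the tenant's prefix
assert tenant[b'k'] == b'tenant value'
```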
fill_operations()
@ -1458,6 +1490,10 @@ def init_c_api():
_capi.fdb_database_destroy.argtypes = [ctypes.c_void_p]
_capi.fdb_database_destroy.restype = None
_capi.fdb_database_open_tenant.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.POINTER(ctypes.c_void_p)]
_capi.fdb_database_open_tenant.restype = ctypes.c_int
_capi.fdb_database_open_tenant.errcheck = check_error_code
_capi.fdb_database_create_transaction.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p)]
_capi.fdb_database_create_transaction.restype = ctypes.c_int
_capi.fdb_database_create_transaction.errcheck = check_error_code
@ -1466,6 +1502,13 @@ def init_c_api():
_capi.fdb_database_set_option.restype = ctypes.c_int
_capi.fdb_database_set_option.errcheck = check_error_code
_capi.fdb_tenant_destroy.argtypes = [ctypes.c_void_p]
_capi.fdb_tenant_destroy.restype = None
_capi.fdb_tenant_create_transaction.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p)]
_capi.fdb_tenant_create_transaction.restype = ctypes.c_int
_capi.fdb_tenant_create_transaction.errcheck = check_error_code
_capi.fdb_transaction_destroy.argtypes = [ctypes.c_void_p]
_capi.fdb_transaction_destroy.restype = None
@ -1686,10 +1729,10 @@ def init(event_model=None):
raise asyncio.Return(self)
return it()
FDBRange.iterate = iterate
AT = Database.declare_asynchronous_transactions()
AT = _TransactionCreator.declare_asynchronous_transactions()
for name in dir(AT):
if name.startswith("_Database__database_"):
setattr(Database, name, getattr(AT, name))
if name.startswith("_TransactionCreator__creator_"):
setattr(_TransactionCreator, name, getattr(AT, name))
def to_list(self):
if self._mode == StreamingMode.iterator:

View File

@ -0,0 +1,95 @@
#
# tenant_management.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# FoundationDB Python API
"""Documentation for this API can be found at
https://apple.github.io/foundationdb/api-python.html"""
from fdb import impl as _impl
_tenant_map_prefix = b'\xff\xff/management/tenant_map/'
# If the existence_check_marker is an empty list, then check whether the tenant exists.
# After the check, append an item to the existence_check_marker list so that subsequent
# calls to this function will not perform the existence check.
#
# If the existence_check_marker is a non-empty list, return None.
def _check_tenant_existence(tr, key, existence_check_marker, force_maybe_committed):
    if not existence_check_marker:
        existing_tenant = tr[key].wait()
        existence_check_marker.append(None)
        if force_maybe_committed:
            raise _impl.FDBError(1021)  # maybe_committed
        return existing_tenant is not None

    return None
# Attempt to create a tenant in the cluster. If existence_check_marker is an empty
# list, then this function will check if the tenant already exists and fail if it does.
# Once the existence check is completed, it will not be done again if this function
# retries. As a result, this function may return successfully if the tenant is created
# by someone else concurrently. This behavior allows the operation to be idempotent with
# respect to retries.
#
# If the existence_check_marker is a non-empty list, then the existence check is skipped.
@_impl.transactional
def _create_tenant_impl(tr, tenant_name, existence_check_marker, force_existence_check_maybe_committed=False):
tr.options.set_special_key_space_enable_writes()
key = b'%s%s' % (_tenant_map_prefix, tenant_name)
if _check_tenant_existence(tr, key, existence_check_marker, force_existence_check_maybe_committed) is True:
raise _impl.FDBError(2132) # tenant_already_exists
tr[key] = b''
# Attempt to delete a tenant from the cluster. If existence_check_marker is an empty
# list, then this function will check if the tenant already exists and fail if it does
# not. Once the existence check is completed, it will not be done again if this function
# retries. As a result, this function may return successfully if the tenant is deleted
# by someone else concurrently. This behavior allows the operation to be idempotent with
# respect to retries.
#
# If the existence_check_marker is a non-empty list, then the existence check is skipped.
@_impl.transactional
def _delete_tenant_impl(tr, tenant_name, existence_check_marker, force_existence_check_maybe_committed=False):
tr.options.set_special_key_space_enable_writes()
key = b'%s%s' % (_tenant_map_prefix, tenant_name)
if _check_tenant_existence(tr, key, existence_check_marker, force_existence_check_maybe_committed) is False:
raise _impl.FDBError(2131) # tenant_not_found
del tr[key]
def create_tenant(db_or_tr, tenant_name):
tenant_name = _impl.process_tenant_name(tenant_name)
# Only perform the existence check when run using a database
# Callers using a transaction are expected to check existence themselves if required
existence_check_marker = [] if not isinstance(db_or_tr, _impl.TransactionRead) else [None]
_create_tenant_impl(db_or_tr, tenant_name, existence_check_marker)
def delete_tenant(db_or_tr, tenant_name):
tenant_name = _impl.process_tenant_name(tenant_name)
# Only perform the existence check when run using a database
# Callers using a transaction are expected to check existence themselves if required
existence_check_marker = [] if not isinstance(db_or_tr, _impl.TransactionRead) else [None]
_delete_tenant_impl(db_or_tr, tenant_name, existence_check_marker)
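
A hedged usage sketch of the helpers above, assuming a reachable cluster (at API version 710 and above, `fdb.tenant_management` is imported automatically):

```python
import fdb
fdb.api_version(710)

db = fdb.open()

# create_tenant/delete_tenant perform the existence check only when
# given a Database; passing a Transaction skips the check.
fdb.tenant_management.create_tenant(db, b'example_tenant')
# ... read and write data via db.open_tenant(b'example_tenant') ...
fdb.tenant_management.delete_tenant(db, b'example_tenant')  # tenant must be empty
```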

View File

@ -233,7 +233,8 @@ def suspend(logger):
port = address.split(':')[1]
logger.debug("Port: {}".format(port))
# use the port number to find the exact fdb process we are connecting to
pinfo = list(filter(lambda x: port in x, pinfos))
# child processes like 'fdbserver -r flowprocess' do not provide `datadir` on the command line
pinfo = list(filter(lambda x: port in x and 'datadir' in x, pinfos))
assert len(pinfo) == 1
pid = pinfo[0].split(' ')[0]
logger.debug("Pid: {}".format(pid))

View File

@ -0,0 +1,187 @@
#!/usr/bin/python
#
# tenant_tests.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import fdb
import sys
import json
from fdb.tuple import pack
if __name__ == '__main__':
fdb.api_version(710)
def cleanup_tenant(db, tenant_name):
try:
tenant = db.open_tenant(tenant_name)
del tenant[:]
fdb.tenant_management.delete_tenant(db, tenant_name)
except fdb.FDBError as e:
if e.code == 2131: # tenant not found
pass
else:
raise
def test_tenant_tuple_name(db):
tuplename = (b'test', b'level', b'hierarchy', 3, 1.24, 'str')
cleanup_tenant(db, tuplename)
fdb.tenant_management.create_tenant(db, tuplename)
tenant = db.open_tenant(tuplename)
tenant[b'foo'] = b'bar'
assert tenant[b'foo'] == b'bar'
del tenant[b'foo']
fdb.tenant_management.delete_tenant(db, tuplename)
def test_tenant_operations(db):
cleanup_tenant(db, b'tenant1')
cleanup_tenant(db, b'tenant2')
fdb.tenant_management.create_tenant(db, b'tenant1')
fdb.tenant_management.create_tenant(db, b'tenant2')
tenant1 = db.open_tenant(b'tenant1')
tenant2 = db.open_tenant(b'tenant2')
db[b'tenant_test_key'] = b'no_tenant'
tenant1[b'tenant_test_key'] = b'tenant1'
tenant2[b'tenant_test_key'] = b'tenant2'
tenant1_entry = db[b'\xff\xff/management/tenant_map/tenant1']
tenant1_json = json.loads(tenant1_entry)
prefix1 = tenant1_json['prefix'].encode('utf8')
tenant2_entry = db[b'\xff\xff/management/tenant_map/tenant2']
tenant2_json = json.loads(tenant2_entry)
prefix2 = tenant2_json['prefix'].encode('utf8')
assert tenant1[b'tenant_test_key'] == b'tenant1'
assert db[prefix1 + b'tenant_test_key'] == b'tenant1'
assert tenant2[b'tenant_test_key'] == b'tenant2'
assert db[prefix2 + b'tenant_test_key'] == b'tenant2'
assert db[b'tenant_test_key'] == b'no_tenant'
tr1 = tenant1.create_transaction()
try:
    del tr1[:]
    tr1.commit().wait()
except fdb.FDBError as e:
    tr1.on_error(e).wait()
assert tenant1[b'tenant_test_key'] == None
assert db[prefix1 + b'tenant_test_key'] == None
assert tenant2[b'tenant_test_key'] == b'tenant2'
assert db[prefix2 + b'tenant_test_key'] == b'tenant2'
assert db[b'tenant_test_key'] == b'no_tenant'
fdb.tenant_management.delete_tenant(db, b'tenant1')
try:
tenant1[b'tenant_test_key']
assert False
except fdb.FDBError as e:
assert e.code == 2131 # tenant not found
del tenant2[:]
fdb.tenant_management.delete_tenant(db, b'tenant2')
assert db[prefix1 + b'tenant_test_key'] == None
assert db[prefix2 + b'tenant_test_key'] == None
assert db[b'tenant_test_key'] == b'no_tenant'
del db[b'tenant_test_key']
assert db[b'tenant_test_key'] == None
def test_tenant_operation_retries(db):
cleanup_tenant(db, b'tenant1')
cleanup_tenant(db, b'tenant2')
# Test that the tenant creation only performs the existence check once
fdb.tenant_management._create_tenant_impl(db, b'tenant1', [], force_existence_check_maybe_committed=True)
# An attempt to create the tenant again should fail
try:
fdb.tenant_management.create_tenant(db, b'tenant1')
assert False
except fdb.FDBError as e:
assert e.code == 2132 # tenant already exists
# Using a transaction skips the existence check
tr = db.create_transaction()
fdb.tenant_management.create_tenant(tr, b'tenant1')
# Test that a concurrent tenant creation doesn't interfere with the existence check logic
tr = db.create_transaction()
existence_check_marker = []
fdb.tenant_management._create_tenant_impl(tr, b'tenant2', existence_check_marker)
fdb.tenant_management.create_tenant(db, b'tenant2')
tr = db.create_transaction()
try:
fdb.tenant_management._create_tenant_impl(tr, b'tenant2', existence_check_marker)
tr.commit().wait()
except fdb.FDBError as e:
tr.on_error(e).wait()
# Test that tenant deletion only performs the existence check once
fdb.tenant_management._delete_tenant_impl(db, b'tenant1', [], force_existence_check_maybe_committed=True)
# An attempt to delete the tenant again should fail
try:
fdb.tenant_management.delete_tenant(db, b'tenant1')
assert False
except fdb.FDBError as e:
assert e.code == 2131 # tenant not found
# Using a transaction skips the existence check
tr = db.create_transaction()
fdb.tenant_management.delete_tenant(tr, b'tenant1')
# Test that a concurrent tenant deletion doesn't interfere with the existence check logic
tr = db.create_transaction()
existence_check_marker = []
fdb.tenant_management._delete_tenant_impl(tr, b'tenant2', existence_check_marker)
fdb.tenant_management.delete_tenant(db, b'tenant2')
tr = db.create_transaction()
try:
fdb.tenant_management._delete_tenant_impl(tr, b'tenant2', existence_check_marker)
tr.commit().wait()
except fdb.FDBError as e:
tr.on_error(e).wait()
def test_tenants(db):
test_tenant_tuple_name(db)
test_tenant_operations(db)
test_tenant_operation_retries(db)
# Expect a cluster file as input. This test will write to the FDB cluster, so
# be aware of potential side effects.
if __name__ == '__main__':
clusterFile = sys.argv[1]
db = fdb.open(clusterFile)
db.options.set_transaction_timeout(2000) # 2 seconds
db.options.set_transaction_retry_limit(3)
test_tenants(db)

View File

@ -49,6 +49,7 @@ from cancellation_timeout_tests import test_db_retry_limits
from cancellation_timeout_tests import test_combinations
from size_limit_tests import test_size_limit_option, test_get_approximate_size
from tenant_tests import test_tenants
random.seed(0)
@ -112,12 +113,13 @@ class Stack:
class Instruction:
def __init__(self, tr, stack, op, index, isDatabase=False, isSnapshot=False):
def __init__(self, tr, stack, op, index, isDatabase=False, isTenant=False, isSnapshot=False):
self.tr = tr
self.stack = stack
self.op = op
self.index = index
self.isDatabase = isDatabase
self.isTenant = isTenant
self.isSnapshot = isSnapshot
def pop(self, count=None, with_idx=False):
@ -277,6 +279,7 @@ class Tester:
def __init__(self, db, prefix):
self.db = db
self.tenant = None
self.instructions = self.db[fdb.tuple.range((prefix,))]
@ -317,7 +320,8 @@ class Tester:
def new_transaction(self):
with Tester.tr_map_lock:
Tester.tr_map[self.tr_name] = self.db.create_transaction()
tr_source = self.tenant if self.tenant is not None else self.db
Tester.tr_map[self.tr_name] = tr_source.create_transaction()
def switch_transaction(self, name):
self.tr_name = name
@ -335,18 +339,22 @@ class Tester:
# print("%d. Instruction is %s" % (idx, op))
isDatabase = op.endswith(six.u('_DATABASE'))
isTenant = op.endswith(six.u('_TENANT'))
isSnapshot = op.endswith(six.u('_SNAPSHOT'))
if isDatabase:
op = op[:-9]
obj = self.db
elif isTenant:
op = op[:-7]
obj = self.tenant if self.tenant else self.db
elif isSnapshot:
op = op[:-9]
obj = self.current_transaction().snapshot
else:
obj = self.current_transaction()
inst = Instruction(obj, self.stack, op, idx, isDatabase, isSnapshot)
inst = Instruction(obj, self.stack, op, idx, isDatabase, isTenant, isSnapshot)
try:
if inst.op == six.u("PUSH"):
@ -583,6 +591,19 @@ class Tester:
prefix = inst.pop()
Tester.wait_empty(self.db, prefix)
inst.push(b"WAITED_FOR_EMPTY")
elif inst.op == six.u("TENANT_CREATE"):
name = inst.pop()
fdb.tenant_management.create_tenant(self.db, name)
inst.push(b"RESULT_NOT_PRESENT")
elif inst.op == six.u("TENANT_DELETE"):
name = inst.pop()
fdb.tenant_management.delete_tenant(self.db, name)
inst.push(b"RESULT_NOT_PRESENT")
elif inst.op == six.u("TENANT_SET_ACTIVE"):
name = inst.pop()
self.tenant = self.db.open_tenant(name)
elif inst.op == six.u("TENANT_CLEAR_ACTIVE"):
self.tenant = None
elif inst.op == six.u("UNIT_TESTS"):
try:
test_db_options(db)
@ -600,6 +621,9 @@ class Tester:
test_size_limit_option(db)
test_get_approximate_size(db)
if fdb.get_api_version() >= 710:
test_tenants(db)
except fdb.FDBError as e:
print("Unit tests failed: %s" % e.description)
traceback.print_exc()

View File

@ -212,6 +212,17 @@ endif()
set(COROUTINE_IMPL ${DEFAULT_COROUTINE_IMPL} CACHE STRING "Which coroutine implementation to use. Options are boost and libcoro")
################################################################################
# AWS SDK
################################################################################
set(BUILD_AWS_BACKUP OFF CACHE BOOL "Build AWS S3 SDK backup client")
if (BUILD_AWS_BACKUP)
set(WITH_AWS_BACKUP ON)
else()
set(WITH_AWS_BACKUP OFF)
endif()
################################################################################
file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/packages)
@ -232,6 +243,7 @@ function(print_components)
message(STATUS "Build Python sdist (make package): ${WITH_PYTHON_BINDING}")
message(STATUS "Configure CTest (depends on Python): ${WITH_PYTHON}")
message(STATUS "Build with RocksDB: ${WITH_ROCKSDB_EXPERIMENTAL}")
message(STATUS "Build with AWS SDK: ${WITH_AWS_BACKUP}")
message(STATUS "=========================================")
endfunction()

cmake/awssdk.cmake Normal file
View File

@ -0,0 +1,98 @@
project(awssdk-download NONE)
# Compile the sdk with clang and libc++, since otherwise we get libc++ vs libstdc++ link errors when compiling fdb with clang
set(AWSSDK_COMPILER_FLAGS "")
set(AWSSDK_LINK_FLAGS "")
if(APPLE OR CLANG OR USE_LIBCXX)
set(AWSSDK_COMPILER_FLAGS -stdlib=libc++ -nostdlib++)
set(AWSSDK_LINK_FLAGS -stdlib=libc++ -lc++abi)
endif()
include(ExternalProject)
ExternalProject_Add(awssdk_project
GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git
GIT_TAG 2af3ce543c322cb259471b3b090829464f825972 # v1.9.200
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build"
GIT_CONFIG advice.detachedHead=false
CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF # SDK builds shared libs by default, we want static libs
-DENABLE_TESTING=OFF
-DBUILD_ONLY=core # git repo contains SDK for every AWS product, we only want the core auth libraries
-DSIMPLE_INSTALL=ON
-DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path
-DBYO_CRYPTO=ON # we have our own crypto libraries that conflict if we let aws sdk build and link its own
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_EXE_LINKER_FLAGS=${AWSSDK_LINK_FLAGS}
-DCMAKE_CXX_FLAGS=${AWSSDK_COMPILER_FLAGS}
TEST_COMMAND ""
BUILD_ALWAYS TRUE
# the sdk build produces a ton of artifacts, with their own dependency tree, so there is a very specific dependency order they must be linked in
BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a"
)
add_library(awssdk_core STATIC IMPORTED)
add_dependencies(awssdk_core awssdk_project)
set_target_properties(awssdk_core PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a")
add_library(awssdk_crt STATIC IMPORTED)
add_dependencies(awssdk_crt awssdk_project)
set_target_properties(awssdk_crt PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a")
# TODO: can we remove c_s3? It seems to be a dependency of libaws-crt
add_library(awssdk_c_s3 STATIC IMPORTED)
add_dependencies(awssdk_c_s3 awssdk_project)
set_target_properties(awssdk_c_s3 PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a")
add_library(awssdk_c_auth STATIC IMPORTED)
add_dependencies(awssdk_c_auth awssdk_project)
set_target_properties(awssdk_c_auth PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a")
add_library(awssdk_c_eventstream STATIC IMPORTED)
add_dependencies(awssdk_c_eventstream awssdk_project)
set_target_properties(awssdk_c_eventstream PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a")
add_library(awssdk_c_http STATIC IMPORTED)
add_dependencies(awssdk_c_http awssdk_project)
set_target_properties(awssdk_c_http PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a")
add_library(awssdk_c_mqtt STATIC IMPORTED)
add_dependencies(awssdk_c_mqtt awssdk_project)
set_target_properties(awssdk_c_mqtt PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a")
add_library(awssdk_c_io STATIC IMPORTED)
add_dependencies(awssdk_c_io awssdk_project)
set_target_properties(awssdk_c_io PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a")
add_library(awssdk_checksums STATIC IMPORTED)
add_dependencies(awssdk_checksums awssdk_project)
set_target_properties(awssdk_checksums PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a")
add_library(awssdk_c_compression STATIC IMPORTED)
add_dependencies(awssdk_c_compression awssdk_project)
set_target_properties(awssdk_c_compression PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a")
add_library(awssdk_c_cal STATIC IMPORTED)
add_dependencies(awssdk_c_cal awssdk_project)
set_target_properties(awssdk_c_cal PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a")
add_library(awssdk_c_common STATIC IMPORTED)
add_dependencies(awssdk_c_common awssdk_project)
set_target_properties(awssdk_c_common PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a")
# link them all together in one interface target
add_library(awssdk_target INTERFACE)
target_include_directories(awssdk_target SYSTEM INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/include)
target_link_libraries(awssdk_target INTERFACE awssdk_core awssdk_crt awssdk_c_s3 awssdk_c_auth awssdk_c_eventstream awssdk_c_http awssdk_c_mqtt awssdk_c_io awssdk_checksums awssdk_c_compression awssdk_c_cal awssdk_c_common curl)

View File

@ -346,7 +346,7 @@ function createDatabase
# Configure the database.
else
"${BINDIR}/fdbcli" -C "${FDBCONF}" --exec 'configure new single memory; status' --timeout "${CONFIGUREWAIT}" --log --log-dir "${LOGDIR}" &>> "${LOGDIR}/fdbclient.log"
"${BINDIR}/fdbcli" -C "${FDBCONF}" --exec 'configure new single memory tenant_mode=optional_experimental; status' --timeout "${CONFIGUREWAIT}" --log --log-dir "${LOGDIR}" &>> "${LOGDIR}/fdbclient.log"
if ! displayMessage "Checking if config succeeded"
then

View File

@ -0,0 +1,100 @@
#!/usr/bin/env python3
#
# alloc_instrumentation_traces.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Example trace:
{ "Severity": "10", "Time": "194.878474", "DateTime": "2022-02-01T16:28:27Z", "Type": "MemSample", "Machine": "2.1.1.0:2", "ID": "0000000000000000", "Count": "943", "TotalSize": "540000000", "SampleCount": "54", "Hash": "980074757", "Bt": "addr2line -e fdbserver.debug -p -C -f -i 0x1919b72 0x3751d43 0x37518cc 0x19930f8 0x199dac3 0x1999e7c 0x21a1061 0x31e8fc5 0x31e784a 0x10ab3a8 0x36bf4c6 0x36bf304 0x36beea4 0x36bf352 0x36bfa1c 0x10ab3a8 0x37b22fe 0x37a16ee 0x368c754 0x19202d5 0x7fb3fe2d6555 0x1077029", "ThreadID": "10074331651862410074", "LogGroup": "default" }
"""
# This program analyzes MemSample trace events produced by setting ALLOC_INSTRUMENTATION in FastAlloc.h
# It outputs the top memory users by total size as well as number of allocations.
# Example usage: cat trace.* | ./alloc_instrumentation_traces.py
import sys
import json
byCnt = []
bySize = []
totalSize = 0
lastTimestamp = ""
for line in sys.stdin:
ev = json.loads(line.rstrip())
ev_type = ev["Type"]
if ev_type != 'MemSample':
    continue
bt = ev["Bt"]
if (bt == "na"):
continue
timestamp = ev["Time"]
cnt = int(ev["Count"])
scnt = int(ev["SampleCount"])
size = int(ev["TotalSize"])
h = ev["Hash"]
if (timestamp != lastTimestamp):
byCnt = []
bySize = []
totalSize = 0
lastTimestamp = timestamp
# print(str(cnt) + " " + str(scnt) + " " + str(size) + " " + h)
byCnt.append( (cnt, scnt, size, h, bt) )
bySize.append( (size, cnt, size, h, bt) )
totalSize += size
byCnt.sort(reverse=True)
bySize.sort(reverse=True)
btByHash = {}
byte_suffix = ["Bytes", "KB", "MB", "GB", "TB"]
def byte_str(num_bytes):
    suffix_idx = 0
    while num_bytes >= 1024 * 10:
        suffix_idx += 1
        num_bytes //= 1024
    return str(num_bytes) + ' ' + byte_suffix[suffix_idx]
print("By Size")
print("-------\r\n")
for x in bySize[:10]:
# print(str(x[0]) + ": " + x[3])
print(str(x[1]) + " / " + byte_str(x[0]) + " (" + byte_str(x[0] // x[1]) + " per alloc):\r\n" + x[4] + "\r\n")
btByHash[x[3]] = x[4]
print()
print("By Count")
print("--------\r\n")
for x in byCnt[:5]:
# print(str(x[0]) + ": " + x[3])
print(str(x[0]) + " / " + byte_str(x[2]) + " (" + byte_str(x[2] // x[0]) + " per alloc):\r\n" + x[4] + "\r\n")
btByHash[x[3]] = x[4]

View File

@ -0,0 +1,237 @@
# FDB Encryption: Data At-Rest
## Threat Model
The proposed solution is `able to handle` the following attacks:
* An attacker, if able to get access to any FDB cluster host or attached disk, would not be able to read the persisted data. Further, for cloud deployments, returning a cloud instance back to the cloud provider will prevent the cloud provider from reading the contents of data stored on the disk.
* Data stored on the persistent disk of a lost or stolen FDB host can't be recovered.
The proposed solution `will not be able` to handle the following attacks:
* Encryption is enabled for data at-rest only; generating a memory dump of FDB processes could enable an attacker to read in-memory data contents.
* If access to an FDB cluster host is compromised, an attacker would be able to read/write data managed by the FDB cluster.
## Goals
FoundationDB, being multi-model, easily scalable, and fault-tolerant, with the ability to provide great performance even on commodity hardware, plays a critical role in enabling enterprises to deploy, manage, and run mission-critical applications.

Data encryption support is a table-stakes feature for modern-day enterprise service offerings in the cloud. Customers expect, and at times warrant, that their data and metadata be fully encrypted using the latest security standards. The goals of this document include:

* Discuss the detailed design to support at-rest encryption of data stored in FDB clusters. Encrypting data in transit and/or in in-memory caches at various layers of the query execution pipeline (inside and external to FDB) is out of the scope of this feature.
* Isolation guarantees: the encryption domain matches the `tenant` partition semantics supported by FDB clusters. Tenants are discrete namespaces in FDB that serve as transaction domains. A tenant is an `identifier` that maps to a `prefix` within the data-FDB cluster, and all operations within a tenant are implicitly bound within a `tenant-prefix`. Refer to the `Multi-Tenant FoundationDB API` documentation for more details. However, it is possible to use a single encryption key for the whole cluster in case `tenant partitioning` isn't available.
* Ease of integration with external Key Management Services, enabling persisting, caching, and lookup of encryption keys.
## Config Knobs
* `ServerKnob::ENABLE_ENCRYPTION` enables/disables the encryption feature.
* `ServerKnob::ENCRYPTION_MODE` controls the encryption mode; the current scheme supports the `AES-256-CTR` encryption mode.
## Encryption Mode
The proposal is to use the strong AES-256-CTR encryption mode. Salient properties are:
* The HMAC_SHA256 keyed-hash technique is used to derive encryption keys from a base encryption key and a locally generated random number. The formula used is as follows:
```
DEK = HMAC_SHA256(BEK || UID)
Where
DEK = Derived Encryption Key
BEK = Base Encryption key
UID = Host local random generated number
```
UID is an 8-byte host-local random number. Another option would have been a simple host-local incrementing counter; however, that scheme runs the risk of regenerating the same encryption keys on cluster/process restarts. (A minimal derivation sketch in Python follows this list.)
* An encryption key derived using the above formula will be cached in-memory for a short time interval (10 minutes, for instance). The encryption key is immutable, but the TTL approach allows it to be refreshed by reaching out to the external Encryption Key Management solution, hence supporting a “restrict the lifetime of an encryption key” feature if the Key Management solution implements one.
* Initialization Vector (IV) selection would be random.
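As an illustration, the derivation maps onto standard HMAC primitives. The following minimal Python sketch assumes the HMAC is keyed with the BEK and takes the UID as its message (one plausible reading of `BEK || UID`); the function name and return shape are illustrative, not the actual FDB implementation:
```python
import hashlib
import hmac
import os

def derive_encryption_key(base_encryption_key: bytes):
    # UID: an 8-byte host-local random number; randomness (rather than a
    # counter) avoids re-deriving identical keys across process restarts.
    uid = os.urandom(8)
    # DEK = HMAC_SHA256(BEK || UID) -- SHA-256 yields a 32-byte (256-bit)
    # derived key, suitable for AES-256.
    dek = hmac.new(base_encryption_key, uid, hashlib.sha256).digest()
    return dek, uid
```
Note that the UID must travel with the ciphertext (e.g., in the plaintext encryption header described below) so that readers can re-derive the same DEK.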
## Architecture
The encryption responsibilities are split across multiple modules to ensure that data and metadata stored in the cluster are never persisted in plaintext on any durable storage (temporary and/or long-term).
## Encryption Request Workflow
### **Write Request**
* An FDB client initiates a write transaction providing {key, value} in plaintext format.
* An FDB cluster host, as part of processing a write transaction, would do the following:
1. Obtain the required encryption key based on the transaction request's tenant information.
2. Encrypt mutations before persisting them on the Transaction Logs (TLogs). As a background process, the mutations are moved to long-term durable storage by the Storage Server processes.
Refer to the sections below for more details.
### **Read Request**
* An FDB client initiates a read transaction request.
* An FDB cluster host, as part of processing the request, would do the following:
1. The StorageServer would read the desired data blocks from persistent storage.
2. Regenerate the encryption key required to decrypt the data.
3. Decrypt the data and pass the results as plaintext to the caller.
The diagram below depicts the end-to-end encryption workflow, detailing the modules involved and their interactions. The following sections discuss the detailed design of the involved components.
```
_______________________________________________________
                         | FDB CLUSTER HOST                                     |
| |
_____________________ | ________________________ _________________ |
| | (proprietary) | | | | |
| |<---------- |--| KMS CONNECTOR | | COMMIT PROXIES | |
| ENCRYPTION KEY | | | | | | |
| MANAGEMENT SOLUTION | | |(non FDB - proprietary) | | | |
| | | |________________________| |_________________| |
| | | ^ | |
|_____________________| | | (REST API) | (Encrypt |
| | V Mutation) |
| _________________________________________ | __________________
| | | | | |
| | ENCRYPT KEYPROXY SERVER |<------|-----------| |
| |_________________________________________| | | |
| | | | BACKUP FILES |
| | (Encrypt Node) | | |
| V | | |
| _________________________________________ | | (Encrypt file) |
| | |<------|-----------| |
| | REDWOOD STORAGE SERVER | | |__________________|
| |_________________________________________| |
|_______________________________________________________|
```
## FDB Encryption
An FDB client inserts data, i.e. plaintext {key, value} pairs, into an FDB cluster for persistence.
### KMS-Connector
A non-FDB process running on FDB cluster hosts enables an FDB cluster to interact with external Encryption Key Management services. Salient features include:
* An external (non-FDB) standalone process implementing a REST server.
* Abstracts organization-specific KeyManagementService integration details. The proposed design ensures ease of integration given the limited infrastructure needed to implement a local/remote REST server.
* Ensures organization-specific code is implemented outside the FDB codebase.
* The KMS-Connector process is launched and maintained by the FDBMonitor. The process needs to handle the following REST endpoint:
1. GET - http://localhost/getEncryptionKey
Define a single interface returning the “encryption key string in plaintext” and accepting a JSON input which can be customized as needed:
```json
json_input_payload
{
    "Version" : int,    // version
    "KeyId"   : keyId   // string
}
```
A few benefits of the above proposed scheme are:
* JSON input format is extensible (adding new fields is backward compatible).
* Popular cloud KMS “getPublicKey” APIs accept “keyId” as a string; hence, the API should be easy to integrate (see the sketch after these links):
1. AWS: https://docs.aws.amazon.com/cli/latest/reference/kms/get-public-key.html
2. GCP: https://cloud.google.com/kms/docs/retrieve-public-key
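For illustration only, a caller of the proposed endpoint might look like the sketch below; the port, the response shape, and the use of a request body on GET are assumptions, since the REST contract above is only sketched:
```python
import json
import urllib.request

def get_encryption_key(key_id: str, version: int = 1) -> str:
    # JSON input payload per the proposal above; new fields can be
    # added later without breaking older servers.
    payload = json.dumps({"Version": version, "KeyId": key_id}).encode()
    req = urllib.request.Request(
        "http://localhost:8080/getEncryptionKey",  # assumed port
        data=payload,
        headers={"Content-Type": "application/json"},
        method="GET",
    )
    with urllib.request.urlopen(req) as resp:
        # The connector returns the encryption key string in plaintext.
        return resp.read().decode()
```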
`Future improvements`: FDBMonitor at present will launch one KMS-Connector process per FDB cluster host. Though multiple KMS-Connector processes are launched, only the one co-located with the EncryptKeyServer would consume cluster resources. In the future, possible enhancements could be:
* Enable FDBMonitor to launch “N” (configurable) processes per cluster.
* Enable the FDB cluster to manage external processes as well.
### Encrypt KeyServer
Salient features include:
* New FDB role/process to allow fetching of encryption keys from external KeyManagementService interfaces. The process connects to the KMS-Connector REST interface to fetch desired encryption keys.
* On an encryption-key fetch from the KMS-Connector, it applies the HMAC derivation function to generate a new encryption key and caches it in-memory. The in-memory cache is used to serve encryption-key fetch requests from other FDB processes.
Given that encryption keys will be needed as part of cluster recovery, this process/role needs to be recruited at the start of the cluster-recovery process (just after the “master/sequencer” process/role is recruited). All other FDB processes will interact with this process to obtain the encryption keys needed to encrypt and/or decrypt the data payload.
`Note`: An alternative would be to incorporate the functionality into the ClusterController process itself; however, a clear separation of responsibilities makes the design more flexible and extensible in the future if needed.
### Commit Proxies (CPs)
When an FDB client initiates a write transaction to insert/update data stored in an FDB cluster, the transaction is received by a CP, which resolves the transaction by checking whether it is allowed. If allowed, it commits the transaction to the TLogs. The proposal is to extend CP responsibilities by encrypting mutations with the desired encryption key before the mutations are persisted to the TLogs (durable storage). The encryption key derivation uses the following formula:
```
DEK = HMAC_SHA256(BEK || UID)
Where:
DEK = Derived Encryption Key
BEK = Base Encryption Key
UID = Host local random generated number
```
The Transaction State Store (commonly referred to as the TxnStateStore) is a key-value datastore used by FDB to store metadata about the database itself for bootstrap purposes. The data stored in this store plays a critical role in guiding the transaction system to persist writes (assigning storage tags to mutations at the CPs) and in managing FDB internal data movement. The TxnStateStore data gets encrypted with the desired encryption key before being persisted to the disk queues.
As part of encryption, every mutation would have a plaintext `BlobCipherEncryptHeader` appended to assist in decrypting the information on reads.
CPs would cache (in-memory) recently used encryption keys to reduce the network traffic caused by encryption-related operations. Further, the caching improves overall performance by avoiding frequent RPC calls to the EncryptKeyServer, which could otherwise become a scalability bottleneck. Each encryption key in the cache has a short time-to-live (10 minutes); on expiry, the process interacts with the EncryptKeyServer to fetch the required encryption keys. The same caching policy is followed by the Redwood Storage Server and the Backup File processes.
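As a rough illustration of this caching policy, the Python sketch below models a TTL'd key cache; the class name, the fetch callback, and the exact TTL handling are invented for illustration:
```python
import time

KEY_TTL_SECONDS = 10 * 60  # short TTL, per the design (10 mins)

class EncryptionKeyCache:
    """In-memory cache of encryption keys with a short time-to-live."""

    def __init__(self, fetch_from_key_server):
        # fetch_from_key_server: callback that performs the RPC to the
        # EncryptKeyServer on a cache miss.
        self._fetch = fetch_from_key_server
        self._cache = {}  # key_id -> (encryption_key, expiry)

    def get(self, key_id):
        now = time.monotonic()
        entry = self._cache.get(key_id)
        if entry is not None and entry[1] > now:
            return entry[0]  # hit: no RPC to the EncryptKeyServer
        # Miss or expired: refresh from the EncryptKeyServer, which lets
        # the external KMS enforce any key-lifetime policy it implements.
        key = self._fetch(key_id)
        self._cache[key_id] = (key, now + KEY_TTL_SECONDS)
        return key
```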
### **Caveats**
The encryption is done inline in the transaction path, which will increase total commit latency. A few possible ways to minimize this impact are:
* Overlap encryption operations with the CP::resolution phase, which would minimize the latency penalty per transaction at the cost of spending more CPU cycles. If needed, production deployments may have to increase the number of CPs per FDB cluster.
* Implement an external process to offload encryption. If done, encryption would appear no different than the CP::resolution phase, where the process would invoke RPC calls to encrypt the buffer and wait for the operation to complete.
### Storage Servers
The encryption design only supports Redwood Storage Server integration; support for other storage engines is yet to be planned.
### Redwood Storage Nodes
Redwood is, at heart, a B+ tree and stores data in two types of nodes:
* `Non-leaf` nodes: store only keys, not values (prefix compression is applied).
* `Leaf` nodes: store `{key, value}` tuples for a given key-range.
Both node types are converted into one or more fixed-size pages (likely 4K or 8K) before being persisted to durable storage. Encryption is performed at the node level instead of the page level, i.e. all pages constituting a given Redwood node are encrypted using the same encryption key, generated using the following formula:
```
DEK = HMAC_SHA256(BEK || UID)
Where:
DEK = Derived Encryption Key
BEK = Base Encryption Key
UID = Host local random generated number
```
### Backup Files
Backup Files are designed to pull committed mutations from StorageServers and persist them as “files” stored in cloud-backed BlobStorage such as Amazon S3. Each persisted file stores mutations for a given key-range and will be encrypted with an encryption key generated using the formula below:
```
DEK = HMAC_SHA256(BEK || FID)
Where:
DEK = Derived Encryption Key
BEK = Base Encryption Key
FID = File Identifier (unique)
```
## Decryption on Reads
To assist reads, FDB processes (StorageServers, Backup File workers) will be modified to read/parse the encryption header. The data decryption will be done as follows (an illustrative sketch follows the list):
* The FDB process will interact with Encrypt KeyServer to fetch the desired base encryption key corresponding to the key-id persisted in the encryption header.
* Reconstruct the encryption key and decrypt the data block.
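Putting the read path together, the sketch below shows how a reader might re-derive the key and decrypt an `AES-256-CTR` block. It uses the third-party `cryptography` package, and it assumes the plaintext header carries the UID and IV; the field layout is illustrative:
```python
import hashlib
import hmac

from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes

def decrypt_block(base_key: bytes, uid: bytes, iv: bytes, ciphertext: bytes) -> bytes:
    # Re-derive the DEK from the base key (fetched via the Encrypt
    # KeyServer) and the UID persisted in the plaintext encryption header.
    dek = hmac.new(base_key, uid, hashlib.sha256).digest()
    # CTR mode is symmetric: decryption applies the same keystream.
    decryptor = Cipher(algorithms.AES(dek), modes.CTR(iv)).decryptor()
    return decryptor.update(ciphertext) + decryptor.finalize()
```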
## Future Work
* Extend the TLog API to allow clients to read “plaintext mutations” directly from a TLogServer. In the current implementation there are two consumers of TLogs:
1. Storage Server: At present the plan is for StorageServer to decrypt the mutations.
2. BackupWorker (Apple's implementation), which is currently not used in the code.

View File

@ -3,3 +3,4 @@ setuptools>=20.10.0,<=57.4.0
sphinx==1.5.6
sphinx-bootstrap-theme==0.4.8
docutils==0.16
Jinja2==3.0.3

View File

@ -7,7 +7,7 @@
.. |database-type| replace:: ``Database``
.. |database-class| replace:: :class:`Database`
.. |database-auto| replace:: the :func:`@fdb.transactional <transactional>` decorator
.. |tenant-type| replace:: FIXME
.. |tenant-type| replace:: :class:`Tenant`
.. |transaction-class| replace:: :class:`Transaction`
.. |get-key-func| replace:: :func:`Transaction.get_key`
.. |get-range-func| replace:: :func:`Transaction.get_range`
@ -316,6 +316,14 @@ A |database-blurb1| |database-blurb2|
Returns a new :class:`Transaction` object. Consider using the :func:`@fdb.transactional <transactional>` decorator to create transactions instead, since it will automatically provide you with appropriate retry behavior.
.. method:: Database.open_tenant(tenant_name)
Opens an existing tenant to be used for running transactions and returns it as a :class:`Tenant` object.
The tenant name can be either a byte string or a tuple. If a tuple is provided, the tuple will be packed using the tuple layer to generate the byte string tenant name.
.. note :: Opening a tenant does not check its existence in the cluster. If the tenant does not exist, attempts to read or write data with it will fail.
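For example, a :class:`Tenant` can be passed to transactional functions just like a :class:`Database` (an illustrative sketch; it assumes ``fdb`` has been imported, ``db`` is an open :class:`Database`, and a tenant named ``b'my-tenant'`` already exists)::

    tenant = db.open_tenant(b'my-tenant')

    @fdb.transactional
    def set_value(tr, key, value):
        tr[key] = value

    # The write is confined to the tenant's key-space
    set_value(tenant, b'hello', b'world')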
.. |sync-read| replace:: This read is fully synchronous.
.. |sync-write| replace:: This change will be committed immediately, and is fully synchronous.
@ -460,6 +468,17 @@ Database options
.. method:: Database.options.set_snapshot_ryw_disable()
|option-db-snapshot-ryw-disable-blurb|
Tenant objects
==============
.. class:: Tenant
|tenant-blurb1|
.. method:: Tenant.create_transaction()
Returns a new :class:`Transaction` object. Consider using the :func:`@fdb.transactional <transactional>` decorator to create transactions instead, since it will automatically provide you with appropriate retry behavior.
.. _api-python-transactional-decorator:
@ -479,9 +498,9 @@ Transactional decoration
The ``@fdb.transactional`` decorator makes ``simple_function`` a transactional function. All functions using this decorator must have an argument **named** ``tr``. This specially named argument is passed a transaction that the function can use to do reads and writes.
A caller of a transactionally decorated function can pass a :class:`Database` instead of a transaction for the ``tr`` parameter. Then a transaction will be created automatically, and automatically committed before returning to the caller. The decorator will retry calling the decorated function until the transaction successfully commits.
A caller of a transactionally decorated function can pass a :class:`Database` or :class:`Tenant` instead of a transaction for the ``tr`` parameter. Then a transaction will be created automatically, and automatically committed before returning to the caller. The decorator will retry calling the decorated function until the transaction successfully commits.
If ``db`` is a :class:`Database`, a call like ::
If ``db`` is a :class:`Database` or :class:`Tenant`, a call like ::
simple_function(db, 'a', 'b')
@ -744,7 +763,7 @@ Committing
.. decorator:: transactional()
The ``transactional`` decorator makes it easy to write transactional functions which accept either a :class:`Database` or a :class:`Transaction` as a parameter and automatically commit. See :func:`@fdb.transactional <transactional>` for explanation and examples.
The ``transactional`` decorator makes it easy to write transactional functions which accept a :class:`Database`, :class:`Tenant`, or :class:`Transaction` as a parameter and automatically commit. See :func:`@fdb.transactional <transactional>` for explanation and examples.
.. method :: Transaction.commit()
@ -754,7 +773,7 @@ Committing
|commit-outstanding-reads-blurb|
.. note :: Consider using the :func:`@fdb.transactional <transactional>` decorator, which not only calls :meth:`Database.create_transaction` and :meth:`Transaction.commit()` for you but also implements the required error handling and retry logic for transactions.
.. note :: Consider using the :func:`@fdb.transactional <transactional>` decorator, which not only calls :meth:`Database.create_transaction` or :meth:`Tenant.create_transaction` and :meth:`Transaction.commit()` for you but also implements the required error handling and retry logic for transactions.
.. warning :: |used-during-commit-blurb|
@ -1559,3 +1578,32 @@ Locality information
.. method:: fdb.locality.get_addresses_for_key(tr, key)
Returns a :class:`fdb.FutureStringArray`. You must call the :meth:`fdb.Future.wait()` method on this object to retrieve a list of public network addresses as strings, one for each of the storage servers responsible for storing ``key`` and its associated value.
Tenant management
=================
.. module:: fdb.tenant_management
The FoundationDB API includes functions to manage the set of tenants in a cluster.
.. method:: fdb.tenant_management.create_tenant(db_or_tr, tenant_name)
Creates a new tenant in the cluster.
The tenant name can be either a byte string or a tuple and cannot start with the ``\xff`` byte. If a tuple is provided, the tuple will be packed using the tuple layer to generate the byte string tenant name.
If a database is provided to this function for the ``db_or_tr`` parameter, then this function will first check if the tenant already exists. If it does, it will fail with a ``tenant_already_exists`` error. Otherwise, it will create a transaction and attempt to create the tenant in a retry loop. If the tenant is created concurrently by another transaction, this function may still return successfully.
If a transaction is provided to this function for the ``db_or_tr`` parameter, then this function will not check if the tenant already exists. It is up to the user to perform that check if required. The user must also successfully commit the transaction in order for the creation to take effect.
.. method:: fdb.tenant_management.delete_tenant(db_or_tr, tenant_name)
Deletes a tenant from the cluster.
The tenant name can be either a byte string or a tuple. If a tuple is provided, the tuple will be packed using the tuple layer to generate the byte string tenant name.
It is an error to delete a tenant that still has data. To delete a non-empty tenant, first clear all of the keys in the tenant.
If a database is provided to this function for the ``db_or_tr`` parameter, then this function will first check if the tenant exists. If it does not, it will fail with a ``tenant_not_found`` error. Otherwise, it will create a transaction and attempt to delete the tenant in a retry loop. If the tenant is deleted concurrently by another transaction, this function may still return successfully.
If a transaction is provided to this function for the ``db_or_tr`` parameter, then this function will not check if the tenant exists. It is up to the user to perform that check if required. The user must also successfully commit the transaction in order for the deletion to take effect.
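The following sketch shows the full lifecycle (illustrative only; the tenant name is a placeholder, and the clearing step assumes all of the tenant's keys sort below ``b'\xff'``)::

    import fdb
    fdb.api_version(710)
    db = fdb.open()

    fdb.tenant_management.create_tenant(db, (u'org', u'team1'))
    tenant = db.open_tenant((u'org', u'team1'))

    # ... read and write data through the tenant ...

    # A tenant must be empty before it can be deleted
    @fdb.transactional
    def clear_all(tr):
        del tr[b'':b'\xff']

    clear_all(tenant)
    fdb.tenant_management.delete_tenant(db, (u'org', u'team1'))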

View File

@ -155,6 +155,12 @@ Here is a complete list of valid parameters:
**Example**: The URL parameter *header=x-amz-storage-class:REDUCED_REDUNDANCY* would send the HTTP header required to use the reduced redundancy storage option in the S3 API.
Signing Protocol
=================
AWS Signature Version 4 is the default signing protocol. The boolean knob ``--knob_http_request_aws_v4_header`` selects between v4-style and v2-style signatures: if the knob is set to ``true``, v4 signatures are used; if set to ``false``, v2 signatures are used.
.. _blob-credential-files:
Blob Credential Files

View File

@ -26,6 +26,8 @@ FoundationDB supports language bindings for application development using the or
* :doc:`known-limitations` describes both long-term design limitations of FoundationDB and short-term limitations applicable to the current version.
* :doc:`tenants` describes the use of the tenants feature to define named transaction domains.
.. toctree::
:maxdepth: 1
:titlesonly:
@ -42,3 +44,4 @@ FoundationDB supports language bindings for application development using the or
known-limitations
transaction-profiler-analyzer
api-version-upgrade-guide
tenants

View File

@ -64,7 +64,7 @@ The ``commit`` command commits the current transaction. Any sets or clears execu
configure
---------
The ``configure`` command changes the database configuration. Its syntax is ``configure [new|tss] [single|double|triple|three_data_hall|three_datacenter] [ssd|memory] [grv_proxies=<N>] [commit_proxies=<N>] [resolvers=<N>] [logs=<N>] [count=<TSS_COUNT>] [perpetual_storage_wiggle=<WIGGLE_SPEED>] [perpetual_storage_wiggle_locality=<<LOCALITY_KEY>:<LOCALITY_VALUE>|0>] [storage_migration_type={disabled|aggressive|gradual}]``.
The ``configure`` command changes the database configuration. Its syntax is ``configure [new|tss] [single|double|triple|three_data_hall|three_datacenter] [ssd|memory] [grv_proxies=<N>] [commit_proxies=<N>] [resolvers=<N>] [logs=<N>] [count=<TSS_COUNT>] [perpetual_storage_wiggle=<WIGGLE_SPEED>] [perpetual_storage_wiggle_locality=<<LOCALITY_KEY>:<LOCALITY_VALUE>|0>] [storage_migration_type={disabled|aggressive|gradual}] [tenant_mode={disabled|optional_experimental|required_experimental}]``.
The ``new`` option, if present, initializes a new database with the given configuration rather than changing the configuration of an existing one. When ``new`` is used, both a redundancy mode and a storage engine must be specified.

View File

@ -273,6 +273,16 @@ Directory partitions have the following drawbacks, and in general they should no
* Directories in a partition have longer prefixes than their counterparts outside of partitions, which reduces performance. Nesting partitions inside of other partitions results in even longer prefixes.
* The root directory of a partition cannot be used to pack/unpack keys and therefore cannot be used to create subspaces. You must create at least one subdirectory of a partition in order to store content in it.
Tenants
-------
:doc:`tenants` in FoundationDB provide a way to divide the cluster key-space into named transaction domains. Each tenant has a byte-string name that can be used to open transactions on the tenant's data, and tenant transactions are not permitted to access data outside of the tenant. Tenants can be useful for enforcing separation between unrelated use-cases.
Tenants and directories
~~~~~~~~~~~~~~~~~~~~~~~
Because tenants enforce that transactions operate within the tenant boundaries, it is not recommended to use a global directory layer shared between tenants. It is possible, however, to use the directory layer within each tenant. To do so, simply use the directory layer as normal with tenant transactions.
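For example (a sketch; it assumes ``tenant`` was opened via ``db.open_tenant`` and that the standard ``fdb.directory`` directory layer is used)::

    @fdb.transactional
    def open_app_dir(tr):
        # The directory metadata and contents both stay inside the tenant
        return fdb.directory.create_or_open(tr, ('app',))

    app_dir = open_app_dir(tenant)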
Working with the APIs
=====================

View File

@ -701,7 +701,7 @@
"ssd-1",
"ssd-2",
"ssd-redwood-1-experimental",
"ssd-rocksdb-experimental",
"ssd-rocksdb-v1",
"memory",
"memory-1",
"memory-2",
@ -714,7 +714,7 @@
"ssd-1",
"ssd-2",
"ssd-redwood-1-experimental",
"ssd-rocksdb-experimental",
"ssd-rocksdb-v1",
"memory",
"memory-1",
"memory-2",

View File

@ -0,0 +1,60 @@
#######
Tenants
#######
.. warning :: Tenants are currently experimental and are not recommended for use in production.
FoundationDB provides a feature called tenants that allow you to configure one or more named transaction domains in your cluster. A transaction domain is a key-space in which a transaction is allowed to operate, and no tenant operations are allowed to use keys outside the tenant key-space. Tenants can be useful for managing separate, unrelated use-cases and preventing them from interfering with each other. They can also be helpful for defining safe boundaries when moving a subset of data between clusters.
By default, FoundationDB has a single transaction domain that contains both the normal key-space (``['', '\xff')``) as well as the system keys (``['\xff', '\xff\xff')``) and the :doc:`special-keys` (``['\xff\xff', '\xff\xff\xff')``).
Overview
========
A tenant in a FoundationDB cluster maps a byte-string name to a key-space that can be used to store data associated with that tenant. This key-space is stored in the cluster's global key-space under a prefix assigned to that tenant, with each tenant being assigned a separate, non-intersecting prefix.
In addition to each being assigned a separate tenant prefix, tenants can be configured to have a common shared prefix. By default, the shared prefix is empty and tenants are allocated prefixes throughout the normal key-space. To configure an alternate shared prefix, set the ``\xff/tenantDataPrefix`` key to have the desired prefix as the value.
Tenant operations are implicitly confined to the key-space associated with the tenant. It is not necessary for client applications to use or be aware of the prefix assigned to the tenant.
Enabling tenants
================
In order to use tenants, the cluster must be configured with an appropriate tenant mode using ``fdbcli``::
fdb> configure tenant_mode=<MODE>
FoundationDB clusters support the following tenant modes:
* ``disabled`` - Tenants cannot be created or used. Disabled is the default tenant mode.
* ``optional_experimental`` - Tenants can be created. Each transaction can choose whether or not to use a tenant. This mode is primarily intended for migration and testing purposes, and care should be taken to avoid conflicts between tenant and non-tenant data.
* ``required_experimental`` - Tenants can be created. Each normal transaction must use a tenant. To support special access needs, transactions will be permitted to access the raw key-space using the ``RAW_ACCESS`` transaction option.
Creating and deleting tenants
=============================
Tenants can be created and deleted using the ``\xff\xff/management/tenant_map/<tenant_name>`` :doc:`special key <special-keys>` range as well as by using APIs provided in some language bindings.
Tenants can be created with any byte-string name that does not begin with the ``\xff`` character. Once created, a tenant will be assigned an ID and a prefix where its data will reside.
In order to delete a tenant, it must first be empty. If a tenant contains any keys, they must be cleared prior to deleting the tenant.
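For illustration, a tenant can be created through the special key range from the Python bindings (a sketch; it assumes ``db`` is an open ``Database``, the tenant name is a placeholder, and the value written to the special key is ignored)::

    @fdb.transactional
    def create_tenant(tr, name):
        # Special key space writes must be explicitly enabled
        tr.options.set_special_key_space_enable_writes()
        tr[b'\xff\xff/management/tenant_map/' + name] = b''

    create_tenant(db, b'my-tenant')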
Using tenants
=============
In order to use the key-space associated with an existing tenant, you must open the tenant using the ``Database`` object provided by your language binding. The resulting ``Tenant`` object can be used to create transactions much like with a ``Database``, and the resulting transactions will be restricted to the tenant's key-space.
All operations performed within a tenant transaction will occur within the tenant key-space. It is not necessary to use or even be aware of the prefix assigned to a tenant in the global key-space. Operations that could resolve outside of the tenant key-space (e.g. resolving key selectors) will be clamped to the tenant.
.. note :: Tenant transactions are not permitted to access system keys.
Raw access
----------
When operating in the tenant mode ``required_experimental``, transactions are not ordinarily permitted to run without using a tenant. In order to access the system keys or perform maintenance operations that span multiple tenants, it is required to use the ``RAW_ACCESS`` transaction option to access the global key-space. It is an error to specify ``RAW_ACCESS`` on a transaction that is configured to use a tenant.
.. note :: Setting the ``READ_SYSTEM_KEYS`` or ``ACCESS_SYSTEM_KEYS`` options implies ``RAW_ACCESS`` for your transaction.
.. note :: Many :doc:`special keys <special-keys>` operations access parts of the system keys and will implicitly enable raw access on the transactions in which they are used.
.. warning :: Care should be taken when using raw access to run transactions spanning multiple tenants if the tenant feature is being utilized to aid in moving data between clusters. In such scenarios, it may not be guaranteed that all of the data you intend to access is on a single cluster.

View File

@ -238,7 +238,7 @@ ACTOR Future<Void> echoClient() {
return Void();
}
struct SimpleKeyValueStoreInteface {
struct SimpleKeyValueStoreInterface {
constexpr static FileIdentifier file_identifier = 8226647;
RequestStream<struct GetKVInterface> connect;
RequestStream<struct GetRequest> get;
@ -253,7 +253,7 @@ struct SimpleKeyValueStoreInteface {
struct GetKVInterface {
constexpr static FileIdentifier file_identifier = 8062308;
ReplyPromise<SimpleKeyValueStoreInteface> reply;
ReplyPromise<SimpleKeyValueStoreInterface> reply;
template <class Ar>
void serialize(Ar& ar) {
@ -297,7 +297,7 @@ struct ClearRequest {
};
ACTOR Future<Void> kvStoreServer() {
state SimpleKeyValueStoreInteface inf;
state SimpleKeyValueStoreInterface inf;
state std::map<std::string, std::string> store;
inf.connect.makeWellKnownEndpoint(WLTOKEN_SIMPLE_KV_SERVER, TaskPriority::DefaultEndpoint);
loop {
@ -333,17 +333,17 @@ ACTOR Future<Void> kvStoreServer() {
}
}
ACTOR Future<SimpleKeyValueStoreInteface> connect() {
ACTOR Future<SimpleKeyValueStoreInterface> connect() {
std::cout << format("%llu: Connect...\n", uint64_t(g_network->now()));
SimpleKeyValueStoreInteface c;
SimpleKeyValueStoreInterface c;
c.connect = RequestStream<GetKVInterface>(Endpoint::wellKnown({ serverAddress }, WLTOKEN_SIMPLE_KV_SERVER));
SimpleKeyValueStoreInteface result = wait(c.connect.getReply(GetKVInterface()));
SimpleKeyValueStoreInterface result = wait(c.connect.getReply(GetKVInterface()));
std::cout << format("%llu: done..\n", uint64_t(g_network->now()));
return result;
}
ACTOR Future<Void> kvSimpleClient() {
state SimpleKeyValueStoreInteface server = wait(connect());
state SimpleKeyValueStoreInterface server = wait(connect());
std::cout << format("Set %s -> %s\n", "foo", "bar");
SetRequest setRequest;
setRequest.key = "foo";
@ -356,7 +356,7 @@ ACTOR Future<Void> kvSimpleClient() {
return Void();
}
ACTOR Future<Void> kvClient(SimpleKeyValueStoreInteface server, std::shared_ptr<uint64_t> ops) {
ACTOR Future<Void> kvClient(SimpleKeyValueStoreInterface server, std::shared_ptr<uint64_t> ops) {
state Future<Void> timeout = delay(20);
state int rangeSize = 2 << 12;
loop {
@ -397,7 +397,7 @@ ACTOR Future<Void> throughputMeasurement(std::shared_ptr<uint64_t> operations) {
}
ACTOR Future<Void> multipleClients() {
SimpleKeyValueStoreInteface server = wait(connect());
SimpleKeyValueStoreInterface server = wait(connect());
auto ops = std::make_shared<uint64_t>(0);
std::vector<Future<Void>> clients(100);
for (auto& f : clients) {

View File

@ -101,6 +101,7 @@ std::vector<LogFile> getRelevantLogFiles(const std::vector<LogFile>& files, Vers
struct ConvertParams {
std::string container_url;
Optional<std::string> proxy;
Version begin = invalidVersion;
Version end = invalidVersion;
bool log_enabled = false;
@ -112,6 +113,10 @@ struct ConvertParams {
std::string s;
s.append("ContainerURL:");
s.append(container_url);
if (proxy.present()) {
s.append(" Proxy:");
s.append(proxy.get());
}
s.append(" Begin:");
s.append(format("%" PRId64, begin));
s.append(" End:");
@ -448,7 +453,8 @@ private:
};
ACTOR Future<Void> convert(ConvertParams params) {
state Reference<IBackupContainer> container = IBackupContainer::openContainer(params.container_url);
state Reference<IBackupContainer> container =
IBackupContainer::openContainer(params.container_url, params.proxy, {});
state BackupFileList listing = wait(container->dumpFileList());
std::sort(listing.logs.begin(), listing.logs.end());
TraceEvent("Container").detail("URL", params.container_url).detail("Logs", listing.logs.size());

View File

@ -46,6 +46,8 @@ enum {
OPT_HEX_KEY_PREFIX,
OPT_BEGIN_VERSION_FILTER,
OPT_END_VERSION_FILTER,
OPT_KNOB,
OPT_SAVE_FILE,
OPT_HELP
};
@ -72,6 +74,9 @@ CSimpleOpt::SOption gConverterOptions[] = { { OPT_CONTAINER, "-r", SO_REQ_SEP },
{ OPT_HEX_KEY_PREFIX, "--hex-prefix", SO_REQ_SEP },
{ OPT_BEGIN_VERSION_FILTER, "--begin-version-filter", SO_REQ_SEP },
{ OPT_END_VERSION_FILTER, "--end-version-filter", SO_REQ_SEP },
{ OPT_KNOB, "--knob-", SO_REQ_SEP },
{ OPT_SAVE_FILE, "-s", SO_NONE },
{ OPT_SAVE_FILE, "--save", SO_NONE },
{ OPT_HELP, "-?", SO_NONE },
{ OPT_HELP, "-h", SO_NONE },
{ OPT_HELP, "--help", SO_NONE },

View File

@ -22,21 +22,27 @@
#include <cstdlib>
#include <iostream>
#include <limits>
#include <memory>
#include <string>
#include <vector>
#include "fdbbackup/BackupTLSConfig.h"
#include "fdbclient/BuildFlags.h"
#include "fdbbackup/FileConverter.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/BackupContainer.h"
#include "fdbbackup/FileConverter.h"
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/IKnobCollection.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/MutationList.h"
#include "flow/ArgParseUtil.h"
#include "flow/IRandom.h"
#include "flow/Platform.h"
#include "flow/Trace.h"
#include "flow/flow.h"
#include "flow/serialize.h"
#include "fdbclient/BuildFlags.h"
#include "flow/actorcompiler.h" // has to be last include
#define SevDecodeInfo SevVerbose
@ -73,11 +79,14 @@ void printDecodeUsage() {
" --list-only Print file list and exit.\n"
" -k KEY_PREFIX Use the prefix for filtering mutations\n"
" --hex-prefix HEX_PREFIX\n"
" The prefix specified in HEX format, e.g., \\x05\\x01.\n"
" The prefix specified in HEX format, e.g., \"\\\\x05\\\\x01\".\n"
" --begin-version-filter BEGIN_VERSION\n"
" The version range's begin version (inclusive) for filtering.\n"
" --end-version-filter END_VERSION\n"
" The version range's end version (exclusive) for filtering.\n"
" --knob-KNOBNAME KNOBVALUE\n"
" Changes a knob value. KNOBNAME should be lowercase.\n"
" -s, --save Save a copy of downloaded files (default: not saving).\n"
"\n";
return;
}
@ -88,15 +97,19 @@ void printBuildInformation() {
struct DecodeParams {
std::string container_url;
Optional<std::string> proxy;
std::string fileFilter; // only files match the filter will be decoded
bool log_enabled = true;
std::string log_dir, trace_format, trace_log_group;
BackupTLSConfig tlsConfig;
bool list_only = false;
bool save_file_locally = false;
std::string prefix; // Key prefix for filtering
Version beginVersionFilter = 0;
Version endVersionFilter = std::numeric_limits<Version>::max();
std::vector<std::pair<std::string, std::string>> knobs;
// Returns if [begin, end) overlap with the filter range
bool overlap(Version begin, Version end) const {
// Filter [100, 200), [50,75) [200, 300)
@ -107,6 +120,10 @@ struct DecodeParams {
std::string s;
s.append("ContainerURL: ");
s.append(container_url);
if (proxy.present()) {
s.append(", Proxy: ");
s.append(proxy.get());
}
s.append(", FileFilter: ");
s.append(fileFilter);
if (log_enabled) {
@ -130,8 +147,19 @@ struct DecodeParams {
if (!prefix.empty()) {
s.append(", KeyPrefix: ").append(printable(KeyRef(prefix)));
}
for (const auto& [knob, value] : knobs) {
s.append(", KNOB-").append(knob).append(" = ").append(value);
}
s.append(", SaveFile: ").append(save_file_locally ? "true" : "false");
return s;
}
void updateKnobs() {
IKnobCollection::setupKnobs(knobs);
// Reinitialize knobs in order to update knobs that are dependent on explicitly set knobs
IKnobCollection::getMutableGlobalKnobCollection().initialize(Randomize::False, IsSimulated::False);
}
};
// Decode an ASCII string, e.g., "\x15\x1b\x19\x04\xaf\x0c\x28\x0a",
@ -256,6 +284,20 @@ int parseDecodeCommandLine(DecodeParams* param, CSimpleOpt* args) {
param->tlsConfig.blobCredentials.push_back(args->OptionArg());
break;
case OPT_KNOB: {
Optional<std::string> knobName = extractPrefixedArgument("--knob", args->OptionSyntax());
if (!knobName.present()) {
std::cerr << "ERROR: unable to parse knob option '" << args->OptionSyntax() << "'\n";
return FDB_EXIT_ERROR;
}
param->knobs.emplace_back(knobName.get(), args->OptionArg());
break;
}
case OPT_SAVE_FILE:
param->save_file_locally = true;
break;
#ifndef TLS_DISABLED
case TLSConfig::OPT_TLS_PLUGIN:
args->OptionArg();
@ -321,16 +363,17 @@ struct VersionedMutations {
*
* DecodeProgress progress(logfile);
* wait(progress->openFile(container));
* while (!progress->finished()) {
* VersionedMutations m = wait(progress->getNextBatch());
* ...
* while (1) {
* Optional<VersionedMutations> batch = wait(progress->getNextBatch());
* if (!batch.present()) break;
* ... // process the batch mutations
* }
*
* Internally, the decoding process is done block by block -- each block is
* decoded into a list of key/value pairs, which are then decoded into batches
* of mutations. Because a version's mutations can be split into many key/value
* pairs, the decoding of mutation batch needs to look ahead one more pair. So
* at any time this object might have two blocks of data in memory.
* pairs, the decoding of a mutation batch needs to look ahead to find all
* key/value pairs that belong to the same version.
*/
class DecodeProgress {
std::vector<Standalone<VectorRef<KeyValueRef>>> blocks;
@ -338,31 +381,30 @@ class DecodeProgress {
public:
DecodeProgress() = default;
DecodeProgress(const LogFile& file) : file(file) {}
DecodeProgress(const LogFile& file, bool save) : file(file), save(save) {}
// If there are no more mutations to pull from the file.
bool finished() const { return done; }
~DecodeProgress() {
if (lfd != -1) {
close(lfd);
}
}
// Open and loads file into memory
Future<Void> openFile(Reference<IBackupContainer> container) { return openFileImpl(this, container); }
// The following are private APIs:
// PRECONDITION: finished() must return false before calling this function.
// Returns the next batch of mutations along with the arena backing it.
// Note the returned batch can be empty when the file has unfinished
// version batch data that are in the next file.
VersionedMutations getNextBatch() {
ASSERT(!finished());
VersionedMutations vms;
Optional<VersionedMutations> getNextBatch() {
for (auto& [version, m] : mutationBlocksByVersion) {
if (m.isComplete()) {
VersionedMutations vms;
vms.version = version;
std::vector<MutationRef> mutations = fileBackup::decodeMutationLogValue(m.serializedMutations);
TraceEvent("Decode").detail("Version", vms.version).detail("N", mutations.size());
vms.mutations.insert(vms.mutations.end(), mutations.begin(), mutations.end());
vms.serializedMutations = m.serializedMutations;
vms.mutations = fileBackup::decodeMutationLogValue(vms.serializedMutations);
TraceEvent("Decode").detail("Version", vms.version).detail("N", vms.mutations.size());
mutationBlocksByVersion.erase(version);
return vms;
}
@ -372,13 +414,27 @@ public:
if (!mutationBlocksByVersion.empty()) {
TraceEvent(SevWarn, "UnfishedBlocks").detail("NumberOfVersions", mutationBlocksByVersion.size());
}
done = true;
return vms;
return Optional<VersionedMutations>();
}
ACTOR static Future<Void> openFileImpl(DecodeProgress* self, Reference<IBackupContainer> container) {
Reference<IAsyncFile> fd = wait(container->readFile(self->file.fileName));
self->fd = fd;
if (self->save) {
std::string dir = self->file.fileName;
std::size_t found = self->file.fileName.find_last_of('/');
if (found != std::string::npos) {
std::string path = self->file.fileName.substr(0, found);
if (!directoryExists(path)) {
platform::createDirectory(path);
}
}
self->lfd = open(self->file.fileName.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644); // mode is required with O_CREAT
if (self->lfd == -1) {
TraceEvent(SevError, "OpenLocalFileFailed").detail("File", self->file.fileName);
throw platform_error();
}
}
while (!self->eof) {
wait(readAndDecodeFile(self));
}
@ -403,10 +459,34 @@ public:
}
// Decode a file block into log_key and log_value chunks
Standalone<VectorRef<KeyValueRef>> chunks =
state Standalone<VectorRef<KeyValueRef>> chunks =
wait(fileBackup::decodeMutationLogFileBlock(self->fd, self->offset, len));
self->blocks.push_back(chunks);
if (self->save) {
ASSERT(self->lfd != -1);
// Read the chunk one more time
state Standalone<StringRef> buf = makeString(len);
int rLen = wait(self->fd->read(mutateString(buf), len, self->offset));
if (rLen != len)
throw restore_bad_read();
int wlen = write(self->lfd, buf.begin(), len);
if (wlen != len) {
TraceEvent(SevError, "WriteLocalFileFailed")
.detail("File", self->file.fileName)
.detail("Offset", self->offset)
.detail("Len", len)
.detail("Wrote", wlen);
throw platform_error();
}
TraceEvent("WriteLocalFile")
.detail("Name", self->file.fileName)
.detail("Len", len)
.detail("Offset", self->offset);
}
TraceEvent("ReadFile")
.detail("Name", self->file.fileName)
.detail("Len", len)
@ -429,7 +509,8 @@ public:
Reference<IAsyncFile> fd;
int64_t offset = 0;
bool eof = false;
bool done = false;
bool save = false;
int lfd = -1; // local file descriptor
};
ACTOR Future<Void> process_file(Reference<IBackupContainer> container, LogFile file, UID uid, DecodeParams params) {
@ -438,10 +519,14 @@ ACTOR Future<Void> process_file(Reference<IBackupContainer> container, LogFile f
return Void();
}
state DecodeProgress progress(file);
state DecodeProgress progress(file, params.save_file_locally);
wait(progress.openFile(container));
while (!progress.finished()) {
VersionedMutations vms = progress.getNextBatch();
while (true) {
auto batch = progress.getNextBatch();
if (!batch.present())
break;
const VersionedMutations& vms = batch.get();
if (vms.version < params.beginVersionFilter || vms.version >= params.endVersionFilter) {
TraceEvent("SkipVersion").detail("Version", vms.version);
continue;
@ -457,7 +542,8 @@ ACTOR Future<Void> process_file(Reference<IBackupContainer> container, LogFile f
print = m.param1.startsWith(StringRef(params.prefix));
} else if (m.type == MutationRef::ClearRange) {
KeyRange range(KeyRangeRef(m.param1, m.param2));
print = range.contains(StringRef(params.prefix));
KeyRange range2 = prefixRange(StringRef(params.prefix));
print = range.intersects(range2);
} else {
ASSERT(false);
}
@ -476,7 +562,8 @@ ACTOR Future<Void> process_file(Reference<IBackupContainer> container, LogFile f
}
ACTOR Future<Void> decode_logs(DecodeParams params) {
state Reference<IBackupContainer> container = IBackupContainer::openContainer(params.container_url);
state Reference<IBackupContainer> container =
IBackupContainer::openContainer(params.container_url, params.proxy, {});
state UID uid = deterministicRandom()->randomUniqueID();
state BackupFileList listing = wait(container->dumpFileList());
// remove partitioned logs
@ -514,10 +601,10 @@ ACTOR Future<Void> decode_logs(DecodeParams params) {
int main(int argc, char** argv) {
try {
CSimpleOpt* args =
new CSimpleOpt(argc, argv, file_converter::gConverterOptions, SO_O_EXACT | SO_O_HYPHEN_TO_UNDERSCORE);
std::unique_ptr<CSimpleOpt> args(
new CSimpleOpt(argc, argv, file_converter::gConverterOptions, SO_O_EXACT | SO_O_HYPHEN_TO_UNDERSCORE));
file_converter::DecodeParams param;
int status = file_converter::parseDecodeCommandLine(&param, args);
int status = file_converter::parseDecodeCommandLine(&param, args.get());
std::cout << "Params: " << param.toString() << "\n";
if (status != FDB_EXIT_SUCCESS) {
file_converter::printDecodeUsage();
@ -551,6 +638,9 @@ int main(int argc, char** argv) {
StringRef url(param.container_url);
setupNetwork(0, UseMetrics::True);
// Must be called after setupNetwork() to be effective
param.updateKnobs();
TraceEvent::setNetworkThread();
openTraceFile(NetworkAddress(), 10 << 20, 500 << 20, param.log_dir, "decode", param.trace_log_group);
param.tlsConfig.setupBlobCredentials();

View File

@ -130,6 +130,7 @@ enum {
OPT_USE_PARTITIONED_LOG,
// Backup and Restore constants
OPT_PROXY,
OPT_TAGNAME,
OPT_BACKUPKEYS,
OPT_WAITFORDONE,
@ -234,6 +235,7 @@ CSimpleOpt::SOption g_rgBackupStartOptions[] = {
{ OPT_NOSTOPWHENDONE, "--no-stop-when-done", SO_NONE },
{ OPT_DESTCONTAINER, "-d", SO_REQ_SEP },
{ OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP },
{ OPT_PROXY, "--proxy", SO_REQ_SEP },
// Enable "-p" option after GA
// { OPT_USE_PARTITIONED_LOG, "-p", SO_NONE },
{ OPT_USE_PARTITIONED_LOG, "--partitioned-log-experimental", SO_NONE },
@ -294,6 +296,7 @@ CSimpleOpt::SOption g_rgBackupModifyOptions[] = {
{ OPT_MOD_VERIFY_UID, "--verify-uid", SO_REQ_SEP },
{ OPT_DESTCONTAINER, "-d", SO_REQ_SEP },
{ OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP },
{ OPT_PROXY, "--proxy", SO_REQ_SEP },
{ OPT_SNAPSHOTINTERVAL, "-s", SO_REQ_SEP },
{ OPT_SNAPSHOTINTERVAL, "--snapshot-interval", SO_REQ_SEP },
{ OPT_MOD_ACTIVE_INTERVAL, "--active-snapshot-interval", SO_REQ_SEP },
@ -482,6 +485,7 @@ CSimpleOpt::SOption g_rgBackupExpireOptions[] = {
{ OPT_CLUSTERFILE, "--cluster-file", SO_REQ_SEP },
{ OPT_DESTCONTAINER, "-d", SO_REQ_SEP },
{ OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP },
{ OPT_PROXY, "--proxy", SO_REQ_SEP },
{ OPT_TRACE, "--log", SO_NONE },
{ OPT_TRACE_DIR, "--logdir", SO_REQ_SEP },
{ OPT_TRACE_FORMAT, "--trace-format", SO_REQ_SEP },
@ -517,6 +521,7 @@ CSimpleOpt::SOption g_rgBackupDeleteOptions[] = {
#endif
{ OPT_DESTCONTAINER, "-d", SO_REQ_SEP },
{ OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP },
{ OPT_PROXY, "--proxy", SO_REQ_SEP },
{ OPT_TRACE, "--log", SO_NONE },
{ OPT_TRACE_DIR, "--logdir", SO_REQ_SEP },
{ OPT_TRACE_FORMAT, "--trace-format", SO_REQ_SEP },
@ -546,6 +551,7 @@ CSimpleOpt::SOption g_rgBackupDescribeOptions[] = {
{ OPT_CLUSTERFILE, "--cluster-file", SO_REQ_SEP },
{ OPT_DESTCONTAINER, "-d", SO_REQ_SEP },
{ OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP },
{ OPT_PROXY, "--proxy", SO_REQ_SEP },
{ OPT_TRACE, "--log", SO_NONE },
{ OPT_TRACE_DIR, "--logdir", SO_REQ_SEP },
{ OPT_TRACE_FORMAT, "--trace-format", SO_REQ_SEP },
@ -578,6 +584,7 @@ CSimpleOpt::SOption g_rgBackupDumpOptions[] = {
{ OPT_CLUSTERFILE, "--cluster-file", SO_REQ_SEP },
{ OPT_DESTCONTAINER, "-d", SO_REQ_SEP },
{ OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP },
{ OPT_PROXY, "--proxy", SO_REQ_SEP },
{ OPT_TRACE, "--log", SO_NONE },
{ OPT_TRACE_DIR, "--logdir", SO_REQ_SEP },
{ OPT_TRACE_LOG_GROUP, "--loggroup", SO_REQ_SEP },
@ -652,6 +659,7 @@ CSimpleOpt::SOption g_rgBackupQueryOptions[] = {
{ OPT_RESTORE_TIMESTAMP, "--query-restore-timestamp", SO_REQ_SEP },
{ OPT_DESTCONTAINER, "-d", SO_REQ_SEP },
{ OPT_DESTCONTAINER, "--destcontainer", SO_REQ_SEP },
{ OPT_PROXY, "--proxy", SO_REQ_SEP },
{ OPT_RESTORE_VERSION, "-qrv", SO_REQ_SEP },
{ OPT_RESTORE_VERSION, "--query-restore-version", SO_REQ_SEP },
{ OPT_BACKUPKEYS_FILTER, "-k", SO_REQ_SEP },
@ -689,6 +697,7 @@ CSimpleOpt::SOption g_rgRestoreOptions[] = {
{ OPT_RESTORE_TIMESTAMP, "--timestamp", SO_REQ_SEP },
{ OPT_KNOB, "--knob-", SO_REQ_SEP },
{ OPT_RESTORECONTAINER, "-r", SO_REQ_SEP },
{ OPT_PROXY, "--proxy", SO_REQ_SEP },
{ OPT_PREFIX_ADD, "--add-prefix", SO_REQ_SEP },
{ OPT_PREFIX_REMOVE, "--remove-prefix", SO_REQ_SEP },
{ OPT_TAGNAME, "-t", SO_REQ_SEP },
@ -1920,6 +1929,7 @@ ACTOR Future<Void> submitDBBackup(Database src,
ACTOR Future<Void> submitBackup(Database db,
std::string url,
Optional<std::string> proxy,
int initialSnapshotIntervalSeconds,
int snapshotIntervalSeconds,
Standalone<VectorRef<KeyRangeRef>> backupRanges,
@ -1977,6 +1987,7 @@ ACTOR Future<Void> submitBackup(Database db,
else {
wait(backupAgent.submitBackup(db,
KeyRef(url),
proxy,
initialSnapshotIntervalSeconds,
snapshotIntervalSeconds,
tagName,
@ -2260,8 +2271,9 @@ ACTOR Future<Void> changeDBBackupResumed(Database src, Database dest, bool pause
}
Reference<IBackupContainer> openBackupContainer(const char* name,
std::string destinationContainer,
Optional<std::string> const& encryptionKeyFile = {}) {
const std::string& destinationContainer,
const Optional<std::string>& proxy,
const Optional<std::string>& encryptionKeyFile) {
// Error, if no dest container was specified
if (destinationContainer.empty()) {
fprintf(stderr, "ERROR: No backup destination was specified.\n");
@ -2271,7 +2283,7 @@ Reference<IBackupContainer> openBackupContainer(const char* name,
Reference<IBackupContainer> c;
try {
c = IBackupContainer::openContainer(destinationContainer, encryptionKeyFile);
c = IBackupContainer::openContainer(destinationContainer, proxy, encryptionKeyFile);
} catch (Error& e) {
std::string msg = format("ERROR: '%s' on URL '%s'", e.what(), destinationContainer.c_str());
if (e.code() == error_code_backup_invalid_url && !IBackupContainer::lastOpenError.empty()) {
@ -2291,6 +2303,7 @@ ACTOR Future<Void> runRestore(Database db,
std::string originalClusterFile,
std::string tagName,
std::string container,
Optional<std::string> proxy,
Standalone<VectorRef<KeyRangeRef>> ranges,
Version beginVersion,
Version targetVersion,
@ -2339,7 +2352,7 @@ ACTOR Future<Void> runRestore(Database db,
state FileBackupAgent backupAgent;
state Reference<IBackupContainer> bc =
openBackupContainer(exeRestore.toString().c_str(), container, encryptionKeyFile);
openBackupContainer(exeRestore.toString().c_str(), container, proxy, encryptionKeyFile);
// If targetVersion is unset then use the maximum restorable version from the backup description
if (targetVersion == invalidVersion) {
@ -2368,6 +2381,7 @@ ACTOR Future<Void> runRestore(Database db,
origDb,
KeyRef(tagName),
KeyRef(container),
proxy,
ranges,
waitForDone,
targetVersion,
@ -2411,6 +2425,7 @@ ACTOR Future<Void> runRestore(Database db,
ACTOR Future<Void> runFastRestoreTool(Database db,
std::string tagName,
std::string container,
Optional<std::string> proxy,
Standalone<VectorRef<KeyRangeRef>> ranges,
Version dbVersion,
bool performRestore,
@ -2440,7 +2455,7 @@ ACTOR Future<Void> runFastRestoreTool(Database db,
if (performRestore) {
if (dbVersion == invalidVersion) {
TraceEvent("FastRestoreTool").detail("TargetRestoreVersion", "Largest restorable version");
BackupDescription desc = wait(IBackupContainer::openContainer(container)->describeBackup());
BackupDescription desc = wait(IBackupContainer::openContainer(container, proxy, {})->describeBackup());
if (!desc.maxRestorableVersion.present()) {
fprintf(stderr, "The specified backup is not restorable to any version.\n");
throw restore_error();
@ -2457,6 +2472,7 @@ ACTOR Future<Void> runFastRestoreTool(Database db,
KeyRef(tagName),
ranges,
KeyRef(container),
proxy,
dbVersion,
LockDB::True,
randomUID,
@ -2478,7 +2494,7 @@ ACTOR Future<Void> runFastRestoreTool(Database db,
restoreVersion = dbVersion;
} else {
state Reference<IBackupContainer> bc = IBackupContainer::openContainer(container);
state Reference<IBackupContainer> bc = IBackupContainer::openContainer(container, proxy, {});
state BackupDescription description = wait(bc->describeBackup());
if (dbVersion <= 0) {
@ -2522,9 +2538,10 @@ ACTOR Future<Void> runFastRestoreTool(Database db,
ACTOR Future<Void> dumpBackupData(const char* name,
std::string destinationContainer,
Optional<std::string> proxy,
Version beginVersion,
Version endVersion) {
state Reference<IBackupContainer> c = openBackupContainer(name, destinationContainer);
state Reference<IBackupContainer> c = openBackupContainer(name, destinationContainer, proxy, {});
if (beginVersion < 0 || endVersion < 0) {
BackupDescription desc = wait(c->describeBackup());
@ -2552,6 +2569,7 @@ ACTOR Future<Void> dumpBackupData(const char* name,
ACTOR Future<Void> expireBackupData(const char* name,
std::string destinationContainer,
Optional<std::string> proxy,
Version endVersion,
std::string endDatetime,
Database db,
@ -2577,7 +2595,7 @@ ACTOR Future<Void> expireBackupData(const char* name,
}
try {
Reference<IBackupContainer> c = openBackupContainer(name, destinationContainer, encryptionKeyFile);
Reference<IBackupContainer> c = openBackupContainer(name, destinationContainer, proxy, encryptionKeyFile);
state IBackupContainer::ExpireProgress progress;
state std::string lastProgress;
@ -2623,9 +2641,11 @@ ACTOR Future<Void> expireBackupData(const char* name,
return Void();
}
ACTOR Future<Void> deleteBackupContainer(const char* name, std::string destinationContainer) {
ACTOR Future<Void> deleteBackupContainer(const char* name,
std::string destinationContainer,
Optional<std::string> proxy) {
try {
state Reference<IBackupContainer> c = openBackupContainer(name, destinationContainer);
state Reference<IBackupContainer> c = openBackupContainer(name, destinationContainer, proxy, {});
state int numDeleted = 0;
state Future<Void> done = c->deleteContainer(&numDeleted);
@ -2657,12 +2677,13 @@ ACTOR Future<Void> deleteBackupContainer(const char* name, std::string destinati
ACTOR Future<Void> describeBackup(const char* name,
std::string destinationContainer,
Optional<std::string> proxy,
bool deep,
Optional<Database> cx,
bool json,
Optional<std::string> encryptionKeyFile) {
try {
Reference<IBackupContainer> c = openBackupContainer(name, destinationContainer, encryptionKeyFile);
Reference<IBackupContainer> c = openBackupContainer(name, destinationContainer, proxy, encryptionKeyFile);
state BackupDescription desc = wait(c->describeBackup(deep));
if (cx.present())
wait(desc.resolveVersionTimes(cx.get()));
@ -2688,6 +2709,7 @@ static void reportBackupQueryError(UID operationId, JsonBuilderObject& result, s
// resolved to that timestamp.
ACTOR Future<Void> queryBackup(const char* name,
std::string destinationContainer,
Optional<std::string> proxy,
Standalone<VectorRef<KeyRangeRef>> keyRangesFilter,
Version restoreVersion,
std::string originalClusterFile,
@ -2734,7 +2756,7 @@ ACTOR Future<Void> queryBackup(const char* name,
}
try {
state Reference<IBackupContainer> bc = openBackupContainer(name, destinationContainer);
state Reference<IBackupContainer> bc = openBackupContainer(name, destinationContainer, proxy, {});
if (restoreVersion == invalidVersion) {
BackupDescription desc = wait(bc->describeBackup());
if (desc.maxRestorableVersion.present()) {
@ -2814,9 +2836,9 @@ ACTOR Future<Void> queryBackup(const char* name,
return Void();
}
ACTOR Future<Void> listBackup(std::string baseUrl) {
ACTOR Future<Void> listBackup(std::string baseUrl, Optional<std::string> proxy) {
try {
std::vector<std::string> containers = wait(IBackupContainer::listContainers(baseUrl));
std::vector<std::string> containers = wait(IBackupContainer::listContainers(baseUrl, proxy));
for (std::string container : containers) {
printf("%s\n", container.c_str());
}
@ -2852,6 +2874,7 @@ ACTOR Future<Void> listBackupTags(Database cx) {
struct BackupModifyOptions {
Optional<std::string> verifyUID;
Optional<std::string> destURL;
Optional<std::string> proxy;
Optional<int> snapshotIntervalSeconds;
Optional<int> activeSnapshotIntervalSeconds;
bool hasChanges() const {
@ -2869,7 +2892,7 @@ ACTOR Future<Void> modifyBackup(Database db, std::string tagName, BackupModifyOp
state Reference<IBackupContainer> bc;
if (options.destURL.present()) {
bc = openBackupContainer(exeBackup.toString().c_str(), options.destURL.get());
bc = openBackupContainer(exeBackup.toString().c_str(), options.destURL.get(), options.proxy, {});
try {
wait(timeoutError(bc->create(), 30));
} catch (Error& e) {
@ -3342,6 +3365,7 @@ int main(int argc, char* argv[]) {
break;
}
Optional<std::string> proxy;
std::string destinationContainer;
bool describeDeep = false;
bool describeTimestamps = false;
@ -3595,6 +3619,14 @@ int main(int argc, char* argv[]) {
return FDB_EXIT_ERROR;
}
break;
case OPT_PROXY:
proxy = args->OptionArg();
if (!Hostname::isHostname(proxy.get()) && !NetworkAddress::parseOptional(proxy.get()).present()) {
fprintf(stderr, "ERROR: Proxy format should be either IP:port or host:port\n");
return FDB_EXIT_ERROR;
}
modifyOptions.proxy = proxy;
break;
case OPT_DESTCONTAINER:
destinationContainer = args->OptionArg();
// If the url starts with '/' then prepend "file://" for backwards compatibility
@ -3870,33 +3902,9 @@ int main(int argc, char* argv[]) {
return FDB_EXIT_ERROR;
}
auto& g_knobs = IKnobCollection::getMutableGlobalKnobCollection();
for (const auto& [knobName, knobValueString] : knobs) {
try {
auto knobValue = g_knobs.parseKnobValue(knobName, knobValueString);
g_knobs.setKnob(knobName, knobValue);
} catch (Error& e) {
if (e.code() == error_code_invalid_option_value) {
fprintf(stderr,
"WARNING: Invalid value '%s' for knob option '%s'\n",
knobValueString.c_str(),
knobName.c_str());
TraceEvent(SevWarnAlways, "InvalidKnobValue")
.detail("Knob", printable(knobName))
.detail("Value", printable(knobValueString));
} else {
fprintf(stderr, "ERROR: Failed to set knob option '%s': %s\n", knobName.c_str(), e.what());
TraceEvent(SevError, "FailedToSetKnob")
.error(e)
.detail("Knob", printable(knobName))
.detail("Value", printable(knobValueString));
throw;
}
}
}
IKnobCollection::setupKnobs(knobs);
// Reinitialize knobs in order to update knobs that are dependent on explicitly set knobs
g_knobs.initialize(Randomize::False, IsSimulated::False);
IKnobCollection::getMutableGlobalKnobCollection().initialize(Randomize::False, IsSimulated::False);
TraceEvent("ProgramStart")
.setMaxEventLength(12000)
@ -3962,9 +3970,10 @@ int main(int argc, char* argv[]) {
if (!initCluster())
return FDB_EXIT_ERROR;
// Test out the backup url to make sure it parses. Doesn't test to make sure it's actually writeable.
openBackupContainer(argv[0], destinationContainer, encryptionKeyFile);
openBackupContainer(argv[0], destinationContainer, proxy, encryptionKeyFile);
f = stopAfter(submitBackup(db,
destinationContainer,
proxy,
initialSnapshotIntervalSeconds,
snapshotIntervalSeconds,
backupKeys,
@ -4036,6 +4045,7 @@ int main(int argc, char* argv[]) {
}
f = stopAfter(expireBackupData(argv[0],
destinationContainer,
proxy,
expireVersion,
expireDatetime,
db,
@ -4047,7 +4057,7 @@ int main(int argc, char* argv[]) {
case BackupType::DELETE_BACKUP:
initTraceFile();
f = stopAfter(deleteBackupContainer(argv[0], destinationContainer));
f = stopAfter(deleteBackupContainer(argv[0], destinationContainer, proxy));
break;
case BackupType::DESCRIBE:
@ -4060,6 +4070,7 @@ int main(int argc, char* argv[]) {
// given, but quietly skip them if not.
f = stopAfter(describeBackup(argv[0],
destinationContainer,
proxy,
describeDeep,
describeTimestamps ? Optional<Database>(db) : Optional<Database>(),
jsonOutput,
@ -4068,7 +4079,7 @@ int main(int argc, char* argv[]) {
case BackupType::LIST:
initTraceFile();
f = stopAfter(listBackup(baseUrl));
f = stopAfter(listBackup(baseUrl, proxy));
break;
case BackupType::TAGS:
@ -4081,6 +4092,7 @@ int main(int argc, char* argv[]) {
initTraceFile();
f = stopAfter(queryBackup(argv[0],
destinationContainer,
proxy,
backupKeysFilter,
restoreVersion,
restoreClusterFileOrig,
@ -4090,7 +4102,7 @@ int main(int argc, char* argv[]) {
case BackupType::DUMP:
initTraceFile();
f = stopAfter(dumpBackupData(argv[0], destinationContainer, dumpBegin, dumpEnd));
f = stopAfter(dumpBackupData(argv[0], destinationContainer, proxy, dumpBegin, dumpEnd));
break;
case BackupType::UNDEFINED:
@ -4141,6 +4153,7 @@ int main(int argc, char* argv[]) {
restoreClusterFileOrig,
tagName,
restoreContainer,
proxy,
backupKeys,
beginVersion,
restoreVersion,
@ -4218,6 +4231,7 @@ int main(int argc, char* argv[]) {
f = stopAfter(runFastRestoreTool(db,
tagName,
restoreContainer,
proxy,
backupKeys,
restoreVersion,
!dryRun,

View File

@ -265,7 +265,7 @@ CommandFactory configureFactory(
"commit_proxies=<COMMIT_PROXIES>|grv_proxies=<GRV_PROXIES>|logs=<LOGS>|resolvers=<RESOLVERS>>*|"
"count=<TSS_COUNT>|perpetual_storage_wiggle=<WIGGLE_SPEED>|perpetual_storage_wiggle_locality="
"<<LOCALITY_KEY>:<LOCALITY_VALUE>|0>|storage_migration_type={disabled|gradual|aggressive}"
"|tenant_mode={disabled|optional_experimental|required_experimental}",
"|tenant_mode={disabled|optional_experimental|required_experimental}|blob_granules_enabled={0|1}",
"change the database configuration",
"The `new' option, if present, initializes a new database with the given configuration rather than changing "
"the configuration of an existing one. When used, both a redundancy mode and a storage engine must be "

View File

@ -51,7 +51,9 @@ ACTOR Future<bool> createTenantCommandActor(Reference<IDatabase> db, std::vector
tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
try {
if (!doneExistenceCheck) {
Optional<Value> existingTenant = wait(safeThreadFutureToFuture(tr->get(tenantNameKey)));
// Hold the reference to the standalone's memory
state ThreadFuture<Optional<Value>> existingTenantFuture = tr->get(tenantNameKey);
Optional<Value> existingTenant = wait(safeThreadFutureToFuture(existingTenantFuture));
if (existingTenant.present()) {
throw tenant_already_exists();
}
@ -96,7 +98,9 @@ ACTOR Future<bool> deleteTenantCommandActor(Reference<IDatabase> db, std::vector
tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
try {
if (!doneExistenceCheck) {
Optional<Value> existingTenant = wait(safeThreadFutureToFuture(tr->get(tenantNameKey)));
// Hold the reference to the standalone's memory
state ThreadFuture<Optional<Value>> existingTenantFuture = tr->get(tenantNameKey);
Optional<Value> existingTenant = wait(safeThreadFutureToFuture(existingTenantFuture));
if (!existingTenant.present()) {
throw tenant_not_found();
}
@ -163,8 +167,10 @@ ACTOR Future<bool> listTenantsCommandActor(Reference<IDatabase> db, std::vector<
loop {
try {
RangeResult tenants = wait(safeThreadFutureToFuture(
tr->getRange(firstGreaterOrEqual(beginTenantKey), firstGreaterOrEqual(endTenantKey), limit)));
// Hold the reference to the standalone's memory
state ThreadFuture<RangeResult> kvsFuture =
tr->getRange(firstGreaterOrEqual(beginTenantKey), firstGreaterOrEqual(endTenantKey), limit);
RangeResult tenants = wait(safeThreadFutureToFuture(kvsFuture));
if (tenants.empty()) {
if (tokens.size() == 1) {
@ -213,7 +219,9 @@ ACTOR Future<bool> getTenantCommandActor(Reference<IDatabase> db, std::vector<St
loop {
try {
Optional<Value> tenant = wait(safeThreadFutureToFuture(tr->get(tenantNameKey)));
// Hold the reference to the standalone's memory
state ThreadFuture<Optional<Value>> tenantFuture = tr->get(tenantNameKey);
Optional<Value> tenant = wait(safeThreadFutureToFuture(tenantFuture));
if (!tenant.present()) {
throw tenant_not_found();
}
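Each of the tenant command actors above gets the same fix: the value returned through safeThreadFutureToFuture is only a view into memory owned by the ThreadFuture, so the future must be pinned in a state variable across the wait. A reduced sketch of the lifetime hazard, using shared_ptr ownership as a stand-in for the real ThreadFuture/Standalone types:

#include <cassert>
#include <memory>
#include <string>
#include <string_view>

// Hypothetical analog of ThreadFuture<Standalone<T>>: owns a buffer and
// hands out views into it.
struct OwningFuture {
    std::shared_ptr<std::string> buffer;
    std::string_view view() const { return *buffer; } // view into owned memory
};

OwningFuture get() { return { std::make_shared<std::string>("tenant-metadata") }; }

int main() {
    // BAD (analog of the old code): a temporary OwningFuture would die at the
    // end of the full expression, leaving the view dangling:
    //   std::string_view dangling = get().view();

    // GOOD (analog of the new code): keep the future alive, then take the view.
    OwningFuture held = get();        // "state ThreadFuture<...> f = tr->get(key);"
    std::string_view v = held.view(); // "wait(safeThreadFutureToFuture(f))"
    assert(v == "tenant-metadata");
}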

View File

@ -354,10 +354,13 @@ static std::vector<std::vector<StringRef>> parseLine(std::string& line, bool& er
forcetoken = true;
break;
case ' ':
case '\n':
case '\t':
case '\r':
if (!quoted) {
if (i > offset || (forcetoken && i == offset))
buf.push_back(StringRef((uint8_t*)(line.data() + offset), i - offset));
offset = i = line.find_first_not_of(' ', i);
offset = i = line.find_first_not_of(" \n\t\r", i);
forcetoken = false;
} else
i++;
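The parseLine change widens the token-delimiter set from a single space to " \n\t\r". A standalone sketch of that skip-delimiters loop built on find_first_not_of/find_first_of (simplified: no quoting, escaping, or token forcing):

#include <cstdio>
#include <string>
#include <vector>

static std::vector<std::string> tokenize(const std::string& line) {
    const char* ws = " \n\t\r"; // delimiter set after the change (was just ' ')
    std::vector<std::string> out;
    size_t i = 0;
    while ((i = line.find_first_not_of(ws, i)) != std::string::npos) {
        size_t end = line.find_first_of(ws, i);
        out.push_back(line.substr(i, end - i)); // substr clamps when end == npos
        i = end;
    }
    return out;
}

int main() {
    for (const auto& t : tokenize("configure\ttenant_mode=disabled \r\nstatus"))
        printf("[%s]\n", t.c_str());
}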
@ -788,8 +791,9 @@ void configureGenerator(const char* text, const char* line, std::vector<std::str
"resolvers=",
"perpetual_storage_wiggle=",
"perpetual_storage_wiggle_locality=",
"storage_migration_type="
"storage_migration_type=",
"tenant_mode=",
"blob_granules_enabled=",
nullptr };
arrayGenerator(text, line, opts, lc);
}
@ -1017,33 +1021,10 @@ struct CLIOptions {
}
void setupKnobs() {
auto& g_knobs = IKnobCollection::getMutableGlobalKnobCollection();
for (const auto& [knobName, knobValueString] : knobs) {
try {
auto knobValue = g_knobs.parseKnobValue(knobName, knobValueString);
g_knobs.setKnob(knobName, knobValue);
} catch (Error& e) {
if (e.code() == error_code_invalid_option_value) {
fprintf(stderr,
"WARNING: Invalid value '%s' for knob option '%s'\n",
knobValueString.c_str(),
knobName.c_str());
TraceEvent(SevWarnAlways, "InvalidKnobValue")
.detail("Knob", printable(knobName))
.detail("Value", printable(knobValueString));
} else {
fprintf(stderr, "ERROR: Failed to set knob option '%s': %s\n", knobName.c_str(), e.what());
TraceEvent(SevError, "FailedToSetKnob")
.error(e)
.detail("Knob", printable(knobName))
.detail("Value", printable(knobValueString));
exit_code = FDB_EXIT_ERROR;
}
}
}
IKnobCollection::setupKnobs(knobs);
// Reinitialize knobs in order to update knobs that are dependent on explicitly set knobs
g_knobs.initialize(Randomize::False, IsSimulated::False);
IKnobCollection::getMutableGlobalKnobCollection().initialize(Randomize::False, IsSimulated::False);
}
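Both fdbbackup and fdbcli now replace their open-coded parse/set loops with IKnobCollection::setupKnobs() followed by a fresh initialize(), so knobs whose defaults are derived from other knobs get recomputed after the explicit overrides land. A toy sketch of that two-phase pattern (a hypothetical Knobs type, not the real IKnobCollection API):

#include <cassert>
#include <map>
#include <string>

// Hypothetical two-knob collection where one default derives from the other.
struct Knobs {
    long bytesLimit = 1000;
    long batchBytes = 0; // derived unless explicitly overridden
    std::map<std::string, long> overrides;

    void set(const std::string& name, long v) { overrides[name] = v; }

    // Recompute defaults with overrides applied, so dependent knobs track
    // explicitly set ones.
    void initialize() {
        bytesLimit = overrides.count("bytes_limit") ? overrides["bytes_limit"] : 1000;
        batchBytes = overrides.count("batch_bytes") ? overrides["batch_bytes"]
                                                    : bytesLimit / 10; // dependent default
    }
};

int main() {
    Knobs k;
    k.set("bytes_limit", 5000); // like IKnobCollection::setupKnobs(knobs)
    k.initialize();             // like g_knobs.initialize(...) afterwards
    assert(k.batchBytes == 500); // dependent knob follows the override
}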
int processArg(CSimpleOpt& args) {

View File

@ -165,6 +165,7 @@ public:
Key backupTag,
Standalone<VectorRef<KeyRangeRef>> backupRanges,
Key bcUrl,
Optional<std::string> proxy,
Version targetVersion,
LockDB lockDB,
UID randomUID,
@ -187,6 +188,7 @@ public:
Optional<Database> cxOrig,
Key tagName,
Key url,
Optional<std::string> proxy,
Standalone<VectorRef<KeyRangeRef>> ranges,
WaitForComplete = WaitForComplete::True,
Version targetVersion = ::invalidVersion,
@ -202,6 +204,7 @@ public:
Optional<Database> cxOrig,
Key tagName,
Key url,
Optional<std::string> proxy,
WaitForComplete waitForComplete = WaitForComplete::True,
Version targetVersion = ::invalidVersion,
Verbose verbose = Verbose::True,
@ -219,6 +222,7 @@ public:
cxOrig,
tagName,
url,
proxy,
rangeRef,
waitForComplete,
targetVersion,
@ -263,6 +267,7 @@ public:
Future<Void> submitBackup(Reference<ReadYourWritesTransaction> tr,
Key outContainer,
Optional<std::string> proxy,
int initialSnapshotIntervalSeconds,
int snapshotIntervalSeconds,
std::string const& tagName,
@ -273,6 +278,7 @@ public:
Optional<std::string> const& encryptionKeyFileName = {});
Future<Void> submitBackup(Database cx,
Key outContainer,
Optional<std::string> proxy,
int initialSnapshotIntervalSeconds,
int snapshotIntervalSeconds,
std::string const& tagName,
@ -284,6 +290,7 @@ public:
return runRYWTransactionFailIfLocked(cx, [=](Reference<ReadYourWritesTransaction> tr) {
return submitBackup(tr,
outContainer,
proxy,
initialSnapshotIntervalSeconds,
snapshotIntervalSeconds,
tagName,
@ -720,20 +727,37 @@ template <>
inline Tuple Codec<Reference<IBackupContainer>>::pack(Reference<IBackupContainer> const& bc) {
Tuple tuple;
tuple.append(StringRef(bc->getURL()));
if (bc->getEncryptionKeyFileName().present()) {
tuple.append(bc->getEncryptionKeyFileName().get());
} else {
tuple.append(StringRef());
}
if (bc->getProxy().present()) {
tuple.append(StringRef(bc->getProxy().get()));
} else {
tuple.append(StringRef());
}
return tuple;
}
template <>
inline Reference<IBackupContainer> Codec<Reference<IBackupContainer>>::unpack(Tuple const& val) {
ASSERT(val.size() == 1 || val.size() == 2);
ASSERT(val.size() >= 1);
auto url = val.getString(0).toString();
Optional<std::string> encryptionKeyFileName;
if (val.size() == 2) {
if (val.size() > 1 && !val.getString(1).empty()) {
encryptionKeyFileName = val.getString(1).toString();
}
return IBackupContainer::openContainer(url, encryptionKeyFileName);
Optional<std::string> proxy;
if (val.size() > 2 && !val.getString(2).empty()) {
proxy = val.getString(2).toString();
}
return IBackupContainer::openContainer(url, proxy, encryptionKeyFileName);
}
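The codec stays compatible in both directions by packing optional fields as empty strings at fixed tuple positions and by relaxing the unpack assertion to size() >= 1, with an empty string decoded as "not present". A sketch of the same scheme with a plain vector<string> standing in for the real Tuple type:

#include <cassert>
#include <optional>
#include <string>
#include <vector>

using Tuple = std::vector<std::string>; // stand-in for the real Tuple

struct Container { std::string url; std::optional<std::string> key, proxy; };

Tuple pack(const Container& c) {
    // Fixed positions: url, encryption key file (or ""), proxy (or "").
    return { c.url, c.key.value_or(""), c.proxy.value_or("") };
}

Container unpack(const Tuple& t) {
    assert(t.size() >= 1); // older writers may have packed only 1 or 2 fields
    Container c{ t[0], std::nullopt, std::nullopt };
    if (t.size() > 1 && !t[1].empty()) c.key = t[1];
    if (t.size() > 2 && !t[2].empty()) c.proxy = t[2];
    return c;
}

int main() {
    Container c = unpack({ "blobstore://host/bucket" }); // old 1-field layout
    assert(!c.proxy);
    c = unpack(pack({ "file:///backups", {}, "proxy:8080" }));
    assert(c.proxy && *c.proxy == "proxy:8080");
}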
class BackupConfig : public KeyBackedConfig {

View File

@ -256,7 +256,8 @@ std::vector<std::string> IBackupContainer::getURLFormats() {
// Get an IBackupContainer based on a container URL string
Reference<IBackupContainer> IBackupContainer::openContainer(const std::string& url,
Optional<std::string> const& encryptionKeyFileName) {
const Optional<std::string>& proxy,
const Optional<std::string>& encryptionKeyFileName) {
static std::map<std::string, Reference<IBackupContainer>> m_cache;
Reference<IBackupContainer>& r = m_cache[url];
@ -273,7 +274,7 @@ Reference<IBackupContainer> IBackupContainer::openContainer(const std::string& u
// The URL parameters contain blobstore endpoint tunables as well as possible backup-specific options.
S3BlobStoreEndpoint::ParametersT backupParams;
Reference<S3BlobStoreEndpoint> bstore =
S3BlobStoreEndpoint::fromString(url, &resource, &lastOpenError, &backupParams);
S3BlobStoreEndpoint::fromString(url, proxy, &resource, &lastOpenError, &backupParams);
if (resource.empty())
throw backup_invalid_url();
@ -317,7 +318,7 @@ Reference<IBackupContainer> IBackupContainer::openContainer(const std::string& u
// Get a list of URLs to backup containers based on a shorter URL. This function knows about the set of supported

// URL types which support this sort of backup discovery.
ACTOR Future<std::vector<std::string>> listContainers_impl(std::string baseURL) {
ACTOR Future<std::vector<std::string>> listContainers_impl(std::string baseURL, Optional<std::string> proxy) {
try {
StringRef u(baseURL);
if (u.startsWith("file://"_sr)) {
@ -327,8 +328,8 @@ ACTOR Future<std::vector<std::string>> listContainers_impl(std::string baseURL)
std::string resource;
S3BlobStoreEndpoint::ParametersT backupParams;
Reference<S3BlobStoreEndpoint> bstore =
S3BlobStoreEndpoint::fromString(baseURL, &resource, &IBackupContainer::lastOpenError, &backupParams);
Reference<S3BlobStoreEndpoint> bstore = S3BlobStoreEndpoint::fromString(
baseURL, proxy, &resource, &IBackupContainer::lastOpenError, &backupParams);
if (!resource.empty()) {
TraceEvent(SevWarn, "BackupContainer")
@ -370,8 +371,9 @@ ACTOR Future<std::vector<std::string>> listContainers_impl(std::string baseURL)
}
}
Future<std::vector<std::string>> IBackupContainer::listContainers(const std::string& baseURL) {
return listContainers_impl(baseURL);
Future<std::vector<std::string>> IBackupContainer::listContainers(const std::string& baseURL,
const Optional<std::string>& proxy) {
return listContainers_impl(baseURL, proxy);
}
ACTOR Future<Version> timeKeeperVersionFromDatetime(std::string datetime, Database db) {

View File

@ -156,6 +156,7 @@ struct BackupFileList {
struct BackupDescription {
BackupDescription() : snapshotBytes(0) {}
std::string url;
Optional<std::string> proxy;
std::vector<KeyspaceSnapshotFile> snapshots;
int64_t snapshotBytes;
// The version before which everything has been deleted by an expire
@ -294,11 +295,14 @@ public:
// Get an IBackupContainer based on a container spec string
static Reference<IBackupContainer> openContainer(const std::string& url,
const Optional<std::string>& encryptionKeyFileName = {});
const Optional<std::string>& proxy,
const Optional<std::string>& encryptionKeyFileName);
static std::vector<std::string> getURLFormats();
static Future<std::vector<std::string>> listContainers(const std::string& baseURL);
static Future<std::vector<std::string>> listContainers(const std::string& baseURL,
const Optional<std::string>& proxy);
std::string const& getURL() const { return URL; }
Optional<std::string> const& getProxy() const { return proxy; }
Optional<std::string> const& getEncryptionKeyFileName() const { return encryptionKeyFileName; }
static std::string lastOpenError;
@ -306,6 +310,7 @@ public:
// TODO: change the following back to `private` once blob obj access is refactored
protected:
std::string URL;
Optional<std::string> proxy;
Optional<std::string> encryptionKeyFileName;
};

View File

@ -409,6 +409,7 @@ public:
Version logStartVersionOverride) {
state BackupDescription desc;
desc.url = bc->getURL();
desc.proxy = bc->getProxy();
TraceEvent("BackupContainerDescribe1")
.detail("URL", bc->getURL())
@ -1500,7 +1501,8 @@ Future<Void> BackupContainerFileSystem::createTestEncryptionKeyFile(std::string
// code but returning a different template type because you can't cast between them
Reference<BackupContainerFileSystem> BackupContainerFileSystem::openContainerFS(
const std::string& url,
Optional<std::string> const& encryptionKeyFileName) {
const Optional<std::string>& proxy,
const Optional<std::string>& encryptionKeyFileName) {
static std::map<std::string, Reference<BackupContainerFileSystem>> m_cache;
Reference<BackupContainerFileSystem>& r = m_cache[url];
@ -1517,7 +1519,7 @@ Reference<BackupContainerFileSystem> BackupContainerFileSystem::openContainerFS(
// The URL parameters contain blobstore endpoint tunables as well as possible backup-specific options.
S3BlobStoreEndpoint::ParametersT backupParams;
Reference<S3BlobStoreEndpoint> bstore =
S3BlobStoreEndpoint::fromString(url, &resource, &lastOpenError, &backupParams);
S3BlobStoreEndpoint::fromString(url, proxy, &resource, &lastOpenError, &backupParams);
if (resource.empty())
throw backup_invalid_url();
@ -1635,7 +1637,9 @@ ACTOR static Future<Void> testWriteSnapshotFile(Reference<IBackupFile> file, Key
return Void();
}
ACTOR Future<Void> testBackupContainer(std::string url, Optional<std::string> encryptionKeyFileName) {
ACTOR Future<Void> testBackupContainer(std::string url,
Optional<std::string> proxy,
Optional<std::string> encryptionKeyFileName) {
state FlowLock lock(100e6);
if (encryptionKeyFileName.present()) {
@ -1644,7 +1648,7 @@ ACTOR Future<Void> testBackupContainer(std::string url, Optional<std::string> en
printf("BackupContainerTest URL %s\n", url.c_str());
state Reference<IBackupContainer> c = IBackupContainer::openContainer(url, encryptionKeyFileName);
state Reference<IBackupContainer> c = IBackupContainer::openContainer(url, proxy, encryptionKeyFileName);
// Make sure container doesn't exist, then create it.
try {
@ -1789,12 +1793,13 @@ ACTOR Future<Void> testBackupContainer(std::string url, Optional<std::string> en
}
TEST_CASE("/backup/containers/localdir/unencrypted") {
wait(testBackupContainer(format("file://%s/fdb_backups/%llx", params.getDataDir().c_str(), timer_int()), {}));
wait(testBackupContainer(format("file://%s/fdb_backups/%llx", params.getDataDir().c_str(), timer_int()), {}, {}));
return Void();
}
TEST_CASE("/backup/containers/localdir/encrypted") {
wait(testBackupContainer(format("file://%s/fdb_backups/%llx", params.getDataDir().c_str(), timer_int()),
{},
format("%s/test_encryption_key", params.getDataDir().c_str())));
return Void();
}
@ -1803,7 +1808,7 @@ TEST_CASE("/backup/containers/url") {
if (!g_network->isSimulated()) {
const char* url = getenv("FDB_TEST_BACKUP_URL");
ASSERT(url != nullptr);
wait(testBackupContainer(url, {}));
wait(testBackupContainer(url, {}, {}));
}
return Void();
}
@ -1813,7 +1818,7 @@ TEST_CASE("/backup/containers_list") {
state const char* url = getenv("FDB_TEST_BACKUP_URL");
ASSERT(url != nullptr);
printf("Listing %s\n", url);
std::vector<std::string> urls = wait(IBackupContainer::listContainers(url));
std::vector<std::string> urls = wait(IBackupContainer::listContainers(url, {}));
for (auto& u : urls) {
printf("%s\n", u.c_str());
}

View File

@ -81,9 +81,9 @@ public:
Future<bool> exists() override = 0;
// TODO: refactor this to separate out the "deal with blob store" stuff from the backup business logic
static Reference<BackupContainerFileSystem> openContainerFS(
const std::string& url,
const Optional<std::string>& encryptionKeyFileName = {});
static Reference<BackupContainerFileSystem> openContainerFS(const std::string& url,
const Optional<std::string>& proxy,
const Optional<std::string>& encryptionKeyFileName);
// Get a list of fileNames and their sizes in the container under the given path
// Although not required, an implementation can avoid traversing unwanted subfolders

View File

@ -52,19 +52,20 @@ struct BlobFilePointerRef {
StringRef filename;
int64_t offset;
int64_t length;
int64_t fullFileLength;
BlobFilePointerRef() {}
BlobFilePointerRef(Arena& to, const std::string& filename, int64_t offset, int64_t length)
: filename(to, filename), offset(offset), length(length) {}
BlobFilePointerRef(Arena& to, const std::string& filename, int64_t offset, int64_t length, int64_t fullFileLength)
: filename(to, filename), offset(offset), length(length), fullFileLength(fullFileLength) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, filename, offset, length);
serializer(ar, filename, offset, length, fullFileLength);
}
std::string toString() const {
std::stringstream ss;
ss << filename.toString() << ":" << offset << ":" << length;
ss << filename.toString() << ":" << offset << ":" << length << ":" << fullFileLength;
return std::move(ss).str();
}
};
@ -77,17 +78,18 @@ struct BlobGranuleChunkRef {
constexpr static FileIdentifier file_identifier = 865198;
KeyRangeRef keyRange;
Version includedVersion;
Version snapshotVersion;
Optional<BlobFilePointerRef> snapshotFile; // not set if it's an incremental read
VectorRef<BlobFilePointerRef> deltaFiles;
GranuleDeltas newDeltas;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, keyRange, includedVersion, snapshotFile, deltaFiles, newDeltas);
serializer(ar, keyRange, includedVersion, snapshotVersion, snapshotFile, deltaFiles, newDeltas);
}
};
enum BlobGranuleSplitState { Unknown = 0, Started = 1, Assigned = 2, Done = 3 };
enum BlobGranuleSplitState { Unknown = 0, Initialized = 1, Assigned = 2, Done = 3 };
struct BlobGranuleHistoryValue {
constexpr static FileIdentifier file_identifier = 991434;

View File

@ -18,9 +18,12 @@
* limitations under the License.
*/
#include <vector>
#include "contrib/fmt-8.1.1/include/fmt/format.h"
#include "flow/serialize.h"
#include "fdbclient/BlobGranuleFiles.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/SystemData.h" // for allKeys unit test - could remove
#include "flow/UnitTest.h"
@ -119,29 +122,43 @@ static void applyDelta(KeyRangeRef keyRange, MutationRef m, std::map<KeyRef, Val
static void applyDeltas(const GranuleDeltas& deltas,
KeyRangeRef keyRange,
Version beginVersion,
Version readVersion,
Version& lastFileEndVersion,
std::map<KeyRef, ValueRef>& dataMap) {
if (!deltas.empty()) {
// check that consecutive delta file versions are disjoint
ASSERT(lastFileEndVersion < deltas.front().version);
if (deltas.empty()) {
return;
}
for (const MutationsAndVersionRef& delta : deltas) {
if (delta.version > readVersion) {
// check that consecutive delta file versions are disjoint
ASSERT(lastFileEndVersion < deltas.front().version);
const MutationsAndVersionRef* mutationIt = deltas.begin();
// prune beginVersion if necessary
if (beginVersion > deltas.front().version) {
ASSERT(beginVersion <= deltas.back().version);
// binary search for beginVersion
mutationIt = std::lower_bound(deltas.begin(),
deltas.end(),
MutationsAndVersionRef(beginVersion, 0),
MutationsAndVersionRef::OrderByVersion());
}
while (mutationIt != deltas.end()) {
if (mutationIt->version > readVersion) {
lastFileEndVersion = readVersion;
return;
}
for (auto& m : delta.mutations) {
for (auto& m : mutationIt->mutations) {
applyDelta(keyRange, m, dataMap);
}
mutationIt++;
}
if (!deltas.empty()) {
lastFileEndVersion = deltas.back().version;
}
lastFileEndVersion = deltas.back().version;
}
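applyDeltas now honors beginVersion by binary-searching for the first delta at or above it rather than scanning from the front, and entries above readVersion still terminate the walk. A standalone sketch of that version window (hypothetical Delta type):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

using Version = int64_t;
struct Delta { Version version; const char* mutation; };

void applyWindow(const std::vector<Delta>& deltas, Version begin, Version read) {
    if (deltas.empty()) return;
    auto it = deltas.begin();
    if (begin > deltas.front().version) {
        // binary search for the first delta with version >= begin
        it = std::lower_bound(deltas.begin(), deltas.end(), begin,
                              [](const Delta& d, Version v) { return d.version < v; });
    }
    for (; it != deltas.end() && it->version <= read; ++it)
        printf("apply v%lld: %s\n", (long long)it->version, it->mutation);
}

int main() {
    std::vector<Delta> d = { { 10, "set a" }, { 20, "set b" }, { 30, "clear c" } };
    applyWindow(d, /*begin=*/15, /*read=*/25); // applies only v20
}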
static Arena loadDeltaFile(StringRef deltaData,
KeyRangeRef keyRange,
Version beginVersion,
Version readVersion,
Version& lastFileEndVersion,
std::map<KeyRef, ValueRef>& dataMap) {
@ -151,7 +168,7 @@ static Arena loadDeltaFile(StringRef deltaData,
reader.deserialize(FileIdentifierFor<GranuleDeltas>::value, deltas, parseArena);
if (BG_READ_DEBUG) {
fmt::print("Parsed {}} deltas from file\n", deltas.size());
fmt::print("Parsed {} deltas from file\n", deltas.size());
}
// TODO REMOVE sanity check
@ -163,19 +180,18 @@ static Arena loadDeltaFile(StringRef deltaData,
ASSERT(deltas[i].version <= deltas[i + 1].version);
}
applyDeltas(deltas, keyRange, readVersion, lastFileEndVersion, dataMap);
applyDeltas(deltas, keyRange, beginVersion, readVersion, lastFileEndVersion, dataMap);
return parseArena;
}
RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
KeyRangeRef keyRange,
Version beginVersion,
Version readVersion,
Optional<StringRef> snapshotData,
StringRef deltaFileData[]) {
// TODO REMOVE with V2 of protocol
// TODO REMOVE with early replying
ASSERT(readVersion == chunk.includedVersion);
ASSERT(chunk.snapshotFile.present());
ASSERT(snapshotData.present());
// Arena to hold all allocations for applying deltas. Most of it, and the arenas produced by reading the files,
// will likely be tossed if there are a significant number of mutations, so we copy at the end instead of doing a
@ -195,13 +211,14 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
fmt::print("Applying {} delta files\n", chunk.deltaFiles.size());
}
for (int deltaIdx = 0; deltaIdx < chunk.deltaFiles.size(); deltaIdx++) {
Arena deltaArena = loadDeltaFile(deltaFileData[deltaIdx], keyRange, readVersion, lastFileEndVersion, dataMap);
Arena deltaArena =
loadDeltaFile(deltaFileData[deltaIdx], keyRange, beginVersion, readVersion, lastFileEndVersion, dataMap);
arena.dependsOn(deltaArena);
}
if (BG_READ_DEBUG) {
fmt::print("Applying {} memory deltas\n", chunk.newDeltas.size());
}
applyDeltas(chunk.newDeltas, keyRange, readVersion, lastFileEndVersion, dataMap);
applyDeltas(chunk.newDeltas, keyRange, beginVersion, readVersion, lastFileEndVersion, dataMap);
RangeResult ret;
for (auto& it : dataMap) {
@ -211,50 +228,90 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
return ret;
}
struct GranuleLoadIds {
Optional<int64_t> snapshotId;
std::vector<int64_t> deltaIds;
};
static void startLoad(const ReadBlobGranuleContext granuleContext,
const BlobGranuleChunkRef& chunk,
GranuleLoadIds& loadIds) {
// Start load process for all files in chunk
if (chunk.snapshotFile.present()) {
std::string snapshotFname = chunk.snapshotFile.get().filename.toString();
// FIXME: remove when we implement file multiplexing
ASSERT(chunk.snapshotFile.get().offset == 0);
ASSERT(chunk.snapshotFile.get().length == chunk.snapshotFile.get().fullFileLength);
loadIds.snapshotId = granuleContext.start_load_f(snapshotFname.c_str(),
snapshotFname.size(),
chunk.snapshotFile.get().offset,
chunk.snapshotFile.get().length,
chunk.snapshotFile.get().fullFileLength,
granuleContext.userContext);
}
loadIds.deltaIds.reserve(chunk.deltaFiles.size());
for (int deltaFileIdx = 0; deltaFileIdx < chunk.deltaFiles.size(); deltaFileIdx++) {
std::string deltaFName = chunk.deltaFiles[deltaFileIdx].filename.toString();
// FIXME: remove when we implement file multiplexing
ASSERT(chunk.deltaFiles[deltaFileIdx].offset == 0);
ASSERT(chunk.deltaFiles[deltaFileIdx].length == chunk.deltaFiles[deltaFileIdx].fullFileLength);
int64_t deltaLoadId = granuleContext.start_load_f(deltaFName.c_str(),
deltaFName.size(),
chunk.deltaFiles[deltaFileIdx].offset,
chunk.deltaFiles[deltaFileIdx].length,
chunk.deltaFiles[deltaFileIdx].fullFileLength,
granuleContext.userContext);
loadIds.deltaIds.push_back(deltaLoadId);
}
}
ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<BlobGranuleChunkRef>>& files,
const KeyRangeRef& keyRange,
Version beginVersion,
Version readVersion,
ReadBlobGranuleContext granuleContext) {
int64_t parallelism = granuleContext.granuleParallelism;
if (parallelism < 1) {
parallelism = 1;
}
if (parallelism >= CLIENT_KNOBS->BG_MAX_GRANULE_PARALLELISM) {
parallelism = CLIENT_KNOBS->BG_MAX_GRANULE_PARALLELISM;
}
GranuleLoadIds loadIds[files.size()];
// Kick off first file reads if parallelism > 1
for (int i = 0; i < parallelism - 1 && i < files.size(); i++) {
startLoad(granuleContext, files[i], loadIds[i]);
}
try {
RangeResult results;
// FIXME: could submit multiple chunks to start_load_f in parallel?
for (const BlobGranuleChunkRef& chunk : files) {
for (int chunkIdx = 0; chunkIdx < files.size(); chunkIdx++) {
// Kick off files for this granule if parallelism == 1, or future granule if parallelism > 1
if (chunkIdx + parallelism - 1 < files.size()) {
startLoad(granuleContext, files[chunkIdx + parallelism - 1], loadIds[chunkIdx + parallelism - 1]);
}
RangeResult chunkRows;
int64_t snapshotLoadId;
int64_t deltaLoadIds[chunk.deltaFiles.size()];
// Start load process for all files in chunk
// In V1 of api snapshot is required, optional is just for forward compatibility
ASSERT(chunk.snapshotFile.present());
std::string snapshotFname = chunk.snapshotFile.get().filename.toString();
snapshotLoadId = granuleContext.start_load_f(snapshotFname.c_str(),
snapshotFname.size(),
chunk.snapshotFile.get().offset,
chunk.snapshotFile.get().length,
granuleContext.userContext);
int64_t deltaLoadLengths[chunk.deltaFiles.size()];
StringRef deltaData[chunk.deltaFiles.size()];
for (int deltaFileIdx = 0; deltaFileIdx < chunk.deltaFiles.size(); deltaFileIdx++) {
std::string deltaFName = chunk.deltaFiles[deltaFileIdx].filename.toString();
deltaLoadIds[deltaFileIdx] = granuleContext.start_load_f(deltaFName.c_str(),
deltaFName.size(),
chunk.deltaFiles[deltaFileIdx].offset,
chunk.deltaFiles[deltaFileIdx].length,
granuleContext.userContext);
deltaLoadLengths[deltaFileIdx] = chunk.deltaFiles[deltaFileIdx].length;
}
// once all loads kicked off, load data for chunk
StringRef snapshotData(granuleContext.get_load_f(snapshotLoadId, granuleContext.userContext),
chunk.snapshotFile.get().length);
if (!snapshotData.begin()) {
return ErrorOr<RangeResult>(blob_granule_file_load_error());
Optional<StringRef> snapshotData;
if (files[chunkIdx].snapshotFile.present()) {
snapshotData =
StringRef(granuleContext.get_load_f(loadIds[chunkIdx].snapshotId.get(), granuleContext.userContext),
files[chunkIdx].snapshotFile.get().length);
if (!snapshotData.get().begin()) {
return ErrorOr<RangeResult>(blob_granule_file_load_error());
}
}
for (int i = 0; i < chunk.deltaFiles.size(); i++) {
deltaData[i] = StringRef(granuleContext.get_load_f(deltaLoadIds[i], granuleContext.userContext),
chunk.deltaFiles[i].length);
StringRef deltaData[files[chunkIdx].deltaFiles.size()];
for (int i = 0; i < files[chunkIdx].deltaFiles.size(); i++) {
deltaData[i] =
StringRef(granuleContext.get_load_f(loadIds[chunkIdx].deltaIds[i], granuleContext.userContext),
files[chunkIdx].deltaFiles[i].length);
// null data is error
if (!deltaData[i].begin()) {
return ErrorOr<RangeResult>(blob_granule_file_load_error());
@ -262,14 +319,17 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
}
// materialize rows from chunk
chunkRows = materializeBlobGranule(chunk, keyRange, readVersion, snapshotData, deltaData);
chunkRows =
materializeBlobGranule(files[chunkIdx], keyRange, beginVersion, readVersion, snapshotData, deltaData);
results.arena().dependsOn(chunkRows.arena());
results.append(results.arena(), chunkRows.begin(), chunkRows.size());
granuleContext.free_load_f(snapshotLoadId, granuleContext.userContext);
for (int i = 0; i < chunk.deltaFiles.size(); i++) {
granuleContext.free_load_f(deltaLoadIds[i], granuleContext.userContext);
if (loadIds[chunkIdx].snapshotId.present()) {
granuleContext.free_load_f(loadIds[chunkIdx].snapshotId.get(), granuleContext.userContext);
}
for (int i = 0; i < loadIds[chunkIdx].deltaIds.size(); i++) {
granuleContext.free_load_f(loadIds[chunkIdx].deltaIds[i], granuleContext.userContext);
}
}
return ErrorOr<RangeResult>(results);
@ -278,8 +338,7 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
}
}
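loadAndMaterializeBlobGranules keeps up to granuleParallelism file loads in flight: it primes the window with parallelism - 1 loads, then starts the load for chunk i + parallelism - 1 just before materializing chunk i. A reduced sketch of that sliding window, using std::async in place of the start_load_f callback machinery:

#include <cstdio>
#include <future>
#include <string>
#include <vector>

static std::string loadChunk(int i) { return "chunk-" + std::to_string(i); }

int main() {
    const int n = 6, parallelism = 3;
    std::vector<std::future<std::string>> inflight(n);

    // Prime the window: start the first (parallelism - 1) loads.
    for (int i = 0; i < parallelism - 1 && i < n; i++)
        inflight[i] = std::async(std::launch::async, loadChunk, i);

    for (int i = 0; i < n; i++) {
        // Keep the window full: start the load `parallelism - 1` ahead.
        if (i + parallelism - 1 < n)
            inflight[i + parallelism - 1] =
                std::async(std::launch::async, loadChunk, i + parallelism - 1);
        printf("materialize %s\n", inflight[i].get().c_str()); // blocks if not done
    }
}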
// FIXME: re-enable test!
TEST_CASE(":/blobgranule/files/applyDelta") {
TEST_CASE("/blobgranule/files/applyDelta") {
printf("Testing blob granule delta applying\n");
Arena a;

View File

@ -33,6 +33,7 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
KeyRangeRef keyRange,
Version beginVersion,
Version readVersion,
Optional<StringRef> snapshotData,
StringRef deltaFileData[]);

View File

@ -28,6 +28,7 @@
#include "fdbclient/BlobGranuleReader.actor.h"
#include "fdbclient/BlobWorkerCommon.h"
#include "fdbclient/BlobWorkerInterface.h"
#include "fdbclient/FDBTypes.h"
#include "flow/actorcompiler.h" // This must be the last #include.
// TODO more efficient data structure besides std::map? PTree is unnecessary since this isn't versioned, but some other
@ -52,7 +53,6 @@ ACTOR Future<Standalone<StringRef>> readFile(Reference<BackupContainerFileSystem
StringRef dataRef(data, f.length);
return Standalone<StringRef>(dataRef, arena);
} catch (Error& e) {
printf("Reading file %s got error %s\n", f.toString().c_str(), e.name());
throw e;
}
}
@ -64,22 +64,25 @@ ACTOR Future<Standalone<StringRef>> readFile(Reference<BackupContainerFileSystem
// sub-functions that BlobGranuleFiles actually exposes?
ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
KeyRangeRef keyRange,
Version beginVersion,
Version readVersion,
Reference<BackupContainerFileSystem> bstore,
Optional<BlobWorkerStats*> stats) {
// TODO REMOVE with V2 of protocol
// TODO REMOVE with early replying
ASSERT(readVersion == chunk.includedVersion);
ASSERT(chunk.snapshotFile.present());
state Arena arena;
try {
Future<Standalone<StringRef>> readSnapshotFuture = readFile(bstore, chunk.snapshotFile.get());
state std::vector<Future<Standalone<StringRef>>> readDeltaFutures;
if (stats.present()) {
++stats.get()->s3GetReqs;
Future<Standalone<StringRef>> readSnapshotFuture;
if (chunk.snapshotFile.present()) {
readSnapshotFuture = readFile(bstore, chunk.snapshotFile.get());
if (stats.present()) {
++stats.get()->s3GetReqs;
}
}
state std::vector<Future<Standalone<StringRef>>> readDeltaFutures;
readDeltaFutures.reserve(chunk.deltaFiles.size());
for (BlobFilePointerRef deltaFile : chunk.deltaFiles) {
@ -89,8 +92,12 @@ ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
}
}
state Standalone<StringRef> snapshotData = wait(readSnapshotFuture);
arena.dependsOn(snapshotData.arena());
state Optional<StringRef> snapshotData; // not present if snapshotFile isn't present
if (chunk.snapshotFile.present()) {
state Standalone<StringRef> s = wait(readSnapshotFuture);
arena.dependsOn(s.arena());
snapshotData = s;
}
state int numDeltaFiles = chunk.deltaFiles.size();
state StringRef* deltaData = new (arena) StringRef[numDeltaFiles];
@ -103,10 +110,9 @@ ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
arena.dependsOn(data.arena());
}
return materializeBlobGranule(chunk, keyRange, readVersion, snapshotData, deltaData);
return materializeBlobGranule(chunk, keyRange, beginVersion, readVersion, snapshotData, deltaData);
} catch (Error& e) {
printf("Reading blob granule got error %s\n", e.name());
throw e;
}
}
@ -121,18 +127,12 @@ ACTOR Future<Void> readBlobGranules(BlobGranuleFileRequest request,
try {
state int i;
for (i = 0; i < reply.chunks.size(); i++) {
/*printf("ReadBlobGranules processing chunk %d [%s - %s)\n",
i,
reply.chunks[i].keyRange.begin.printable().c_str(),
reply.chunks[i].keyRange.end.printable().c_str());*/
RangeResult chunkResult =
wait(readBlobGranule(reply.chunks[i], request.keyRange, request.readVersion, bstore));
RangeResult chunkResult = wait(
readBlobGranule(reply.chunks[i], request.keyRange, request.beginVersion, request.readVersion, bstore));
results.send(std::move(chunkResult));
}
// printf("ReadBlobGranules done, sending EOS\n");
results.sendError(end_of_stream());
} catch (Error& e) {
printf("ReadBlobGranules got error %s\n", e.name());
results.sendError(e);
}

View File

@ -40,6 +40,7 @@
// the request
ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
KeyRangeRef keyRange,
Version beginVersion,
Version readVersion,
Reference<BackupContainerFileSystem> bstore,
Optional<BlobWorkerStats*> stats = Optional<BlobWorkerStats*>());

View File

@ -37,10 +37,14 @@ struct BlobWorkerStats {
Counter readReqDeltaBytesReturned;
Counter commitVersionChecks;
Counter granuleUpdateErrors;
Counter granuleRequestTimeouts;
Counter readRequestsWithBegin;
Counter readRequestsCollapsed;
int numRangesAssigned;
int mutationBytesBuffered;
int activeReadRequests;
int granulesPendingSplitCheck;
Future<Void> logger;
@ -57,10 +61,13 @@ struct BlobWorkerStats {
wrongShardServer("WrongShardServer", cc), changeFeedInputBytes("RangeFeedInputBytes", cc),
readReqTotalFilesReturned("ReadReqTotalFilesReturned", cc),
readReqDeltaBytesReturned("ReadReqDeltaBytesReturned", cc), commitVersionChecks("CommitVersionChecks", cc),
granuleUpdateErrors("GranuleUpdateErrors", cc), numRangesAssigned(0), mutationBytesBuffered(0) {
granuleUpdateErrors("GranuleUpdateErrors", cc), granuleRequestTimeouts("GranuleRequestTimeouts", cc),
readRequestsWithBegin("ReadRequestsWithBegin", cc), readRequestsCollapsed("ReadRequestsCollapsed", cc),
numRangesAssigned(0), mutationBytesBuffered(0), activeReadRequests(0), granulesPendingSplitCheck(0) {
specialCounter(cc, "NumRangesAssigned", [this]() { return this->numRangesAssigned; });
specialCounter(cc, "MutationBytesBuffered", [this]() { return this->mutationBytesBuffered; });
specialCounter(cc, "ActiveReadRequests", [this]() { return this->activeReadRequests; });
specialCounter(cc, "GranulesPendingSplitCheck", [this]() { return this->granulesPendingSplitCheck; });
logger = traceCounters("BlobWorkerMetrics", id, interval, &cc, "BlobWorkerMetrics");
}

View File

@ -34,6 +34,7 @@ struct BlobWorkerInterface {
RequestStream<struct BlobGranuleFileRequest> blobGranuleFileRequest;
RequestStream<struct AssignBlobRangeRequest> assignBlobRangeRequest;
RequestStream<struct RevokeBlobRangeRequest> revokeBlobRangeRequest;
RequestStream<struct GetGranuleAssignmentsRequest> granuleAssignmentsRequest;
RequestStream<struct GranuleStatusStreamRequest> granuleStatusStreamRequest;
RequestStream<struct HaltBlobWorkerRequest> haltBlobWorker;
@ -58,6 +59,7 @@ struct BlobWorkerInterface {
blobGranuleFileRequest,
assignBlobRangeRequest,
revokeBlobRangeRequest,
granuleAssignmentsRequest,
granuleStatusStreamRequest,
haltBlobWorker,
locality,
@ -84,26 +86,14 @@ struct BlobGranuleFileRequest {
KeyRangeRef keyRange;
Version beginVersion = 0;
Version readVersion;
bool canCollapseBegin = true;
ReplyPromise<BlobGranuleFileReply> reply;
BlobGranuleFileRequest() {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, keyRange, beginVersion, readVersion, reply, arena);
}
};
struct AssignBlobRangeReply {
constexpr static FileIdentifier file_identifier = 6431923;
bool epochOk; // false if the worker has seen a new manager
AssignBlobRangeReply() {}
explicit AssignBlobRangeReply(bool epochOk) : epochOk(epochOk) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, epochOk);
serializer(ar, keyRange, beginVersion, readVersion, canCollapseBegin, reply, arena);
}
};
@ -114,7 +104,7 @@ struct RevokeBlobRangeRequest {
int64_t managerEpoch;
int64_t managerSeqno;
bool dispose;
ReplyPromise<AssignBlobRangeReply> reply;
ReplyPromise<Void> reply;
RevokeBlobRangeRequest() {}
@ -124,6 +114,12 @@ struct RevokeBlobRangeRequest {
}
};
/*
* Normal: Blob worker should open the granule and start processing it
* Continue: Blob worker should continue handling a granule that was evaluated for a split
*/
enum AssignRequestType { Normal = 0, Continue = 1 };
struct AssignBlobRangeRequest {
constexpr static FileIdentifier file_identifier = 905381;
Arena arena;
@ -133,16 +129,15 @@ struct AssignBlobRangeRequest {
// If continueAssignment is true, this is just to instruct the worker that it *still* owns the range, so it should
// re-snapshot it and continue.
// For an initial assignment, reassignment, split, or merge, continueAssignment==false.
bool continueAssignment;
AssignRequestType type;
ReplyPromise<AssignBlobRangeReply> reply;
ReplyPromise<Void> reply;
AssignBlobRangeRequest() {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, keyRange, managerEpoch, managerSeqno, continueAssignment, reply, arena);
serializer(ar, keyRange, managerEpoch, managerSeqno, type, reply, arena);
}
};
@ -153,22 +148,22 @@ struct GranuleStatusReply : public ReplyPromiseStreamReply {
KeyRange granuleRange;
bool doSplit;
bool writeHotSplit;
int64_t epoch;
int64_t seqno;
UID granuleID;
Version startVersion;
Version latestVersion;
GranuleStatusReply() {}
explicit GranuleStatusReply(KeyRange range,
bool doSplit,
bool writeHotSplit,
int64_t epoch,
int64_t seqno,
UID granuleID,
Version startVersion,
Version latestVersion)
: granuleRange(range), doSplit(doSplit), epoch(epoch), seqno(seqno), granuleID(granuleID),
startVersion(startVersion), latestVersion(latestVersion) {}
Version startVersion)
: granuleRange(range), doSplit(doSplit), writeHotSplit(writeHotSplit), epoch(epoch), seqno(seqno),
granuleID(granuleID), startVersion(startVersion) {}
int expectedSize() const { return sizeof(GranuleStatusReply) + granuleRange.expectedSize(); }
@ -179,11 +174,11 @@ struct GranuleStatusReply : public ReplyPromiseStreamReply {
ReplyPromiseStreamReply::sequence,
granuleRange,
doSplit,
writeHotSplit,
epoch,
seqno,
granuleID,
startVersion,
latestVersion);
startVersion);
}
};
@ -220,4 +215,42 @@ struct HaltBlobWorkerRequest {
}
};
struct GranuleAssignmentRef {
KeyRangeRef range;
int64_t epochAssigned;
int64_t seqnoAssigned;
GranuleAssignmentRef() {}
explicit GranuleAssignmentRef(KeyRangeRef range, int64_t epochAssigned, int64_t seqnoAssigned)
: range(range), epochAssigned(epochAssigned), seqnoAssigned(seqnoAssigned) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, range, epochAssigned, seqnoAssigned);
}
};
struct GetGranuleAssignmentsReply {
constexpr static FileIdentifier file_identifier = 9191718;
Arena arena;
VectorRef<GranuleAssignmentRef> assignments;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, assignments, arena);
}
};
struct GetGranuleAssignmentsRequest {
constexpr static FileIdentifier file_identifier = 4121494;
int64_t managerEpoch;
ReplyPromise<GetGranuleAssignmentsReply> reply;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, managerEpoch, reply);
}
};
#endif

View File

@ -205,6 +205,17 @@ if(BUILD_AZURE_BACKUP)
)
endif()
if(WITH_AWS_BACKUP)
add_compile_definitions(BUILD_AWS_BACKUP)
set(FDBCLIENT_SRCS
${FDBCLIENT_SRCS}
FDBAWSCredentialsProvider.h)
include(awssdk)
endif()
add_flow_target(STATIC_LIBRARY NAME fdbclient SRCS ${FDBCLIENT_SRCS} ADDL_SRCS ${options_srcs})
add_dependencies(fdbclient fdboptions)
target_link_libraries(fdbclient PUBLIC fdbrpc msgpack)
@ -224,3 +235,8 @@ if(BUILD_AZURE_BACKUP)
target_link_libraries(fdbclient PRIVATE curl uuid azure-storage-lite)
target_link_libraries(fdbclient_sampling PRIVATE curl uuid azure-storage-lite)
endif()
if(BUILD_AWS_BACKUP)
target_link_libraries(fdbclient PUBLIC awssdk_target)
target_link_libraries(fdbclient_sampling PUBLIC awssdk_target)
endif()

View File

@ -69,7 +69,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( RESOURCE_CONSTRAINED_MAX_BACKOFF, 30.0 );
init( PROXY_COMMIT_OVERHEAD_BYTES, 23 ); //The size of serializing 7 tags (3 primary, 3 remote, 1 log router) + 2 for the tag length
init( SHARD_STAT_SMOOTH_AMOUNT, 5.0 );
init( INIT_MID_SHARD_BYTES, 50000000 ); if( randomize && BUGGIFY ) INIT_MID_SHARD_BYTES = 40000; else if(randomize && !BUGGIFY) INIT_MID_SHARD_BYTES = 200000; // The same value as SERVER_KNOBS->MIN_SHARD_BYTES
init( INIT_MID_SHARD_BYTES, 50000000 ); if( randomize && BUGGIFY ) INIT_MID_SHARD_BYTES = 40000; else if(randomize && BUGGIFY_WITH_PROB(0.75)) INIT_MID_SHARD_BYTES = 200000; // The same value as SERVER_KNOBS->MIN_SHARD_BYTES
init( TRANSACTION_SIZE_LIMIT, 1e7 );
init( KEY_SIZE_LIMIT, 1e4 );
@ -80,6 +80,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( CHANGE_FEED_LOCATION_LIMIT, 10000 );
init( CHANGE_FEED_CACHE_SIZE, 100000 ); if( randomize && BUGGIFY ) CHANGE_FEED_CACHE_SIZE = 1;
init( CHANGE_FEED_POP_TIMEOUT, 5.0 );
init( CHANGE_FEED_STREAM_MIN_BYTES, 1e4 ); if( randomize && BUGGIFY ) CHANGE_FEED_STREAM_MIN_BYTES = 1;
init( MAX_BATCH_SIZE, 1000 ); if( randomize && BUGGIFY ) MAX_BATCH_SIZE = 1;
init( GRV_BATCH_TIMEOUT, 0.005 ); if( randomize && BUGGIFY ) GRV_BATCH_TIMEOUT = 0.1;
@ -275,8 +276,12 @@ void ClientKnobs::initialize(Randomize randomize) {
init( BUSYNESS_SPIKE_START_THRESHOLD, 0.100 );
init( BUSYNESS_SPIKE_SATURATED_THRESHOLD, 0.500 );
// blob granules
init( ENABLE_BLOB_GRANULES, false );
// multi-version client control
init( MVC_CLIENTLIB_CHUNK_SIZE, 8*1024 );
init( MVC_CLIENTLIB_CHUNKS_PER_TRANSACTION, 32 );
// Blob granules
init( BG_MAX_GRANULE_PARALLELISM, 10 );
// clang-format on
}

View File

@ -79,6 +79,7 @@ public:
int64_t CHANGE_FEED_LOCATION_LIMIT;
int64_t CHANGE_FEED_CACHE_SIZE;
double CHANGE_FEED_POP_TIMEOUT;
int64_t CHANGE_FEED_STREAM_MIN_BYTES;
int MAX_BATCH_SIZE;
double GRV_BATCH_TIMEOUT;
@ -267,8 +268,12 @@ public:
double BUSYNESS_SPIKE_START_THRESHOLD;
double BUSYNESS_SPIKE_SATURATED_THRESHOLD;
// blob granules
bool ENABLE_BLOB_GRANULES;
// multi-version client control
int MVC_CLIENTLIB_CHUNK_SIZE;
int MVC_CLIENTLIB_CHUNKS_PER_TRANSACTION;
// Blob Granules
int BG_MAX_GRANULE_PARALLELISM;
ClientKnobs(Randomize randomize);
void initialize(Randomize randomize);

View File

@ -28,6 +28,7 @@
#include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/ClusterInterface.h"
#include "fdbclient/WellKnownEndpoints.h"
#include "flow/Hostname.h"
const int MAX_CLUSTER_FILE_BYTES = 60000;

View File

@ -23,6 +23,7 @@
#include "flow/ITrace.h"
#include "flow/Trace.h"
#include "flow/genericactors.actor.h"
#include "flow/UnitTest.h"
DatabaseConfiguration::DatabaseConfiguration() {
resetInternal();
@ -50,6 +51,7 @@ void DatabaseConfiguration::resetInternal() {
perpetualStorageWiggleSpeed = 0;
perpetualStorageWiggleLocality = "0";
storageMigrationType = StorageMigrationType::DEFAULT;
blobGranulesEnabled = false;
tenantMode = TenantMode::DISABLED;
}
@ -300,7 +302,7 @@ StatusObject DatabaseConfiguration::toJSON(bool noPolicies) const {
result["storage_engine"] = "ssd-redwood-1-experimental";
} else if (tLogDataStoreType == KeyValueStoreType::SSD_BTREE_V2 &&
storageServerStoreType == KeyValueStoreType::SSD_ROCKSDB_V1) {
result["storage_engine"] = "ssd-rocksdb-experimental";
result["storage_engine"] = "ssd-rocksdb-v1";
} else if (tLogDataStoreType == KeyValueStoreType::MEMORY && storageServerStoreType == KeyValueStoreType::MEMORY) {
result["storage_engine"] = "memory-1";
} else if (tLogDataStoreType == KeyValueStoreType::SSD_BTREE_V2 &&
@ -322,7 +324,7 @@ StatusObject DatabaseConfiguration::toJSON(bool noPolicies) const {
} else if (testingStorageServerStoreType == KeyValueStoreType::SSD_REDWOOD_V1) {
result["tss_storage_engine"] = "ssd-redwood-1-experimental";
} else if (testingStorageServerStoreType == KeyValueStoreType::SSD_ROCKSDB_V1) {
result["tss_storage_engine"] = "ssd-rocksdb-experimental";
result["tss_storage_engine"] = "ssd-rocksdb-v1";
} else if (testingStorageServerStoreType == KeyValueStoreType::MEMORY_RADIXTREE) {
result["tss_storage_engine"] = "memory-radixtree-beta";
} else if (testingStorageServerStoreType == KeyValueStoreType::MEMORY) {
@ -404,6 +406,7 @@ StatusObject DatabaseConfiguration::toJSON(bool noPolicies) const {
result["perpetual_storage_wiggle"] = perpetualStorageWiggleSpeed;
result["perpetual_storage_wiggle_locality"] = perpetualStorageWiggleLocality;
result["storage_migration_type"] = storageMigrationType.toString();
result["blob_granules_enabled"] = (int32_t)blobGranulesEnabled;
result["tenant_mode"] = tenantMode.toString();
return result;
}
@ -488,9 +491,9 @@ void DatabaseConfiguration::overwriteProxiesCount() {
Optional<ValueRef> optGrvProxies = DatabaseConfiguration::get(grvProxiesKey);
Optional<ValueRef> optProxies = DatabaseConfiguration::get(proxiesKey);
const int mutableGrvProxyCount = optGrvProxies.present() ? toInt(optGrvProxies.get()) : 0;
const int mutableCommitProxyCount = optCommitProxies.present() ? toInt(optCommitProxies.get()) : 0;
const int mutableProxiesCount = optProxies.present() ? toInt(optProxies.get()) : 0;
const int mutableGrvProxyCount = optGrvProxies.present() ? toInt(optGrvProxies.get()) : -1;
const int mutableCommitProxyCount = optCommitProxies.present() ? toInt(optCommitProxies.get()) : -1;
const int mutableProxiesCount = optProxies.present() ? toInt(optProxies.get()) : -1;
if (mutableProxiesCount > 1) {
TraceEvent(SevDebug, "OverwriteProxiesCount")
@ -500,23 +503,23 @@ void DatabaseConfiguration::overwriteProxiesCount() {
.detail("MutableGrvCPCount", mutableGrvProxyCount)
.detail("MutableProxiesCount", mutableProxiesCount);
if (grvProxyCount == -1 && commitProxyCount > 0) {
if (mutableProxiesCount > commitProxyCount) {
grvProxyCount = mutableProxiesCount - commitProxyCount;
if (mutableGrvProxyCount == -1 && mutableCommitProxyCount > 0) {
if (mutableProxiesCount > mutableCommitProxyCount) {
grvProxyCount = mutableProxiesCount - mutableCommitProxyCount;
} else {
// invalid configuration; provision min GrvProxies
grvProxyCount = 1;
commitProxyCount = mutableProxiesCount - 1;
}
} else if (grvProxyCount > 0 && commitProxyCount == -1) {
if (mutableProxiesCount > grvProxyCount) {
} else if (mutableGrvProxyCount > 0 && mutableCommitProxyCount == -1) {
if (mutableProxiesCount > mutableGrvProxyCount) {
commitProxyCount = mutableProxiesCount - grvProxyCount;
} else {
// invalid configuration; provision min CommitProxies
commitProxyCount = 1;
grvProxyCount = mutableProxiesCount - 1;
}
} else if (grvProxyCount == -1 && commitProxyCount == -1) {
} else if (mutableGrvProxyCount == -1 && mutableCommitProxyCount == -1) {
// Use DEFAULT_COMMIT_GRV_PROXIES_RATIO to split proxies between Grv & Commit proxies
const int derivedGrvProxyCount =
std::max(1,
@ -633,6 +636,9 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) {
tenantMode = (TenantMode::Mode)type;
} else if (ck == LiteralStringRef("proxies")) {
overwriteProxiesCount();
} else if (ck == LiteralStringRef("blob_granules_enabled")) {
parse((&type), value);
blobGranulesEnabled = (type != 0);
} else {
return false;
}
@ -654,6 +660,11 @@ void DatabaseConfiguration::applyMutation(MutationRef m) {
}
}
bool DatabaseConfiguration::involveMutation(MutationRef m) {
return (m.type == MutationRef::SetValue && m.param1.startsWith(configKeysPrefix)) ||
(m.type == MutationRef::ClearRange && KeyRangeRef(m.param1, m.param2).intersects(configKeys));
}
bool DatabaseConfiguration::set(KeyRef key, ValueRef value) {
makeConfigurationMutable();
mutableConfiguration.get()[key.toString()] = value.toString();
@ -820,3 +831,21 @@ bool DatabaseConfiguration::isOverridden(std::string key) const {
return false;
}
TEST_CASE("/fdbclient/databaseConfiguration/overwriteCommitProxy") {
DatabaseConfiguration conf1;
conf1.applyMutation(MutationRef(MutationRef::SetValue, "\xff/conf/grv_proxies"_sr, "5"_sr));
conf1.applyMutation(MutationRef(MutationRef::SetValue, "\xff/conf/proxies"_sr, "10"_sr));
conf1.applyMutation(MutationRef(MutationRef::SetValue, "\xff/conf/grv_proxies"_sr, "-1"_sr));
conf1.applyMutation(MutationRef(MutationRef::SetValue, "\xff/conf/commit_proxies"_sr, "-1"_sr));
DatabaseConfiguration conf2;
conf2.applyMutation(MutationRef(MutationRef::SetValue, "\xff/conf/proxies"_sr, "10"_sr));
conf2.applyMutation(MutationRef(MutationRef::SetValue, "\xff/conf/grv_proxies"_sr, "-1"_sr));
conf2.applyMutation(MutationRef(MutationRef::SetValue, "\xff/conf/commit_proxies"_sr, "-1"_sr));
ASSERT(conf1 == conf2);
ASSERT(conf1.getDesiredCommitProxies() == conf2.getDesiredCommitProxies());
return Void();
}
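overwriteProxiesCount now derives the split from the mutable (most recently written) values, which is what lets the test above assert that clearing grv_proxies/commit_proxies back to -1 and then setting proxies=10 lands in the same configuration either way. A hedged sketch of the derivation follows; the ratio constant here is assumed for illustration only (the real DEFAULT_COMMIT_GRV_PROXIES_RATIO value is not shown in this hunk):

#include <algorithm>
#include <cstdio>

const int kCommitGrvRatio = 3; // assumed; stands in for DEFAULT_COMMIT_GRV_PROXIES_RATIO

void splitProxies(int proxies, int grv, int commit, int& grvOut, int& commitOut) {
    if (grv == -1 && commit > 0) {
        grvOut = proxies > commit ? proxies - commit : 1;
        commitOut = proxies > commit ? commit : proxies - 1;
    } else if (grv > 0 && commit == -1) {
        commitOut = proxies > grv ? proxies - grv : 1;
        grvOut = proxies > grv ? grv : proxies - 1;
    } else { // both unset: split the total by the ratio
        grvOut = std::max(1, proxies / (kCommitGrvRatio + 1));
        commitOut = proxies - grvOut;
    }
}

int main() {
    int g, c;
    splitProxies(10, -1, -1, g, c);
    printf("grv=%d commit=%d\n", g, c); // grv=2 commit=8 under the assumed ratio
}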

View File

@ -104,6 +104,8 @@ struct DatabaseConfiguration {
DatabaseConfiguration();
void applyMutation(MutationRef mutation);
// return true if mutation will cause configuration changes
bool involveMutation(MutationRef mutation);
bool set(KeyRef key,
ValueRef value); // Returns true if a configuration option that requires recovery to take effect is changed
bool clear(KeyRangeRef keys);
@ -250,6 +252,8 @@ struct DatabaseConfiguration {
// Storage Migration Type
StorageMigrationType storageMigrationType;
// Blob Granules
bool blobGranulesEnabled;
TenantMode tenantMode;
// Excluded servers (no state should be here)

View File

@ -181,6 +181,7 @@ struct ChangeFeedStorageData : ReferenceCounted<ChangeFeedStorageData> {
NotifiedVersion version;
NotifiedVersion desired;
Promise<Void> destroyed;
UID interfToken;
~ChangeFeedStorageData() { destroyed.send(Void()); }
};
@ -196,6 +197,10 @@ struct ChangeFeedData : ReferenceCounted<ChangeFeedData> {
std::vector<Reference<ChangeFeedStorageData>> storageData;
AsyncVar<int> notAtLatest;
Promise<Void> refresh;
Version maxSeenVersion;
Version endVersion = invalidVersion;
Version popVersion =
invalidVersion; // like the TLog pop version; set by the SS, and the client can check it to see if it missed data
ChangeFeedData() : notAtLatest(1) {}
};
@ -292,6 +297,10 @@ public:
StorageMetrics const& permittedError,
int shardLimit,
int expectedShardCount);
Future<Void> splitStorageMetricsStream(PromiseStream<Key> const& resultsStream,
KeyRange const& keys,
StorageMetrics const& limit,
StorageMetrics const& estimated);
Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(KeyRange const& keys,
StorageMetrics const& limit,
StorageMetrics const& estimated);
@ -355,7 +364,9 @@ public:
Key rangeID,
Version begin = 0,
Version end = std::numeric_limits<Version>::max(),
KeyRange range = allKeys);
KeyRange range = allKeys,
int replyBufferSize = -1,
bool canReadPopped = true);
Future<std::vector<OverlappingChangeFeedEntry>> getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion);
Future<Void> popChangeFeedMutations(Key rangeID, Version version);
@ -503,7 +514,7 @@ public:
Counter transactionGrvTimedOutBatches;
ContinuousSample<double> latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit,
bytesPerCommit;
bytesPerCommit, bgLatencies, bgGranulesPerRequest;
int outstandingWatches;
int maxOutstandingWatches;
@ -527,6 +538,7 @@ public:
bool transactionTracingSample;
double verifyCausalReadsProp = 0.0;
bool blobGranuleNoMaterialize = false;
bool anyBlobGranuleRequests = false;
Future<Void> logger;
Future<Void> throttleExpirer;

View File

@ -0,0 +1,47 @@
/*
* FDBAWSCredentialsProvider.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#if (!defined FDB_AWS_CREDENTIALS_PROVIDER_H) && (defined BUILD_AWS_BACKUP)
#define FDB_AWS_CREDENTIALS_PROVIDER_H
#pragma once
#include "aws/core/Aws.h"
#include "aws/core/auth/AWSCredentialsProviderChain.h"
// Singleton
namespace FDBAWSCredentialsProvider {
bool doneInit = false;
// You're supposed to call AWS::ShutdownAPI(options); once done
// But we want this to live for the lifetime of the process, so we don't do that
static Aws::Auth::AWSCredentials getAwsCredentials() {
if (!doneInit) {
doneInit = true;
Aws::SDKOptions options;
Aws::InitAPI(options);
TraceEvent("AWSSDKInitSuccessful");
}
Aws::Auth::DefaultAWSCredentialsProviderChain credProvider;
Aws::Auth::AWSCredentials creds = credProvider.GetAWSCredentials();
return creds;
}
} // namespace FDBAWSCredentialsProvider
#endif
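A sketch of how a caller might consume this provider; it compiles only with the AWS SDK for C++ available, and simply mirrors the init-once pattern above (the default chain resolves credentials from environment variables, config files, or instance metadata):

#include <cstdio>
#include "aws/core/Aws.h"
#include "aws/core/auth/AWSCredentialsProviderChain.h"

int main() {
    // Like FDBAWSCredentialsProvider: init the SDK once, never shut it down,
    // and resolve credentials through the default provider chain.
    Aws::SDKOptions options;
    Aws::InitAPI(options);
    Aws::Auth::DefaultAWSCredentialsProviderChain chain;
    Aws::Auth::AWSCredentials creds = chain.GetAWSCredentials();
    std::printf("access key id resolved: %s\n",
                creds.GetAWSAccessKeyId().empty() ? "no" : "yes");
}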

View File

@ -30,6 +30,25 @@
#include "flow/Arena.h"
#include "flow/flow.h"
enum class TraceFlags : uint8_t { unsampled = 0b00000000, sampled = 0b00000001 };
inline TraceFlags operator&(TraceFlags lhs, TraceFlags rhs) {
return static_cast<TraceFlags>(static_cast<std::underlying_type_t<TraceFlags>>(lhs) &
static_cast<std::underlying_type_t<TraceFlags>>(rhs));
}
struct SpanContext {
UID traceID;
uint64_t spanID;
TraceFlags m_Flags;
SpanContext() : traceID(UID()), spanID(0), m_Flags(TraceFlags::unsampled) {}
SpanContext(UID traceID, uint64_t spanID, TraceFlags flags) : traceID(traceID), spanID(spanID), m_Flags(flags) {}
SpanContext(UID traceID, uint64_t spanID) : traceID(traceID), spanID(spanID), m_Flags(TraceFlags::unsampled) {}
SpanContext(Arena arena, const SpanContext& span)
: traceID(span.traceID), spanID(span.spanID), m_Flags(span.m_Flags) {}
bool isSampled() const { return (m_Flags & TraceFlags::sampled) == TraceFlags::sampled; }
};
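isSampled() is the usual mask-and-compare test on the flag byte. A self-contained sketch of just that bit check:

#include <cassert>
#include <cstdint>
#include <type_traits>

enum class TraceFlags : uint8_t { unsampled = 0b00000000, sampled = 0b00000001 };

inline TraceFlags operator&(TraceFlags lhs, TraceFlags rhs) {
    return static_cast<TraceFlags>(static_cast<std::underlying_type_t<TraceFlags>>(lhs) &
                                   static_cast<std::underlying_type_t<TraceFlags>>(rhs));
}

int main() {
    TraceFlags f = TraceFlags::sampled;
    // The isSampled() test from SpanContext: mask, then compare against the bit.
    assert((f & TraceFlags::sampled) == TraceFlags::sampled);
    assert((TraceFlags::unsampled & TraceFlags::sampled) != TraceFlags::sampled);
}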
typedef int64_t Version;
typedef uint64_t LogEpoch;
typedef uint64_t Sequence;
@ -652,6 +671,7 @@ struct GetRangeLimits {
};
struct RangeResultRef : VectorRef<KeyValueRef> {
constexpr static FileIdentifier file_identifier = 3985192;
bool more; // True if (but not necessarily only if) values remain in the *key* range requested (possibly beyond the
// limits requested). False implies that no such values remain
Optional<KeyRef> readThrough; // Only present when 'more' is true. When present, this value represent the end (or
@ -831,7 +851,7 @@ struct KeyValueStoreType {
case SSD_REDWOOD_V1:
return "ssd-redwood-1-experimental";
case SSD_ROCKSDB_V1:
return "ssd-rocksdb-experimental";
return "ssd-rocksdb-v1";
case MEMORY:
return "memory";
case MEMORY_RADIXTREE:
@ -958,6 +978,7 @@ struct TLogSpillType {
// Contains the amount of free and total space for a storage server, in bytes
struct StorageBytes {
constexpr static FileIdentifier file_identifier = 3928581;
// Free space on the filesystem
int64_t free;
// Total space on the filesystem
@ -1342,7 +1363,12 @@ struct ReadBlobGranuleContext {
void* userContext;
// Returns a unique id for the load. Asynchronous to support queueing multiple in parallel.
int64_t (*start_load_f)(const char* filename, int filenameLength, int64_t offset, int64_t length, void* context);
int64_t (*start_load_f)(const char* filename,
int filenameLength,
int64_t offset,
int64_t length,
int64_t fullFileLength,
void* context);
// Returns data for the load. Pass the loadId returned by start_load_f
uint8_t* (*get_load_f)(int64_t loadId, void* context);
@ -1353,17 +1379,20 @@ struct ReadBlobGranuleContext {
// Set this to true for testing if you don't want to read the granule files,
// just do the request to the blob workers
bool debugNoMaterialize;
// number of granules to load in parallel (default 1)
int granuleParallelism = 1;
};
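The three function pointers form a small async load API: start_load_f returns a handle, get_load_f returns the bytes for a handle (nullptr signaling failure), and free_load_f releases them; granuleParallelism tells the client how many loads it may keep in flight. A minimal in-memory implementation a caller could register (hypothetical; a real loader would read the byte range from blob storage asynchronously):

#include <cstdint>
#include <map>
#include <vector>

// In-memory loader implementing the start/get/free contract.
struct LoaderContext {
    int64_t nextId = 1;
    std::map<int64_t, std::vector<uint8_t>> loads;
};

int64_t startLoad(const char* filename, int filenameLength, int64_t offset,
                  int64_t length, int64_t fullFileLength, void* context) {
    auto* ctx = static_cast<LoaderContext*>(context);
    // A real implementation would begin an async read of
    // [offset, offset + length) from a file of size fullFileLength.
    int64_t id = ctx->nextId++;
    ctx->loads[id] = std::vector<uint8_t>(length, 0);
    return id;
}

uint8_t* getLoad(int64_t loadId, void* context) {
    auto* ctx = static_cast<LoaderContext*>(context);
    auto it = ctx->loads.find(loadId);
    return it == ctx->loads.end() ? nullptr : it->second.data(); // null => error
}

void freeLoad(int64_t loadId, void* context) {
    static_cast<LoaderContext*>(context)->loads.erase(loadId);
}

int main() {
    LoaderContext ctx;
    int64_t id = startLoad("snap.bin", 8, 0, 16, 16, &ctx);
    bool ok = getLoad(id, &ctx) != nullptr;
    freeLoad(id, &ctx);
    return ok ? 0 : 1;
}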
// Store metadata associated with each storage server. Now it only contains data be used in perpetual storage wiggle.
struct StorageMetadataType {
constexpr static FileIdentifier file_identifier = 732123;
// when the SS is initialized
uint64_t createdTime; // comes from currentTime()
// when the SS is initialized, in epoch seconds, comes from currentTime()
double createdTime;
StorageMetadataType() : createdTime(0) {}
StorageMetadataType(uint64_t t) : createdTime(t) {}
static uint64_t currentTime() { return g_network->timer() * 1e9; }
static double currentTime() { return g_network->timer(); }
// To change this serialization, ProtocolVersion::StorageMetadata must be updated, and downgrades need
// to be considered

View File

@ -4363,13 +4363,14 @@ public:
Key backupTag,
Standalone<VectorRef<KeyRangeRef>> backupRanges,
Key bcUrl,
Optional<std::string> proxy,
Version targetVersion,
LockDB lockDB,
UID randomUID,
Key addPrefix,
Key removePrefix) {
// Sanity check backup is valid
state Reference<IBackupContainer> bc = IBackupContainer::openContainer(bcUrl.toString());
state Reference<IBackupContainer> bc = IBackupContainer::openContainer(bcUrl.toString(), proxy, {});
state BackupDescription desc = wait(bc->describeBackup());
wait(desc.resolveVersionTimes(cx));
@ -4430,6 +4431,7 @@ public:
struct RestoreRequest restoreRequest(restoreIndex,
restoreTag,
bcUrl,
proxy,
targetVersion,
range,
deterministicRandom()->randomUniqueID(),
@ -4510,6 +4512,7 @@ public:
ACTOR static Future<Void> submitBackup(FileBackupAgent* backupAgent,
Reference<ReadYourWritesTransaction> tr,
Key outContainer,
Optional<std::string> proxy,
int initialSnapshotIntervalSeconds,
int snapshotIntervalSeconds,
std::string tagName,
@ -4555,7 +4558,8 @@ public:
backupContainer = joinPath(backupContainer, std::string("backup-") + nowStr.toString());
}
state Reference<IBackupContainer> bc = IBackupContainer::openContainer(backupContainer, encryptionKeyFileName);
state Reference<IBackupContainer> bc =
IBackupContainer::openContainer(backupContainer, proxy, encryptionKeyFileName);
try {
wait(timeoutError(bc->create(), 30));
} catch (Error& e) {
@ -4642,6 +4646,7 @@ public:
Reference<ReadYourWritesTransaction> tr,
Key tagName,
Key backupURL,
Optional<std::string> proxy,
Standalone<VectorRef<KeyRangeRef>> ranges,
Version restoreVersion,
Key addPrefix,
@ -4710,7 +4715,7 @@ public:
// Point the tag to the new uid
tag.set(tr, { uid, false });
Reference<IBackupContainer> bc = IBackupContainer::openContainer(backupURL.toString());
Reference<IBackupContainer> bc = IBackupContainer::openContainer(backupURL.toString(), proxy, {});
// Configure the new restore
restore.tag().set(tr, tagName.toString());
@ -5303,6 +5308,7 @@ public:
Optional<Database> cxOrig,
Key tagName,
Key url,
Optional<std::string> proxy,
Standalone<VectorRef<KeyRangeRef>> ranges,
WaitForComplete waitForComplete,
Version targetVersion,
@ -5320,7 +5326,7 @@ public:
throw restore_error();
}
state Reference<IBackupContainer> bc = IBackupContainer::openContainer(url.toString());
state Reference<IBackupContainer> bc = IBackupContainer::openContainer(url.toString(), proxy, {});
state BackupDescription desc = wait(bc->describeBackup(true));
if (cxOrig.present()) {
@ -5360,6 +5366,7 @@ public:
tr,
tagName,
url,
proxy,
ranges,
targetVersion,
addPrefix,
@ -5499,6 +5506,7 @@ public:
tagName,
ranges,
KeyRef(bc->getURL()),
bc->getProxy(),
targetVersion,
LockDB::True,
randomUid,
@ -5520,6 +5528,7 @@ public:
cx,
tagName,
KeyRef(bc->getURL()),
bc->getProxy(),
ranges,
WaitForComplete::True,
::invalidVersion,
@ -5561,13 +5570,14 @@ Future<Void> FileBackupAgent::submitParallelRestore(Database cx,
Key backupTag,
Standalone<VectorRef<KeyRangeRef>> backupRanges,
Key bcUrl,
Optional<std::string> proxy,
Version targetVersion,
LockDB lockDB,
UID randomUID,
Key addPrefix,
Key removePrefix) {
return FileBackupAgentImpl::submitParallelRestore(
cx, backupTag, backupRanges, bcUrl, targetVersion, lockDB, randomUID, addPrefix, removePrefix);
cx, backupTag, backupRanges, bcUrl, proxy, targetVersion, lockDB, randomUID, addPrefix, removePrefix);
}
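A hedged usage sketch of the widened signature; all argument values are illustrative, and the call assumes an ACTOR context:

// The optional proxy is now threaded through to IBackupContainer::openContainer.
Optional<std::string> proxy = std::string("proxyhost:8080"); // assumed host:port
wait(backupAgent.submitParallelRestore(cx,
                                       "default"_sr, // backup tag
                                       backupRanges, // ranges to restore
                                       "blobstore://host/container"_sr,
                                       proxy,
                                       targetVersion,
                                       LockDB::True,
                                       deterministicRandom()->randomUniqueID(),
                                       ""_sr,    // addPrefix
                                       ""_sr)); // removePrefix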
Future<Void> FileBackupAgent::atomicParallelRestore(Database cx,
@ -5582,6 +5592,7 @@ Future<Version> FileBackupAgent::restore(Database cx,
Optional<Database> cxOrig,
Key tagName,
Key url,
Optional<std::string> proxy,
Standalone<VectorRef<KeyRangeRef>> ranges,
WaitForComplete waitForComplete,
Version targetVersion,
@ -5598,6 +5609,7 @@ Future<Version> FileBackupAgent::restore(Database cx,
cxOrig,
tagName,
url,
proxy,
ranges,
waitForComplete,
targetVersion,
@ -5639,6 +5651,7 @@ Future<ERestoreState> FileBackupAgent::waitRestore(Database cx, Key tagName, Ver
Future<Void> FileBackupAgent::submitBackup(Reference<ReadYourWritesTransaction> tr,
Key outContainer,
Optional<std::string> proxy,
int initialSnapshotIntervalSeconds,
int snapshotIntervalSeconds,
std::string const& tagName,
@ -5650,6 +5663,7 @@ Future<Void> FileBackupAgent::submitBackup(Reference<ReadYourWritesTransaction>
return FileBackupAgentImpl::submitBackup(this,
tr,
outContainer,
proxy,
initialSnapshotIntervalSeconds,
snapshotIntervalSeconds,
tagName,

View File

@ -125,6 +125,21 @@ bool isCompleteConfiguration(std::map<std::string, std::string> const& options);
ConfigureAutoResult parseConfig(StatusObject const& status);
template <typename Transaction, class T>
struct transaction_future_type {
using type = typename Transaction::template FutureT<T>;
};
template <typename Transaction, class T>
struct transaction_future_type<Transaction*, T> {
using type = typename transaction_future_type<Transaction, T>::type;
};
template <typename Transaction, class T>
struct transaction_future_type<Reference<Transaction>, T> {
using type = typename transaction_future_type<Transaction, T>::type;
};
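A minimal sketch of how this trait resolves; ToyTransaction is an assumption for illustration (requires <type_traits>), standing in for any transaction type that exposes a FutureT alias:

struct ToyTransaction {
    template <class T>
    using FutureT = Future<T>; // NativeAPI-style; IClientAPI types would expose ThreadFuture<T>
};
static_assert(std::is_same_v<transaction_future_type<ToyTransaction, int>::type, Future<int>>);
static_assert(std::is_same_v<transaction_future_type<ToyTransaction*, int>::type, Future<int>>);
static_assert(std::is_same_v<transaction_future_type<Reference<ToyTransaction>, int>::type, Future<int>>);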
// Management API written in template code to support both IClientAPI and NativeAPI
namespace ManagementAPI {
@ -636,7 +651,8 @@ Future<Optional<TenantMapEntry>> tryGetTenantTransaction(Transaction tr, TenantN
tr->setOption(FDBTransactionOptions::RAW_ACCESS);
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
Optional<Value> val = wait(safeThreadFutureToFuture(tr->get(tenantMapKey)));
state typename transaction_future_type<Transaction, Optional<Value>>::type tenantFuture = tr->get(tenantMapKey);
Optional<Value> val = wait(safeThreadFutureToFuture(tenantFuture));
return val.map<TenantMapEntry>([](Optional<Value> v) { return decodeTenantEntry(v.get()); });
}
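The refactor in this hunk follows one recurring shape: the transaction's native future (a ThreadFuture when Transaction is an IClientAPI type) is pinned in an actor state variable so it stays alive across the wait, and only then converted. A sketch of the pattern, with someKey as an illustrative placeholder:

// Inside an ACTOR template function parameterized on Transaction:
state typename transaction_future_type<Transaction, Optional<Value>>::type valFuture = tr->get(someKey);
Optional<Value> val = wait(safeThreadFutureToFuture(valFuture));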
@ -688,10 +704,13 @@ Future<Optional<TenantMapEntry>> createTenantTransaction(Transaction tr, TenantN
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
state Future<Optional<TenantMapEntry>> tenantEntryFuture = tryGetTenantTransaction(tr, name);
state Future<Optional<Value>> tenantDataPrefixFuture = safeThreadFutureToFuture(tr->get(tenantDataPrefixKey));
state Future<Optional<Value>> lastIdFuture = safeThreadFutureToFuture(tr->get(tenantLastIdKey));
state typename transaction_future_type<Transaction, Optional<Value>>::type tenantDataPrefixFuture =
tr->get(tenantDataPrefixKey);
state typename transaction_future_type<Transaction, Optional<Value>>::type lastIdFuture = tr->get(tenantLastIdKey);
state typename transaction_future_type<Transaction, Optional<Value>>::type tenantModeFuture =
tr->get(configKeysPrefix.withSuffix("tenant_mode"_sr));
Optional<Value> tenantMode = wait(safeThreadFutureToFuture(tr->get(configKeysPrefix.withSuffix("tenant_mode"_sr))));
Optional<Value> tenantMode = wait(safeThreadFutureToFuture(tenantModeFuture));
if (!tenantMode.present() || tenantMode.get() == StringRef(format("%d", TenantMode::DISABLED))) {
throw tenants_disabled();
@ -702,13 +721,15 @@ Future<Optional<TenantMapEntry>> createTenantTransaction(Transaction tr, TenantN
return Optional<TenantMapEntry>();
}
state Optional<Value> lastIdVal = wait(lastIdFuture);
Optional<Value> tenantDataPrefix = wait(tenantDataPrefixFuture);
state Optional<Value> lastIdVal = wait(safeThreadFutureToFuture(lastIdFuture));
Optional<Value> tenantDataPrefix = wait(safeThreadFutureToFuture(tenantDataPrefixFuture));
state TenantMapEntry newTenant(lastIdVal.present() ? TenantMapEntry::prefixToId(lastIdVal.get()) + 1 : 0,
tenantDataPrefix.present() ? (KeyRef)tenantDataPrefix.get() : ""_sr);
RangeResult contents = wait(safeThreadFutureToFuture(tr->getRange(prefixRange(newTenant.prefix), 1)));
state typename transaction_future_type<Transaction, RangeResult>::type prefixRangeFuture =
tr->getRange(prefixRange(newTenant.prefix), 1);
RangeResult contents = wait(safeThreadFutureToFuture(prefixRangeFuture));
if (!contents.empty()) {
throw tenant_prefix_allocator_conflict();
}
@ -774,7 +795,9 @@ Future<Void> deleteTenantTransaction(Transaction tr, TenantNameRef name) {
return Void();
}
RangeResult contents = wait(safeThreadFutureToFuture(tr->getRange(prefixRange(tenantEntry.get().prefix), 1)));
state typename transaction_future_type<Transaction, RangeResult>::type prefixRangeFuture =
tr->getRange(prefixRange(tenantEntry.get().prefix), 1);
RangeResult contents = wait(safeThreadFutureToFuture(prefixRangeFuture));
if (!contents.empty()) {
throw tenant_not_empty();
}
@ -832,8 +855,9 @@ Future<std::map<TenantName, TenantMapEntry>> listTenantsTransaction(Transaction
tr->setOption(FDBTransactionOptions::RAW_ACCESS);
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
RangeResult results = wait(safeThreadFutureToFuture(
tr->getRange(firstGreaterOrEqual(range.begin), firstGreaterOrEqual(range.end), limit)));
state typename transaction_future_type<Transaction, RangeResult>::type listFuture =
tr->getRange(firstGreaterOrEqual(range.begin), firstGreaterOrEqual(range.end), limit);
RangeResult results = wait(safeThreadFutureToFuture(listFuture));
std::map<TenantName, TenantMapEntry> tenants;
for (auto kv : results) {

View File

@ -65,8 +65,9 @@ private:
state int64_t window = 0;
loop {
RangeResult range =
wait(safeThreadFutureToFuture(tr->getRange(self->counters.range(), 1, Snapshot::True, Reverse::True)));
state typename TransactionT::template FutureT<RangeResult> rangeFuture =
tr->getRange(self->counters.range(), 1, Snapshot::True, Reverse::True);
RangeResult range = wait(safeThreadFutureToFuture(rangeFuture));
if (range.size() > 0) {
start = self->counters.unpack(range[0].key).getInt(0);
@ -83,11 +84,12 @@ private:
int64_t inc = 1;
tr->atomicOp(self->counters.get(start).key(), StringRef((uint8_t*)&inc, 8), MutationRef::AddValue);
Future<Optional<Value>> countFuture =
safeThreadFutureToFuture(tr->get(self->counters.get(start).key(), Snapshot::True));
state typename TransactionT::template FutureT<Optional<Value>> countFuture =
tr->get(self->counters.get(start).key(), Snapshot::True);
// }
Optional<Value> countValue = wait(countFuture);
Optional<Value> countValue = wait(safeThreadFutureToFuture(countFuture));
int64_t count = 0;
if (countValue.present()) {
@ -110,15 +112,17 @@ private:
state int64_t candidate = deterministicRandom()->randomInt(start, start + window);
// if thread safety is needed, this should be locked {
state Future<RangeResult> latestCounterFuture =
safeThreadFutureToFuture(tr->getRange(self->counters.range(), 1, Snapshot::True, Reverse::True));
state Future<Optional<Value>> candidateValueFuture =
safeThreadFutureToFuture(tr->get(self->recent.get(candidate).key()));
state typename TransactionT::template FutureT<RangeResult> latestCounterFuture =
tr->getRange(self->counters.range(), 1, Snapshot::True, Reverse::True);
state typename TransactionT::template FutureT<Optional<Value>> candidateValueFuture =
tr->get(self->recent.get(candidate).key());
tr->setOption(FDBTransactionOptions::NEXT_WRITE_NO_WRITE_CONFLICT_RANGE);
tr->set(self->recent.get(candidate).key(), ValueRef());
// }
wait(success(latestCounterFuture) && success(candidateValueFuture));
wait(success(safeThreadFutureToFuture(latestCounterFuture)) &&
success(safeThreadFutureToFuture(candidateValueFuture)));
int64_t currentWindowStart = 0;
if (latestCounterFuture.get().size() > 0) {
currentWindowStart = self->counters.unpack(latestCounterFuture.get()[0].key).getInt(0);

View File

@ -95,6 +95,31 @@ IKnobCollection& IKnobCollection::getMutableGlobalKnobCollection() {
return *globalKnobCollection();
}
void IKnobCollection::setupKnobs(const std::vector<std::pair<std::string, std::string>>& knobs) {
auto& g_knobs = IKnobCollection::getMutableGlobalKnobCollection();
for (const auto& [knobName, knobValueString] : knobs) {
try {
auto knobValue = g_knobs.parseKnobValue(knobName, knobValueString);
g_knobs.setKnob(knobName, knobValue);
} catch (Error& e) {
if (e.code() == error_code_invalid_option_value) {
std::cerr << "WARNING: Invalid value '" << knobValueString << "' for knob option '" << knobName
<< "'\n";
TraceEvent(SevWarnAlways, "InvalidKnobValue")
.detail("Knob", printable(knobName))
.detail("Value", printable(knobValueString));
} else {
std::cerr << "ERROR: Failed to set knob option '" << knobName << "': " << e.what() << "\n";
TraceEvent(SevError, "FailedToSetKnob")
.errorUnsuppressed(e)
.detail("Knob", printable(knobName))
.detail("Value", printable(knobValueString));
throw e;
}
}
}
}
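A hedged usage sketch; the knob names below are assumptions for illustration:

std::vector<std::pair<std::string, std::string>> overrides = {
    { "min_trace_severity", "10" },
    { "trace_format", "json" }
};
// Warns on invalid values, rethrows any other failure.
IKnobCollection::setupKnobs(overrides);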
ConfigMutationRef IKnobCollection::createSetMutation(Arena arena, KeyRef key, ValueRef value) {
ConfigKey configKey = ConfigKeyRef::decodeKey(key);
auto knobValue =

View File

@ -69,6 +69,11 @@ public:
static void setGlobalKnobCollection(Type, Randomize, IsSimulated);
static IKnobCollection const& getGlobalKnobCollection();
static IKnobCollection& getMutableGlobalKnobCollection();
// Sets up a list of <knob, value> pairs. Invalid knob values are logged as
// warnings; any other failure is immediately rethrown.
static void setupKnobs(const std::vector<std::pair<std::string, std::string>>& knobs);
static ConfigMutationRef createSetMutation(Arena, KeyRef, ValueRef);
static ConfigMutationRef createClearMutation(Arena, KeyRef);
};

View File

@ -22,6 +22,7 @@
#include "fdbclient/json_spirit/json_spirit_writer_template.h"
#include "fdbclient/json_spirit/json_spirit_reader_template.h"
#include "flow/Error.h"
// JSONDoc is a convenient reader/writer class for manipulating JSON documents using "paths".
// Access is done using a "path", which is a string of dot-separated

View File

@ -175,6 +175,17 @@ std::map<std::string, std::string> configForToken(std::string const& mode) {
}
out[p + key] = format("%d", type);
}
if (key == "blob_granules_enabled") {
int enabled = std::stoi(value);
if (enabled != 0 && enabled != 1) {
printf("Error: Only 0 or 1 are valid values for blob_granules_enabled. "
"1 enables blob granules and 0 disables them.\n");
return out;
}
out[p + key] = value;
}
if (key == "tenant_mode") {
TenantMode tenantMode;
if (value == "disabled") {
@ -203,7 +214,7 @@ std::map<std::string, std::string> configForToken(std::string const& mode) {
} else if (mode == "ssd-redwood-1-experimental") {
logType = KeyValueStoreType::SSD_BTREE_V2;
storeType = KeyValueStoreType::SSD_REDWOOD_V1;
} else if (mode == "ssd-rocksdb-experimental") {
} else if (mode == "ssd-rocksdb-v1") {
logType = KeyValueStoreType::SSD_BTREE_V2;
storeType = KeyValueStoreType::SSD_ROCKSDB_V1;
} else if (mode == "memory" || mode == "memory-2") {

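A hedged sketch of the new blob_granules_enabled token above:

std::map<std::string, std::string> out = configForToken("blob_granules_enabled=1");
// The prefixed "blob_granules_enabled" key now maps to "1"; for any value other
// than 0 or 1, an error is printed and the key is not set.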
View File

@ -169,7 +169,7 @@ void ClusterConnectionString::resolveHostnamesBlocking() {
}
void ClusterConnectionString::resetToUnresolved() {
if (hostnames.size() > 0) {
if (status == RESOLVED && hostnames.size() > 0) {
coords.clear();
hostnames.clear();
networkAddressToHostname.clear();
@ -558,8 +558,8 @@ ACTOR Future<Void> monitorNominee(Key key,
.detail("Hostname", hostname.present() ? hostname.get().toString() : "UnknownHostname")
.detail("OldAddr", coord.getLeader.getEndpoint().getPrimaryAddress().toString());
if (rep.getError().code() == error_code_request_maybe_delivered) {
// 50 milliseconds delay to prevent tight resolving loop due to outdated DNS cache
wait(delay(0.05));
// Delay to prevent tight resolving loop due to outdated DNS cache
wait(delay(FLOW_KNOBS->HOSTNAME_RESOLVE_DELAY));
throw coordinators_changed();
} else {
throw rep.getError();
@ -589,7 +589,6 @@ ACTOR Future<Void> monitorNominee(Key key,
if (li.present() && li.get().forward)
wait(Future<Void>(Never()));
wait(Future<Void>(Void()));
}
}
}

View File

@ -282,8 +282,9 @@ ThreadResult<RangeResult> DLTransaction::readBlobGranules(const KeyRangeRef& key
context.get_load_f = granuleContext.get_load_f;
context.free_load_f = granuleContext.free_load_f;
context.debugNoMaterialize = granuleContext.debugNoMaterialize;
context.granuleParallelism = granuleContext.granuleParallelism;
int64_t rv = readVersion.present() ? readVersion.get() : invalidVersion;
int64_t rv = readVersion.present() ? readVersion.get() : latestVersion;
FdbCApi::FDBResult* r = api->transactionReadBlobGranules(tr,
keyRange.begin.begin(),
@ -779,7 +780,7 @@ void MultiVersionTransaction::updateTransaction() {
TransactionInfo newTr;
if (tenant.present()) {
ASSERT(tenant.get());
auto currentTenant = tenant.get()->tenantVar->get();
auto currentTenant = tenant.get()->tenantState->tenantVar->get();
if (currentTenant.value) {
newTr.transaction = currentTenant.value->createTransaction();
}
@ -1079,7 +1080,7 @@ ThreadFuture<Void> MultiVersionTransaction::onError(Error const& e) {
Optional<TenantName> MultiVersionTransaction::getTenant() {
if (tenant.present()) {
return tenant.get()->tenantName;
return tenant.get()->tenantState->tenantName;
} else {
return Optional<TenantName>();
}
@ -1213,20 +1214,27 @@ bool MultiVersionTransaction::isValid() {
// MultiVersionTenant
MultiVersionTenant::MultiVersionTenant(Reference<MultiVersionDatabase> db, StringRef tenantName)
: tenantVar(new ThreadSafeAsyncVar<Reference<ITenant>>(Reference<ITenant>(nullptr))), tenantName(tenantName), db(db) {
updateTenant();
: tenantState(makeReference<TenantState>(db, tenantName)) {}
MultiVersionTenant::~MultiVersionTenant() {
tenantState->close();
}
MultiVersionTenant::~MultiVersionTenant() {}
Reference<ITransaction> MultiVersionTenant::createTransaction() {
return Reference<ITransaction>(new MultiVersionTransaction(
db, Reference<MultiVersionTenant>::addRef(this), db->dbState->transactionDefaultOptions));
return Reference<ITransaction>(new MultiVersionTransaction(tenantState->db,
Reference<MultiVersionTenant>::addRef(this),
tenantState->db->dbState->transactionDefaultOptions));
}
MultiVersionTenant::TenantState::TenantState(Reference<MultiVersionDatabase> db, StringRef tenantName)
: tenantVar(new ThreadSafeAsyncVar<Reference<ITenant>>(Reference<ITenant>(nullptr))), tenantName(tenantName), db(db),
closed(false) {
updateTenant();
}
// Creates a new underlying tenant object whenever the database connection changes. This change is signaled
// to open transactions via an AsyncVar.
void MultiVersionTenant::updateTenant() {
void MultiVersionTenant::TenantState::updateTenant() {
Reference<ITenant> tenant;
auto currentDb = db->dbState->dbVar->get();
if (currentDb.value) {
@ -1237,13 +1245,27 @@ void MultiVersionTenant::updateTenant() {
tenantVar->set(tenant);
Reference<TenantState> self = Reference<TenantState>::addRef(this);
MutexHolder holder(tenantLock);
tenantUpdater = mapThreadFuture<Void, Void>(currentDb.onChange, [this](ErrorOr<Void> result) {
updateTenant();
if (closed) {
return;
}
tenantUpdater = mapThreadFuture<Void, Void>(currentDb.onChange, [self](ErrorOr<Void> result) {
self->updateTenant();
return Void();
});
}
void MultiVersionTenant::TenantState::close() {
MutexHolder holder(tenantLock);
closed = true;
if (tenantUpdater.isValid()) {
tenantUpdater.cancel();
}
}
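The shutdown-safety idea above, sketched: the updater's continuation captures a reference-counted handle instead of a raw this, and re-subscription is guarded by the closed flag under tenantLock, so close() can cancel the updater without leaving a dangling callback:

Reference<TenantState> self = Reference<TenantState>::addRef(this);
MutexHolder holder(tenantLock);
if (closed) {
    return; // close() already ran; do not re-subscribe
}
tenantUpdater = mapThreadFuture<Void, Void>(currentDb.onChange, [self](ErrorOr<Void>) {
    self->updateTenant(); // self keeps the TenantState alive until the callback fires
    return Void();
});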
// MultiVersionDatabase
MultiVersionDatabase::MultiVersionDatabase(MultiVersionApi* api,
int threadIdx,

View File

@ -95,8 +95,12 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
void* userContext;
// Returns a unique id for the load. Asynchronous to support queueing multiple in parallel.
int64_t (
*start_load_f)(const char* filename, int filenameLength, int64_t offset, int64_t length, void* context);
int64_t (*start_load_f)(const char* filename,
int filenameLength,
int64_t offset,
int64_t length,
int64_t fullFileLength,
void* context);
// Returns data for the load. Pass the loadId returned by start_load_f
uint8_t* (*get_load_f)(int64_t loadId, void* context);
@ -107,6 +111,9 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
// set this to true in testing to skip reading the granule files and only
// issue the request to the blob workers
fdb_bool_t debugNoMaterialize;
// number of granules to load in parallel (default 1)
int granuleParallelism;
} FDBReadBlobGranuleContext;
typedef void (*FDBCallback)(FDBFuture* future, void* callback_parameter);
@ -639,18 +646,30 @@ public:
void addref() override { ThreadSafeReferenceCounted<MultiVersionTenant>::addref(); }
void delref() override { ThreadSafeReferenceCounted<MultiVersionTenant>::delref(); }
Reference<ThreadSafeAsyncVar<Reference<ITenant>>> tenantVar;
const Standalone<StringRef> tenantName;
// A struct that manages the current tenant state of the MultiVersionTenant. This wraps the underlying
// ITenant object that is currently interacting with the cluster.
struct TenantState : ThreadSafeReferenceCounted<TenantState> {
TenantState(Reference<MultiVersionDatabase> db, StringRef tenantName);
private:
Reference<MultiVersionDatabase> db;
// Creates a new underlying tenant object whenever the database connection changes. This change is signaled
// to open transactions via an AsyncVar.
void updateTenant();
Mutex tenantLock;
ThreadFuture<Void> tenantUpdater;
// Cleans up local state to break reference cycles
void close();
// Creates a new underlying tenant object whenever the database connection changes. This change is signaled
// to open transactions via an AsyncVar.
void updateTenant();
Reference<ThreadSafeAsyncVar<Reference<ITenant>>> tenantVar;
const Standalone<StringRef> tenantName;
Reference<MultiVersionDatabase> db;
Mutex tenantLock;
ThreadFuture<Void> tenantUpdater;
bool closed;
};
Reference<TenantState> tenantState;
};
// An implementation of IDatabase that wraps a database created either locally or through a dynamically loaded

File diff suppressed because it is too large

View File

@ -459,6 +459,10 @@ public:
std::vector<Reference<Watch>> watches;
Span span;
// Used in template functions as the returned Future type
template <typename Type>
using FutureT = Future<Type>;
private:
Future<Version> getReadVersion(uint32_t flags);

View File

@ -80,6 +80,8 @@ struct Notified {
val = std::move(r.val);
}
int numWaiting() { return waiting.size(); }
private:
using Item = std::pair<ValueType, Promise<Void>>;
struct ItemCompare {

View File

@ -1791,8 +1791,6 @@ Future<Standalone<VectorRef<BlobGranuleChunkRef>>> ReadYourWritesTransaction::re
Version begin,
Optional<Version> readVersion,
Version* readVersionOut) {
// Remove in V2 of API
ASSERT(begin == 0);
if (!options.readYourWritesDisabled) {
return blob_granule_no_ryw();

View File

@ -49,6 +49,7 @@ struct RestoreRequest {
int index;
Key tagName;
Key url;
Optional<std::string> proxy;
Version targetVersion;
KeyRange range;
UID randomUid;
@ -64,27 +65,29 @@ struct RestoreRequest {
explicit RestoreRequest(const int index,
const Key& tagName,
const Key& url,
const Optional<std::string>& proxy,
Version targetVersion,
const KeyRange& range,
const UID& randomUid,
Key& addPrefix,
Key removePrefix)
: index(index), tagName(tagName), url(url), targetVersion(targetVersion), range(range), randomUid(randomUid),
addPrefix(addPrefix), removePrefix(removePrefix) {}
: index(index), tagName(tagName), url(url), proxy(proxy), targetVersion(targetVersion), range(range),
randomUid(randomUid), addPrefix(addPrefix), removePrefix(removePrefix) {}
// To change this serialization, ProtocolVersion::RestoreRequestValue must be updated, and downgrades need to be
// considered
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, index, tagName, url, targetVersion, range, randomUid, addPrefix, removePrefix, reply);
serializer(ar, index, tagName, url, proxy, targetVersion, range, randomUid, addPrefix, removePrefix, reply);
}
std::string toString() const {
std::stringstream ss;
ss << "index:" << std::to_string(index) << " tagName:" << tagName.contents().toString()
<< " url:" << url.contents().toString() << " targetVersion:" << std::to_string(targetVersion)
<< " range:" << range.toString() << " randomUid:" << randomUid.toString()
<< " addPrefix:" << addPrefix.toString() << " removePrefix:" << removePrefix.toString();
<< " url:" << url.contents().toString() << " proxy:" << (proxy.present() ? proxy.get() : "")
<< " targetVersion:" << std::to_string(targetVersion) << " range:" << range.toString()
<< " randomUid:" << randomUid.toString() << " addPrefix:" << addPrefix.toString()
<< " removePrefix:" << removePrefix.toString();
return ss.str();
}
};

View File

@ -32,8 +32,11 @@
#include <boost/algorithm/string/classification.hpp>
#include <boost/algorithm/string.hpp>
#include "fdbrpc/IAsyncFile.h"
#include "flow/Hostname.h"
#include "flow/UnitTest.h"
#include "fdbclient/rapidxml/rapidxml.hpp"
#include "fdbclient/FDBAWSCredentialsProvider.h"
#include "flow/actorcompiler.h" // has to be last include
using namespace rapidxml;
@ -82,6 +85,7 @@ S3BlobStoreEndpoint::BlobKnobs::BlobKnobs() {
read_cache_blocks_per_file = CLIENT_KNOBS->BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE;
max_send_bytes_per_second = CLIENT_KNOBS->BLOBSTORE_MAX_SEND_BYTES_PER_SECOND;
max_recv_bytes_per_second = CLIENT_KNOBS->BLOBSTORE_MAX_RECV_BYTES_PER_SECOND;
sdk_auth = false;
}
bool S3BlobStoreEndpoint::BlobKnobs::set(StringRef name, int value) {
@ -118,6 +122,7 @@ bool S3BlobStoreEndpoint::BlobKnobs::set(StringRef name, int value) {
TRY_PARAM(read_cache_blocks_per_file, rcb);
TRY_PARAM(max_send_bytes_per_second, sbps);
TRY_PARAM(max_recv_bytes_per_second, rbps);
TRY_PARAM(sdk_auth, sa);
#undef TRY_PARAM
return false;
}
@ -158,7 +163,8 @@ std::string S3BlobStoreEndpoint::BlobKnobs::getURLParameters() const {
return r;
}
Reference<S3BlobStoreEndpoint> S3BlobStoreEndpoint::fromString(std::string const& url,
Reference<S3BlobStoreEndpoint> S3BlobStoreEndpoint::fromString(const std::string& url,
const Optional<std::string>& proxy,
std::string* resourceFromURL,
std::string* error,
ParametersT* ignored_parameters) {
@ -171,6 +177,17 @@ Reference<S3BlobStoreEndpoint> S3BlobStoreEndpoint::fromString(std::string const
if (prefix != LiteralStringRef("blobstore"))
throw format("Invalid blobstore URL prefix '%s'", prefix.toString().c_str());
Optional<std::string> proxyHost, proxyPort;
if (proxy.present()) {
if (!Hostname::isHostname(proxy.get()) && !NetworkAddress::parseOptional(proxy.get()).present()) {
throw format("'%s' is not a valid value for proxy. Format should be either IP:port or host:port.",
proxy.get().c_str());
}
StringRef p(proxy.get());
proxyHost = p.eat(":").toString();
proxyPort = p.eat().toString();
}
Optional<StringRef> cred;
if (url.find("@") != std::string::npos) {
cred = t.eat("@");
@ -257,7 +274,8 @@ Reference<S3BlobStoreEndpoint> S3BlobStoreEndpoint::fromString(std::string const
creds = S3BlobStoreEndpoint::Credentials{ key.toString(), secret.toString(), securityToken.toString() };
}
return makeReference<S3BlobStoreEndpoint>(host.toString(), service.toString(), creds, knobs, extraHeaders);
return makeReference<S3BlobStoreEndpoint>(
host.toString(), service.toString(), proxyHost, proxyPort, creds, knobs, extraHeaders);
} catch (std::string& err) {
if (error != nullptr)
@ -506,7 +524,38 @@ ACTOR Future<Optional<json_spirit::mObject>> tryReadJSONFile(std::string path) {
return Optional<json_spirit::mObject>();
}
// If the credentials expire, the connection will eventually fail and be discarded from the pool, and then a new
// connection will be constructed, which will call this again to get updated credentials
static S3BlobStoreEndpoint::Credentials getSecretSdk() {
#ifdef BUILD_AWS_BACKUP
double elapsed = -timer_monotonic();
Aws::Auth::AWSCredentials awsCreds = FDBAWSCredentialsProvider::getAwsCredentials();
elapsed += timer_monotonic();
if (awsCreds.IsEmpty()) {
TraceEvent(SevWarn, "S3BlobStoreAWSCredsEmpty");
throw backup_auth_missing();
}
S3BlobStoreEndpoint::Credentials fdbCreds;
fdbCreds.key = awsCreds.GetAWSAccessKeyId();
fdbCreds.secret = awsCreds.GetAWSSecretKey();
fdbCreds.securityToken = awsCreds.GetSessionToken();
TraceEvent("S3BlobStoreGotSdkCredentials").suppressFor(60).detail("Duration", elapsed);
return fdbCreds;
#else
TraceEvent(SevError, "S3BlobStoreNoSDK");
throw backup_auth_missing();
#endif
}
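A hedged usage sketch of the reworked entry point (URL and proxy values are illustrative): fromString now takes the proxy explicitly, and the sdk_auth ("sa") URL knob routes credential lookup through the SDK path above:

std::string resource, error;
Reference<S3BlobStoreEndpoint> ep =
    S3BlobStoreEndpoint::fromString("blobstore://s3.amazonaws.com/backup_container?sdk_auth=1",
                                    Optional<std::string>("proxyhost:8080"),
                                    &resource,
                                    &error,
                                    nullptr);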
ACTOR Future<Void> updateSecret_impl(Reference<S3BlobStoreEndpoint> b) {
if (b->knobs.sdk_auth) {
b->credentials = getSecretSdk();
return Void();
}
std::vector<std::string>* pFiles = (std::vector<std::string>*)g_network->global(INetwork::enBlobCredentialFiles);
if (pFiles == nullptr)
return Void();
@ -538,7 +587,7 @@ ACTOR Future<Void> updateSecret_impl(Reference<S3BlobStoreEndpoint> b) {
JSONDoc accounts(doc.last().get_obj());
if (accounts.has(credentialsFileKey, false) && accounts.last().type() == json_spirit::obj_type) {
JSONDoc account(accounts.last());
S3BlobStoreEndpoint::Credentials creds;
S3BlobStoreEndpoint::Credentials creds = b->credentials.get();
if (b->lookupKey) {
std::string apiKey;
if (account.tryGet("api_key", apiKey))
@ -589,11 +638,11 @@ ACTOR Future<S3BlobStoreEndpoint::ReusableConnection> connect_impl(Reference<S3B
return rconn;
}
}
std::string service = b->service;
std::string host = b->host, service = b->service;
if (service.empty())
service = b->knobs.secure_connection ? "https" : "http";
state Reference<IConnection> conn =
wait(INetworkConnections::net()->connect(b->host, service, b->knobs.secure_connection ? true : false));
wait(INetworkConnections::net()->connect(host, service, b->knobs.secure_connection ? true : false));
wait(conn->connectHandshake());
TraceEvent("S3BlobStoreEndpointNewConnection")
@ -601,7 +650,7 @@ ACTOR Future<S3BlobStoreEndpoint::ReusableConnection> connect_impl(Reference<S3B
.detail("RemoteEndpoint", conn->getPeerAddress())
.detail("ExpiresIn", b->knobs.max_connection_life);
if (b->lookupKey || b->lookupSecret)
if (b->lookupKey || b->lookupSecret || b->knobs.sdk_auth)
wait(b->updateSecret());
return S3BlobStoreEndpoint::ReusableConnection({ conn, now() + b->knobs.max_connection_life });
@ -1574,7 +1623,7 @@ TEST_CASE("/backup/s3/v4headers") {
S3BlobStoreEndpoint::Credentials creds{ "AKIAIOSFODNN7EXAMPLE", "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", "" };
// GET without query parameters
{
S3BlobStoreEndpoint s3("s3.amazonaws.com", "s3", creds);
S3BlobStoreEndpoint s3("s3.amazonaws.com", "s3", "proxy", "port", creds);
std::string verb("GET");
std::string resource("/test.txt");
HTTP::Headers headers;
@ -1589,7 +1638,7 @@ TEST_CASE("/backup/s3/v4headers") {
// GET with query parameters
{
S3BlobStoreEndpoint s3("s3.amazonaws.com", "s3", creds);
S3BlobStoreEndpoint s3("s3.amazonaws.com", "s3", "proxy", "port", creds);
std::string verb("GET");
std::string resource("/test/examplebucket?Action=DescribeRegions&Version=2013-10-15");
HTTP::Headers headers;
@ -1604,7 +1653,7 @@ TEST_CASE("/backup/s3/v4headers") {
// POST
{
S3BlobStoreEndpoint s3("s3.us-west-2.amazonaws.com", "s3", creds);
S3BlobStoreEndpoint s3("s3.us-west-2.amazonaws.com", "s3", "proxy", "port", creds);
std::string verb("POST");
std::string resource("/simple.json");
HTTP::Headers headers;

View File

@ -59,7 +59,7 @@ public:
delete_requests_per_second, multipart_max_part_size, multipart_min_part_size, concurrent_requests,
concurrent_uploads, concurrent_lists, concurrent_reads_per_file, concurrent_writes_per_file,
read_block_size, read_ahead_blocks, read_cache_blocks_per_file, max_send_bytes_per_second,
max_recv_bytes_per_second;
max_recv_bytes_per_second, sdk_auth;
bool set(StringRef name, int value);
std::string getURLParameters() const;
static std::vector<std::string> getKnobDescriptions() {
@ -91,17 +91,23 @@ public:
"read_cache_blocks_per_file (or rcb) Size of the read cache for a file in blocks.",
"max_send_bytes_per_second (or sbps) Max send bytes per second for all requests combined.",
"max_recv_bytes_per_second (or rbps) Max receive bytes per second for all requests combined (NOT YET "
"USED)."
"USED).",
"sdk_auth (or sa) Use AWS SDK to resolve credentials. Only valid if "
"BUILD_AWS_BACKUP is enabled."
};
}
};
S3BlobStoreEndpoint(std::string const& host,
std::string service,
std::string const& service,
Optional<std::string> const& proxyHost,
Optional<std::string> const& proxyPort,
Optional<Credentials> const& creds,
BlobKnobs const& knobs = BlobKnobs(),
HTTP::Headers extraHeaders = HTTP::Headers())
: host(host), service(service), credentials(creds), lookupKey(creds.present() && creds.get().key.empty()),
: host(host), service(service), proxyHost(proxyHost), proxyPort(proxyPort),
useProxy(proxyHost.present() && proxyPort.present()), credentials(creds),
lookupKey(creds.present() && creds.get().key.empty()),
lookupSecret(creds.present() && creds.get().secret.empty()), knobs(knobs), extraHeaders(extraHeaders),
requestRate(new SpeedLimit(knobs.requests_per_second, 1)),
requestRateList(new SpeedLimit(knobs.list_requests_per_second, 1)),
@ -112,7 +118,7 @@ public:
recvRate(new SpeedLimit(knobs.max_recv_bytes_per_second, 1)), concurrentRequests(knobs.concurrent_requests),
concurrentUploads(knobs.concurrent_uploads), concurrentLists(knobs.concurrent_lists) {
if (host.empty())
if (host.empty() || (proxyHost.present() != proxyPort.present()))
throw connection_string_invalid();
}
@ -130,10 +136,11 @@ public:
// Parse url and return an S3BlobStoreEndpoint
// If the url has parameters that S3BlobStoreEndpoint can't consume then an error will be thrown unless
// ignored_parameters is given in which case the unconsumed parameters will be added to it.
static Reference<S3BlobStoreEndpoint> fromString(std::string const& url,
std::string* resourceFromURL = nullptr,
std::string* error = nullptr,
ParametersT* ignored_parameters = nullptr);
static Reference<S3BlobStoreEndpoint> fromString(const std::string& url,
const Optional<std::string>& proxy,
std::string* resourceFromURL,
std::string* error,
ParametersT* ignored_parameters);
// Get a normalized version of this URL with the given resource and any non-default BlobKnob values as URL
// parameters in addition to the passed params string
@ -149,6 +156,10 @@ public:
std::string host;
std::string service;
Optional<std::string> proxyHost;
Optional<std::string> proxyPort;
bool useProxy;
Optional<Credentials> credentials;
bool lookupKey;
bool lookupSecret;

View File

@ -24,37 +24,37 @@
const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
{
"cluster":{
"storage_wiggler": {
"wiggle_server_ids":["0ccb4e0feddb55"],
"wiggle_server_addresses": ["127.0.0.1"],
"storage_wiggler": {
"wiggle_server_ids":["0ccb4e0feddb55"],
"wiggle_server_addresses": ["127.0.0.1"],
"primary": {
"last_round_start_datetime": "Wed Feb 4 09:36:37 2022 +0000",
"last_round_start_timestamp": 63811229797,
"last_round_finish_datetime": "Thu Jan 1 00:00:00 1970 +0000",
"last_round_finish_timestamp": 0,
"smoothed_round_seconds": 1,
"finished_round": 1,
"last_wiggle_start_datetime": "Wed Feb 4 09:36:37 2022 +0000",
"last_wiggle_start_timestamp": 63811229797,
"last_wiggle_finish_datetime": "Thu Jan 1 00:00:00 1970 +0000",
"last_wiggle_finish_timestamp": 0,
"smoothed_wiggle_seconds": 1,
"finished_wiggle": 1
},
"remote": {
"last_round_start_datetime": "Wed Feb 4 09:36:37 2022 +0000",
"last_round_start_timestamp": 63811229797,
"last_round_finish_datetime": "Thu Jan 1 00:00:00 1970 +0000",
"last_round_finish_timestamp": 0,
"smoothed_round_seconds": 1,
"finished_round": 1,
"last_wiggle_start_datetime": "Wed Feb 4 09:36:37 2022 +0000",
"last_wiggle_start_timestamp": 63811229797,
"last_wiggle_finish_datetime": "Thu Jan 1 00:00:00 1970 +0000",
"last_wiggle_finish_timestamp": 0,
"smoothed_wiggle_seconds": 1,
"finished_wiggle": 1
}
"last_round_start_datetime": "2022-04-02 00:05:05.123 +0000",
"last_round_start_timestamp": 1648857905.123,
"last_round_finish_datetime": "1970-01-01 00:00:00.000 +0000",
"last_round_finish_timestamp": 0,
"smoothed_round_seconds": 1,
"finished_round": 1,
"last_wiggle_start_datetime": "2022-04-02 00:05:05.123 +0000",
"last_wiggle_start_timestamp": 1648857905.123,
"last_wiggle_finish_datetime": "1970-01-01 00:00:00.000 +0000",
"last_wiggle_finish_timestamp": 0,
"smoothed_wiggle_seconds": 1,
"finished_wiggle": 1
},
"remote": {
"last_round_start_datetime": "2022-04-02 00:05:05.123 +0000",
"last_round_start_timestamp": 1648857905.123,
"last_round_finish_datetime": "1970-01-01 00:00:00.000 +0000",
"last_round_finish_timestamp": 0,
"smoothed_round_seconds": 1,
"finished_round": 1,
"last_wiggle_start_datetime": "2022-04-02 00:05:05.123 +0000",
"last_wiggle_start_timestamp": 1648857905.123,
"last_wiggle_finish_datetime": "1970-01-01 00:00:00.000 +0000",
"last_wiggle_finish_timestamp": 0,
"smoothed_wiggle_seconds": 1,
"finished_wiggle": 1
}
},
"layers":{
"_valid":true,
@ -136,7 +136,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
]
},
"storage_metadata":{
"created_time_datetime":"Thu Jan 1 00:00:00 1970 +0000",
"created_time_datetime":"1970-01-01 00:00:00.000 +0000",
"created_time_timestamp": 0
},
"data_version":12341234,
@ -768,7 +768,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"ssd-1",
"ssd-2",
"ssd-redwood-1-experimental",
"ssd-rocksdb-experimental",
"ssd-rocksdb-v1",
"memory",
"memory-1",
"memory-2",
@ -781,7 +781,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"ssd-1",
"ssd-2",
"ssd-redwood-1-experimental",
"ssd-rocksdb-experimental",
"ssd-rocksdb-v1",
"memory",
"memory-1",
"memory-2",
@ -810,6 +810,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"aggressive",
"gradual"
]},
"blob_granules_enabled":0,
"tenant_mode": {
"$enum":[
"disabled",

View File

@ -152,7 +152,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( RETRY_RELOCATESHARD_DELAY, 0.1 );
init( DATA_DISTRIBUTION_FAILURE_REACTION_TIME, 60.0 ); if( randomize && BUGGIFY ) DATA_DISTRIBUTION_FAILURE_REACTION_TIME = 1.0;
bool buggifySmallShards = randomize && BUGGIFY;
bool simulationMediumShards = !buggifySmallShards && randomize && !BUGGIFY; // prefer smaller shards in simulation
bool simulationMediumShards = !buggifySmallShards && isSimulated && randomize && !BUGGIFY; // prefer smaller shards in simulation
init( MIN_SHARD_BYTES, 50000000 ); if( buggifySmallShards ) MIN_SHARD_BYTES = 40000; if (simulationMediumShards) MIN_SHARD_BYTES = 200000; //FIXME: data distribution tracker (specifically StorageMetrics) relies on this number being larger than the maximum size of a key value pair
init( SHARD_BYTES_RATIO, 4 );
init( SHARD_BYTES_PER_SQRT_BYTES, 45 ); if( buggifySmallShards ) SHARD_BYTES_PER_SQRT_BYTES = 0;//Approximately 10000 bytes per shard
@ -250,6 +250,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( DEBOUNCE_RECRUITING_DELAY, 5.0 );
init( DD_FAILURE_TIME, 1.0 ); if( randomize && BUGGIFY ) DD_FAILURE_TIME = 10.0;
init( DD_ZERO_HEALTHY_TEAM_DELAY, 1.0 );
init( REMOTE_KV_STORE, false ); if( randomize && BUGGIFY ) REMOTE_KV_STORE = true;
init( REMOTE_KV_STORE_INIT_DELAY, 0.1 );
init( REMOTE_KV_STORE_MAX_INIT_DURATION, 10.0 );
init( REBALANCE_MAX_RETRIES, 100 );
init( DD_OVERLAP_PENALTY, 10000 );
init( DD_EXCLUDE_MIN_REPLICAS, 1 );
@ -292,7 +295,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( SQLITE_CHUNK_SIZE_PAGES_SIM, 1024 ); // 4MB
init( SQLITE_READER_THREADS, 64 ); // number of read threads
init( SQLITE_WRITE_WINDOW_SECONDS, -1 );
init( SQLITE_CURSOR_MAX_LIFETIME_BYTES, 1e6 ); if( randomize && BUGGIFY ) SQLITE_CURSOR_MAX_LIFETIME_BYTES = 0;
init( SQLITE_CURSOR_MAX_LIFETIME_BYTES, 1e6 ); if (buggifySmallShards || simulationMediumShards) SQLITE_CURSOR_MAX_LIFETIME_BYTES = MIN_SHARD_BYTES; if( randomize && BUGGIFY ) SQLITE_CURSOR_MAX_LIFETIME_BYTES = 0;
init( SQLITE_WRITE_WINDOW_LIMIT, -1 );
if( randomize && BUGGIFY ) {
// Choose an window between .01 and 1.01 seconds.
@ -380,6 +383,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// Set ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD to 0, to disable
init( ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD, 1 );
init( ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD, 5 );
init( ROCKSDB_COMPACTION_READAHEAD_SIZE, 32768 ); // 32 KB, performs bigger reads when doing compaction.
// Leader election
bool longLeaderElection = randomize && BUGGIFY;
@ -537,6 +541,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( POLICY_GENERATIONS, 100 ); if( randomize && BUGGIFY ) POLICY_GENERATIONS = 10;
init( DBINFO_SEND_AMOUNT, 5 );
init( DBINFO_BATCH_DELAY, 0.1 );
init( SINGLETON_RECRUIT_BME_DELAY, 10.0 );
//Move Keys
init( SHARD_READY_DELAY, 0.25 );
@ -553,6 +558,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( MIN_REBOOT_TIME, 4.0 ); if( longReboots ) MIN_REBOOT_TIME = 10.0;
init( MAX_REBOOT_TIME, 5.0 ); if( longReboots ) MAX_REBOOT_TIME = 20.0;
init( LOG_DIRECTORY, "."); // Will be set to the command line flag.
init( CONN_FILE, ""); // Will be set to the command line flag.
init( SERVER_MEM_LIMIT, 8LL << 30 );
init( SYSTEM_MONITOR_FREQUENCY, 5.0 );
@ -649,7 +655,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( FETCH_KEYS_PARALLELISM_BYTES, 4e6 ); if( randomize && BUGGIFY ) FETCH_KEYS_PARALLELISM_BYTES = 3e6;
init( FETCH_KEYS_PARALLELISM, 2 );
init( FETCH_KEYS_LOWER_PRIORITY, 0 );
init( FETCH_CHANGEFEED_PARALLELISM, 2 );
init( BUGGIFY_BLOCK_BYTES, 10000 );
init( STORAGE_RECOVERY_VERSION_LAG_LIMIT, 2 * MAX_READ_TRANSACTION_LIFE_VERSIONS );
init( STORAGE_COMMIT_BYTES, 10000000 ); if( randomize && BUGGIFY ) STORAGE_COMMIT_BYTES = 2000000;
init( STORAGE_FETCH_BYTES, 2500000 ); if( randomize && BUGGIFY ) STORAGE_FETCH_BYTES = 500000;
init( STORAGE_DURABILITY_LAG_REJECT_THRESHOLD, 0.25 );
@ -679,6 +687,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( FETCH_KEYS_TOO_LONG_TIME_CRITERIA, 300.0 );
init( MAX_STORAGE_COMMIT_TIME, 120.0 ); //The max fsync stall time on the storage server and tlog before marking a disk as failed
init( RANGESTREAM_LIMIT_BYTES, 2e6 ); if( randomize && BUGGIFY ) RANGESTREAM_LIMIT_BYTES = 1;
init( CHANGEFEEDSTREAM_LIMIT_BYTES, 1e6 ); if( randomize && BUGGIFY ) CHANGEFEEDSTREAM_LIMIT_BYTES = 1;
init( BLOBWORKERSTATUSSTREAM_LIMIT_BYTES, 1e4 ); if( randomize && BUGGIFY ) BLOBWORKERSTATUSSTREAM_LIMIT_BYTES = 1;
init( ENABLE_CLEAR_RANGE_EAGER_READS, true );
init( CHECKPOINT_TRANSFER_BLOCK_BYTES, 40e6 );
init( QUICK_GET_VALUE_FALLBACK, true );
@ -708,6 +718,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( PEER_LATENCY_DEGRADATION_PERCENTILE, 0.90 );
init( PEER_LATENCY_DEGRADATION_THRESHOLD, 0.05 );
init( PEER_TIMEOUT_PERCENTAGE_DEGRADATION_THRESHOLD, 0.1 );
init( PEER_DEGRADATION_CONNECTION_FAILURE_COUNT, 1 );
// Test harness
init( WORKER_POLL_DELAY, 1.0 );
@ -823,14 +834,32 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// encrypt key proxy
init( ENABLE_ENCRYPTION, false );
init( ENCRYPTION_MODE, "AES-256-CTR");
// Blob granules
init( BG_URL, "" ); // TODO: store in system key space, eventually
init( BG_SNAPSHOT_FILE_TARGET_BYTES, 10000000 ); if( randomize && BUGGIFY ) { deterministicRandom()->random01() < 0.1 ? BG_SNAPSHOT_FILE_TARGET_BYTES /= 100 : BG_SNAPSHOT_FILE_TARGET_BYTES /= 10; }
init( BG_URL, isSimulated ? "file://fdbblob/" : "" ); // TODO: store in system key space or something, eventually
init( BG_SNAPSHOT_FILE_TARGET_BYTES, 10000000 ); if( buggifySmallShards ) BG_SNAPSHOT_FILE_TARGET_BYTES = 100000; else if (simulationMediumShards || (randomize && BUGGIFY) ) BG_SNAPSHOT_FILE_TARGET_BYTES = 1000000;
init( BG_DELTA_BYTES_BEFORE_COMPACT, BG_SNAPSHOT_FILE_TARGET_BYTES/2 );
init( BG_DELTA_FILE_TARGET_BYTES, BG_DELTA_BYTES_BEFORE_COMPACT/10 );
init( BG_MAX_SPLIT_FANOUT, 10 ); if( randomize && BUGGIFY ) BG_MAX_SPLIT_FANOUT = deterministicRandom()->randomInt(5, 15);
init( BG_HOT_SNAPSHOT_VERSIONS, 5000000 );
init( BG_CONSISTENCY_CHECK_ENABLED, true ); if (randomize && BUGGIFY) BG_CONSISTENCY_CHECK_ENABLED = false;
init( BG_CONSISTENCY_CHECK_TARGET_SPEED_KB, 1000 ); if (randomize && BUGGIFY) BG_CONSISTENCY_CHECK_TARGET_SPEED_KB *= (deterministicRandom()->randomInt(2, 50) / 10);
init( BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM, 8 ); if( randomize && BUGGIFY ) BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM = 1;
init( BLOB_WORKER_TIMEOUT, 10.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_TIMEOUT = 1.0;
init( BLOB_WORKER_REQUEST_TIMEOUT, 5.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_REQUEST_TIMEOUT = 1.0;
init( BLOB_WORKERLIST_FETCH_INTERVAL, 1.0 );
init( BLOB_WORKER_BATCH_GRV_INTERVAL, 0.1 );
init( BLOB_MANAGER_STATUS_EXP_BACKOFF_MIN, 0.1 );
init( BLOB_MANAGER_STATUS_EXP_BACKOFF_MAX, 5.0 );
init( BLOB_MANAGER_STATUS_EXP_BACKOFF_EXPONENT, 1.5 );
init( BGCC_TIMEOUT, isSimulated ? 10.0 : 120.0 );
init( BGCC_MIN_INTERVAL, isSimulated ? 1.0 : 10.0 );
// clang-format on
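For readers new to this file, the init lines above all follow one shape, sketched here with a hypothetical knob: a production default, optionally perturbed under randomized simulation (BUGGIFY) to widen test coverage:

init( MY_EXAMPLE_KNOB, 10.0 ); if( randomize && BUGGIFY ) MY_EXAMPLE_KNOB = deterministicRandom()->random01() * 10.0;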

View File

@ -233,6 +233,14 @@ public:
double DD_FAILURE_TIME;
double DD_ZERO_HEALTHY_TEAM_DELAY;
// Run the storage engine in a child process on the same machine as the storage process
bool REMOTE_KV_STORE;
// A delay to avoid racing on file resources if the new kv store process starts
// immediately after the previous kv store process died
double REMOTE_KV_STORE_INIT_DELAY;
// max waiting time for the remote kv store to initialize
double REMOTE_KV_STORE_MAX_INIT_DURATION;
// KeyValueStore SQLITE
int CLEAR_BUFFER_SIZE;
double READ_VALUE_TIME_ESTIMATE;
@ -307,6 +315,7 @@ public:
int64_t ROCKSDB_CAN_COMMIT_COMPACT_BYTES_LIMIT;
int ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD;
int ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD;
int64_t ROCKSDB_COMPACTION_READAHEAD_SIZE;
// Leader election
int MAX_NOTIFICATIONS;
@ -469,6 +478,7 @@ public:
double RECRUITMENT_TIMEOUT;
int DBINFO_SEND_AMOUNT;
double DBINFO_BATCH_DELAY;
double SINGLETON_RECRUIT_BME_DELAY;
// Move Keys
double SHARD_READY_DELAY;
@ -486,6 +496,7 @@ public:
double MIN_REBOOT_TIME;
double MAX_REBOOT_TIME;
std::string LOG_DIRECTORY;
std::string CONN_FILE;
int64_t SERVER_MEM_LIMIT;
double SYSTEM_MONITOR_FREQUENCY;
@ -585,7 +596,9 @@ public:
int FETCH_KEYS_PARALLELISM_BYTES;
int FETCH_KEYS_PARALLELISM;
int FETCH_KEYS_LOWER_PRIORITY;
int FETCH_CHANGEFEED_PARALLELISM;
int BUGGIFY_BLOCK_BYTES;
int64_t STORAGE_RECOVERY_VERSION_LAG_LIMIT;
double STORAGE_DURABILITY_LAG_REJECT_THRESHOLD;
double STORAGE_DURABILITY_LAG_MIN_RATE;
int STORAGE_COMMIT_BYTES;
@ -615,6 +628,8 @@ public:
double FETCH_KEYS_TOO_LONG_TIME_CRITERIA;
double MAX_STORAGE_COMMIT_TIME;
int64_t RANGESTREAM_LIMIT_BYTES;
int64_t CHANGEFEEDSTREAM_LIMIT_BYTES;
int64_t BLOBWORKERSTATUSSTREAM_LIMIT_BYTES;
bool ENABLE_CLEAR_RANGE_EAGER_READS;
bool QUICK_GET_VALUE_FALLBACK;
bool QUICK_GET_KEY_VALUES_FALLBACK;
@ -645,6 +660,8 @@ public:
double PEER_LATENCY_DEGRADATION_PERCENTILE; // The percentile latency used to check peer health.
double PEER_LATENCY_DEGRADATION_THRESHOLD; // The latency threshold to consider a peer degraded.
double PEER_TIMEOUT_PERCENTAGE_DEGRADATION_THRESHOLD; // The percentage of timeout to consider a peer degraded.
int PEER_DEGRADATION_CONNECTION_FAILURE_COUNT; // The number of connection failures experienced during measurement
// period to consider a peer degraded.
// Test harness
double WORKER_POLL_DELAY;
@ -770,8 +787,9 @@ public:
// Cluster recovery
std::string CLUSTER_RECOVERY_EVENT_NAME_PREFIX;
// encrypt key proxy
// Encryption
bool ENABLE_ENCRYPTION;
std::string ENCRYPTION_MODE;
// blob granule stuff
// FIXME: configure url with database configuration instead of knob eventually
@ -780,8 +798,22 @@ public:
int BG_SNAPSHOT_FILE_TARGET_BYTES;
int BG_DELTA_FILE_TARGET_BYTES;
int BG_DELTA_BYTES_BEFORE_COMPACT;
int BG_MAX_SPLIT_FANOUT;
int BG_HOT_SNAPSHOT_VERSIONS;
int BG_CONSISTENCY_CHECK_ENABLED;
int BG_CONSISTENCY_CHECK_TARGET_SPEED_KB;
int BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM;
double BLOB_WORKER_TIMEOUT; // Blob Manager's reaction time to a blob worker failure
double BLOB_WORKER_REQUEST_TIMEOUT; // Blob Worker's server-side request timeout
double BLOB_WORKERLIST_FETCH_INTERVAL;
double BLOB_WORKER_BATCH_GRV_INTERVAL;
double BLOB_MANAGER_STATUS_EXP_BACKOFF_MIN;
double BLOB_MANAGER_STATUS_EXP_BACKOFF_MAX;
double BLOB_MANAGER_STATUS_EXP_BACKOFF_EXPONENT;
double BGCC_TIMEOUT;
double BGCC_MIN_INTERVAL;
ServerKnobs(Randomize, ClientKnobs*, IsSimulated);
void initialize(Randomize, ClientKnobs*, IsSimulated);

View File

@ -2704,16 +2704,23 @@ Future<Optional<std::string>> FailedLocalitiesRangeImpl::commit(ReadYourWritesTr
}
ACTOR Future<RangeResult> getTenantList(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) {
KeyRangeRef tenantRange =
kr.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)
.removePrefix(TenantMapRangeImpl::submoduleRange.begin);
state KeyRef managementPrefix =
kr.begin.substr(0,
SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin.size() +
TenantMapRangeImpl::submoduleRange.begin.size());
std::map<TenantName, TenantMapEntry> tenants = wait(ManagementAPI::listTenantsTransaction(
&ryw->getTransaction(), tenantRange.begin, tenantRange.end, limitsHint.rows));
kr = kr.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin);
TenantNameRef beginTenant = kr.begin.removePrefix(TenantMapRangeImpl::submoduleRange.begin);
TenantNameRef endTenant = kr.end;
if (endTenant.startsWith(TenantMapRangeImpl::submoduleRange.begin)) {
endTenant = endTenant.removePrefix(TenantMapRangeImpl::submoduleRange.begin);
} else {
endTenant = "\xff"_sr;
}
std::map<TenantName, TenantMapEntry> tenants =
wait(ManagementAPI::listTenantsTransaction(&ryw->getTransaction(), beginTenant, endTenant, limitsHint.rows));
RangeResult results;
for (auto tenant : tenants) {
@ -2783,7 +2790,7 @@ Future<Optional<std::string>> TenantMapRangeImpl::commit(ReadYourWritesTransacti
TenantNameRef endTenant = range.end().removePrefix(
SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin);
if (endTenant.startsWith(submoduleRange.begin)) {
endTenant = endTenant.removePrefix(submoduleRange.end);
endTenant = endTenant.removePrefix(submoduleRange.begin);
} else {
endTenant = "\xff"_sr;
}

View File

@ -89,12 +89,19 @@ struct StorageServerInterface {
RequestStream<struct GetCheckpointRequest> checkpoint;
RequestStream<struct FetchCheckpointRequest> fetchCheckpoint;
explicit StorageServerInterface(UID uid) : uniqueID(uid) {}
StorageServerInterface() : uniqueID(deterministicRandom()->randomUniqueID()) {}
private:
bool acceptingRequests;
public:
explicit StorageServerInterface(UID uid) : uniqueID(uid) { acceptingRequests = false; }
StorageServerInterface() : uniqueID(deterministicRandom()->randomUniqueID()) { acceptingRequests = false; }
NetworkAddress address() const { return getValue.getEndpoint().getPrimaryAddress(); }
NetworkAddress stableAddress() const { return getValue.getEndpoint().getStableAddress(); }
Optional<NetworkAddress> secondaryAddress() const { return getValue.getEndpoint().addresses.secondaryAddress; }
UID id() const { return uniqueID; }
bool isAcceptingRequests() const { return acceptingRequests; }
void startAcceptingRequests() { acceptingRequests = true; }
void stopAcceptingRequests() { acceptingRequests = false; }
bool isTss() const { return tssPairID.present(); }
std::string toString() const { return id().shortString(); }
template <class Ar>
@ -105,7 +112,11 @@ struct StorageServerInterface {
if (ar.protocolVersion().hasSmallEndpoints()) {
if (ar.protocolVersion().hasTSS()) {
serializer(ar, uniqueID, locality, getValue, tssPairID);
if (ar.protocolVersion().hasStorageInterfaceReadiness()) {
serializer(ar, uniqueID, locality, getValue, tssPairID, acceptingRequests);
} else {
serializer(ar, uniqueID, locality, getValue, tssPairID);
}
} else {
serializer(ar, uniqueID, locality, getValue);
}
@ -773,6 +784,7 @@ struct ChangeFeedStreamReply : public ReplyPromiseStreamReply {
VectorRef<MutationsAndVersionRef> mutations;
bool atLatestVersion = false;
Version minStreamVersion = invalidVersion;
Version popVersion = invalidVersion;
ChangeFeedStreamReply() {}
@ -786,6 +798,7 @@ struct ChangeFeedStreamReply : public ReplyPromiseStreamReply {
mutations,
atLatestVersion,
minStreamVersion,
popVersion,
arena);
}
};
@ -798,12 +811,18 @@ struct ChangeFeedStreamRequest {
Version begin = 0;
Version end = 0;
KeyRange range;
int replyBufferSize = -1;
bool canReadPopped = true;
UID debugUID; // Used only for debugging and tracing, but linking a client and server side stream
// is so useful for testing, and the overhead is so small compared to streaming large
// amounts of change feed data, that it is left in the interface
ReplyPromiseStream<ChangeFeedStreamReply> reply;
ChangeFeedStreamRequest() {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, rangeID, begin, end, range, reply, spanContext, arena);
serializer(ar, rangeID, begin, end, range, reply, spanContext, replyBufferSize, canReadPopped, debugUID, arena);
}
};
@ -881,19 +900,21 @@ struct FetchCheckpointRequest {
struct OverlappingChangeFeedEntry {
Key rangeId;
KeyRange range;
bool stopped = false;
Version emptyVersion;
Version stopVersion;
bool operator==(const OverlappingChangeFeedEntry& r) const {
return rangeId == r.rangeId && range == r.range && stopped == r.stopped;
return rangeId == r.rangeId && range == r.range && emptyVersion == r.emptyVersion &&
stopVersion == r.stopVersion;
}
OverlappingChangeFeedEntry() {}
OverlappingChangeFeedEntry(Key const& rangeId, KeyRange const& range, bool stopped)
: rangeId(rangeId), range(range), stopped(stopped) {}
OverlappingChangeFeedEntry(Key const& rangeId, KeyRange const& range, Version emptyVersion, Version stopVersion)
: rangeId(rangeId), range(range), emptyVersion(emptyVersion), stopVersion(stopVersion) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, rangeId, range, stopped);
serializer(ar, rangeId, range, emptyVersion, stopVersion);
}
};
@ -914,7 +935,7 @@ struct OverlappingChangeFeedsReply {
};
struct OverlappingChangeFeedsRequest {
constexpr static FileIdentifier file_identifier = 10726174;
constexpr static FileIdentifier file_identifier = 7228462;
KeyRange range;
Version minVersion;
ReplyPromise<OverlappingChangeFeedsReply> reply;
@ -929,7 +950,7 @@ struct OverlappingChangeFeedsRequest {
};
struct ChangeFeedVersionUpdateReply {
constexpr static FileIdentifier file_identifier = 11815134;
constexpr static FileIdentifier file_identifier = 4246160;
Version version = 0;
ChangeFeedVersionUpdateReply() {}
@ -983,6 +1004,22 @@ struct GetStorageMetricsRequest {
};
struct StorageQueuingMetricsReply {
struct TagInfo {
constexpr static FileIdentifier file_identifier = 4528694;
TransactionTag tag;
double rate{ 0.0 };
double fractionalBusyness{ 0.0 };
TagInfo() = default;
TagInfo(TransactionTag const& tag, double rate, double fractionalBusyness)
: tag(tag), rate(rate), fractionalBusyness(fractionalBusyness) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, tag, rate, fractionalBusyness);
}
};
constexpr static FileIdentifier file_identifier = 7633366;
double localTime;
int64_t instanceID; // changes if bytesDurable and bytesInput reset
@ -993,9 +1030,7 @@ struct StorageQueuingMetricsReply {
double cpuUsage;
double diskUsage;
double localRateLimit;
Optional<TransactionTag> busiestTag;
double busiestTagFractionalBusyness;
double busiestTagRate;
std::vector<TagInfo> busiestTags;
template <class Ar>
void serialize(Ar& ar) {
@ -1010,9 +1045,7 @@ struct StorageQueuingMetricsReply {
cpuUsage,
diskUsage,
localRateLimit,
busiestTag,
busiestTagFractionalBusyness,
busiestTagRate);
busiestTags);
}
};
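A hedged sketch of the new shape (tag names and numbers are illustrative): the reply now carries a vector of busy tags rather than a single busiest tag, so a storage server can surface several throttling candidates at once:

StorageQueuingMetricsReply reply;
reply.busiestTags.emplace_back(TransactionTag("tag_a"_sr), /*rate*/ 120.0, /*fractionalBusyness*/ 0.4);
reply.busiestTags.emplace_back(TransactionTag("tag_b"_sr), /*rate*/ 45.0, /*fractionalBusyness*/ 0.1);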

View File

@ -587,28 +587,18 @@ const Key serverListKeyFor(UID serverID) {
return wr.toValue();
}
// TODO use flatbuffers depending on version
const Value serverListValue(StorageServerInterface const& server) {
BinaryWriter wr(IncludeVersion(ProtocolVersion::withServerListValue()));
wr << server;
return wr.toValue();
auto protocolVersion = currentProtocolVersion;
protocolVersion.addObjectSerializerFlag();
return ObjectWriter::toValue(server, IncludeVersion(protocolVersion));
}
UID decodeServerListKey(KeyRef const& key) {
UID serverID;
BinaryReader rd(key.removePrefix(serverListKeys.begin), Unversioned());
rd >> serverID;
return serverID;
}
StorageServerInterface decodeServerListValue(ValueRef const& value) {
StorageServerInterface s;
BinaryReader reader(value, IncludeVersion());
reader >> s;
return s;
}
const Value serverListValueFB(StorageServerInterface const& server) {
return ObjectWriter::toValue(server, IncludeVersion());
}
StorageServerInterface decodeServerListValueFB(ValueRef const& value) {
StorageServerInterface s;
@ -617,6 +607,18 @@ StorageServerInterface decodeServerListValueFB(ValueRef const& value) {
return s;
}
StorageServerInterface decodeServerListValue(ValueRef const& value) {
StorageServerInterface s;
BinaryReader reader(value, IncludeVersion());
if (!reader.protocolVersion().hasStorageInterfaceReadiness()) {
reader >> s;
return s;
}
return decodeServerListValueFB(value);
}
// processClassKeys.contains(k) iff k.startsWith( processClassKeys.begin ) because '/'+1 == '0'
const KeyRangeRef processClassKeys(LiteralStringRef("\xff/processClass/"), LiteralStringRef("\xff/processClass0"));
const KeyRef processClassPrefix = processClassKeys.begin;
@ -1153,30 +1155,33 @@ const KeyRangeRef blobGranuleMappingKeys(LiteralStringRef("\xff\x02/bgm/"), Lite
const KeyRangeRef blobGranuleLockKeys(LiteralStringRef("\xff\x02/bgl/"), LiteralStringRef("\xff\x02/bgl0"));
const KeyRangeRef blobGranuleSplitKeys(LiteralStringRef("\xff\x02/bgs/"), LiteralStringRef("\xff\x02/bgs0"));
const KeyRangeRef blobGranuleHistoryKeys(LiteralStringRef("\xff\x02/bgh/"), LiteralStringRef("\xff\x02/bgh0"));
const KeyRangeRef blobGranulePruneKeys(LiteralStringRef("\xff\x02/bgp/"), LiteralStringRef("\xff\x02/bgp0"));
const KeyRangeRef blobGranuleVersionKeys(LiteralStringRef("\xff\x02/bgv/"), LiteralStringRef("\xff\x02/bgv0"));
const KeyRef blobGranulePruneChangeKey = LiteralStringRef("\xff\x02/bgpChange");
const uint8_t BG_FILE_TYPE_DELTA = 'D';
const uint8_t BG_FILE_TYPE_SNAPSHOT = 'S';
const Key blobGranuleFileKeyFor(UID granuleID, uint8_t fileType, Version fileVersion) {
const Key blobGranuleFileKeyFor(UID granuleID, Version fileVersion, uint8_t fileType) {
ASSERT(fileType == 'D' || fileType == 'S');
BinaryWriter wr(AssumeVersion(ProtocolVersion::withBlobGranule()));
wr.serializeBytes(blobGranuleFileKeys.begin);
wr << granuleID;
wr << fileType;
wr << bigEndian64(fileVersion);
wr << fileType;
return wr.toValue();
}
std::tuple<UID, uint8_t, Version> decodeBlobGranuleFileKey(KeyRef const& key) {
std::tuple<UID, Version, uint8_t> decodeBlobGranuleFileKey(KeyRef const& key) {
UID granuleID;
uint8_t fileType;
Version fileVersion;
uint8_t fileType;
BinaryReader reader(key.removePrefix(blobGranuleFileKeys.begin), AssumeVersion(ProtocolVersion::withBlobGranule()));
reader >> granuleID;
reader >> fileType;
reader >> fileVersion;
reader >> fileType;
ASSERT(fileType == 'D' || fileType == 'S');
return std::tuple(granuleID, fileType, bigEndian64(fileVersion));
return std::tuple(granuleID, bigEndian64(fileVersion), fileType);
}
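// Why the reorder matters (sketch, assuming two non-negative versions v1 < v2
// for the same granule): the big-endian version now precedes the file type, so
// a granule's file keys sort by version under FDB's byte-wise key order and a
// forward range read returns files in version order.
void fileKeysSortByVersion(UID granuleID, Version v1, Version v2) {
	ASSERT(v1 < v2);
	Key a = blobGranuleFileKeyFor(granuleID, v1, BG_FILE_TYPE_SNAPSHOT);
	Key b = blobGranuleFileKeyFor(granuleID, v2, BG_FILE_TYPE_DELTA);
	ASSERT(a < b); // bigEndian64 preserves numeric order byte-wise
}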
const KeyRange blobGranuleFileKeyRangeFor(UID granuleID) {
@ -1187,23 +1192,45 @@ const KeyRange blobGranuleFileKeyRangeFor(UID granuleID) {
return KeyRangeRef(startKey, strinc(startKey));
}
const Value blobGranuleFileValueFor(StringRef const& filename, int64_t offset, int64_t length) {
const Value blobGranuleFileValueFor(StringRef const& filename, int64_t offset, int64_t length, int64_t fullFileLength) {
BinaryWriter wr(IncludeVersion(ProtocolVersion::withBlobGranule()));
wr << filename;
wr << offset;
wr << length;
wr << fullFileLength;
return wr.toValue();
}
std::tuple<Standalone<StringRef>, int64_t, int64_t> decodeBlobGranuleFileValue(ValueRef const& value) {
std::tuple<Standalone<StringRef>, int64_t, int64_t, int64_t> decodeBlobGranuleFileValue(ValueRef const& value) {
StringRef filename;
int64_t offset;
int64_t length;
int64_t fullFileLength;
BinaryReader reader(value, IncludeVersion());
reader >> filename;
reader >> offset;
reader >> length;
return std::tuple(filename, offset, length);
reader >> fullFileLength;
return std::tuple(filename, offset, length, fullFileLength);
}
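// Round-trip sketch for the widened value (hypothetical): the new fourth field
// records the full file length, letting a reader distinguish the chunk
// described by offset/length from the file as a whole without extra lookups.
void fileValueRoundTrip() {
	Value v = blobGranuleFileValueFor("somefile"_sr, 0, 1024, 4096);
	auto [filename, offset, length, fullFileLength] = decodeBlobGranuleFileValue(v);
	ASSERT(fullFileLength == 4096); // whole-file size survives the round trip
}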
const Value blobGranulePruneValueFor(Version version, KeyRange range, bool force) {
BinaryWriter wr(IncludeVersion(ProtocolVersion::withBlobGranule()));
wr << version;
wr << range;
wr << force;
return wr.toValue();
}
std::tuple<Version, KeyRange, bool> decodeBlobGranulePruneValue(ValueRef const& value) {
Version version;
KeyRange range;
bool force;
BinaryReader reader(value, IncludeVersion());
reader >> version;
reader >> range;
reader >> force;
return std::tuple(version, range, force);
}
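// Assumed usage sketch (the writer side is not shown in this commit): a prune
// request is recorded under blobGranulePruneKeys, and blobGranulePruneChangeKey
// is touched so a watch fires; force = true would also prune granule history.
void requestPrune(Transaction& tr, KeyRange range, Version pruneVersion) {
	tr.set(range.begin.withPrefix(blobGranulePruneKeys.begin),
	       blobGranulePruneValueFor(pruneVersion, range, false /*force*/));
	tr.set(blobGranulePruneChangeKey, deterministicRandom()->randomUniqueID().toString());
}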
const Value blobGranuleMappingValueFor(UID const& workerID) {
@ -1284,7 +1311,8 @@ std::pair<BlobGranuleSplitState, Version> decodeBlobGranuleSplitValue(const Valu
BinaryReader reader(value, IncludeVersion());
reader >> st;
reader >> v;
return std::pair(st, v);
return std::pair(st, bigEndian64(v));
}
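// What the added bigEndian64 restores (sketch): the split version is assumed
// to be written big-endian by the matching encoder, so decode must swap it
// back to host order; the swap is its own inverse.
void splitVersionByteOrder() {
	Version v = 42;
	Version stored = bigEndian64(v); // as persisted (assumed)
	ASSERT(Version(bigEndian64(stored)) == v); // decoding undoes the swap
}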
const Key blobGranuleHistoryKeyFor(KeyRangeRef const& range, Version version) {
@ -1367,29 +1395,31 @@ const KeyRef tenantLastIdKey = "\xff/tenantLastId/"_sr;
const KeyRef tenantDataPrefixKey = "\xff/tenantDataPrefix"_sr;
// for tests
void testSSISerdes(StorageServerInterface const& ssi, bool useFB) {
printf("ssi=\nid=%s\nlocality=%s\nisTss=%s\ntssId=%s\naddress=%s\ngetValue=%s\n\n\n",
void testSSISerdes(StorageServerInterface const& ssi) {
printf("ssi=\nid=%s\nlocality=%s\nisTss=%s\ntssId=%s\nacceptingRequests=%s\naddress=%s\ngetValue=%s\n\n\n",
ssi.id().toString().c_str(),
ssi.locality.toString().c_str(),
ssi.isTss() ? "true" : "false",
ssi.isTss() ? ssi.tssPairID.get().toString().c_str() : "",
ssi.isAcceptingRequests() ? "true" : "false",
ssi.address().toString().c_str(),
ssi.getValue.getEndpoint().token.toString().c_str());
StorageServerInterface ssi2 =
(useFB) ? decodeServerListValueFB(serverListValueFB(ssi)) : decodeServerListValue(serverListValue(ssi));
StorageServerInterface ssi2 = decodeServerListValue(serverListValue(ssi));
printf("ssi2=\nid=%s\nlocality=%s\nisTss=%s\ntssId=%s\naddress=%s\ngetValue=%s\n\n\n",
printf("ssi2=\nid=%s\nlocality=%s\nisTss=%s\ntssId=%s\nacceptingRequests=%s\naddress=%s\ngetValue=%s\n\n\n",
ssi2.id().toString().c_str(),
ssi2.locality.toString().c_str(),
ssi2.isTss() ? "true" : "false",
ssi2.isTss() ? ssi2.tssPairID.get().toString().c_str() : "",
ssi2.isAcceptingRequests() ? "true" : "false",
ssi2.address().toString().c_str(),
ssi2.getValue.getEndpoint().token.toString().c_str());
ASSERT(ssi.id() == ssi2.id());
ASSERT(ssi.locality == ssi2.locality);
ASSERT(ssi.isTss() == ssi2.isTss());
ASSERT(ssi.isAcceptingRequests() == ssi2.isAcceptingRequests());
if (ssi.isTss()) {
ASSERT(ssi.tssPairID.get() == ssi2.tssPairID.get());
}
@ -1411,13 +1441,11 @@ TEST_CASE("/SystemData/SerDes/SSI") {
ssi.locality = localityData;
ssi.initEndpoints();
testSSISerdes(ssi, false);
testSSISerdes(ssi, true);
testSSISerdes(ssi);
ssi.tssPairID = UID(0x2345234523452345, 0x1238123812381238);
testSSISerdes(ssi, false);
testSSISerdes(ssi, true);
testSSISerdes(ssi);
printf("ssi serdes test complete\n");
return Void();

View File

@ -563,12 +563,20 @@ extern const KeyRangeRef blobGranuleSplitKeys;
// \xff\x02/bgh/(beginKey,endKey,startVersion) = { granuleUID, [parentGranuleHistoryKeys] }
extern const KeyRangeRef blobGranuleHistoryKeys;
const Key blobGranuleFileKeyFor(UID granuleID, uint8_t fileType, Version fileVersion);
std::tuple<UID, uint8_t, Version> decodeBlobGranuleFileKey(ValueRef const& value);
// \xff\x02/bgp/(start,end) = (version, force)
extern const KeyRangeRef blobGranulePruneKeys;
extern const KeyRangeRef blobGranuleVersionKeys;
extern const KeyRef blobGranulePruneChangeKey;
const Key blobGranuleFileKeyFor(UID granuleID, Version fileVersion, uint8_t fileType);
std::tuple<UID, Version, uint8_t> decodeBlobGranuleFileKey(KeyRef const& key);
const KeyRange blobGranuleFileKeyRangeFor(UID granuleID);
const Value blobGranuleFileValueFor(StringRef const& filename, int64_t offset, int64_t length);
std::tuple<Standalone<StringRef>, int64_t, int64_t> decodeBlobGranuleFileValue(ValueRef const& value);
const Value blobGranuleFileValueFor(StringRef const& filename, int64_t offset, int64_t length, int64_t fullFileLength);
std::tuple<Standalone<StringRef>, int64_t, int64_t, int64_t> decodeBlobGranuleFileValue(ValueRef const& value);
const Value blobGranulePruneValueFor(Version version, KeyRange range, bool force);
std::tuple<Version, KeyRange, bool> decodeBlobGranulePruneValue(ValueRef const& value);
const Value blobGranuleMappingValueFor(UID const& workerID);
UID decodeBlobGranuleMappingValue(ValueRef const& value);
@ -587,7 +595,7 @@ const Value blobGranuleSplitValueFor(BlobGranuleSplitState st);
std::pair<BlobGranuleSplitState, Version> decodeBlobGranuleSplitValue(ValueRef const& value);
const Key blobGranuleHistoryKeyFor(KeyRangeRef const& range, Version version);
std::pair<KeyRange, Version> decodeBlobGranuleHistoryKey(KeyRef const& value);
std::pair<KeyRange, Version> decodeBlobGranuleHistoryKey(KeyRef const& key);
const KeyRange blobGranuleHistoryKeyRangeFor(KeyRangeRef const& range);
const Value blobGranuleHistoryValueFor(Standalone<BlobGranuleHistoryValue> const& historyValue);

View File

@ -319,9 +319,6 @@ ThreadResult<RangeResult> ThreadSafeTransaction::readBlobGranules(const KeyRange
Version beginVersion,
Optional<Version> readVersion,
ReadBlobGranuleContext granule_context) {
// In V1 of api this is required, field is just for forward compatibility
ASSERT(beginVersion == 0);
// FIXME: prevent from calling this from another main thread!
ISingleThreadTransaction* tr = this->tr;
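// With the assertion removed (usage sketch, assumed): beginVersion is no
// longer pinned to zero, so a caller could request only the deltas since an
// earlier read, e.g.
//
//   ThreadResult<RangeResult> r = tx->readBlobGranules(
//       keyRange, /*beginVersion*/ lastReadVersion, readVersion, granuleContext);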

View File

@ -52,4 +52,7 @@ enum WellKnownEndpoints {
WLTOKEN_RESERVED_COUNT // 23
};
static_assert(WLTOKEN_PROTOCOL_INFO ==
10); // Enforce that the value of this endpoint does not change per comment above.
#endif

Some files were not shown because too many files have changed in this diff.