Merge commit '478ff1eb76bc88201b6803b8b8fb5ad9d0bcc040' into aggressive-storage-migration
commit 8cf40f86e6
@ -58,8 +58,8 @@ _java_cmd = 'java -ea -cp %s:%s com.apple.foundationdb.test.' % (
# We could set min_api_version lower on some of these if the testers were updated to support them
testers = {
    'python': Tester('python', 'python ' + _absolute_path('python/tests/tester.py'), 2040, 23, MAX_API_VERSION, types=ALL_TYPES),
    'python3': Tester('python3', 'python3 ' + _absolute_path('python/tests/tester.py'), 2040, 23, MAX_API_VERSION, types=ALL_TYPES),
    'python': Tester('python', 'python ' + _absolute_path('python/tests/tester.py'), 2040, 23, MAX_API_VERSION, types=ALL_TYPES, tenants_enabled=True),
    'python3': Tester('python3', 'python3 ' + _absolute_path('python/tests/tester.py'), 2040, 23, MAX_API_VERSION, types=ALL_TYPES, tenants_enabled=True),
    'ruby': Tester('ruby', _absolute_path('ruby/tests/tester.rb'), 2040, 23, MAX_API_VERSION),
    'java': Tester('java', _java_cmd + 'StackTester', 2040, 510, MAX_API_VERSION, types=ALL_TYPES),
    'java_async': Tester('java', _java_cmd + 'AsyncStackTester', 2040, 510, MAX_API_VERSION, types=ALL_TYPES),
@ -135,6 +135,7 @@ if(NOT WIN32)
  add_executable(fdb_c_performance_test test/performance_test.c test/test.h)
  add_executable(fdb_c_ryw_benchmark test/ryw_benchmark.c test/test.h)
  add_executable(fdb_c_txn_size_test test/txn_size_test.c test/test.h)
  add_executable(fdb_c_client_memory_test test/client_memory_test.cpp test/unit/fdb_api.cpp test/unit/fdb_api.hpp)
  add_executable(mako ${MAKO_SRCS})
  add_executable(fdb_c_setup_tests test/unit/setup_tests.cpp)
  add_executable(fdb_c_unit_tests ${UNIT_TEST_SRCS})

@ -145,10 +146,12 @@ if(NOT WIN32)
    strip_debug_symbols(fdb_c_performance_test)
    strip_debug_symbols(fdb_c_ryw_benchmark)
    strip_debug_symbols(fdb_c_txn_size_test)
    strip_debug_symbols(fdb_c_client_memory_test)
  endif()
  target_link_libraries(fdb_c_performance_test PRIVATE fdb_c Threads::Threads)
  target_link_libraries(fdb_c_ryw_benchmark PRIVATE fdb_c Threads::Threads)
  target_link_libraries(fdb_c_txn_size_test PRIVATE fdb_c Threads::Threads)
  target_link_libraries(fdb_c_client_memory_test PRIVATE fdb_c Threads::Threads)

  add_dependencies(fdb_c_setup_tests doctest)
  add_dependencies(fdb_c_unit_tests doctest)
@ -835,9 +835,10 @@ extern "C" DLLEXPORT FDBResult* fdb_transaction_read_blob_granules(FDBTransactio
    context.get_load_f = granule_context.get_load_f;
    context.free_load_f = granule_context.free_load_f;
    context.debugNoMaterialize = granule_context.debugNoMaterialize;
    context.granuleParallelism = granule_context.granuleParallelism;

    Optional<Version> rv;
    if (readVersion != invalidVersion) { rv = readVersion; }
    if (readVersion != latestVersion) { rv = readVersion; }

    return (FDBResult*)(TXN(tr)->readBlobGranules(range, beginVersion, rv, context).extractPtr());
}
@ -185,7 +185,12 @@ typedef struct readgranulecontext {
    void* userContext;

    /* Returns a unique id for the load. Asynchronous to support queueing multiple in parallel. */
    int64_t (*start_load_f)(const char* filename, int filenameLength, int64_t offset, int64_t length, void* context);
    int64_t (*start_load_f)(const char* filename,
                            int filenameLength,
                            int64_t offset,
                            int64_t length,
                            int64_t fullFileLength,
                            void* context);

    /* Returns data for the load. Pass the loadId returned by start_load_f */
    uint8_t* (*get_load_f)(int64_t loadId, void* context);
@ -196,6 +201,9 @@ typedef struct readgranulecontext {
    /* Set this to true for testing if you don't want to read the granule files,
       just do the request to the blob workers */
    fdb_bool_t debugNoMaterialize;

    /* Number of granules to load in parallel */
    int granuleParallelism;
} FDBReadBlobGranuleContext;

DLLEXPORT void fdb_future_cancel(FDBFuture* f);
@ -447,7 +455,7 @@ DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_blob_granule_ranges(
                                                                                 uint8_t const* end_key_name,
                                                                                 int end_key_name_length);

/* InvalidVersion (-1) for readVersion means get read version from transaction
/* LatestVersion (-2) for readVersion means get read version from transaction
   Separated out as optional because BG reads can support longer-lived reads than normal FDB transactions */
DLLEXPORT WARN_UNUSED_RESULT FDBResult* fdb_transaction_read_blob_granules(FDBTransaction* db,
                                                                            uint8_t const* begin_key_name,
@ -0,0 +1,83 @@
/*
 * client_memory_test.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define FDB_API_VERSION 710
#include <foundationdb/fdb_c.h>

#include "unit/fdb_api.hpp"

#include <thread>
#include <iostream>
#include <vector>

void fdb_check(fdb_error_t e) {
    if (e) {
        std::cerr << fdb_get_error(e) << std::endl;
        std::abort();
    }
}

FDBDatabase* fdb_open_database(const char* clusterFile) {
    FDBDatabase* db;
    fdb_check(fdb_create_database(clusterFile, &db));
    return db;
}

int main(int argc, char** argv) {
    if (argc != 2) {
        printf("Usage: %s <cluster_file>", argv[0]);
    }
    fdb_check(fdb_select_api_version(710));
    fdb_check(fdb_setup_network());
    std::thread network_thread{ &fdb_run_network };

    fdb_check(
        fdb_network_set_option(FDBNetworkOption::FDB_NET_OPTION_TRACE_ENABLE, reinterpret_cast<const uint8_t*>(""), 0));
    fdb_check(fdb_network_set_option(
        FDBNetworkOption::FDB_NET_OPTION_TRACE_FORMAT, reinterpret_cast<const uint8_t*>("json"), 4));

    // Use a bunch of memory from different client threads
    FDBDatabase* db = fdb_open_database(argv[1]);
    auto thread_func = [&]() {
        fdb::Transaction tr(db);
        for (int i = 0; i < 10000; ++i) {
            tr.set(std::to_string(i), std::string(i, '\x00'));
        }
        tr.cancel();
    };
    std::vector<std::thread> threads;
    constexpr auto kThreadCount = 64;
    for (int i = 0; i < kThreadCount; ++i) {
        threads.emplace_back(thread_func);
    }
    for (auto& thread : threads) {
        thread.join();
    }
    fdb_database_destroy(db);
    db = nullptr;

    // Memory usage should go down now if the allocator is returning memory to the OS. It's expected that something is
    // externally monitoring the memory usage of this process during this sleep.
    using namespace std::chrono_literals;
    std::this_thread::sleep_for(10s);

    fdb_check(fdb_stop_network());
    network_thread.join();
}
@ -585,6 +585,7 @@ int64_t granule_start_load(const char* filename,
                           int filenameLength,
                           int64_t offset,
                           int64_t length,
                           int64_t fullFileLength,
                           void* userContext) {
    FILE* fp;
    char full_fname[PATH_MAX];

@ -682,6 +683,7 @@ int run_op_read_blob_granules(FDBTransaction* transaction,
    granuleContext.get_load_f = &granule_get_load;
    granuleContext.free_load_f = &granule_free_load;
    granuleContext.debugNoMaterialize = !doMaterialize;
    granuleContext.granuleParallelism = 2; // TODO make knob or setting for changing this?

    r = fdb_transaction_read_blob_granules(transaction,
                                           (uint8_t*)keystr,

@ -689,7 +691,7 @@ int run_op_read_blob_granules(FDBTransaction* transaction,
                                           (uint8_t*)keystr2,
                                           strlen(keystr2),
                                           0 /* beginVersion*/,
                                           -1, /* endVersion. -1 is use txn read version */
                                           -2, /* endVersion. -2 (latestVersion) is use txn read version */
                                           granuleContext);

    free(fileContext.data_by_id);
@ -88,6 +88,7 @@ def api_version(ver):
        'predicates',
        'Future',
        'Database',
        'Tenant',
        'Transaction',
        'KeyValue',
        'KeySelector',
@ -34,6 +34,7 @@ import traceback

import fdb
from fdb import six
from fdb.tuple import pack, unpack

_network_thread = None
_network_thread_reentrant_lock = threading.RLock()
@ -198,9 +199,10 @@ def transactional(*tr_args, **tr_kwargs):
    one of two actions, depending on the type of the parameter passed
    to the function at call time.

    If given a Database, a Transaction will be created and passed into
    the wrapped code in place of the Database. After the function is
    complete, the newly created transaction will be committed.
    If given a Database or Tenant, a Transaction will be created and
    passed into the wrapped code in place of the Database or Tenant.
    After the function is complete, the newly created transaction
    will be committed.

    It is important to note that the wrapped method may be called
    multiple times in the event of a commit failure, until the commit
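With this change the decorator accepts any transaction creator. A minimal usage sketch (the tenant name and keys below are illustrative, not part of the commit):

    import fdb
    fdb.api_version(710)

    @fdb.transactional
    def set_and_get(tr, key, value):
        # tr is always a Transaction here, regardless of what the caller passed in
        tr[key] = value
        return tr[key]

    db = fdb.open()
    set_and_get(db, b'k', b'v')                 # transaction created from the Database

    tenant = db.open_tenant(b'example-tenant')  # assumes this tenant already exists
    set_and_get(tenant, b'k', b'v')             # transaction created from the Tenant instead

In both calls the decorator creates the transaction, retries on retryable errors, and commits before returning.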
@ -943,128 +945,114 @@ class FormerFuture(_FDBBase):
        except:
            pass


class Database(_FDBBase):
    def __init__(self, dpointer):
        self.dpointer = dpointer
        self.options = _DatabaseOptions(self)

    def __del__(self):
        # print('Destroying database 0x%x' % self.dpointer)
        self.capi.fdb_database_destroy(self.dpointer)

class _TransactionCreator(_FDBBase):
    def get(self, key):
        return Database.__database_getitem(self, key)
        return _TransactionCreator.__creator_getitem(self, key)

    def __getitem__(self, key):
        if isinstance(key, slice):
            return self.get_range(key.start, key.stop, reverse=(key.step == -1))
        return Database.__database_getitem(self, key)
        return _TransactionCreator.__creator_getitem(self, key)

    def get_key(self, key_selector):
        return Database.__database_get_key(self, key_selector)
        return _TransactionCreator.__creator_get_key(self, key_selector)

    def get_range(self, begin, end, limit=0, reverse=False, streaming_mode=StreamingMode.want_all):
        return Database.__database_get_range(self, begin, end, limit, reverse, streaming_mode)
        return _TransactionCreator.__creator_get_range(self, begin, end, limit, reverse, streaming_mode)

    def get_range_startswith(self, prefix, *args, **kwargs):
        return Database.__database_get_range_startswith(self, prefix, *args, **kwargs)
        return _TransactionCreator.__creator_get_range_startswith(self, prefix, *args, **kwargs)

    def set(self, key, value):
        Database.__database_setitem(self, key, value)
        _TransactionCreator.__creator_setitem(self, key, value)

    def __setitem__(self, key, value):
        Database.__database_setitem(self, key, value)
        _TransactionCreator.__creator_setitem(self, key, value)

    def clear(self, key):
        Database.__database_delitem(self, key)
        _TransactionCreator.__creator_delitem(self, key)

    def clear_range(self, begin, end):
        Database.__database_delitem(self, slice(begin, end))
        _TransactionCreator.__creator_delitem(self, slice(begin, end))

    def __delitem__(self, key_or_slice):
        Database.__database_delitem(self, key_or_slice)
        _TransactionCreator.__creator_delitem(self, key_or_slice)

    def clear_range_startswith(self, prefix):
        Database.__database_clear_range_startswith(self, prefix)
        _TransactionCreator.__creator_clear_range_startswith(self, prefix)

    def get_and_watch(self, key):
        return Database.__database_get_and_watch(self, key)
        return _TransactionCreator.__creator_get_and_watch(self, key)

    def set_and_watch(self, key, value):
        return Database.__database_set_and_watch(self, key, value)
        return _TransactionCreator.__creator_set_and_watch(self, key, value)

    def clear_and_watch(self, key):
        return Database.__database_clear_and_watch(self, key)
        return _TransactionCreator.__creator_clear_and_watch(self, key)

    def create_transaction(self):
        pointer = ctypes.c_void_p()
        self.capi.fdb_database_create_transaction(self.dpointer, ctypes.byref(pointer))
        return Transaction(pointer.value, self)

    def _set_option(self, option, param, length):
        self.capi.fdb_database_set_option(self.dpointer, option, param, length)
        pass

    def _atomic_operation(self, opcode, key, param):
        Database.__database_atomic_operation(self, opcode, key, param)
        _TransactionCreator.__creator_atomic_operation(self, opcode, key, param)

    #### Transaction implementations ####
    @staticmethod
    @transactional
    def __database_getitem(tr, key):
    def __creator_getitem(tr, key):
        return tr[key].value

    @staticmethod
    @transactional
    def __database_get_key(tr, key_selector):
    def __creator_get_key(tr, key_selector):
        return tr.get_key(key_selector).value

    @staticmethod
    @transactional
    def __database_get_range(tr, begin, end, limit, reverse, streaming_mode):
    def __creator_get_range(tr, begin, end, limit, reverse, streaming_mode):
        return tr.get_range(begin, end, limit, reverse, streaming_mode).to_list()

    @staticmethod
    @transactional
    def __database_get_range_startswith(tr, prefix, *args, **kwargs):
    def __creator_get_range_startswith(tr, prefix, *args, **kwargs):
        return tr.get_range_startswith(prefix, *args, **kwargs).to_list()

    @staticmethod
    @transactional
    def __database_setitem(tr, key, value):
    def __creator_setitem(tr, key, value):
        tr[key] = value

    @staticmethod
    @transactional
    def __database_clear_range_startswith(tr, prefix):
    def __creator_clear_range_startswith(tr, prefix):
        tr.clear_range_startswith(prefix)

    @staticmethod
    @transactional
    def __database_get_and_watch(tr, key):
    def __creator_get_and_watch(tr, key):
        v = tr.get(key)
        return v, tr.watch(key)

    @staticmethod
    @transactional
    def __database_set_and_watch(tr, key, value):
    def __creator_set_and_watch(tr, key, value):
        tr.set(key, value)
        return tr.watch(key)

    @staticmethod
    @transactional
    def __database_clear_and_watch(tr, key):
    def __creator_clear_and_watch(tr, key):
        del tr[key]
        return tr.watch(key)

    @staticmethod
    @transactional
    def __database_delitem(tr, key_or_slice):
    def __creator_delitem(tr, key_or_slice):
        del tr[key_or_slice]

    @staticmethod
    @transactional
    def __database_atomic_operation(tr, opcode, key, param):
    def __creator_atomic_operation(tr, opcode, key, param):
        tr._atomic_operation(opcode, key, param)

    # Asynchronous transactions
@ -1074,11 +1062,11 @@ class Database(_FDBBase):
    From = asyncio.From
    coroutine = asyncio.coroutine

    class Database:
    class TransactionCreator:
        @staticmethod
        @transactional
        @coroutine
        def __database_getitem(tr, key):
        def __creator_getitem(tr, key):
            # raise Return(( yield From( tr[key] ) ))
            raise Return(tr[key])
            yield None

@ -1086,26 +1074,26 @@ class Database(_FDBBase):
        @staticmethod
        @transactional
        @coroutine
        def __database_get_key(tr, key_selector):
        def __creator_get_key(tr, key_selector):
            raise Return(tr.get_key(key_selector))
            yield None

        @staticmethod
        @transactional
        @coroutine
        def __database_get_range(tr, begin, end, limit, reverse, streaming_mode):
        def __creator_get_range(tr, begin, end, limit, reverse, streaming_mode):
            raise Return((yield From(tr.get_range(begin, end, limit, reverse, streaming_mode).to_list())))

        @staticmethod
        @transactional
        @coroutine
        def __database_get_range_startswith(tr, prefix, *args, **kwargs):
        def __creator_get_range_startswith(tr, prefix, *args, **kwargs):
            raise Return((yield From(tr.get_range_startswith(prefix, *args, **kwargs).to_list())))

        @staticmethod
        @transactional
        @coroutine
        def __database_setitem(tr, key, value):
        def __creator_setitem(tr, key, value):
            tr[key] = value
            raise Return()
            yield None

@ -1113,7 +1101,7 @@ class Database(_FDBBase):
        @staticmethod
        @transactional
        @coroutine
        def __database_clear_range_startswith(tr, prefix):
        def __creator_clear_range_startswith(tr, prefix):
            tr.clear_range_startswith(prefix)
            raise Return()
            yield None

@ -1121,7 +1109,7 @@ class Database(_FDBBase):
        @staticmethod
        @transactional
        @coroutine
        def __database_get_and_watch(tr, key):
        def __creator_get_and_watch(tr, key):
            v = tr.get(key)
            raise Return(v, tr.watch(key))
            yield None

@ -1129,7 +1117,7 @@ class Database(_FDBBase):
        @staticmethod
        @transactional
        @coroutine
        def __database_set_and_watch(tr, key, value):
        def __creator_set_and_watch(tr, key, value):
            tr.set(key, value)
            raise Return(tr.watch(key))
            yield None

@ -1137,7 +1125,7 @@ class Database(_FDBBase):
        @staticmethod
        @transactional
        @coroutine
        def __database_clear_and_watch(tr, key):
        def __creator_clear_and_watch(tr, key):
            del tr[key]
            raise Return(tr.watch(key))
            yield None

@ -1145,7 +1133,7 @@ class Database(_FDBBase):
        @staticmethod
        @transactional
        @coroutine
        def __database_delitem(tr, key_or_slice):
        def __creator_delitem(tr, key_or_slice):
            del tr[key_or_slice]
            raise Return()
            yield None

@ -1153,11 +1141,101 @@ class Database(_FDBBase):
        @staticmethod
        @transactional
        @coroutine
        def __database_atomic_operation(tr, opcode, key, param):
        def __creator_atomic_operation(tr, opcode, key, param):
            tr._atomic_operation(opcode, key, param)
            raise Return()
            yield None
    return Database
    return TransactionCreator


def process_tenant_name(name):
    if isinstance(name, tuple):
        return pack(name)
    elif isinstance(name, bytes):
        return name
    else:
        raise TypeError('Tenant name must be of type ' + bytes.__name__ + ' or of type ' + tuple.__name__)


class Database(_TransactionCreator):
    def __init__(self, dpointer):
        self.dpointer = dpointer
        self.options = _DatabaseOptions(self)

    def __del__(self):
        # print('Destroying database 0x%x' % self.dpointer)
        self.capi.fdb_database_destroy(self.dpointer)

    def _set_option(self, option, param, length):
        self.capi.fdb_database_set_option(self.dpointer, option, param, length)

    def open_tenant(self, name):
        tname = process_tenant_name(name)
        pointer = ctypes.c_void_p()
        self.capi.fdb_database_open_tenant(self.dpointer, tname, len(tname), ctypes.byref(pointer))
        return Tenant(pointer.value)

    def create_transaction(self):
        pointer = ctypes.c_void_p()
        self.capi.fdb_database_create_transaction(self.dpointer, ctypes.byref(pointer))
        return Transaction(pointer.value, self)

    def allocate_tenant(self, name):
        Database.__database_allocate_tenant(self, process_tenant_name(name), [])

    def delete_tenant(self, name):
        Database.__database_delete_tenant(self, process_tenant_name(name), [])

    # Attempt to allocate a tenant in the cluster. If the tenant already exists,
    # this function will return a tenant_already_exists error. If the tenant is created
    # concurrently, then this function may return success even if another caller creates
    # it.
    #
    # The existence_check_marker is expected to be an empty list. This function will
    # modify the list after completing the existence check to avoid checking for existence
    # on retries. This allows the operation to be idempotent.
    @staticmethod
    @transactional
    def __database_allocate_tenant(tr, name, existence_check_marker):
        tr.options.set_special_key_space_enable_writes()
        key = b'\xff\xff/management/tenant_map/%s' % name
        if not existence_check_marker:
            existing_tenant = tr[key].wait()
            existence_check_marker.append(None)
            if existing_tenant != None:
                raise fdb.FDBError(2132)  # tenant_already_exists
        tr[key] = b''

    # Attempt to remove a tenant in the cluster. If the tenant doesn't exist, this
    # function will return a tenant_not_found error. If the tenant is deleted
    # concurrently, then this function may return success even if another caller deletes
    # it.
    #
    # The existence_check_marker is expected to be an empty list. This function will
    # modify the list after completing the existence check to avoid checking for existence
    # on retries. This allows the operation to be idempotent.
    @staticmethod
    @transactional
    def __database_delete_tenant(tr, name, existence_check_marker):
        tr.options.set_special_key_space_enable_writes()
        key = b'\xff\xff/management/tenant_map/%s' % name
        if not existence_check_marker:
            existing_tenant = tr[key].wait()
            existence_check_marker.append(None)
            if existing_tenant == None:
                raise fdb.FDBError(2131)  # tenant_not_found
        del tr[key]


class Tenant(_TransactionCreator):
    def __init__(self, tpointer):
        self.tpointer = tpointer

    def __del__(self):
        self.capi.fdb_tenant_destroy(self.tpointer)

    def create_transaction(self):
        pointer = ctypes.c_void_p()
        self.capi.fdb_tenant_create_transaction(self.tpointer, ctypes.byref(pointer))
        return Transaction(pointer.value, self)


fill_operations()
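The refactoring above means Database and Tenant share one convenience surface (_TransactionCreator) while tenant creation and deletion go through the management special-key space. A short usage sketch of the new binding API, grounded in the tests in this commit (the tenant name and keys are illustrative):

    import fdb
    fdb.api_version(710)
    db = fdb.open()

    db.allocate_tenant(b'example-tenant')      # raises FDBError 2132 if it already exists
    tenant = db.open_tenant(b'example-tenant')

    tenant[b'key'] = b'value'                  # runs in a transaction scoped to the tenant
    assert tenant[b'key'] == b'value'

    del tenant[:]                              # a tenant must be empty before deletion
    db.delete_tenant(b'example-tenant')        # raises FDBError 2131 if it does not exist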
@ -1458,6 +1536,10 @@ def init_c_api():
    _capi.fdb_database_destroy.argtypes = [ctypes.c_void_p]
    _capi.fdb_database_destroy.restype = None

    _capi.fdb_database_open_tenant.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.POINTER(ctypes.c_void_p)]
    _capi.fdb_database_open_tenant.restype = ctypes.c_int
    _capi.fdb_database_open_tenant.errcheck = check_error_code

    _capi.fdb_database_create_transaction.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p)]
    _capi.fdb_database_create_transaction.restype = ctypes.c_int
    _capi.fdb_database_create_transaction.errcheck = check_error_code

@ -1466,6 +1548,13 @@ def init_c_api():
    _capi.fdb_database_set_option.restype = ctypes.c_int
    _capi.fdb_database_set_option.errcheck = check_error_code

    _capi.fdb_tenant_destroy.argtypes = [ctypes.c_void_p]
    _capi.fdb_tenant_destroy.restype = None

    _capi.fdb_tenant_create_transaction.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p)]
    _capi.fdb_tenant_create_transaction.restype = ctypes.c_int
    _capi.fdb_tenant_create_transaction.errcheck = check_error_code

    _capi.fdb_transaction_destroy.argtypes = [ctypes.c_void_p]
    _capi.fdb_transaction_destroy.restype = None
@ -1686,10 +1775,10 @@ def init(event_model=None):
                    raise asyncio.Return(self)
                return it()
            FDBRange.iterate = iterate
            AT = Database.declare_asynchronous_transactions()
            AT = _TransactionCreator.declare_asynchronous_transactions()
            for name in dir(AT):
                if name.startswith("_Database__database_"):
                    setattr(Database, name, getattr(AT, name))
                if name.startswith("__TransactionCreator__creator_"):
                    setattr(_TransactionCreator, name, getattr(AT, name))

            def to_list(self):
                if self._mode == StreamingMode.iterator:
@ -0,0 +1,123 @@
#!/usr/bin/python
#
# tenant_tests.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import fdb
import sys
import json
from fdb.tuple import pack

if __name__ == '__main__':
    fdb.api_version(710)

def test_tenant_tuple_name(db):
    tuplename = (b'test', b'level', b'hierarchy', 3, 1.24, 'str')
    db.allocate_tenant(tuplename)

    tenant = db.open_tenant(tuplename)
    tenant[b'foo'] = b'bar'

    assert tenant[b'foo'] == b'bar'

    del tenant[b'foo']
    db.delete_tenant(tuplename)

def cleanup_tenant(db, tenant_name):
    try:
        tenant = db.open_tenant(tenant_name)
        del tenant[:]
        db.delete_tenant(tenant_name)
    except fdb.FDBError as e:
        if e.code == 2131:  # tenant not found
            pass
        else:
            raise

def test_tenant_operations(db):
    cleanup_tenant(db, b'tenant1')
    cleanup_tenant(db, b'tenant2')

    db.allocate_tenant(b'tenant1')
    db.allocate_tenant(b'tenant2')

    tenant1 = db.open_tenant(b'tenant1')
    tenant2 = db.open_tenant(b'tenant2')

    db[b'tenant_test_key'] = b'no_tenant'
    tenant1[b'tenant_test_key'] = b'tenant1'
    tenant2[b'tenant_test_key'] = b'tenant2'

    tenant1_entry = db[b'\xff\xff/management/tenant_map/tenant1']
    tenant1_json = json.loads(tenant1_entry)
    prefix1 = tenant1_json['prefix'].encode('utf8')

    tenant2_entry = db[b'\xff\xff/management/tenant_map/tenant2']
    tenant2_json = json.loads(tenant2_entry)
    prefix2 = tenant2_json['prefix'].encode('utf8')

    assert tenant1[b'tenant_test_key'] == b'tenant1'
    assert db[prefix1 + b'tenant_test_key'] == b'tenant1'
    assert tenant2[b'tenant_test_key'] == b'tenant2'
    assert db[prefix2 + b'tenant_test_key'] == b'tenant2'
    assert db[b'tenant_test_key'] == b'no_tenant'

    tr1 = tenant1.create_transaction()
    try:
        del tr1[:]
        tr1.commit().wait()
    except fdb.FDBError as e:
        tr1.on_error(e).wait()

    assert tenant1[b'tenant_test_key'] == None
    assert db[prefix1 + b'tenant_test_key'] == None
    assert tenant2[b'tenant_test_key'] == b'tenant2'
    assert db[prefix2 + b'tenant_test_key'] == b'tenant2'
    assert db[b'tenant_test_key'] == b'no_tenant'

    db.delete_tenant(b'tenant1')
    try:
        tenant1[b'tenant_test_key']
        assert False
    except fdb.FDBError as e:
        assert e.code == 2131  # tenant not found

    del tenant2[:]
    db.delete_tenant(b'tenant2')

    assert db[prefix1 + b'tenant_test_key'] == None
    assert db[prefix2 + b'tenant_test_key'] == None
    assert db[b'tenant_test_key'] == b'no_tenant'

    del db[b'tenant_test_key']

    assert db[b'tenant_test_key'] == None

def test_tenants(db):
    test_tenant_tuple_name(db)
    test_tenant_operations(db)

# Expect a cluster file as input. This test will write to the FDB cluster, so
# be aware of potential side effects.
if __name__ == '__main__':
    clusterFile = sys.argv[1]
    db = fdb.open(clusterFile)
    db.options.set_transaction_timeout(2000)  # 2 seconds
    db.options.set_transaction_retry_limit(3)

    test_tenants(db)
@ -49,6 +49,7 @@ from cancellation_timeout_tests import test_db_retry_limits
from cancellation_timeout_tests import test_combinations

from size_limit_tests import test_size_limit_option, test_get_approximate_size
from tenant_tests import test_tenants

random.seed(0)
@ -112,12 +113,13 @@ class Stack:


class Instruction:
    def __init__(self, tr, stack, op, index, isDatabase=False, isSnapshot=False):
    def __init__(self, tr, stack, op, index, isDatabase=False, isTenant=False, isSnapshot=False):
        self.tr = tr
        self.stack = stack
        self.op = op
        self.index = index
        self.isDatabase = isDatabase
        self.isTenant = isTenant
        self.isSnapshot = isSnapshot

    def pop(self, count=None, with_idx=False):
@ -277,6 +279,7 @@ class Tester:

    def __init__(self, db, prefix):
        self.db = db
        self.tenant = None

        self.instructions = self.db[fdb.tuple.range((prefix,))]
@ -317,7 +320,8 @@ class Tester:

    def new_transaction(self):
        with Tester.tr_map_lock:
            Tester.tr_map[self.tr_name] = self.db.create_transaction()
            tr_source = self.tenant if self.tenant is not None else self.db
            Tester.tr_map[self.tr_name] = tr_source.create_transaction()

    def switch_transaction(self, name):
        self.tr_name = name
@ -335,18 +339,22 @@ class Tester:
        # print("%d. Instruction is %s" % (idx, op))

        isDatabase = op.endswith(six.u('_DATABASE'))
        isTenant = op.endswith(six.u('_TENANT'))
        isSnapshot = op.endswith(six.u('_SNAPSHOT'))

        if isDatabase:
            op = op[:-9]
            obj = self.db
        elif isTenant:
            op = op[:-7]
            obj = self.tenant if self.tenant else self.db
        elif isSnapshot:
            op = op[:-9]
            obj = self.current_transaction().snapshot
        else:
            obj = self.current_transaction()

        inst = Instruction(obj, self.stack, op, idx, isDatabase, isSnapshot)
        inst = Instruction(obj, self.stack, op, idx, isDatabase, isTenant, isSnapshot)

        try:
            if inst.op == six.u("PUSH"):
@ -583,6 +591,19 @@ class Tester:
                prefix = inst.pop()
                Tester.wait_empty(self.db, prefix)
                inst.push(b"WAITED_FOR_EMPTY")
            elif inst.op == six.u("TENANT_CREATE"):
                name = inst.pop()
                self.db.allocate_tenant(name)
                inst.push(b"RESULT_NOT_PRESENT")
            elif inst.op == six.u("TENANT_DELETE"):
                name = inst.pop()
                self.db.delete_tenant(name)
                inst.push(b"RESULT_NOT_PRESENT")
            elif inst.op == six.u("TENANT_SET_ACTIVE"):
                name = inst.pop()
                self.tenant = self.db.open_tenant(name)
            elif inst.op == six.u("TENANT_CLEAR_ACTIVE"):
                self.tenant = None
            elif inst.op == six.u("UNIT_TESTS"):
                try:
                    test_db_options(db)
@ -600,6 +621,8 @@ class Tester:
                    test_size_limit_option(db)
                    test_get_approximate_size(db)

                    test_tenants(db)

                except fdb.FDBError as e:
                    print("Unit tests failed: %s" % e.description)
                    traceback.print_exc()
@ -212,6 +212,17 @@ endif()

set(COROUTINE_IMPL ${DEFAULT_COROUTINE_IMPL} CACHE STRING "Which coroutine implementation to use. Options are boost and libcoro")

################################################################################
# AWS SDK
################################################################################

set(BUILD_AWS_BACKUP OFF CACHE BOOL "Build AWS S3 SDK backup client")
if (BUILD_AWS_BACKUP)
  set(WITH_AWS_BACKUP ON)
else()
  set(WITH_AWS_BACKUP OFF)
endif()

################################################################################

file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/packages)

@ -232,6 +243,7 @@ function(print_components)
  message(STATUS "Build Python sdist (make package): ${WITH_PYTHON_BINDING}")
  message(STATUS "Configure CTest (depends on Python): ${WITH_PYTHON}")
  message(STATUS "Build with RocksDB: ${WITH_ROCKSDB_EXPERIMENTAL}")
  message(STATUS "Build with AWS SDK: ${WITH_AWS_BACKUP}")
  message(STATUS "=========================================")
endfunction()
@ -0,0 +1,98 @@
|
|||
project(awssdk-download NONE)
|
||||
|
||||
# Compile the sdk with clang and libc++, since otherwise we get libc++ vs libstdc++ link errors when compiling fdb with clang
|
||||
set(AWSSDK_COMPILER_FLAGS "")
|
||||
set(AWSSDK_LINK_FLAGS "")
|
||||
if(APPLE OR CLANG OR USE_LIBCXX)
|
||||
set(AWSSDK_COMPILER_FLAGS -stdlib=libc++ -nostdlib++)
|
||||
set(AWSSDK_LINK_FLAGS -stdlib=libc++ -lc++abi)
|
||||
endif()
|
||||
|
||||
include(ExternalProject)
|
||||
ExternalProject_Add(awssdk_project
|
||||
GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git
|
||||
GIT_TAG 2af3ce543c322cb259471b3b090829464f825972 # v1.9.200
|
||||
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src"
|
||||
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build"
|
||||
GIT_CONFIG advice.detachedHead=false
|
||||
CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF # SDK builds shared libs by default, we want static libs
|
||||
-DENABLE_TESTING=OFF
|
||||
-DBUILD_ONLY=core # git repo contains SDK for every AWS product, we only want the core auth libraries
|
||||
-DSIMPLE_INSTALL=ON
|
||||
-DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path
|
||||
-DBYO_CRYPTO=ON # we have our own crypto libraries that conflict if we let aws sdk build and link its own
|
||||
|
||||
|
||||
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
|
||||
-DCMAKE_EXE_LINKER_FLAGS=${AWSSDK_COMPILER_FLAGS}
|
||||
-DCMAKE_CXX_FLAGS=${AWSSDK_LINK_FLAGS}
|
||||
TEST_COMMAND ""
|
||||
BUILD_ALWAYS TRUE
|
||||
# the sdk build produces a ton of artifacts, with their own dependency tree, so there is a very specific dependency order they must be linked in
|
||||
BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a"
|
||||
)
|
||||
|
||||
add_library(awssdk_core STATIC IMPORTED)
|
||||
add_dependencies(awssdk_core awssdk_project)
|
||||
set_target_properties(awssdk_core PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a")
|
||||
|
||||
add_library(awssdk_crt STATIC IMPORTED)
|
||||
add_dependencies(awssdk_crt awssdk_project)
|
||||
set_target_properties(awssdk_crt PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a")
|
||||
|
||||
# TODO: can we remove c_s3? It seems to be a dependency of libaws-crt
|
||||
add_library(awssdk_c_s3 STATIC IMPORTED)
|
||||
add_dependencies(awssdk_c_s3 awssdk_project)
|
||||
set_target_properties(awssdk_c_s3 PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a")
|
||||
|
||||
add_library(awssdk_c_auth STATIC IMPORTED)
|
||||
add_dependencies(awssdk_c_auth awssdk_project)
|
||||
set_target_properties(awssdk_c_auth PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a")
|
||||
|
||||
add_library(awssdk_c_eventstream STATIC IMPORTED)
|
||||
add_dependencies(awssdk_c_eventstream awssdk_project)
|
||||
set_target_properties(awssdk_c_eventstream PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a")
|
||||
|
||||
add_library(awssdk_c_http STATIC IMPORTED)
|
||||
add_dependencies(awssdk_c_http awssdk_project)
|
||||
set_target_properties(awssdk_c_http PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a")
|
||||
|
||||
add_library(awssdk_c_mqtt STATIC IMPORTED)
|
||||
add_dependencies(awssdk_c_mqtt awssdk_project)
|
||||
set_target_properties(awssdk_c_mqtt PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a")
|
||||
|
||||
add_library(awssdk_c_io STATIC IMPORTED)
|
||||
add_dependencies(awssdk_c_io awssdk_project)
|
||||
set_target_properties(awssdk_c_io PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a")
|
||||
|
||||
add_library(awssdk_checksums STATIC IMPORTED)
|
||||
add_dependencies(awssdk_checksums awssdk_project)
|
||||
set_target_properties(awssdk_checksums PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a")
|
||||
|
||||
add_library(awssdk_c_compression STATIC IMPORTED)
|
||||
add_dependencies(awssdk_c_compression awssdk_project)
|
||||
set_target_properties(awssdk_c_compression PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a")
|
||||
|
||||
add_library(awssdk_c_cal STATIC IMPORTED)
|
||||
add_dependencies(awssdk_c_cal awssdk_project)
|
||||
set_target_properties(awssdk_c_cal PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a")
|
||||
|
||||
add_library(awssdk_c_common STATIC IMPORTED)
|
||||
add_dependencies(awssdk_c_common awssdk_project)
|
||||
set_target_properties(awssdk_c_common PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a")
|
||||
|
||||
# link them all together in one interface target
|
||||
add_library(awssdk_target INTERFACE)
|
||||
target_include_directories(awssdk_target SYSTEM INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/include)
|
||||
target_link_libraries(awssdk_target INTERFACE awssdk_core awssdk_crt awssdk_c_s3 awssdk_c_auth awssdk_c_eventstream awssdk_c_http awssdk_c_mqtt awssdk_c_io awssdk_checksums awssdk_c_compression awssdk_c_cal awssdk_c_common curl)
|
|
@ -3,3 +3,4 @@ setuptools>=20.10.0,<=57.4.0
sphinx==1.5.6
sphinx-bootstrap-theme==0.4.8
docutils==0.16
Jinja2==3.0.3
@ -7,7 +7,7 @@
.. |database-type| replace:: ``Database``
.. |database-class| replace:: :class:`Database`
.. |database-auto| replace:: the :func:`@fdb.transactional <transactional>` decorator
.. |tenant-type| replace:: FIXME
.. |tenant-type| replace:: :class:`Tenant`
.. |transaction-class| replace:: :class:`Transaction`
.. |get-key-func| replace:: :func:`Transaction.get_key`
.. |get-range-func| replace:: :func:`Transaction.get_range`
@ -316,9 +316,29 @@ A |database-blurb1| |database-blurb2|

    Returns a new :class:`Transaction` object. Consider using the :func:`@fdb.transactional <transactional>` decorator to create transactions instead, since it will automatically provide you with appropriate retry behavior.

.. method:: Database.open_tenant(tenant_name)

    Opens an existing tenant to be used for running transactions and returns it as a :class:`Tenant` object.

    The tenant name can be either a byte string or a tuple. If a tuple is provided, the tuple will be packed using the tuple layer to generate the byte string tenant name.

.. |sync-read| replace:: This read is fully synchronous.
.. |sync-write| replace:: This change will be committed immediately, and is fully synchronous.

.. method:: Database.allocate_tenant(tenant_name)

    Creates a new tenant in the cluster. |sync-write|

    The tenant name can be either a byte string or a tuple and cannot start with the ``\xff`` byte. If a tuple is provided, the tuple will be packed using the tuple layer to generate the byte string tenant name.

.. method:: Database.delete_tenant(tenant_name)

    Deletes a tenant from the cluster. |sync-write|

    The tenant name can be either a byte string or a tuple. If a tuple is provided, the tuple will be packed using the tuple layer to generate the byte string tenant name.

    It is an error to delete a tenant that still has data. To delete a non-empty tenant, first clear all of the keys in the tenant.

.. method:: Database.get(key)

    Returns the value associated with the specified key in the database (or ``None`` if the key does not exist). |sync-read|
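For the tenant methods described above, a layered application might name tenants with tuples and let the binding pack them through the tuple layer. A brief illustrative sketch (the names are examples only)::

    db.allocate_tenant((u'myapp', u'customer-123'))
    tenant = db.open_tenant((u'myapp', u'customer-123'))
    tenant[b'state'] = b'active'

    del tenant[:]                                   # tenants must be empty before deletion
    db.delete_tenant((u'myapp', u'customer-123'))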
@ -460,6 +480,17 @@ Database options
.. method:: Database.options.set_snapshot_ryw_disable()

    |option-db-snapshot-ryw-disable-blurb|

Tenant objects
==============

.. class:: Tenant

    |tenant-blurb1|

.. method:: Tenant.create_transaction()

    Returns a new :class:`Transaction` object. Consider using the :func:`@fdb.transactional <transactional>` decorator to create transactions instead, since it will automatically provide you with appropriate retry behavior.

.. _api-python-transactional-decorator:
@ -479,9 +510,9 @@ Transactional decoration

    The ``@fdb.transactional`` decorator makes ``simple_function`` a transactional function. All functions using this decorator must have an argument **named** ``tr``. This specially named argument is passed a transaction that the function can use to do reads and writes.

    A caller of a transactionally decorated function can pass a :class:`Database` instead of a transaction for the ``tr`` parameter. Then a transaction will be created automatically, and automatically committed before returning to the caller. The decorator will retry calling the decorated function until the transaction successfully commits.
    A caller of a transactionally decorated function can pass a :class:`Database` or :class:`Tenant` instead of a transaction for the ``tr`` parameter. Then a transaction will be created automatically, and automatically committed before returning to the caller. The decorator will retry calling the decorated function until the transaction successfully commits.

    If ``db`` is a :class:`Database`, a call like ::
    If ``db`` is a :class:`Database` or :class:`Tenant`, a call like ::

        simple_function(db, 'a', 'b')
@ -744,7 +775,7 @@ Committing

.. decorator:: transactional()

    The ``transactional`` decorator makes it easy to write transactional functions which accept either a :class:`Database` or a :class:`Transaction` as a parameter and automatically commit. See :func:`@fdb.transactional <transactional>` for explanation and examples.
    The ``transactional`` decorator makes it easy to write transactional functions which accept a :class:`Database`, :class:`Tenant`, or :class:`Transaction` as a parameter and automatically commit. See :func:`@fdb.transactional <transactional>` for explanation and examples.

.. method :: Transaction.commit()
@ -754,7 +785,7 @@ Committing

    |commit-outstanding-reads-blurb|

    .. note :: Consider using the :func:`@fdb.transactional <transactional>` decorator, which not only calls :meth:`Database.create_transaction` and :meth:`Transaction.commit()` for you but also implements the required error handling and retry logic for transactions.
    .. note :: Consider using the :func:`@fdb.transactional <transactional>` decorator, which not only calls :meth:`Database.create_transaction` or :meth:`Tenant.create_transaction` and :meth:`Transaction.commit()` for you but also implements the required error handling and retry logic for transactions.

    .. warning :: |used-during-commit-blurb|
@ -155,6 +155,12 @@ Here is a complete list of valid parameters:

 **Example**: The URL parameter *header=x-amz-storage-class:REDUCED_REDUNDANCY* would send the HTTP header required to use the reduced redundancy storage option in the S3 API.

Signing Protocol
================

AWS signature version 4 is the default signing protocol. The boolean knob ``--knob_http_request_aws_v4_header`` selects between v4-style and v2-style signatures:
if the knob is set to ``true``, the v4 signature is used; if it is set to ``false``, the v2 signature is used.

.. _blob-credential-files:

Blob Credential Files
@ -46,6 +46,7 @@ enum {
    OPT_HEX_KEY_PREFIX,
    OPT_BEGIN_VERSION_FILTER,
    OPT_END_VERSION_FILTER,
    OPT_KNOB,
    OPT_HELP
};

@ -72,6 +73,7 @@ CSimpleOpt::SOption gConverterOptions[] = { { OPT_CONTAINER, "-r", SO_REQ_SEP },
    { OPT_HEX_KEY_PREFIX, "--hex-prefix", SO_REQ_SEP },
    { OPT_BEGIN_VERSION_FILTER, "--begin-version-filter", SO_REQ_SEP },
    { OPT_END_VERSION_FILTER, "--end-version-filter", SO_REQ_SEP },
    { OPT_KNOB, "--knob-", SO_REQ_SEP },
    { OPT_HELP, "-?", SO_NONE },
    { OPT_HELP, "-h", SO_NONE },
    { OPT_HELP, "--help", SO_NONE },
@ -26,17 +26,21 @@
#include <vector>

#include "fdbbackup/BackupTLSConfig.h"
#include "fdbclient/BuildFlags.h"
#include "fdbbackup/FileConverter.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/BackupContainer.h"
#include "fdbbackup/FileConverter.h"
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/IKnobCollection.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/MutationList.h"
#include "flow/ArgParseUtil.h"
#include "flow/IRandom.h"
#include "flow/Trace.h"
#include "flow/flow.h"
#include "flow/serialize.h"
#include "fdbclient/BuildFlags.h"

#include "flow/actorcompiler.h" // has to be last include

#define SevDecodeInfo SevVerbose
@ -73,11 +77,13 @@ void printDecodeUsage() {
    " --list-only Print file list and exit.\n"
    " -k KEY_PREFIX Use the prefix for filtering mutations\n"
    " --hex-prefix HEX_PREFIX\n"
    " The prefix specified in HEX format, e.g., \\x05\\x01.\n"
    " The prefix specified in HEX format, e.g., \"\\\\x05\\\\x01\".\n"
    " --begin-version-filter BEGIN_VERSION\n"
    " The version range's begin version (inclusive) for filtering.\n"
    " --end-version-filter END_VERSION\n"
    " The version range's end version (exclusive) for filtering.\n"
    " --knob-KNOBNAME KNOBVALUE\n"
    " Changes a knob value. KNOBNAME should be lowercase."
    "\n";
    return;
}
@ -97,6 +103,8 @@ struct DecodeParams {
    Version beginVersionFilter = 0;
    Version endVersionFilter = std::numeric_limits<Version>::max();

    std::vector<std::pair<std::string, std::string>> knobs;

    // Returns if [begin, end) overlap with the filter range
    bool overlap(Version begin, Version end) const {
        // Filter [100, 200), [50,75) [200, 300)
@ -130,8 +138,39 @@ struct DecodeParams {
        if (!prefix.empty()) {
            s.append(", KeyPrefix: ").append(printable(KeyRef(prefix)));
        }
        for (const auto& [knob, value] : knobs) {
            s.append(", KNOB-").append(knob).append(" = ").append(value);
        }
        return s;
    }

    void updateKnobs() {
        auto& g_knobs = IKnobCollection::getMutableGlobalKnobCollection();
        for (const auto& [knobName, knobValueString] : knobs) {
            try {
                auto knobValue = g_knobs.parseKnobValue(knobName, knobValueString);
                g_knobs.setKnob(knobName, knobValue);
            } catch (Error& e) {
                if (e.code() == error_code_invalid_option_value) {
                    std::cerr << "WARNING: Invalid value '" << knobValueString << "' for knob option '" << knobName
                              << "'\n";
                    TraceEvent(SevWarnAlways, "InvalidKnobValue")
                        .detail("Knob", printable(knobName))
                        .detail("Value", printable(knobValueString));
                } else {
                    std::cerr << "ERROR: Failed to set knob option '" << knobName << "': " << e.what() << "\n";
                    TraceEvent(SevError, "FailedToSetKnob")
                        .errorUnsuppressed(e)
                        .detail("Knob", printable(knobName))
                        .detail("Value", printable(knobValueString));
                    throw;
                }
            }
        }

        // Reinitialize knobs in order to update knobs that are dependent on explicitly set knobs
        g_knobs.initialize(Randomize::True, IsSimulated::False);
    }
};

// Decode an ASCII string, e.g., "\x15\x1b\x19\x04\xaf\x0c\x28\x0a",
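The knob-override flow above follows a common pattern: apply each explicit override with per-knob error handling, then re-derive any values that depend on the explicitly set ones. A small Python rendering of that pattern for illustration only (the helper names are hypothetical, not FDB APIs):

    def apply_knob_overrides(overrides, parse_value, set_knob, reinitialize):
        # overrides: list of (name, raw_value) pairs collected from --knob-NAME VALUE options
        for name, raw in overrides:
            try:
                set_knob(name, parse_value(name, raw))
            except ValueError:
                # invalid value: warn and continue, mirroring the invalid_option_value branch
                print("WARNING: invalid value %r for knob %r" % (raw, name))
            # any other exception propagates, mirroring the re-thrown errors above
        # recompute knobs that are derived from the explicitly set ones
        reinitialize()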
@ -256,6 +295,16 @@ int parseDecodeCommandLine(DecodeParams* param, CSimpleOpt* args) {
        param->tlsConfig.blobCredentials.push_back(args->OptionArg());
        break;

    case OPT_KNOB: {
        Optional<std::string> knobName = extractPrefixedArgument("--knob", args->OptionSyntax());
        if (!knobName.present()) {
            std::cerr << "ERROR: unable to parse knob option '" << args->OptionSyntax() << "'\n";
            return FDB_EXIT_ERROR;
        }
        param->knobs.emplace_back(knobName.get(), args->OptionArg());
        break;
    }

#ifndef TLS_DISABLED
    case TLSConfig::OPT_TLS_PLUGIN:
        args->OptionArg();
@ -552,6 +601,9 @@ int main(int argc, char** argv) {
    StringRef url(param.container_url);
    setupNetwork(0, UseMetrics::True);

    // Must be called after setupNetwork() to be effective
    param.updateKnobs();

    TraceEvent::setNetworkThread();
    openTraceFile(NetworkAddress(), 10 << 20, 500 << 20, param.log_dir, "decode", param.trace_log_group);
    param.tlsConfig.setupBlobCredentials();
@ -18,9 +18,12 @@
 * limitations under the License.
 */

#include <vector>

#include "contrib/fmt-8.1.1/include/fmt/format.h"
#include "flow/serialize.h"
#include "fdbclient/BlobGranuleFiles.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/SystemData.h" // for allKeys unit test - could remove
#include "flow/UnitTest.h"
@ -119,29 +122,43 @@ static void applyDelta(KeyRangeRef keyRange, MutationRef m, std::map<KeyRef, Val

static void applyDeltas(const GranuleDeltas& deltas,
                        KeyRangeRef keyRange,
                        Version beginVersion,
                        Version readVersion,
                        Version& lastFileEndVersion,
                        std::map<KeyRef, ValueRef>& dataMap) {
    if (!deltas.empty()) {
        // check that consecutive delta file versions are disjoint
        ASSERT(lastFileEndVersion < deltas.front().version);
    if (deltas.empty()) {
        return;
    }
    for (const MutationsAndVersionRef& delta : deltas) {
        if (delta.version > readVersion) {
    // check that consecutive delta file versions are disjoint
    ASSERT(lastFileEndVersion < deltas.front().version);

    const MutationsAndVersionRef* mutationIt = deltas.begin();
    // prune beginVersion if necessary
    if (beginVersion > deltas.front().version) {
        ASSERT(beginVersion <= deltas.back().version);
        // binary search for beginVersion
        mutationIt = std::lower_bound(deltas.begin(),
                                      deltas.end(),
                                      MutationsAndVersionRef(beginVersion, 0),
                                      MutationsAndVersionRef::OrderByVersion());
    }

    while (mutationIt != deltas.end()) {
        if (mutationIt->version > readVersion) {
            lastFileEndVersion = readVersion;
            return;
        }
        for (auto& m : delta.mutations) {
        for (auto& m : mutationIt->mutations) {
            applyDelta(keyRange, m, dataMap);
        }
        mutationIt++;
    }
    if (!deltas.empty()) {
        lastFileEndVersion = deltas.back().version;
    }
    lastFileEndVersion = deltas.back().version;
}

static Arena loadDeltaFile(StringRef deltaData,
                           KeyRangeRef keyRange,
                           Version beginVersion,
                           Version readVersion,
                           Version& lastFileEndVersion,
                           std::map<KeyRef, ValueRef>& dataMap) {
|
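The new beginVersion handling amounts to: binary-search past every delta strictly below beginVersion, then apply deltas in order until one exceeds readVersion. An illustrative Python rendering of that control flow (a sketch, not part of the commit; types are simplified to (version, mutations) tuples):

    import bisect

    def apply_deltas(deltas, begin_version, read_version, apply_one):
        # deltas: list of (version, mutations) pairs sorted ascending by version
        if not deltas:
            return None
        start = 0
        if begin_version > deltas[0][0]:
            # first delta with version >= begin_version (mirrors std::lower_bound)
            start = bisect.bisect_left(deltas, (begin_version,))
        for version, mutations in deltas[start:]:
            if version > read_version:
                return read_version          # stop early; later deltas are past the read version
            for m in mutations:
                apply_one(m)
        return deltas[-1][0]                 # lastFileEndVersion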
@ -151,7 +168,7 @@ static Arena loadDeltaFile(StringRef deltaData,
    reader.deserialize(FileIdentifierFor<GranuleDeltas>::value, deltas, parseArena);

    if (BG_READ_DEBUG) {
        fmt::print("Parsed {}} deltas from file\n", deltas.size());
        fmt::print("Parsed {} deltas from file\n", deltas.size());
    }

    // TODO REMOVE sanity check
@ -163,19 +180,18 @@ static Arena loadDeltaFile(StringRef deltaData,
        ASSERT(deltas[i].version <= deltas[i + 1].version);
    }

    applyDeltas(deltas, keyRange, readVersion, lastFileEndVersion, dataMap);
    applyDeltas(deltas, keyRange, beginVersion, readVersion, lastFileEndVersion, dataMap);
    return parseArena;
}

RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
                                   KeyRangeRef keyRange,
                                   Version beginVersion,
                                   Version readVersion,
                                   Optional<StringRef> snapshotData,
                                   StringRef deltaFileData[]) {
    // TODO REMOVE with V2 of protocol
    // TODO REMOVE with early replying
    ASSERT(readVersion == chunk.includedVersion);
    ASSERT(chunk.snapshotFile.present());
    ASSERT(snapshotData.present());

    // Arena to hold all allocations for applying deltas. Most of it, and the arenas produced by reading the files,
    // will likely be tossed if there are a significant number of mutations, so we copy at the end instead of doing a
@ -195,13 +211,14 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
        fmt::print("Applying {} delta files\n", chunk.deltaFiles.size());
    }
    for (int deltaIdx = 0; deltaIdx < chunk.deltaFiles.size(); deltaIdx++) {
        Arena deltaArena = loadDeltaFile(deltaFileData[deltaIdx], keyRange, readVersion, lastFileEndVersion, dataMap);
        Arena deltaArena =
            loadDeltaFile(deltaFileData[deltaIdx], keyRange, beginVersion, readVersion, lastFileEndVersion, dataMap);
        arena.dependsOn(deltaArena);
    }
    if (BG_READ_DEBUG) {
        fmt::print("Applying {} memory deltas\n", chunk.newDeltas.size());
    }
    applyDeltas(chunk.newDeltas, keyRange, readVersion, lastFileEndVersion, dataMap);
    applyDeltas(chunk.newDeltas, keyRange, beginVersion, readVersion, lastFileEndVersion, dataMap);

    RangeResult ret;
    for (auto& it : dataMap) {
@ -211,50 +228,85 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
|
|||
return ret;
|
||||
}
|
||||
|
||||
struct GranuleLoadIds {
|
||||
Optional<int64_t> snapshotId;
|
||||
std::vector<int64_t> deltaIds;
|
||||
};
|
||||
|
||||
static void startLoad(const ReadBlobGranuleContext granuleContext,
|
||||
const BlobGranuleChunkRef& chunk,
|
||||
GranuleLoadIds& loadIds) {
|
||||
|
||||
// Start load process for all files in chunk
|
||||
if (chunk.snapshotFile.present()) {
|
||||
std::string snapshotFname = chunk.snapshotFile.get().filename.toString();
|
||||
// FIXME: full file length won't always be length of read
|
||||
loadIds.snapshotId = granuleContext.start_load_f(snapshotFname.c_str(),
|
||||
snapshotFname.size(),
|
||||
chunk.snapshotFile.get().offset,
|
||||
chunk.snapshotFile.get().length,
|
||||
chunk.snapshotFile.get().length,
|
||||
granuleContext.userContext);
|
||||
}
|
||||
loadIds.deltaIds.reserve(chunk.deltaFiles.size());
|
||||
for (int deltaFileIdx = 0; deltaFileIdx < chunk.deltaFiles.size(); deltaFileIdx++) {
|
||||
std::string deltaFName = chunk.deltaFiles[deltaFileIdx].filename.toString();
|
||||
int64_t deltaLoadId = granuleContext.start_load_f(deltaFName.c_str(),
|
||||
deltaFName.size(),
|
||||
chunk.deltaFiles[deltaFileIdx].offset,
|
||||
chunk.deltaFiles[deltaFileIdx].length,
|
||||
chunk.deltaFiles[deltaFileIdx].length,
|
||||
granuleContext.userContext);
|
||||
loadIds.deltaIds.push_back(deltaLoadId);
|
||||
}
|
||||
}
|
||||
|
||||
ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<BlobGranuleChunkRef>>& files,
|
||||
const KeyRangeRef& keyRange,
|
||||
Version beginVersion,
|
||||
Version readVersion,
|
||||
ReadBlobGranuleContext granuleContext) {
|
||||
int64_t parallelism = granuleContext.granuleParallelism;
|
||||
if (parallelism < 1) {
|
||||
parallelism = 1;
|
||||
}
|
||||
if (parallelism >= CLIENT_KNOBS->BG_MAX_GRANULE_PARALLELISM) {
|
||||
parallelism = CLIENT_KNOBS->BG_MAX_GRANULE_PARALLELISM;
|
||||
}
|
||||
|
||||
GranuleLoadIds loadIds[files.size()];
|
||||
|
||||
// Kick off first file reads if parallelism > 1
|
||||
for (int i = 0; i < parallelism - 1 && i < files.size(); i++) {
|
||||
startLoad(granuleContext, files[i], loadIds[i]);
|
||||
}
|
||||
|
||||
try {
|
||||
RangeResult results;
|
||||
// FIXME: could submit multiple chunks to start_load_f in parallel?
|
||||
for (const BlobGranuleChunkRef& chunk : files) {
|
||||
for (int chunkIdx = 0; chunkIdx < files.size(); chunkIdx++) {
|
||||
// Kick off files for this granule if parallelism == 1, or future granule if parallelism > 1
|
||||
if (chunkIdx + parallelism - 1 < files.size()) {
|
||||
startLoad(granuleContext, files[chunkIdx + parallelism - 1], loadIds[chunkIdx + parallelism - 1]);
|
||||
}
|
||||
|
||||
RangeResult chunkRows;
|
||||
|
||||
int64_t snapshotLoadId;
|
||||
int64_t deltaLoadIds[chunk.deltaFiles.size()];
|
||||
|
||||
// Start load process for all files in chunk
|
||||
// In V1 of api snapshot is required, optional is just for forward compatibility
|
||||
ASSERT(chunk.snapshotFile.present());
|
||||
std::string snapshotFname = chunk.snapshotFile.get().filename.toString();
|
||||
snapshotLoadId = granuleContext.start_load_f(snapshotFname.c_str(),
|
||||
snapshotFname.size(),
|
||||
chunk.snapshotFile.get().offset,
|
||||
chunk.snapshotFile.get().length,
|
||||
granuleContext.userContext);
|
||||
int64_t deltaLoadLengths[chunk.deltaFiles.size()];
|
||||
StringRef deltaData[chunk.deltaFiles.size()];
|
||||
for (int deltaFileIdx = 0; deltaFileIdx < chunk.deltaFiles.size(); deltaFileIdx++) {
|
||||
std::string deltaFName = chunk.deltaFiles[deltaFileIdx].filename.toString();
|
||||
deltaLoadIds[deltaFileIdx] = granuleContext.start_load_f(deltaFName.c_str(),
|
||||
deltaFName.size(),
|
||||
chunk.deltaFiles[deltaFileIdx].offset,
|
||||
chunk.deltaFiles[deltaFileIdx].length,
|
||||
granuleContext.userContext);
|
||||
deltaLoadLengths[deltaFileIdx] = chunk.deltaFiles[deltaFileIdx].length;
|
||||
}
|
||||
|
||||
// once all loads kicked off, load data for chunk
|
||||
StringRef snapshotData(granuleContext.get_load_f(snapshotLoadId, granuleContext.userContext),
|
||||
chunk.snapshotFile.get().length);
|
||||
if (!snapshotData.begin()) {
|
||||
return ErrorOr<RangeResult>(blob_granule_file_load_error());
|
||||
Optional<StringRef> snapshotData;
|
||||
if (files[chunkIdx].snapshotFile.present()) {
|
||||
snapshotData =
|
||||
StringRef(granuleContext.get_load_f(loadIds[chunkIdx].snapshotId.get(), granuleContext.userContext),
|
||||
files[chunkIdx].snapshotFile.get().length);
|
||||
if (!snapshotData.get().begin()) {
|
||||
return ErrorOr<RangeResult>(blob_granule_file_load_error());
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < chunk.deltaFiles.size(); i++) {
|
||||
deltaData[i] = StringRef(granuleContext.get_load_f(deltaLoadIds[i], granuleContext.userContext),
|
||||
chunk.deltaFiles[i].length);
|
||||
|
||||
StringRef deltaData[files[chunkIdx].deltaFiles.size()];
|
||||
for (int i = 0; i < files[chunkIdx].deltaFiles.size(); i++) {
|
||||
deltaData[i] =
|
||||
StringRef(granuleContext.get_load_f(loadIds[chunkIdx].deltaIds[i], granuleContext.userContext),
|
||||
files[chunkIdx].deltaFiles[i].length);
|
||||
// null data is error
|
||||
if (!deltaData[i].begin()) {
|
||||
return ErrorOr<RangeResult>(blob_granule_file_load_error());
|
||||
|
@ -262,14 +314,17 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
|
|||
}
|
||||
|
||||
// materialize rows from chunk
|
||||
chunkRows = materializeBlobGranule(chunk, keyRange, readVersion, snapshotData, deltaData);
|
||||
chunkRows =
|
||||
materializeBlobGranule(files[chunkIdx], keyRange, beginVersion, readVersion, snapshotData, deltaData);
|
||||
|
||||
results.arena().dependsOn(chunkRows.arena());
|
||||
results.append(results.arena(), chunkRows.begin(), chunkRows.size());
|
||||
|
||||
granuleContext.free_load_f(snapshotLoadId, granuleContext.userContext);
|
||||
for (int i = 0; i < chunk.deltaFiles.size(); i++) {
|
||||
granuleContext.free_load_f(deltaLoadIds[i], granuleContext.userContext);
|
||||
if (loadIds[chunkIdx].snapshotId.present()) {
|
||||
granuleContext.free_load_f(loadIds[chunkIdx].snapshotId.get(), granuleContext.userContext);
|
||||
}
|
||||
for (int i = 0; i < loadIds[chunkIdx].deltaIds.size(); i++) {
|
||||
granuleContext.free_load_f(loadIds[chunkIdx].deltaIds[i], granuleContext.userContext);
|
||||
}
|
||||
}
|
||||
return ErrorOr<RangeResult>(results);
|
||||
|
@ -278,8 +333,7 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
|
|||
}
|
||||
}
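
loadAndMaterializeBlobGranules keeps at most granuleParallelism loads outstanding: it primes the first parallelism - 1 chunks, then starts chunk i + parallelism - 1 just before materializing chunk i. A stripped-down sketch of that sliding window, with hypothetical startLoad/consume callbacks standing in for the granule context:

#include <functional>

// Keep up to `parallelism` loads in flight while consuming results in order.
void pipelinedLoad(int numChunks,
                   int parallelism,
                   const std::function<void(int)>& startLoad,
                   const std::function<void(int)>& consume) {
    if (parallelism < 1) {
        parallelism = 1;
    }
    // Prime the window with the first parallelism - 1 loads.
    for (int i = 0; i < parallelism - 1 && i < numChunks; i++) {
        startLoad(i);
    }
    for (int i = 0; i < numChunks; i++) {
        // Top up the window, then consume the oldest outstanding chunk.
        if (i + parallelism - 1 < numChunks) {
            startLoad(i + parallelism - 1);
        }
        consume(i);
    }
}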
|
||||
|
||||
// FIXME: re-enable test!
|
||||
TEST_CASE(":/blobgranule/files/applyDelta") {
|
||||
TEST_CASE("/blobgranule/files/applyDelta") {
|
||||
printf("Testing blob granule delta applying\n");
|
||||
Arena a;
|
||||
|
||||
|
|
|
@ -33,6 +33,7 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
|
|||
|
||||
RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
|
||||
KeyRangeRef keyRange,
|
||||
Version beginVersion,
|
||||
Version readVersion,
|
||||
Optional<StringRef> snapshotData,
|
||||
StringRef deltaFileData[]);
|
||||
|
|
|
@ -28,6 +28,7 @@
|
|||
#include "fdbclient/BlobGranuleReader.actor.h"
|
||||
#include "fdbclient/BlobWorkerCommon.h"
|
||||
#include "fdbclient/BlobWorkerInterface.h"
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
#include "flow/actorcompiler.h" // This must be the last #include.
|
||||
|
||||
// TODO more efficient data structure besides std::map? PTree is unnecessary since this isn't versioned, but some other
|
||||
|
@ -52,7 +53,6 @@ ACTOR Future<Standalone<StringRef>> readFile(Reference<BackupContainerFileSystem
|
|||
StringRef dataRef(data, f.length);
|
||||
return Standalone<StringRef>(dataRef, arena);
|
||||
} catch (Error& e) {
|
||||
printf("Reading file %s got error %s\n", f.toString().c_str(), e.name());
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
@ -64,22 +64,25 @@ ACTOR Future<Standalone<StringRef>> readFile(Reference<BackupContainerFileSystem
|
|||
// sub-functions that BlobGranuleFiles actually exposes?
|
||||
ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
|
||||
KeyRangeRef keyRange,
|
||||
Version beginVersion,
|
||||
Version readVersion,
|
||||
Reference<BackupContainerFileSystem> bstore,
|
||||
Optional<BlobWorkerStats*> stats) {
|
||||
|
||||
// TODO REMOVE with V2 of protocol
|
||||
// TODO REMOVE with early replying
|
||||
ASSERT(readVersion == chunk.includedVersion);
|
||||
ASSERT(chunk.snapshotFile.present());
|
||||
|
||||
state Arena arena;
|
||||
|
||||
try {
|
||||
Future<Standalone<StringRef>> readSnapshotFuture = readFile(bstore, chunk.snapshotFile.get());
|
||||
state std::vector<Future<Standalone<StringRef>>> readDeltaFutures;
|
||||
if (stats.present()) {
|
||||
++stats.get()->s3GetReqs;
|
||||
Future<Standalone<StringRef>> readSnapshotFuture;
|
||||
if (chunk.snapshotFile.present()) {
|
||||
readSnapshotFuture = readFile(bstore, chunk.snapshotFile.get());
|
||||
if (stats.present()) {
|
||||
++stats.get()->s3GetReqs;
|
||||
}
|
||||
}
|
||||
state std::vector<Future<Standalone<StringRef>>> readDeltaFutures;
|
||||
|
||||
readDeltaFutures.reserve(chunk.deltaFiles.size());
|
||||
for (BlobFilePointerRef deltaFile : chunk.deltaFiles) {
|
||||
|
@ -89,8 +92,12 @@ ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
|
|||
}
|
||||
}
|
||||
|
||||
state Standalone<StringRef> snapshotData = wait(readSnapshotFuture);
|
||||
arena.dependsOn(snapshotData.arena());
|
||||
state Optional<StringRef> snapshotData; // not present if snapshotFile isn't present
|
||||
if (chunk.snapshotFile.present()) {
|
||||
state Standalone<StringRef> s = wait(readSnapshotFuture);
|
||||
arena.dependsOn(s.arena());
|
||||
snapshotData = s;
|
||||
}
|
||||
|
||||
state int numDeltaFiles = chunk.deltaFiles.size();
|
||||
state StringRef* deltaData = new (arena) StringRef[numDeltaFiles];
|
||||
|
@ -103,10 +110,9 @@ ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
|
|||
arena.dependsOn(data.arena());
|
||||
}
|
||||
|
||||
return materializeBlobGranule(chunk, keyRange, readVersion, snapshotData, deltaData);
|
||||
return materializeBlobGranule(chunk, keyRange, beginVersion, readVersion, snapshotData, deltaData);
|
||||
|
||||
} catch (Error& e) {
|
||||
printf("Reading blob granule got error %s\n", e.name());
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
@ -121,18 +127,12 @@ ACTOR Future<Void> readBlobGranules(BlobGranuleFileRequest request,
|
|||
try {
|
||||
state int i;
|
||||
for (i = 0; i < reply.chunks.size(); i++) {
|
||||
/*printf("ReadBlobGranules processing chunk %d [%s - %s)\n",
|
||||
i,
|
||||
reply.chunks[i].keyRange.begin.printable().c_str(),
|
||||
reply.chunks[i].keyRange.end.printable().c_str());*/
|
||||
RangeResult chunkResult =
|
||||
wait(readBlobGranule(reply.chunks[i], request.keyRange, request.readVersion, bstore));
|
||||
RangeResult chunkResult = wait(
|
||||
readBlobGranule(reply.chunks[i], request.keyRange, request.beginVersion, request.readVersion, bstore));
|
||||
results.send(std::move(chunkResult));
|
||||
}
|
||||
// printf("ReadBlobGranules done, sending EOS\n");
|
||||
results.sendError(end_of_stream());
|
||||
} catch (Error& e) {
|
||||
printf("ReadBlobGranules got error %s\n", e.name());
|
||||
results.sendError(e);
|
||||
}
|
||||
|
||||
|
|
|
@ -40,6 +40,7 @@
|
|||
// the request
|
||||
ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
|
||||
KeyRangeRef keyRange,
|
||||
Version beginVersion,
|
||||
Version readVersion,
|
||||
Reference<BackupContainerFileSystem> bstore,
|
||||
Optional<BlobWorkerStats*> stats = Optional<BlobWorkerStats*>());
|
||||
|
|
|
@ -38,6 +38,8 @@ struct BlobWorkerStats {
|
|||
Counter commitVersionChecks;
|
||||
Counter granuleUpdateErrors;
|
||||
Counter granuleRequestTimeouts;
|
||||
Counter readRequestsWithBegin;
|
||||
Counter readRequestsCollapsed;
|
||||
|
||||
int numRangesAssigned;
|
||||
int mutationBytesBuffered;
|
||||
|
@ -59,6 +61,7 @@ struct BlobWorkerStats {
|
|||
readReqTotalFilesReturned("ReadReqTotalFilesReturned", cc),
|
||||
readReqDeltaBytesReturned("ReadReqDeltaBytesReturned", cc), commitVersionChecks("CommitVersionChecks", cc),
|
||||
granuleUpdateErrors("GranuleUpdateErrors", cc), granuleRequestTimeouts("GranuleRequestTimeouts", cc),
|
||||
readRequestsWithBegin("ReadRequestsWithBegin", cc), readRequestsCollapsed("ReadRequestsCollapsed", cc),
|
||||
numRangesAssigned(0), mutationBytesBuffered(0), activeReadRequests(0) {
|
||||
specialCounter(cc, "NumRangesAssigned", [this]() { return this->numRangesAssigned; });
|
||||
specialCounter(cc, "MutationBytesBuffered", [this]() { return this->mutationBytesBuffered; });
|
||||
|
|
|
@ -86,13 +86,14 @@ struct BlobGranuleFileRequest {
|
|||
KeyRangeRef keyRange;
|
||||
Version beginVersion = 0;
|
||||
Version readVersion;
|
||||
bool canCollapseBegin = true;
|
||||
ReplyPromise<BlobGranuleFileReply> reply;
|
||||
|
||||
BlobGranuleFileRequest() {}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, keyRange, beginVersion, readVersion, reply, arena);
|
||||
serializer(ar, keyRange, beginVersion, readVersion, canCollapseBegin, reply, arena);
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -205,6 +205,17 @@ if(BUILD_AZURE_BACKUP)
|
|||
)
|
||||
endif()
|
||||
|
||||
|
||||
if(WITH_AWS_BACKUP)
|
||||
add_compile_definitions(BUILD_AWS_BACKUP)
|
||||
|
||||
set(FDBCLIENT_SRCS
|
||||
${FDBCLIENT_SRCS}
|
||||
FDBAWSCredentialsProvider.h)
|
||||
|
||||
include(awssdk)
|
||||
endif()
|
||||
|
||||
add_flow_target(STATIC_LIBRARY NAME fdbclient SRCS ${FDBCLIENT_SRCS} ADDL_SRCS ${options_srcs})
|
||||
add_dependencies(fdbclient fdboptions)
|
||||
target_link_libraries(fdbclient PUBLIC fdbrpc msgpack)
|
||||
|
@ -224,3 +235,8 @@ if(BUILD_AZURE_BACKUP)
|
|||
target_link_libraries(fdbclient PRIVATE curl uuid azure-storage-lite)
|
||||
target_link_libraries(fdbclient_sampling PRIVATE curl uuid azure-storage-lite)
|
||||
endif()
|
||||
|
||||
if(BUILD_AWS_BACKUP)
|
||||
target_link_libraries(fdbclient PUBLIC awssdk_target)
|
||||
target_link_libraries(fdbclient_sampling PUBLIC awssdk_target)
|
||||
endif()
|
||||
|
|
|
@ -50,6 +50,7 @@ void ClientKnobs::initialize(Randomize randomize) {
|
|||
init( MAX_GENERATIONS_OVERRIDE, 0 );
|
||||
init( MAX_GENERATIONS_SIM, 50 ); //Disable network connections after this many generations in simulation, should be less than RECOVERY_DELAY_START_GENERATION
|
||||
|
||||
init( COORDINATOR_HOSTNAME_RESOLVE_DELAY, 0.05 );
|
||||
init( COORDINATOR_RECONNECTION_DELAY, 1.0 );
|
||||
init( CLIENT_EXAMPLE_AMOUNT, 20 );
|
||||
init( MAX_CLIENT_STATUS_AGE, 1.0 );
|
||||
|
@ -280,6 +281,9 @@ void ClientKnobs::initialize(Randomize randomize) {
|
|||
init( MVC_CLIENTLIB_CHUNK_SIZE, 8*1024 );
|
||||
init( MVC_CLIENTLIB_CHUNKS_PER_TRANSACTION, 32 );
|
||||
|
||||
// Blob granules
|
||||
init( BG_MAX_GRANULE_PARALLELISM, 10 );
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
|
|
|
@ -49,6 +49,7 @@ public:
|
|||
double MAX_GENERATIONS_OVERRIDE;
|
||||
double MAX_GENERATIONS_SIM;
|
||||
|
||||
double COORDINATOR_HOSTNAME_RESOLVE_DELAY;
|
||||
double COORDINATOR_RECONNECTION_DELAY;
|
||||
int CLIENT_EXAMPLE_AMOUNT;
|
||||
double MAX_CLIENT_STATUS_AGE;
|
||||
|
@ -272,6 +273,9 @@ public:
|
|||
int MVC_CLIENTLIB_CHUNK_SIZE;
|
||||
int MVC_CLIENTLIB_CHUNKS_PER_TRANSACTION;
|
||||
|
||||
// Blob Granules
|
||||
int BG_MAX_GRANULE_PARALLELISM;
|
||||
|
||||
ClientKnobs(Randomize randomize);
|
||||
void initialize(Randomize randomize);
|
||||
};
|
||||
|
|
|
@ -514,7 +514,7 @@ public:
|
|||
Counter transactionGrvTimedOutBatches;
|
||||
|
||||
ContinuousSample<double> latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit,
|
||||
bytesPerCommit;
|
||||
bytesPerCommit, bgLatencies, bgGranulesPerRequest;
|
||||
|
||||
int outstandingWatches;
|
||||
int maxOutstandingWatches;
|
||||
|
@ -538,6 +538,7 @@ public:
|
|||
bool transactionTracingSample;
|
||||
double verifyCausalReadsProp = 0.0;
|
||||
bool blobGranuleNoMaterialize = false;
|
||||
bool anyBlobGranuleRequests = false;
|
||||
|
||||
Future<Void> logger;
|
||||
Future<Void> throttleExpirer;
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
/*
|
||||
* FDBAWSCredentialsProvider.h
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#if (!defined FDB_AWS_CREDENTIALS_PROVIDER_H) && (defined BUILD_AWS_BACKUP)
|
||||
#define FDB_AWS_CREDENTIALS_PROVIDER_H
|
||||
#pragma once
|
||||
|
||||
#include "aws/core/Aws.h"
|
||||
#include "aws/core/auth/AWSCredentialsProviderChain.h"
|
||||
|
||||
// Singleton
|
||||
namespace FDBAWSCredentialsProvider {
|
||||
bool doneInit = false;
|
||||
|
||||
// You're supposed to call AWS::ShutdownAPI(options); once done
|
||||
// But we want this to live for the lifetime of the process, so we don't do that
|
||||
static Aws::Auth::AWSCredentials getAwsCredentials() {
|
||||
if (!doneInit) {
|
||||
doneInit = true;
|
||||
Aws::SDKOptions options;
|
||||
Aws::InitAPI(options);
|
||||
TraceEvent("AWSSDKInitSuccessful");
|
||||
}
|
||||
Aws::Auth::DefaultAWSCredentialsProviderChain credProvider;
|
||||
Aws::Auth::AWSCredentials creds = credProvider.GetAWSCredentials();
|
||||
return creds;
|
||||
}
|
||||
} // namespace FDBAWSCredentialsProvider
|
||||
|
||||
#endif
|
|
@ -1342,7 +1342,12 @@ struct ReadBlobGranuleContext {
    void* userContext;

    // Returns a unique id for the load. Asynchronous to support queueing multiple in parallel.
    int64_t (*start_load_f)(const char* filename, int filenameLength, int64_t offset, int64_t length, void* context);
    int64_t (*start_load_f)(const char* filename,
                            int filenameLength,
                            int64_t offset,
                            int64_t length,
                            int64_t fullFileLength,
                            void* context);

    // Returns data for the load. Pass the loadId returned by start_load_f
    uint8_t* (*get_load_f)(int64_t loadId, void* context);
@ -1353,6 +1358,9 @@ struct ReadBlobGranuleContext {
    // Set this to true for testing if you don't want to read the granule files,
    // just do the request to the blob workers
    bool debugNoMaterialize;

    // number of granules to load in parallel (default 1)
    int granuleParallelism = 1;
};
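
A minimal sketch of a client filling in this context with synchronous, local-file callbacks. LocalLoader is a hypothetical helper, and free_load_f's (loadId, context) shape is inferred from the call sites elsewhere in this change; a real client would read from its blob store and handle I/O errors:

#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct LocalLoader {
    int64_t nextId = 1;
    std::map<int64_t, std::vector<uint8_t>> loads;
};

static int64_t startLoad(const char* filename,
                         int filenameLength,
                         int64_t offset,
                         int64_t length,
                         int64_t /*fullFileLength*/,
                         void* context) {
    auto* loader = static_cast<LocalLoader*>(context);
    std::vector<uint8_t> buf(length);
    std::string path(filename, filenameLength);
    if (FILE* f = std::fopen(path.c_str(), "rb")) {
        std::fseek(f, static_cast<long>(offset), SEEK_SET);
        size_t n = std::fread(buf.data(), 1, buf.size(), f); // error handling omitted
        (void)n;
        std::fclose(f);
    }
    int64_t id = loader->nextId++;
    loader->loads[id] = std::move(buf);
    return id;
}

static uint8_t* getLoad(int64_t loadId, void* context) {
    return static_cast<LocalLoader*>(context)->loads[loadId].data();
}

static void freeLoad(int64_t loadId, void* context) {
    static_cast<LocalLoader*>(context)->loads.erase(loadId);
}

// Wiring it up (granuleParallelism left at 1 since these loads complete synchronously):
// LocalLoader loader;
// ReadBlobGranuleContext ctx;
// ctx.userContext = &loader;
// ctx.start_load_f = &startLoad;
// ctx.get_load_f = &getLoad;
// ctx.free_load_f = &freeLoad;
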
// Store metadata associated with each storage server. Now it only contains data be used in perpetual storage wiggle.
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
|
||||
#include "fdbclient/json_spirit/json_spirit_writer_template.h"
|
||||
#include "fdbclient/json_spirit/json_spirit_reader_template.h"
|
||||
#include "flow/Error.h"
|
||||
|
||||
// JSONDoc is a convenient reader/writer class for manipulating JSON documents using "paths".
|
||||
// Access is done using a "path", which is a string of dot-separated
|
||||
|
|
|
@ -169,7 +169,7 @@ void ClusterConnectionString::resolveHostnamesBlocking() {
|
|||
}
|
||||
|
||||
void ClusterConnectionString::resetToUnresolved() {
|
||||
if (hostnames.size() > 0) {
|
||||
if (status == RESOLVED && hostnames.size() > 0) {
|
||||
coords.clear();
|
||||
hostnames.clear();
|
||||
networkAddressToHostname.clear();
|
||||
|
@ -558,8 +558,8 @@ ACTOR Future<Void> monitorNominee(Key key,
|
|||
.detail("Hostname", hostname.present() ? hostname.get().toString() : "UnknownHostname")
|
||||
.detail("OldAddr", coord.getLeader.getEndpoint().getPrimaryAddress().toString());
|
||||
if (rep.getError().code() == error_code_request_maybe_delivered) {
|
||||
// 50 milliseconds delay to prevent tight resolving loop due to outdated DNS cache
|
||||
wait(delay(0.05));
|
||||
// Delay to prevent tight resolving loop due to outdated DNS cache
|
||||
wait(delay(CLIENT_KNOBS->COORDINATOR_HOSTNAME_RESOLVE_DELAY));
|
||||
throw coordinators_changed();
|
||||
} else {
|
||||
throw rep.getError();
|
||||
|
@ -589,7 +589,6 @@ ACTOR Future<Void> monitorNominee(Key key,
|
|||
|
||||
if (li.present() && li.get().forward)
|
||||
wait(Future<Void>(Never()));
|
||||
wait(Future<Void>(Void()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -282,8 +282,9 @@ ThreadResult<RangeResult> DLTransaction::readBlobGranules(const KeyRangeRef& key
|
|||
context.get_load_f = granuleContext.get_load_f;
|
||||
context.free_load_f = granuleContext.free_load_f;
|
||||
context.debugNoMaterialize = granuleContext.debugNoMaterialize;
|
||||
context.granuleParallelism = granuleContext.granuleParallelism;
|
||||
|
||||
int64_t rv = readVersion.present() ? readVersion.get() : invalidVersion;
|
||||
int64_t rv = readVersion.present() ? readVersion.get() : latestVersion;
|
||||
|
||||
FdbCApi::FDBResult* r = api->transactionReadBlobGranules(tr,
|
||||
keyRange.begin.begin(),
|
||||
|
|
|
@ -95,8 +95,12 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
|
|||
void* userContext;
|
||||
|
||||
// Returns a unique id for the load. Asynchronous to support queueing multiple in parallel.
|
||||
int64_t (
|
||||
*start_load_f)(const char* filename, int filenameLength, int64_t offset, int64_t length, void* context);
|
||||
int64_t (*start_load_f)(const char* filename,
|
||||
int filenameLength,
|
||||
int64_t offset,
|
||||
int64_t length,
|
||||
int64_t fullFileLength,
|
||||
void* context);
|
||||
|
||||
// Returns data for the load. Pass the loadId returned by start_load_f
|
||||
uint8_t* (*get_load_f)(int64_t loadId, void* context);
|
||||
|
@ -107,6 +111,9 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
|
|||
// set this to true for testing if you don't want to read the granule files, just
|
||||
// do the request to the blob workers
|
||||
fdb_bool_t debugNoMaterialize;
|
||||
|
||||
// number of granules to load in parallel (default 1)
|
||||
int granuleParallelism;
|
||||
} FDBReadBlobGranuleContext;
|
||||
|
||||
typedef void (*FDBCallback)(FDBFuture* future, void* callback_parameter);
|
||||
|
|
|
@ -533,6 +533,14 @@ ACTOR Future<Void> databaseLogger(DatabaseContext* cx) {
|
|||
.detail("MedianBytesPerCommit", cx->bytesPerCommit.median())
|
||||
.detail("MaxBytesPerCommit", cx->bytesPerCommit.max())
|
||||
.detail("NumLocalityCacheEntries", cx->locationCache.size());
|
||||
if (cx->anyBlobGranuleRequests) {
|
||||
ev.detail("MeanBGLatency", cx->bgLatencies.mean())
|
||||
.detail("MedianBGLatency", cx->bgLatencies.median())
|
||||
.detail("MaxBGLatency", cx->bgLatencies.max())
|
||||
.detail("MeanBGGranulesPerRequest", cx->bgGranulesPerRequest.mean())
|
||||
.detail("MedianBGGranulesPerRequest", cx->bgGranulesPerRequest.median())
|
||||
.detail("MaxBGGranulesPerRequest", cx->bgGranulesPerRequest.max());
|
||||
}
|
||||
}
|
||||
|
||||
cx->latencies.clear();
|
||||
|
@ -541,6 +549,8 @@ ACTOR Future<Void> databaseLogger(DatabaseContext* cx) {
|
|||
cx->commitLatencies.clear();
|
||||
cx->mutationsPerCommit.clear();
|
||||
cx->bytesPerCommit.clear();
|
||||
cx->bgLatencies.clear();
|
||||
cx->bgGranulesPerRequest.clear();
|
||||
|
||||
lastLogged = now();
|
||||
}
|
||||
|
@ -1353,11 +1363,11 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<IClusterConnection
|
|||
transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc),
|
||||
transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc),
|
||||
latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000),
|
||||
bytesPerCommit(1000), outstandingWatches(0), lastGrvTime(0.0), cachedReadVersion(0), lastRkBatchThrottleTime(0.0),
|
||||
lastRkDefaultThrottleTime(0.0), lastProxyRequestTime(0.0), transactionTracingSample(false), taskID(taskID),
|
||||
clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor), coordinator(coordinator), apiVersion(apiVersion),
|
||||
mvCacheInsertLocation(0), healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0),
|
||||
smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
|
||||
bytesPerCommit(1000), bgLatencies(1000), bgGranulesPerRequest(1000), outstandingWatches(0), lastGrvTime(0.0),
|
||||
cachedReadVersion(0), lastRkBatchThrottleTime(0.0), lastRkDefaultThrottleTime(0.0), lastProxyRequestTime(0.0),
|
||||
transactionTracingSample(false), taskID(taskID), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor),
|
||||
coordinator(coordinator), apiVersion(apiVersion), mvCacheInsertLocation(0), healthMetricsLastUpdated(0),
|
||||
detailedHealthMetricsLastUpdated(0), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
|
||||
specialKeySpace(std::make_unique<SpecialKeySpace>(specialKeys.begin, specialKeys.end, /* test */ false)),
|
||||
connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {
|
||||
dbId = deterministicRandom()->randomUniqueID();
|
||||
|
@ -1619,7 +1629,8 @@ DatabaseContext::DatabaseContext(const Error& err)
|
|||
transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc),
|
||||
transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc),
|
||||
latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000),
|
||||
bytesPerCommit(1000), transactionTracingSample(false), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
|
||||
bytesPerCommit(1000), bgLatencies(1000), bgGranulesPerRequest(1000), transactionTracingSample(false),
|
||||
smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
|
||||
connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {}
|
||||
|
||||
// Static constructor used by server processes to create a DatabaseContext
|
||||
|
@ -7340,6 +7351,7 @@ ACTOR Future<Standalone<VectorRef<BlobGranuleChunkRef>>> readBlobGranulesActor(
|
|||
state Version rv;
|
||||
|
||||
state Standalone<VectorRef<BlobGranuleChunkRef>> results;
|
||||
state double startTime = now();
|
||||
|
||||
if (read.present()) {
|
||||
rv = read.get();
|
||||
|
@ -7442,6 +7454,7 @@ ACTOR Future<Standalone<VectorRef<BlobGranuleChunkRef>>> readBlobGranulesActor(
|
|||
req.keyRange = KeyRangeRef(StringRef(req.arena, granuleStartKey), StringRef(req.arena, granuleEndKey));
|
||||
req.beginVersion = begin;
|
||||
req.readVersion = rv;
|
||||
req.canCollapseBegin = true; // TODO make this a parameter once we support it
|
||||
|
||||
std::vector<Reference<ReferencedInterface<BlobWorkerInterface>>> v;
|
||||
v.push_back(
|
||||
|
@ -7514,6 +7527,11 @@ ACTOR Future<Standalone<VectorRef<BlobGranuleChunkRef>>> readBlobGranulesActor(
|
|||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
self->trState->cx->anyBlobGranuleRequests = true;
|
||||
self->trState->cx->bgGranulesPerRequest.addSample(results.size());
|
||||
self->trState->cx->bgLatencies.addSample(now() - startTime);
|
||||
|
||||
if (readVersionOut != nullptr) {
|
||||
*readVersionOut = rv;
|
||||
}
|
||||
|
|
|
@ -1791,8 +1791,6 @@ Future<Standalone<VectorRef<BlobGranuleChunkRef>>> ReadYourWritesTransaction::re
|
|||
Version begin,
|
||||
Optional<Version> readVersion,
|
||||
Version* readVersionOut) {
|
||||
// Remove in V2 of API
|
||||
ASSERT(begin == 0);
|
||||
|
||||
if (!options.readYourWritesDisabled) {
|
||||
return blob_granule_no_ryw();
|
||||
|
|
|
@ -34,6 +34,8 @@
|
|||
#include "fdbrpc/IAsyncFile.h"
|
||||
#include "flow/UnitTest.h"
|
||||
#include "fdbclient/rapidxml/rapidxml.hpp"
|
||||
#include "fdbclient/FDBAWSCredentialsProvider.h"
|
||||
|
||||
#include "flow/actorcompiler.h" // has to be last include
|
||||
|
||||
using namespace rapidxml;
|
||||
|
@ -82,6 +84,7 @@ S3BlobStoreEndpoint::BlobKnobs::BlobKnobs() {
|
|||
read_cache_blocks_per_file = CLIENT_KNOBS->BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE;
|
||||
max_send_bytes_per_second = CLIENT_KNOBS->BLOBSTORE_MAX_SEND_BYTES_PER_SECOND;
|
||||
max_recv_bytes_per_second = CLIENT_KNOBS->BLOBSTORE_MAX_RECV_BYTES_PER_SECOND;
|
||||
sdk_auth = false;
|
||||
}
|
||||
|
||||
bool S3BlobStoreEndpoint::BlobKnobs::set(StringRef name, int value) {
|
||||
|
@ -118,6 +121,7 @@ bool S3BlobStoreEndpoint::BlobKnobs::set(StringRef name, int value) {
|
|||
TRY_PARAM(read_cache_blocks_per_file, rcb);
|
||||
TRY_PARAM(max_send_bytes_per_second, sbps);
|
||||
TRY_PARAM(max_recv_bytes_per_second, rbps);
|
||||
TRY_PARAM(sdk_auth, sa);
|
||||
#undef TRY_PARAM
|
||||
return false;
|
||||
}
|
||||
|
@ -506,7 +510,38 @@ ACTOR Future<Optional<json_spirit::mObject>> tryReadJSONFile(std::string path) {
|
|||
return Optional<json_spirit::mObject>();
|
||||
}
|
||||
|
||||
// If the credentials expire, the connection will eventually fail and be discarded from the pool, and then a new
|
||||
// connection will be constructed, which will call this again to get updated credentials
|
||||
static S3BlobStoreEndpoint::Credentials getSecretSdk() {
|
||||
#ifdef BUILD_AWS_BACKUP
|
||||
double elapsed = -timer_monotonic();
|
||||
Aws::Auth::AWSCredentials awsCreds = FDBAWSCredentialsProvider::getAwsCredentials();
|
||||
elapsed += timer_monotonic();
|
||||
|
||||
if (awsCreds.IsEmpty()) {
|
||||
TraceEvent(SevWarn, "S3BlobStoreAWSCredsEmpty");
|
||||
throw backup_auth_missing();
|
||||
}
|
||||
|
||||
S3BlobStoreEndpoint::Credentials fdbCreds;
|
||||
fdbCreds.key = awsCreds.GetAWSAccessKeyId();
|
||||
fdbCreds.secret = awsCreds.GetAWSSecretKey();
|
||||
fdbCreds.securityToken = awsCreds.GetSessionToken();
|
||||
|
||||
TraceEvent("S3BlobStoreGotSdkCredentials").suppressFor(60).detail("Duration", elapsed);
|
||||
|
||||
return fdbCreds;
|
||||
#else
|
||||
TraceEvent(SevError, "S3BlobStoreNoSDK");
|
||||
throw backup_auth_missing();
|
||||
#endif
|
||||
}
|
||||
|
||||
ACTOR Future<Void> updateSecret_impl(Reference<S3BlobStoreEndpoint> b) {
|
||||
if (b->knobs.sdk_auth) {
|
||||
b->credentials = getSecretSdk();
|
||||
return Void();
|
||||
}
|
||||
std::vector<std::string>* pFiles = (std::vector<std::string>*)g_network->global(INetwork::enBlobCredentialFiles);
|
||||
if (pFiles == nullptr)
|
||||
return Void();
|
||||
|
@ -538,7 +573,7 @@ ACTOR Future<Void> updateSecret_impl(Reference<S3BlobStoreEndpoint> b) {
|
|||
JSONDoc accounts(doc.last().get_obj());
|
||||
if (accounts.has(credentialsFileKey, false) && accounts.last().type() == json_spirit::obj_type) {
|
||||
JSONDoc account(accounts.last());
|
||||
S3BlobStoreEndpoint::Credentials creds;
|
||||
S3BlobStoreEndpoint::Credentials creds = b->credentials.get();
|
||||
if (b->lookupKey) {
|
||||
std::string apiKey;
|
||||
if (account.tryGet("api_key", apiKey))
|
||||
|
@ -601,7 +636,7 @@ ACTOR Future<S3BlobStoreEndpoint::ReusableConnection> connect_impl(Reference<S3B
|
|||
.detail("RemoteEndpoint", conn->getPeerAddress())
|
||||
.detail("ExpiresIn", b->knobs.max_connection_life);
|
||||
|
||||
if (b->lookupKey || b->lookupSecret)
|
||||
if (b->lookupKey || b->lookupSecret || b->knobs.sdk_auth)
|
||||
wait(b->updateSecret());
|
||||
|
||||
return S3BlobStoreEndpoint::ReusableConnection({ conn, now() + b->knobs.max_connection_life });
|
||||
|
|
|
@ -59,7 +59,7 @@ public:
|
|||
delete_requests_per_second, multipart_max_part_size, multipart_min_part_size, concurrent_requests,
|
||||
concurrent_uploads, concurrent_lists, concurrent_reads_per_file, concurrent_writes_per_file,
|
||||
read_block_size, read_ahead_blocks, read_cache_blocks_per_file, max_send_bytes_per_second,
|
||||
max_recv_bytes_per_second;
|
||||
max_recv_bytes_per_second, sdk_auth;
|
||||
bool set(StringRef name, int value);
|
||||
std::string getURLParameters() const;
|
||||
static std::vector<std::string> getKnobDescriptions() {
|
||||
|
@ -91,7 +91,9 @@ public:
|
|||
"read_cache_blocks_per_file (or rcb) Size of the read cache for a file in blocks.",
|
||||
"max_send_bytes_per_second (or sbps) Max send bytes per second for all requests combined.",
|
||||
"max_recv_bytes_per_second (or rbps) Max receive bytes per second for all requests combined (NOT YET "
|
||||
"USED)."
|
||||
"USED).",
|
||||
"sdk_auth (or sa) Use AWS SDK to resolve credentials. Only valid if "
|
||||
"BUILD_AWS_BACKUP is enabled."
|
||||
};
|
||||
}
|
||||
};
|
||||
|
|
|
@ -828,6 +828,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
|
|||
|
||||
// encrypt key proxy
|
||||
init( ENABLE_ENCRYPTION, false );
|
||||
init( ENCRYPTION_MODE, "AES-256-CTR");
|
||||
|
||||
// Blob granules
|
||||
init( BG_URL, isSimulated ? "file://fdbblob/" : "" ); // TODO: store in system key space or something, eventually
|
||||
|
|
|
@ -775,8 +775,9 @@ public:
|
|||
// Cluster recovery
|
||||
std::string CLUSTER_RECOVERY_EVENT_NAME_PREFIX;
|
||||
|
||||
// encrypt key proxy
|
||||
// Encryption
|
||||
bool ENABLE_ENCRYPTION;
|
||||
std::string ENCRYPTION_MODE;
|
||||
|
||||
// blob granule stuff
|
||||
// FIXME: configure url with database configuration instead of knob eventually
|
||||
|
|
|
@ -29,14 +29,7 @@ static std::map<NetworkAddress, std::pair<Reference<EvictablePageCache>, Referen
|
|||
|
||||
EvictablePage::~EvictablePage() {
|
||||
if (data) {
|
||||
#if defined(USE_JEMALLOC)
|
||||
aligned_free(data);
|
||||
#else
|
||||
if (pageCache->pageSize == 4096)
|
||||
FastAllocator<4096>::release(data);
|
||||
else
|
||||
aligned_free(data);
|
||||
#endif
|
||||
freeFast4kAligned(pageCache->pageSize, data);
|
||||
}
|
||||
if (EvictablePageCache::RANDOM == pageCache->cacheEvictionType) {
|
||||
if (index > -1) {
|
||||
|
@ -173,14 +166,7 @@ void AsyncFileCached::releaseZeroCopy(void* data, int length, int64_t offset) {
|
|||
if (o != orphanedPages.end()) {
|
||||
if (o->second == 1) {
|
||||
if (data) {
|
||||
#if defined(USE_JEMALLOC)
|
||||
aligned_free(data);
|
||||
#else
|
||||
if (length == 4096)
|
||||
FastAllocator<4096>::release(data);
|
||||
else
|
||||
aligned_free(data);
|
||||
#endif
|
||||
freeFast4kAligned(length, data);
|
||||
}
|
||||
} else {
|
||||
--o->second;
|
||||
|
|
|
@ -79,14 +79,9 @@ struct EvictablePageCache : ReferenceCounted<EvictablePageCache> {
|
|||
void allocate(EvictablePage* page) {
|
||||
try_evict();
|
||||
try_evict();
|
||||
#if defined(USE_JEMALLOC)
|
||||
page->data = aligned_alloc(4096, pageSize);
|
||||
#else
|
||||
page->data = pageSize == 4096 ? FastAllocator<4096>::allocate() : aligned_alloc(4096, pageSize);
|
||||
#endif
|
||||
if (page->data == nullptr) {
|
||||
platform::outOfMemory();
|
||||
}
|
||||
|
||||
page->data = allocateFast4kAligned(pageSize);
|
||||
|
||||
if (RANDOM == cacheEvictionType) {
|
||||
page->index = pages.size();
|
||||
pages.push_back(page);
|
||||
|
@ -394,14 +389,7 @@ struct AFCPage : public EvictablePage, public FastAllocated<AFCPage> {
|
|||
owner->orphanedPages[data] = zeroCopyRefCount;
|
||||
zeroCopyRefCount = 0;
|
||||
notReading = Void();
|
||||
#if defined(USE_JEMALLOC)
|
||||
data = aligned_alloc(4096, pageCache->pageSize);
|
||||
#else
|
||||
data = pageCache->pageSize == 4096 ? FastAllocator<4096>::allocate() : aligned_alloc(4096, pageCache->pageSize);
|
||||
#endif
|
||||
if (data == nullptr) {
|
||||
platform::outOfMemory();
|
||||
}
|
||||
data = allocateFast4kAligned(pageCache->pageSize);
|
||||
}
|
||||
|
||||
Future<Void> write(void const* data, int length, int offset) {
|
||||
|
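The allocateFast4kAligned / freeFast4kAligned helpers used above are defined in flow, not in this diff; a sketch of what they presumably wrap, inferred from the inline branches they replace (out-of-memory handling omitted, the real helpers may do it internally):

// Inferred shape of the helpers, matching the removed inline branches: 4096-byte
// pages go through FastAllocator<4096>, other sizes through aligned_alloc/aligned_free,
// and jemalloc builds always use the aligned allocator.
inline void* allocateFast4kAligned(int size) {
#if defined(USE_JEMALLOC)
    return aligned_alloc(4096, size);
#else
    return size == 4096 ? FastAllocator<4096>::allocate() : aligned_alloc(4096, size);
#endif
}

inline void freeFast4kAligned(int size, void* ptr) {
#if defined(USE_JEMALLOC)
    aligned_free(ptr);
#else
    if (size == 4096)
        FastAllocator<4096>::release(ptr);
    else
        aligned_free(ptr);
#endif
}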
|
|
@ -18,6 +18,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "contrib/fmt-8.1.1/include/fmt/format.h"
|
||||
#include "fdbclient/SystemData.h"
|
||||
#include "fdbclient/BlobGranuleCommon.h"
|
||||
#include "fdbserver/BlobGranuleServerCommon.actor.h"
|
||||
|
@ -25,6 +26,7 @@
|
|||
#include "fdbclient/FDBTypes.h"
|
||||
#include "fdbclient/ReadYourWrites.h"
|
||||
#include "flow/Arena.h"
|
||||
#include "flow/UnitTest.h"
|
||||
#include "flow/actorcompiler.h" // has to be last include
|
||||
|
||||
// Gets the latest granule history node for range that was persisted
|
||||
|
@ -102,3 +104,252 @@ ACTOR Future<GranuleFiles> loadHistoryFiles(Database cx, UID granuleID) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Normally a beginVersion != 0 means the caller wants all mutations between beginVersion and readVersion, instead of
// the latest snapshot before readVersion + deltas after the snapshot. When canCollapse is set, the beginVersion is
// essentially just an optimization hint. The caller is still concerned with reconstructing rows at readVersion, it just
// knows it doesn't need anything before beginVersion.
// Normally this can eliminate the need for a snapshot and just return a small amount of deltas. But in a highly active
// key range, the granule may have a snapshot file at version X, where beginVersion < X <= readVersion. In this case, if
// the number of bytes in delta files between beginVersion and X is larger than the snapshot file at version X, it is
// strictly more efficient (in terms of files and bytes read) to just use the snapshot file at version X instead.
void GranuleFiles::getFiles(Version beginVersion,
                            Version readVersion,
                            bool canCollapse,
                            BlobGranuleChunkRef& chunk,
                            Arena& replyArena,
                            int64_t& deltaBytesCounter) const {
|
||||
BlobFileIndex dummyIndex; // for searching
|
||||
|
||||
// if beginVersion == 0 or we can collapse, find the latest snapshot <= readVersion
|
||||
auto snapshotF = snapshotFiles.end();
|
||||
if (beginVersion == 0 || canCollapse) {
|
||||
dummyIndex.version = readVersion;
|
||||
snapshotF = std::lower_bound(snapshotFiles.begin(), snapshotFiles.end(), dummyIndex);
|
||||
if (snapshotF == snapshotFiles.end() || snapshotF->version > readVersion) {
|
||||
ASSERT(snapshotF != snapshotFiles.begin());
|
||||
snapshotF--;
|
||||
}
|
||||
ASSERT(snapshotF != snapshotFiles.end());
|
||||
ASSERT(snapshotF->version <= readVersion);
|
||||
}
|
||||
|
||||
auto deltaF = deltaFiles.end();
|
||||
if (beginVersion > 0) {
|
||||
dummyIndex.version = beginVersion;
|
||||
deltaF = std::lower_bound(deltaFiles.begin(), deltaFiles.end(), dummyIndex);
|
||||
if (canCollapse) {
|
||||
ASSERT(snapshotF != snapshotFiles.end());
|
||||
// If we can collapse, see if delta files up to snapshotVersion are smaller or larger than snapshotBytes in
|
||||
// total
|
||||
auto deltaFCopy = deltaF;
|
||||
int64_t snapshotBytes = snapshotF->length;
|
||||
while (deltaFCopy != deltaFiles.end() && deltaFCopy->version <= snapshotF->version && snapshotBytes > 0) {
|
||||
snapshotBytes -= deltaFCopy->length;
|
||||
deltaFCopy++;
|
||||
}
|
||||
// if delta files contain the same or more bytes as the snapshot with collapse, do the collapse
|
||||
if (snapshotBytes > 0) {
|
||||
// don't collapse, clear snapshotF and just do delta files
|
||||
snapshotF = snapshotFiles.end();
|
||||
} else {
|
||||
// do snapshot instead of previous deltas
|
||||
dummyIndex.version = snapshotF->version;
|
||||
deltaF = std::upper_bound(deltaFiles.begin(), deltaFiles.end(), dummyIndex);
|
||||
ASSERT(deltaF == deltaFiles.end() || deltaF->version > snapshotF->version);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
dummyIndex.version = snapshotF->version;
|
||||
deltaF = std::upper_bound(deltaFiles.begin(), deltaFiles.end(), dummyIndex);
|
||||
ASSERT(deltaF == deltaFiles.end() || deltaF->version > snapshotF->version);
|
||||
}
|
||||
|
||||
Version lastIncluded = invalidVersion;
|
||||
if (snapshotF != snapshotFiles.end()) {
|
||||
chunk.snapshotVersion = snapshotF->version;
|
||||
chunk.snapshotFile = BlobFilePointerRef(replyArena, snapshotF->filename, snapshotF->offset, snapshotF->length);
|
||||
lastIncluded = chunk.snapshotVersion;
|
||||
} else {
|
||||
chunk.snapshotVersion = invalidVersion;
|
||||
}
|
||||
|
||||
while (deltaF != deltaFiles.end() && deltaF->version < readVersion) {
|
||||
chunk.deltaFiles.emplace_back_deep(replyArena, deltaF->filename, deltaF->offset, deltaF->length);
|
||||
deltaBytesCounter += deltaF->length;
|
||||
ASSERT(lastIncluded < deltaF->version);
|
||||
lastIncluded = deltaF->version;
|
||||
deltaF++;
|
||||
}
|
||||
// include last delta file that passes readVersion, if it exists
|
||||
if (deltaF != deltaFiles.end() && lastIncluded < readVersion) {
|
||||
chunk.deltaFiles.emplace_back_deep(replyArena, deltaF->filename, deltaF->offset, deltaF->length);
|
||||
deltaBytesCounter += deltaF->length;
|
||||
lastIncluded = deltaF->version;
|
||||
}
|
||||
}
|
||||
|
||||
static std::string makeTestFileName(Version v) {
|
||||
return "test" + std::to_string(v);
|
||||
}
|
||||
|
||||
static BlobFileIndex makeTestFile(Version v, int64_t len) {
|
||||
return BlobFileIndex(v, makeTestFileName(v), 0, len);
|
||||
}
|
||||
|
||||
static void checkFile(int expectedVersion, const BlobFilePointerRef& actualFile) {
|
||||
ASSERT(makeTestFileName(expectedVersion) == actualFile.filename.toString());
|
||||
}
|
||||
|
||||
static void checkFiles(const GranuleFiles& f,
|
||||
Version beginVersion,
|
||||
Version readVersion,
|
||||
bool canCollapse,
|
||||
Optional<int> expectedSnapshotVersion,
|
||||
std::vector<int> expectedDeltaVersions) {
|
||||
Arena a;
|
||||
BlobGranuleChunkRef chunk;
|
||||
int64_t deltaBytes = 0;
|
||||
f.getFiles(beginVersion, readVersion, canCollapse, chunk, a, deltaBytes);
|
||||
fmt::print("results({0}, {1}, {2}):\nEXPECTED:\n snapshot={3}\n deltas ({4}):\n",
|
||||
beginVersion,
|
||||
readVersion,
|
||||
canCollapse ? "T" : "F",
|
||||
expectedSnapshotVersion.present() ? makeTestFileName(expectedSnapshotVersion.get()).c_str() : "<N/A>",
|
||||
expectedDeltaVersions.size());
|
||||
for (int d : expectedDeltaVersions) {
|
||||
fmt::print(" {}\n", makeTestFileName(d));
|
||||
}
|
||||
fmt::print("ACTUAL:\n snapshot={0}\n deltas ({1}):\n",
|
||||
chunk.snapshotFile.present() ? chunk.snapshotFile.get().filename.toString().c_str() : "<N/A>",
|
||||
chunk.deltaFiles.size());
|
||||
for (auto& it : chunk.deltaFiles) {
|
||||
fmt::print(" {}\n", it.filename.toString());
|
||||
}
|
||||
printf("\n\n\n");
|
||||
ASSERT(expectedSnapshotVersion.present() == chunk.snapshotFile.present());
|
||||
if (expectedSnapshotVersion.present()) {
|
||||
checkFile(expectedSnapshotVersion.get(), chunk.snapshotFile.get());
|
||||
}
|
||||
ASSERT(expectedDeltaVersions.size() == chunk.deltaFiles.size());
|
||||
for (int i = 0; i < expectedDeltaVersions.size(); i++) {
|
||||
checkFile(expectedDeltaVersions[i], chunk.deltaFiles[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Files:
|
||||
* S @ 100 (10 bytes)
|
||||
* D @ 150 (5 bytes)
|
||||
* D @ 200 (6 bytes)
|
||||
* S @ 200 (15 bytes)
|
||||
* D @ 250 (7 bytes)
|
||||
* D @ 300 (8 bytes)
|
||||
* S @ 300 (10 bytes)
|
||||
* D @ 350 (4 bytes)
|
||||
*/
|
||||
TEST_CASE("/blobgranule/server/common/granulefiles") {
|
||||
// simple cases first
|
||||
|
||||
// single snapshot file, no deltas
|
||||
GranuleFiles files;
|
||||
files.snapshotFiles.push_back(makeTestFile(100, 10));
|
||||
|
||||
printf("Just snapshot\n");
|
||||
|
||||
checkFiles(files, 0, 100, false, 100, {});
|
||||
checkFiles(files, 0, 200, false, 100, {});
|
||||
|
||||
printf("Small test\n");
|
||||
// add delta files with re-snapshot at end
|
||||
files.deltaFiles.push_back(makeTestFile(150, 5));
|
||||
files.deltaFiles.push_back(makeTestFile(200, 6));
|
||||
files.snapshotFiles.push_back(makeTestFile(200, 15));
|
||||
|
||||
// check different read versions with beginVersion=0
|
||||
checkFiles(files, 0, 100, false, 100, {});
|
||||
checkFiles(files, 0, 101, false, 100, { 150 });
|
||||
checkFiles(files, 0, 149, false, 100, { 150 });
|
||||
checkFiles(files, 0, 150, false, 100, { 150 });
|
||||
checkFiles(files, 0, 151, false, 100, { 150, 200 });
|
||||
checkFiles(files, 0, 199, false, 100, { 150, 200 });
|
||||
checkFiles(files, 0, 200, false, 200, {});
|
||||
checkFiles(files, 0, 300, false, 200, {});
|
||||
|
||||
// Test all cases of beginVersion + readVersion. Because delta files are smaller than snapshot at 200, this should
|
||||
// be the same with and without collapse
|
||||
checkFiles(files, 100, 200, false, Optional<int>(), { 150, 200 });
|
||||
checkFiles(files, 100, 300, false, Optional<int>(), { 150, 200 });
|
||||
checkFiles(files, 101, 199, false, Optional<int>(), { 150, 200 });
|
||||
checkFiles(files, 149, 151, false, Optional<int>(), { 150, 200 });
|
||||
checkFiles(files, 149, 150, false, Optional<int>(), { 150 });
|
||||
checkFiles(files, 150, 151, false, Optional<int>(), { 150, 200 });
|
||||
checkFiles(files, 151, 200, false, Optional<int>(), { 200 });
|
||||
|
||||
checkFiles(files, 100, 200, true, Optional<int>(), { 150, 200 });
|
||||
checkFiles(files, 100, 300, true, Optional<int>(), { 150, 200 });
|
||||
checkFiles(files, 101, 199, true, Optional<int>(), { 150, 200 });
|
||||
checkFiles(files, 149, 151, true, Optional<int>(), { 150, 200 });
|
||||
checkFiles(files, 149, 150, true, Optional<int>(), { 150 });
|
||||
checkFiles(files, 150, 151, true, Optional<int>(), { 150, 200 });
|
||||
checkFiles(files, 151, 200, true, Optional<int>(), { 200 });
|
||||
|
||||
printf("Larger test\n");
|
||||
// add more delta files and snapshots to check collapse logic
|
||||
files.deltaFiles.push_back(makeTestFile(250, 7));
|
||||
files.deltaFiles.push_back(makeTestFile(300, 8));
|
||||
files.snapshotFiles.push_back(makeTestFile(300, 10));
|
||||
files.deltaFiles.push_back(makeTestFile(350, 4));
|
||||
|
||||
checkFiles(files, 0, 300, false, 300, {});
|
||||
checkFiles(files, 0, 301, false, 300, { 350 });
|
||||
checkFiles(files, 0, 400, false, 300, { 350 });
|
||||
|
||||
// check delta files without collapse
|
||||
|
||||
checkFiles(files, 100, 301, false, Optional<int>(), { 150, 200, 250, 300, 350 });
|
||||
checkFiles(files, 100, 300, false, Optional<int>(), { 150, 200, 250, 300 });
|
||||
checkFiles(files, 100, 251, false, Optional<int>(), { 150, 200, 250, 300 });
|
||||
checkFiles(files, 100, 250, false, Optional<int>(), { 150, 200, 250 });
|
||||
|
||||
checkFiles(files, 151, 300, false, Optional<int>(), { 200, 250, 300 });
|
||||
checkFiles(files, 151, 301, false, Optional<int>(), { 200, 250, 300, 350 });
|
||||
checkFiles(files, 151, 400, false, Optional<int>(), { 200, 250, 300, 350 });
|
||||
|
||||
checkFiles(files, 201, 300, false, Optional<int>(), { 250, 300 });
|
||||
checkFiles(files, 201, 301, false, Optional<int>(), { 250, 300, 350 });
|
||||
checkFiles(files, 201, 400, false, Optional<int>(), { 250, 300, 350 });
|
||||
|
||||
checkFiles(files, 251, 300, false, Optional<int>(), { 300 });
|
||||
checkFiles(files, 251, 301, false, Optional<int>(), { 300, 350 });
|
||||
checkFiles(files, 251, 400, false, Optional<int>(), { 300, 350 });
|
||||
checkFiles(files, 301, 400, false, Optional<int>(), { 350 });
|
||||
checkFiles(files, 351, 400, false, Optional<int>(), {});
|
||||
|
||||
// check with collapse
|
||||
// these 2 collapse because the delta files at 150+200+250+300 are larger than the snapshot at 300
|
||||
checkFiles(files, 100, 301, true, 300, { 350 });
|
||||
checkFiles(files, 100, 300, true, 300, {});
|
||||
// these 2 don't collapse because 150+200 delta files are smaller than the snapshot at 200
|
||||
checkFiles(files, 100, 251, true, Optional<int>(), { 150, 200, 250, 300 });
|
||||
checkFiles(files, 100, 250, true, Optional<int>(), { 150, 200, 250 });
|
||||
|
||||
// these 3 do collapse because the delta files at 200+250+300 are larger than the snapshot at 300
|
||||
checkFiles(files, 151, 300, true, 300, {});
|
||||
checkFiles(files, 151, 301, true, 300, { 350 });
|
||||
checkFiles(files, 151, 400, true, 300, { 350 });
|
||||
|
||||
// these 3 do collapse because the delta files at 250+300 are larger than the snapshot at 300
|
||||
checkFiles(files, 201, 300, true, 300, {});
|
||||
checkFiles(files, 201, 301, true, 300, { 350 });
|
||||
checkFiles(files, 201, 400, true, 300, { 350 });
|
||||
|
||||
// these don't collapse because the delta file at 300 is smaller than the snapshot at 300
|
||||
checkFiles(files, 251, 300, true, Optional<int>(), { 300 });
|
||||
checkFiles(files, 251, 301, true, Optional<int>(), { 300, 350 });
|
||||
checkFiles(files, 251, 400, true, Optional<int>(), { 300, 350 });
|
||||
checkFiles(files, 301, 400, true, Optional<int>(), { 350 });
|
||||
checkFiles(files, 351, 400, true, Optional<int>(), {});
|
||||
|
||||
return Void();
|
||||
}
|
|
@ -54,12 +54,23 @@ struct BlobFileIndex {
|
|||
|
||||
BlobFileIndex(Version version, std::string filename, int64_t offset, int64_t length)
|
||||
: version(version), filename(filename), offset(offset), length(length) {}
|
||||
|
||||
// compare on version
|
||||
bool operator<(const BlobFileIndex& r) const { return version < r.version; }
|
||||
};
|
||||
|
||||
// FIXME: initialize these to smaller default sizes to save a bit of memory, particularly snapshotFiles
|
||||
// Stores the files that comprise a blob granule
|
||||
struct GranuleFiles {
|
||||
std::deque<BlobFileIndex> snapshotFiles;
|
||||
std::deque<BlobFileIndex> deltaFiles;
|
||||
std::vector<BlobFileIndex> snapshotFiles;
|
||||
std::vector<BlobFileIndex> deltaFiles;
|
||||
|
||||
void getFiles(Version beginVersion,
|
||||
Version readVersion,
|
||||
bool canCollapse,
|
||||
BlobGranuleChunkRef& chunk,
|
||||
Arena& replyArena,
|
||||
int64_t& deltaBytesCounter) const;
|
||||
};
|
||||
|
||||
class Transaction;
|
||||
|
|
|
@ -2778,7 +2778,7 @@ ACTOR Future<Void> blobManager(BlobManagerInterface bmInterf,
|
|||
// DB has [A - B) and [C - D). They should show up in knownBlobRanges, and [B - C) should be in removed.
|
||||
// DB has [B - C). It should show up in knownBlobRanges, [B - C) should be in added, and [A - B) and [C - D)
|
||||
// should be in removed.
|
||||
TEST_CASE(":/blobmanager/updateranges") {
|
||||
TEST_CASE("/blobmanager/updateranges") {
|
||||
KeyRangeMap<bool> knownBlobRanges(false, normalKeys.end);
|
||||
Arena ar;
|
||||
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <limits>
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
@ -43,9 +44,10 @@
|
|||
#include "flow/Error.h"
|
||||
#include "flow/IRandom.h"
|
||||
#include "flow/Trace.h"
|
||||
#include "flow/actorcompiler.h" // has to be last include
|
||||
#include "flow/network.h"
|
||||
|
||||
#include "flow/actorcompiler.h" // has to be last include
|
||||
|
||||
#define BW_DEBUG false
|
||||
#define BW_REQUEST_DEBUG false
|
||||
|
||||
|
@ -832,7 +834,7 @@ ACTOR Future<BlobFileIndex> compactFromBlob(Reference<BlobWorkerData> bwData,
|
|||
rowsStream,
|
||||
false);
|
||||
RangeResult newGranule =
|
||||
wait(readBlobGranule(chunk, metadata->keyRange, version, bwData->bstore, &bwData->stats));
|
||||
wait(readBlobGranule(chunk, metadata->keyRange, 0, version, bwData->bstore, &bwData->stats));
|
||||
|
||||
bwData->stats.bytesReadFromS3ForCompaction += compactBytesRead;
|
||||
rowsStream.send(std::move(newGranule));
|
||||
|
@ -2093,16 +2095,25 @@ ACTOR Future<Void> waitForVersion(Reference<GranuleMetadata> metadata, Version v
|
|||
|
||||
ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, BlobGranuleFileRequest req) {
|
||||
if (BW_REQUEST_DEBUG) {
|
||||
fmt::print("BW {0} processing blobGranuleFileRequest for range [{1} - {2}) @ {3}\n",
|
||||
fmt::print("BW {0} processing blobGranuleFileRequest for range [{1} - {2}) @ ",
|
||||
bwData->id.toString(),
|
||||
req.keyRange.begin.printable(),
|
||||
req.keyRange.end.printable(),
|
||||
req.readVersion);
|
||||
if (req.beginVersion > 0) {
|
||||
fmt::print("{0} - {1}\n", req.beginVersion, req.readVersion);
|
||||
} else {
|
||||
fmt::print("{}", req.readVersion);
|
||||
}
|
||||
}
|
||||
|
||||
state bool didCollapse = false;
|
||||
try {
|
||||
// TODO REMOVE in api V2
|
||||
ASSERT(req.beginVersion == 0);
|
||||
// TODO remove requirement for canCollapseBegin once we implement early replying
|
||||
ASSERT(req.beginVersion == 0 || req.canCollapseBegin);
|
||||
if (req.beginVersion != 0) {
|
||||
ASSERT(req.beginVersion > 0);
|
||||
}
|
||||
state BlobGranuleFileReply rep;
|
||||
state std::vector<Reference<GranuleMetadata>> granules;
|
||||
|
||||
|
@ -2150,6 +2161,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
continue;
|
||||
}
|
||||
state Reference<GranuleMetadata> metadata = m;
|
||||
state Version granuleBeginVersion = req.beginVersion;
|
||||
|
||||
choose {
|
||||
when(wait(metadata->readable.getFuture())) {}
|
||||
|
@ -2290,67 +2302,30 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
// granule is up to date, do read
|
||||
ASSERT(metadata->cancelled.canBeSet());
|
||||
|
||||
// Right now we force a collapse if the version range crosses granule boundaries, for simplicity
|
||||
if (granuleBeginVersion <= chunkFiles.snapshotFiles.front().version) {
|
||||
TEST(true); // collapsed begin version request because of boundaries
|
||||
didCollapse = true;
|
||||
granuleBeginVersion = 0;
|
||||
}
|
||||
BlobGranuleChunkRef chunk;
|
||||
// TODO change in V2
|
||||
// TODO change with early reply
|
||||
chunk.includedVersion = req.readVersion;
|
||||
chunk.keyRange = KeyRangeRef(StringRef(rep.arena, chunkRange.begin), StringRef(rep.arena, chunkRange.end));
|
||||
|
||||
// handle snapshot files
|
||||
// TODO refactor the "find snapshot file" logic to GranuleFiles?
|
||||
// FIXME: binary search instead of linear search, especially when file count is large
|
||||
int i = chunkFiles.snapshotFiles.size() - 1;
|
||||
while (i >= 0 && chunkFiles.snapshotFiles[i].version > req.readVersion) {
|
||||
i--;
|
||||
}
|
||||
// because of granule history, we should always be able to find the desired snapshot
|
||||
// version, and have thrown blob_granule_transaction_too_old earlier if not possible.
|
||||
if (i < 0) {
|
||||
fmt::print("req @ {0} >= initial snapshot {1} but can't find snapshot in ({2}) files:\n",
|
||||
req.readVersion,
|
||||
metadata->initialSnapshotVersion,
|
||||
chunkFiles.snapshotFiles.size());
|
||||
for (auto& f : chunkFiles.snapshotFiles) {
|
||||
fmt::print(" {0}", f.version);
|
||||
}
|
||||
}
|
||||
ASSERT(i >= 0);
|
||||
|
||||
BlobFileIndex snapshotF = chunkFiles.snapshotFiles[i];
|
||||
chunk.snapshotFile = BlobFilePointerRef(rep.arena, snapshotF.filename, snapshotF.offset, snapshotF.length);
|
||||
Version snapshotVersion = chunkFiles.snapshotFiles[i].version;
|
||||
chunk.snapshotVersion = snapshotVersion;
|
||||
|
||||
// handle delta files
|
||||
// cast this to an int so i going to -1 still compares properly
|
||||
int lastDeltaFileIdx = chunkFiles.deltaFiles.size() - 1;
|
||||
i = lastDeltaFileIdx;
|
||||
// skip delta files that are too new
|
||||
while (i >= 0 && chunkFiles.deltaFiles[i].version > req.readVersion) {
|
||||
i--;
|
||||
}
|
||||
if (i < lastDeltaFileIdx) {
|
||||
// we skipped one file at the end with a larger read version, this will actually contain
|
||||
// our query version, so add it back.
|
||||
i++;
|
||||
}
|
||||
// only include delta files after the snapshot file
|
||||
int j = i;
|
||||
while (j >= 0 && chunkFiles.deltaFiles[j].version > snapshotVersion) {
|
||||
j--;
|
||||
}
|
||||
j++;
|
||||
while (j <= i) {
|
||||
BlobFileIndex deltaF = chunkFiles.deltaFiles[j];
|
||||
chunk.deltaFiles.emplace_back_deep(rep.arena, deltaF.filename, deltaF.offset, deltaF.length);
|
||||
bwData->stats.readReqDeltaBytesReturned += deltaF.length;
|
||||
j++;
|
||||
int64_t deltaBytes = 0;
|
||||
chunkFiles.getFiles(
|
||||
granuleBeginVersion, req.readVersion, req.canCollapseBegin, chunk, rep.arena, deltaBytes);
|
||||
bwData->stats.readReqDeltaBytesReturned += deltaBytes;
|
||||
if (granuleBeginVersion > 0 && chunk.snapshotFile.present()) {
|
||||
TEST(true); // collapsed begin version request for efficiency
|
||||
didCollapse = true;
|
||||
}
|
||||
|
||||
// new deltas (if version is larger than version of last delta file)
|
||||
// FIXME: do trivial key bounds here if key range is not fully contained in request key
|
||||
// range
|
||||
|
||||
if (req.readVersion > metadata->durableDeltaVersion.get()) {
|
||||
if (req.readVersion > metadata->durableDeltaVersion.get() && !metadata->currentDeltas.empty()) {
|
||||
if (metadata->durableDeltaVersion.get() != metadata->pendingDeltaVersion) {
|
||||
fmt::print("real-time read [{0} - {1}) @ {2} doesn't have mutations!! durable={3}, pending={4}\n",
|
||||
metadata->keyRange.begin.printable(),
|
||||
|
@ -2359,13 +2334,32 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
metadata->durableDeltaVersion.get(),
|
||||
metadata->pendingDeltaVersion);
|
||||
}
|
||||
|
||||
// prune mutations based on begin version, if possible
|
||||
ASSERT(metadata->durableDeltaVersion.get() == metadata->pendingDeltaVersion);
|
||||
// FIXME: I think we can remove this dependsOn since we are doing push_back_deep
|
||||
rep.arena.dependsOn(metadata->currentDeltas.arena());
|
||||
for (auto& delta : metadata->currentDeltas) {
|
||||
if (delta.version > req.readVersion) {
|
||||
MutationsAndVersionRef* mutationIt = metadata->currentDeltas.begin();
|
||||
if (granuleBeginVersion > metadata->currentDeltas.back().version) {
|
||||
TEST(true); // beginVersion pruning all in-memory mutations
|
||||
mutationIt = metadata->currentDeltas.end();
|
||||
} else if (granuleBeginVersion > metadata->currentDeltas.front().version) {
|
||||
// binary search for beginVersion
|
||||
TEST(true); // beginVersion pruning some in-memory mutations
|
||||
mutationIt = std::lower_bound(metadata->currentDeltas.begin(),
|
||||
metadata->currentDeltas.end(),
|
||||
MutationsAndVersionRef(granuleBeginVersion, 0),
|
||||
MutationsAndVersionRef::OrderByVersion());
|
||||
}
|
||||
|
||||
// add mutations to response
|
||||
while (mutationIt != metadata->currentDeltas.end()) {
|
||||
if (mutationIt->version > req.readVersion) {
|
||||
TEST(true); // readVersion pruning some in-memory mutations
|
||||
break;
|
||||
}
|
||||
chunk.newDeltas.push_back_deep(rep.arena, delta);
|
||||
chunk.newDeltas.push_back_deep(rep.arena, *mutationIt);
|
||||
mutationIt++;
|
||||
}
|
||||
}
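// A minimal standalone sketch of the begin-version pruning pattern above: currentDeltas is ordered by version,
// so std::lower_bound with an order-by-version comparator finds the first in-memory mutation batch at or after
// granuleBeginVersion. Simplified types here, not the real MutationsAndVersionRef.
#include <algorithm>
#include <cstdint>
#include <vector>

using Version = int64_t;

struct VersionedMutations {
    Version version;
    // mutations for this version would live here
};

// Entries before the returned iterator are pruned from the reply;
// entries past readVersion are cut off by the read-version check above.
inline std::vector<VersionedMutations>::const_iterator firstAtOrAfter(const std::vector<VersionedMutations>& deltas,
                                                                      Version beginVersion) {
    return std::lower_bound(deltas.begin(), deltas.end(), beginVersion,
                            [](const VersionedMutations& m, Version v) { return m.version < v; });
}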
|
||||
|
||||
|
@ -2376,11 +2370,17 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
|
|||
|
||||
wait(yield(TaskPriority::DefaultEndpoint));
|
||||
}
|
||||
// do these together to keep them synchronous
|
||||
if (req.beginVersion != 0) {
|
||||
++bwData->stats.readRequestsWithBegin;
|
||||
}
|
||||
if (didCollapse) {
|
||||
++bwData->stats.readRequestsCollapsed;
|
||||
}
|
||||
ASSERT(!req.reply.isSet());
|
||||
req.reply.send(rep);
|
||||
--bwData->stats.activeReadRequests;
|
||||
} catch (Error& e) {
|
||||
// fmt::print("Error in BGFRequest {0}\n", e.name());
|
||||
if (e.code() == error_code_operation_cancelled) {
|
||||
req.reply.sendError(wrong_shard_server());
|
||||
throw;
@ -2475,11 +2475,12 @@ ACTOR Future<Void> workerHealthMonitor(ClusterControllerData* self) {
|
|||
}
|
||||
}
|
||||
|
||||
ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
|
||||
ACTOR Future<Void> clusterControllerCore(Reference<IClusterConnectionRecord> connRecord,
|
||||
ClusterControllerFullInterface interf,
|
||||
Future<Void> leaderFail,
|
||||
ServerCoordinators coordinators,
|
||||
LocalityData locality,
|
||||
ConfigDBType configDBType) {
|
||||
state ServerCoordinators coordinators(connRecord);
|
||||
state ClusterControllerData self(interf, locality, coordinators);
|
||||
state ConfigBroadcaster configBroadcaster(coordinators, configDBType);
|
||||
state Future<Void> coordinationPingDelay = delay(SERVER_KNOBS->WORKER_COORDINATION_PING_DELAY);
|
||||
|
@ -2612,7 +2613,7 @@ ACTOR Future<Void> replaceInterface(ClusterControllerFullInterface interf) {
|
|||
}
|
||||
}
|
||||
|
||||
ACTOR Future<Void> clusterController(ServerCoordinators coordinators,
|
||||
ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRecord,
|
||||
Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> currentCC,
|
||||
bool hasConnected,
|
||||
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo,
|
||||
|
@ -2623,9 +2624,10 @@ ACTOR Future<Void> clusterController(ServerCoordinators coordinators,
|
|||
state bool inRole = false;
|
||||
cci.initEndpoints();
|
||||
try {
|
||||
wait(connRecord->resolveHostnames());
|
||||
// Register as a possible leader; wait to be elected
|
||||
state Future<Void> leaderFail =
|
||||
tryBecomeLeader(coordinators, cci, currentCC, hasConnected, asyncPriorityInfo);
|
||||
tryBecomeLeader(connRecord, cci, currentCC, hasConnected, asyncPriorityInfo);
|
||||
state Future<Void> shouldReplace = replaceInterface(cci);
|
||||
|
||||
while (!currentCC->get().present() || currentCC->get().get() != cci) {
|
||||
|
@ -2644,7 +2646,7 @@ ACTOR Future<Void> clusterController(ServerCoordinators coordinators,
|
|||
startRole(Role::CLUSTER_CONTROLLER, cci.id(), UID());
|
||||
inRole = true;
|
||||
|
||||
wait(clusterControllerCore(cci, leaderFail, coordinators, locality, configDBType));
|
||||
wait(clusterControllerCore(connRecord, cci, leaderFail, locality, configDBType));
|
||||
}
|
||||
} catch (Error& e) {
|
||||
if (inRole)
|
||||
|
@ -2673,15 +2675,12 @@ ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRec
|
|||
state bool hasConnected = false;
|
||||
loop {
|
||||
try {
|
||||
wait(connRecord->resolveHostnames());
|
||||
ServerCoordinators coordinators(connRecord);
|
||||
wait(clusterController(coordinators, currentCC, hasConnected, asyncPriorityInfo, locality, configDBType));
|
||||
wait(clusterController(connRecord, currentCC, hasConnected, asyncPriorityInfo, locality, configDBType));
|
||||
hasConnected = true;
|
||||
} catch (Error& e) {
|
||||
if (e.code() != error_code_coordinators_changed)
|
||||
throw; // Expected to terminate fdbserver
|
||||
}
|
||||
|
||||
hasConnected = true;
|
||||
}
|
||||
}
@ -50,11 +50,12 @@ struct RelocateData {
|
|||
std::vector<UID> completeSources;
|
||||
std::vector<UID> completeDests;
|
||||
bool wantsNewServers;
|
||||
bool cancellable;
|
||||
TraceInterval interval;
|
||||
|
||||
RelocateData()
|
||||
: priority(-1), boundaryPriority(-1), healthPriority(-1), startTime(-1), workFactor(0), wantsNewServers(false),
|
||||
interval("QueuedRelocation") {}
|
||||
cancellable(false), interval("QueuedRelocation") {}
|
||||
explicit RelocateData(RelocateShard const& rs)
|
||||
: keys(rs.keys), priority(rs.priority), boundaryPriority(isBoundaryPriority(rs.priority) ? rs.priority : -1),
|
||||
healthPriority(isHealthPriority(rs.priority) ? rs.priority : -1), startTime(now()),
|
||||
|
@ -63,7 +64,7 @@ struct RelocateData {
|
|||
rs.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM ||
|
||||
rs.priority == SERVER_KNOBS->PRIORITY_SPLIT_SHARD ||
|
||||
rs.priority == SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT),
|
||||
interval("QueuedRelocation") {}
|
||||
cancellable(true), interval("QueuedRelocation") {}
|
||||
|
||||
static bool isHealthPriority(int priority) {
|
||||
return priority == SERVER_KNOBS->PRIORITY_POPULATE_REGION ||
|
||||
|
@ -610,19 +611,23 @@ struct DDQueueData {
|
|||
.detail(
|
||||
"Problem",
|
||||
"the key range in the inFlight map matches the key range in the RelocateData message");
|
||||
} else if (it->value().cancellable) {
|
||||
TraceEvent(SevError, "DDQueueValidateError13")
|
||||
.detail("Problem", "key range is cancellable but not in flight!")
|
||||
.detail("Range", it->range());
|
||||
}
|
||||
}
|
||||
|
||||
for (auto it = busymap.begin(); it != busymap.end(); ++it) {
|
||||
for (int i = 0; i < it->second.ledger.size() - 1; i++) {
|
||||
if (it->second.ledger[i] < it->second.ledger[i + 1])
|
||||
TraceEvent(SevError, "DDQueueValidateError13")
|
||||
TraceEvent(SevError, "DDQueueValidateError14")
|
||||
.detail("Problem", "ascending ledger problem")
|
||||
.detail("LedgerLevel", i)
|
||||
.detail("LedgerValueA", it->second.ledger[i])
|
||||
.detail("LedgerValueB", it->second.ledger[i + 1]);
|
||||
if (it->second.ledger[i] < 0.0)
|
||||
TraceEvent(SevError, "DDQueueValidateError14")
|
||||
TraceEvent(SevError, "DDQueueValidateError15")
|
||||
.detail("Problem", "negative ascending problem")
|
||||
.detail("LedgerLevel", i)
|
||||
.detail("LedgerValue", it->second.ledger[i]);
|
||||
|
@ -632,13 +637,13 @@ struct DDQueueData {
|
|||
for (auto it = destBusymap.begin(); it != destBusymap.end(); ++it) {
|
||||
for (int i = 0; i < it->second.ledger.size() - 1; i++) {
|
||||
if (it->second.ledger[i] < it->second.ledger[i + 1])
|
||||
TraceEvent(SevError, "DDQueueValidateError15")
|
||||
TraceEvent(SevError, "DDQueueValidateError16")
|
||||
.detail("Problem", "ascending ledger problem")
|
||||
.detail("LedgerLevel", i)
|
||||
.detail("LedgerValueA", it->second.ledger[i])
|
||||
.detail("LedgerValueB", it->second.ledger[i + 1]);
|
||||
if (it->second.ledger[i] < 0.0)
|
||||
TraceEvent(SevError, "DDQueueValidateError16")
|
||||
TraceEvent(SevError, "DDQueueValidateError17")
|
||||
.detail("Problem", "negative ascending problem")
|
||||
.detail("LedgerLevel", i)
|
||||
.detail("LedgerValue", it->second.ledger[i]);
|
||||
|
@ -954,7 +959,7 @@ struct DDQueueData {
|
|||
auto containedRanges = inFlight.containedRanges(rd.keys);
|
||||
std::vector<RelocateData> cancellableRelocations;
|
||||
for (auto it = containedRanges.begin(); it != containedRanges.end(); ++it) {
|
||||
if (inFlightActors.liveActorAt(it->range().begin)) {
|
||||
if (it.value().cancellable) {
|
||||
cancellableRelocations.push_back(it->value());
|
||||
}
|
||||
}
|
||||
|
@ -1180,6 +1185,12 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd,
|
|||
// TODO different trace event + knob for overloaded? Could wait on an async var for done moves
|
||||
}
|
||||
|
||||
// set cancellable to false on inFlight's entry for this key range
|
||||
auto inFlightRange = self->inFlight.rangeContaining(rd.keys.begin);
|
||||
ASSERT(inFlightRange.range() == rd.keys);
|
||||
ASSERT(inFlightRange.value().randomId == rd.randomId);
|
||||
inFlightRange.value().cancellable = false;
|
||||
|
||||
destIds.clear();
|
||||
state std::vector<UID> healthyIds;
|
||||
state std::vector<UID> extraIds;
|
|
|||
#include "fdbclient/MonitorLeader.h"
|
||||
#include "flow/actorcompiler.h" // This must be the last #include.
|
||||
|
||||
// Keep trying to become a leader by submitting a candidacy to all coordinators.
// Monitor the health of all coordinators at the same time.
// Note: for coordinators whose NetworkAddress is parsed out of a hostname, a connection failure will cause this actor
// to throw a `coordinators_changed()` error.
|
||||
ACTOR Future<Void> submitCandidacy(Key key,
|
||||
LeaderElectionRegInterface coord,
|
||||
LeaderInfo myInfo,
|
||||
UID prevChangeID,
|
||||
Reference<AsyncVar<std::vector<Optional<LeaderInfo>>>> nominees,
|
||||
int index) {
|
||||
AsyncTrigger* nomineeChange,
|
||||
Optional<LeaderInfo>* nominee,
|
||||
Optional<Hostname> hostname = Optional<Hostname>()) {
|
||||
loop {
|
||||
auto const& nom = nominees->get()[index];
|
||||
Optional<LeaderInfo> li = wait(
|
||||
retryBrokenPromise(coord.candidacy,
|
||||
CandidacyRequest(key, myInfo, nom.present() ? nom.get().changeID : UID(), prevChangeID),
|
||||
TaskPriority::CoordinationReply));
|
||||
state Optional<LeaderInfo> li;
|
||||
|
||||
if (li != nominees->get()[index]) {
|
||||
std::vector<Optional<LeaderInfo>> v = nominees->get();
|
||||
v[index] = li;
|
||||
nominees->set(v);
|
||||
if (coord.candidacy.getEndpoint().getPrimaryAddress().fromHostname) {
|
||||
state ErrorOr<Optional<LeaderInfo>> rep = wait(coord.candidacy.tryGetReply(
|
||||
CandidacyRequest(key, myInfo, nominee->present() ? nominee->get().changeID : UID(), prevChangeID),
|
||||
TaskPriority::CoordinationReply));
|
||||
if (rep.isError()) {
|
||||
// Connecting to the nominee failed, most likely due to a connection failure.
|
||||
TraceEvent("SubmitCandadicyError")
|
||||
.error(rep.getError())
|
||||
.detail("Hostname", hostname.present() ? hostname.get().toString() : "UnknownHostname")
|
||||
.detail("OldAddr", coord.candidacy.getEndpoint().getPrimaryAddress().toString());
|
||||
if (rep.getError().code() == error_code_request_maybe_delivered) {
|
||||
// Delay to prevent tight resolving loop due to outdated DNS cache
|
||||
wait(delay(CLIENT_KNOBS->COORDINATOR_HOSTNAME_RESOLVE_DELAY));
|
||||
throw coordinators_changed();
|
||||
} else {
|
||||
throw rep.getError();
|
||||
}
|
||||
} else if (rep.present()) {
|
||||
li = rep.get();
|
||||
}
|
||||
} else {
|
||||
Optional<LeaderInfo> tmp = wait(retryBrokenPromise(
|
||||
coord.candidacy,
|
||||
CandidacyRequest(key, myInfo, nominee->present() ? nominee->get().changeID : UID(), prevChangeID),
|
||||
TaskPriority::CoordinationReply));
|
||||
li = tmp;
|
||||
}
|
||||
|
||||
wait(Future<Void>(Void())); // Make sure we weren't cancelled
|
||||
|
||||
if (li != *nominee) {
|
||||
*nominee = li;
|
||||
nomineeChange->trigger();
|
||||
|
||||
if (li.present() && li.get().forward)
|
||||
wait(Future<Void>(Never()));
|
||||
|
||||
wait(Future<Void>(Void())); // Make sure we weren't cancelled
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -84,13 +112,14 @@ ACTOR Future<Void> changeLeaderCoordinators(ServerCoordinators coordinators, Val
|
|||
return Void();
|
||||
}
|
||||
|
||||
ACTOR Future<Void> tryBecomeLeaderInternal(ServerCoordinators coordinators,
|
||||
ACTOR Future<Void> tryBecomeLeaderInternal(Reference<IClusterConnectionRecord> connRecord,
|
||||
Value proposedSerializedInterface,
|
||||
Reference<AsyncVar<Value>> outSerializedLeader,
|
||||
bool hasConnected,
|
||||
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo) {
|
||||
state Reference<AsyncVar<std::vector<Optional<LeaderInfo>>>> nominees(
|
||||
new AsyncVar<std::vector<Optional<LeaderInfo>>>());
|
||||
state ServerCoordinators coordinators(connRecord);
|
||||
state AsyncTrigger nomineeChange;
|
||||
state std::vector<Optional<LeaderInfo>> nominees;
|
||||
state LeaderInfo myInfo;
|
||||
state Future<Void> candidacies;
|
||||
state bool iAmLeader = false;
|
||||
|
@ -105,8 +134,6 @@ ACTOR Future<Void> tryBecomeLeaderInternal(ServerCoordinators coordinators,
|
|||
wait(delay(SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY));
|
||||
}
|
||||
|
||||
nominees->set(std::vector<Optional<LeaderInfo>>(coordinators.clientLeaderServers.size()));
|
||||
|
||||
myInfo.serializedInfo = proposedSerializedInterface;
|
||||
outSerializedLeader->set(Value());
|
||||
|
||||
|
@ -114,6 +141,9 @@ ACTOR Future<Void> tryBecomeLeaderInternal(ServerCoordinators coordinators,
|
|||
(SERVER_KNOBS->BUGGIFY_ALL_COORDINATION || BUGGIFY) ? buggifyDelayedAsyncVar(outSerializedLeader) : Void();
|
||||
|
||||
while (!iAmLeader) {
|
||||
wait(connRecord->resolveHostnames());
|
||||
coordinators = ServerCoordinators(connRecord);
|
||||
nominees.resize(coordinators.leaderElectionServers.size());
|
||||
state Future<Void> badCandidateTimeout;
|
||||
|
||||
myInfo.changeID = deterministicRandom()->randomUniqueID();
|
||||
|
@ -122,13 +152,25 @@ ACTOR Future<Void> tryBecomeLeaderInternal(ServerCoordinators coordinators,
|
|||
|
||||
std::vector<Future<Void>> cand;
|
||||
cand.reserve(coordinators.leaderElectionServers.size());
|
||||
for (int i = 0; i < coordinators.leaderElectionServers.size(); i++)
|
||||
cand.push_back(submitCandidacy(
|
||||
coordinators.clusterKey, coordinators.leaderElectionServers[i], myInfo, prevChangeID, nominees, i));
|
||||
for (int i = 0; i < coordinators.leaderElectionServers.size(); i++) {
|
||||
Optional<Hostname> hostname;
|
||||
auto r = connRecord->getConnectionString().networkAddressToHostname.find(
|
||||
coordinators.leaderElectionServers[i].candidacy.getEndpoint().getPrimaryAddress());
|
||||
if (r != connRecord->getConnectionString().networkAddressToHostname.end()) {
|
||||
hostname = r->second;
|
||||
}
|
||||
cand.push_back(submitCandidacy(coordinators.clusterKey,
|
||||
coordinators.leaderElectionServers[i],
|
||||
myInfo,
|
||||
prevChangeID,
|
||||
&nomineeChange,
|
||||
&nominees[i],
|
||||
hostname));
|
||||
}
|
||||
candidacies = waitForAll(cand);
|
||||
|
||||
loop {
|
||||
state Optional<std::pair<LeaderInfo, bool>> leader = getLeader(nominees->get());
|
||||
state Optional<std::pair<LeaderInfo, bool>> leader = getLeader(nominees);
|
||||
if (leader.present() && leader.get().first.forward) {
|
||||
// These coordinators are forwarded to another set. But before we change our own cluster file, we need
|
||||
// to make sure that a majority of coordinators know that. SOMEDAY: Wait briefly to see if other
|
||||
|
@ -172,22 +214,30 @@ ACTOR Future<Void> tryBecomeLeaderInternal(ServerCoordinators coordinators,
|
|||
// If more than 2*SERVER_KNOBS->POLLING_FREQUENCY elapses while we are nominated by some coordinator but
|
||||
// there is no leader, we might be breaking the leader election process for someone with better
|
||||
// communications but lower ID, so change IDs.
|
||||
if ((!leader.present() || !leader.get().second) &&
|
||||
std::count(nominees->get().begin(), nominees->get().end(), myInfo)) {
|
||||
if ((!leader.present() || !leader.get().second) && std::count(nominees.begin(), nominees.end(), myInfo)) {
|
||||
if (!badCandidateTimeout.isValid())
|
||||
badCandidateTimeout = delay(SERVER_KNOBS->POLLING_FREQUENCY * 2, TaskPriority::CoordinationReply);
|
||||
} else
|
||||
badCandidateTimeout = Future<Void>();
|
||||
|
||||
choose {
|
||||
when(wait(nominees->onChange())) {}
|
||||
when(wait(badCandidateTimeout.isValid() ? badCandidateTimeout : Never())) {
|
||||
TEST(true); // Bad candidate timeout
|
||||
TraceEvent("LeaderBadCandidateTimeout", myInfo.changeID).log();
|
||||
break;
|
||||
try {
|
||||
choose {
|
||||
when(wait(nomineeChange.onTrigger())) {}
|
||||
when(wait(badCandidateTimeout.isValid() ? badCandidateTimeout : Never())) {
|
||||
TEST(true); // Bad candidate timeout
|
||||
TraceEvent("LeaderBadCandidateTimeout", myInfo.changeID).log();
|
||||
break;
|
||||
}
|
||||
when(wait(candidacies)) { ASSERT(false); }
|
||||
when(wait(asyncPriorityInfo->onChange())) { break; }
|
||||
}
|
||||
} catch (Error& e) {
|
||||
if (e.code() == error_code_coordinators_changed) {
|
||||
connRecord->getConnectionString().resetToUnresolved();
|
||||
break;
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
when(wait(candidacies)) { ASSERT(false); }
|
||||
when(wait(asyncPriorityInfo->onChange())) { break; }
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -37,7 +37,7 @@ class ServerCoordinators;
|
|||
// eventually be set. If the return value is cancelled, the candidacy or leadership of the proposedInterface
|
||||
// will eventually end.
|
||||
template <class LeaderInterface>
|
||||
Future<Void> tryBecomeLeader(ServerCoordinators const& coordinators,
|
||||
Future<Void> tryBecomeLeader(Reference<IClusterConnectionRecord> const& connRecord,
|
||||
LeaderInterface const& proposedInterface,
|
||||
Reference<AsyncVar<Optional<LeaderInterface>>> const& outKnownLeader,
|
||||
bool hasConnected,
|
||||
|
@ -50,20 +50,20 @@ Future<Void> changeLeaderCoordinators(ServerCoordinators const& coordinators, Va
|
|||
#pragma region Implementation
|
||||
#endif // __INTEL_COMPILER
|
||||
|
||||
Future<Void> tryBecomeLeaderInternal(ServerCoordinators const& coordinators,
|
||||
Future<Void> tryBecomeLeaderInternal(Reference<IClusterConnectionRecord> const& connRecord,
|
||||
Value const& proposedSerializedInterface,
|
||||
Reference<AsyncVar<Value>> const& outSerializedLeader,
|
||||
bool const& hasConnected,
|
||||
Reference<AsyncVar<ClusterControllerPriorityInfo>> const& asyncPriorityInfo);
|
||||
|
||||
template <class LeaderInterface>
|
||||
Future<Void> tryBecomeLeader(ServerCoordinators const& coordinators,
|
||||
Future<Void> tryBecomeLeader(Reference<IClusterConnectionRecord> const& connRecord,
|
||||
LeaderInterface const& proposedInterface,
|
||||
Reference<AsyncVar<Optional<LeaderInterface>>> const& outKnownLeader,
|
||||
bool hasConnected,
|
||||
Reference<AsyncVar<ClusterControllerPriorityInfo>> const& asyncPriorityInfo) {
|
||||
auto serializedInfo = makeReference<AsyncVar<Value>>();
|
||||
Future<Void> m = tryBecomeLeaderInternal(coordinators,
|
||||
Future<Void> m = tryBecomeLeaderInternal(connRecord,
|
||||
ObjectWriter::toValue(proposedInterface, IncludeVersion()),
|
||||
serializedInfo,
|
||||
hasConnected,
@ -2726,8 +2726,6 @@ ACTOR Future<Void> fdbd(Reference<IClusterConnectionRecord> connRecord,
|
|||
actors.push_back(serveProcess());
|
||||
|
||||
try {
|
||||
wait(connRecord->resolveHostnames());
|
||||
ServerCoordinators coordinators(connRecord);
|
||||
if (g_network->isSimulated()) {
|
||||
whitelistBinPaths = ",, random_path, /bin/snap_create.sh,,";
|
||||
}
|
||||
|
@ -2745,8 +2743,8 @@ ACTOR Future<Void> fdbd(Reference<IClusterConnectionRecord> connRecord,
|
|||
if (coordFolder.size()) {
|
||||
// SOMEDAY: remove the fileNotFound wrapper and make DiskQueue construction safe from errors setting up
|
||||
// their files
|
||||
actors.push_back(fileNotFoundToNever(
|
||||
coordinationServer(coordFolder, coordinators.ccr, configNode, configBroadcastInterface)));
|
||||
actors.push_back(
|
||||
fileNotFoundToNever(coordinationServer(coordFolder, connRecord, configNode, configBroadcastInterface)));
|
||||
}
|
||||
|
||||
state UID processIDUid = wait(createAndLockProcessIdFile(dataFolder));
@ -272,15 +272,20 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
|
|||
}
|
||||
|
||||
// FIXME: typedef this pair type and/or chunk list
|
||||
ACTOR Future<std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>>>
|
||||
readFromBlob(Database cx, BlobGranuleCorrectnessWorkload* self, KeyRange range, Version version) {
|
||||
ACTOR Future<std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>>> readFromBlob(
|
||||
Database cx,
|
||||
BlobGranuleCorrectnessWorkload* self,
|
||||
KeyRange range,
|
||||
Version beginVersion,
|
||||
Version readVersion) {
|
||||
state RangeResult out;
|
||||
state Standalone<VectorRef<BlobGranuleChunkRef>> chunks;
|
||||
state Transaction tr(cx);
|
||||
|
||||
loop {
|
||||
try {
|
||||
Standalone<VectorRef<BlobGranuleChunkRef>> chunks_ = wait(tr.readBlobGranules(range, 0, version));
|
||||
Standalone<VectorRef<BlobGranuleChunkRef>> chunks_ =
|
||||
wait(tr.readBlobGranules(range, beginVersion, readVersion));
|
||||
chunks = chunks_;
|
||||
break;
|
||||
} catch (Error& e) {
|
||||
|
@ -289,7 +294,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
|
|||
}
|
||||
|
||||
for (const BlobGranuleChunkRef& chunk : chunks) {
|
||||
RangeResult chunkRows = wait(readBlobGranule(chunk, range, version, self->bstore));
|
||||
RangeResult chunkRows = wait(readBlobGranule(chunk, range, beginVersion, readVersion, self->bstore));
|
||||
out.arena().dependsOn(chunkRows.arena());
|
||||
out.append(out.arena(), chunkRows.begin(), chunkRows.size());
|
||||
}
|
||||
|
@ -321,7 +326,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
|
|||
Version rv = wait(self->doGrv(&tr));
|
||||
state Version readVersion = rv;
|
||||
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob =
|
||||
wait(self->readFromBlob(cx, self, threadData->directoryRange, readVersion));
|
||||
wait(self->readFromBlob(cx, self, threadData->directoryRange, 0, readVersion));
|
||||
fmt::print("Directory {0} got {1} RV {2}\n",
|
||||
threadData->directoryID,
|
||||
doSetup ? "initial" : "final",
|
||||
|
@ -349,6 +354,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
|
|||
const Optional<Value>& blobValue,
|
||||
uint32_t startKey,
|
||||
uint32_t endKey,
|
||||
Version beginVersion,
|
||||
Version readVersion,
|
||||
const std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>>& blob) {
|
||||
threadData->mismatches++;
|
||||
|
@ -360,11 +366,13 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
|
|||
ev.detail("DirectoryID", format("%08x", threadData->directoryID))
|
||||
.detail("RangeStart", format("%08x", startKey))
|
||||
.detail("RangeEnd", format("%08x", endKey))
|
||||
.detail("BeginVersion", beginVersion)
|
||||
.detail("Version", readVersion);
|
||||
fmt::print("Found mismatch! Request for dir {0} [{1} - {2}) @ {3}\n",
|
||||
fmt::print("Found mismatch! Request for dir {0} [{1} - {2}) @ {3} - {4}\n",
|
||||
format("%08x", threadData->directoryID),
|
||||
format("%08x", startKey),
|
||||
format("%08x", endKey),
|
||||
beginVersion,
|
||||
readVersion);
|
||||
if (lastMatching.present()) {
|
||||
fmt::print(" last correct: {}\n", lastMatching.get().printable());
|
||||
|
@ -456,6 +464,29 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
|
|||
readVersion);
|
||||
}
|
||||
|
||||
// Because each chunk may or may not be collapsed independently when beginVersion is set, track the effective begin version per chunk
|
||||
KeyRangeMap<Version> beginVersionByChunk;
|
||||
beginVersionByChunk.insert(normalKeys, 0);
|
||||
int beginCollapsed = 0;
|
||||
int beginNotCollapsed = 0;
|
||||
for (auto& chunk : blob.second) {
|
||||
if (!chunk.snapshotFile.present()) {
|
||||
ASSERT(beginVersion > 0);
|
||||
ASSERT(chunk.snapshotVersion == invalidVersion);
|
||||
beginCollapsed++;
|
||||
beginVersionByChunk.insert(chunk.keyRange, beginVersion);
|
||||
} else {
|
||||
ASSERT(chunk.snapshotVersion != invalidVersion);
|
||||
if (beginVersion > 0) {
|
||||
beginNotCollapsed++;
|
||||
}
|
||||
}
|
||||
}
|
||||
TEST(beginCollapsed > 0); // BGCorrectness got collapsed request with beginVersion > 0
|
||||
TEST(beginNotCollapsed > 0); // BGCorrectness got un-collapsed request with beginVersion > 0
|
||||
TEST(beginCollapsed > 0 &&
|
||||
beginNotCollapsed > 0); // BGCorrectness got both collapsed and uncollapsed in the same request!
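// A small illustration of the beginVersionByChunk bookkeeping above, using only the KeyRangeMap calls that
// appear in this workload (insert, rangeContaining, cvalue); chunkKeyRange, collapsedBeginVersion and someKey
// are placeholders, so treat this as a sketch of the pattern rather than additional test logic.
KeyRangeMap<Version> beginVersions;
beginVersions.insert(normalKeys, 0); // default: no begin-version constraint
beginVersions.insert(chunkKeyRange, collapsedBeginVersion); // a chunk whose read collapsed to beginVersion
Version governingBeginVersion = beginVersions.rangeContaining(someKey).cvalue();
// Writes older than governingBeginVersion are expected to be absent from the blob result for someKey.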
|
||||
|
||||
while (checkIt != threadData->keyData.end() && checkIt->first < endKeyExclusive) {
|
||||
uint32_t key = checkIt->first;
|
||||
if (DEBUG_READ_OP(threadData->directoryID, readVersion)) {
|
||||
|
@ -475,6 +506,16 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
|
|||
for (; idIdx < checkIt->second.writes.size() && checkIt->second.writes[idIdx].writeVersion <= readVersion;
|
||||
idIdx++) {
|
||||
Key nextKeyShouldBe = threadData->getKey(key, idIdx);
|
||||
Version keyBeginVersion = beginVersionByChunk.rangeContaining(nextKeyShouldBe).cvalue();
|
||||
if (keyBeginVersion > checkIt->second.writes[idIdx].writeVersion) {
|
||||
if (DEBUG_READ_OP(threadData->directoryID, readVersion)) {
|
||||
fmt::print("DBG READ: Skip ID {0} written @ {1} < beginVersion {2}\n",
|
||||
idIdx,
|
||||
checkIt->second.writes[idIdx].writeVersion,
|
||||
keyBeginVersion);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (DEBUG_READ_OP(threadData->directoryID, readVersion)) {
|
||||
fmt::print("DBG READ: Checking ID {0} ({1}) written @ {2}\n",
|
||||
format("%08x", idIdx),
|
||||
|
@ -491,6 +532,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
|
|||
Optional<Value>(),
|
||||
startKeyInclusive,
|
||||
endKeyExclusive,
|
||||
beginVersion,
|
||||
readVersion,
|
||||
blob);
|
||||
return false;
|
||||
|
@ -509,6 +551,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
|
|||
Optional<Value>(),
|
||||
startKeyInclusive,
|
||||
endKeyExclusive,
|
||||
beginVersion,
|
||||
readVersion,
|
||||
blob);
|
||||
return false;
|
||||
|
@ -523,6 +566,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
|
|||
blob.first[resultIdx].value,
|
||||
startKeyInclusive,
|
||||
endKeyExclusive,
|
||||
beginVersion,
|
||||
readVersion,
|
||||
blob);
|
||||
return false;
|
||||
|
@ -545,6 +589,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
|
|||
Optional<Value>(),
|
||||
startKeyInclusive,
|
||||
endKeyExclusive,
|
||||
beginVersion,
|
||||
readVersion,
|
||||
blob);
|
||||
return false;
|
||||
|
@ -565,6 +610,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
|
|||
state double targetReadBytesPerSec = threadData->targetByteRate * 4;
|
||||
ASSERT(targetReadBytesPerSec > 0);
|
||||
|
||||
state Version beginVersion;
|
||||
state Version readVersion;
|
||||
|
||||
TraceEvent("BlobGranuleCorrectnessReaderStart").log();
|
||||
|
@ -610,26 +656,42 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
|
|||
state KeyRange range = KeyRangeRef(threadData->getKey(startKey, 0), threadData->getKey(endKey, 0));
|
||||
|
||||
// pick read version
|
||||
// TODO could also pick begin version here
|
||||
ASSERT(threadData->writeVersions.back() >= threadData->minSuccessfulReadVersion);
|
||||
size_t readVersionIdx;
|
||||
// randomly choose up to date vs time travel read
|
||||
if (deterministicRandom()->random01() < 0.5) {
|
||||
threadData->reads++;
|
||||
readVersionIdx = threadData->writeVersions.size() - 1;
|
||||
readVersion = threadData->writeVersions.back();
|
||||
} else {
|
||||
threadData->timeTravelReads++;
|
||||
size_t startIdx = 0;
|
||||
loop {
|
||||
int readVersionIdx = deterministicRandom()->randomInt(0, threadData->writeVersions.size());
|
||||
readVersionIdx = deterministicRandom()->randomInt(startIdx, threadData->writeVersions.size());
|
||||
readVersion = threadData->writeVersions[readVersionIdx];
|
||||
if (readVersion >= threadData->minSuccessfulReadVersion) {
|
||||
break;
|
||||
} else {
|
||||
startIdx = readVersionIdx + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// randomly choose begin version or not
|
||||
beginVersion = 0;
|
||||
if (deterministicRandom()->random01() < 0.5) {
|
||||
int startIdx = 0;
|
||||
int endIdxExclusive = readVersionIdx + 1;
|
||||
// Choose skewed towards later versions. It's ok if beginVersion isn't readable, though, because the read
// will simply collapse.
|
||||
size_t beginVersionIdx = (size_t)std::sqrt(
|
||||
deterministicRandom()->randomInt(startIdx * startIdx, endIdxExclusive * endIdxExclusive));
|
||||
beginVersion = threadData->writeVersions[beginVersionIdx];
|
||||
}
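// Why the sqrt works: sampling an integer uniformly from [startIdx^2, endIdxExclusive^2) and taking the integer
// square root selects index k with weight (k+1)^2 - k^2 = 2k + 1, i.e. linearly skewed towards later versions.
// A minimal standalone sketch of the same pick (illustrative names, not from this workload):
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <random>

// Picks an index in [start, endExclusive) with probability proportional to 2k + 1.
inline size_t skewedIndex(size_t start, size_t endExclusive, std::mt19937_64& rng) {
    std::uniform_int_distribution<uint64_t> dist(start * start, endExclusive * endExclusive - 1);
    return static_cast<size_t>(std::sqrt(static_cast<double>(dist(rng))));
}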
|
||||
|
||||
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob =
|
||||
wait(self->readFromBlob(cx, self, range, readVersion));
|
||||
self->validateResult(threadData, blob, startKey, endKey, 0, readVersion);
|
||||
wait(self->readFromBlob(cx, self, range, beginVersion, readVersion));
|
||||
self->validateResult(threadData, blob, startKey, endKey, beginVersion, readVersion);
|
||||
|
||||
int resultBytes = blob.first.expectedSize();
|
||||
threadData->rowsRead += blob.first.size();
|
||||
|
@ -822,7 +884,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
|
|||
fmt::print("Directory {0} doing final data check @ {1}\n", threadData->directoryID, readVersion);
|
||||
}
|
||||
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob =
|
||||
wait(self->readFromBlob(cx, self, threadData->directoryRange, readVersion));
|
||||
wait(self->readFromBlob(cx, self, threadData->directoryRange, 0, readVersion));
|
||||
result = self->validateResult(threadData, blob, 0, std::numeric_limits<uint32_t>::max(), 0, readVersion);
|
||||
finalRowsValidated = blob.first.size();
@ -225,7 +225,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
|
|||
}
|
||||
|
||||
for (const BlobGranuleChunkRef& chunk : chunks) {
|
||||
RangeResult chunkRows = wait(readBlobGranule(chunk, range, version, self->bstore));
|
||||
RangeResult chunkRows = wait(readBlobGranule(chunk, range, 0, version, self->bstore));
|
||||
out.arena().dependsOn(chunkRows.arena());
|
||||
out.append(out.arena(), chunkRows.begin(), chunkRows.size());
|
||||
}
@ -2378,9 +2378,9 @@ struct ConsistencyCheckWorkload : TestWorkload {
|
|||
(!nonExcludedWorkerProcessMap.count(db.encryptKeyProxy.get().address()) ||
|
||||
nonExcludedWorkerProcessMap[db.encryptKeyProxy.get().address()].processClass.machineClassFitness(
|
||||
ProcessClass::EncryptKeyProxy) > fitnessLowerBound)) {
|
||||
TraceEvent("ConsistencyCheck_EncyrptKeyProxyNotBest")
|
||||
TraceEvent("ConsistencyCheck_EncryptKeyProxyNotBest")
|
||||
.detail("BestEncryptKeyProxyFitness", fitnessLowerBound)
|
||||
.detail("ExistingEncyrptKeyProxyFitness",
|
||||
.detail("ExistingEncryptKeyProxyFitness",
|
||||
nonExcludedWorkerProcessMap.count(db.encryptKeyProxy.get().address())
|
||||
? nonExcludedWorkerProcessMap[db.encryptKeyProxy.get().address()]
|
||||
.processClass.machineClassFitness(ProcessClass::EncryptKeyProxy)
@ -21,16 +21,19 @@
|
|||
#include "fdbclient/DatabaseContext.h"
|
||||
#include "fdbclient/NativeAPI.actor.h"
|
||||
#include "flow/IRandom.h"
|
||||
#include "flow/StreamCipher.h"
|
||||
#include "flow/BlobCipher.h"
|
||||
#include "fdbserver/workloads/workloads.actor.h"
|
||||
#include "flow/ITrace.h"
|
||||
#include "flow/Trace.h"
|
||||
#include "flow/actorcompiler.h" // This must be the last #include.
|
||||
|
||||
#if ENCRYPTION_ENABLED
|
||||
|
||||
#include <chrono>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <random>
|
||||
|
||||
#include "flow/actorcompiler.h" // This must be the last #include.
|
||||
|
||||
#if ENCRYPTION_ENABLED
|
||||
|
||||
#define MEGA_BYTES (1024 * 1024)
|
||||
#define NANO_SECOND (1000 * 1000 * 1000)
|
||||
|
@ -78,45 +81,69 @@ struct WorkloadMetrics {
|
|||
}
|
||||
};
|
||||
|
||||
// Workload generator for encryption/decryption operations.
// 1. For every client run, it generates a unique random encryptionDomainId range and simulates encryption of
// either fixed-size or variable-size payloads.
// 2. For each encryption run, it interacts with BlobCipherKeyCache to fetch the desired encryption key,
// which is then used to encrypt the plaintext payload.
// 3. Each encryption operation generates an 'encryption header', which is later leveraged to decrypt the
// ciphertext obtained from step #2 (simulating the real-world usage pattern).
//
// Correctness validations:
// -----------------------
// Correctness invariants are validated at various steps:
// 1. Encryption key correctness: as part of performing decryption, a BlobCipherKeyCache lookup is done to procure
// the desired encryption key based on {encryptionDomainId, baseCipherId}; the obtained key is validated against
// the encryption key used for encrypting the data.
// 2. After encryption, the generated 'encryption header' fields are validated, and the encrypted buffer size and
// contents are validated.
// 3. After decryption, the obtained deciphertext is validated against the original plaintext payload.
//
// Performance metrics:
// -------------------
// The workload generator profiles the operations below across iterations and logs the details at the end:
// 1. Time spent in encryption key fetch (and derivation) operations.
// 2. Time spent encrypting the buffer (doesn't include key lookup time); also records the throughput in MB/sec.
// 3. Time spent decrypting the buffer (doesn't include key lookup time); also records the throughput in MB/sec.
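// For orientation, a minimal sketch of one iteration of the encrypt/decrypt cycle this workload drives, using
// only the BlobCipher API surface introduced in this change (BlobCipherKeyCache, EncryptBlobCipherAes265Ctr,
// DecryptBlobCipherAes256Ctr); the domain id, base cipher id and payload size are illustrative placeholders.
#include "flow/BlobCipher.h"
#include "flow/IRandom.h"
#include <cstring>

inline void encryptDecryptOnce(Arena& arena) {
    const BlobCipherDomainId domainId = 1; // placeholder
    const BlobCipherBaseKeyId baseCipherId = 100; // placeholder

    // Register a base cipher for the domain; the cache derives and stores the actual encryption key.
    uint8_t baseCipher[AES_256_KEY_LENGTH];
    generateRandomData(&baseCipher[0], AES_256_KEY_LENGTH);
    auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
    cipherKeyCache.insertCipherKey(domainId, baseCipherId, &baseCipher[0], AES_256_KEY_LENGTH);

    // Fetch the latest cipher for the domain and encrypt a payload.
    Reference<BlobCipherKey> cipherKey = cipherKeyCache.getLatestCipherKey(domainId);
    uint8_t payload[4096];
    generateRandomData(&payload[0], sizeof(payload));
    uint8_t iv[AES_256_IV_LENGTH];
    generateRandomData(&iv[0], AES_256_IV_LENGTH);

    BlobCipherEncryptHeader header;
    EncryptBlobCipherAes265Ctr encryptor(cipherKey, &iv[0], AES_256_IV_LENGTH);
    Reference<EncryptBuf> encrypted = encryptor.encrypt(&payload[0], sizeof(payload), &header, arena);

    // Decrypt with the key looked up from the header, mirroring doDecryption() below.
    Reference<BlobCipherKey> decryptKey = cipherKeyCache.getCipherKey(header.encryptDomainId, header.baseCipherId);
    DecryptBlobCipherAes256Ctr decryptor(decryptKey, &header.iv[0]);
    Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), sizeof(payload), header, arena);
    ASSERT(memcmp(decrypted->begin(), &payload[0], sizeof(payload)) == 0);
}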
|
||||
|
||||
struct EncryptionOpsWorkload : TestWorkload {
|
||||
int mode;
|
||||
int64_t numIterations;
|
||||
int pageSize;
|
||||
int maxBufSize;
|
||||
std::unique_ptr<uint8_t[]> buff;
|
||||
std::unique_ptr<uint8_t[]> validationBuff;
|
||||
|
||||
StreamCipher::IV iv;
|
||||
std::unique_ptr<HmacSha256StreamCipher> hmacGenerator;
|
||||
std::unique_ptr<uint8_t[]> parentKey;
|
||||
Arena arena;
|
||||
std::unique_ptr<WorkloadMetrics> metrics;
|
||||
|
||||
BlobCipherDomainId minDomainId;
|
||||
BlobCipherDomainId maxDomainId;
|
||||
BlobCipherBaseKeyId minBaseCipherId;
|
||||
|
||||
EncryptionOpsWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
|
||||
mode = getOption(options, LiteralStringRef("fixedSize"), 1);
|
||||
numIterations = getOption(options, LiteralStringRef("numIterations"), 10);
|
||||
pageSize = getOption(options, LiteralStringRef("pageSize"), 4096);
|
||||
maxBufSize = getOption(options, LiteralStringRef("maxBufSize"), 512 * 1024);
|
||||
buff = std::make_unique<uint8_t[]>(maxBufSize);
|
||||
validationBuff = std::make_unique<uint8_t[]>(maxBufSize);
|
||||
|
||||
iv = getRandomIV();
|
||||
hmacGenerator = std::make_unique<HmacSha256StreamCipher>();
|
||||
parentKey = std::make_unique<uint8_t[]>(AES_256_KEY_LENGTH);
|
||||
generateRandomData(parentKey.get(), AES_256_KEY_LENGTH);
|
||||
// assign unique encryptionDomainId range per workload clients
|
||||
minDomainId = wcx.clientId * 100 + mode * 30 + 1;
|
||||
maxDomainId = deterministicRandom()->randomInt(minDomainId, minDomainId + 10) + 5;
|
||||
minBaseCipherId = 100;
|
||||
|
||||
metrics = std::make_unique<WorkloadMetrics>();
|
||||
|
||||
TraceEvent("EncryptionOpsWorkload").detail("Mode", getModeStr());
|
||||
TraceEvent("EncryptionOpsWorkload")
|
||||
.detail("Mode", getModeStr())
|
||||
.detail("MinDomainId", minDomainId)
|
||||
.detail("MaxDomainId", maxDomainId);
|
||||
}
|
||||
|
||||
~EncryptionOpsWorkload() { TraceEvent("EncryptionOpsWorkload_Done").log(); }
|
||||
|
||||
bool isFixedSizePayload() { return mode == 1; }
|
||||
|
||||
StreamCipher::IV getRandomIV() {
|
||||
generateRandomData(iv.data(), iv.size());
|
||||
return iv;
|
||||
}
|
||||
|
||||
std::string getModeStr() const {
|
||||
if (mode == 1) {
|
||||
return "FixedSize";
|
||||
|
@ -127,47 +154,97 @@ struct EncryptionOpsWorkload : TestWorkload {
|
|||
throw internal_error();
|
||||
}
|
||||
|
||||
void updateEncryptionKey(StreamCipherKey* cipherKey) {
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
applyHmacKeyDerivationFunc(cipherKey, hmacGenerator.get(), arena);
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
metrics->updateKeyDerivationTime(std::chrono::duration<double, std::nano>(end - start).count());
|
||||
void generateRandomBaseCipher(const int maxLen, uint8_t* buff, int* retLen) {
|
||||
memset(buff, 0, maxLen);
|
||||
*retLen = deterministicRandom()->randomInt(maxLen / 2, maxLen);
|
||||
generateRandomData(buff, *retLen);
|
||||
}
|
||||
|
||||
StringRef doEncryption(const StreamCipherKey* key, uint8_t* payload, int len) {
|
||||
EncryptionStreamCipher encryptor(key, iv);
|
||||
void setupCipherEssentials() {
|
||||
auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
|
||||
|
||||
TraceEvent("SetupCipherEssentials_Start").detail("MinDomainId", minDomainId).detail("MaxDomainId", maxDomainId);
|
||||
|
||||
uint8_t buff[AES_256_KEY_LENGTH];
|
||||
std::vector<Reference<BlobCipherKey>> cipherKeys;
|
||||
for (BlobCipherDomainId id = minDomainId; id <= maxDomainId; id++) {
|
||||
int cipherLen = 0;
|
||||
generateRandomBaseCipher(AES_256_KEY_LENGTH, &buff[0], &cipherLen);
|
||||
cipherKeyCache.insertCipherKey(id, minBaseCipherId, buff, cipherLen);
|
||||
|
||||
ASSERT(cipherLen > 0 && cipherLen <= AES_256_KEY_LENGTH);
|
||||
|
||||
cipherKeys = cipherKeyCache.getAllCiphers(id);
|
||||
ASSERT(cipherKeys.size() == 1);
|
||||
}
|
||||
|
||||
TraceEvent("SetupCipherEssentials_Done").detail("MinDomainId", minDomainId).detail("MaxDomainId", maxDomainId);
|
||||
}
|
||||
|
||||
void resetCipherEssentials() {
|
||||
auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
|
||||
cipherKeyCache.cleanup();
|
||||
|
||||
TraceEvent("ResetCipherEssentials_Done").log();
|
||||
}
|
||||
|
||||
void updateLatestBaseCipher(const BlobCipherDomainId encryptDomainId,
|
||||
uint8_t* baseCipher,
|
||||
int* baseCipherLen,
|
||||
BlobCipherBaseKeyId* nextBaseCipherId) {
|
||||
auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
|
||||
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getLatestCipherKey(encryptDomainId);
|
||||
*nextBaseCipherId = cipherKey->getBaseCipherId() + 1;
|
||||
|
||||
generateRandomBaseCipher(AES_256_KEY_LENGTH, baseCipher, baseCipherLen);
|
||||
|
||||
ASSERT(*baseCipherLen > 0 && *baseCipherLen <= AES_256_KEY_LENGTH);
|
||||
TraceEvent("UpdateBaseCipher").detail("DomainId", encryptDomainId).detail("BaseCipherId", *nextBaseCipherId);
|
||||
}
|
||||
|
||||
Reference<EncryptBuf> doEncryption(Reference<BlobCipherKey> key,
|
||||
uint8_t* payload,
|
||||
int len,
|
||||
BlobCipherEncryptHeader* header) {
|
||||
uint8_t iv[AES_256_IV_LENGTH];
|
||||
generateRandomData(&iv[0], AES_256_IV_LENGTH);
|
||||
EncryptBlobCipherAes265Ctr encryptor(key, &iv[0], AES_256_IV_LENGTH);
|
||||
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
auto encrypted = encryptor.encrypt(buff.get(), len, arena);
|
||||
encryptor.finish(arena);
|
||||
Reference<EncryptBuf> encrypted = encryptor.encrypt(payload, len, header, arena);
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// validate encrypted buffer size and contents (not matching with plaintext)
|
||||
ASSERT(encrypted.size() == len);
|
||||
std::copy(encrypted.begin(), encrypted.end(), validationBuff.get());
|
||||
ASSERT(memcmp(validationBuff.get(), buff.get(), len) != 0);
|
||||
ASSERT(encrypted->getLogicalSize() == len);
|
||||
ASSERT(memcmp(encrypted->begin(), payload, len) != 0);
|
||||
ASSERT(header->flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
|
||||
|
||||
metrics->updateEncryptionTime(std::chrono::duration<double, std::nano>(end - start).count());
|
||||
return encrypted;
|
||||
}
|
||||
|
||||
void doDecryption(const StreamCipherKey* key,
|
||||
const StringRef& encrypted,
|
||||
void doDecryption(Reference<EncryptBuf> encrypted,
|
||||
int len,
|
||||
const BlobCipherEncryptHeader& header,
|
||||
uint8_t* originalPayload,
|
||||
uint8_t* validationBuff) {
|
||||
DecryptionStreamCipher decryptor(key, iv);
|
||||
Reference<BlobCipherKey> orgCipherKey) {
|
||||
ASSERT(header.flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
|
||||
ASSERT(header.flags.encryptMode == BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR);
|
||||
|
||||
auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
|
||||
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getCipherKey(header.encryptDomainId, header.baseCipherId);
|
||||
ASSERT(cipherKey.isValid());
|
||||
ASSERT(cipherKey->isEqual(orgCipherKey));
|
||||
|
||||
DecryptBlobCipherAes256Ctr decryptor(cipherKey, &header.iv[0]);
|
||||
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
Standalone<StringRef> decrypted = decryptor.decrypt(encrypted.begin(), len, arena);
|
||||
decryptor.finish(arena);
|
||||
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), len, header, arena);
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// validate decrypted buffer size and contents (matching with original plaintext)
|
||||
ASSERT(decrypted.size() == len);
|
||||
std::copy(decrypted.begin(), decrypted.end(), validationBuff);
|
||||
ASSERT(memcmp(validationBuff, originalPayload, len) == 0);
|
||||
ASSERT(decrypted->getLogicalSize() == len);
|
||||
ASSERT(memcmp(decrypted->begin(), originalPayload, len) == 0);
|
||||
|
||||
metrics->updateDecryptionTime(std::chrono::duration<double, std::nano>(end - start).count());
|
||||
}
|
||||
|
@ -177,22 +254,64 @@ struct EncryptionOpsWorkload : TestWorkload {
|
|||
std::string description() const override { return "EncryptionOps"; }
|
||||
|
||||
Future<Void> start(Database const& cx) override {
|
||||
uint8_t baseCipher[AES_256_KEY_LENGTH];
|
||||
int baseCipherLen = 0;
|
||||
BlobCipherBaseKeyId nextBaseCipherId;
|
||||
|
||||
// Setup encryptDomainIds and corresponding baseCipher details
|
||||
setupCipherEssentials();
|
||||
|
||||
for (int i = 0; i < numIterations; i++) {
|
||||
StreamCipherKey key(AES_256_KEY_LENGTH);
|
||||
// derive the encryption key
|
||||
updateEncryptionKey(&key);
|
||||
bool updateBaseCipher = deterministicRandom()->randomInt(1, 100) < 5;
|
||||
|
||||
// Step-1: Encryption key derivation, caching the cipher for later use
|
||||
auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
|
||||
|
||||
// randomly select a domainId
|
||||
const BlobCipherDomainId encryptDomainId = deterministicRandom()->randomInt(minDomainId, maxDomainId);
|
||||
ASSERT(encryptDomainId >= minDomainId && encryptDomainId <= maxDomainId);
|
||||
|
||||
if (updateBaseCipher) {
|
||||
// simulate baseCipherId getting refreshed/updated
|
||||
updateLatestBaseCipher(encryptDomainId, &baseCipher[0], &baseCipherLen, &nextBaseCipherId);
|
||||
cipherKeyCache.insertCipherKey(encryptDomainId, nextBaseCipherId, &baseCipher[0], baseCipherLen);
|
||||
}
|
||||
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getLatestCipherKey(encryptDomainId);
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
metrics->updateKeyDerivationTime(std::chrono::duration<double, std::nano>(end - start).count());
|
||||
|
||||
// Validate sanity of "getLatestCipher", especially when baseCipher gets updated
|
||||
if (updateBaseCipher) {
|
||||
ASSERT(cipherKey->getBaseCipherId() == nextBaseCipherId);
|
||||
ASSERT(cipherKey->getBaseCipherLen() == baseCipherLen);
|
||||
ASSERT(memcmp(cipherKey->rawBaseCipher(), baseCipher, baseCipherLen) == 0);
|
||||
}
|
||||
|
||||
int dataLen = isFixedSizePayload() ? pageSize : deterministicRandom()->randomInt(100, maxBufSize);
|
||||
generateRandomData(buff.get(), dataLen);
|
||||
|
||||
// encrypt the payload
|
||||
const auto& encrypted = doEncryption(&key, buff.get(), dataLen);
|
||||
// Encrypt the payload - generates BlobCipherEncryptHeader to assist decryption later
|
||||
BlobCipherEncryptHeader header;
|
||||
try {
|
||||
Reference<EncryptBuf> encrypted = doEncryption(cipherKey, buff.get(), dataLen, &header);
|
||||
|
||||
// decrypt the payload
|
||||
doDecryption(&key, encrypted, dataLen, buff.get(), validationBuff.get());
|
||||
// Decrypt the payload - parses the BlobCipherEncryptHeader, fetch corresponding cipherKey and
|
||||
// decrypt
|
||||
doDecryption(encrypted, dataLen, header, buff.get(), cipherKey);
|
||||
} catch (Error& e) {
|
||||
TraceEvent("Failed")
|
||||
.detail("DomainId", encryptDomainId)
|
||||
.detail("BaseCipherId", cipherKey->getBaseCipherId());
|
||||
throw;
|
||||
}
|
||||
|
||||
metrics->updateBytes(dataLen);
|
||||
}
|
||||
|
||||
// Cleanup cipherKeys
|
||||
resetCipherEssentials();
|
||||
return Void();
|
||||
}
@ -30,6 +30,7 @@ void forceLinkMemcpyTests();
|
|||
void forceLinkMemcpyPerfTests();
|
||||
#if (!defined(TLS_DISABLED) && !defined(_WIN32))
|
||||
void forceLinkStreamCipherTests();
|
||||
void forceLinkBlobCipherTests();
|
||||
#endif
|
||||
void forceLinkParallelStreamTests();
|
||||
void forceLinkSimExternalConnectionTests();
|
||||
|
@ -76,6 +77,7 @@ struct UnitTestWorkload : TestWorkload {
|
|||
forceLinkMemcpyPerfTests();
|
||||
#if (!defined(TLS_DISABLED) && !defined(_WIN32))
|
||||
forceLinkStreamCipherTests();
|
||||
forceLinkBlobCipherTests();
|
||||
#endif
|
||||
forceLinkParallelStreamTests();
|
||||
forceLinkSimExternalConnectionTests();
@ -342,23 +342,23 @@ ArenaBlock* ArenaBlock::create(int dataSize, Reference<ArenaBlock>& next) {
|
|||
b->bigSize = 256;
|
||||
INSTRUMENT_ALLOCATE("Arena256");
|
||||
} else if (reqSize <= 512) {
|
||||
b = (ArenaBlock*)FastAllocator<512>::allocate();
|
||||
b = (ArenaBlock*)new uint8_t[512];
|
||||
b->bigSize = 512;
|
||||
INSTRUMENT_ALLOCATE("Arena512");
|
||||
} else if (reqSize <= 1024) {
|
||||
b = (ArenaBlock*)FastAllocator<1024>::allocate();
|
||||
b = (ArenaBlock*)new uint8_t[1024];
|
||||
b->bigSize = 1024;
|
||||
INSTRUMENT_ALLOCATE("Arena1024");
|
||||
} else if (reqSize <= 2048) {
|
||||
b = (ArenaBlock*)FastAllocator<2048>::allocate();
|
||||
b = (ArenaBlock*)new uint8_t[2048];
|
||||
b->bigSize = 2048;
|
||||
INSTRUMENT_ALLOCATE("Arena2048");
|
||||
} else if (reqSize <= 4096) {
|
||||
b = (ArenaBlock*)FastAllocator<4096>::allocate();
|
||||
b = (ArenaBlock*)new uint8_t[4096];
|
||||
b->bigSize = 4096;
|
||||
INSTRUMENT_ALLOCATE("Arena4096");
|
||||
} else {
|
||||
b = (ArenaBlock*)FastAllocator<8192>::allocate();
|
||||
b = (ArenaBlock*)new uint8_t[8192];
|
||||
b->bigSize = 8192;
|
||||
INSTRUMENT_ALLOCATE("Arena8192");
|
||||
}
|
||||
|
@ -460,26 +460,26 @@ void ArenaBlock::destroyLeaf() {
|
|||
FastAllocator<256>::release(this);
|
||||
INSTRUMENT_RELEASE("Arena256");
|
||||
} else if (bigSize <= 512) {
|
||||
FastAllocator<512>::release(this);
|
||||
delete[] reinterpret_cast<uint8_t*>(this);
|
||||
INSTRUMENT_RELEASE("Arena512");
|
||||
} else if (bigSize <= 1024) {
|
||||
FastAllocator<1024>::release(this);
|
||||
delete[] reinterpret_cast<uint8_t*>(this);
|
||||
INSTRUMENT_RELEASE("Arena1024");
|
||||
} else if (bigSize <= 2048) {
|
||||
FastAllocator<2048>::release(this);
|
||||
delete[] reinterpret_cast<uint8_t*>(this);
|
||||
INSTRUMENT_RELEASE("Arena2048");
|
||||
} else if (bigSize <= 4096) {
|
||||
FastAllocator<4096>::release(this);
|
||||
delete[] reinterpret_cast<uint8_t*>(this);
|
||||
INSTRUMENT_RELEASE("Arena4096");
|
||||
} else if (bigSize <= 8192) {
|
||||
FastAllocator<8192>::release(this);
|
||||
delete[] reinterpret_cast<uint8_t*>(this);
|
||||
INSTRUMENT_RELEASE("Arena8192");
|
||||
} else {
|
||||
#ifdef ALLOC_INSTRUMENTATION
|
||||
allocInstr["ArenaHugeKB"].dealloc((bigSize + 1023) >> 10);
|
||||
#endif
|
||||
g_hugeArenaMemory.fetch_sub(bigSize);
|
||||
delete[](uint8_t*) this;
|
||||
delete[] reinterpret_cast<uint8_t*>(this);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,652 @@
|
|||
/*
|
||||
* BlobCipher.cpp
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "flow/BlobCipher.h"
|
||||
#include "flow/Error.h"
|
||||
#include "flow/FastRef.h"
|
||||
#include "flow/IRandom.h"
|
||||
#include "flow/ITrace.h"
|
||||
#include "flow/network.h"
|
||||
#include "flow/Trace.h"
|
||||
#include "flow/UnitTest.h"
|
||||
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
|
||||
#if ENCRYPTION_ENABLED
|
||||
|
||||
// BlobCipherEncryptHeader
|
||||
BlobCipherEncryptHeader::BlobCipherEncryptHeader() {
|
||||
flags.encryptMode = BLOB_CIPHER_ENCRYPT_MODE_NONE;
|
||||
}
|
||||
|
||||
// BlobCipherKey class methods
|
||||
|
||||
BlobCipherKey::BlobCipherKey(const BlobCipherDomainId& domainId,
|
||||
const BlobCipherBaseKeyId& baseCiphId,
|
||||
const uint8_t* baseCiph,
|
||||
int baseCiphLen) {
|
||||
BlobCipherRandomSalt salt;
|
||||
if (g_network->isSimulated()) {
|
||||
salt = deterministicRandom()->randomUInt64();
|
||||
} else {
|
||||
salt = nondeterministicRandom()->randomUInt64();
|
||||
}
|
||||
initKey(domainId, baseCiph, baseCiphLen, baseCiphId, salt);
|
||||
/*TraceEvent("BlobCipherKey")
|
||||
.detail("DomainId", domainId)
|
||||
.detail("BaseCipherId", baseCipherId)
|
||||
.detail("BaseCipherLen", baseCipherLen)
|
||||
.detail("RandomSalt", randomSalt)
|
||||
.detail("CreationTime", creationTime);*/
|
||||
}
|
||||
|
||||
void BlobCipherKey::initKey(const BlobCipherDomainId& domainId,
|
||||
const uint8_t* baseCiph,
|
||||
int baseCiphLen,
|
||||
const BlobCipherBaseKeyId& baseCiphId,
|
||||
const BlobCipherRandomSalt& salt) {
|
||||
// Set the base encryption key properties
|
||||
baseCipher = std::make_unique<uint8_t[]>(AES_256_KEY_LENGTH);
|
||||
memset(baseCipher.get(), 0, AES_256_KEY_LENGTH);
|
||||
memcpy(baseCipher.get(), baseCiph, std::min<int>(baseCiphLen, AES_256_KEY_LENGTH));
|
||||
baseCipherLen = baseCiphLen;
|
||||
baseCipherId = baseCiphId;
|
||||
// Set the encryption domain for the base encryption key
|
||||
encryptDomainId = domainId;
|
||||
randomSalt = salt;
|
||||
// derive the encryption key
|
||||
cipher = std::make_unique<uint8_t[]>(AES_256_KEY_LENGTH);
|
||||
memset(cipher.get(), 0, AES_256_KEY_LENGTH);
|
||||
applyHmacSha256Derivation();
|
||||
// update the key creation time
|
||||
creationTime = now();
|
||||
}
|
||||
|
||||
void BlobCipherKey::applyHmacSha256Derivation() {
|
||||
Arena arena;
|
||||
uint8_t buf[baseCipherLen + sizeof(BlobCipherRandomSalt)];
|
||||
memcpy(&buf[0], baseCipher.get(), baseCipherLen);
|
||||
memcpy(&buf[0] + baseCipherLen, &randomSalt, sizeof(BlobCipherRandomSalt));
|
||||
HmacSha256DigestGen hmacGen(baseCipher.get(), baseCipherLen);
|
||||
StringRef digest = hmacGen.digest(&buf[0], baseCipherLen + sizeof(BlobCipherRandomSalt), arena);
|
||||
std::copy(digest.begin(), digest.end(), cipher.get());
|
||||
if (digest.size() < AES_256_KEY_LENGTH) {
|
||||
memcpy(cipher.get() + digest.size(), buf, AES_256_KEY_LENGTH - digest.size());
|
||||
}
|
||||
}
|
||||
|
||||
void BlobCipherKey::reset() {
|
||||
memset(baseCipher.get(), 0, baseCipherLen);
|
||||
memset(cipher.get(), 0, AES_256_KEY_LENGTH);
|
||||
}
|
||||
|
||||
// BlobKeyIdCache class methods
|
||||
|
||||
BlobCipherKeyIdCache::BlobCipherKeyIdCache()
|
||||
: domainId(INVALID_DOMAIN_ID), latestBaseCipherKeyId(INVALID_CIPHER_KEY_ID) {}
|
||||
|
||||
BlobCipherKeyIdCache::BlobCipherKeyIdCache(BlobCipherDomainId dId)
|
||||
: domainId(dId), latestBaseCipherKeyId(INVALID_CIPHER_KEY_ID) {
|
||||
TraceEvent("Init_BlobCipherKeyIdCache").detail("DomainId", domainId);
|
||||
}
|
||||
|
||||
Reference<BlobCipherKey> BlobCipherKeyIdCache::getLatestCipherKey() {
|
||||
return getCipherByBaseCipherId(latestBaseCipherKeyId);
|
||||
}
|
||||
|
||||
Reference<BlobCipherKey> BlobCipherKeyIdCache::getCipherByBaseCipherId(BlobCipherBaseKeyId baseCipherKeyId) {
|
||||
BlobCipherKeyIdCacheMapCItr itr = keyIdCache.find(baseCipherKeyId);
|
||||
if (itr == keyIdCache.end()) {
|
||||
throw encrypt_key_not_found();
|
||||
}
|
||||
return itr->second;
|
||||
}
|
||||
|
||||
void BlobCipherKeyIdCache::insertBaseCipherKey(BlobCipherBaseKeyId baseCipherId,
|
||||
const uint8_t* baseCipher,
|
||||
int baseCipherLen) {
|
||||
ASSERT(baseCipherId > INVALID_CIPHER_KEY_ID);
|
||||
|
||||
// BaseCipherKeys are immutable; ensure that the cached value doesn't get updated.
|
||||
BlobCipherKeyIdCacheMapCItr itr = keyIdCache.find(baseCipherId);
|
||||
if (itr != keyIdCache.end()) {
|
||||
if (memcmp(itr->second->rawBaseCipher(), baseCipher, baseCipherLen) == 0) {
|
||||
TraceEvent("InsertBaseCipherKey_AlreadyPresent")
|
||||
.detail("BaseCipherKeyId", baseCipherId)
|
||||
.detail("DomainId", domainId);
|
||||
// Key is already present; nothing more to do.
|
||||
return;
|
||||
} else {
|
||||
TraceEvent("InsertBaseCipherKey_UpdateCipher")
|
||||
.detail("BaseCipherKeyId", baseCipherId)
|
||||
.detail("DomainId", domainId);
|
||||
throw encrypt_update_cipher();
|
||||
}
|
||||
}
|
||||
|
||||
keyIdCache.emplace(baseCipherId, makeReference<BlobCipherKey>(domainId, baseCipherId, baseCipher, baseCipherLen));
|
||||
// Update the latest BaseCipherKeyId for the given encryption domain
|
||||
latestBaseCipherKeyId = baseCipherId;
|
||||
}
|
||||
|
||||
void BlobCipherKeyIdCache::cleanup() {
|
||||
for (auto& keyItr : keyIdCache) {
|
||||
keyItr.second->reset();
|
||||
}
|
||||
|
||||
keyIdCache.clear();
|
||||
}
|
||||
|
||||
std::vector<Reference<BlobCipherKey>> BlobCipherKeyIdCache::getAllCipherKeys() {
|
||||
std::vector<Reference<BlobCipherKey>> cipherKeys;
|
||||
for (auto& keyItr : keyIdCache) {
|
||||
cipherKeys.push_back(keyItr.second);
|
||||
}
|
||||
return cipherKeys;
|
||||
}
|
||||
|
||||
// BlobCipherKeyCache class methods
|
||||
|
||||
void BlobCipherKeyCache::insertCipherKey(const BlobCipherDomainId& domainId,
|
||||
const BlobCipherBaseKeyId& baseCipherId,
|
||||
const uint8_t* baseCipher,
|
||||
int baseCipherLen) {
|
||||
if (domainId == INVALID_DOMAIN_ID || baseCipherId == INVALID_CIPHER_KEY_ID) {
|
||||
throw encrypt_invalid_id();
|
||||
}
|
||||
|
||||
try {
|
||||
auto domainItr = domainCacheMap.find(domainId);
|
||||
if (domainItr == domainCacheMap.end()) {
|
||||
// Add mapping to track new encryption domain
|
||||
Reference<BlobCipherKeyIdCache> keyIdCache = makeReference<BlobCipherKeyIdCache>(domainId);
|
||||
keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen);
|
||||
domainCacheMap.emplace(domainId, keyIdCache);
|
||||
} else {
|
||||
// Track new baseCipher keys
|
||||
Reference<BlobCipherKeyIdCache> keyIdCache = domainItr->second;
|
||||
keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen);
|
||||
}
|
||||
|
||||
TraceEvent("InsertCipherKey").detail("DomainId", domainId).detail("BaseCipherKeyId", baseCipherId);
|
||||
} catch (Error& e) {
|
||||
TraceEvent("InsertCipherKey_Failed").detail("BaseCipherKeyId", baseCipherId).detail("DomainId", domainId);
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
Reference<BlobCipherKey> BlobCipherKeyCache::getLatestCipherKey(const BlobCipherDomainId& domainId) {
|
||||
auto domainItr = domainCacheMap.find(domainId);
|
||||
if (domainItr == domainCacheMap.end()) {
|
||||
TraceEvent("GetLatestCipherKey_DomainNotFound").detail("DomainId", domainId);
|
||||
throw encrypt_key_not_found();
|
||||
}
|
||||
|
||||
Reference<BlobCipherKeyIdCache> keyIdCache = domainItr->second;
|
||||
Reference<BlobCipherKey> cipherKey = keyIdCache->getLatestCipherKey();
|
||||
if ((now() - cipherKey->getCreationTime()) > BlobCipherKeyCache::CIPHER_KEY_CACHE_TTL_SEC) {
|
||||
TraceEvent("GetLatestCipherKey_ExpiredTTL")
|
||||
.detail("DomainId", domainId)
|
||||
.detail("BaseCipherId", cipherKey->getBaseCipherId());
|
||||
throw encrypt_key_ttl_expired();
|
||||
}
|
||||
|
||||
return cipherKey;
|
||||
}
|
||||
|
||||
Reference<BlobCipherKey> BlobCipherKeyCache::getCipherKey(const BlobCipherDomainId& domainId,
|
||||
const BlobCipherBaseKeyId& baseCipherId) {
|
||||
auto domainItr = domainCacheMap.find(domainId);
|
||||
if (domainItr == domainCacheMap.end()) {
|
||||
throw encrypt_key_not_found();
|
||||
}
|
||||
|
||||
Reference<BlobCipherKeyIdCache> keyIdCache = domainItr->second;
|
||||
return keyIdCache->getCipherByBaseCipherId(baseCipherId);
|
||||
}
|
||||
|
||||
void BlobCipherKeyCache::resetEncyrptDomainId(const BlobCipherDomainId domainId) {
|
||||
auto domainItr = domainCacheMap.find(domainId);
|
||||
if (domainItr == domainCacheMap.end()) {
|
||||
throw encrypt_key_not_found();
|
||||
}
|
||||
|
||||
Reference<BlobCipherKeyIdCache> keyIdCache = domainItr->second;
|
||||
keyIdCache->cleanup();
|
||||
TraceEvent("ResetEncryptDomainId").detail("DomainId", domainId);
|
||||
}
|
||||
|
||||
void BlobCipherKeyCache::cleanup() noexcept {
|
||||
BlobCipherKeyCache& instance = BlobCipherKeyCache::getInstance();
|
||||
for (auto& domainItr : instance.domainCacheMap) {
|
||||
Reference<BlobCipherKeyIdCache> keyIdCache = domainItr.second;
|
||||
keyIdCache->cleanup();
|
||||
TraceEvent("BlobCipherKeyCache_Cleanup").detail("DomainId", domainItr.first);
|
||||
}
|
||||
|
||||
instance.domainCacheMap.clear();
|
||||
}
|
||||
|
||||
std::vector<Reference<BlobCipherKey>> BlobCipherKeyCache::getAllCiphers(const BlobCipherDomainId& domainId) {
|
||||
auto domainItr = domainCacheMap.find(domainId);
|
||||
if (domainItr == domainCacheMap.end()) {
|
||||
return {};
|
||||
}
|
||||
|
||||
Reference<BlobCipherKeyIdCache> keyIdCache = domainItr->second;
|
||||
return keyIdCache->getAllCipherKeys();
|
||||
}
|
||||
|
||||
// EncryptBlobCipher class methods
|
||||
|
||||
EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey> key,
|
||||
const uint8_t* cipherIV,
|
||||
const int ivLen)
|
||||
: ctx(EVP_CIPHER_CTX_new()), cipherKey(key) {
|
||||
ASSERT(ivLen == AES_256_IV_LENGTH);
|
||||
memcpy(&iv[0], cipherIV, ivLen);
|
||||
|
||||
if (ctx == nullptr) {
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
if (EVP_EncryptInit_ex(ctx, EVP_aes_256_ctr(), nullptr, nullptr, nullptr) != 1) {
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
if (EVP_EncryptInit_ex(ctx, nullptr, nullptr, key.getPtr()->data(), cipherIV) != 1) {
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
}
|
||||
|
||||
Reference<EncryptBuf> EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plaintext,
|
||||
const int plaintextLen,
|
||||
BlobCipherEncryptHeader* header,
|
||||
Arena& arena) {
|
||||
TEST(true); // Encrypting data with BlobCipher
|
||||
|
||||
Reference<EncryptBuf> encryptBuf = makeReference<EncryptBuf>(plaintextLen + AES_BLOCK_SIZE, arena);
|
||||
uint8_t* ciphertext = encryptBuf->begin();
|
||||
int bytes{ 0 };
|
||||
if (EVP_EncryptUpdate(ctx, ciphertext, &bytes, plaintext, plaintextLen) != 1) {
|
||||
TraceEvent("Encrypt_UpdateFailed")
|
||||
.detail("BaseCipherId", cipherKey->getBaseCipherId())
|
||||
.detail("EncryptDomainId", cipherKey->getDomainId());
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
|
||||
int finalBytes{ 0 };
|
||||
if (EVP_EncryptFinal_ex(ctx, ciphertext + bytes, &finalBytes) != 1) {
|
||||
TraceEvent("Encrypt_FinalFailed")
|
||||
.detail("BaseCipherId", cipherKey->getBaseCipherId())
|
||||
.detail("EncryptDomainId", cipherKey->getDomainId());
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
|
||||
if ((bytes + finalBytes) != plaintextLen) {
|
||||
TraceEvent("Encrypt_UnexpectedCipherLen")
|
||||
.detail("PlaintextLen", plaintextLen)
|
||||
.detail("EncryptedBufLen", bytes + finalBytes);
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
|
||||
// populate header details for the encrypted blob.
|
||||
header->flags.size = sizeof(BlobCipherEncryptHeader);
|
||||
header->flags.headerVersion = EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION;
|
||||
header->flags.encryptMode = BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR;
|
||||
header->baseCipherId = cipherKey->getBaseCipherId();
|
||||
header->encryptDomainId = cipherKey->getDomainId();
|
||||
header->salt = cipherKey->getSalt();
|
||||
memcpy(&header->iv[0], &iv[0], AES_256_IV_LENGTH);
|
||||
|
||||
// Preserve a checksum of the encrypted bytes in the header; this protects against disk-induced bit-rot/flip
// scenarios. AES CTR mode doesn't generate a 'tag' by default, unlike schemes such as AES-256-GCM.
|
||||
|
||||
header->ciphertextChecksum = computeEncryptChecksum(ciphertext, bytes + finalBytes, cipherKey->getSalt(), arena);
|
||||
|
||||
encryptBuf->setLogicalSize(plaintextLen);
|
||||
return encryptBuf;
|
||||
}
|
||||
|
||||
EncryptBlobCipherAes265Ctr::~EncryptBlobCipherAes265Ctr() {
|
||||
if (ctx != nullptr) {
|
||||
EVP_CIPHER_CTX_free(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
// DecryptBlobCipher class methods
|
||||
|
||||
DecryptBlobCipherAes256Ctr::DecryptBlobCipherAes256Ctr(Reference<BlobCipherKey> key, const uint8_t* iv)
|
||||
: ctx(EVP_CIPHER_CTX_new()) {
|
||||
if (ctx == nullptr) {
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_ctr(), nullptr, nullptr, nullptr)) {
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
if (!EVP_DecryptInit_ex(ctx, nullptr, nullptr, key.getPtr()->data(), iv)) {
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
}
|
||||
|
||||
void DecryptBlobCipherAes256Ctr::verifyEncryptBlobHeader(const uint8_t* ciphertext,
|
||||
const int ciphertextLen,
|
||||
const BlobCipherEncryptHeader& header,
|
||||
Arena& arena) {
|
||||
// validate header flag sanity
|
||||
if (header.flags.headerVersion != EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION ||
|
||||
header.flags.encryptMode != BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR) {
|
||||
TraceEvent("VerifyEncryptBlobHeader")
|
||||
.detail("HeaderVersion", header.flags.headerVersion)
|
||||
.detail("HeaderMode", header.flags.encryptMode)
|
||||
.detail("ExpectedVersion", EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION)
|
||||
.detail("ExpectedMode", BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR);
|
||||
throw encrypt_header_metadata_mismatch();
|
||||
}
|
||||
|
||||
// encrypted byte checksum sanity; protection against data bit-rot/flip.
|
||||
BlobCipherChecksum computed = computeEncryptChecksum(ciphertext, ciphertextLen, header.salt, arena);
|
||||
if (computed != header.ciphertextChecksum) {
|
||||
TraceEvent("VerifyEncryptBlobHeader_ChecksumMismatch")
|
||||
.detail("HeaderVersion", header.flags.headerVersion)
|
||||
.detail("HeaderMode", header.flags.encryptMode)
|
||||
.detail("CiphertextChecksum", header.ciphertextChecksum)
|
||||
.detail("ComputedCiphertextChecksum", computed);
|
||||
throw encrypt_header_checksum_mismatch();
|
||||
}
|
||||
}
|
||||
|
||||
Reference<EncryptBuf> DecryptBlobCipherAes256Ctr::decrypt(const uint8_t* ciphertext,
|
||||
const int ciphertextLen,
|
||||
const BlobCipherEncryptHeader& header,
|
||||
Arena& arena) {
|
||||
TEST(true); // Decrypting data with BlobCipher
|
||||
|
||||
verifyEncryptBlobHeader(ciphertext, ciphertextLen, header, arena);
|
||||
|
||||
Reference<EncryptBuf> decrypted = makeReference<EncryptBuf>(ciphertextLen + AES_BLOCK_SIZE, arena);
|
||||
uint8_t* plaintext = decrypted->begin();
|
||||
int bytesDecrypted{ 0 };
|
||||
if (!EVP_DecryptUpdate(ctx, plaintext, &bytesDecrypted, ciphertext, ciphertextLen)) {
|
||||
TraceEvent("Decrypt_UpdateFailed")
|
||||
.detail("BaseCipherId", header.baseCipherId)
|
||||
.detail("EncryptDomainId", header.encryptDomainId);
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
|
||||
int finalBlobBytes{ 0 };
|
||||
if (EVP_DecryptFinal_ex(ctx, plaintext + bytesDecrypted, &finalBlobBytes) <= 0) {
|
||||
TraceEvent("Decrypt_FinalFailed")
|
||||
.detail("BaseCipherId", header.baseCipherId)
|
||||
.detail("EncryptDomainId", header.encryptDomainId);
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
|
||||
if ((bytesDecrypted + finalBlobBytes) != ciphertextLen) {
|
||||
TraceEvent("Encrypt_UnexpectedPlaintextLen")
|
||||
.detail("CiphertextLen", ciphertextLen)
|
||||
.detail("DecryptedBufLen", bytesDecrypted + finalBlobBytes);
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
|
||||
decrypted->setLogicalSize(ciphertextLen);
|
||||
return decrypted;
|
||||
}
|
||||
|
||||
DecryptBlobCipherAes256Ctr::~DecryptBlobCipherAes256Ctr() {
|
||||
if (ctx != nullptr) {
|
||||
EVP_CIPHER_CTX_free(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
// HmacSha256DigestGen class methods
|
||||
|
||||
HmacSha256DigestGen::HmacSha256DigestGen(const unsigned char* key, size_t len) : ctx(HMAC_CTX_new()) {
|
||||
if (!HMAC_Init_ex(ctx, key, len, EVP_sha256(), nullptr)) {
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
}
|
||||
|
||||
HmacSha256DigestGen::~HmacSha256DigestGen() {
|
||||
if (ctx != nullptr) {
|
||||
HMAC_CTX_free(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
StringRef HmacSha256DigestGen::digest(const unsigned char* data, size_t len, Arena& arena) {
|
||||
TEST(true); // Digest generation
|
||||
unsigned int digestLen = HMAC_size(ctx);
|
||||
auto digest = new (arena) unsigned char[digestLen];
|
||||
if (HMAC_Update(ctx, data, len) != 1) {
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
|
||||
if (HMAC_Final(ctx, digest, &digestLen) != 1) {
|
||||
throw encrypt_ops_error();
|
||||
}
|
||||
return StringRef(digest, digestLen);
|
||||
}
|
||||
|
||||
// Only used to link unit tests
|
||||
void forceLinkBlobCipherTests() {}
|
||||
|
||||
// Test cases include:
// 1. Populate the cache by inserting 'baseCipher' details for new encryptionDomainIds.
// 2. Random lookups for cipherKeys and content validation.
// 3. Inserting an 'identical' cipherKey (already cached) more than once works as desired.
// 4. Inserting a 'non-identical' cipherKey (already cached) more than once works as desired.
// 5. Validation of encryption ops (correctness):
// 5.1. Encrypt a buffer followed by decryption of the buffer, validate the contents.
// 5.2. Simulate anomalies such as: EncryptionHeader corruption, checksum mismatch / encryptionMode mismatch etc.
// 6. Cache cleanup:
// 6.1. Cleanup cipherKeys for a given encryptDomainId.
// 6.2. Cleanup all cached cipherKeys.
|
||||
TEST_CASE("flow/BlobCipher") {
|
||||
TraceEvent("BlobCipherTest_Start").log();
|
||||
// Construct a dummy External Key Manager representation and populate it with some keys
|
||||
class BaseCipher : public ReferenceCounted<BaseCipher>, NonCopyable {
|
||||
public:
|
||||
BlobCipherDomainId domainId;
|
||||
int len;
|
||||
BlobCipherBaseKeyId keyId;
|
||||
std::unique_ptr<uint8_t[]> key;
|
||||
|
||||
BaseCipher(const BlobCipherDomainId& dId, const BlobCipherBaseKeyId& kId)
|
||||
: domainId(dId), len(deterministicRandom()->randomInt(AES_256_KEY_LENGTH / 2, AES_256_KEY_LENGTH + 1)),
|
||||
keyId(kId), key(std::make_unique<uint8_t[]>(len)) {
|
||||
generateRandomData(key.get(), len);
|
||||
}
|
||||
};
|
||||
|
||||
using BaseKeyMap = std::unordered_map<BlobCipherBaseKeyId, Reference<BaseCipher>>;
|
||||
using DomainKeyMap = std::unordered_map<BlobCipherDomainId, BaseKeyMap>;
|
||||
DomainKeyMap domainKeyMap;
|
||||
const BlobCipherDomainId minDomainId = 1;
|
||||
const BlobCipherDomainId maxDomainId = deterministicRandom()->randomInt(minDomainId, minDomainId + 10) + 5;
|
||||
const BlobCipherBaseKeyId minBaseCipherKeyId = 100;
|
||||
const BlobCipherBaseKeyId maxBaseCipherKeyId =
|
||||
deterministicRandom()->randomInt(minBaseCipherKeyId, minBaseCipherKeyId + 50) + 15;
|
||||
for (int dId = minDomainId; dId <= maxDomainId; dId++) {
|
||||
for (int kId = minBaseCipherKeyId; kId <= maxBaseCipherKeyId; kId++) {
|
||||
domainKeyMap[dId].emplace(kId, makeReference<BaseCipher>(dId, kId));
|
||||
}
|
||||
}
|
||||
ASSERT(domainKeyMap.size() == maxDomainId);
|
||||
|
||||
// insert BlobCipher keys into BlobCipherKeyCache map and validate
|
||||
TraceEvent("BlobCipherTest_InsertKeys").log();
|
||||
BlobCipherKeyCache& cipherKeyCache = BlobCipherKeyCache::getInstance();
|
||||
for (auto& domainItr : domainKeyMap) {
|
||||
for (auto& baseKeyItr : domainItr.second) {
|
||||
Reference<BaseCipher> baseCipher = baseKeyItr.second;
|
||||
|
||||
cipherKeyCache.insertCipherKey(
|
||||
baseCipher->domainId, baseCipher->keyId, baseCipher->key.get(), baseCipher->len);
|
||||
}
|
||||
}
|
||||
TraceEvent("BlobCipherTest_InsertKeysDone").log();
|
||||
|
||||
// validate the cipherKey lookups work as desired
|
||||
for (auto& domainItr : domainKeyMap) {
|
||||
for (auto& baseKeyItr : domainItr.second) {
|
||||
Reference<BaseCipher> baseCipher = baseKeyItr.second;
|
||||
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getCipherKey(baseCipher->domainId, baseCipher->keyId);
|
||||
ASSERT(cipherKey.isValid());
|
||||
// validate common cipher properties - domainId, baseCipherId, baseCipherLen, rawBaseCipher
|
||||
ASSERT(cipherKey->getBaseCipherId() == baseCipher->keyId);
|
||||
ASSERT(cipherKey->getDomainId() == baseCipher->domainId);
|
||||
ASSERT(cipherKey->getBaseCipherLen() == baseCipher->len);
|
||||
// ensure that baseCipher matches with the cached information
|
||||
ASSERT(std::memcmp(cipherKey->rawBaseCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()) == 0);
|
||||
// validate the encryption derivation
|
||||
ASSERT(std::memcmp(cipherKey->rawCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()) != 0);
|
||||
}
|
||||
}
|
||||
TraceEvent("BlobCipherTest_LooksupDone").log();
|
||||
|
||||
// Ensure attempting to insert an existing cipherKey (identical) more than once is treated as a NOP
|
||||
try {
|
||||
Reference<BaseCipher> baseCipher = domainKeyMap[minDomainId][minBaseCipherKeyId];
|
||||
cipherKeyCache.insertCipherKey(baseCipher->domainId, baseCipher->keyId, baseCipher->key.get(), baseCipher->len);
|
||||
} catch (Error& e) {
|
||||
throw;
|
||||
}
|
||||
TraceEvent("BlobCipherTest_ReinsertIdempotentKeyDone").log();
|
||||
|
||||
// Ensure attempting to insert an existing cipherKey (modified) fails with an appropriate error
|
||||
try {
|
||||
Reference<BaseCipher> baseCipher = domainKeyMap[minDomainId][minBaseCipherKeyId];
|
||||
uint8_t rawCipher[baseCipher->len];
|
||||
memcpy(rawCipher, baseCipher->key.get(), baseCipher->len);
|
||||
// modify a few bytes in the cipherKey
|
||||
for (int i = 2; i < 5; i++) {
|
||||
rawCipher[i]++;
|
||||
}
|
||||
cipherKeyCache.insertCipherKey(baseCipher->domainId, baseCipher->keyId, &rawCipher[0], baseCipher->len);
|
||||
} catch (Error& e) {
|
||||
if (e.code() != error_code_encrypt_update_cipher) {
|
||||
throw;
|
||||
}
|
||||
}
|
||||
TraceEvent("BlobCipherTest_ReinsertNonIdempotentKeyDone").log();
|
||||
|
||||
// Validate encryption ops
|
||||
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getLatestCipherKey(minDomainId);
|
||||
const int bufLen = deterministicRandom()->randomInt(786, 2127) + 512;
|
||||
uint8_t orgData[bufLen];
|
||||
generateRandomData(&orgData[0], bufLen);
|
||||
|
||||
Arena arena;
|
||||
uint8_t iv[AES_256_IV_LENGTH];
|
||||
generateRandomData(&iv[0], AES_256_IV_LENGTH);
|
||||
|
||||
// validate basic encrypt followed by decrypt operation
|
||||
EncryptBlobCipherAes265Ctr encryptor(cipherKey, iv, AES_256_IV_LENGTH);
|
||||
BlobCipherEncryptHeader header;
|
||||
Reference<EncryptBuf> encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
|
||||
|
||||
ASSERT(encrypted->getLogicalSize() == bufLen);
|
||||
ASSERT(memcmp(&orgData[0], encrypted->begin(), bufLen) != 0);
|
||||
ASSERT(header.flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
|
||||
ASSERT(header.flags.encryptMode == BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR);
|
||||
|
||||
TraceEvent("BlobCipherTest_EncryptDone")
|
||||
.detail("HeaderVersion", header.flags.headerVersion)
|
||||
.detail("HeaderEncryptMode", header.flags.encryptMode)
|
||||
.detail("DomainId", header.encryptDomainId)
|
||||
.detail("BaseCipherId", header.baseCipherId)
|
||||
.detail("HeaderChecksum", header.ciphertextChecksum);
|
||||
|
||||
Reference<BlobCipherKey> encyrptKey = cipherKeyCache.getCipherKey(header.encryptDomainId, header.baseCipherId);
|
||||
ASSERT(encyrptKey->isEqual(cipherKey));
|
||||
DecryptBlobCipherAes256Ctr decryptor(encyrptKey, &header.iv[0]);
|
||||
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
|
||||
|
||||
ASSERT(decrypted->getLogicalSize() == bufLen);
|
||||
ASSERT(memcmp(decrypted->begin(), &orgData[0], bufLen) == 0);
|
||||
|
||||
TraceEvent("BlobCipherTest_DecryptDone").log();
|
||||
|
||||
// induce encryption header corruption - headerVersion corrupted
|
||||
header.flags.headerVersion += 1;
|
||||
try {
|
||||
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
|
||||
} catch (Error& e) {
|
||||
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
|
||||
throw;
|
||||
}
|
||||
header.flags.headerVersion -= 1;
|
||||
}
|
||||
|
||||
// induce encryption header corruption - encryptionMode corrupted
|
||||
header.flags.encryptMode += 1;
|
||||
try {
|
||||
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
|
||||
} catch (Error& e) {
|
||||
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
|
||||
throw;
|
||||
}
|
||||
header.flags.encryptMode -= 1;
|
||||
}
|
||||
|
||||
// induce encryption header corruption - checksum mismatch
|
||||
header.ciphertextChecksum += 1;
|
||||
try {
|
||||
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
|
||||
} catch (Error& e) {
|
||||
if (e.code() != error_code_encrypt_header_checksum_mismatch) {
|
||||
throw;
|
||||
}
|
||||
header.ciphertextChecksum -= 1;
|
||||
}
|
||||
|
||||
// Validate dropping cached keys for a given encryptDomainId
|
||||
const BlobCipherDomainId candidate = deterministicRandom()->randomInt(minDomainId, maxDomainId);
|
||||
cipherKeyCache.resetEncyrptDomainId(candidate);
|
||||
std::vector<Reference<BlobCipherKey>> cachedKeys = cipherKeyCache.getAllCiphers(candidate);
|
||||
ASSERT(cachedKeys.empty());
|
||||
|
||||
// Validate dropping all cached cipherKeys
|
||||
cipherKeyCache.cleanup();
|
||||
for (int dId = minDomainId; dId < maxDomainId; dId++) {
|
||||
std::vector<Reference<BlobCipherKey>> cachedKeys = cipherKeyCache.getAllCiphers(dId);
|
||||
ASSERT(cachedKeys.empty());
|
||||
}
|
||||
|
||||
TraceEvent("BlobCipherTest_Done").log();
|
||||
return Void();
|
||||
}
|
||||
|
||||
BlobCipherChecksum computeEncryptChecksum(const uint8_t* payload,
|
||||
const int payloadLen,
|
||||
const BlobCipherRandomSalt& salt,
|
||||
Arena& arena) {
|
||||
// The FIPS compliance recommendation is to leverage a cryptographic digest mechanism to generate the checksum.
// Leverage HMAC_SHA256 using header.randomSalt as the initialization 'key' for the HMAC digest.
|
||||
|
||||
HmacSha256DigestGen hmacGenerator((const uint8_t*)&salt, sizeof(salt));
|
||||
StringRef digest = hmacGenerator.digest(payload, payloadLen, arena);
|
||||
ASSERT(digest.size() >= sizeof(BlobCipherChecksum));
|
||||
|
||||
BlobCipherChecksum checksum;
|
||||
memcpy((uint8_t*)&checksum, digest.begin(), sizeof(BlobCipherChecksum));
|
||||
return checksum;
|
||||
}
|
||||
|
||||
#endif // ENCRYPTION_ENABLED
|
|
@ -0,0 +1,321 @@
|
|||
/*
|
||||
* BlobCipher.h
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#if (!defined(TLS_DISABLED) && !defined(_WIN32))
|
||||
#define ENCRYPTION_ENABLED 1
|
||||
#else
|
||||
#define ENCRYPTION_ENABLED 0
|
||||
#endif
|
||||
|
||||
#if ENCRYPTION_ENABLED
|
||||
|
||||
#include "flow/Arena.h"
|
||||
#include "flow/FastRef.h"
|
||||
#include "flow/flow.h"
|
||||
#include "flow/xxhash.h"
|
||||
|
||||
#include <openssl/aes.h>
|
||||
#include <openssl/engine.h>
|
||||
#include <openssl/evp.h>
|
||||
#include <openssl/hmac.h>
|
||||
#include <openssl/sha.h>
|
||||
|
||||
#define AES_256_KEY_LENGTH 32
|
||||
#define AES_256_IV_LENGTH 16
|
||||
#define INVALID_DOMAIN_ID 0
|
||||
#define INVALID_CIPHER_KEY_ID 0
|
||||
|
||||
using BlobCipherDomainId = uint64_t;
|
||||
using BlobCipherRandomSalt = uint64_t;
|
||||
using BlobCipherBaseKeyId = uint64_t;
|
||||
using BlobCipherChecksum = uint64_t;
|
||||
|
||||
typedef enum { BLOB_CIPHER_ENCRYPT_MODE_NONE = 0, BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR = 1 } BlockCipherEncryptMode;
|
||||
|
||||
// Encryption operations buffer management
// The approach limits the number of copies needed during encryption or decryption operations.
// For encryption, EncryptBuf is allocated using the client-supplied Arena and handed to the AES library to capture
// the ciphertext. Similarly, on decryption, EncryptBuf is allocated using the client-supplied Arena and handed
// to the AES library to capture the deciphered text, which is then passed back to the client. Given the object
// passed around is reference-counted, it gets freed once its referenceCount goes to 0.
// (An illustrative allocation sketch follows the class definition below.)
|
||||
|
||||
class EncryptBuf : public ReferenceCounted<EncryptBuf>, NonCopyable {
|
||||
public:
|
||||
EncryptBuf(int size, Arena& arena) : allocSize(size), logicalSize(size) {
|
||||
if (size > 0) {
|
||||
buffer = new (arena) uint8_t[size];
|
||||
} else {
|
||||
buffer = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
int getLogicalSize() { return logicalSize; }
|
||||
void setLogicalSize(int value) {
|
||||
ASSERT(value <= allocSize);
|
||||
logicalSize = value;
|
||||
}
|
||||
uint8_t* begin() { return buffer; }
|
||||
|
||||
private:
|
||||
int allocSize;
|
||||
int logicalSize;
|
||||
uint8_t* buffer;
|
||||
};
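// Editor-added illustrative sketch, not part of the original change: how the cipher classes below size an
// EncryptBuf. The buffer is over-allocated by AES_BLOCK_SIZE to give OpenSSL headroom, and the logical size is
// trimmed back to the payload length once the cipher operation completes. The helper name is hypothetical.
inline Reference<EncryptBuf> exampleAllocateEncryptBuf(int payloadLen, Arena& arena) {
    Reference<EncryptBuf> buf = makeReference<EncryptBuf>(payloadLen + AES_BLOCK_SIZE, arena);
    // ... the AES library writes up to payloadLen + AES_BLOCK_SIZE bytes starting at buf->begin() ...
    buf->setLogicalSize(payloadLen); // callers only consume the logical portion
    return buf;
}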
|
||||
|
||||
// BlobCipher encryption header format
// This header is persisted along with the encrypted buffer; it contains the information necessary
// to decrypt the buffer when serving read requests.
//
// The total space overhead is 56 bytes (see the size check after the struct below).
|
||||
|
||||
#pragma pack(push, 1) // exact fit - no padding
|
||||
typedef struct BlobCipherEncryptHeader {
|
||||
union {
|
||||
struct {
|
||||
uint8_t size; // reading first byte is sufficient to determine header
|
||||
// length. ALWAYS THE FIRST HEADER ELEMENT.
|
||||
uint8_t headerVersion{};
|
||||
uint8_t encryptMode{};
|
||||
uint8_t _reserved[5]{};
|
||||
} flags;
|
||||
uint64_t _padding{};
|
||||
};
|
||||
// Encryption domain boundary identifier.
|
||||
BlobCipherDomainId encryptDomainId{};
|
||||
// BaseCipher encryption key identifier
|
||||
BlobCipherBaseKeyId baseCipherId{};
|
||||
// Random salt
|
||||
BlobCipherRandomSalt salt{};
|
||||
// Checksum of the encrypted buffer. It protects against 'tampering' of ciphertext as well 'bit rots/flips'.
|
||||
BlobCipherChecksum ciphertextChecksum{};
|
||||
// Initialization vector used to encrypt the payload.
|
||||
uint8_t iv[AES_256_IV_LENGTH];
|
||||
|
||||
BlobCipherEncryptHeader();
|
||||
} BlobCipherEncryptHeader;
|
||||
#pragma pack(pop)
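// Editor-added sanity sketch, not part of the original change: the packed layout above accounts for the
// 56-byte overhead quoted in the header comment (8-byte flags union + three 8-byte ids/salt + 8-byte checksum
// + 16-byte IV).
static_assert(sizeof(BlobCipherEncryptHeader) == 56, "BlobCipherEncryptHeader layout changed");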
|
||||
|
||||
// This class is the in-memory representation of a CipherKey used for encryption/decryption.
// It caches the base encryption key properties as well as the 'derived encryption' key obtained by applying
// the HMAC-SHA-256 derivation technique.
|
||||
|
||||
class BlobCipherKey : public ReferenceCounted<BlobCipherKey>, NonCopyable {
|
||||
public:
|
||||
BlobCipherKey(const BlobCipherDomainId& domainId,
|
||||
const BlobCipherBaseKeyId& baseCiphId,
|
||||
const uint8_t* baseCiph,
|
||||
int baseCiphLen);
|
||||
|
||||
uint8_t* data() const { return cipher.get(); }
|
||||
uint64_t getCreationTime() const { return creationTime; }
|
||||
BlobCipherDomainId getDomainId() const { return encryptDomainId; }
|
||||
BlobCipherRandomSalt getSalt() const { return randomSalt; }
|
||||
BlobCipherBaseKeyId getBaseCipherId() const { return baseCipherId; }
|
||||
int getBaseCipherLen() const { return baseCipherLen; }
|
||||
uint8_t* rawCipher() const { return cipher.get(); }
|
||||
uint8_t* rawBaseCipher() const { return baseCipher.get(); }
|
||||
bool isEqual(const Reference<BlobCipherKey> toCompare) {
|
||||
return encryptDomainId == toCompare->getDomainId() && baseCipherId == toCompare->getBaseCipherId() &&
|
||||
randomSalt == toCompare->getSalt() && baseCipherLen == toCompare->getBaseCipherLen() &&
|
||||
memcmp(cipher.get(), toCompare->rawCipher(), AES_256_KEY_LENGTH) == 0 &&
|
||||
memcmp(baseCipher.get(), toCompare->rawBaseCipher(), baseCipherLen) == 0;
|
||||
}
|
||||
void reset();
|
||||
|
||||
private:
|
||||
// Encryption domain boundary identifier
|
||||
BlobCipherDomainId encryptDomainId;
|
||||
// Base encryption cipher key properties
|
||||
std::unique_ptr<uint8_t[]> baseCipher;
|
||||
int baseCipherLen;
|
||||
BlobCipherBaseKeyId baseCipherId;
|
||||
// Random salt used for encryption cipher key derivation
|
||||
BlobCipherRandomSalt randomSalt;
|
||||
// Creation timestamp for the derived encryption cipher key
|
||||
uint64_t creationTime;
|
||||
// Derived encryption cipher key
|
||||
std::unique_ptr<uint8_t[]> cipher;
|
||||
|
||||
void initKey(const BlobCipherDomainId& domainId,
|
||||
const uint8_t* baseCiph,
|
||||
int baseCiphLen,
|
||||
const BlobCipherBaseKeyId& baseCiphId,
|
||||
const BlobCipherRandomSalt& salt);
|
||||
void applyHmacSha256Derivation();
|
||||
};
|
||||
|
||||
// This interface allows FDB processes participating in encryption to store and
|
||||
// index recently used encyption cipher keys. FDB encryption has two dimensions:
|
||||
// 1. Mapping on cipher encryption keys per "encryption domains"
|
||||
// 2. Per encryption domain, the cipher keys are index using "baseCipherKeyId".
|
||||
//
|
||||
// The design supports NIST recommendation of limiting lifetime of an encryption
|
||||
// key. For details refer to:
|
||||
// https://csrc.nist.gov/publications/detail/sp/800-57-part-1/rev-3/archive/2012-07-10
|
||||
//
|
||||
// Below gives a pictoral representation of in-memory datastructure implemented
|
||||
// to index encryption keys:
|
||||
// { encryptionDomain -> { baseCipherId -> cipherKey } }
|
||||
//
|
||||
// Supported cache lookups schemes:
|
||||
// 1. Lookup cipher based on { encryptionDomainId, baseCipherKeyId } tuple.
|
||||
// 2. Lookup latest cipher key for a given encryptionDomainId.
|
||||
//
|
||||
// Client is responsible to handle cache-miss usecase, the corrective operation
|
||||
// might vary based on the calling process, for instance: EncryptKeyServer
|
||||
// cache-miss shall invoke RPC to external Encryption Key Manager to fetch the
|
||||
// required encryption key, however, CPs/SSs cache-miss would result in RPC to
|
||||
// EncryptKeyServer to refresh the desired encryption key.
|
||||
|
||||
using BlobCipherKeyIdCacheMap = std::unordered_map<BlobCipherBaseKeyId, Reference<BlobCipherKey>>;
|
||||
using BlobCipherKeyIdCacheMapCItr = std::unordered_map<BlobCipherBaseKeyId, Reference<BlobCipherKey>>::const_iterator;
|
||||
|
||||
struct BlobCipherKeyIdCache : ReferenceCounted<BlobCipherKeyIdCache> {
|
||||
public:
|
||||
BlobCipherKeyIdCache();
|
||||
explicit BlobCipherKeyIdCache(BlobCipherDomainId dId);
|
||||
|
||||
// API returns the last inserted cipherKey.
|
||||
// If none exists, 'encrypt_key_not_found' is thrown.
|
||||
Reference<BlobCipherKey> getLatestCipherKey();
|
||||
// API returns cipherKey corresponding to input 'baseCipherKeyId'.
|
||||
// If none exists, 'encrypt_key_not_found' is thrown.
|
||||
Reference<BlobCipherKey> getCipherByBaseCipherId(BlobCipherBaseKeyId baseCipherKeyId);
|
||||
// API enables inserting base encryption cipher details to the BlobCipherKeyIdCache.
|
||||
// Given cipherKeys are immutable, attempting to re-insert the same 'identical' cipherKey
// is treated as a NOP (success); however, an attempt to update the cipherKey throws an
// 'encrypt_update_cipher' exception.
|
||||
void insertBaseCipherKey(BlobCipherBaseKeyId baseCipherId, const uint8_t* baseCipher, int baseCipherLen);
|
||||
// API cleans up the cache by dropping all cached cipherKeys
|
||||
void cleanup();
|
||||
// API returns list of all 'cached' cipherKeys
|
||||
std::vector<Reference<BlobCipherKey>> getAllCipherKeys();
|
||||
|
||||
private:
|
||||
BlobCipherDomainId domainId;
|
||||
BlobCipherKeyIdCacheMap keyIdCache;
|
||||
BlobCipherBaseKeyId latestBaseCipherKeyId;
|
||||
};
|
||||
|
||||
using BlobCipherDomainCacheMap = std::unordered_map<BlobCipherDomainId, Reference<BlobCipherKeyIdCache>>;
|
||||
|
||||
class BlobCipherKeyCache : NonCopyable {
|
||||
public:
|
||||
// Enable clients to insert base encryption cipher details to the BlobCipherKeyCache.
|
||||
// The cipherKeys are indexed using 'baseCipherId'. Given cipherKeys are immutable,
// attempting to re-insert the same 'identical' cipherKey is treated as a NOP (success);
// however, an attempt to update the cipherKey throws an 'encrypt_update_cipher' exception.
|
||||
void insertCipherKey(const BlobCipherDomainId& domainId,
|
||||
const BlobCipherBaseKeyId& baseCipherId,
|
||||
const uint8_t* baseCipher,
|
||||
int baseCipherLen);
|
||||
// API returns the last inserted cipherKey for a given encryption domain Id.
|
||||
// If none exists, it would throw 'encrypt_key_not_found' exception.
|
||||
Reference<BlobCipherKey> getLatestCipherKey(const BlobCipherDomainId& domainId);
|
||||
// API returns cipherKey corresponding to {encryptionDomainId, baseCipherId} tuple.
|
||||
// If none exists, it would throw 'encrypt_key_not_found' exception.
|
||||
Reference<BlobCipherKey> getCipherKey(const BlobCipherDomainId& domainId, const BlobCipherBaseKeyId& baseCipherId);
|
||||
// API returns a point-in-time list of all 'cached' cipherKeys for a given encryption domainId.
|
||||
std::vector<Reference<BlobCipherKey>> getAllCiphers(const BlobCipherDomainId& domainId);
|
||||
// API enables dropping all 'cached' cipherKeys for a given encryption domain Id.
|
||||
// Useful to clean up the cache if an encryption domain gets removed/destroyed etc.
|
||||
void resetEncyrptDomainId(const BlobCipherDomainId domainId);
|
||||
|
||||
static BlobCipherKeyCache& getInstance() {
|
||||
static BlobCipherKeyCache instance;
|
||||
return instance;
|
||||
}
|
||||
// Ensures cached encryption key(s) (plaintext) never gets persisted as part
|
||||
// of FDB process/core dump.
|
||||
static void cleanup() noexcept;
|
||||
|
||||
private:
|
||||
BlobCipherDomainCacheMap domainCacheMap;
|
||||
static constexpr uint64_t CIPHER_KEY_CACHE_TTL_SEC = 10 * 60L;
|
||||
|
||||
BlobCipherKeyCache() {}
|
||||
};
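// Editor-added usage sketch, not part of the original change. It exercises the two lookup schemes documented
// above; the function name is hypothetical, and 'domainId', 'baseCipherId', 'baseCipher' and 'baseCipherLen'
// are assumed to come from the caller (e.g. fetched from the external Encryption Key Manager).
inline Reference<BlobCipherKey> exampleCacheUsage(const BlobCipherDomainId& domainId,
                                                  const BlobCipherBaseKeyId& baseCipherId,
                                                  const uint8_t* baseCipher,
                                                  int baseCipherLen) {
    BlobCipherKeyCache& cache = BlobCipherKeyCache::getInstance();
    // NOP if the identical key is already cached; updating an existing baseCipherId throws 'encrypt_update_cipher'.
    cache.insertCipherKey(domainId, baseCipherId, baseCipher, baseCipherLen);
    // Lookup scheme 1: by { encryptionDomainId, baseCipherKeyId } tuple.
    Reference<BlobCipherKey> byId = cache.getCipherKey(domainId, baseCipherId);
    // Lookup scheme 2: latest key for the encryption domain (throws 'encrypt_key_ttl_expired' once stale).
    Reference<BlobCipherKey> latest = cache.getLatestCipherKey(domainId);
    ASSERT(byId.isValid() && latest.isValid());
    return latest;
}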
|
||||
|
||||
// This interface enables data block encryption. An invocation to encrypt() will
|
||||
// do two things:
|
||||
// 1) generate encrypted ciphertext for given plaintext input.
|
||||
// 2) generate a BlobCipherEncryptHeader (including the 'header checksum') and persist it for decryption on reads.
|
||||
|
||||
class EncryptBlobCipherAes265Ctr final : NonCopyable, public ReferenceCounted<EncryptBlobCipherAes265Ctr> {
|
||||
public:
|
||||
static constexpr uint8_t ENCRYPT_HEADER_VERSION = 1;
|
||||
|
||||
EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey> key, const uint8_t* iv, const int ivLen);
|
||||
~EncryptBlobCipherAes265Ctr();
|
||||
Reference<EncryptBuf> encrypt(const uint8_t* plaintext,
|
||||
const int plaintextLen,
|
||||
BlobCipherEncryptHeader* header,
|
||||
Arena&);
|
||||
|
||||
private:
|
||||
EVP_CIPHER_CTX* ctx;
|
||||
Reference<BlobCipherKey> cipherKey;
|
||||
uint8_t iv[AES_256_IV_LENGTH];
|
||||
};
|
||||
|
||||
// This interface enables data block decryption. An invocation to decrypt() generates
// 'plaintext' for a given 'ciphertext' input; the caller needs to supply the BlobCipherEncryptHeader.
// (A round-trip sketch follows the class definition below.)
|
||||
|
||||
class DecryptBlobCipherAes256Ctr final : NonCopyable, public ReferenceCounted<DecryptBlobCipherAes256Ctr> {
|
||||
public:
|
||||
DecryptBlobCipherAes256Ctr(Reference<BlobCipherKey> key, const uint8_t* iv);
|
||||
~DecryptBlobCipherAes256Ctr();
|
||||
Reference<EncryptBuf> decrypt(const uint8_t* ciphertext,
|
||||
const int ciphertextLen,
|
||||
const BlobCipherEncryptHeader& header,
|
||||
Arena&);
|
||||
|
||||
private:
|
||||
EVP_CIPHER_CTX* ctx;
|
||||
|
||||
void verifyEncryptBlobHeader(const uint8_t* cipherText,
|
||||
const int ciphertextLen,
|
||||
const BlobCipherEncryptHeader& header,
|
||||
Arena& arena);
|
||||
};
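// Editor-added round-trip sketch, not part of the original change. It mirrors the unit test in BlobCipher.cpp:
// encrypt a payload, then decrypt it using the header produced by the encryptor. The function name is
// hypothetical; the caller is assumed to supply a valid cipherKey (e.g. from BlobCipherKeyCache) and a random
// IV of AES_256_IV_LENGTH bytes.
inline Reference<EncryptBuf> exampleEncryptDecryptRoundTrip(Reference<BlobCipherKey> cipherKey,
                                                            const uint8_t* iv,
                                                            const uint8_t* plaintext,
                                                            int plaintextLen,
                                                            Arena& arena) {
    BlobCipherEncryptHeader header;
    EncryptBlobCipherAes265Ctr encryptor(cipherKey, iv, AES_256_IV_LENGTH);
    Reference<EncryptBuf> encrypted = encryptor.encrypt(plaintext, plaintextLen, &header, arena);

    // The header carries everything needed on the read path: domainId, baseCipherId, salt, checksum and IV.
    DecryptBlobCipherAes256Ctr decryptor(cipherKey, &header.iv[0]);
    return decryptor.decrypt(encrypted->begin(), encrypted->getLogicalSize(), header, arena);
}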
|
||||
|
||||
class HmacSha256DigestGen final : NonCopyable {
|
||||
public:
|
||||
HmacSha256DigestGen(const unsigned char* key, size_t len);
|
||||
~HmacSha256DigestGen();
|
||||
HMAC_CTX* getCtx() const { return ctx; }
|
||||
StringRef digest(unsigned char const* data, size_t len, Arena&);
|
||||
|
||||
private:
|
||||
HMAC_CTX* ctx;
|
||||
};
|
||||
|
||||
BlobCipherChecksum computeEncryptChecksum(const uint8_t* payload,
|
||||
const int payloadLen,
|
||||
const BlobCipherRandomSalt& salt,
|
||||
Arena& arena);
|
||||
|
||||
#endif // ENCRYPTION_ENABLED
|
|
@ -8,6 +8,8 @@ set(FLOW_SRCS
|
|||
ArgParseUtil.h
|
||||
AsioReactor.h
|
||||
BooleanParam.h
|
||||
BlobCipher.h
|
||||
BlobCipher.cpp
|
||||
CompressedInt.actor.cpp
|
||||
CompressedInt.h
|
||||
Deque.cpp
|
||||
|
|
|
@ -210,13 +210,24 @@ public:
|
|||
if (s != sizeof(Object))
|
||||
abort();
|
||||
INSTRUMENT_ALLOCATE(typeid(Object).name());
|
||||
void* p = FastAllocator < sizeof(Object) <= 64 ? 64 : nextFastAllocatedSize(sizeof(Object)) > ::allocate();
|
||||
return p;
|
||||
|
||||
if constexpr (sizeof(Object) <= 256) {
|
||||
void* p = FastAllocator < sizeof(Object) <= 64 ? 64 : nextFastAllocatedSize(sizeof(Object)) > ::allocate();
|
||||
return p;
|
||||
} else {
|
||||
void* p = new uint8_t[nextFastAllocatedSize(sizeof(Object))];
|
||||
return p;
|
||||
}
|
||||
}
|
||||
|
||||
static void operator delete(void* s) {
|
||||
INSTRUMENT_RELEASE(typeid(Object).name());
|
||||
FastAllocator<sizeof(Object) <= 64 ? 64 : nextFastAllocatedSize(sizeof(Object))>::release(s);
|
||||
|
||||
if constexpr (sizeof(Object) <= 256) {
|
||||
FastAllocator<sizeof(Object) <= 64 ? 64 : nextFastAllocatedSize(sizeof(Object))>::release(s);
|
||||
} else {
|
||||
delete[] reinterpret_cast<uint8_t*>(s);
|
||||
}
|
||||
}
|
||||
// Redefine placement new so you can still use it
|
||||
static void* operator new(size_t, void* p) { return p; }
|
||||
|
@ -236,18 +247,6 @@ public:
|
|||
return FastAllocator<128>::allocate();
|
||||
if (size <= 256)
|
||||
return FastAllocator<256>::allocate();
|
||||
if (size <= 512)
|
||||
return FastAllocator<512>::allocate();
|
||||
if (size <= 1024)
|
||||
return FastAllocator<1024>::allocate();
|
||||
if (size <= 2048)
|
||||
return FastAllocator<2048>::allocate();
|
||||
if (size <= 4096)
|
||||
return FastAllocator<4096>::allocate();
|
||||
if (size <= 8192)
|
||||
return FastAllocator<8192>::allocate();
|
||||
if (size <= 16384)
|
||||
return FastAllocator<16384>::allocate();
|
||||
return new uint8_t[size];
|
||||
}
|
||||
|
||||
|
@ -264,21 +263,11 @@ inline void freeFast(int size, void* ptr) {
|
|||
return FastAllocator<128>::release(ptr);
|
||||
if (size <= 256)
|
||||
return FastAllocator<256>::release(ptr);
|
||||
if (size <= 512)
|
||||
return FastAllocator<512>::release(ptr);
|
||||
if (size <= 1024)
|
||||
return FastAllocator<1024>::release(ptr);
|
||||
if (size <= 2048)
|
||||
return FastAllocator<2048>::release(ptr);
|
||||
if (size <= 4096)
|
||||
return FastAllocator<4096>::release(ptr);
|
||||
if (size <= 8192)
|
||||
return FastAllocator<8192>::release(ptr);
|
||||
if (size <= 16384)
|
||||
return FastAllocator<16384>::release(ptr);
|
||||
delete[](uint8_t*) ptr;
|
||||
}
|
||||
|
||||
// Allocate a block of memory aligned to 4096 bytes. Size must be a multiple of
|
||||
// 4096. Guaranteed not to return null. Use freeFast4kAligned to free.
|
||||
[[nodiscard]] inline void* allocateFast4kAligned(int size) {
|
||||
#if !defined(USE_JEMALLOC)
|
||||
// Use FastAllocator for sizes it supports to avoid internal fragmentation in some implementations of aligned_alloc
|
||||
|
@ -296,6 +285,7 @@ inline void freeFast(int size, void* ptr) {
|
|||
return result;
|
||||
}
|
||||
|
||||
// Free a pointer returned from allocateFast4kAligned(size)
|
||||
inline void freeFast4kAligned(int size, void* ptr) {
|
||||
#if !defined(USE_JEMALLOC)
|
||||
// Sizes supported by FastAllocator must be release via FastAllocator
|
||||
|
|
|
@ -33,6 +33,7 @@
|
|||
|
||||
#if (!defined(TLS_DISABLED) && !defined(_WIN32))
|
||||
#include "flow/StreamCipher.h"
|
||||
#include "flow/BlobCipher.h"
|
||||
#endif
|
||||
#include "flow/Trace.h"
|
||||
#include "flow/Error.h"
|
||||
|
@ -3501,6 +3502,7 @@ void crashHandler(int sig) {
|
|||
#if (!defined(TLS_DISABLED) && !defined(_WIN32))
|
||||
StreamCipherKey::cleanup();
|
||||
StreamCipher::cleanup();
|
||||
BlobCipherKeyCache::cleanup();
|
||||
#endif
|
||||
|
||||
fflush(stdout);
|
||||
|
|
|
@ -284,6 +284,15 @@ ERROR( snap_log_anti_quorum_unsupported, 2507, "Unsupported when log anti quorum
|
|||
ERROR( snap_with_recovery_unsupported, 2508, "Cluster recovery during snapshot operation not supported")
|
||||
ERROR( snap_invalid_uid_string, 2509, "The given uid string is not a 32-length hex string")
|
||||
|
||||
// 3XXX - Encryption operations errors
|
||||
ERROR( encrypt_ops_error, 3000, "Encryption operation error")
|
||||
ERROR( encrypt_header_metadata_mismatch, 3001, "Encryption header metadata mismatch")
|
||||
ERROR( encrypt_key_not_found, 3002, "Expected encryption key is missing")
|
||||
ERROR( encrypt_key_ttl_expired, 3003, "Expected encryption key TTL has expired")
|
||||
ERROR( encrypt_header_checksum_mismatch, 3004, "Encryption header checksum mismatch")
|
||||
ERROR( encrypt_update_cipher, 3005, "Attempt to update encryption cipher key")
|
||||
ERROR( encrypt_invalid_id, 3006, "Invalid encryption domainId or encryption cipher key id")
|
||||
|
||||
// 4xxx Internal errors (those that should be generated only by bugs) are decimal 4xxx
|
||||
ERROR( unknown_error, 4000, "An unknown error occurred" ) // C++ exception not of type Error
|
||||
ERROR( internal_error, 4100, "An internal error occurred" )
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
*/
|
||||
|
||||
#include "flow/flat_buffers.h"
|
||||
#include "flow/FileIdentifier.h"
|
||||
#include "flow/UnitTest.h"
|
||||
#include "flow/Arena.h"
|
||||
#include "flow/serialize.h"
|
||||
|
@ -26,6 +27,7 @@
|
|||
|
||||
#include <algorithm>
|
||||
#include <iomanip>
|
||||
#include <unordered_set>
|
||||
#include <variant>
|
||||
|
||||
namespace detail {
|
||||
|
@ -361,6 +363,7 @@ struct string_serialized_traits<Void> : std::true_type {
|
|||
namespace unit_tests {
|
||||
|
||||
struct Y1 {
|
||||
constexpr static FileIdentifier file_identifier = 338229;
|
||||
int a;
|
||||
|
||||
template <class Archiver>
|
||||
|
@ -369,6 +372,14 @@ struct Y1 {
|
|||
}
|
||||
};
|
||||
|
||||
struct Y1Hasher {
|
||||
std::size_t operator()(const Y1& y) const noexcept { return std::hash<int>()(y.a); }
|
||||
};
|
||||
|
||||
struct Y1Equal {
|
||||
bool operator()(const Y1& l, const Y1& r) const { return l.a == r.a; }
|
||||
};
|
||||
|
||||
struct Y2 {
|
||||
int a;
|
||||
std::variant<int> b;
|
||||
|
@ -563,4 +574,43 @@ TEST_CASE("/flow/FlatBuffers/EmptyPreSerVectorRefs") {
|
|||
return Void();
|
||||
}
|
||||
|
||||
TEST_CASE("/flow/FlatBuffers/EmptyUnorderedSet") {
|
||||
int kSize = deterministicRandom()->randomInt(0, 100);
|
||||
Standalone<StringRef> msg =
|
||||
ObjectWriter::toValue(std::vector<std::unordered_set<Y1, Y1Hasher, Y1Equal>>(kSize), Unversioned());
|
||||
ObjectReader rd(msg.begin(), Unversioned());
|
||||
std::vector<std::unordered_set<Y1, Y1Hasher, Y1Equal>> xs;
|
||||
rd.deserialize(xs);
|
||||
ASSERT(xs.size() == kSize);
|
||||
for (const auto& x : xs) {
|
||||
ASSERT(x.size() == 0);
|
||||
}
|
||||
return Void();
|
||||
}
|
||||
|
||||
TEST_CASE("/flow/FlatBuffers/NonEmptyUnorderedSet") {
|
||||
int kSize = deterministicRandom()->randomInt(0, 100);
|
||||
std::vector<std::unordered_set<Y1, Y1Hasher, Y1Equal>> src;
|
||||
std::unordered_set<Y1, Y1Hasher, Y1Equal> s;
|
||||
for (int i = 0; i < kSize; i++) {
|
||||
Y1 y;
|
||||
y.a = i;
|
||||
s.insert(y);
|
||||
}
|
||||
src.push_back(s);
|
||||
|
||||
Standalone<StringRef> msg = ObjectWriter::toValue(src, Unversioned());
|
||||
ObjectReader rd(msg.begin(), Unversioned());
|
||||
std::vector<std::unordered_set<Y1, Y1Hasher, Y1Equal>> xs;
|
||||
rd.deserialize(xs);
|
||||
ASSERT(xs.size() == 1);
|
||||
ASSERT(xs[0].size() == kSize);
|
||||
for (int i = 0; i < kSize; i++) {
|
||||
Y1 y;
|
||||
y.a = i;
|
||||
ASSERT(xs[0].find(y) != xs[0].end());
|
||||
}
|
||||
return Void();
|
||||
}
|
||||
|
||||
} // namespace unit_tests
|
||||
|
|
|
@ -35,6 +35,7 @@
|
|||
#include <cstring>
|
||||
#include <array>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <deque>
|
||||
#include "flow/FileIdentifier.h"
|
||||
#include "flow/ObjectSerializerTraits.h"
|
||||
|
@ -250,6 +251,31 @@ struct vector_like_traits<std::set<Key, Compare, Allocator>> : std::true_type {
|
|||
return v.begin();
|
||||
}
|
||||
};
|
||||
template <class Key, class Hash, class KeyEqual, class Allocator>
|
||||
struct vector_like_traits<std::unordered_set<Key, Hash, KeyEqual, Allocator>> : std::true_type {
|
||||
using Vec = std::unordered_set<Key, Hash, KeyEqual, Allocator>;
|
||||
using value_type = Key;
|
||||
using iterator = typename Vec::const_iterator;
|
||||
using insert_iterator = std::insert_iterator<Vec>;
|
||||
|
||||
template <class Context>
|
||||
static size_t num_entries(const Vec& v, Context&) {
|
||||
return v.size();
|
||||
}
|
||||
template <class Context>
|
||||
static void reserve(Vec& v, size_t size, Context&) {
|
||||
v.reserve(size);
|
||||
}
|
||||
|
||||
template <class Context>
|
||||
static insert_iterator insert(Vec& v, Context&) {
|
||||
return std::inserter(v, v.end());
|
||||
}
|
||||
template <class Context>
|
||||
static iterator begin(const Vec& v, Context&) {
|
||||
return v.begin();
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct dynamic_size_traits<std::string> : std::true_type {
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
|
||||
#ifndef FLOW_SERIALIZE_H
|
||||
#define FLOW_SERIALIZE_H
|
||||
#include <unordered_set>
|
||||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
|
@ -172,6 +173,13 @@ template <class T, class Allocator>
|
|||
struct CompositionDepthFor<std::vector<T, Allocator>> : std::integral_constant<int, CompositionDepthFor<T>::value + 1> {
|
||||
};
|
||||
|
||||
template <class Key, class Hash, class KeyEqual, class Allocator>
|
||||
struct FileIdentifierFor<std::unordered_set<Key, Hash, KeyEqual, Allocator>> : ComposedIdentifierExternal<Key, 6> {};
|
||||
|
||||
template <class Key, class Hash, class KeyEqual, class Allocator>
|
||||
struct CompositionDepthFor<std::unordered_set<Key, Hash, KeyEqual, Allocator>>
|
||||
: std::integral_constant<int, CompositionDepthFor<Key>::value + 1> {};
|
||||
|
||||
template <class Archive, class T>
|
||||
inline void save(Archive& ar, const std::vector<T>& value) {
|
||||
ar << (int)value.size();
|
||||
|
@ -762,9 +770,6 @@ private:
|
|||
public:
|
||||
static PacketBuffer* create(size_t size = 0) {
|
||||
size = std::max(size, PACKET_BUFFER_MIN_SIZE - PACKET_BUFFER_OVERHEAD);
|
||||
if (size == PACKET_BUFFER_MIN_SIZE - PACKET_BUFFER_OVERHEAD) {
|
||||
return new (FastAllocator<PACKET_BUFFER_MIN_SIZE>::allocate()) PacketBuffer{ size };
|
||||
}
|
||||
uint8_t* mem = new uint8_t[size + PACKET_BUFFER_OVERHEAD];
|
||||
return new (mem) PacketBuffer{ size };
|
||||
}
|
||||
|
@ -772,11 +777,7 @@ public:
|
|||
void addref() { ++reference_count; }
|
||||
void delref() {
|
||||
if (!--reference_count) {
|
||||
if (size_ == PACKET_BUFFER_MIN_SIZE - PACKET_BUFFER_OVERHEAD) {
|
||||
FastAllocator<PACKET_BUFFER_MIN_SIZE>::release(this);
|
||||
} else {
|
||||
delete[] this;
|
||||
}
|
||||
delete[] reinterpret_cast<uint8_t*>(this);
|
||||
}
|
||||
}
|
||||
int bytes_unwritten() const { return size_ - bytes_written; }
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
[[test]]
|
||||
testTitle = 'BlobGranuleServerCommonUnit'
|
||||
useDB = false
|
||||
startDelay = 0
|
||||
|
||||
[[test.workload]]
|
||||
testName = 'UnitTests'
|
||||
maxTestCases = 0
|
||||
testsMatching = /blobgranule/server/common/
|
|
@ -0,0 +1,10 @@
|
|||
[[test]]
|
||||
testTitle = 'BlobGranuleFileUnit'
|
||||
useDB = false
|
||||
startDelay = 0
|
||||
|
||||
[[test.workload]]
|
||||
testName = 'UnitTests'
|
||||
maxTestCases = 0
|
||||
testsMatching = /blobgranule/files/
|
||||
|
|
@ -1,7 +0,0 @@
|
|||
testTitle=UnitTests
|
||||
startDelay=0
|
||||
useDB=false
|
||||
|
||||
testName=UnitTests
|
||||
maxTestCases=0
|
||||
testsMatching=/blobgranule/
|
|
@ -0,0 +1,9 @@
|
|||
[[test]]
|
||||
testTitle = 'BlobManagerUnit'
|
||||
useDB = false
|
||||
startDelay = 0
|
||||
|
||||
[[test.workload]]
|
||||
testName = 'UnitTests'
|
||||
maxTestCases = 0
|
||||
testsMatching = /blobmanager/
|
|
@ -1,7 +0,0 @@
|
|||
testTitle=UnitTests
|
||||
startDelay=0
|
||||
useDB=false
|
||||
|
||||
testName=UnitTests
|
||||
maxTestCases=0
|
||||
testsMatching=/blobmanager/
|
|
@ -50,8 +50,9 @@ if(WITH_PYTHON)
|
|||
add_fdb_test(TEST_FILES s3VersionHeaders.txt IGNORE)
|
||||
add_fdb_test(TEST_FILES BandwidthThrottle.txt IGNORE)
|
||||
add_fdb_test(TEST_FILES BigInsert.txt IGNORE)
|
||||
add_fdb_test(TEST_FILES BlobGranuleFileUnit.txt)
|
||||
add_fdb_test(TEST_FILES BlobManagerUnit.txt)
|
||||
add_fdb_test(TEST_FILES BGServerCommonUnit.toml)
|
||||
add_fdb_test(TEST_FILES BlobGranuleFileUnit.toml)
|
||||
add_fdb_test(TEST_FILES BlobManagerUnit.toml)
|
||||
add_fdb_test(TEST_FILES ConsistencyCheck.txt IGNORE)
|
||||
add_fdb_test(TEST_FILES DDMetricsExclude.txt IGNORE)
|
||||
add_fdb_test(TEST_FILES DataDistributionMetrics.txt IGNORE)
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
[configuration]
|
||||
blobGranulesEnabled = true
|
||||
# FIXME: re-enable rocks at some point
|
||||
storageEngineExcludeTypes = [4]
|
||||
|
||||
[[test]]
|
||||
testTitle = 'BlobGranuleVerifyAtomicOps'
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
[configuration]
|
||||
blobGranulesEnabled = true
|
||||
# FIXME: re-enable rocks at some point
|
||||
storageEngineExcludeTypes = [4]
|
||||
|
||||
[[test]]
|
||||
testTitle = 'BlobGranuleVerifyCycle'
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
[configuration]
|
||||
blobGranulesEnabled = true
|
||||
storageEngineExcludeTypes = [3] # FIXME: exclude redwood because WriteDuringRead can write massive KV pairs and we don't chunk change feed data on disk yet
|
||||
# FIXME: exclude redwood because WriteDuringRead can write massive KV pairs and we don't chunk change feed data on disk yet
|
||||
# FIXME: re-enable rocks at some point
|
||||
storageEngineExcludeTypes = [3, 4]
|
||||
|
||||
[[test]]
|
||||
testTitle = 'BlobGranuleVerifySmall'
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
[configuration]
|
||||
blobGranulesEnabled = true
|
||||
storageEngineExcludeTypes = [3] # FIXME: exclude redwood because WriteDuringRead can write massive KV pairs and we don't chunk change feed data on disk yet
|
||||
# FIXME: exclude redwood because WriteDuringRead can write massive KV pairs and we don't chunk change feed data on disk yet
|
||||
# FIXME: re-enable rocks at some point
|
||||
storageEngineExcludeTypes = [3, 4]
|
||||
|
||||
[[test]]
|
||||
testTitle = 'BlobGranuleVerifySmallClean'
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
[configuration]
|
||||
blobGranulesEnabled = true
|
||||
# FIXME: re-enable rocks at some point
|
||||
storageEngineExcludeTypes = [4]
|
||||
|
||||
[[test]]
|
||||
testTitle = 'BlobGranuleCorrectness'
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
[configuration]
|
||||
blobGranulesEnabled = true
|
||||
# FIXME: re-enable rocks at some point
|
||||
storageEngineExcludeTypes = [4]
|
||||
|
||||
[[test]]
|
||||
testTitle = 'BlobGranuleCorrectness'
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
[configuration]
|
||||
blobGranulesEnabled = true
|
||||
# FIXME: re-enable rocks at some point
|
||||
storageEngineExcludeTypes = [4]
|
||||
|
||||
[[test]]
|
||||
testTitle = 'BlobGranuleVerifyBalance'
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
[configuration]
|
||||
blobGranulesEnabled = true
|
||||
# FIXME: re-enable rocks at some point
|
||||
storageEngineExcludeTypes = [4]
|
||||
|
||||
[[test]]
|
||||
testTitle = 'BlobGranuleVerifyBalanceClean'
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
[configuration]
|
||||
blobGranulesEnabled = true
|
||||
# FIXME: re-enable rocks at some point
|
||||
storageEngineExcludeTypes = [4]
|
||||
|
||||
[[test]]
|
||||
testTitle = 'BlobGranuleVerifyLarge'
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
[configuration]
|
||||
blobGranulesEnabled = true
|
||||
# FIXME: re-enable rocks at some point
|
||||
storageEngineExcludeTypes = [4]
|
||||
|
||||
[[test]]
|
||||
testTitle = 'BlobGranuleVerifyLargeClean'
|
||||
|
|