Merge commit '478ff1eb76bc88201b6803b8b8fb5ad9d0bcc040' into aggressive-storage-migration

Steve Atherton 2022-03-28 10:10:32 -07:00
commit 8cf40f86e6
81 changed files with 2757 additions and 454 deletions

View File

@ -58,8 +58,8 @@ _java_cmd = 'java -ea -cp %s:%s com.apple.foundationdb.test.' % (
# We could set min_api_version lower on some of these if the testers were updated to support them
testers = {
'python': Tester('python', 'python ' + _absolute_path('python/tests/tester.py'), 2040, 23, MAX_API_VERSION, types=ALL_TYPES),
'python3': Tester('python3', 'python3 ' + _absolute_path('python/tests/tester.py'), 2040, 23, MAX_API_VERSION, types=ALL_TYPES),
'python': Tester('python', 'python ' + _absolute_path('python/tests/tester.py'), 2040, 23, MAX_API_VERSION, types=ALL_TYPES, tenants_enabled=True),
'python3': Tester('python3', 'python3 ' + _absolute_path('python/tests/tester.py'), 2040, 23, MAX_API_VERSION, types=ALL_TYPES, tenants_enabled=True),
'ruby': Tester('ruby', _absolute_path('ruby/tests/tester.rb'), 2040, 23, MAX_API_VERSION),
'java': Tester('java', _java_cmd + 'StackTester', 2040, 510, MAX_API_VERSION, types=ALL_TYPES),
'java_async': Tester('java', _java_cmd + 'AsyncStackTester', 2040, 510, MAX_API_VERSION, types=ALL_TYPES),

View File

@ -135,6 +135,7 @@ if(NOT WIN32)
add_executable(fdb_c_performance_test test/performance_test.c test/test.h)
add_executable(fdb_c_ryw_benchmark test/ryw_benchmark.c test/test.h)
add_executable(fdb_c_txn_size_test test/txn_size_test.c test/test.h)
add_executable(fdb_c_client_memory_test test/client_memory_test.cpp test/unit/fdb_api.cpp test/unit/fdb_api.hpp)
add_executable(mako ${MAKO_SRCS})
add_executable(fdb_c_setup_tests test/unit/setup_tests.cpp)
add_executable(fdb_c_unit_tests ${UNIT_TEST_SRCS})
@ -145,10 +146,12 @@ if(NOT WIN32)
strip_debug_symbols(fdb_c_performance_test)
strip_debug_symbols(fdb_c_ryw_benchmark)
strip_debug_symbols(fdb_c_txn_size_test)
strip_debug_symbols(fdb_c_client_memory_test)
endif()
target_link_libraries(fdb_c_performance_test PRIVATE fdb_c Threads::Threads)
target_link_libraries(fdb_c_ryw_benchmark PRIVATE fdb_c Threads::Threads)
target_link_libraries(fdb_c_txn_size_test PRIVATE fdb_c Threads::Threads)
target_link_libraries(fdb_c_client_memory_test PRIVATE fdb_c Threads::Threads)
add_dependencies(fdb_c_setup_tests doctest)
add_dependencies(fdb_c_unit_tests doctest)

View File

@ -835,9 +835,10 @@ extern "C" DLLEXPORT FDBResult* fdb_transaction_read_blob_granules(FDBTransactio
context.get_load_f = granule_context.get_load_f;
context.free_load_f = granule_context.free_load_f;
context.debugNoMaterialize = granule_context.debugNoMaterialize;
context.granuleParallelism = granule_context.granuleParallelism;
Optional<Version> rv;
if (readVersion != invalidVersion) { rv = readVersion; }
if (readVersion != latestVersion) { rv = readVersion; }
return (FDBResult*)(TXN(tr)->readBlobGranules(range, beginVersion, rv, context).extractPtr()););
}

View File

@ -185,7 +185,12 @@ typedef struct readgranulecontext {
void* userContext;
/* Returns a unique id for the load. Asynchronous to support queueing multiple in parallel. */
int64_t (*start_load_f)(const char* filename, int filenameLength, int64_t offset, int64_t length, void* context);
int64_t (*start_load_f)(const char* filename,
int filenameLength,
int64_t offset,
int64_t length,
int64_t fullFileLength,
void* context);
/* Returns data for the load. Pass the loadId returned by start_load_f */
uint8_t* (*get_load_f)(int64_t loadId, void* context);
@ -196,6 +201,9 @@ typedef struct readgranulecontext {
/* Set this to true for testing if you don't want to read the granule files,
just do the request to the blob workers */
fdb_bool_t debugNoMaterialize;
/* Number of granules to load in parallel */
int granuleParallelism;
} FDBReadBlobGranuleContext;
DLLEXPORT void fdb_future_cancel(FDBFuture* f);
@ -447,7 +455,7 @@ DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_blob_granule_ranges(
uint8_t const* end_key_name,
int end_key_name_length);
/* InvalidVersion (-1) for readVersion means get read version from transaction
/* LatestVersion (-2) for readVersion means get read version from transaction
Separated out as optional because BG reads can support longer-lived reads than normal FDB transactions */
DLLEXPORT WARN_UNUSED_RESULT FDBResult* fdb_transaction_read_blob_granules(FDBTransaction* db,
uint8_t const* begin_key_name,

View File

@ -0,0 +1,83 @@
/*
* client_memory_test.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define FDB_API_VERSION 710
#include <foundationdb/fdb_c.h>
#include "unit/fdb_api.hpp"
#include <thread>
#include <iostream>
#include <vector>
void fdb_check(fdb_error_t e) {
if (e) {
std::cerr << fdb_get_error(e) << std::endl;
std::abort();
}
}
FDBDatabase* fdb_open_database(const char* clusterFile) {
FDBDatabase* db;
fdb_check(fdb_create_database(clusterFile, &db));
return db;
}
int main(int argc, char** argv) {
if (argc != 2) {
printf("Usage: %s <cluster_file>", argv[0]);
}
fdb_check(fdb_select_api_version(710));
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };
fdb_check(
fdb_network_set_option(FDBNetworkOption::FDB_NET_OPTION_TRACE_ENABLE, reinterpret_cast<const uint8_t*>(""), 0));
fdb_check(fdb_network_set_option(
FDBNetworkOption::FDB_NET_OPTION_TRACE_FORMAT, reinterpret_cast<const uint8_t*>("json"), 4));
// Use a bunch of memory from different client threads
FDBDatabase* db = fdb_open_database(argv[1]);
auto thread_func = [&]() {
fdb::Transaction tr(db);
for (int i = 0; i < 10000; ++i) {
tr.set(std::to_string(i), std::string(i, '\x00'));
}
tr.cancel();
};
std::vector<std::thread> threads;
constexpr auto kThreadCount = 64;
for (int i = 0; i < kThreadCount; ++i) {
threads.emplace_back(thread_func);
}
for (auto& thread : threads) {
thread.join();
}
fdb_database_destroy(db);
db = nullptr;
// Memory usage should go down now if the allocator is returning memory to the OS. It's expected that something is
// externally monitoring the memory usage of this process during this sleep.
using namespace std::chrono_literals;
std::this_thread::sleep_for(10s);
fdb_check(fdb_stop_network());
network_thread.join();
}

View File

@ -585,6 +585,7 @@ int64_t granule_start_load(const char* filename,
int filenameLength,
int64_t offset,
int64_t length,
int64_t fullFileLength,
void* userContext) {
FILE* fp;
char full_fname[PATH_MAX];
@ -682,6 +683,7 @@ int run_op_read_blob_granules(FDBTransaction* transaction,
granuleContext.get_load_f = &granule_get_load;
granuleContext.free_load_f = &granule_free_load;
granuleContext.debugNoMaterialize = !doMaterialize;
granuleContext.granuleParallelism = 2; // TODO make knob or setting for changing this?
r = fdb_transaction_read_blob_granules(transaction,
(uint8_t*)keystr,
@ -689,7 +691,7 @@ int run_op_read_blob_granules(FDBTransaction* transaction,
(uint8_t*)keystr2,
strlen(keystr2),
0 /* beginVersion*/,
-1, /* endVersion. -1 is use txn read version */
-2, /* endVersion. -2 (latestVersion) is use txn read version */
granuleContext);
free(fileContext.data_by_id);

View File

@ -88,6 +88,7 @@ def api_version(ver):
'predicates',
'Future',
'Database',
'Tenant',
'Transaction',
'KeyValue',
'KeySelector',

View File

@ -34,6 +34,7 @@ import traceback
import fdb
from fdb import six
from fdb.tuple import pack, unpack
_network_thread = None
_network_thread_reentrant_lock = threading.RLock()
@ -198,9 +199,10 @@ def transactional(*tr_args, **tr_kwargs):
one of two actions, depending on the type of the parameter passed
to the function at call time.
If given a Database, a Transaction will be created and passed into
the wrapped code in place of the Database. After the function is
complete, the newly created transaction will be committed.
If given a Database or Tenant, a Transaction will be created and
passed into the wrapped code in place of the Database or Tenant.
After the function is complete, the newly created transaction
will be committed.
It is important to note that the wrapped method may be called
multiple times in the event of a commit failure, until the commit
@ -943,128 +945,114 @@ class FormerFuture(_FDBBase):
except:
pass
class Database(_FDBBase):
def __init__(self, dpointer):
self.dpointer = dpointer
self.options = _DatabaseOptions(self)
def __del__(self):
# print('Destroying database 0x%x' % self.dpointer)
self.capi.fdb_database_destroy(self.dpointer)
class _TransactionCreator(_FDBBase):
def get(self, key):
return Database.__database_getitem(self, key)
return _TransactionCreator.__creator_getitem(self, key)
def __getitem__(self, key):
if isinstance(key, slice):
return self.get_range(key.start, key.stop, reverse=(key.step == -1))
return Database.__database_getitem(self, key)
return _TransactionCreator.__creator_getitem(self, key)
def get_key(self, key_selector):
return Database.__database_get_key(self, key_selector)
return _TransactionCreator.__creator_get_key(self, key_selector)
def get_range(self, begin, end, limit=0, reverse=False, streaming_mode=StreamingMode.want_all):
return Database.__database_get_range(self, begin, end, limit, reverse, streaming_mode)
return _TransactionCreator.__creator_get_range(self, begin, end, limit, reverse, streaming_mode)
def get_range_startswith(self, prefix, *args, **kwargs):
return Database.__database_get_range_startswith(self, prefix, *args, **kwargs)
return _TransactionCreator.__creator_get_range_startswith(self, prefix, *args, **kwargs)
def set(self, key, value):
Database.__database_setitem(self, key, value)
_TransactionCreator.__creator_setitem(self, key, value)
def __setitem__(self, key, value):
Database.__database_setitem(self, key, value)
_TransactionCreator.__creator_setitem(self, key, value)
def clear(self, key):
Database.__database_delitem(self, key)
_TransactionCreator.__creator_delitem(self, key)
def clear_range(self, begin, end):
Database.__database_delitem(self, slice(begin, end))
_TransactionCreator.__creator_delitem(self, slice(begin, end))
def __delitem__(self, key_or_slice):
Database.__database_delitem(self, key_or_slice)
_TransactionCreator.__creator_delitem(self, key_or_slice)
def clear_range_startswith(self, prefix):
Database.__database_clear_range_startswith(self, prefix)
_TransactionCreator.__creator_clear_range_startswith(self, prefix)
def get_and_watch(self, key):
return Database.__database_get_and_watch(self, key)
return _TransactionCreator.__creator_get_and_watch(self, key)
def set_and_watch(self, key, value):
return Database.__database_set_and_watch(self, key, value)
return _TransactionCreator.__creator_set_and_watch(self, key, value)
def clear_and_watch(self, key):
return Database.__database_clear_and_watch(self, key)
return _TransactionCreator.__creator_clear_and_watch(self, key)
def create_transaction(self):
pointer = ctypes.c_void_p()
self.capi.fdb_database_create_transaction(self.dpointer, ctypes.byref(pointer))
return Transaction(pointer.value, self)
def _set_option(self, option, param, length):
self.capi.fdb_database_set_option(self.dpointer, option, param, length)
pass
def _atomic_operation(self, opcode, key, param):
Database.__database_atomic_operation(self, opcode, key, param)
_TransactionCreator.__creator_atomic_operation(self, opcode, key, param)
#### Transaction implementations ####
@staticmethod
@transactional
def __database_getitem(tr, key):
def __creator_getitem(tr, key):
return tr[key].value
@staticmethod
@transactional
def __database_get_key(tr, key_selector):
def __creator_get_key(tr, key_selector):
return tr.get_key(key_selector).value
@staticmethod
@transactional
def __database_get_range(tr, begin, end, limit, reverse, streaming_mode):
def __creator_get_range(tr, begin, end, limit, reverse, streaming_mode):
return tr.get_range(begin, end, limit, reverse, streaming_mode).to_list()
@staticmethod
@transactional
def __database_get_range_startswith(tr, prefix, *args, **kwargs):
def __creator_get_range_startswith(tr, prefix, *args, **kwargs):
return tr.get_range_startswith(prefix, *args, **kwargs).to_list()
@staticmethod
@transactional
def __database_setitem(tr, key, value):
def __creator_setitem(tr, key, value):
tr[key] = value
@staticmethod
@transactional
def __database_clear_range_startswith(tr, prefix):
def __creator_clear_range_startswith(tr, prefix):
tr.clear_range_startswith(prefix)
@staticmethod
@transactional
def __database_get_and_watch(tr, key):
def __creator_get_and_watch(tr, key):
v = tr.get(key)
return v, tr.watch(key)
@staticmethod
@transactional
def __database_set_and_watch(tr, key, value):
def __creator_set_and_watch(tr, key, value):
tr.set(key, value)
return tr.watch(key)
@staticmethod
@transactional
def __database_clear_and_watch(tr, key):
def __creator_clear_and_watch(tr, key):
del tr[key]
return tr.watch(key)
@staticmethod
@transactional
def __database_delitem(tr, key_or_slice):
def __creator_delitem(tr, key_or_slice):
del tr[key_or_slice]
@staticmethod
@transactional
def __database_atomic_operation(tr, opcode, key, param):
def __creator_atomic_operation(tr, opcode, key, param):
tr._atomic_operation(opcode, key, param)
# Asynchronous transactions
@ -1074,11 +1062,11 @@ class Database(_FDBBase):
From = asyncio.From
coroutine = asyncio.coroutine
class Database:
class TransactionCreator:
@staticmethod
@transactional
@coroutine
def __database_getitem(tr, key):
def __creator_getitem(tr, key):
# raise Return(( yield From( tr[key] ) ))
raise Return(tr[key])
yield None
@ -1086,26 +1074,26 @@ class Database(_FDBBase):
@staticmethod
@transactional
@coroutine
def __database_get_key(tr, key_selector):
def __creator_get_key(tr, key_selector):
raise Return(tr.get_key(key_selector))
yield None
@staticmethod
@transactional
@coroutine
def __database_get_range(tr, begin, end, limit, reverse, streaming_mode):
def __creator_get_range(tr, begin, end, limit, reverse, streaming_mode):
raise Return((yield From(tr.get_range(begin, end, limit, reverse, streaming_mode).to_list())))
@staticmethod
@transactional
@coroutine
def __database_get_range_startswith(tr, prefix, *args, **kwargs):
def __creator_get_range_startswith(tr, prefix, *args, **kwargs):
raise Return((yield From(tr.get_range_startswith(prefix, *args, **kwargs).to_list())))
@staticmethod
@transactional
@coroutine
def __database_setitem(tr, key, value):
def __creator_setitem(tr, key, value):
tr[key] = value
raise Return()
yield None
@ -1113,7 +1101,7 @@ class Database(_FDBBase):
@staticmethod
@transactional
@coroutine
def __database_clear_range_startswith(tr, prefix):
def __creator_clear_range_startswith(tr, prefix):
tr.clear_range_startswith(prefix)
raise Return()
yield None
@ -1121,7 +1109,7 @@ class Database(_FDBBase):
@staticmethod
@transactional
@coroutine
def __database_get_and_watch(tr, key):
def __creator_get_and_watch(tr, key):
v = tr.get(key)
raise Return(v, tr.watch(key))
yield None
@ -1129,7 +1117,7 @@ class Database(_FDBBase):
@staticmethod
@transactional
@coroutine
def __database_set_and_watch(tr, key, value):
def __creator_set_and_watch(tr, key, value):
tr.set(key, value)
raise Return(tr.watch(key))
yield None
@ -1137,7 +1125,7 @@ class Database(_FDBBase):
@staticmethod
@transactional
@coroutine
def __database_clear_and_watch(tr, key):
def __creator_clear_and_watch(tr, key):
del tr[key]
raise Return(tr.watch(key))
yield None
@ -1145,7 +1133,7 @@ class Database(_FDBBase):
@staticmethod
@transactional
@coroutine
def __database_delitem(tr, key_or_slice):
def __creator_delitem(tr, key_or_slice):
del tr[key_or_slice]
raise Return()
yield None
@ -1153,11 +1141,101 @@ class Database(_FDBBase):
@staticmethod
@transactional
@coroutine
def __database_atomic_operation(tr, opcode, key, param):
def __creator_atomic_operation(tr, opcode, key, param):
tr._atomic_operation(opcode, key, param)
raise Return()
yield None
return Database
return TransactionCreator
def process_tenant_name(name):
if isinstance(name, tuple):
return pack(name)
elif isinstance(name, bytes):
return name
else:
raise TypeError('Tenant name must be of type ' + bytes.__name__ + ' or of type ' + tuple.__name__)
class Database(_TransactionCreator):
def __init__(self, dpointer):
self.dpointer = dpointer
self.options = _DatabaseOptions(self)
def __del__(self):
# print('Destroying database 0x%x' % self.dpointer)
self.capi.fdb_database_destroy(self.dpointer)
def _set_option(self, option, param, length):
self.capi.fdb_database_set_option(self.dpointer, option, param, length)
def open_tenant(self, name):
tname = process_tenant_name(name)
pointer = ctypes.c_void_p()
self.capi.fdb_database_open_tenant(self.dpointer, tname, len(tname), ctypes.byref(pointer))
return Tenant(pointer.value)
def create_transaction(self):
pointer = ctypes.c_void_p()
self.capi.fdb_database_create_transaction(self.dpointer, ctypes.byref(pointer))
return Transaction(pointer.value, self)
def allocate_tenant(self, name):
Database.__database_allocate_tenant(self, process_tenant_name(name), [])
def delete_tenant(self, name):
Database.__database_delete_tenant(self, process_tenant_name(name), [])
# Attempt to allocate a tenant in the cluster. If the tenant already exists,
# this function will return a tenant_already_exists error. If the tenant is created
# concurrently, then this function may return success even if another caller creates
# it.
#
# The existence_check_marker is expected to be an empty list. This function will
# modify the list after completing the existence check to avoid checking for existence
# on retries. This allows the operation to be idempotent.
@staticmethod
@transactional
def __database_allocate_tenant(tr, name, existence_check_marker):
tr.options.set_special_key_space_enable_writes()
key = b'\xff\xff/management/tenant_map/%s' % name
if not existence_check_marker:
existing_tenant = tr[key].wait()
existence_check_marker.append(None)
if existing_tenant is not None:
raise fdb.FDBError(2132) # tenant_already_exists
tr[key] = b''
# Attempt to remove a tenant in the cluster. If the tenant doesn't exist, this
# function will return a tenant_not_found error. If the tenant is deleted
# concurrently, then this function may return success even if another caller deletes
# it.
#
# The existence_check_marker is expected to be an empty list. This function will
# modify the list after completing the existence check to avoid checking for existence
# on retries. This allows the operation to be idempotent.
@staticmethod
@transactional
def __database_delete_tenant(tr, name, existence_check_marker):
tr.options.set_special_key_space_enable_writes()
key = b'\xff\xff/management/tenant_map/%s' % name
if not existence_check_marker:
existing_tenant = tr[key].wait()
existence_check_marker.append(None)
if existing_tenant is None:
raise fdb.FDBError(2131) # tenant_not_found
del tr[key]
class Tenant(_TransactionCreator):
def __init__(self, tpointer):
self.tpointer = tpointer
def __del__(self):
self.capi.fdb_tenant_destroy(self.tpointer)
def create_transaction(self):
pointer = ctypes.c_void_p()
self.capi.fdb_tenant_create_transaction(self.tpointer, ctypes.byref(pointer))
return Transaction(pointer.value, self)
fill_operations()
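A minimal usage sketch of the classes above, assuming a reachable default cluster and a hypothetical tenant name b'example-tenant'. Because Database and Tenant both derive from _TransactionCreator, the convenience operations and the @fdb.transactional decorator behave the same on either object:

import fdb
fdb.api_version(710)

db = fdb.open()
try:
    db.allocate_tenant(b'example-tenant')
except fdb.FDBError as e:
    if e.code != 2132:  # tenant_already_exists is fine for this sketch
        raise
tenant = db.open_tenant(b'example-tenant')

tenant[b'k'] = b'v'        # one-shot transaction scoped to the tenant's keyspace
db[b'cluster-k'] = b'v'    # one-shot transaction against the whole cluster

@fdb.transactional
def read_back(tr):
    # tr is a Transaction created from whichever object was passed in
    return tr[b'k'].value

assert read_back(tenant) == b'v'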
@ -1458,6 +1536,10 @@ def init_c_api():
_capi.fdb_database_destroy.argtypes = [ctypes.c_void_p]
_capi.fdb_database_destroy.restype = None
_capi.fdb_database_open_tenant.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.POINTER(ctypes.c_void_p)]
_capi.fdb_database_open_tenant.restype = ctypes.c_int
_capi.fdb_database_open_tenant.errcheck = check_error_code
_capi.fdb_database_create_transaction.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p)]
_capi.fdb_database_create_transaction.restype = ctypes.c_int
_capi.fdb_database_create_transaction.errcheck = check_error_code
@ -1466,6 +1548,13 @@ def init_c_api():
_capi.fdb_database_set_option.restype = ctypes.c_int
_capi.fdb_database_set_option.errcheck = check_error_code
_capi.fdb_tenant_destroy.argtypes = [ctypes.c_void_p]
_capi.fdb_tenant_destroy.restype = None
_capi.fdb_tenant_create_transaction.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p)]
_capi.fdb_tenant_create_transaction.restype = ctypes.c_int
_capi.fdb_tenant_create_transaction.errcheck = check_error_code
_capi.fdb_transaction_destroy.argtypes = [ctypes.c_void_p]
_capi.fdb_transaction_destroy.restype = None
@ -1686,10 +1775,10 @@ def init(event_model=None):
raise asyncio.Return(self)
return it()
FDBRange.iterate = iterate
AT = Database.declare_asynchronous_transactions()
AT = _TransactionCreator.declare_asynchronous_transactions()
for name in dir(AT):
if name.startswith("_Database__database_"):
setattr(Database, name, getattr(AT, name))
if name.startswith("__TransactionCreator__creator_"):
setattr(_TransactionCreator, name, getattr(AT, name))
def to_list(self):
if self._mode == StreamingMode.iterator:

View File

@ -0,0 +1,123 @@
#!/usr/bin/python
#
# tenant_tests.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import fdb
import sys
import json
from fdb.tuple import pack
if __name__ == '__main__':
fdb.api_version(710)
def test_tenant_tuple_name(db):
tuplename = (b'test', b'level', b'hierarchy', 3, 1.24, 'str')
db.allocate_tenant(tuplename)
tenant = db.open_tenant(tuplename)
tenant[b'foo'] = b'bar'
assert tenant[b'foo'] == b'bar'
del tenant[b'foo']
db.delete_tenant(tuplename)
def cleanup_tenant(db, tenant_name):
try:
tenant = db.open_tenant(tenant_name)
del tenant[:]
db.delete_tenant(tenant_name)
except fdb.FDBError as e:
if e.code == 2131: # tenant not found
pass
else:
raise
def test_tenant_operations(db):
cleanup_tenant(db, b'tenant1')
cleanup_tenant(db, b'tenant2')
db.allocate_tenant(b'tenant1')
db.allocate_tenant(b'tenant2')
tenant1 = db.open_tenant(b'tenant1')
tenant2 = db.open_tenant(b'tenant2')
db[b'tenant_test_key'] = b'no_tenant'
tenant1[b'tenant_test_key'] = b'tenant1'
tenant2[b'tenant_test_key'] = b'tenant2'
tenant1_entry = db[b'\xff\xff/management/tenant_map/tenant1']
tenant1_json = json.loads(tenant1_entry)
prefix1 = tenant1_json['prefix'].encode('utf8')
tenant2_entry = db[b'\xff\xff/management/tenant_map/tenant2']
tenant2_json = json.loads(tenant2_entry)
prefix2 = tenant2_json['prefix'].encode('utf8')
assert tenant1[b'tenant_test_key'] == b'tenant1'
assert db[prefix1 + b'tenant_test_key'] == b'tenant1'
assert tenant2[b'tenant_test_key'] == b'tenant2'
assert db[prefix2 + b'tenant_test_key'] == b'tenant2'
assert db[b'tenant_test_key'] == b'no_tenant'
tr1 = tenant1.create_transaction()
try:
del tr1[:]
tr1.commit().wait()
except fdb.FDBError as e:
tr1.on_error(e).wait()
assert tenant1[b'tenant_test_key'] is None
assert db[prefix1 + b'tenant_test_key'] is None
assert tenant2[b'tenant_test_key'] == b'tenant2'
assert db[prefix2 + b'tenant_test_key'] == b'tenant2'
assert db[b'tenant_test_key'] == b'no_tenant'
db.delete_tenant(b'tenant1')
try:
tenant1[b'tenant_test_key']
assert False
except fdb.FDBError as e:
assert e.code == 2131 # tenant not found
del tenant2[:]
db.delete_tenant(b'tenant2')
assert db[prefix1 + b'tenant_test_key'] is None
assert db[prefix2 + b'tenant_test_key'] is None
assert db[b'tenant_test_key'] == b'no_tenant'
del db[b'tenant_test_key']
assert db[b'tenant_test_key'] is None
def test_tenants(db):
test_tenant_tuple_name(db)
test_tenant_operations(db)
# Expect a cluster file as input. This test will write to the FDB cluster, so
# be aware of potential side effects.
if __name__ == '__main__':
clusterFile = sys.argv[1]
db = fdb.open(clusterFile)
db.options.set_transaction_timeout(2000) # 2 seconds
db.options.set_transaction_retry_limit(3)
test_tenants(db)

View File

@ -49,6 +49,7 @@ from cancellation_timeout_tests import test_db_retry_limits
from cancellation_timeout_tests import test_combinations
from size_limit_tests import test_size_limit_option, test_get_approximate_size
from tenant_tests import test_tenants
random.seed(0)
@ -112,12 +113,13 @@ class Stack:
class Instruction:
def __init__(self, tr, stack, op, index, isDatabase=False, isSnapshot=False):
def __init__(self, tr, stack, op, index, isDatabase=False, isTenant=False, isSnapshot=False):
self.tr = tr
self.stack = stack
self.op = op
self.index = index
self.isDatabase = isDatabase
self.isTenant = isTenant
self.isSnapshot = isSnapshot
def pop(self, count=None, with_idx=False):
@ -277,6 +279,7 @@ class Tester:
def __init__(self, db, prefix):
self.db = db
self.tenant = None
self.instructions = self.db[fdb.tuple.range((prefix,))]
@ -317,7 +320,8 @@ class Tester:
def new_transaction(self):
with Tester.tr_map_lock:
Tester.tr_map[self.tr_name] = self.db.create_transaction()
tr_source = self.tenant if self.tenant is not None else self.db
Tester.tr_map[self.tr_name] = tr_source.create_transaction()
def switch_transaction(self, name):
self.tr_name = name
@ -335,18 +339,22 @@ class Tester:
# print("%d. Instruction is %s" % (idx, op))
isDatabase = op.endswith(six.u('_DATABASE'))
isTenant = op.endswith(six.u('_TENANT'))
isSnapshot = op.endswith(six.u('_SNAPSHOT'))
if isDatabase:
op = op[:-9]
obj = self.db
elif isTenant:
op = op[:-7]
obj = self.tenant if self.tenant else self.db
elif isSnapshot:
op = op[:-9]
obj = self.current_transaction().snapshot
else:
obj = self.current_transaction()
inst = Instruction(obj, self.stack, op, idx, isDatabase, isSnapshot)
inst = Instruction(obj, self.stack, op, idx, isDatabase, isTenant, isSnapshot)
try:
if inst.op == six.u("PUSH"):
@ -583,6 +591,19 @@ class Tester:
prefix = inst.pop()
Tester.wait_empty(self.db, prefix)
inst.push(b"WAITED_FOR_EMPTY")
elif inst.op == six.u("TENANT_CREATE"):
name = inst.pop()
self.db.allocate_tenant(name)
inst.push(b"RESULT_NOT_PRESENT")
elif inst.op == six.u("TENANT_DELETE"):
name = inst.pop()
self.db.delete_tenant(name)
inst.push(b"RESULT_NOT_PRESENT")
elif inst.op == six.u("TENANT_SET_ACTIVE"):
name = inst.pop()
self.tenant = self.db.open_tenant(name)
elif inst.op == six.u("TENANT_CLEAR_ACTIVE"):
self.tenant = None
elif inst.op == six.u("UNIT_TESTS"):
try:
test_db_options(db)
@ -600,6 +621,8 @@ class Tester:
test_size_limit_option(db)
test_get_approximate_size(db)
test_tenants(db)
except fdb.FDBError as e:
print("Unit tests failed: %s" % e.description)
traceback.print_exc()

View File

@ -212,6 +212,17 @@ endif()
set(COROUTINE_IMPL ${DEFAULT_COROUTINE_IMPL} CACHE STRING "Which coroutine implementation to use. Options are boost and libcoro")
################################################################################
# AWS SDK
################################################################################
set(BUILD_AWS_BACKUP OFF CACHE BOOL "Build AWS S3 SDK backup client")
if (BUILD_AWS_BACKUP)
set(WITH_AWS_BACKUP ON)
else()
set(WITH_AWS_BACKUP OFF)
endif()
################################################################################
file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/packages)
@ -232,6 +243,7 @@ function(print_components)
message(STATUS "Build Python sdist (make package): ${WITH_PYTHON_BINDING}")
message(STATUS "Configure CTest (depends on Python): ${WITH_PYTHON}")
message(STATUS "Build with RocksDB: ${WITH_ROCKSDB_EXPERIMENTAL}")
message(STATUS "Build with AWS SDK: ${WITH_AWS_BACKUP}")
message(STATUS "=========================================")
endfunction()

cmake/awssdk.cmake (new file)
View File

@ -0,0 +1,98 @@
project(awssdk-download NONE)
# Compile the sdk with clang and libc++, since otherwise we get libc++ vs libstdc++ link errors when compiling fdb with clang
set(AWSSDK_COMPILER_FLAGS "")
set(AWSSDK_LINK_FLAGS "")
if(APPLE OR CLANG OR USE_LIBCXX)
set(AWSSDK_COMPILER_FLAGS -stdlib=libc++ -nostdlib++)
set(AWSSDK_LINK_FLAGS -stdlib=libc++ -lc++abi)
endif()
include(ExternalProject)
ExternalProject_Add(awssdk_project
GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git
GIT_TAG 2af3ce543c322cb259471b3b090829464f825972 # v1.9.200
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build"
GIT_CONFIG advice.detachedHead=false
CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF # SDK builds shared libs by default, we want static libs
-DENABLE_TESTING=OFF
-DBUILD_ONLY=core # git repo contains SDK for every AWS product, we only want the core auth libraries
-DSIMPLE_INSTALL=ON
-DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path
-DBYO_CRYPTO=ON # we have our own crypto libraries that conflict if we let aws sdk build and link its own
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_EXE_LINKER_FLAGS=${AWSSDK_LINK_FLAGS}
-DCMAKE_CXX_FLAGS=${AWSSDK_COMPILER_FLAGS}
TEST_COMMAND ""
BUILD_ALWAYS TRUE
# the sdk build produces a ton of artifacts, with their own dependency tree, so there is a very specific dependency order they must be linked in
BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a"
)
add_library(awssdk_core STATIC IMPORTED)
add_dependencies(awssdk_core awssdk_project)
set_target_properties(awssdk_core PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a")
add_library(awssdk_crt STATIC IMPORTED)
add_dependencies(awssdk_crt awssdk_project)
set_target_properties(awssdk_crt PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a")
# TODO: can we remove c_s3? It seems to be a dependency of libaws-crt
add_library(awssdk_c_s3 STATIC IMPORTED)
add_dependencies(awssdk_c_s3 awssdk_project)
set_target_properties(awssdk_c_s3 PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a")
add_library(awssdk_c_auth STATIC IMPORTED)
add_dependencies(awssdk_c_auth awssdk_project)
set_target_properties(awssdk_c_auth PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a")
add_library(awssdk_c_eventstream STATIC IMPORTED)
add_dependencies(awssdk_c_eventstream awssdk_project)
set_target_properties(awssdk_c_eventstream PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a")
add_library(awssdk_c_http STATIC IMPORTED)
add_dependencies(awssdk_c_http awssdk_project)
set_target_properties(awssdk_c_http PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a")
add_library(awssdk_c_mqtt STATIC IMPORTED)
add_dependencies(awssdk_c_mqtt awssdk_project)
set_target_properties(awssdk_c_mqtt PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a")
add_library(awssdk_c_io STATIC IMPORTED)
add_dependencies(awssdk_c_io awssdk_project)
set_target_properties(awssdk_c_io PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a")
add_library(awssdk_checksums STATIC IMPORTED)
add_dependencies(awssdk_checksums awssdk_project)
set_target_properties(awssdk_checksums PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a")
add_library(awssdk_c_compression STATIC IMPORTED)
add_dependencies(awssdk_c_compression awssdk_project)
set_target_properties(awssdk_c_compression PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a")
add_library(awssdk_c_cal STATIC IMPORTED)
add_dependencies(awssdk_c_cal awssdk_project)
set_target_properties(awssdk_c_cal PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a")
add_library(awssdk_c_common STATIC IMPORTED)
add_dependencies(awssdk_c_common awssdk_project)
set_target_properties(awssdk_c_common PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a")
# link them all together in one interface target
add_library(awssdk_target INTERFACE)
target_include_directories(awssdk_target SYSTEM INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/include)
target_link_libraries(awssdk_target INTERFACE awssdk_core awssdk_crt awssdk_c_s3 awssdk_c_auth awssdk_c_eventstream awssdk_c_http awssdk_c_mqtt awssdk_c_io awssdk_checksums awssdk_c_compression awssdk_c_cal awssdk_c_common curl)

View File

@ -3,3 +3,4 @@ setuptools>=20.10.0,<=57.4.0
sphinx==1.5.6
sphinx-bootstrap-theme==0.4.8
docutils==0.16
Jinja2==3.0.3

View File

@ -7,7 +7,7 @@
.. |database-type| replace:: ``Database``
.. |database-class| replace:: :class:`Database`
.. |database-auto| replace:: the :func:`@fdb.transactional <transactional>` decorator
.. |tenant-type| replace:: FIXME
.. |tenant-type| replace:: :class:`Tenant`
.. |transaction-class| replace:: :class:`Transaction`
.. |get-key-func| replace:: :func:`Transaction.get_key`
.. |get-range-func| replace:: :func:`Transaction.get_range`
@ -316,9 +316,29 @@ A |database-blurb1| |database-blurb2|
Returns a new :class:`Transaction` object. Consider using the :func:`@fdb.transactional <transactional>` decorator to create transactions instead, since it will automatically provide you with appropriate retry behavior.
.. method:: Database.open_tenant(tenant_name)
Opens an existing tenant to be used for running transactions and returns it as a :class:`Tenant` object.
The tenant name can be either a byte string or a tuple. If a tuple is provided, the tuple will be packed using the tuple layer to generate the byte string tenant name.
.. |sync-read| replace:: This read is fully synchronous.
.. |sync-write| replace:: This change will be committed immediately, and is fully synchronous.
.. method:: Database.allocate_tenant(tenant_name)
Creates a new tenant in the cluster. |sync-write|
The tenant name can be either a byte string or a tuple and cannot start with the ``\xff`` byte. If a tuple is provided, the tuple will be packed using the tuple layer to generate the byte string tenant name.
.. method:: Database.delete_tenant(tenant_name)
Deletes a tenant from the cluster. |sync-write|
The tenant name can be either a byte string or a tuple. If a tuple is provided, the tuple will be packed using the tuple layer to generate the byte string tenant name.
It is an error to delete a tenant that still has data. To delete a non-empty tenant, first clear all of the keys in the tenant.
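A short, illustrative sequence using the tenant management methods above, assuming a running cluster and a hypothetical tenant name ``b'acme'``::

    import fdb
    fdb.api_version(710)

    db = fdb.open()
    db.allocate_tenant(b'acme')     # raises tenant_already_exists (2132) if it is already present
    acme = db.open_tenant(b'acme')
    acme[b'greeting'] = b'hello'

    del acme[:]                     # a tenant must be empty before it can be deleted
    db.delete_tenant(b'acme')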
.. method:: Database.get(key)
Returns the value associated with the specified key in the database (or ``None`` if the key does not exist). |sync-read|
@ -460,6 +480,17 @@ Database options
.. method:: Database.options.set_snapshot_ryw_disable()
|option-db-snapshot-ryw-disable-blurb|
Tenant objects
==============
.. class:: Tenant
|tenant-blurb1|
.. method:: Tenant.create_transaction()
Returns a new :class:`Transaction` object. Consider using the :func:`@fdb.transactional <transactional>` decorator to create transactions instead, since it will automatically provide you with appropriate retry behavior.
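If you manage transactions manually, the retry loop is the same as for a transaction created from a :class:`Database`; a sketch, assuming ``tenant`` was returned by :meth:`Database.open_tenant`::

    tr = tenant.create_transaction()
    while True:
        try:
            tr[b'key'] = b'value'
            tr.commit().wait()
            break
        except fdb.FDBError as e:
            tr.on_error(e).wait()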
.. _api-python-transactional-decorator:
@ -479,9 +510,9 @@ Transactional decoration
The ``@fdb.transactional`` decorator makes ``simple_function`` a transactional function. All functions using this decorator must have an argument **named** ``tr``. This specially named argument is passed a transaction that the function can use to do reads and writes.
A caller of a transactionally decorated function can pass a :class:`Database` instead of a transaction for the ``tr`` parameter. Then a transaction will be created automatically, and automatically committed before returning to the caller. The decorator will retry calling the decorated function until the transaction successfully commits.
A caller of a transactionally decorated function can pass a :class:`Database` or :class:`Tenant` instead of a transaction for the ``tr`` parameter. Then a transaction will be created automatically, and automatically committed before returning to the caller. The decorator will retry calling the decorated function until the transaction successfully commits.
If ``db`` is a :class:`Database`, a call like ::
If ``db`` is a :class:`Database` or :class:`Tenant`, a call like ::
simple_function(db, 'a', 'b')
@ -744,7 +775,7 @@ Committing
.. decorator:: transactional()
The ``transactional`` decorator makes it easy to write transactional functions which accept either a :class:`Database` or a :class:`Transaction` as a parameter and automatically commit. See :func:`@fdb.transactional <transactional>` for explanation and examples.
The ``transactional`` decorator makes it easy to write transactional functions which accept a :class:`Database`, :class:`Tenant`, or :class:`Transaction` as a parameter and automatically commit. See :func:`@fdb.transactional <transactional>` for explanation and examples.
.. method :: Transaction.commit()
@ -754,7 +785,7 @@ Committing
|commit-outstanding-reads-blurb|
.. note :: Consider using the :func:`@fdb.transactional <transactional>` decorator, which not only calls :meth:`Database.create_transaction` and :meth:`Transaction.commit()` for you but also implements the required error handling and retry logic for transactions.
.. note :: Consider using the :func:`@fdb.transactional <transactional>` decorator, which not only calls :meth:`Database.create_transaction` or :meth:`Tenant.create_transaction` and :meth:`Transaction.commit()` for you but also implements the required error handling and retry logic for transactions.
.. warning :: |used-during-commit-blurb|

View File

@ -155,6 +155,12 @@ Here is a complete list of valid parameters:
**Example**: The URL parameter *header=x-amz-storage-class:REDUCED_REDUNDANCY* would send the HTTP header required to use the reduced redundancy storage option in the S3 API.
Signing Protocol
=================
AWS signature version 4 is the default signing protocol. The boolean knob ``--knob_http_request_aws_v4_header`` selects the signature style:
if the knob is set to ``true``, v4 signatures are used; if set to ``false``, v2 signatures are used.
.. _blob-credential-files:
Blob Credential Files

View File

@ -46,6 +46,7 @@ enum {
OPT_HEX_KEY_PREFIX,
OPT_BEGIN_VERSION_FILTER,
OPT_END_VERSION_FILTER,
OPT_KNOB,
OPT_HELP
};
@ -72,6 +73,7 @@ CSimpleOpt::SOption gConverterOptions[] = { { OPT_CONTAINER, "-r", SO_REQ_SEP },
{ OPT_HEX_KEY_PREFIX, "--hex-prefix", SO_REQ_SEP },
{ OPT_BEGIN_VERSION_FILTER, "--begin-version-filter", SO_REQ_SEP },
{ OPT_END_VERSION_FILTER, "--end-version-filter", SO_REQ_SEP },
{ OPT_KNOB, "--knob-", SO_REQ_SEP },
{ OPT_HELP, "-?", SO_NONE },
{ OPT_HELP, "-h", SO_NONE },
{ OPT_HELP, "--help", SO_NONE },

View File

@ -26,17 +26,21 @@
#include <vector>
#include "fdbbackup/BackupTLSConfig.h"
#include "fdbclient/BuildFlags.h"
#include "fdbbackup/FileConverter.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/BackupContainer.h"
#include "fdbbackup/FileConverter.h"
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/IKnobCollection.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/MutationList.h"
#include "flow/ArgParseUtil.h"
#include "flow/IRandom.h"
#include "flow/Trace.h"
#include "flow/flow.h"
#include "flow/serialize.h"
#include "fdbclient/BuildFlags.h"
#include "flow/actorcompiler.h" // has to be last include
#define SevDecodeInfo SevVerbose
@ -73,11 +77,13 @@ void printDecodeUsage() {
" --list-only Print file list and exit.\n"
" -k KEY_PREFIX Use the prefix for filtering mutations\n"
" --hex-prefix HEX_PREFIX\n"
" The prefix specified in HEX format, e.g., \\x05\\x01.\n"
" The prefix specified in HEX format, e.g., \"\\\\x05\\\\x01\".\n"
" --begin-version-filter BEGIN_VERSION\n"
" The version range's begin version (inclusive) for filtering.\n"
" --end-version-filter END_VERSION\n"
" The version range's end version (exclusive) for filtering.\n"
" --knob-KNOBNAME KNOBVALUE\n"
" Changes a knob value. KNOBNAME should be lowercase."
"\n";
return;
}
@ -97,6 +103,8 @@ struct DecodeParams {
Version beginVersionFilter = 0;
Version endVersionFilter = std::numeric_limits<Version>::max();
std::vector<std::pair<std::string, std::string>> knobs;
// Returns if [begin, end) overlap with the filter range
bool overlap(Version begin, Version end) const {
// Filter [100, 200), [50,75) [200, 300)
@ -130,8 +138,39 @@ struct DecodeParams {
if (!prefix.empty()) {
s.append(", KeyPrefix: ").append(printable(KeyRef(prefix)));
}
for (const auto& [knob, value] : knobs) {
s.append(", KNOB-").append(knob).append(" = ").append(value);
}
return s;
}
void updateKnobs() {
auto& g_knobs = IKnobCollection::getMutableGlobalKnobCollection();
for (const auto& [knobName, knobValueString] : knobs) {
try {
auto knobValue = g_knobs.parseKnobValue(knobName, knobValueString);
g_knobs.setKnob(knobName, knobValue);
} catch (Error& e) {
if (e.code() == error_code_invalid_option_value) {
std::cerr << "WARNING: Invalid value '" << knobValueString << "' for knob option '" << knobName
<< "'\n";
TraceEvent(SevWarnAlways, "InvalidKnobValue")
.detail("Knob", printable(knobName))
.detail("Value", printable(knobValueString));
} else {
std::cerr << "ERROR: Failed to set knob option '" << knobName << "': " << e.what() << "\n";
TraceEvent(SevError, "FailedToSetKnob")
.errorUnsuppressed(e)
.detail("Knob", printable(knobName))
.detail("Value", printable(knobValueString));
throw;
}
}
}
// Reinitialize knobs in order to update knobs that are dependent on explicitly set knobs
g_knobs.initialize(Randomize::True, IsSimulated::False);
}
};
// Decode an ASCII string, e.g., "\x15\x1b\x19\x04\xaf\x0c\x28\x0a",
@ -256,6 +295,16 @@ int parseDecodeCommandLine(DecodeParams* param, CSimpleOpt* args) {
param->tlsConfig.blobCredentials.push_back(args->OptionArg());
break;
case OPT_KNOB: {
Optional<std::string> knobName = extractPrefixedArgument("--knob", args->OptionSyntax());
if (!knobName.present()) {
std::cerr << "ERROR: unable to parse knob option '" << args->OptionSyntax() << "'\n";
return FDB_EXIT_ERROR;
}
param->knobs.emplace_back(knobName.get(), args->OptionArg());
break;
}
#ifndef TLS_DISABLED
case TLSConfig::OPT_TLS_PLUGIN:
args->OptionArg();
@ -552,6 +601,9 @@ int main(int argc, char** argv) {
StringRef url(param.container_url);
setupNetwork(0, UseMetrics::True);
// Must be called after setupNetwork() to be effective
param.updateKnobs();
TraceEvent::setNetworkThread();
openTraceFile(NetworkAddress(), 10 << 20, 500 << 20, param.log_dir, "decode", param.trace_log_group);
param.tlsConfig.setupBlobCredentials();

View File

@ -18,9 +18,12 @@
* limitations under the License.
*/
#include <vector>
#include "contrib/fmt-8.1.1/include/fmt/format.h"
#include "flow/serialize.h"
#include "fdbclient/BlobGranuleFiles.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/SystemData.h" // for allKeys unit test - could remove
#include "flow/UnitTest.h"
@ -119,29 +122,43 @@ static void applyDelta(KeyRangeRef keyRange, MutationRef m, std::map<KeyRef, Val
static void applyDeltas(const GranuleDeltas& deltas,
KeyRangeRef keyRange,
Version beginVersion,
Version readVersion,
Version& lastFileEndVersion,
std::map<KeyRef, ValueRef>& dataMap) {
if (!deltas.empty()) {
// check that consecutive delta file versions are disjoint
ASSERT(lastFileEndVersion < deltas.front().version);
if (deltas.empty()) {
return;
}
for (const MutationsAndVersionRef& delta : deltas) {
if (delta.version > readVersion) {
// check that consecutive delta file versions are disjoint
ASSERT(lastFileEndVersion < deltas.front().version);
const MutationsAndVersionRef* mutationIt = deltas.begin();
// prune beginVersion if necessary
if (beginVersion > deltas.front().version) {
ASSERT(beginVersion <= deltas.back().version);
// binary search for beginVersion
mutationIt = std::lower_bound(deltas.begin(),
deltas.end(),
MutationsAndVersionRef(beginVersion, 0),
MutationsAndVersionRef::OrderByVersion());
}
while (mutationIt != deltas.end()) {
if (mutationIt->version > readVersion) {
lastFileEndVersion = readVersion;
return;
}
for (auto& m : delta.mutations) {
for (auto& m : mutationIt->mutations) {
applyDelta(keyRange, m, dataMap);
}
mutationIt++;
}
if (!deltas.empty()) {
lastFileEndVersion = deltas.back().version;
}
lastFileEndVersion = deltas.back().version;
}
static Arena loadDeltaFile(StringRef deltaData,
KeyRangeRef keyRange,
Version beginVersion,
Version readVersion,
Version& lastFileEndVersion,
std::map<KeyRef, ValueRef>& dataMap) {
@ -151,7 +168,7 @@ static Arena loadDeltaFile(StringRef deltaData,
reader.deserialize(FileIdentifierFor<GranuleDeltas>::value, deltas, parseArena);
if (BG_READ_DEBUG) {
fmt::print("Parsed {}} deltas from file\n", deltas.size());
fmt::print("Parsed {} deltas from file\n", deltas.size());
}
// TODO REMOVE sanity check
@ -163,19 +180,18 @@ static Arena loadDeltaFile(StringRef deltaData,
ASSERT(deltas[i].version <= deltas[i + 1].version);
}
applyDeltas(deltas, keyRange, readVersion, lastFileEndVersion, dataMap);
applyDeltas(deltas, keyRange, beginVersion, readVersion, lastFileEndVersion, dataMap);
return parseArena;
}
RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
KeyRangeRef keyRange,
Version beginVersion,
Version readVersion,
Optional<StringRef> snapshotData,
StringRef deltaFileData[]) {
// TODO REMOVE with V2 of protocol
// TODO REMOVE with early replying
ASSERT(readVersion == chunk.includedVersion);
ASSERT(chunk.snapshotFile.present());
ASSERT(snapshotData.present());
// Arena to hold all allocations for applying deltas. Most of it, and the arenas produced by reading the files,
// will likely be tossed if there are a significant number of mutations, so we copy at the end instead of doing a
@ -195,13 +211,14 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
fmt::print("Applying {} delta files\n", chunk.deltaFiles.size());
}
for (int deltaIdx = 0; deltaIdx < chunk.deltaFiles.size(); deltaIdx++) {
Arena deltaArena = loadDeltaFile(deltaFileData[deltaIdx], keyRange, readVersion, lastFileEndVersion, dataMap);
Arena deltaArena =
loadDeltaFile(deltaFileData[deltaIdx], keyRange, beginVersion, readVersion, lastFileEndVersion, dataMap);
arena.dependsOn(deltaArena);
}
if (BG_READ_DEBUG) {
fmt::print("Applying {} memory deltas\n", chunk.newDeltas.size());
}
applyDeltas(chunk.newDeltas, keyRange, readVersion, lastFileEndVersion, dataMap);
applyDeltas(chunk.newDeltas, keyRange, beginVersion, readVersion, lastFileEndVersion, dataMap);
RangeResult ret;
for (auto& it : dataMap) {
@ -211,50 +228,85 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
return ret;
}
struct GranuleLoadIds {
Optional<int64_t> snapshotId;
std::vector<int64_t> deltaIds;
};
static void startLoad(const ReadBlobGranuleContext granuleContext,
const BlobGranuleChunkRef& chunk,
GranuleLoadIds& loadIds) {
// Start load process for all files in chunk
if (chunk.snapshotFile.present()) {
std::string snapshotFname = chunk.snapshotFile.get().filename.toString();
// FIXME: full file length won't always be length of read
loadIds.snapshotId = granuleContext.start_load_f(snapshotFname.c_str(),
snapshotFname.size(),
chunk.snapshotFile.get().offset,
chunk.snapshotFile.get().length,
chunk.snapshotFile.get().length,
granuleContext.userContext);
}
loadIds.deltaIds.reserve(chunk.deltaFiles.size());
for (int deltaFileIdx = 0; deltaFileIdx < chunk.deltaFiles.size(); deltaFileIdx++) {
std::string deltaFName = chunk.deltaFiles[deltaFileIdx].filename.toString();
int64_t deltaLoadId = granuleContext.start_load_f(deltaFName.c_str(),
deltaFName.size(),
chunk.deltaFiles[deltaFileIdx].offset,
chunk.deltaFiles[deltaFileIdx].length,
chunk.deltaFiles[deltaFileIdx].length,
granuleContext.userContext);
loadIds.deltaIds.push_back(deltaLoadId);
}
}
ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<BlobGranuleChunkRef>>& files,
const KeyRangeRef& keyRange,
Version beginVersion,
Version readVersion,
ReadBlobGranuleContext granuleContext) {
int64_t parallelism = granuleContext.granuleParallelism;
if (parallelism < 1) {
parallelism = 1;
}
if (parallelism >= CLIENT_KNOBS->BG_MAX_GRANULE_PARALLELISM) {
parallelism = CLIENT_KNOBS->BG_MAX_GRANULE_PARALLELISM;
}
GranuleLoadIds loadIds[files.size()];
// Kick off first file reads if parallelism > 1
for (int i = 0; i < parallelism - 1 && i < files.size(); i++) {
startLoad(granuleContext, files[i], loadIds[i]);
}
try {
RangeResult results;
// FIXME: could submit multiple chunks to start_load_f in parallel?
for (const BlobGranuleChunkRef& chunk : files) {
for (int chunkIdx = 0; chunkIdx < files.size(); chunkIdx++) {
// Kick off files for this granule if parallelism == 1, or future granule if parallelism > 1
if (chunkIdx + parallelism - 1 < files.size()) {
startLoad(granuleContext, files[chunkIdx + parallelism - 1], loadIds[chunkIdx + parallelism - 1]);
}
RangeResult chunkRows;
int64_t snapshotLoadId;
int64_t deltaLoadIds[chunk.deltaFiles.size()];
// Start load process for all files in chunk
// In V1 of api snapshot is required, optional is just for forward compatibility
ASSERT(chunk.snapshotFile.present());
std::string snapshotFname = chunk.snapshotFile.get().filename.toString();
snapshotLoadId = granuleContext.start_load_f(snapshotFname.c_str(),
snapshotFname.size(),
chunk.snapshotFile.get().offset,
chunk.snapshotFile.get().length,
granuleContext.userContext);
int64_t deltaLoadLengths[chunk.deltaFiles.size()];
StringRef deltaData[chunk.deltaFiles.size()];
for (int deltaFileIdx = 0; deltaFileIdx < chunk.deltaFiles.size(); deltaFileIdx++) {
std::string deltaFName = chunk.deltaFiles[deltaFileIdx].filename.toString();
deltaLoadIds[deltaFileIdx] = granuleContext.start_load_f(deltaFName.c_str(),
deltaFName.size(),
chunk.deltaFiles[deltaFileIdx].offset,
chunk.deltaFiles[deltaFileIdx].length,
granuleContext.userContext);
deltaLoadLengths[deltaFileIdx] = chunk.deltaFiles[deltaFileIdx].length;
}
// once all loads kicked off, load data for chunk
StringRef snapshotData(granuleContext.get_load_f(snapshotLoadId, granuleContext.userContext),
chunk.snapshotFile.get().length);
if (!snapshotData.begin()) {
return ErrorOr<RangeResult>(blob_granule_file_load_error());
Optional<StringRef> snapshotData;
if (files[chunkIdx].snapshotFile.present()) {
snapshotData =
StringRef(granuleContext.get_load_f(loadIds[chunkIdx].snapshotId.get(), granuleContext.userContext),
files[chunkIdx].snapshotFile.get().length);
if (!snapshotData.get().begin()) {
return ErrorOr<RangeResult>(blob_granule_file_load_error());
}
}
for (int i = 0; i < chunk.deltaFiles.size(); i++) {
deltaData[i] = StringRef(granuleContext.get_load_f(deltaLoadIds[i], granuleContext.userContext),
chunk.deltaFiles[i].length);
StringRef deltaData[files[chunkIdx].deltaFiles.size()];
for (int i = 0; i < files[chunkIdx].deltaFiles.size(); i++) {
deltaData[i] =
StringRef(granuleContext.get_load_f(loadIds[chunkIdx].deltaIds[i], granuleContext.userContext),
files[chunkIdx].deltaFiles[i].length);
// null data is error
if (!deltaData[i].begin()) {
return ErrorOr<RangeResult>(blob_granule_file_load_error());
@ -262,14 +314,17 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
}
// materialize rows from chunk
chunkRows = materializeBlobGranule(chunk, keyRange, readVersion, snapshotData, deltaData);
chunkRows =
materializeBlobGranule(files[chunkIdx], keyRange, beginVersion, readVersion, snapshotData, deltaData);
results.arena().dependsOn(chunkRows.arena());
results.append(results.arena(), chunkRows.begin(), chunkRows.size());
granuleContext.free_load_f(snapshotLoadId, granuleContext.userContext);
for (int i = 0; i < chunk.deltaFiles.size(); i++) {
granuleContext.free_load_f(deltaLoadIds[i], granuleContext.userContext);
if (loadIds[chunkIdx].snapshotId.present()) {
granuleContext.free_load_f(loadIds[chunkIdx].snapshotId.get(), granuleContext.userContext);
}
for (int i = 0; i < loadIds[chunkIdx].deltaIds.size(); i++) {
granuleContext.free_load_f(loadIds[chunkIdx].deltaIds[i], granuleContext.userContext);
}
}
return ErrorOr<RangeResult>(results);
@ -278,8 +333,7 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
}
}
// FIXME: re-enable test!
TEST_CASE(":/blobgranule/files/applyDelta") {
TEST_CASE("/blobgranule/files/applyDelta") {
printf("Testing blob granule delta applying\n");
Arena a;

View File

@ -33,6 +33,7 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
KeyRangeRef keyRange,
Version beginVersion,
Version readVersion,
Optional<StringRef> snapshotData,
StringRef deltaFileData[]);

View File

@ -28,6 +28,7 @@
#include "fdbclient/BlobGranuleReader.actor.h"
#include "fdbclient/BlobWorkerCommon.h"
#include "fdbclient/BlobWorkerInterface.h"
#include "fdbclient/FDBTypes.h"
#include "flow/actorcompiler.h" // This must be the last #include.
// TODO more efficient data structure besides std::map? PTree is unnecessary since this isn't versioned, but some other
@ -52,7 +53,6 @@ ACTOR Future<Standalone<StringRef>> readFile(Reference<BackupContainerFileSystem
StringRef dataRef(data, f.length);
return Standalone<StringRef>(dataRef, arena);
} catch (Error& e) {
printf("Reading file %s got error %s\n", f.toString().c_str(), e.name());
throw e;
}
}
@ -64,22 +64,25 @@ ACTOR Future<Standalone<StringRef>> readFile(Reference<BackupContainerFileSystem
// sub-functions that BlobGranuleFiles actually exposes?
ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
KeyRangeRef keyRange,
Version beginVersion,
Version readVersion,
Reference<BackupContainerFileSystem> bstore,
Optional<BlobWorkerStats*> stats) {
// TODO REMOVE with V2 of protocol
// TODO REMOVE with early replying
ASSERT(readVersion == chunk.includedVersion);
ASSERT(chunk.snapshotFile.present());
state Arena arena;
try {
Future<Standalone<StringRef>> readSnapshotFuture = readFile(bstore, chunk.snapshotFile.get());
state std::vector<Future<Standalone<StringRef>>> readDeltaFutures;
if (stats.present()) {
++stats.get()->s3GetReqs;
Future<Standalone<StringRef>> readSnapshotFuture;
if (chunk.snapshotFile.present()) {
readSnapshotFuture = readFile(bstore, chunk.snapshotFile.get());
if (stats.present()) {
++stats.get()->s3GetReqs;
}
}
state std::vector<Future<Standalone<StringRef>>> readDeltaFutures;
readDeltaFutures.reserve(chunk.deltaFiles.size());
for (BlobFilePointerRef deltaFile : chunk.deltaFiles) {
@ -89,8 +92,12 @@ ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
}
}
state Standalone<StringRef> snapshotData = wait(readSnapshotFuture);
arena.dependsOn(snapshotData.arena());
state Optional<StringRef> snapshotData; // not present if snapshotFile isn't present
if (chunk.snapshotFile.present()) {
state Standalone<StringRef> s = wait(readSnapshotFuture);
arena.dependsOn(s.arena());
snapshotData = s;
}
state int numDeltaFiles = chunk.deltaFiles.size();
state StringRef* deltaData = new (arena) StringRef[numDeltaFiles];
@ -103,10 +110,9 @@ ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
arena.dependsOn(data.arena());
}
return materializeBlobGranule(chunk, keyRange, readVersion, snapshotData, deltaData);
return materializeBlobGranule(chunk, keyRange, beginVersion, readVersion, snapshotData, deltaData);
} catch (Error& e) {
printf("Reading blob granule got error %s\n", e.name());
throw e;
}
}
@ -121,18 +127,12 @@ ACTOR Future<Void> readBlobGranules(BlobGranuleFileRequest request,
try {
state int i;
for (i = 0; i < reply.chunks.size(); i++) {
/*printf("ReadBlobGranules processing chunk %d [%s - %s)\n",
i,
reply.chunks[i].keyRange.begin.printable().c_str(),
reply.chunks[i].keyRange.end.printable().c_str());*/
RangeResult chunkResult =
wait(readBlobGranule(reply.chunks[i], request.keyRange, request.readVersion, bstore));
RangeResult chunkResult = wait(
readBlobGranule(reply.chunks[i], request.keyRange, request.beginVersion, request.readVersion, bstore));
results.send(std::move(chunkResult));
}
// printf("ReadBlobGranules done, sending EOS\n");
results.sendError(end_of_stream());
} catch (Error& e) {
printf("ReadBlobGranules got error %s\n", e.name());
results.sendError(e);
}

View File

@ -40,6 +40,7 @@
// the request
ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
KeyRangeRef keyRange,
Version beginVersion,
Version readVersion,
Reference<BackupContainerFileSystem> bstore,
Optional<BlobWorkerStats*> stats = Optional<BlobWorkerStats*>());

View File

@ -38,6 +38,8 @@ struct BlobWorkerStats {
Counter commitVersionChecks;
Counter granuleUpdateErrors;
Counter granuleRequestTimeouts;
Counter readRequestsWithBegin;
Counter readRequestsCollapsed;
int numRangesAssigned;
int mutationBytesBuffered;
@ -59,6 +61,7 @@ struct BlobWorkerStats {
readReqTotalFilesReturned("ReadReqTotalFilesReturned", cc),
readReqDeltaBytesReturned("ReadReqDeltaBytesReturned", cc), commitVersionChecks("CommitVersionChecks", cc),
granuleUpdateErrors("GranuleUpdateErrors", cc), granuleRequestTimeouts("GranuleRequestTimeouts", cc),
readRequestsWithBegin("ReadRequestsWithBegin", cc), readRequestsCollapsed("ReadRequestsCollapsed", cc),
numRangesAssigned(0), mutationBytesBuffered(0), activeReadRequests(0) {
specialCounter(cc, "NumRangesAssigned", [this]() { return this->numRangesAssigned; });
specialCounter(cc, "MutationBytesBuffered", [this]() { return this->mutationBytesBuffered; });

View File

@ -86,13 +86,14 @@ struct BlobGranuleFileRequest {
KeyRangeRef keyRange;
Version beginVersion = 0;
Version readVersion;
bool canCollapseBegin = true;
ReplyPromise<BlobGranuleFileReply> reply;
BlobGranuleFileRequest() {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, keyRange, beginVersion, readVersion, reply, arena);
serializer(ar, keyRange, beginVersion, readVersion, canCollapseBegin, reply, arena);
}
};

View File

@ -205,6 +205,17 @@ if(BUILD_AZURE_BACKUP)
)
endif()
if(WITH_AWS_BACKUP)
add_compile_definitions(BUILD_AWS_BACKUP)
set(FDBCLIENT_SRCS
${FDBCLIENT_SRCS}
FDBAWSCredentialsProvider.h)
include(awssdk)
endif()
add_flow_target(STATIC_LIBRARY NAME fdbclient SRCS ${FDBCLIENT_SRCS} ADDL_SRCS ${options_srcs})
add_dependencies(fdbclient fdboptions)
target_link_libraries(fdbclient PUBLIC fdbrpc msgpack)
@ -224,3 +235,8 @@ if(BUILD_AZURE_BACKUP)
target_link_libraries(fdbclient PRIVATE curl uuid azure-storage-lite)
target_link_libraries(fdbclient_sampling PRIVATE curl uuid azure-storage-lite)
endif()
if(BUILD_AWS_BACKUP)
target_link_libraries(fdbclient PUBLIC awssdk_target)
target_link_libraries(fdbclient_sampling PUBLIC awssdk_target)
endif()

View File

@ -50,6 +50,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( MAX_GENERATIONS_OVERRIDE, 0 );
init( MAX_GENERATIONS_SIM, 50 ); //Disable network connections after this many generations in simulation, should be less than RECOVERY_DELAY_START_GENERATION
init( COORDINATOR_HOSTNAME_RESOLVE_DELAY, 0.05 );
init( COORDINATOR_RECONNECTION_DELAY, 1.0 );
init( CLIENT_EXAMPLE_AMOUNT, 20 );
init( MAX_CLIENT_STATUS_AGE, 1.0 );
@ -280,6 +281,9 @@ void ClientKnobs::initialize(Randomize randomize) {
init( MVC_CLIENTLIB_CHUNK_SIZE, 8*1024 );
init( MVC_CLIENTLIB_CHUNKS_PER_TRANSACTION, 32 );
// Blob granules
init( BG_MAX_GRANULE_PARALLELISM, 10 );
// clang-format on
}

View File

@ -49,6 +49,7 @@ public:
double MAX_GENERATIONS_OVERRIDE;
double MAX_GENERATIONS_SIM;
double COORDINATOR_HOSTNAME_RESOLVE_DELAY;
double COORDINATOR_RECONNECTION_DELAY;
int CLIENT_EXAMPLE_AMOUNT;
double MAX_CLIENT_STATUS_AGE;
@ -272,6 +273,9 @@ public:
int MVC_CLIENTLIB_CHUNK_SIZE;
int MVC_CLIENTLIB_CHUNKS_PER_TRANSACTION;
// Blob Granules
int BG_MAX_GRANULE_PARALLELISM;
ClientKnobs(Randomize randomize);
void initialize(Randomize randomize);
};

View File

@ -514,7 +514,7 @@ public:
Counter transactionGrvTimedOutBatches;
ContinuousSample<double> latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit,
bytesPerCommit;
bytesPerCommit, bgLatencies, bgGranulesPerRequest;
int outstandingWatches;
int maxOutstandingWatches;
@ -538,6 +538,7 @@ public:
bool transactionTracingSample;
double verifyCausalReadsProp = 0.0;
bool blobGranuleNoMaterialize = false;
bool anyBlobGranuleRequests = false;
Future<Void> logger;
Future<Void> throttleExpirer;

View File

@ -0,0 +1,47 @@
/*
* FDBAWSCredentialsProvider.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#if (!defined FDB_AWS_CREDENTIALS_PROVIDER_H) && (defined BUILD_AWS_BACKUP)
#define FDB_AWS_CREDENTIALS_PROVIDER_H
#pragma once
#include "aws/core/Aws.h"
#include "aws/core/auth/AWSCredentialsProviderChain.h"
// Singleton
namespace FDBAWSCredentialsProvider {
bool doneInit = false;
// You're supposed to call Aws::ShutdownAPI(options) once done
// But we want this to live for the lifetime of the process, so we don't do that
static Aws::Auth::AWSCredentials getAwsCredentials() {
if (!doneInit) {
doneInit = true;
Aws::SDKOptions options;
Aws::InitAPI(options);
TraceEvent("AWSSDKInitSuccessful");
}
Aws::Auth::DefaultAWSCredentialsProviderChain credProvider;
Aws::Auth::AWSCredentials creds = credProvider.GetAWSCredentials();
return creds;
}
} // namespace FDBAWSCredentialsProvider
#endif

View File

@ -1342,7 +1342,12 @@ struct ReadBlobGranuleContext {
void* userContext;
// Returns a unique id for the load. Asynchronous to support queueing multiple in parallel.
int64_t (*start_load_f)(const char* filename, int filenameLength, int64_t offset, int64_t length, void* context);
int64_t (*start_load_f)(const char* filename,
int filenameLength,
int64_t offset,
int64_t length,
int64_t fullFileLength,
void* context);
// Returns data for the load. Pass the loadId returned by start_load_f
uint8_t* (*get_load_f)(int64_t loadId, void* context);
@ -1353,6 +1358,9 @@ struct ReadBlobGranuleContext {
// Set this to true for testing if you don't want to read the granule files,
// just do the request to the blob workers
bool debugNoMaterialize;
// number of granules to load in parallel (default 1)
int granuleParallelism = 1;
};
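For orientation, here is a minimal sketch of a caller filling in this context with a blocking loader. The SimpleLoader type and the std::ifstream-based reads are illustrative assumptions, not part of the FDB API; a real caller would typically issue asynchronous reads so that granuleParallelism > 1 actually overlaps I/O.
#include <cassert>
#include <fstream>
#include <map>
#include <string>
#include <vector>
// Illustrative only: a trivial synchronous loader behind the asynchronous callback interface.
struct SimpleLoader {
	std::map<int64_t, std::vector<uint8_t>> loads;
	int64_t nextId = 0;
};
static int64_t simpleStartLoad(const char* filename,
                               int filenameLength,
                               int64_t offset,
                               int64_t length,
                               int64_t fullFileLength,
                               void* context) {
	SimpleLoader* loader = static_cast<SimpleLoader*>(context);
	// fullFileLength lets the loader validate the requested slice or pre-size a cache up front
	assert(offset + length <= fullFileLength);
	std::ifstream f(std::string(filename, filenameLength), std::ios::binary);
	std::vector<uint8_t> buf(length);
	f.seekg(offset);
	f.read(reinterpret_cast<char*>(buf.data()), length);
	int64_t id = loader->nextId++;
	loader->loads[id] = std::move(buf);
	return id;
}
static uint8_t* simpleGetLoad(int64_t loadId, void* context) {
	return static_cast<SimpleLoader*>(context)->loads[loadId].data();
}
static void simpleFreeLoad(int64_t loadId, void* context) {
	static_cast<SimpleLoader*>(context)->loads.erase(loadId);
}
// Wiring it up:
// SimpleLoader loader;
// ReadBlobGranuleContext context;
// context.userContext = &loader;
// context.start_load_f = &simpleStartLoad;
// context.get_load_f = &simpleGetLoad;
// context.free_load_f = &simpleFreeLoad;
// context.debugNoMaterialize = false;
// context.granuleParallelism = 2; // see also the new BG_MAX_GRANULE_PARALLELISM client knob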
// Store metadata associated with each storage server. Now it only contains data be used in perpetual storage wiggle.

View File

@ -22,6 +22,7 @@
#include "fdbclient/json_spirit/json_spirit_writer_template.h"
#include "fdbclient/json_spirit/json_spirit_reader_template.h"
#include "flow/Error.h"
// JSONDoc is a convenient reader/writer class for manipulating JSON documents using "paths".
// Access is done using a "path", which is a string of dot-separated

View File

@ -169,7 +169,7 @@ void ClusterConnectionString::resolveHostnamesBlocking() {
}
void ClusterConnectionString::resetToUnresolved() {
if (hostnames.size() > 0) {
if (status == RESOLVED && hostnames.size() > 0) {
coords.clear();
hostnames.clear();
networkAddressToHostname.clear();
@ -558,8 +558,8 @@ ACTOR Future<Void> monitorNominee(Key key,
.detail("Hostname", hostname.present() ? hostname.get().toString() : "UnknownHostname")
.detail("OldAddr", coord.getLeader.getEndpoint().getPrimaryAddress().toString());
if (rep.getError().code() == error_code_request_maybe_delivered) {
// 50 milliseconds delay to prevent tight resolving loop due to outdated DNS cache
wait(delay(0.05));
// Delay to prevent tight resolving loop due to outdated DNS cache
wait(delay(CLIENT_KNOBS->COORDINATOR_HOSTNAME_RESOLVE_DELAY));
throw coordinators_changed();
} else {
throw rep.getError();
@ -589,7 +589,6 @@ ACTOR Future<Void> monitorNominee(Key key,
if (li.present() && li.get().forward)
wait(Future<Void>(Never()));
wait(Future<Void>(Void()));
}
}
}

View File

@ -282,8 +282,9 @@ ThreadResult<RangeResult> DLTransaction::readBlobGranules(const KeyRangeRef& key
context.get_load_f = granuleContext.get_load_f;
context.free_load_f = granuleContext.free_load_f;
context.debugNoMaterialize = granuleContext.debugNoMaterialize;
context.granuleParallelism = granuleContext.granuleParallelism;
int64_t rv = readVersion.present() ? readVersion.get() : invalidVersion;
int64_t rv = readVersion.present() ? readVersion.get() : latestVersion;
FdbCApi::FDBResult* r = api->transactionReadBlobGranules(tr,
keyRange.begin.begin(),

View File

@ -95,8 +95,12 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
void* userContext;
// Returns a unique id for the load. Asynchronous to support queueing multiple in parallel.
int64_t (
*start_load_f)(const char* filename, int filenameLength, int64_t offset, int64_t length, void* context);
int64_t (*start_load_f)(const char* filename,
int filenameLength,
int64_t offset,
int64_t length,
int64_t fullFileLength,
void* context);
// Returns data for the load. Pass the loadId returned by start_load_f
uint8_t* (*get_load_f)(int64_t loadId, void* context);
@ -107,6 +111,9 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
// set this to true for testing if you don't want to read the granule files, just
// do the request to the blob workers
fdb_bool_t debugNoMaterialize;
// number of granules to load in parallel (default 1)
int granuleParallelism;
} FDBReadBlobGranuleContext;
typedef void (*FDBCallback)(FDBFuture* future, void* callback_parameter);

View File

@ -533,6 +533,14 @@ ACTOR Future<Void> databaseLogger(DatabaseContext* cx) {
.detail("MedianBytesPerCommit", cx->bytesPerCommit.median())
.detail("MaxBytesPerCommit", cx->bytesPerCommit.max())
.detail("NumLocalityCacheEntries", cx->locationCache.size());
if (cx->anyBlobGranuleRequests) {
ev.detail("MeanBGLatency", cx->bgLatencies.mean())
.detail("MedianBGLatency", cx->bgLatencies.median())
.detail("MaxBGLatency", cx->bgLatencies.max())
.detail("MeanBGGranulesPerRequest", cx->bgGranulesPerRequest.mean())
.detail("MedianBGGranulesPerRequest", cx->bgGranulesPerRequest.median())
.detail("MaxBGGranulesPerRequest", cx->bgGranulesPerRequest.max());
}
}
cx->latencies.clear();
@ -541,6 +549,8 @@ ACTOR Future<Void> databaseLogger(DatabaseContext* cx) {
cx->commitLatencies.clear();
cx->mutationsPerCommit.clear();
cx->bytesPerCommit.clear();
cx->bgLatencies.clear();
cx->bgGranulesPerRequest.clear();
lastLogged = now();
}
@ -1353,11 +1363,11 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<IClusterConnection
transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc),
transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc),
latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000),
bytesPerCommit(1000), outstandingWatches(0), lastGrvTime(0.0), cachedReadVersion(0), lastRkBatchThrottleTime(0.0),
lastRkDefaultThrottleTime(0.0), lastProxyRequestTime(0.0), transactionTracingSample(false), taskID(taskID),
clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor), coordinator(coordinator), apiVersion(apiVersion),
mvCacheInsertLocation(0), healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0),
smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
bytesPerCommit(1000), bgLatencies(1000), bgGranulesPerRequest(1000), outstandingWatches(0), lastGrvTime(0.0),
cachedReadVersion(0), lastRkBatchThrottleTime(0.0), lastRkDefaultThrottleTime(0.0), lastProxyRequestTime(0.0),
transactionTracingSample(false), taskID(taskID), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor),
coordinator(coordinator), apiVersion(apiVersion), mvCacheInsertLocation(0), healthMetricsLastUpdated(0),
detailedHealthMetricsLastUpdated(0), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
specialKeySpace(std::make_unique<SpecialKeySpace>(specialKeys.begin, specialKeys.end, /* test */ false)),
connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {
dbId = deterministicRandom()->randomUniqueID();
@ -1619,7 +1629,8 @@ DatabaseContext::DatabaseContext(const Error& err)
transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc),
transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc),
latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000),
bytesPerCommit(1000), transactionTracingSample(false), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
bytesPerCommit(1000), bgLatencies(1000), bgGranulesPerRequest(1000), transactionTracingSample(false),
smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT),
connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {}
// Static constructor used by server processes to create a DatabaseContext
@ -7340,6 +7351,7 @@ ACTOR Future<Standalone<VectorRef<BlobGranuleChunkRef>>> readBlobGranulesActor(
state Version rv;
state Standalone<VectorRef<BlobGranuleChunkRef>> results;
state double startTime = now();
if (read.present()) {
rv = read.get();
@ -7442,6 +7454,7 @@ ACTOR Future<Standalone<VectorRef<BlobGranuleChunkRef>>> readBlobGranulesActor(
req.keyRange = KeyRangeRef(StringRef(req.arena, granuleStartKey), StringRef(req.arena, granuleEndKey));
req.beginVersion = begin;
req.readVersion = rv;
req.canCollapseBegin = true; // TODO make this a parameter once we support it
std::vector<Reference<ReferencedInterface<BlobWorkerInterface>>> v;
v.push_back(
@ -7514,6 +7527,11 @@ ACTOR Future<Standalone<VectorRef<BlobGranuleChunkRef>>> readBlobGranulesActor(
throw e;
}
}
self->trState->cx->anyBlobGranuleRequests = true;
self->trState->cx->bgGranulesPerRequest.addSample(results.size());
self->trState->cx->bgLatencies.addSample(now() - startTime);
if (readVersionOut != nullptr) {
*readVersionOut = rv;
}

View File

@ -1791,8 +1791,6 @@ Future<Standalone<VectorRef<BlobGranuleChunkRef>>> ReadYourWritesTransaction::re
Version begin,
Optional<Version> readVersion,
Version* readVersionOut) {
// Remove in V2 of API
ASSERT(begin == 0);
if (!options.readYourWritesDisabled) {
return blob_granule_no_ryw();

View File

@ -34,6 +34,8 @@
#include "fdbrpc/IAsyncFile.h"
#include "flow/UnitTest.h"
#include "fdbclient/rapidxml/rapidxml.hpp"
#include "fdbclient/FDBAWSCredentialsProvider.h"
#include "flow/actorcompiler.h" // has to be last include
using namespace rapidxml;
@ -82,6 +84,7 @@ S3BlobStoreEndpoint::BlobKnobs::BlobKnobs() {
read_cache_blocks_per_file = CLIENT_KNOBS->BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE;
max_send_bytes_per_second = CLIENT_KNOBS->BLOBSTORE_MAX_SEND_BYTES_PER_SECOND;
max_recv_bytes_per_second = CLIENT_KNOBS->BLOBSTORE_MAX_RECV_BYTES_PER_SECOND;
sdk_auth = false;
}
bool S3BlobStoreEndpoint::BlobKnobs::set(StringRef name, int value) {
@ -118,6 +121,7 @@ bool S3BlobStoreEndpoint::BlobKnobs::set(StringRef name, int value) {
TRY_PARAM(read_cache_blocks_per_file, rcb);
TRY_PARAM(max_send_bytes_per_second, sbps);
TRY_PARAM(max_recv_bytes_per_second, rbps);
TRY_PARAM(sdk_auth, sa);
#undef TRY_PARAM
return false;
}
@ -506,7 +510,38 @@ ACTOR Future<Optional<json_spirit::mObject>> tryReadJSONFile(std::string path) {
return Optional<json_spirit::mObject>();
}
// If the credentials expire, the connection will eventually fail and be discarded from the pool, and then a new
// connection will be constructed, which will call this again to get updated credentials
static S3BlobStoreEndpoint::Credentials getSecretSdk() {
#ifdef BUILD_AWS_BACKUP
double elapsed = -timer_monotonic();
Aws::Auth::AWSCredentials awsCreds = FDBAWSCredentialsProvider::getAwsCredentials();
elapsed += timer_monotonic();
if (awsCreds.IsEmpty()) {
TraceEvent(SevWarn, "S3BlobStoreAWSCredsEmpty");
throw backup_auth_missing();
}
S3BlobStoreEndpoint::Credentials fdbCreds;
fdbCreds.key = awsCreds.GetAWSAccessKeyId();
fdbCreds.secret = awsCreds.GetAWSSecretKey();
fdbCreds.securityToken = awsCreds.GetSessionToken();
TraceEvent("S3BlobStoreGotSdkCredentials").suppressFor(60).detail("Duration", elapsed);
return fdbCreds;
#else
TraceEvent(SevError, "S3BlobStoreNoSDK");
throw backup_auth_missing();
#endif
}
ACTOR Future<Void> updateSecret_impl(Reference<S3BlobStoreEndpoint> b) {
if (b->knobs.sdk_auth) {
b->credentials = getSecretSdk();
return Void();
}
std::vector<std::string>* pFiles = (std::vector<std::string>*)g_network->global(INetwork::enBlobCredentialFiles);
if (pFiles == nullptr)
return Void();
@ -538,7 +573,7 @@ ACTOR Future<Void> updateSecret_impl(Reference<S3BlobStoreEndpoint> b) {
JSONDoc accounts(doc.last().get_obj());
if (accounts.has(credentialsFileKey, false) && accounts.last().type() == json_spirit::obj_type) {
JSONDoc account(accounts.last());
S3BlobStoreEndpoint::Credentials creds;
S3BlobStoreEndpoint::Credentials creds = b->credentials.get();
if (b->lookupKey) {
std::string apiKey;
if (account.tryGet("api_key", apiKey))
@ -601,7 +636,7 @@ ACTOR Future<S3BlobStoreEndpoint::ReusableConnection> connect_impl(Reference<S3B
.detail("RemoteEndpoint", conn->getPeerAddress())
.detail("ExpiresIn", b->knobs.max_connection_life);
if (b->lookupKey || b->lookupSecret)
if (b->lookupKey || b->lookupSecret || b->knobs.sdk_auth)
wait(b->updateSecret());
return S3BlobStoreEndpoint::ReusableConnection({ conn, now() + b->knobs.max_connection_life });

View File

@ -59,7 +59,7 @@ public:
delete_requests_per_second, multipart_max_part_size, multipart_min_part_size, concurrent_requests,
concurrent_uploads, concurrent_lists, concurrent_reads_per_file, concurrent_writes_per_file,
read_block_size, read_ahead_blocks, read_cache_blocks_per_file, max_send_bytes_per_second,
max_recv_bytes_per_second;
max_recv_bytes_per_second, sdk_auth;
bool set(StringRef name, int value);
std::string getURLParameters() const;
static std::vector<std::string> getKnobDescriptions() {
@ -91,7 +91,9 @@ public:
"read_cache_blocks_per_file (or rcb) Size of the read cache for a file in blocks.",
"max_send_bytes_per_second (or sbps) Max send bytes per second for all requests combined.",
"max_recv_bytes_per_second (or rbps) Max receive bytes per second for all requests combined (NOT YET "
"USED)."
"USED).",
"sdk_auth (or sa) Use AWS SDK to resolve credentials. Only valid if "
"BUILD_AWS_BACKUP is enabled."
};
}
};
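As a usage note, sdk_auth is passed in the blob store URL's query string like any other of these knobs. A hedged sketch of the relevant fragment (host, backup name, and bucket are placeholders; no key or secret is supplied in the URL because the SDK credential chain resolves them):
blobstore://<host>/<backup_name>?bucket=my-backup-bucket&sa=1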

View File

@ -828,6 +828,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// encrypt key proxy
init( ENABLE_ENCRYPTION, false );
init( ENCRYPTION_MODE, "AES-256-CTR");
// Blob granules
init( BG_URL, isSimulated ? "file://fdbblob/" : "" ); // TODO: store in system key space or something, eventually

View File

@ -775,8 +775,9 @@ public:
// Cluster recovery
std::string CLUSTER_RECOVERY_EVENT_NAME_PREFIX;
// encrypt key proxy
// Encryption
bool ENABLE_ENCRYPTION;
std::string ENCRYPTION_MODE;
// blob granule stuff
// FIXME: configure url with database configuration instead of knob eventually

View File

@ -29,14 +29,7 @@ static std::map<NetworkAddress, std::pair<Reference<EvictablePageCache>, Referen
EvictablePage::~EvictablePage() {
if (data) {
#if defined(USE_JEMALLOC)
aligned_free(data);
#else
if (pageCache->pageSize == 4096)
FastAllocator<4096>::release(data);
else
aligned_free(data);
#endif
freeFast4kAligned(pageCache->pageSize, data);
}
if (EvictablePageCache::RANDOM == pageCache->cacheEvictionType) {
if (index > -1) {
@ -173,14 +166,7 @@ void AsyncFileCached::releaseZeroCopy(void* data, int length, int64_t offset) {
if (o != orphanedPages.end()) {
if (o->second == 1) {
if (data) {
#if defined(USE_JEMALLOC)
aligned_free(data);
#else
if (length == 4096)
FastAllocator<4096>::release(data);
else
aligned_free(data);
#endif
freeFast4kAligned(length, data);
}
} else {
--o->second;

View File

@ -79,14 +79,9 @@ struct EvictablePageCache : ReferenceCounted<EvictablePageCache> {
void allocate(EvictablePage* page) {
try_evict();
try_evict();
#if defined(USE_JEMALLOC)
page->data = aligned_alloc(4096, pageSize);
#else
page->data = pageSize == 4096 ? FastAllocator<4096>::allocate() : aligned_alloc(4096, pageSize);
#endif
if (page->data == nullptr) {
platform::outOfMemory();
}
page->data = allocateFast4kAligned(pageSize);
if (RANDOM == cacheEvictionType) {
page->index = pages.size();
pages.push_back(page);
@ -394,14 +389,7 @@ struct AFCPage : public EvictablePage, public FastAllocated<AFCPage> {
owner->orphanedPages[data] = zeroCopyRefCount;
zeroCopyRefCount = 0;
notReading = Void();
#if defined(USE_JEMALLOC)
data = aligned_alloc(4096, pageCache->pageSize);
#else
data = pageCache->pageSize == 4096 ? FastAllocator<4096>::allocate() : aligned_alloc(4096, pageCache->pageSize);
#endif
if (data == nullptr) {
platform::outOfMemory();
}
data = allocateFast4kAligned(pageCache->pageSize);
}
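The repeated #ifdef alloc/free blocks removed in this diff are folded into a pair of flow helpers. A plausible sketch of what those helpers do, inferred directly from the code they replace (the real definitions live in flow and may differ in detail):
// Sketch only: mirrors the allocation logic removed above.
inline void* allocateFast4kAligned(int size) {
#if defined(USE_JEMALLOC)
	void* mem = aligned_alloc(4096, size);
#else
	void* mem = (size == 4096) ? FastAllocator<4096>::allocate() : aligned_alloc(4096, size);
#endif
	if (mem == nullptr) {
		platform::outOfMemory();
	}
	return mem;
}
// Sketch only: mirrors the release logic removed above.
inline void freeFast4kAligned(int size, void* ptr) {
#if defined(USE_JEMALLOC)
	aligned_free(ptr);
#else
	if (size == 4096)
		FastAllocator<4096>::release(ptr);
	else
		aligned_free(ptr);
#endif
}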
Future<Void> write(void const* data, int length, int offset) {

View File

@ -18,6 +18,7 @@
* limitations under the License.
*/
#include "contrib/fmt-8.1.1/include/fmt/format.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbserver/BlobGranuleServerCommon.actor.h"
@ -25,6 +26,7 @@
#include "fdbclient/FDBTypes.h"
#include "fdbclient/ReadYourWrites.h"
#include "flow/Arena.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // has to be last include
// Gets the latest granule history node for range that was persisted
@ -102,3 +104,252 @@ ACTOR Future<GranuleFiles> loadHistoryFiles(Database cx, UID granuleID) {
}
}
}
// Normally a beginVersion != 0 means the caller wants all mutations between beginVersion and readVersion, instead of
// the latest snapshot before readVersion + deltas after the snapshot. When canCollapse is set, the beginVersion is
// essentially just an optimization hint. The caller is still concerned with reconstructing rows at readVersion; it just
// knows it doesn't need anything before beginVersion.
// Normally this can eliminate the need for a snapshot and just return a small amount of deltas. But in a highly active
// key range, the granule may have a snapshot file at version X, where beginVersion < X <= readVersion. In this case, if
// the number of bytes in delta files between beginVersion and X is larger than the snapshot file at version X, it is
// strictly more efficient (in terms of files and bytes read) to just use the snapshot file at version X instead.
void GranuleFiles::getFiles(Version beginVersion,
Version readVersion,
bool canCollapse,
BlobGranuleChunkRef& chunk,
Arena& replyArena,
int64_t& deltaBytesCounter) const {
BlobFileIndex dummyIndex; // for searching
// if beginVersion == 0 or we can collapse, find the latest snapshot <= readVersion
auto snapshotF = snapshotFiles.end();
if (beginVersion == 0 || canCollapse) {
dummyIndex.version = readVersion;
snapshotF = std::lower_bound(snapshotFiles.begin(), snapshotFiles.end(), dummyIndex);
if (snapshotF == snapshotFiles.end() || snapshotF->version > readVersion) {
ASSERT(snapshotF != snapshotFiles.begin());
snapshotF--;
}
ASSERT(snapshotF != snapshotFiles.end());
ASSERT(snapshotF->version <= readVersion);
}
auto deltaF = deltaFiles.end();
if (beginVersion > 0) {
dummyIndex.version = beginVersion;
deltaF = std::lower_bound(deltaFiles.begin(), deltaFiles.end(), dummyIndex);
if (canCollapse) {
ASSERT(snapshotF != snapshotFiles.end());
// If we can collapse, see if delta files up to snapshotVersion are smaller or larger than snapshotBytes in
// total
auto deltaFCopy = deltaF;
int64_t snapshotBytes = snapshotF->length;
while (deltaFCopy != deltaFiles.end() && deltaFCopy->version <= snapshotF->version && snapshotBytes > 0) {
snapshotBytes -= deltaFCopy->length;
deltaFCopy++;
}
// if delta files contain the same or more bytes as the snapshot with collapse, do the collapse
if (snapshotBytes > 0) {
// don't collapse, clear snapshotF and just do delta files
snapshotF = snapshotFiles.end();
} else {
// do snapshot instead of previous deltas
dummyIndex.version = snapshotF->version;
deltaF = std::upper_bound(deltaFiles.begin(), deltaFiles.end(), dummyIndex);
ASSERT(deltaF == deltaFiles.end() || deltaF->version > snapshotF->version);
}
}
} else {
dummyIndex.version = snapshotF->version;
deltaF = std::upper_bound(deltaFiles.begin(), deltaFiles.end(), dummyIndex);
ASSERT(deltaF == deltaFiles.end() || deltaF->version > snapshotF->version);
}
Version lastIncluded = invalidVersion;
if (snapshotF != snapshotFiles.end()) {
chunk.snapshotVersion = snapshotF->version;
chunk.snapshotFile = BlobFilePointerRef(replyArena, snapshotF->filename, snapshotF->offset, snapshotF->length);
lastIncluded = chunk.snapshotVersion;
} else {
chunk.snapshotVersion = invalidVersion;
}
while (deltaF != deltaFiles.end() && deltaF->version < readVersion) {
chunk.deltaFiles.emplace_back_deep(replyArena, deltaF->filename, deltaF->offset, deltaF->length);
deltaBytesCounter += deltaF->length;
ASSERT(lastIncluded < deltaF->version);
lastIncluded = deltaF->version;
deltaF++;
}
// include one more delta file that crosses readVersion, if it exists and is still needed
if (deltaF != deltaFiles.end() && lastIncluded < readVersion) {
chunk.deltaFiles.emplace_back_deep(replyArena, deltaF->filename, deltaF->offset, deltaF->length);
deltaBytesCounter += deltaF->length;
lastIncluded = deltaF->version;
}
}
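As a worked example of the collapse heuristic, using the file layout from the unit test below: for beginVersion = 151 and readVersion = 300 with canCollapse = true, the candidate snapshot is the one at version 300 (10 bytes), while the delta files in (151, 300] total 6 + 7 + 8 = 21 bytes. Since 21 >= 10, reading the snapshot is cheaper, so the request collapses to just the snapshot at 300 with no delta files, matching checkFiles(files, 151, 300, true, 300, {}).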
static std::string makeTestFileName(Version v) {
return "test" + std::to_string(v);
}
static BlobFileIndex makeTestFile(Version v, int64_t len) {
return BlobFileIndex(v, makeTestFileName(v), 0, len);
}
static void checkFile(int expectedVersion, const BlobFilePointerRef& actualFile) {
ASSERT(makeTestFileName(expectedVersion) == actualFile.filename.toString());
}
static void checkFiles(const GranuleFiles& f,
Version beginVersion,
Version readVersion,
bool canCollapse,
Optional<int> expectedSnapshotVersion,
std::vector<int> expectedDeltaVersions) {
Arena a;
BlobGranuleChunkRef chunk;
int64_t deltaBytes = 0;
f.getFiles(beginVersion, readVersion, canCollapse, chunk, a, deltaBytes);
fmt::print("results({0}, {1}, {2}):\nEXPECTED:\n snapshot={3}\n deltas ({4}):\n",
beginVersion,
readVersion,
canCollapse ? "T" : "F",
expectedSnapshotVersion.present() ? makeTestFileName(expectedSnapshotVersion.get()).c_str() : "<N/A>",
expectedDeltaVersions.size());
for (int d : expectedDeltaVersions) {
fmt::print(" {}\n", makeTestFileName(d));
}
fmt::print("ACTUAL:\n snapshot={0}\n deltas ({1}):\n",
chunk.snapshotFile.present() ? chunk.snapshotFile.get().filename.toString().c_str() : "<N/A>",
chunk.deltaFiles.size());
for (auto& it : chunk.deltaFiles) {
fmt::print(" {}\n", it.filename.toString());
}
printf("\n\n\n");
ASSERT(expectedSnapshotVersion.present() == chunk.snapshotFile.present());
if (expectedSnapshotVersion.present()) {
checkFile(expectedSnapshotVersion.get(), chunk.snapshotFile.get());
}
ASSERT(expectedDeltaVersions.size() == chunk.deltaFiles.size());
for (int i = 0; i < expectedDeltaVersions.size(); i++) {
checkFile(expectedDeltaVersions[i], chunk.deltaFiles[i]);
}
}
/*
* Files:
* S @ 100 (10 bytes)
* D @ 150 (5 bytes)
* D @ 200 (6 bytes)
* S @ 200 (15 bytes)
* D @ 250 (7 bytes)
* D @ 300 (8 bytes)
* S @ 300 (10 bytes)
* D @ 350 (4 bytes)
*/
TEST_CASE("/blobgranule/server/common/granulefiles") {
// simple cases first
// single snapshot file, no deltas
GranuleFiles files;
files.snapshotFiles.push_back(makeTestFile(100, 10));
printf("Just snapshot\n");
checkFiles(files, 0, 100, false, 100, {});
checkFiles(files, 0, 200, false, 100, {});
printf("Small test\n");
// add delta files with re-snapshot at end
files.deltaFiles.push_back(makeTestFile(150, 5));
files.deltaFiles.push_back(makeTestFile(200, 6));
files.snapshotFiles.push_back(makeTestFile(200, 15));
// check different read versions with beginVersion=0
checkFiles(files, 0, 100, false, 100, {});
checkFiles(files, 0, 101, false, 100, { 150 });
checkFiles(files, 0, 149, false, 100, { 150 });
checkFiles(files, 0, 150, false, 100, { 150 });
checkFiles(files, 0, 151, false, 100, { 150, 200 });
checkFiles(files, 0, 199, false, 100, { 150, 200 });
checkFiles(files, 0, 200, false, 200, {});
checkFiles(files, 0, 300, false, 200, {});
// Test all cases of beginVersion + readVersion. Because delta files are smaller than snapshot at 200, this should
// be the same with and without collapse
checkFiles(files, 100, 200, false, Optional<int>(), { 150, 200 });
checkFiles(files, 100, 300, false, Optional<int>(), { 150, 200 });
checkFiles(files, 101, 199, false, Optional<int>(), { 150, 200 });
checkFiles(files, 149, 151, false, Optional<int>(), { 150, 200 });
checkFiles(files, 149, 150, false, Optional<int>(), { 150 });
checkFiles(files, 150, 151, false, Optional<int>(), { 150, 200 });
checkFiles(files, 151, 200, false, Optional<int>(), { 200 });
checkFiles(files, 100, 200, true, Optional<int>(), { 150, 200 });
checkFiles(files, 100, 300, true, Optional<int>(), { 150, 200 });
checkFiles(files, 101, 199, true, Optional<int>(), { 150, 200 });
checkFiles(files, 149, 151, true, Optional<int>(), { 150, 200 });
checkFiles(files, 149, 150, true, Optional<int>(), { 150 });
checkFiles(files, 150, 151, true, Optional<int>(), { 150, 200 });
checkFiles(files, 151, 200, true, Optional<int>(), { 200 });
printf("Larger test\n");
// add more delta files and snapshots to check collapse logic
files.deltaFiles.push_back(makeTestFile(250, 7));
files.deltaFiles.push_back(makeTestFile(300, 8));
files.snapshotFiles.push_back(makeTestFile(300, 10));
files.deltaFiles.push_back(makeTestFile(350, 4));
checkFiles(files, 0, 300, false, 300, {});
checkFiles(files, 0, 301, false, 300, { 350 });
checkFiles(files, 0, 400, false, 300, { 350 });
// check delta files without collapse
checkFiles(files, 100, 301, false, Optional<int>(), { 150, 200, 250, 300, 350 });
checkFiles(files, 100, 300, false, Optional<int>(), { 150, 200, 250, 300 });
checkFiles(files, 100, 251, false, Optional<int>(), { 150, 200, 250, 300 });
checkFiles(files, 100, 250, false, Optional<int>(), { 150, 200, 250 });
checkFiles(files, 151, 300, false, Optional<int>(), { 200, 250, 300 });
checkFiles(files, 151, 301, false, Optional<int>(), { 200, 250, 300, 350 });
checkFiles(files, 151, 400, false, Optional<int>(), { 200, 250, 300, 350 });
checkFiles(files, 201, 300, false, Optional<int>(), { 250, 300 });
checkFiles(files, 201, 301, false, Optional<int>(), { 250, 300, 350 });
checkFiles(files, 201, 400, false, Optional<int>(), { 250, 300, 350 });
checkFiles(files, 251, 300, false, Optional<int>(), { 300 });
checkFiles(files, 251, 301, false, Optional<int>(), { 300, 350 });
checkFiles(files, 251, 400, false, Optional<int>(), { 300, 350 });
checkFiles(files, 301, 400, false, Optional<int>(), { 350 });
checkFiles(files, 351, 400, false, Optional<int>(), {});
// check with collapse
// these 2 collapse because the delta files at 150+200+250+300 are larger than the snapshot at 300
checkFiles(files, 100, 301, true, 300, { 350 });
checkFiles(files, 100, 300, true, 300, {});
// these 2 don't collapse because 150+200 delta files are smaller than the snapshot at 200
checkFiles(files, 100, 251, true, Optional<int>(), { 150, 200, 250, 300 });
checkFiles(files, 100, 250, true, Optional<int>(), { 150, 200, 250 });
// these 3 do collapse because the delta files at 200+250+300 are larger than the snapshot at 300
checkFiles(files, 151, 300, true, 300, {});
checkFiles(files, 151, 301, true, 300, { 350 });
checkFiles(files, 151, 400, true, 300, { 350 });
// these 3 do collapse because the delta files at 250+300 are larger than the snapshot at 300
checkFiles(files, 201, 300, true, 300, {});
checkFiles(files, 201, 301, true, 300, { 350 });
checkFiles(files, 201, 400, true, 300, { 350 });
// these don't collapse because the delta file at 300 is smaller than the snapshot at 300
checkFiles(files, 251, 300, true, Optional<int>(), { 300 });
checkFiles(files, 251, 301, true, Optional<int>(), { 300, 350 });
checkFiles(files, 251, 400, true, Optional<int>(), { 300, 350 });
checkFiles(files, 301, 400, true, Optional<int>(), { 350 });
checkFiles(files, 351, 400, true, Optional<int>(), {});
return Void();
}

View File

@ -54,12 +54,23 @@ struct BlobFileIndex {
BlobFileIndex(Version version, std::string filename, int64_t offset, int64_t length)
: version(version), filename(filename), offset(offset), length(length) {}
// compare on version
bool operator<(const BlobFileIndex& r) const { return version < r.version; }
};
// FIXME: initialize these to smaller default sizes to save a bit of memory, particularly snapshotFiles
// Stores the files that comprise a blob granule
struct GranuleFiles {
std::deque<BlobFileIndex> snapshotFiles;
std::deque<BlobFileIndex> deltaFiles;
std::vector<BlobFileIndex> snapshotFiles;
std::vector<BlobFileIndex> deltaFiles;
void getFiles(Version beginVersion,
Version readVersion,
bool canCollapse,
BlobGranuleChunkRef& chunk,
Arena& replyArena,
int64_t& deltaBytesCounter) const;
};
class Transaction;

View File

@ -2778,7 +2778,7 @@ ACTOR Future<Void> blobManager(BlobManagerInterface bmInterf,
// DB has [A - B) and [C - D). They should show up in knownBlobRanges, and [B - C) should be in removed.
// DB has [B - C). It should show up in knownBlobRanges, [B - C) should be in added, and [A - B) and [C - D)
// should be in removed.
TEST_CASE(":/blobmanager/updateranges") {
TEST_CASE("/blobmanager/updateranges") {
KeyRangeMap<bool> knownBlobRanges(false, normalKeys.end);
Arena ar;

View File

@ -18,6 +18,7 @@
* limitations under the License.
*/
#include <limits>
#include <tuple>
#include <utility>
#include <vector>
@ -43,9 +44,10 @@
#include "flow/Error.h"
#include "flow/IRandom.h"
#include "flow/Trace.h"
#include "flow/actorcompiler.h" // has to be last include
#include "flow/network.h"
#include "flow/actorcompiler.h" // has to be last include
#define BW_DEBUG false
#define BW_REQUEST_DEBUG false
@ -832,7 +834,7 @@ ACTOR Future<BlobFileIndex> compactFromBlob(Reference<BlobWorkerData> bwData,
rowsStream,
false);
RangeResult newGranule =
wait(readBlobGranule(chunk, metadata->keyRange, version, bwData->bstore, &bwData->stats));
wait(readBlobGranule(chunk, metadata->keyRange, 0, version, bwData->bstore, &bwData->stats));
bwData->stats.bytesReadFromS3ForCompaction += compactBytesRead;
rowsStream.send(std::move(newGranule));
@ -2093,16 +2095,25 @@ ACTOR Future<Void> waitForVersion(Reference<GranuleMetadata> metadata, Version v
ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, BlobGranuleFileRequest req) {
if (BW_REQUEST_DEBUG) {
fmt::print("BW {0} processing blobGranuleFileRequest for range [{1} - {2}) @ {3}\n",
fmt::print("BW {0} processing blobGranuleFileRequest for range [{1} - {2}) @ ",
bwData->id.toString(),
req.keyRange.begin.printable(),
req.keyRange.end.printable(),
req.readVersion);
if (req.beginVersion > 0) {
fmt::print("{0} - {1}\n", req.beginVersion, req.readVersion);
} else {
fmt::print("{}", req.readVersion);
}
}
state bool didCollapse = false;
try {
// TODO REMOVE in api V2
ASSERT(req.beginVersion == 0);
// TODO remove requirement for canCollapseBegin once we implement early replying
ASSERT(req.beginVersion == 0 || req.canCollapseBegin);
if (req.beginVersion != 0) {
ASSERT(req.beginVersion > 0);
}
state BlobGranuleFileReply rep;
state std::vector<Reference<GranuleMetadata>> granules;
@ -2150,6 +2161,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
continue;
}
state Reference<GranuleMetadata> metadata = m;
state Version granuleBeginVersion = req.beginVersion;
choose {
when(wait(metadata->readable.getFuture())) {}
@ -2290,67 +2302,30 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
// granule is up to date, do read
ASSERT(metadata->cancelled.canBeSet());
// Right now we force a collapse if the version range crosses granule boundaries, for simplicity
if (granuleBeginVersion <= chunkFiles.snapshotFiles.front().version) {
TEST(true); // collapsed begin version request because of boundaries
didCollapse = true;
granuleBeginVersion = 0;
}
BlobGranuleChunkRef chunk;
// TODO change in V2
// TODO change with early reply
chunk.includedVersion = req.readVersion;
chunk.keyRange = KeyRangeRef(StringRef(rep.arena, chunkRange.begin), StringRef(rep.arena, chunkRange.end));
// handle snapshot files
// TODO refactor the "find snapshot file" logic to GranuleFiles?
// FIXME: binary search instead of linear search, especially when file count is large
int i = chunkFiles.snapshotFiles.size() - 1;
while (i >= 0 && chunkFiles.snapshotFiles[i].version > req.readVersion) {
i--;
}
// because of granule history, we should always be able to find the desired snapshot
// version, and have thrown blob_granule_transaction_too_old earlier if not possible.
if (i < 0) {
fmt::print("req @ {0} >= initial snapshot {1} but can't find snapshot in ({2}) files:\n",
req.readVersion,
metadata->initialSnapshotVersion,
chunkFiles.snapshotFiles.size());
for (auto& f : chunkFiles.snapshotFiles) {
fmt::print(" {0}", f.version);
}
}
ASSERT(i >= 0);
BlobFileIndex snapshotF = chunkFiles.snapshotFiles[i];
chunk.snapshotFile = BlobFilePointerRef(rep.arena, snapshotF.filename, snapshotF.offset, snapshotF.length);
Version snapshotVersion = chunkFiles.snapshotFiles[i].version;
chunk.snapshotVersion = snapshotVersion;
// handle delta files
// cast this to an int so i going to -1 still compares properly
int lastDeltaFileIdx = chunkFiles.deltaFiles.size() - 1;
i = lastDeltaFileIdx;
// skip delta files that are too new
while (i >= 0 && chunkFiles.deltaFiles[i].version > req.readVersion) {
i--;
}
if (i < lastDeltaFileIdx) {
// we skipped one file at the end with a larger read version, this will actually contain
// our query version, so add it back.
i++;
}
// only include delta files after the snapshot file
int j = i;
while (j >= 0 && chunkFiles.deltaFiles[j].version > snapshotVersion) {
j--;
}
j++;
while (j <= i) {
BlobFileIndex deltaF = chunkFiles.deltaFiles[j];
chunk.deltaFiles.emplace_back_deep(rep.arena, deltaF.filename, deltaF.offset, deltaF.length);
bwData->stats.readReqDeltaBytesReturned += deltaF.length;
j++;
int64_t deltaBytes = 0;
chunkFiles.getFiles(
granuleBeginVersion, req.readVersion, req.canCollapseBegin, chunk, rep.arena, deltaBytes);
bwData->stats.readReqDeltaBytesReturned += deltaBytes;
if (granuleBeginVersion > 0 && chunk.snapshotFile.present()) {
TEST(true); // collapsed begin version request for efficiency
didCollapse = true;
}
// new deltas (if version is larger than version of last delta file)
// FIXME: do trivial key bounds here if key range is not fully contained in request key
// range
if (req.readVersion > metadata->durableDeltaVersion.get()) {
if (req.readVersion > metadata->durableDeltaVersion.get() && !metadata->currentDeltas.empty()) {
if (metadata->durableDeltaVersion.get() != metadata->pendingDeltaVersion) {
fmt::print("real-time read [{0} - {1}) @ {2} doesn't have mutations!! durable={3}, pending={4}\n",
metadata->keyRange.begin.printable(),
@ -2359,13 +2334,32 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
metadata->durableDeltaVersion.get(),
metadata->pendingDeltaVersion);
}
// prune mutations based on begin version, if possible
ASSERT(metadata->durableDeltaVersion.get() == metadata->pendingDeltaVersion);
// FIXME: I think we can remove this dependsOn since we are doing push_back_deep
rep.arena.dependsOn(metadata->currentDeltas.arena());
for (auto& delta : metadata->currentDeltas) {
if (delta.version > req.readVersion) {
MutationsAndVersionRef* mutationIt = metadata->currentDeltas.begin();
if (granuleBeginVersion > metadata->currentDeltas.back().version) {
TEST(true); // beginVersion pruning all in-memory mutations
mutationIt = metadata->currentDeltas.end();
} else if (granuleBeginVersion > metadata->currentDeltas.front().version) {
// binary search for beginVersion
TEST(true); // beginVersion pruning some in-memory mutations
mutationIt = std::lower_bound(metadata->currentDeltas.begin(),
metadata->currentDeltas.end(),
MutationsAndVersionRef(granuleBeginVersion, 0),
MutationsAndVersionRef::OrderByVersion());
}
// add mutations to response
while (mutationIt != metadata->currentDeltas.end()) {
if (mutationIt->version > req.readVersion) {
TEST(true); // readVersion pruning some in-memory mutations
break;
}
chunk.newDeltas.push_back_deep(rep.arena, delta);
chunk.newDeltas.push_back_deep(rep.arena, *mutationIt);
mutationIt++;
}
}
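To make the begin-version pruning concrete with illustrative numbers: if currentDeltas holds mutations at versions 120, 140, 160, and 180, a request with granuleBeginVersion = 150 and readVersion = 175 binary-searches to the mutation at 160, appends it, and stops before 180 because that exceeds readVersion; a request with granuleBeginVersion = 200 skips the entire in-memory set, since 200 is beyond the newest buffered mutation.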
@ -2376,11 +2370,17 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
wait(yield(TaskPriority::DefaultEndpoint));
}
// do these together to keep them synchronous
if (req.beginVersion != 0) {
++bwData->stats.readRequestsWithBegin;
}
if (didCollapse) {
++bwData->stats.readRequestsCollapsed;
}
ASSERT(!req.reply.isSet());
req.reply.send(rep);
--bwData->stats.activeReadRequests;
} catch (Error& e) {
// fmt::print("Error in BGFRequest {0}\n", e.name());
if (e.code() == error_code_operation_cancelled) {
req.reply.sendError(wrong_shard_server());
throw;

View File

@ -2475,11 +2475,12 @@ ACTOR Future<Void> workerHealthMonitor(ClusterControllerData* self) {
}
}
ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
ACTOR Future<Void> clusterControllerCore(Reference<IClusterConnectionRecord> connRecord,
ClusterControllerFullInterface interf,
Future<Void> leaderFail,
ServerCoordinators coordinators,
LocalityData locality,
ConfigDBType configDBType) {
state ServerCoordinators coordinators(connRecord);
state ClusterControllerData self(interf, locality, coordinators);
state ConfigBroadcaster configBroadcaster(coordinators, configDBType);
state Future<Void> coordinationPingDelay = delay(SERVER_KNOBS->WORKER_COORDINATION_PING_DELAY);
@ -2612,7 +2613,7 @@ ACTOR Future<Void> replaceInterface(ClusterControllerFullInterface interf) {
}
}
ACTOR Future<Void> clusterController(ServerCoordinators coordinators,
ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRecord,
Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> currentCC,
bool hasConnected,
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo,
@ -2623,9 +2624,10 @@ ACTOR Future<Void> clusterController(ServerCoordinators coordinators,
state bool inRole = false;
cci.initEndpoints();
try {
wait(connRecord->resolveHostnames());
// Register as a possible leader; wait to be elected
state Future<Void> leaderFail =
tryBecomeLeader(coordinators, cci, currentCC, hasConnected, asyncPriorityInfo);
tryBecomeLeader(connRecord, cci, currentCC, hasConnected, asyncPriorityInfo);
state Future<Void> shouldReplace = replaceInterface(cci);
while (!currentCC->get().present() || currentCC->get().get() != cci) {
@ -2644,7 +2646,7 @@ ACTOR Future<Void> clusterController(ServerCoordinators coordinators,
startRole(Role::CLUSTER_CONTROLLER, cci.id(), UID());
inRole = true;
wait(clusterControllerCore(cci, leaderFail, coordinators, locality, configDBType));
wait(clusterControllerCore(connRecord, cci, leaderFail, locality, configDBType));
}
} catch (Error& e) {
if (inRole)
@ -2673,15 +2675,12 @@ ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRec
state bool hasConnected = false;
loop {
try {
wait(connRecord->resolveHostnames());
ServerCoordinators coordinators(connRecord);
wait(clusterController(coordinators, currentCC, hasConnected, asyncPriorityInfo, locality, configDBType));
wait(clusterController(connRecord, currentCC, hasConnected, asyncPriorityInfo, locality, configDBType));
hasConnected = true;
} catch (Error& e) {
if (e.code() != error_code_coordinators_changed)
throw; // Expected to terminate fdbserver
}
hasConnected = true;
}
}

View File

@ -50,11 +50,12 @@ struct RelocateData {
std::vector<UID> completeSources;
std::vector<UID> completeDests;
bool wantsNewServers;
bool cancellable;
TraceInterval interval;
RelocateData()
: priority(-1), boundaryPriority(-1), healthPriority(-1), startTime(-1), workFactor(0), wantsNewServers(false),
interval("QueuedRelocation") {}
cancellable(false), interval("QueuedRelocation") {}
explicit RelocateData(RelocateShard const& rs)
: keys(rs.keys), priority(rs.priority), boundaryPriority(isBoundaryPriority(rs.priority) ? rs.priority : -1),
healthPriority(isHealthPriority(rs.priority) ? rs.priority : -1), startTime(now()),
@ -63,7 +64,7 @@ struct RelocateData {
rs.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM ||
rs.priority == SERVER_KNOBS->PRIORITY_SPLIT_SHARD ||
rs.priority == SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT),
interval("QueuedRelocation") {}
cancellable(true), interval("QueuedRelocation") {}
static bool isHealthPriority(int priority) {
return priority == SERVER_KNOBS->PRIORITY_POPULATE_REGION ||
@ -610,19 +611,23 @@ struct DDQueueData {
.detail(
"Problem",
"the key range in the inFlight map matches the key range in the RelocateData message");
} else if (it->value().cancellable) {
TraceEvent(SevError, "DDQueueValidateError13")
.detail("Problem", "key range is cancellable but not in flight!")
.detail("Range", it->range());
}
}
for (auto it = busymap.begin(); it != busymap.end(); ++it) {
for (int i = 0; i < it->second.ledger.size() - 1; i++) {
if (it->second.ledger[i] < it->second.ledger[i + 1])
TraceEvent(SevError, "DDQueueValidateError13")
TraceEvent(SevError, "DDQueueValidateError14")
.detail("Problem", "ascending ledger problem")
.detail("LedgerLevel", i)
.detail("LedgerValueA", it->second.ledger[i])
.detail("LedgerValueB", it->second.ledger[i + 1]);
if (it->second.ledger[i] < 0.0)
TraceEvent(SevError, "DDQueueValidateError14")
TraceEvent(SevError, "DDQueueValidateError15")
.detail("Problem", "negative ascending problem")
.detail("LedgerLevel", i)
.detail("LedgerValue", it->second.ledger[i]);
@ -632,13 +637,13 @@ struct DDQueueData {
for (auto it = destBusymap.begin(); it != destBusymap.end(); ++it) {
for (int i = 0; i < it->second.ledger.size() - 1; i++) {
if (it->second.ledger[i] < it->second.ledger[i + 1])
TraceEvent(SevError, "DDQueueValidateError15")
TraceEvent(SevError, "DDQueueValidateError16")
.detail("Problem", "ascending ledger problem")
.detail("LedgerLevel", i)
.detail("LedgerValueA", it->second.ledger[i])
.detail("LedgerValueB", it->second.ledger[i + 1]);
if (it->second.ledger[i] < 0.0)
TraceEvent(SevError, "DDQueueValidateError16")
TraceEvent(SevError, "DDQueueValidateError17")
.detail("Problem", "negative ascending problem")
.detail("LedgerLevel", i)
.detail("LedgerValue", it->second.ledger[i]);
@ -954,7 +959,7 @@ struct DDQueueData {
auto containedRanges = inFlight.containedRanges(rd.keys);
std::vector<RelocateData> cancellableRelocations;
for (auto it = containedRanges.begin(); it != containedRanges.end(); ++it) {
if (inFlightActors.liveActorAt(it->range().begin)) {
if (it.value().cancellable) {
cancellableRelocations.push_back(it->value());
}
}
@ -1180,6 +1185,12 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd,
// TODO different trace event + knob for overloaded? Could wait on an async var for done moves
}
// set cancellable to false on inFlight's entry for this key range
auto inFlightRange = self->inFlight.rangeContaining(rd.keys.begin);
ASSERT(inFlightRange.range() == rd.keys);
ASSERT(inFlightRange.value().randomId == rd.randomId);
inFlightRange.value().cancellable = false;
destIds.clear();
state std::vector<UID> healthyIds;
state std::vector<UID> extraIds;

View File

@ -25,28 +25,56 @@
#include "fdbclient/MonitorLeader.h"
#include "flow/actorcompiler.h" // This must be the last #include.
// Keep trying to become a leader by submitting itself to all coordinators.
// Monitor the health of all coordinators at the same time.
// Note: for coordinators whose NetworkAddress is parsed out of a hostname, a connection failure will cause this actor
// to throw a `coordinators_changed()` error.
ACTOR Future<Void> submitCandidacy(Key key,
LeaderElectionRegInterface coord,
LeaderInfo myInfo,
UID prevChangeID,
Reference<AsyncVar<std::vector<Optional<LeaderInfo>>>> nominees,
int index) {
AsyncTrigger* nomineeChange,
Optional<LeaderInfo>* nominee,
Optional<Hostname> hostname = Optional<Hostname>()) {
loop {
auto const& nom = nominees->get()[index];
Optional<LeaderInfo> li = wait(
retryBrokenPromise(coord.candidacy,
CandidacyRequest(key, myInfo, nom.present() ? nom.get().changeID : UID(), prevChangeID),
TaskPriority::CoordinationReply));
state Optional<LeaderInfo> li;
if (li != nominees->get()[index]) {
std::vector<Optional<LeaderInfo>> v = nominees->get();
v[index] = li;
nominees->set(v);
if (coord.candidacy.getEndpoint().getPrimaryAddress().fromHostname) {
state ErrorOr<Optional<LeaderInfo>> rep = wait(coord.candidacy.tryGetReply(
CandidacyRequest(key, myInfo, nominee->present() ? nominee->get().changeID : UID(), prevChangeID),
TaskPriority::CoordinationReply));
if (rep.isError()) {
// Connecting to the nominee failed, most likely due to a connection failure.
TraceEvent("SubmitCandidacyError")
.error(rep.getError())
.detail("Hostname", hostname.present() ? hostname.get().toString() : "UnknownHostname")
.detail("OldAddr", coord.candidacy.getEndpoint().getPrimaryAddress().toString());
if (rep.getError().code() == error_code_request_maybe_delivered) {
// Delay to prevent tight resolving loop due to outdated DNS cache
wait(delay(CLIENT_KNOBS->COORDINATOR_HOSTNAME_RESOLVE_DELAY));
throw coordinators_changed();
} else {
throw rep.getError();
}
} else if (rep.present()) {
li = rep.get();
}
} else {
Optional<LeaderInfo> tmp = wait(retryBrokenPromise(
coord.candidacy,
CandidacyRequest(key, myInfo, nominee->present() ? nominee->get().changeID : UID(), prevChangeID),
TaskPriority::CoordinationReply));
li = tmp;
}
wait(Future<Void>(Void())); // Make sure we weren't cancelled
if (li != *nominee) {
*nominee = li;
nomineeChange->trigger();
if (li.present() && li.get().forward)
wait(Future<Void>(Never()));
wait(Future<Void>(Void())); // Make sure we weren't cancelled
}
}
}
@ -84,13 +112,14 @@ ACTOR Future<Void> changeLeaderCoordinators(ServerCoordinators coordinators, Val
return Void();
}
ACTOR Future<Void> tryBecomeLeaderInternal(ServerCoordinators coordinators,
ACTOR Future<Void> tryBecomeLeaderInternal(Reference<IClusterConnectionRecord> connRecord,
Value proposedSerializedInterface,
Reference<AsyncVar<Value>> outSerializedLeader,
bool hasConnected,
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo) {
state Reference<AsyncVar<std::vector<Optional<LeaderInfo>>>> nominees(
new AsyncVar<std::vector<Optional<LeaderInfo>>>());
state ServerCoordinators coordinators(connRecord);
state AsyncTrigger nomineeChange;
state std::vector<Optional<LeaderInfo>> nominees;
state LeaderInfo myInfo;
state Future<Void> candidacies;
state bool iAmLeader = false;
@ -105,8 +134,6 @@ ACTOR Future<Void> tryBecomeLeaderInternal(ServerCoordinators coordinators,
wait(delay(SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY));
}
nominees->set(std::vector<Optional<LeaderInfo>>(coordinators.clientLeaderServers.size()));
myInfo.serializedInfo = proposedSerializedInterface;
outSerializedLeader->set(Value());
@ -114,6 +141,9 @@ ACTOR Future<Void> tryBecomeLeaderInternal(ServerCoordinators coordinators,
(SERVER_KNOBS->BUGGIFY_ALL_COORDINATION || BUGGIFY) ? buggifyDelayedAsyncVar(outSerializedLeader) : Void();
while (!iAmLeader) {
wait(connRecord->resolveHostnames());
coordinators = ServerCoordinators(connRecord);
nominees.resize(coordinators.leaderElectionServers.size());
state Future<Void> badCandidateTimeout;
myInfo.changeID = deterministicRandom()->randomUniqueID();
@ -122,13 +152,25 @@ ACTOR Future<Void> tryBecomeLeaderInternal(ServerCoordinators coordinators,
std::vector<Future<Void>> cand;
cand.reserve(coordinators.leaderElectionServers.size());
for (int i = 0; i < coordinators.leaderElectionServers.size(); i++)
cand.push_back(submitCandidacy(
coordinators.clusterKey, coordinators.leaderElectionServers[i], myInfo, prevChangeID, nominees, i));
for (int i = 0; i < coordinators.leaderElectionServers.size(); i++) {
Optional<Hostname> hostname;
auto r = connRecord->getConnectionString().networkAddressToHostname.find(
coordinators.leaderElectionServers[i].candidacy.getEndpoint().getPrimaryAddress());
if (r != connRecord->getConnectionString().networkAddressToHostname.end()) {
hostname = r->second;
}
cand.push_back(submitCandidacy(coordinators.clusterKey,
coordinators.leaderElectionServers[i],
myInfo,
prevChangeID,
&nomineeChange,
&nominees[i],
hostname));
}
candidacies = waitForAll(cand);
loop {
state Optional<std::pair<LeaderInfo, bool>> leader = getLeader(nominees->get());
state Optional<std::pair<LeaderInfo, bool>> leader = getLeader(nominees);
if (leader.present() && leader.get().first.forward) {
// These coordinators are forwarded to another set. But before we change our own cluster file, we need
// to make sure that a majority of coordinators know that. SOMEDAY: Wait briefly to see if other
@ -172,22 +214,30 @@ ACTOR Future<Void> tryBecomeLeaderInternal(ServerCoordinators coordinators,
// If more than 2*SERVER_KNOBS->POLLING_FREQUENCY elapses while we are nominated by some coordinator but
// there is no leader, we might be breaking the leader election process for someone with better
// communications but lower ID, so change IDs.
if ((!leader.present() || !leader.get().second) &&
std::count(nominees->get().begin(), nominees->get().end(), myInfo)) {
if ((!leader.present() || !leader.get().second) && std::count(nominees.begin(), nominees.end(), myInfo)) {
if (!badCandidateTimeout.isValid())
badCandidateTimeout = delay(SERVER_KNOBS->POLLING_FREQUENCY * 2, TaskPriority::CoordinationReply);
} else
badCandidateTimeout = Future<Void>();
choose {
when(wait(nominees->onChange())) {}
when(wait(badCandidateTimeout.isValid() ? badCandidateTimeout : Never())) {
TEST(true); // Bad candidate timeout
TraceEvent("LeaderBadCandidateTimeout", myInfo.changeID).log();
break;
try {
choose {
when(wait(nomineeChange.onTrigger())) {}
when(wait(badCandidateTimeout.isValid() ? badCandidateTimeout : Never())) {
TEST(true); // Bad candidate timeout
TraceEvent("LeaderBadCandidateTimeout", myInfo.changeID).log();
break;
}
when(wait(candidacies)) { ASSERT(false); }
when(wait(asyncPriorityInfo->onChange())) { break; }
}
} catch (Error& e) {
if (e.code() == error_code_coordinators_changed) {
connRecord->getConnectionString().resetToUnresolved();
break;
} else {
throw e;
}
when(wait(candidacies)) { ASSERT(false); }
when(wait(asyncPriorityInfo->onChange())) { break; }
}
}

View File

@ -37,7 +37,7 @@ class ServerCoordinators;
// eventually be set. If the return value is cancelled, the candidacy or leadership of the proposedInterface
// will eventually end.
template <class LeaderInterface>
Future<Void> tryBecomeLeader(ServerCoordinators const& coordinators,
Future<Void> tryBecomeLeader(Reference<IClusterConnectionRecord> const& connRecord,
LeaderInterface const& proposedInterface,
Reference<AsyncVar<Optional<LeaderInterface>>> const& outKnownLeader,
bool hasConnected,
@ -50,20 +50,20 @@ Future<Void> changeLeaderCoordinators(ServerCoordinators const& coordinators, Va
#pragma region Implementation
#endif // __INTEL_COMPILER
Future<Void> tryBecomeLeaderInternal(ServerCoordinators const& coordinators,
Future<Void> tryBecomeLeaderInternal(Reference<IClusterConnectionRecord> const& connRecord,
Value const& proposedSerializedInterface,
Reference<AsyncVar<Value>> const& outSerializedLeader,
bool const& hasConnected,
Reference<AsyncVar<ClusterControllerPriorityInfo>> const& asyncPriorityInfo);
template <class LeaderInterface>
Future<Void> tryBecomeLeader(ServerCoordinators const& coordinators,
Future<Void> tryBecomeLeader(Reference<IClusterConnectionRecord> const& connRecord,
LeaderInterface const& proposedInterface,
Reference<AsyncVar<Optional<LeaderInterface>>> const& outKnownLeader,
bool hasConnected,
Reference<AsyncVar<ClusterControllerPriorityInfo>> const& asyncPriorityInfo) {
auto serializedInfo = makeReference<AsyncVar<Value>>();
Future<Void> m = tryBecomeLeaderInternal(coordinators,
Future<Void> m = tryBecomeLeaderInternal(connRecord,
ObjectWriter::toValue(proposedInterface, IncludeVersion()),
serializedInfo,
hasConnected,

View File

@ -2726,8 +2726,6 @@ ACTOR Future<Void> fdbd(Reference<IClusterConnectionRecord> connRecord,
actors.push_back(serveProcess());
try {
wait(connRecord->resolveHostnames());
ServerCoordinators coordinators(connRecord);
if (g_network->isSimulated()) {
whitelistBinPaths = ",, random_path, /bin/snap_create.sh,,";
}
@ -2745,8 +2743,8 @@ ACTOR Future<Void> fdbd(Reference<IClusterConnectionRecord> connRecord,
if (coordFolder.size()) {
// SOMEDAY: remove the fileNotFound wrapper and make DiskQueue construction safe from errors setting up
// their files
actors.push_back(fileNotFoundToNever(
coordinationServer(coordFolder, coordinators.ccr, configNode, configBroadcastInterface)));
actors.push_back(
fileNotFoundToNever(coordinationServer(coordFolder, connRecord, configNode, configBroadcastInterface)));
}
state UID processIDUid = wait(createAndLockProcessIdFile(dataFolder));

View File

@ -272,15 +272,20 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
}
// FIXME: typedef this pair type and/or chunk list
ACTOR Future<std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>>>
readFromBlob(Database cx, BlobGranuleCorrectnessWorkload* self, KeyRange range, Version version) {
ACTOR Future<std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>>> readFromBlob(
Database cx,
BlobGranuleCorrectnessWorkload* self,
KeyRange range,
Version beginVersion,
Version readVersion) {
state RangeResult out;
state Standalone<VectorRef<BlobGranuleChunkRef>> chunks;
state Transaction tr(cx);
loop {
try {
Standalone<VectorRef<BlobGranuleChunkRef>> chunks_ = wait(tr.readBlobGranules(range, 0, version));
Standalone<VectorRef<BlobGranuleChunkRef>> chunks_ =
wait(tr.readBlobGranules(range, beginVersion, readVersion));
chunks = chunks_;
break;
} catch (Error& e) {
@ -289,7 +294,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
}
for (const BlobGranuleChunkRef& chunk : chunks) {
RangeResult chunkRows = wait(readBlobGranule(chunk, range, version, self->bstore));
RangeResult chunkRows = wait(readBlobGranule(chunk, range, beginVersion, readVersion, self->bstore));
out.arena().dependsOn(chunkRows.arena());
out.append(out.arena(), chunkRows.begin(), chunkRows.size());
}
@ -321,7 +326,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
Version rv = wait(self->doGrv(&tr));
state Version readVersion = rv;
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob =
wait(self->readFromBlob(cx, self, threadData->directoryRange, readVersion));
wait(self->readFromBlob(cx, self, threadData->directoryRange, 0, readVersion));
fmt::print("Directory {0} got {1} RV {2}\n",
threadData->directoryID,
doSetup ? "initial" : "final",
@ -349,6 +354,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
const Optional<Value>& blobValue,
uint32_t startKey,
uint32_t endKey,
Version beginVersion,
Version readVersion,
const std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>>& blob) {
threadData->mismatches++;
@ -360,11 +366,13 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
ev.detail("DirectoryID", format("%08x", threadData->directoryID))
.detail("RangeStart", format("%08x", startKey))
.detail("RangeEnd", format("%08x", endKey))
.detail("BeginVersion", beginVersion)
.detail("Version", readVersion);
fmt::print("Found mismatch! Request for dir {0} [{1} - {2}) @ {3}\n",
fmt::print("Found mismatch! Request for dir {0} [{1} - {2}) @ {3} - {4}\n",
format("%08x", threadData->directoryID),
format("%08x", startKey),
format("%08x", endKey),
beginVersion,
readVersion);
if (lastMatching.present()) {
fmt::print(" last correct: {}\n", lastMatching.get().printable());
@ -456,6 +464,29 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
readVersion);
}
// because each chunk could be separately collapsed or not if we set beginVersion, we have to track it by chunk
KeyRangeMap<Version> beginVersionByChunk;
beginVersionByChunk.insert(normalKeys, 0);
int beginCollapsed = 0;
int beginNotCollapsed = 0;
for (auto& chunk : blob.second) {
if (!chunk.snapshotFile.present()) {
ASSERT(beginVersion > 0);
ASSERT(chunk.snapshotVersion == invalidVersion);
beginCollapsed++;
beginVersionByChunk.insert(chunk.keyRange, beginVersion);
} else {
ASSERT(chunk.snapshotVersion != invalidVersion);
if (beginVersion > 0) {
beginNotCollapsed++;
}
}
}
TEST(beginCollapsed > 0); // BGCorrectness got collapsed request with beginVersion > 0
TEST(beginNotCollapsed > 0); // BGCorrectness got un-collapsed request with beginVersion > 0
TEST(beginCollapsed > 0 &&
beginNotCollapsed > 0); // BGCorrectness got both collapsed and uncollapsed in the same request!
while (checkIt != threadData->keyData.end() && checkIt->first < endKeyExclusive) {
uint32_t key = checkIt->first;
if (DEBUG_READ_OP(threadData->directoryID, readVersion)) {
@ -475,6 +506,16 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
for (; idIdx < checkIt->second.writes.size() && checkIt->second.writes[idIdx].writeVersion <= readVersion;
idIdx++) {
Key nextKeyShouldBe = threadData->getKey(key, idIdx);
Version keyBeginVersion = beginVersionByChunk.rangeContaining(nextKeyShouldBe).cvalue();
if (keyBeginVersion > checkIt->second.writes[idIdx].writeVersion) {
if (DEBUG_READ_OP(threadData->directoryID, readVersion)) {
fmt::print("DBG READ: Skip ID {0} written @ {1} < beginVersion {2}\n",
idIdx,
checkIt->second.writes[idIdx].clearVersion,
keyBeginVersion);
}
continue;
}
if (DEBUG_READ_OP(threadData->directoryID, readVersion)) {
fmt::print("DBG READ: Checking ID {0} ({1}) written @ {2}\n",
format("%08x", idIdx),
@ -491,6 +532,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
Optional<Value>(),
startKeyInclusive,
endKeyExclusive,
beginVersion,
readVersion,
blob);
return false;
@ -509,6 +551,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
Optional<Value>(),
startKeyInclusive,
endKeyExclusive,
beginVersion,
readVersion,
blob);
return false;
@ -523,6 +566,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
blob.first[resultIdx].value,
startKeyInclusive,
endKeyExclusive,
beginVersion,
readVersion,
blob);
return false;
@ -545,6 +589,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
Optional<Value>(),
startKeyInclusive,
endKeyExclusive,
beginVersion,
readVersion,
blob);
return false;
@ -565,6 +610,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
state double targetReadBytesPerSec = threadData->targetByteRate * 4;
ASSERT(targetReadBytesPerSec > 0);
state Version beginVersion;
state Version readVersion;
TraceEvent("BlobGranuleCorrectnessReaderStart").log();
@ -610,26 +656,42 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
state KeyRange range = KeyRangeRef(threadData->getKey(startKey, 0), threadData->getKey(endKey, 0));
// pick read version
// TODO could also pick begin version here
ASSERT(threadData->writeVersions.back() >= threadData->minSuccessfulReadVersion);
size_t readVersionIdx;
// randomly choose up to date vs time travel read
if (deterministicRandom()->random01() < 0.5) {
threadData->reads++;
readVersionIdx = threadData->writeVersions.size() - 1;
readVersion = threadData->writeVersions.back();
} else {
threadData->timeTravelReads++;
size_t startIdx = 0;
loop {
int readVersionIdx = deterministicRandom()->randomInt(0, threadData->writeVersions.size());
readVersionIdx = deterministicRandom()->randomInt(startIdx, threadData->writeVersions.size());
readVersion = threadData->writeVersions[readVersionIdx];
if (readVersion >= threadData->minSuccessfulReadVersion) {
break;
} else {
startIdx = readVersionIdx + 1;
}
}
}
// randomly choose begin version or not
beginVersion = 0;
if (deterministicRandom()->random01() < 0.5) {
int startIdx = 0;
int endIdxExclusive = readVersionIdx + 1;
// Choose skewed towards later versions. It's ok if beginVersion isn't readable though because it
// will collapse
size_t beginVersionIdx = (size_t)std::sqrt(
deterministicRandom()->randomInt(startIdx * startIdx, endIdxExclusive * endIdxExclusive));
beginVersion = threadData->writeVersions[beginVersionIdx];
}
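// Note on the skew: beginVersionIdx = floor(sqrt(U)) with U drawn uniformly from
// [startIdx^2, endIdxExclusive^2), so P(beginVersionIdx == i) = (2*i + 1) / (endIdxExclusive^2 - startIdx^2).
// The probability grows linearly in i, so later (more recent) write versions are proportionally more likely.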
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob =
wait(self->readFromBlob(cx, self, range, readVersion));
self->validateResult(threadData, blob, startKey, endKey, 0, readVersion);
wait(self->readFromBlob(cx, self, range, beginVersion, readVersion));
self->validateResult(threadData, blob, startKey, endKey, beginVersion, readVersion);
int resultBytes = blob.first.expectedSize();
threadData->rowsRead += blob.first.size();
@ -822,7 +884,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
fmt::print("Directory {0} doing final data check @ {1}\n", threadData->directoryID, readVersion);
}
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob =
wait(self->readFromBlob(cx, self, threadData->directoryRange, readVersion));
wait(self->readFromBlob(cx, self, threadData->directoryRange, 0, readVersion));
result = self->validateResult(threadData, blob, 0, std::numeric_limits<uint32_t>::max(), 0, readVersion);
finalRowsValidated = blob.first.size();

View File

@ -225,7 +225,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
}
for (const BlobGranuleChunkRef& chunk : chunks) {
RangeResult chunkRows = wait(readBlobGranule(chunk, range, version, self->bstore));
RangeResult chunkRows = wait(readBlobGranule(chunk, range, 0, version, self->bstore));
out.arena().dependsOn(chunkRows.arena());
out.append(out.arena(), chunkRows.begin(), chunkRows.size());
}

View File

@ -2378,9 +2378,9 @@ struct ConsistencyCheckWorkload : TestWorkload {
(!nonExcludedWorkerProcessMap.count(db.encryptKeyProxy.get().address()) ||
nonExcludedWorkerProcessMap[db.encryptKeyProxy.get().address()].processClass.machineClassFitness(
ProcessClass::EncryptKeyProxy) > fitnessLowerBound)) {
TraceEvent("ConsistencyCheck_EncyrptKeyProxyNotBest")
TraceEvent("ConsistencyCheck_EncryptKeyProxyNotBest")
.detail("BestEncryptKeyProxyFitness", fitnessLowerBound)
.detail("ExistingEncyrptKeyProxyFitness",
.detail("ExistingEncryptKeyProxyFitness",
nonExcludedWorkerProcessMap.count(db.encryptKeyProxy.get().address())
? nonExcludedWorkerProcessMap[db.encryptKeyProxy.get().address()]
.processClass.machineClassFitness(ProcessClass::EncryptKeyProxy)

View File

@ -21,16 +21,19 @@
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/NativeAPI.actor.h"
#include "flow/IRandom.h"
#include "flow/StreamCipher.h"
#include "flow/BlobCipher.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "flow/ITrace.h"
#include "flow/Trace.h"
#include "flow/actorcompiler.h" // This must be the last #include.
#if ENCRYPTION_ENABLED
#include <chrono>
#include <cstring>
#include <memory>
#include <random>
#include "flow/actorcompiler.h" // This must be the last #include.
#if ENCRYPTION_ENABLED
#define MEGA_BYTES (1024 * 1024)
#define NANO_SECOND (1000 * 1000 * 1000)
@ -78,45 +81,69 @@ struct WorkloadMetrics {
}
};
// Workload generator for encryption/decryption operations.
// 1. For every client run, it generates a unique random encryptionDomainId range and simulates encryption of
// either a fixed-size or a variable-size payload.
// 2. For each encryption run, it interacts with BlobCipherKeyCache to fetch the desired encryption key,
// which then is used for encrypting the plaintext payload.
// 3. The encryption operation generates an 'encryption header', which is leveraged to decrypt the ciphertext
// obtained from step#2 (simulating the real-world scenario)
//
// Correctness validations:
// -----------------------
// Correctness invariants are validated at various steps:
// 1. Encryption key correctness: as part of performing decryption, a BlobCipherKeyCache lookup is done to procure
// the desired encryption key based on {encryptionDomainId, baseCipherId}; the obtained key is validated against
// the encryption key used for encrypting the data.
// 2. After encryption, generated 'encryption header' fields are validated, encrypted buffer size and contents are
// validated.
// 3. After decryption, the obtained deciphertext is validated against the orginal plaintext payload.
//
// Performance metrics:
// -------------------
// The workload generator profiles the following operations across the iterations and logs the details at the end:
// 1. Time spent in encryption key fetch (and derivation) operations.
// 2. Time spent encrypting the buffer (doesn't include key lookup time); also records the throughput in MB/sec.
// 3. Time spent decrypting the buffer (doesn't include key lookup time); also records the throughput in MB/sec.
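// A minimal, self-contained sketch of the encrypt/decrypt flow described above, using only the
// BlobCipherKeyCache / EncryptBlobCipherAes265Ctr / DecryptBlobCipherAes256Ctr API added in flow/BlobCipher.h.
// The domain id (1) and base cipher id (100) below are illustrative placeholders, not values used by the workload.
[[maybe_unused]] static void blobCipherRoundTripSketch(const uint8_t* payload, int payloadLen, Arena& arena) {
	auto& cipherKeyCache = BlobCipherKeyCache::getInstance();

	// Register a base cipher for an encryption domain (normally supplied by an external key manager).
	uint8_t baseCipher[AES_256_KEY_LENGTH];
	generateRandomData(&baseCipher[0], AES_256_KEY_LENGTH);
	cipherKeyCache.insertCipherKey(/*domainId*/ 1, /*baseCipherId*/ 100, &baseCipher[0], AES_256_KEY_LENGTH);

	// Encrypt with the latest cached key; the header records {domainId, baseCipherId, salt, iv, checksum}.
	uint8_t iv[AES_256_IV_LENGTH];
	generateRandomData(&iv[0], AES_256_IV_LENGTH);
	Reference<BlobCipherKey> cipherKey = cipherKeyCache.getLatestCipherKey(1);
	EncryptBlobCipherAes265Ctr encryptor(cipherKey, &iv[0], AES_256_IV_LENGTH);
	BlobCipherEncryptHeader header;
	Reference<EncryptBuf> encrypted = encryptor.encrypt(payload, payloadLen, &header, arena);

	// Decrypt by re-resolving the cipher key from the persisted header, then check the round trip.
	Reference<BlobCipherKey> readKey = cipherKeyCache.getCipherKey(header.encryptDomainId, header.baseCipherId);
	DecryptBlobCipherAes256Ctr decryptor(readKey, &header.iv[0]);
	Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), payloadLen, header, arena);
	ASSERT(memcmp(decrypted->begin(), payload, payloadLen) == 0);
}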
struct EncryptionOpsWorkload : TestWorkload {
int mode;
int64_t numIterations;
int pageSize;
int maxBufSize;
std::unique_ptr<uint8_t[]> buff;
std::unique_ptr<uint8_t[]> validationBuff;
StreamCipher::IV iv;
std::unique_ptr<HmacSha256StreamCipher> hmacGenerator;
std::unique_ptr<uint8_t[]> parentKey;
Arena arena;
std::unique_ptr<WorkloadMetrics> metrics;
BlobCipherDomainId minDomainId;
BlobCipherDomainId maxDomainId;
BlobCipherBaseKeyId minBaseCipherId;
EncryptionOpsWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
mode = getOption(options, LiteralStringRef("fixedSize"), 1);
numIterations = getOption(options, LiteralStringRef("numIterations"), 10);
pageSize = getOption(options, LiteralStringRef("pageSize"), 4096);
maxBufSize = getOption(options, LiteralStringRef("maxBufSize"), 512 * 1024);
buff = std::make_unique<uint8_t[]>(maxBufSize);
validationBuff = std::make_unique<uint8_t[]>(maxBufSize);
iv = getRandomIV();
hmacGenerator = std::make_unique<HmacSha256StreamCipher>();
parentKey = std::make_unique<uint8_t[]>(AES_256_KEY_LENGTH);
generateRandomData(parentKey.get(), AES_256_KEY_LENGTH);
// assign unique encryptionDomainId range per workload clients
minDomainId = wcx.clientId * 100 + mode * 30 + 1;
maxDomainId = deterministicRandom()->randomInt(minDomainId, minDomainId + 10) + 5;
minBaseCipherId = 100;
metrics = std::make_unique<WorkloadMetrics>();
TraceEvent("EncryptionOpsWorkload").detail("Mode", getModeStr());
TraceEvent("EncryptionOpsWorkload")
.detail("Mode", getModeStr())
.detail("MinDomainId", minDomainId)
.detail("MaxDomainId", maxDomainId);
}
~EncryptionOpsWorkload() { TraceEvent("EncryptionOpsWorkload_Done").log(); }
bool isFixedSizePayload() { return mode == 1; }
StreamCipher::IV getRandomIV() {
generateRandomData(iv.data(), iv.size());
return iv;
}
std::string getModeStr() const {
if (mode == 1) {
return "FixedSize";
@ -127,47 +154,97 @@ struct EncryptionOpsWorkload : TestWorkload {
throw internal_error();
}
void updateEncryptionKey(StreamCipherKey* cipherKey) {
auto start = std::chrono::high_resolution_clock::now();
applyHmacKeyDerivationFunc(cipherKey, hmacGenerator.get(), arena);
auto end = std::chrono::high_resolution_clock::now();
metrics->updateKeyDerivationTime(std::chrono::duration<double, std::nano>(end - start).count());
void generateRandomBaseCipher(const int maxLen, uint8_t* buff, int* retLen) {
memset(buff, 0, maxLen);
*retLen = deterministicRandom()->randomInt(maxLen / 2, maxLen);
generateRandomData(buff, *retLen);
}
StringRef doEncryption(const StreamCipherKey* key, uint8_t* payload, int len) {
EncryptionStreamCipher encryptor(key, iv);
void setupCipherEssentials() {
auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
TraceEvent("SetupCipherEssentials_Start").detail("MinDomainId", minDomainId).detail("MaxDomainId", maxDomainId);
uint8_t buff[AES_256_KEY_LENGTH];
std::vector<Reference<BlobCipherKey>> cipherKeys;
for (BlobCipherDomainId id = minDomainId; id <= maxDomainId; id++) {
int cipherLen = 0;
generateRandomBaseCipher(AES_256_KEY_LENGTH, &buff[0], &cipherLen);
cipherKeyCache.insertCipherKey(id, minBaseCipherId, buff, cipherLen);
ASSERT(cipherLen > 0 && cipherLen <= AES_256_KEY_LENGTH);
cipherKeys = cipherKeyCache.getAllCiphers(id);
ASSERT(cipherKeys.size() == 1);
}
TraceEvent("SetupCipherEssentials_Done").detail("MinDomainId", minDomainId).detail("MaxDomainId", maxDomainId);
}
void resetCipherEssentials() {
auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
cipherKeyCache.cleanup();
TraceEvent("ResetCipherEssentials_Done").log();
}
void updateLatestBaseCipher(const BlobCipherDomainId encryptDomainId,
uint8_t* baseCipher,
int* baseCipherLen,
BlobCipherBaseKeyId* nextBaseCipherId) {
auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getLatestCipherKey(encryptDomainId);
*nextBaseCipherId = cipherKey->getBaseCipherId() + 1;
generateRandomBaseCipher(AES_256_KEY_LENGTH, baseCipher, baseCipherLen);
ASSERT(*baseCipherLen > 0 && *baseCipherLen <= AES_256_KEY_LENGTH);
TraceEvent("UpdateBaseCipher").detail("DomainId", encryptDomainId).detail("BaseCipherId", *nextBaseCipherId);
}
Reference<EncryptBuf> doEncryption(Reference<BlobCipherKey> key,
uint8_t* payload,
int len,
BlobCipherEncryptHeader* header) {
uint8_t iv[AES_256_IV_LENGTH];
generateRandomData(&iv[0], AES_256_IV_LENGTH);
EncryptBlobCipherAes265Ctr encryptor(key, &iv[0], AES_256_IV_LENGTH);
auto start = std::chrono::high_resolution_clock::now();
auto encrypted = encryptor.encrypt(buff.get(), len, arena);
encryptor.finish(arena);
Reference<EncryptBuf> encrypted = encryptor.encrypt(payload, len, header, arena);
auto end = std::chrono::high_resolution_clock::now();
// validate encrypted buffer size and contents (not matching with plaintext)
ASSERT(encrypted.size() == len);
std::copy(encrypted.begin(), encrypted.end(), validationBuff.get());
ASSERT(memcmp(validationBuff.get(), buff.get(), len) != 0);
ASSERT(encrypted->getLogicalSize() == len);
ASSERT(memcmp(encrypted->begin(), payload, len) != 0);
ASSERT(header->flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
metrics->updateEncryptionTime(std::chrono::duration<double, std::nano>(end - start).count());
return encrypted;
}
void doDecryption(const StreamCipherKey* key,
const StringRef& encrypted,
void doDecryption(Reference<EncryptBuf> encrypted,
int len,
const BlobCipherEncryptHeader& header,
uint8_t* originalPayload,
uint8_t* validationBuff) {
DecryptionStreamCipher decryptor(key, iv);
Reference<BlobCipherKey> orgCipherKey) {
ASSERT(header.flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
ASSERT(header.flags.encryptMode == BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR);
auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getCipherKey(header.encryptDomainId, header.baseCipherId);
ASSERT(cipherKey.isValid());
ASSERT(cipherKey->isEqual(orgCipherKey));
DecryptBlobCipherAes256Ctr decryptor(cipherKey, &header.iv[0]);
auto start = std::chrono::high_resolution_clock::now();
Standalone<StringRef> decrypted = decryptor.decrypt(encrypted.begin(), len, arena);
decryptor.finish(arena);
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), len, header, arena);
auto end = std::chrono::high_resolution_clock::now();
// validate decrypted buffer size and contents (matching with original plaintext)
ASSERT(decrypted.size() == len);
std::copy(decrypted.begin(), decrypted.end(), validationBuff);
ASSERT(memcmp(validationBuff, originalPayload, len) == 0);
ASSERT(decrypted->getLogicalSize() == len);
ASSERT(memcmp(decrypted->begin(), originalPayload, len) == 0);
metrics->updateDecryptionTime(std::chrono::duration<double, std::nano>(end - start).count());
}
@ -177,22 +254,64 @@ struct EncryptionOpsWorkload : TestWorkload {
std::string description() const override { return "EncryptionOps"; }
Future<Void> start(Database const& cx) override {
uint8_t baseCipher[AES_256_KEY_LENGTH];
int baseCipherLen = 0;
BlobCipherBaseKeyId nextBaseCipherId;
// Setup encryptDomainIds and corresponding baseCipher details
setupCipherEssentials();
for (int i = 0; i < numIterations; i++) {
StreamCipherKey key(AES_256_KEY_LENGTH);
// derive the encryption key
updateEncryptionKey(&key);
bool updateBaseCipher = deterministicRandom()->randomInt(1, 100) < 5;
// Step-1: Encryption key derivation, caching the cipher for later use
auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
// randomly select a domainId
const BlobCipherDomainId encryptDomainId = deterministicRandom()->randomInt(minDomainId, maxDomainId);
ASSERT(encryptDomainId >= minDomainId && encryptDomainId <= maxDomainId);
if (updateBaseCipher) {
// simulate baseCipherId getting refreshed/updated
updateLatestBaseCipher(encryptDomainId, &baseCipher[0], &baseCipherLen, &nextBaseCipherId);
cipherKeyCache.insertCipherKey(encryptDomainId, nextBaseCipherId, &baseCipher[0], baseCipherLen);
}
auto start = std::chrono::high_resolution_clock::now();
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getLatestCipherKey(encryptDomainId);
auto end = std::chrono::high_resolution_clock::now();
metrics->updateKeyDerivationTime(std::chrono::duration<double, std::nano>(end - start).count());
// Validate sanity of "getLatestCipher", especially when baseCipher gets updated
if (updateBaseCipher) {
ASSERT(cipherKey->getBaseCipherId() == nextBaseCipherId);
ASSERT(cipherKey->getBaseCipherLen() == baseCipherLen);
ASSERT(memcmp(cipherKey->rawBaseCipher(), baseCipher, baseCipherLen) == 0);
}
int dataLen = isFixedSizePayload() ? pageSize : deterministicRandom()->randomInt(100, maxBufSize);
generateRandomData(buff.get(), dataLen);
// encrypt the payload
const auto& encrypted = doEncryption(&key, buff.get(), dataLen);
// Encrypt the payload - generates BlobCipherEncryptHeader to assist decryption later
BlobCipherEncryptHeader header;
try {
Reference<EncryptBuf> encrypted = doEncryption(cipherKey, buff.get(), dataLen, &header);
// decrypt the payload
doDecryption(&key, encrypted, dataLen, buff.get(), validationBuff.get());
// Decrypt the payload - parses the BlobCipherEncryptHeader, fetch corresponding cipherKey and
// decrypt
doDecryption(encrypted, dataLen, header, buff.get(), cipherKey);
} catch (Error& e) {
TraceEvent("Failed")
.detail("DomainId", encryptDomainId)
.detail("BaseCipherId", cipherKey->getBaseCipherId());
throw;
}
metrics->updateBytes(dataLen);
}
// Cleanup cipherKeys
resetCipherEssentials();
return Void();
}

View File

@ -30,6 +30,7 @@ void forceLinkMemcpyTests();
void forceLinkMemcpyPerfTests();
#if (!defined(TLS_DISABLED) && !defined(_WIN32))
void forceLinkStreamCipherTests();
void forceLinkBlobCipherTests();
#endif
void forceLinkParallelStreamTests();
void forceLinkSimExternalConnectionTests();
@ -76,6 +77,7 @@ struct UnitTestWorkload : TestWorkload {
forceLinkMemcpyPerfTests();
#if (!defined(TLS_DISABLED) && !defined(_WIN32))
forceLinkStreamCipherTests();
forceLinkBlobCipherTests();
#endif
forceLinkParallelStreamTests();
forceLinkSimExternalConnectionTests();

View File

@ -342,23 +342,23 @@ ArenaBlock* ArenaBlock::create(int dataSize, Reference<ArenaBlock>& next) {
b->bigSize = 256;
INSTRUMENT_ALLOCATE("Arena256");
} else if (reqSize <= 512) {
b = (ArenaBlock*)FastAllocator<512>::allocate();
b = (ArenaBlock*)new uint8_t[512];
b->bigSize = 512;
INSTRUMENT_ALLOCATE("Arena512");
} else if (reqSize <= 1024) {
b = (ArenaBlock*)FastAllocator<1024>::allocate();
b = (ArenaBlock*)new uint8_t[1024];
b->bigSize = 1024;
INSTRUMENT_ALLOCATE("Arena1024");
} else if (reqSize <= 2048) {
b = (ArenaBlock*)FastAllocator<2048>::allocate();
b = (ArenaBlock*)new uint8_t[2048];
b->bigSize = 2048;
INSTRUMENT_ALLOCATE("Arena2048");
} else if (reqSize <= 4096) {
b = (ArenaBlock*)FastAllocator<4096>::allocate();
b = (ArenaBlock*)new uint8_t[4096];
b->bigSize = 4096;
INSTRUMENT_ALLOCATE("Arena4096");
} else {
b = (ArenaBlock*)FastAllocator<8192>::allocate();
b = (ArenaBlock*)new uint8_t[8192];
b->bigSize = 8192;
INSTRUMENT_ALLOCATE("Arena8192");
}
@ -460,26 +460,26 @@ void ArenaBlock::destroyLeaf() {
FastAllocator<256>::release(this);
INSTRUMENT_RELEASE("Arena256");
} else if (bigSize <= 512) {
FastAllocator<512>::release(this);
delete[] reinterpret_cast<uint8_t*>(this);
INSTRUMENT_RELEASE("Arena512");
} else if (bigSize <= 1024) {
FastAllocator<1024>::release(this);
delete[] reinterpret_cast<uint8_t*>(this);
INSTRUMENT_RELEASE("Arena1024");
} else if (bigSize <= 2048) {
FastAllocator<2048>::release(this);
delete[] reinterpret_cast<uint8_t*>(this);
INSTRUMENT_RELEASE("Arena2048");
} else if (bigSize <= 4096) {
FastAllocator<4096>::release(this);
delete[] reinterpret_cast<uint8_t*>(this);
INSTRUMENT_RELEASE("Arena4096");
} else if (bigSize <= 8192) {
FastAllocator<8192>::release(this);
delete[] reinterpret_cast<uint8_t*>(this);
INSTRUMENT_RELEASE("Arena8192");
} else {
#ifdef ALLOC_INSTRUMENTATION
allocInstr["ArenaHugeKB"].dealloc((bigSize + 1023) >> 10);
#endif
g_hugeArenaMemory.fetch_sub(bigSize);
delete[](uint8_t*) this;
delete[] reinterpret_cast<uint8_t*>(this);
}
}
}

flow/BlobCipher.cpp Normal file
View File

@ -0,0 +1,652 @@
/*
* BlobCipher.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "flow/BlobCipher.h"
#include "flow/Error.h"
#include "flow/FastRef.h"
#include "flow/IRandom.h"
#include "flow/ITrace.h"
#include "flow/network.h"
#include "flow/Trace.h"
#include "flow/UnitTest.h"
#include <cstring>
#include <memory>
#if ENCRYPTION_ENABLED
// BlobCipherEncryptHeader
BlobCipherEncryptHeader::BlobCipherEncryptHeader() {
flags.encryptMode = BLOB_CIPHER_ENCRYPT_MODE_NONE;
}
// BlobCipherKey class methods
BlobCipherKey::BlobCipherKey(const BlobCipherDomainId& domainId,
const BlobCipherBaseKeyId& baseCiphId,
const uint8_t* baseCiph,
int baseCiphLen) {
BlobCipherRandomSalt salt;
if (g_network->isSimulated()) {
salt = deterministicRandom()->randomUInt64();
} else {
salt = nondeterministicRandom()->randomUInt64();
}
initKey(domainId, baseCiph, baseCiphLen, baseCiphId, salt);
/*TraceEvent("BlobCipherKey")
.detail("DomainId", domainId)
.detail("BaseCipherId", baseCipherId)
.detail("BaseCipherLen", baseCipherLen)
.detail("RandomSalt", randomSalt)
.detail("CreationTime", creationTime);*/
}
void BlobCipherKey::initKey(const BlobCipherDomainId& domainId,
const uint8_t* baseCiph,
int baseCiphLen,
const BlobCipherBaseKeyId& baseCiphId,
const BlobCipherRandomSalt& salt) {
// Set the base encryption key properties
baseCipher = std::make_unique<uint8_t[]>(AES_256_KEY_LENGTH);
memset(baseCipher.get(), 0, AES_256_KEY_LENGTH);
memcpy(baseCipher.get(), baseCiph, std::min<int>(baseCiphLen, AES_256_KEY_LENGTH));
baseCipherLen = baseCiphLen;
baseCipherId = baseCiphId;
// Set the encryption domain for the base encryption key
encryptDomainId = domainId;
randomSalt = salt;
// derive the encryption key
cipher = std::make_unique<uint8_t[]>(AES_256_KEY_LENGTH);
memset(cipher.get(), 0, AES_256_KEY_LENGTH);
applyHmacSha256Derivation();
// update the key creation time
creationTime = now();
}
void BlobCipherKey::applyHmacSha256Derivation() {
Arena arena;
uint8_t buf[baseCipherLen + sizeof(BlobCipherRandomSalt)];
memcpy(&buf[0], baseCipher.get(), baseCipherLen);
memcpy(&buf[0] + baseCipherLen, &randomSalt, sizeof(BlobCipherRandomSalt));
HmacSha256DigestGen hmacGen(baseCipher.get(), baseCipherLen);
StringRef digest = hmacGen.digest(&buf[0], baseCipherLen + sizeof(BlobCipherRandomSalt), arena);
std::copy(digest.begin(), digest.end(), cipher.get());
if (digest.size() < AES_256_KEY_LENGTH) {
memcpy(cipher.get() + digest.size(), buf, AES_256_KEY_LENGTH - digest.size());
}
}
void BlobCipherKey::reset() {
memset(baseCipher.get(), 0, baseCipherLen);
memset(cipher.get(), 0, AES_256_KEY_LENGTH);
}
// BlobKeyIdCache class methods
BlobCipherKeyIdCache::BlobCipherKeyIdCache()
: domainId(INVALID_DOMAIN_ID), latestBaseCipherKeyId(INVALID_CIPHER_KEY_ID) {}
BlobCipherKeyIdCache::BlobCipherKeyIdCache(BlobCipherDomainId dId)
: domainId(dId), latestBaseCipherKeyId(INVALID_CIPHER_KEY_ID) {
TraceEvent("Init_BlobCipherKeyIdCache").detail("DomainId", domainId);
}
Reference<BlobCipherKey> BlobCipherKeyIdCache::getLatestCipherKey() {
return getCipherByBaseCipherId(latestBaseCipherKeyId);
}
Reference<BlobCipherKey> BlobCipherKeyIdCache::getCipherByBaseCipherId(BlobCipherBaseKeyId baseCipherKeyId) {
BlobCipherKeyIdCacheMapCItr itr = keyIdCache.find(baseCipherKeyId);
if (itr == keyIdCache.end()) {
throw encrypt_key_not_found();
}
return itr->second;
}
void BlobCipherKeyIdCache::insertBaseCipherKey(BlobCipherBaseKeyId baseCipherId,
const uint8_t* baseCipher,
int baseCipherLen) {
ASSERT(baseCipherId > INVALID_CIPHER_KEY_ID);
// BaseCipherKeys are immutable, ensure that cached value doesn't get updated.
BlobCipherKeyIdCacheMapCItr itr = keyIdCache.find(baseCipherId);
if (itr != keyIdCache.end()) {
if (memcmp(itr->second->rawBaseCipher(), baseCipher, baseCipherLen) == 0) {
TraceEvent("InsertBaseCipherKey_AlreadyPresent")
.detail("BaseCipherKeyId", baseCipherId)
.detail("DomainId", domainId);
// Key is already present; nothing more to do.
return;
} else {
TraceEvent("InsertBaseCipherKey_UpdateCipher")
.detail("BaseCipherKeyId", baseCipherId)
.detail("DomainId", domainId);
throw encrypt_update_cipher();
}
}
keyIdCache.emplace(baseCipherId, makeReference<BlobCipherKey>(domainId, baseCipherId, baseCipher, baseCipherLen));
// Update the latest BaseCipherKeyId for the given encryption domain
latestBaseCipherKeyId = baseCipherId;
}
void BlobCipherKeyIdCache::cleanup() {
for (auto& keyItr : keyIdCache) {
keyItr.second->reset();
}
keyIdCache.clear();
}
std::vector<Reference<BlobCipherKey>> BlobCipherKeyIdCache::getAllCipherKeys() {
std::vector<Reference<BlobCipherKey>> cipherKeys;
for (auto& keyItr : keyIdCache) {
cipherKeys.push_back(keyItr.second);
}
return cipherKeys;
}
// BlobCipherKeyCache class methods
void BlobCipherKeyCache::insertCipherKey(const BlobCipherDomainId& domainId,
const BlobCipherBaseKeyId& baseCipherId,
const uint8_t* baseCipher,
int baseCipherLen) {
if (domainId == INVALID_DOMAIN_ID || baseCipherId == INVALID_CIPHER_KEY_ID) {
throw encrypt_invalid_id();
}
try {
auto domainItr = domainCacheMap.find(domainId);
if (domainItr == domainCacheMap.end()) {
// Add mapping to track new encryption domain
Reference<BlobCipherKeyIdCache> keyIdCache = makeReference<BlobCipherKeyIdCache>(domainId);
keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen);
domainCacheMap.emplace(domainId, keyIdCache);
} else {
// Track new baseCipher keys
Reference<BlobCipherKeyIdCache> keyIdCache = domainItr->second;
keyIdCache->insertBaseCipherKey(baseCipherId, baseCipher, baseCipherLen);
}
TraceEvent("InsertCipherKey").detail("DomainId", domainId).detail("BaseCipherKeyId", baseCipherId);
} catch (Error& e) {
TraceEvent("InsertCipherKey_Failed").detail("BaseCipherKeyId", baseCipherId).detail("DomainId", domainId);
throw;
}
}
Reference<BlobCipherKey> BlobCipherKeyCache::getLatestCipherKey(const BlobCipherDomainId& domainId) {
auto domainItr = domainCacheMap.find(domainId);
if (domainItr == domainCacheMap.end()) {
TraceEvent("GetLatestCipherKey_DomainNotFound").detail("DomainId", domainId);
throw encrypt_key_not_found();
}
Reference<BlobCipherKeyIdCache> keyIdCache = domainItr->second;
Reference<BlobCipherKey> cipherKey = keyIdCache->getLatestCipherKey();
if ((now() - cipherKey->getCreationTime()) > BlobCipherKeyCache::CIPHER_KEY_CACHE_TTL_SEC) {
TraceEvent("GetLatestCipherKey_ExpiredTTL")
.detail("DomainId", domainId)
.detail("BaseCipherId", cipherKey->getBaseCipherId());
throw encrypt_key_ttl_expired();
}
return cipherKey;
}
Reference<BlobCipherKey> BlobCipherKeyCache::getCipherKey(const BlobCipherDomainId& domainId,
const BlobCipherBaseKeyId& baseCipherId) {
auto domainItr = domainCacheMap.find(domainId);
if (domainItr == domainCacheMap.end()) {
throw encrypt_key_not_found();
}
Reference<BlobCipherKeyIdCache> keyIdCache = domainItr->second;
return keyIdCache->getCipherByBaseCipherId(baseCipherId);
}
void BlobCipherKeyCache::resetEncyrptDomainId(const BlobCipherDomainId domainId) {
auto domainItr = domainCacheMap.find(domainId);
if (domainItr == domainCacheMap.end()) {
throw encrypt_key_not_found();
}
Reference<BlobCipherKeyIdCache> keyIdCache = domainItr->second;
keyIdCache->cleanup();
TraceEvent("ResetEncryptDomainId").detail("DomainId", domainId);
}
void BlobCipherKeyCache::cleanup() noexcept {
BlobCipherKeyCache& instance = BlobCipherKeyCache::getInstance();
for (auto& domainItr : instance.domainCacheMap) {
Reference<BlobCipherKeyIdCache> keyIdCache = domainItr.second;
keyIdCache->cleanup();
TraceEvent("BlobCipherKeyCache_Cleanup").detail("DomainId", domainItr.first);
}
instance.domainCacheMap.clear();
}
std::vector<Reference<BlobCipherKey>> BlobCipherKeyCache::getAllCiphers(const BlobCipherDomainId& domainId) {
auto domainItr = domainCacheMap.find(domainId);
if (domainItr == domainCacheMap.end()) {
return {};
}
Reference<BlobCipherKeyIdCache> keyIdCache = domainItr->second;
return keyIdCache->getAllCipherKeys();
}
// EncryptBlobCipher class methods
EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey> key,
const uint8_t* cipherIV,
const int ivLen)
: ctx(EVP_CIPHER_CTX_new()), cipherKey(key) {
ASSERT(ivLen == AES_256_IV_LENGTH);
memcpy(&iv[0], cipherIV, ivLen);
if (ctx == nullptr) {
throw encrypt_ops_error();
}
if (EVP_EncryptInit_ex(ctx, EVP_aes_256_ctr(), nullptr, nullptr, nullptr) != 1) {
throw encrypt_ops_error();
}
if (EVP_EncryptInit_ex(ctx, nullptr, nullptr, key.getPtr()->data(), cipherIV) != 1) {
throw encrypt_ops_error();
}
}
Reference<EncryptBuf> EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plaintext,
const int plaintextLen,
BlobCipherEncryptHeader* header,
Arena& arena) {
TEST(true); // Encrypting data with BlobCipher
Reference<EncryptBuf> encryptBuf = makeReference<EncryptBuf>(plaintextLen + AES_BLOCK_SIZE, arena);
uint8_t* ciphertext = encryptBuf->begin();
int bytes{ 0 };
if (EVP_EncryptUpdate(ctx, ciphertext, &bytes, plaintext, plaintextLen) != 1) {
TraceEvent("Encrypt_UpdateFailed")
.detail("BaseCipherId", cipherKey->getBaseCipherId())
.detail("EncryptDomainId", cipherKey->getDomainId());
throw encrypt_ops_error();
}
int finalBytes{ 0 };
if (EVP_EncryptFinal_ex(ctx, ciphertext + bytes, &finalBytes) != 1) {
TraceEvent("Encrypt_FinalFailed")
.detail("BaseCipherId", cipherKey->getBaseCipherId())
.detail("EncryptDomainId", cipherKey->getDomainId());
throw encrypt_ops_error();
}
if ((bytes + finalBytes) != plaintextLen) {
TraceEvent("Encrypt_UnexpectedCipherLen")
.detail("PlaintextLen", plaintextLen)
.detail("EncryptedBufLen", bytes + finalBytes);
throw encrypt_ops_error();
}
// populate header details for the encrypted blob.
header->flags.size = sizeof(BlobCipherEncryptHeader);
header->flags.headerVersion = EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION;
header->flags.encryptMode = BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR;
header->baseCipherId = cipherKey->getBaseCipherId();
header->encryptDomainId = cipherKey->getDomainId();
header->salt = cipherKey->getSalt();
memcpy(&header->iv[0], &iv[0], AES_256_IV_LENGTH);
// Preserve the checksum of the encrypted bytes in the header; this protects against disk-induced bit-rot/flip
// scenarios. AES CTR mode doesn't generate a 'tag' by default, unlike schemes such as AES-256-GCM.
header->ciphertextChecksum = computeEncryptChecksum(ciphertext, bytes + finalBytes, cipherKey->getSalt(), arena);
encryptBuf->setLogicalSize(plaintextLen);
return encryptBuf;
}
EncryptBlobCipherAes265Ctr::~EncryptBlobCipherAes265Ctr() {
if (ctx != nullptr) {
EVP_CIPHER_CTX_free(ctx);
}
}
// DecryptBlobCipher class methods
DecryptBlobCipherAes256Ctr::DecryptBlobCipherAes256Ctr(Reference<BlobCipherKey> key, const uint8_t* iv)
: ctx(EVP_CIPHER_CTX_new()) {
if (ctx == nullptr) {
throw encrypt_ops_error();
}
if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_ctr(), nullptr, nullptr, nullptr)) {
throw encrypt_ops_error();
}
if (!EVP_DecryptInit_ex(ctx, nullptr, nullptr, key.getPtr()->data(), iv)) {
throw encrypt_ops_error();
}
}
void DecryptBlobCipherAes256Ctr::verifyEncryptBlobHeader(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
Arena& arena) {
// validate header flag sanity
if (header.flags.headerVersion != EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION ||
header.flags.encryptMode != BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR) {
TraceEvent("VerifyEncryptBlobHeader")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderMode", header.flags.encryptMode)
.detail("ExpectedVersion", EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION)
.detail("ExpectedMode", BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR);
throw encrypt_header_metadata_mismatch();
}
// encrypted byte checksum sanity; protection against data bit-rot/flip.
BlobCipherChecksum computed = computeEncryptChecksum(ciphertext, ciphertextLen, header.salt, arena);
if (computed != header.ciphertextChecksum) {
TraceEvent("VerifyEncryptBlobHeader_ChecksumMismatch")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderMode", header.flags.encryptMode)
.detail("CiphertextChecksum", header.ciphertextChecksum)
.detail("ComputedCiphertextChecksum", computed);
throw encrypt_header_checksum_mismatch();
}
}
Reference<EncryptBuf> DecryptBlobCipherAes256Ctr::decrypt(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
Arena& arena) {
TEST(true); // Decrypting data with BlobCipher
verifyEncryptBlobHeader(ciphertext, ciphertextLen, header, arena);
Reference<EncryptBuf> decrypted = makeReference<EncryptBuf>(ciphertextLen + AES_BLOCK_SIZE, arena);
uint8_t* plaintext = decrypted->begin();
int bytesDecrypted{ 0 };
if (!EVP_DecryptUpdate(ctx, plaintext, &bytesDecrypted, ciphertext, ciphertextLen)) {
TraceEvent("Decrypt_UpdateFailed")
.detail("BaseCipherId", header.baseCipherId)
.detail("EncryptDomainId", header.encryptDomainId);
throw encrypt_ops_error();
}
int finalBlobBytes{ 0 };
if (EVP_DecryptFinal_ex(ctx, plaintext + bytesDecrypted, &finalBlobBytes) <= 0) {
TraceEvent("Decrypt_FinalFailed")
.detail("BaseCipherId", header.baseCipherId)
.detail("EncryptDomainId", header.encryptDomainId);
throw encrypt_ops_error();
}
if ((bytesDecrypted + finalBlobBytes) != ciphertextLen) {
TraceEvent("Encrypt_UnexpectedPlaintextLen")
.detail("CiphertextLen", ciphertextLen)
.detail("DecryptedBufLen", bytesDecrypted + finalBlobBytes);
throw encrypt_ops_error();
}
decrypted->setLogicalSize(ciphertextLen);
return decrypted;
}
DecryptBlobCipherAes256Ctr::~DecryptBlobCipherAes256Ctr() {
if (ctx != nullptr) {
EVP_CIPHER_CTX_free(ctx);
}
}
// HmacSha256DigestGen class methods
HmacSha256DigestGen::HmacSha256DigestGen(const unsigned char* key, size_t len) : ctx(HMAC_CTX_new()) {
if (!HMAC_Init_ex(ctx, key, len, EVP_sha256(), nullptr)) {
throw encrypt_ops_error();
}
}
HmacSha256DigestGen::~HmacSha256DigestGen() {
if (ctx != nullptr) {
HMAC_CTX_free(ctx);
}
}
StringRef HmacSha256DigestGen::digest(const unsigned char* data, size_t len, Arena& arena) {
TEST(true); // Digest generation
unsigned int digestLen = HMAC_size(ctx);
auto digest = new (arena) unsigned char[digestLen];
if (HMAC_Update(ctx, data, len) != 1) {
throw encrypt_ops_error();
}
if (HMAC_Final(ctx, digest, &digestLen) != 1) {
throw encrypt_ops_error();
}
return StringRef(digest, digestLen);
}
// Only used to link unit tests
void forceLinkBlobCipherTests() {}
// Test cases include:
// 1. Populate cache by inserting 'baseCipher' details for new encryptionDomainIds
// 2. Random lookup for cipherKeys and content validation
// 3. Inserting an 'identical' cipherKey (already cached) more than once works as desired.
// 4. Inserting a 'non-identical' cipherKey (already cached) more than once works as desired.
// 5. Validation of encryption ops (correctness):
// 5.1. Encrypt a buffer followed by decryption of the buffer, validate the contents.
// 5.2. Simulate anomalies such as: EncryptionHeader corruption, checksum mismatch / encryptionMode mismatch etc.
// 6. Cache cleanup
// 6.1 cleanup cipherKeys by given encryptDomainId
// 6.2. Cleanup all cached cipherKeys
TEST_CASE("flow/BlobCipher") {
TraceEvent("BlobCipherTest_Start").log();
// Construct a dummy External Key Manager representation and populate with some keys
class BaseCipher : public ReferenceCounted<BaseCipher>, NonCopyable {
public:
BlobCipherDomainId domainId;
int len;
BlobCipherBaseKeyId keyId;
std::unique_ptr<uint8_t[]> key;
BaseCipher(const BlobCipherDomainId& dId, const BlobCipherBaseKeyId& kId)
: domainId(dId), len(deterministicRandom()->randomInt(AES_256_KEY_LENGTH / 2, AES_256_KEY_LENGTH + 1)),
keyId(kId), key(std::make_unique<uint8_t[]>(len)) {
generateRandomData(key.get(), len);
}
};
using BaseKeyMap = std::unordered_map<BlobCipherBaseKeyId, Reference<BaseCipher>>;
using DomainKeyMap = std::unordered_map<BlobCipherDomainId, BaseKeyMap>;
DomainKeyMap domainKeyMap;
const BlobCipherDomainId minDomainId = 1;
const BlobCipherDomainId maxDomainId = deterministicRandom()->randomInt(minDomainId, minDomainId + 10) + 5;
const BlobCipherBaseKeyId minBaseCipherKeyId = 100;
const BlobCipherBaseKeyId maxBaseCipherKeyId =
deterministicRandom()->randomInt(minBaseCipherKeyId, minBaseCipherKeyId + 50) + 15;
for (int dId = minDomainId; dId <= maxDomainId; dId++) {
for (int kId = minBaseCipherKeyId; kId <= maxBaseCipherKeyId; kId++) {
domainKeyMap[dId].emplace(kId, makeReference<BaseCipher>(dId, kId));
}
}
ASSERT(domainKeyMap.size() == maxDomainId);
// insert BlobCipher keys into BlobCipherKeyCache map and validate
TraceEvent("BlobCipherTest_InsertKeys").log();
BlobCipherKeyCache& cipherKeyCache = BlobCipherKeyCache::getInstance();
for (auto& domainItr : domainKeyMap) {
for (auto& baseKeyItr : domainItr.second) {
Reference<BaseCipher> baseCipher = baseKeyItr.second;
cipherKeyCache.insertCipherKey(
baseCipher->domainId, baseCipher->keyId, baseCipher->key.get(), baseCipher->len);
}
}
TraceEvent("BlobCipherTest_InsertKeysDone").log();
// validate the cipherKey lookups work as desired
for (auto& domainItr : domainKeyMap) {
for (auto& baseKeyItr : domainItr.second) {
Reference<BaseCipher> baseCipher = baseKeyItr.second;
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getCipherKey(baseCipher->domainId, baseCipher->keyId);
ASSERT(cipherKey.isValid());
// validate common cipher properties - domainId, baseCipherId, baseCipherLen, rawBaseCipher
ASSERT(cipherKey->getBaseCipherId() == baseCipher->keyId);
ASSERT(cipherKey->getDomainId() == baseCipher->domainId);
ASSERT(cipherKey->getBaseCipherLen() == baseCipher->len);
// ensure that baseCipher matches with the cached information
ASSERT(std::memcmp(cipherKey->rawBaseCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()) == 0);
// validate the encryption derivation
ASSERT(std::memcmp(cipherKey->rawCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()) != 0);
}
}
TraceEvent("BlobCipherTest_LooksupDone").log();
// Ensure attempting to insert an existing cipherKey (identical) more than once is treated as a NOP
try {
Reference<BaseCipher> baseCipher = domainKeyMap[minDomainId][minBaseCipherKeyId];
cipherKeyCache.insertCipherKey(baseCipher->domainId, baseCipher->keyId, baseCipher->key.get(), baseCipher->len);
} catch (Error& e) {
throw;
}
TraceEvent("BlobCipherTest_ReinsertIdempotentKeyDone").log();
// Ensure attempting to insert an existing cipherKey (modified) fails with an appropriate error
try {
Reference<BaseCipher> baseCipher = domainKeyMap[minDomainId][minBaseCipherKeyId];
uint8_t rawCipher[baseCipher->len];
memcpy(rawCipher, baseCipher->key.get(), baseCipher->len);
// modify few bytes in the cipherKey
for (int i = 2; i < 5; i++) {
rawCipher[i]++;
}
cipherKeyCache.insertCipherKey(baseCipher->domainId, baseCipher->keyId, &rawCipher[0], baseCipher->len);
} catch (Error& e) {
if (e.code() != error_code_encrypt_update_cipher) {
throw;
}
}
TraceEvent("BlobCipherTest_ReinsertNonIdempotentKeyDone").log();
// Validate Encryption ops
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getLatestCipherKey(minDomainId);
const int bufLen = deterministicRandom()->randomInt(786, 2127) + 512;
uint8_t orgData[bufLen];
generateRandomData(&orgData[0], bufLen);
Arena arena;
uint8_t iv[AES_256_IV_LENGTH];
generateRandomData(&iv[0], AES_256_IV_LENGTH);
// validate basic encrypt followed by decrypt operation
EncryptBlobCipherAes265Ctr encryptor(cipherKey, iv, AES_256_IV_LENGTH);
BlobCipherEncryptHeader header;
Reference<EncryptBuf> encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
ASSERT(encrypted->getLogicalSize() == bufLen);
ASSERT(memcmp(&orgData[0], encrypted->begin(), bufLen) != 0);
ASSERT(header.flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
ASSERT(header.flags.encryptMode == BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR);
TraceEvent("BlobCipherTest_EncryptDone")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderEncryptMode", header.flags.encryptMode)
.detail("DomainId", header.encryptDomainId)
.detail("BaseCipherId", header.baseCipherId)
.detail("HeaderChecksum", header.ciphertextChecksum);
Reference<BlobCipherKey> encryptKey = cipherKeyCache.getCipherKey(header.encryptDomainId, header.baseCipherId);
ASSERT(encryptKey->isEqual(cipherKey));
DecryptBlobCipherAes256Ctr decryptor(encryptKey, &header.iv[0]);
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
ASSERT(decrypted->getLogicalSize() == bufLen);
ASSERT(memcmp(decrypted->begin(), &orgData[0], bufLen) == 0);
TraceEvent("BlobCipherTest_DecryptDone").log();
// induce encryption header corruption - headerVersion corrupted
header.flags.headerVersion += 1;
try {
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
}
header.flags.headerVersion -= 1;
}
// induce encryption header corruption - encryptionMode corrupted
header.flags.encryptMode += 1;
try {
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
}
header.flags.encryptMode -= 1;
}
// induce encryption header corruption - checksum mismatch
header.ciphertextChecksum += 1;
try {
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_checksum_mismatch) {
throw;
}
header.ciphertextChecksum -= 1;
}
// Validate dropping an encryptDomainId's cached keys
const BlobCipherDomainId candidate = deterministicRandom()->randomInt(minDomainId, maxDomainId);
cipherKeyCache.resetEncyrptDomainId(candidate);
std::vector<Reference<BlobCipherKey>> cachedKeys = cipherKeyCache.getAllCiphers(candidate);
ASSERT(cachedKeys.empty());
// Validate dropping all cached cipherKeys
cipherKeyCache.cleanup();
for (int dId = minDomainId; dId < maxDomainId; dId++) {
std::vector<Reference<BlobCipherKey>> cachedKeys = cipherKeyCache.getAllCiphers(dId);
ASSERT(cachedKeys.empty());
}
TraceEvent("BlobCipherTest_Done").log();
return Void();
}
BlobCipherChecksum computeEncryptChecksum(const uint8_t* payload,
const int payloadLen,
const BlobCipherRandomSalt& salt,
Arena& arena) {
// The FIPS compliance recommendation is to leverage a cryptographic digest mechanism to generate the checksum.
// Leverage HMAC_SHA256 using header.randomSalt as the initialization 'key' for the HMAC digest.
HmacSha256DigestGen hmacGenerator((const uint8_t*)&salt, sizeof(salt));
StringRef digest = hmacGenerator.digest(payload, payloadLen, arena);
ASSERT(digest.size() >= sizeof(BlobCipherChecksum));
BlobCipherChecksum checksum;
memcpy((uint8_t*)&checksum, digest.begin(), sizeof(BlobCipherChecksum));
return checksum;
}
#endif // ENCRYPTION_ENABLED

flow/BlobCipher.h Normal file
View File

@ -0,0 +1,321 @@
/*
* BlobCipher.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#if (!defined(TLS_DISABLED) && !defined(_WIN32))
#define ENCRYPTION_ENABLED 1
#else
#define ENCRYPTION_ENABLED 0
#endif
#if ENCRYPTION_ENABLED
#include "flow/Arena.h"
#include "flow/FastRef.h"
#include "flow/flow.h"
#include "flow/xxhash.h"
#include <openssl/aes.h>
#include <openssl/engine.h>
#include <openssl/evp.h>
#include <openssl/hmac.h>
#include <openssl/sha.h>
#define AES_256_KEY_LENGTH 32
#define AES_256_IV_LENGTH 16
#define INVALID_DOMAIN_ID 0
#define INVALID_CIPHER_KEY_ID 0
using BlobCipherDomainId = uint64_t;
using BlobCipherRandomSalt = uint64_t;
using BlobCipherBaseKeyId = uint64_t;
using BlobCipherChecksum = uint64_t;
typedef enum { BLOB_CIPHER_ENCRYPT_MODE_NONE = 0, BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR = 1 } BlockCipherEncryptMode;
// Encryption operations buffer management
// The approach limits the number of copies needed during encryption or decryption operations.
// For encryption, EncryptBuf is allocated using a client-supplied Arena and provided to the AES library to capture
// the ciphertext. Similarly, on decryption, EncryptBuf is allocated using a client-supplied Arena and provided
// to the AES library to capture the deciphered text, which is passed back to the clients. Given the object passed
// around is reference-counted, it gets freed once the reference count goes to 0.
class EncryptBuf : public ReferenceCounted<EncryptBuf>, NonCopyable {
public:
EncryptBuf(int size, Arena& arena) : allocSize(size), logicalSize(size) {
if (size > 0) {
buffer = new (arena) uint8_t[size];
} else {
buffer = nullptr;
}
}
int getLogicalSize() { return logicalSize; }
void setLogicalSize(int value) {
ASSERT(value <= allocSize);
logicalSize = value;
}
uint8_t* begin() { return buffer; }
private:
int allocSize;
int logicalSize;
uint8_t* buffer;
};
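// Illustrative use of EncryptBuf on its own (sizes and names are placeholders; encrypt()/decrypt() below
// allocate their output buffers exactly this way):
//
//   Arena arena;
//   Reference<EncryptBuf> buf = makeReference<EncryptBuf>(payloadLen + AES_BLOCK_SIZE, arena);
//   buf->setLogicalSize(payloadLen); // AES-256-CTR output length equals the input length
//   uint8_t* bytes = buf->begin(); // valid for as long as 'arena' (and the reference) stay alive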
// BlobCipher Encryption header format
// This header is persisted along with the encrypted buffer; it contains the information necessary
// to assist in decrypting the buffers to serve read requests.
//
// The total space overhead is 56 bytes.
#pragma pack(push, 1) // exact fit - no padding
typedef struct BlobCipherEncryptHeader {
union {
struct {
uint8_t size; // reading first byte is sufficient to determine header
// length. ALWAYS THE FIRST HEADER ELEMENT.
uint8_t headerVersion{};
uint8_t encryptMode{};
uint8_t _reserved[5]{};
} flags;
uint64_t _padding{};
};
// Encyrption domain boundary identifier.
BlobCipherDomainId encryptDomainId{};
// BaseCipher encryption key identifier
BlobCipherBaseKeyId baseCipherId{};
// Random salt
BlobCipherRandomSalt salt{};
// Checksum of the encrypted buffer. It protects against 'tampering' of the ciphertext as well as 'bit rots/flips'.
BlobCipherChecksum ciphertextChecksum{};
// Initialization vector used to encrypt the payload.
uint8_t iv[AES_256_IV_LENGTH];
BlobCipherEncryptHeader();
} BlobCipherEncryptHeader;
#pragma pack(pop)
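// One way to sanity-check the stated 56-byte overhead, given the packed (no padding) layout above:
// flags union (8) + encryptDomainId (8) + baseCipherId (8) + salt (8) + ciphertextChecksum (8) + iv (16) = 56.
static_assert(sizeof(BlobCipherEncryptHeader) == 56, "BlobCipherEncryptHeader size changed");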
// This interface is the in-memory representation of a CipherKey used for encryption/decryption.
// It caches the base encryption key properties as well as the 'derived encryption' key obtained by applying the
// HMAC-SHA-256 derivation technique.
class BlobCipherKey : public ReferenceCounted<BlobCipherKey>, NonCopyable {
public:
BlobCipherKey(const BlobCipherDomainId& domainId,
const BlobCipherBaseKeyId& baseCiphId,
const uint8_t* baseCiph,
int baseCiphLen);
uint8_t* data() const { return cipher.get(); }
uint64_t getCreationTime() const { return creationTime; }
BlobCipherDomainId getDomainId() const { return encryptDomainId; }
BlobCipherRandomSalt getSalt() const { return randomSalt; }
BlobCipherBaseKeyId getBaseCipherId() const { return baseCipherId; }
int getBaseCipherLen() const { return baseCipherLen; }
uint8_t* rawCipher() const { return cipher.get(); }
uint8_t* rawBaseCipher() const { return baseCipher.get(); }
bool isEqual(const Reference<BlobCipherKey> toCompare) {
return encryptDomainId == toCompare->getDomainId() && baseCipherId == toCompare->getBaseCipherId() &&
randomSalt == toCompare->getSalt() && baseCipherLen == toCompare->getBaseCipherLen() &&
memcmp(cipher.get(), toCompare->rawCipher(), AES_256_KEY_LENGTH) == 0 &&
memcmp(baseCipher.get(), toCompare->rawBaseCipher(), baseCipherLen) == 0;
}
void reset();
private:
// Encryption domain boundary identifier
BlobCipherDomainId encryptDomainId;
// Base encryption cipher key properties
std::unique_ptr<uint8_t[]> baseCipher;
int baseCipherLen;
BlobCipherBaseKeyId baseCipherId;
// Random salt used for encryption cipher key derivation
BlobCipherRandomSalt randomSalt;
// Creation timestamp for the derived encryption cipher key
uint64_t creationTime;
// Derived encryption cipher key
std::unique_ptr<uint8_t[]> cipher;
void initKey(const BlobCipherDomainId& domainId,
const uint8_t* baseCiph,
int baseCiphLen,
const BlobCipherBaseKeyId& baseCiphId,
const BlobCipherRandomSalt& salt);
void applyHmacSha256Derivation();
};
// This interface allows FDB processes participating in encryption to store and
// index recently used encryption cipher keys. FDB encryption has two dimensions:
// 1. A mapping of cipher encryption keys per "encryption domain"
// 2. Per encryption domain, the cipher keys are indexed using "baseCipherKeyId".
//
// The design supports the NIST recommendation of limiting the lifetime of an encryption
// key. For details refer to:
// https://csrc.nist.gov/publications/detail/sp/800-57-part-1/rev-3/archive/2012-07-10
//
// Below is a pictorial representation of the in-memory data structure implemented
// to index encryption keys:
// { encryptionDomain -> { baseCipherId -> cipherKey } }
//
// Supported cache lookup schemes:
// 1. Lookup cipher based on { encryptionDomainId, baseCipherKeyId } tuple.
// 2. Lookup latest cipher key for a given encryptionDomainId.
//
// The client is responsible for handling the cache-miss use case; the corrective operation
// might vary based on the calling process, for instance: EncryptKeyServer
// cache-miss shall invoke RPC to external Encryption Key Manager to fetch the
// required encryption key, however, CPs/SSs cache-miss would result in RPC to
// EncryptKeyServer to refresh the desired encryption key.
using BlobCipherKeyIdCacheMap = std::unordered_map<BlobCipherBaseKeyId, Reference<BlobCipherKey>>;
using BlobCipherKeyIdCacheMapCItr = std::unordered_map<BlobCipherBaseKeyId, Reference<BlobCipherKey>>::const_iterator;
struct BlobCipherKeyIdCache : ReferenceCounted<BlobCipherKeyIdCache> {
public:
BlobCipherKeyIdCache();
explicit BlobCipherKeyIdCache(BlobCipherDomainId dId);
// API returns the last inserted cipherKey.
// If none exists, 'encrypt_key_not_found' is thrown.
Reference<BlobCipherKey> getLatestCipherKey();
// API returns cipherKey corresponding to input 'baseCipherKeyId'.
// If none exists, 'encrypt_key_not_found' is thrown.
Reference<BlobCipherKey> getCipherByBaseCipherId(BlobCipherBaseKeyId baseCipherKeyId);
// API enables inserting base encryption cipher details to the BlobCipherKeyIdCache.
// Given cipherKeys are immutable, attempting to re-insert the same 'identical' cipherKey
// is treated as a NOP (success); however, an attempt to update a cipherKey would throw an
// 'encrypt_update_cipher' exception.
void insertBaseCipherKey(BlobCipherBaseKeyId baseCipherId, const uint8_t* baseCipher, int baseCipherLen);
// API cleans up the cache by dropping all cached cipherKeys
void cleanup();
// API returns list of all 'cached' cipherKeys
std::vector<Reference<BlobCipherKey>> getAllCipherKeys();
private:
BlobCipherDomainId domainId;
BlobCipherKeyIdCacheMap keyIdCache;
BlobCipherBaseKeyId latestBaseCipherKeyId;
};
using BlobCipherDomainCacheMap = std::unordered_map<BlobCipherDomainId, Reference<BlobCipherKeyIdCache>>;
class BlobCipherKeyCache : NonCopyable {
public:
// Enables clients to insert base encryption cipher details into the BlobCipherKeyCache.
// The cipherKeys are indexed using 'baseCipherId'. Given cipherKeys are immutable,
// attempting to re-insert the same 'identical' cipherKey is treated as a NOP (success);
// however, an attempt to update a cipherKey would throw an 'encrypt_update_cipher' exception.
void insertCipherKey(const BlobCipherDomainId& domainId,
const BlobCipherBaseKeyId& baseCipherId,
const uint8_t* baseCipher,
int baseCipherLen);
// API returns the last inserted cipherKey for a given encryption domain Id.
// If none exists, it would throw 'encrypt_key_not_found' exception.
Reference<BlobCipherKey> getLatestCipherKey(const BlobCipherDomainId& domainId);
// API returns cipherKey corresponding to {encryptionDomainId, baseCipherId} tuple.
// If none exists, it would throw 'encrypt_key_not_found' exception.
Reference<BlobCipherKey> getCipherKey(const BlobCipherDomainId& domainId, const BlobCipherBaseKeyId& baseCipherId);
// API returns a point-in-time list of all 'cached' cipherKeys for a given encryption domainId.
std::vector<Reference<BlobCipherKey>> getAllCiphers(const BlobCipherDomainId& domainId);
// API enables dropping all 'cached' cipherKeys for a given encryption domain Id.
// Useful to clean up the cache if an encryption domain gets removed/destroyed, etc.
void resetEncyrptDomainId(const BlobCipherDomainId domainId);
static BlobCipherKeyCache& getInstance() {
static BlobCipherKeyCache instance;
return instance;
}
// Ensures cached encryption key(s) (plaintext) never gets persisted as part
// of FDB process/core dump.
static void cleanup() noexcept;
private:
BlobCipherDomainCacheMap domainCacheMap;
static constexpr uint64_t CIPHER_KEY_CACHE_TTL_SEC = 10 * 60L;
BlobCipherKeyCache() {}
};
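To make the cache API above concrete, here is a minimal caller-side sketch (editor-added; not part of this diff). It assumes BlobCipherDomainId and BlobCipherBaseKeyId are the integral typedefs declared earlier in this header, that AES_256_KEY_LENGTH is defined alongside AES_256_IV_LENGTH, and that real key material comes from an external key manager rather than being filled in-process:
void blobCipherKeyCacheUsageSketch() {
	const BlobCipherDomainId domainId = 1;      // illustrative domain id
	const BlobCipherBaseKeyId baseCipherId = 7; // illustrative base cipher key id
	uint8_t baseCipher[AES_256_KEY_LENGTH];
	for (int i = 0; i < AES_256_KEY_LENGTH; i++) {
		baseCipher[i] = (uint8_t)i; // placeholder key bytes for the sketch only
	}
	BlobCipherKeyCache& cache = BlobCipherKeyCache::getInstance();
	// Re-inserting identical material is a NOP; updating it throws 'encrypt_update_cipher'.
	cache.insertCipherKey(domainId, baseCipherId, &baseCipher[0], AES_256_KEY_LENGTH);
	Reference<BlobCipherKey> latest = cache.getLatestCipherKey(domainId);        // latest key for the domain
	Reference<BlobCipherKey> exact = cache.getCipherKey(domainId, baseCipherId); // exact { domainId, baseCipherId } lookup
	(void)latest;
	(void)exact;
}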
// This interface enables data block encryption. An invocation to encrypt() will
// do two things:
// 1) generate encrypted ciphertext for the given plaintext input.
// 2) generate a BlobCipherEncryptHeader (including the 'header checksum') and persist it for decryption on reads.
class EncryptBlobCipherAes265Ctr final : NonCopyable, public ReferenceCounted<EncryptBlobCipherAes265Ctr> {
public:
static constexpr uint8_t ENCRYPT_HEADER_VERSION = 1;
EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey> key, const uint8_t* iv, const int ivLen);
~EncryptBlobCipherAes265Ctr();
Reference<EncryptBuf> encrypt(const uint8_t* plaintext,
const int plaintextLen,
BlobCipherEncryptHeader* header,
Arena&);
private:
EVP_CIPHER_CTX* ctx;
Reference<BlobCipherKey> cipherKey;
uint8_t iv[AES_256_IV_LENGTH];
};
// This interface enables data block decryption. An invocation to decrypt() generates
// 'plaintext' for a given 'ciphertext' input; the caller must supply the corresponding BlobCipherEncryptHeader.
class DecryptBlobCipherAes256Ctr final : NonCopyable, public ReferenceCounted<DecryptBlobCipherAes256Ctr> {
public:
DecryptBlobCipherAes256Ctr(Reference<BlobCipherKey> key, const uint8_t* iv);
~DecryptBlobCipherAes256Ctr();
Reference<EncryptBuf> decrypt(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
Arena&);
private:
EVP_CIPHER_CTX* ctx;
void verifyEncryptBlobHeader(const uint8_t* cipherText,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
Arena& arena);
};
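The two cipher classes are intended to be used as a pair; the following editor-added round-trip sketch shows the expected flow. The all-zero IV, the assumption that the AES-256-CTR ciphertext length equals the plaintext length, and the EncryptBuf::begin() accessor are assumptions based on the declarations above and earlier in this header, not guarantees of this diff:
void encryptDecryptRoundTripSketch(Reference<BlobCipherKey> cipherKey, const uint8_t* data, int dataLen) {
	Arena arena;
	uint8_t iv[AES_256_IV_LENGTH];
	memset(iv, 0, AES_256_IV_LENGTH); // placeholder IV; production callers must supply a unique IV
	BlobCipherEncryptHeader header;
	EncryptBlobCipherAes265Ctr encryptor(cipherKey, &iv[0], AES_256_IV_LENGTH);
	Reference<EncryptBuf> encrypted = encryptor.encrypt(data, dataLen, &header, arena);
	// The header is persisted alongside the ciphertext and handed back on reads.
	DecryptBlobCipherAes256Ctr decryptor(cipherKey, &iv[0]);
	Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), dataLen, header, arena);
	ASSERT(memcmp(decrypted->begin(), data, dataLen) == 0);
}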
class HmacSha256DigestGen final : NonCopyable {
public:
HmacSha256DigestGen(const unsigned char* key, size_t len);
~HmacSha256DigestGen();
HMAC_CTX* getCtx() const { return ctx; }
StringRef digest(unsigned char const* data, size_t len, Arena&);
private:
HMAC_CTX* ctx;
};
BlobCipherChecksum computeEncryptChecksum(const uint8_t* payload,
const int payloadLen,
const BlobCipherRandomSalt& salt,
Arena& arena);
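A short editor-added verification sketch for the checksum helper; it assumes BlobCipherChecksum is an equality-comparable value type and that the salt passed in is the one recorded in the corresponding BlobCipherEncryptHeader:
bool checksumMatchesSketch(const uint8_t* ciphertext, int len, const BlobCipherRandomSalt& salt, BlobCipherChecksum expected) {
	Arena arena;
	// Recompute the checksum over the ciphertext and compare it against the persisted value.
	return computeEncryptChecksum(ciphertext, len, salt, arena) == expected;
}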
#endif // ENCRYPTION_ENABLED

View File

@@ -8,6 +8,8 @@ set(FLOW_SRCS
ArgParseUtil.h
AsioReactor.h
BooleanParam.h
BlobCipher.h
BlobCipher.cpp
CompressedInt.actor.cpp
CompressedInt.h
Deque.cpp

View File

@@ -210,13 +210,24 @@ public:
if (s != sizeof(Object))
abort();
INSTRUMENT_ALLOCATE(typeid(Object).name());
void* p = FastAllocator < sizeof(Object) <= 64 ? 64 : nextFastAllocatedSize(sizeof(Object)) > ::allocate();
return p;
if constexpr (sizeof(Object) <= 256) {
void* p = FastAllocator < sizeof(Object) <= 64 ? 64 : nextFastAllocatedSize(sizeof(Object)) > ::allocate();
return p;
} else {
void* p = new uint8_t[nextFastAllocatedSize(sizeof(Object))];
return p;
}
}
static void operator delete(void* s) {
INSTRUMENT_RELEASE(typeid(Object).name());
FastAllocator<sizeof(Object) <= 64 ? 64 : nextFastAllocatedSize(sizeof(Object))>::release(s);
if constexpr (sizeof(Object) <= 256) {
FastAllocator<sizeof(Object) <= 64 ? 64 : nextFastAllocatedSize(sizeof(Object))>::release(s);
} else {
delete[] reinterpret_cast<uint8_t*>(s);
}
}
// Redefine placement new so you can still use it
static void* operator new(size_t, void* p) { return p; }
@@ -236,18 +247,6 @@ public:
return FastAllocator<128>::allocate();
if (size <= 256)
return FastAllocator<256>::allocate();
if (size <= 512)
return FastAllocator<512>::allocate();
if (size <= 1024)
return FastAllocator<1024>::allocate();
if (size <= 2048)
return FastAllocator<2048>::allocate();
if (size <= 4096)
return FastAllocator<4096>::allocate();
if (size <= 8192)
return FastAllocator<8192>::allocate();
if (size <= 16384)
return FastAllocator<16384>::allocate();
return new uint8_t[size];
}
@@ -264,21 +263,11 @@ inline void freeFast(int size, void* ptr) {
return FastAllocator<128>::release(ptr);
if (size <= 256)
return FastAllocator<256>::release(ptr);
if (size <= 512)
return FastAllocator<512>::release(ptr);
if (size <= 1024)
return FastAllocator<1024>::release(ptr);
if (size <= 2048)
return FastAllocator<2048>::release(ptr);
if (size <= 4096)
return FastAllocator<4096>::release(ptr);
if (size <= 8192)
return FastAllocator<8192>::release(ptr);
if (size <= 16384)
return FastAllocator<16384>::release(ptr);
delete[](uint8_t*) ptr;
}
// Allocate a block of memory aligned to 4096 bytes. Size must be a multiple of
// 4096. Guaranteed not to return null. Use freeFast4kAligned to free.
[[nodiscard]] inline void* allocateFast4kAligned(int size) {
#if !defined(USE_JEMALLOC)
// Use FastAllocator for sizes it supports to avoid internal fragmentation in some implementations of aligned_alloc
@@ -296,6 +285,7 @@ inline void freeFast(int size, void* ptr) {
return result;
}
// Free a pointer returned from allocateFast4kAligned(size)
inline void freeFast4kAligned(int size, void* ptr) {
#if !defined(USE_JEMALLOC)
// Sizes supported by FastAllocator must be released via FastAllocator
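As a caller-side illustration of the trimmed allocateFast/freeFast paths, here is an editor-added sketch (not part of this diff). After this change only requests of 256 bytes or less are served from FastAllocator buckets; larger requests fall through to global new[]/delete[], and the caller contract of passing the same size to free is unchanged:
void scratchBufferSketch(int n) {
	void* buf = allocateFast(n); // FastAllocator bucket when n <= 256, otherwise new uint8_t[n]
	memset(buf, 0, n);
	freeFast(n, buf); // must be released with the same size that was allocated
	// Page-aligned variant; size must be a multiple of 4096.
	void* page = allocateFast4kAligned(4096);
	freeFast4kAligned(4096, page);
}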

View File

@@ -33,6 +33,7 @@
#if (!defined(TLS_DISABLED) && !defined(_WIN32))
#include "flow/StreamCipher.h"
#include "flow/BlobCipher.h"
#endif
#include "flow/Trace.h"
#include "flow/Error.h"
@@ -3501,6 +3502,7 @@ void crashHandler(int sig) {
#if (!defined(TLS_DISABLED) && !defined(_WIN32))
StreamCipherKey::cleanup();
StreamCipher::cleanup();
BlobCipherKeyCache::cleanup();
#endif
fflush(stdout);

View File

@@ -284,6 +284,15 @@ ERROR( snap_log_anti_quorum_unsupported, 2507, "Unsupported when log anti quorum
ERROR( snap_with_recovery_unsupported, 2508, "Cluster recovery during snapshot operation not supported")
ERROR( snap_invalid_uid_string, 2509, "The given uid string is not a 32-length hex string")
// 3XXX - Encryption operations errors
ERROR( encrypt_ops_error, 3000, "Encryption operation error")
ERROR( encrypt_header_metadata_mismatch, 3001, "Encryption header metadata mismatch")
ERROR( encrypt_key_not_found, 3002, "Expected encryption key is missing")
ERROR( encrypt_key_ttl_expired, 3003, "Expected encryption key TTL has expired")
ERROR( encrypt_header_checksum_mismatch, 3004, "Encryption header checksum mismatch")
ERROR( encrypt_update_cipher, 3005, "Attempt to update encryption cipher key")
ERROR( encrypt_invalid_id, 3006, "Invalid encryption domainId or encryption cipher key id")
// 4xxx Internal errors (those that should be generated only by bugs) are decimal 4xxx
ERROR( unknown_error, 4000, "An unknown error occurred" ) // C++ exception not of type Error
ERROR( internal_error, 4100, "An internal error occurred" )
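A minimal editor-added handling sketch for the new 3XXX errors, assuming the usual ERROR() macro expansion into a throwable factory function plus a matching error_code_* constant:
void handleMissingCipherKeySketch() {
	try {
		throw encrypt_key_not_found();
	} catch (Error& e) {
		// Callers typically match on the code and fall back to fetching the key from the key manager.
		ASSERT(e.code() == error_code_encrypt_key_not_found);
	}
}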

View File

@@ -19,6 +19,7 @@
*/
#include "flow/flat_buffers.h"
#include "flow/FileIdentifier.h"
#include "flow/UnitTest.h"
#include "flow/Arena.h"
#include "flow/serialize.h"
@@ -26,6 +27,7 @@
#include <algorithm>
#include <iomanip>
#include <unordered_set>
#include <variant>
namespace detail {
@@ -361,6 +363,7 @@ struct string_serialized_traits<Void> : std::true_type {
namespace unit_tests {
struct Y1 {
constexpr static FileIdentifier file_identifier = 338229;
int a;
template <class Archiver>
@@ -369,6 +372,14 @@ struct Y1 {
}
};
struct Y1Hasher {
std::size_t operator()(const Y1& y) const noexcept { return std::hash<int>()(y.a); }
};
struct Y1Equal {
bool operator()(const Y1& l, const Y1& r) const { return l.a == r.a; }
};
struct Y2 {
int a;
std::variant<int> b;
@@ -563,4 +574,43 @@ TEST_CASE("/flow/FlatBuffers/EmptyPreSerVectorRefs") {
return Void();
}
TEST_CASE("/flow/FlatBuffers/EmptyUnorderedSet") {
int kSize = deterministicRandom()->randomInt(0, 100);
Standalone<StringRef> msg =
ObjectWriter::toValue(std::vector<std::unordered_set<Y1, Y1Hasher, Y1Equal>>(kSize), Unversioned());
ObjectReader rd(msg.begin(), Unversioned());
std::vector<std::unordered_set<Y1, Y1Hasher, Y1Equal>> xs;
rd.deserialize(xs);
ASSERT(xs.size() == kSize);
for (const auto& x : xs) {
ASSERT(x.size() == 0);
}
return Void();
}
TEST_CASE("/flow/FlatBuffers/NonEmptyUnorderedSet") {
int kSize = deterministicRandom()->randomInt(0, 100);
std::vector<std::unordered_set<Y1, Y1Hasher, Y1Equal>> src;
std::unordered_set<Y1, Y1Hasher, Y1Equal> s;
for (int i = 0; i < kSize; i++) {
Y1 y;
y.a = i;
s.insert(y);
}
src.push_back(s);
Standalone<StringRef> msg = ObjectWriter::toValue(src, Unversioned());
ObjectReader rd(msg.begin(), Unversioned());
std::vector<std::unordered_set<Y1, Y1Hasher, Y1Equal>> xs;
rd.deserialize(xs);
ASSERT(xs.size() == 1);
ASSERT(xs[0].size() == kSize);
for (int i = 0; i < kSize; i++) {
Y1 y;
y.a = i;
ASSERT(xs[0].find(y) != xs[0].end());
}
return Void();
}
} // namespace unit_tests

View File

@@ -35,6 +35,7 @@
#include <cstring>
#include <array>
#include <unordered_map>
#include <unordered_set>
#include <deque>
#include "flow/FileIdentifier.h"
#include "flow/ObjectSerializerTraits.h"
@@ -250,6 +251,31 @@ struct vector_like_traits<std::set<Key, Compare, Allocator>> : std::true_type {
return v.begin();
}
};
template <class Key, class Hash, class KeyEqual, class Allocator>
struct vector_like_traits<std::unordered_set<Key, Hash, KeyEqual, Allocator>> : std::true_type {
using Vec = std::unordered_set<Key, Hash, KeyEqual, Allocator>;
using value_type = Key;
using iterator = typename Vec::const_iterator;
using insert_iterator = std::insert_iterator<Vec>;
template <class Context>
static size_t num_entries(const Vec& v, Context&) {
return v.size();
}
template <class Context>
static void reserve(Vec& v, size_t size, Context&) {
v.reserve(size);
}
template <class Context>
static insert_iterator insert(Vec& v, Context&) {
return std::inserter(v, v.end());
}
template <class Context>
static iterator begin(const Vec& v, Context&) {
return v.begin();
}
};
template <>
struct dynamic_size_traits<std::string> : std::true_type {

View File

@@ -20,6 +20,7 @@
#ifndef FLOW_SERIALIZE_H
#define FLOW_SERIALIZE_H
#include <unordered_set>
#pragma once
#include <stdint.h>
@@ -172,6 +173,13 @@ template <class T, class Allocator>
struct CompositionDepthFor<std::vector<T, Allocator>> : std::integral_constant<int, CompositionDepthFor<T>::value + 1> {
};
template <class Key, class Hash, class KeyEqual, class Allocator>
struct FileIdentifierFor<std::unordered_set<Key, Hash, KeyEqual, Allocator>> : ComposedIdentifierExternal<Key, 6> {};
template <class Key, class Hash, class KeyEqual, class Allocator>
struct CompositionDepthFor<std::unordered_set<Key, Hash, KeyEqual, Allocator>>
: std::integral_constant<int, CompositionDepthFor<Key>::value + 1> {};
template <class Archive, class T>
inline void save(Archive& ar, const std::vector<T>& value) {
ar << (int)value.size();
@@ -762,9 +770,6 @@ private:
public:
static PacketBuffer* create(size_t size = 0) {
size = std::max(size, PACKET_BUFFER_MIN_SIZE - PACKET_BUFFER_OVERHEAD);
if (size == PACKET_BUFFER_MIN_SIZE - PACKET_BUFFER_OVERHEAD) {
return new (FastAllocator<PACKET_BUFFER_MIN_SIZE>::allocate()) PacketBuffer{ size };
}
uint8_t* mem = new uint8_t[size + PACKET_BUFFER_OVERHEAD];
return new (mem) PacketBuffer{ size };
}
@@ -772,11 +777,7 @@ public:
void addref() { ++reference_count; }
void delref() {
if (!--reference_count) {
if (size_ == PACKET_BUFFER_MIN_SIZE - PACKET_BUFFER_OVERHEAD) {
FastAllocator<PACKET_BUFFER_MIN_SIZE>::release(this);
} else {
delete[] this;
}
delete[] reinterpret_cast<uint8_t*>(this);
}
}
int bytes_unwritten() const { return size_ - bytes_written; }
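An editor-added sketch of the simplified PacketBuffer lifetime, assuming (as the surrounding code suggests) that a freshly created buffer starts with a reference count of one:
void packetBufferLifetimeSketch() {
	PacketBuffer* pb = PacketBuffer::create(); // always heap-allocated now, even at the minimum size
	pb->addref();                              // take a second reference
	pb->delref();                              // drop back to one reference
	pb->delref();                              // last reference frees the underlying uint8_t[] allocation
}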

View File

@@ -0,0 +1,9 @@
[[test]]
testTitle = 'BlobGranuleServerCommonUnit'
useDB = false
startDelay = 0
[[test.workload]]
testName = 'UnitTests'
maxTestCases = 0
testsMatching = /blobgranule/server/common/

View File

@@ -0,0 +1,10 @@
[[test]]
testTitle = 'BlobGranuleFileUnit'
useDB = false
startDelay = 0
[[test.workload]]
testName = 'UnitTests'
maxTestCases = 0
testsMatching = /blobgranule/files/

View File

@@ -1,7 +0,0 @@
testTitle=UnitTests
startDelay=0
useDB=false
testName=UnitTests
maxTestCases=0
testsMatching=/blobgranule/

View File

@@ -0,0 +1,9 @@
[[test]]
testTitle = 'BlobManagerUnit'
useDB = false
startDelay = 0
[[test.workload]]
testName = 'UnitTests'
maxTestCases = 0
testsMatching = /blobmanager/

View File

@@ -1,7 +0,0 @@
testTitle=UnitTests
startDelay=0
useDB=false
testName=UnitTests
maxTestCases=0
testsMatching=/blobmanager/

View File

@@ -50,8 +50,9 @@ if(WITH_PYTHON)
add_fdb_test(TEST_FILES s3VersionHeaders.txt IGNORE)
add_fdb_test(TEST_FILES BandwidthThrottle.txt IGNORE)
add_fdb_test(TEST_FILES BigInsert.txt IGNORE)
add_fdb_test(TEST_FILES BlobGranuleFileUnit.txt)
add_fdb_test(TEST_FILES BlobManagerUnit.txt)
add_fdb_test(TEST_FILES BGServerCommonUnit.toml)
add_fdb_test(TEST_FILES BlobGranuleFileUnit.toml)
add_fdb_test(TEST_FILES BlobManagerUnit.toml)
add_fdb_test(TEST_FILES ConsistencyCheck.txt IGNORE)
add_fdb_test(TEST_FILES DDMetricsExclude.txt IGNORE)
add_fdb_test(TEST_FILES DataDistributionMetrics.txt IGNORE)

View File

@@ -1,5 +1,7 @@
[configuration]
blobGranulesEnabled = true
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4]
[[test]]
testTitle = 'BlobGranuleVerifyAtomicOps'

View File

@@ -1,5 +1,7 @@
[configuration]
blobGranulesEnabled = true
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4]
[[test]]
testTitle = 'BlobGranuleVerifyCycle'

View File

@@ -1,6 +1,8 @@
[configuration]
blobGranulesEnabled = true
storageEngineExcludeTypes = [3] # FIXME: exclude redwood because WriteDuringRead can write massive KV pairs and we don't chunk change feed data on disk yet
# FIXME: exclude redwood because WriteDuringRead can write massive KV pairs and we don't chunk change feed data on disk yet
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [3, 4]
[[test]]
testTitle = 'BlobGranuleVerifySmall'

View File

@@ -1,6 +1,8 @@
[configuration]
blobGranulesEnabled = true
storageEngineExcludeTypes = [3] # FIXME: exclude redwood because WriteDuringRead can write massive KV pairs and we don't chunk change feed data on disk yet
# FIXME: exclude redwood because WriteDuringRead can write massive KV pairs and we don't chunk change feed data on disk yet
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [3, 4]
[[test]]
testTitle = 'BlobGranuleVerifySmallClean'

View File

@@ -1,5 +1,7 @@
[configuration]
blobGranulesEnabled = true
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4]
[[test]]
testTitle = 'BlobGranuleCorrectness'

View File

@@ -1,5 +1,7 @@
[configuration]
blobGranulesEnabled = true
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4]
[[test]]
testTitle = 'BlobGranuleCorrectness'

View File

@@ -1,5 +1,7 @@
[configuration]
blobGranulesEnabled = true
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4]
[[test]]
testTitle = 'BlobGranuleVerifyBalance'

View File

@@ -1,5 +1,7 @@
[configuration]
blobGranulesEnabled = true
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4]
[[test]]
testTitle = 'BlobGranuleVerifyBalanceClean'

View File

@@ -1,5 +1,7 @@
[configuration]
blobGranulesEnabled = true
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4]
[[test]]
testTitle = 'BlobGranuleVerifyLarge'

View File

@@ -1,5 +1,7 @@
[configuration]
blobGranulesEnabled = true
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4]
[[test]]
testTitle = 'BlobGranuleVerifyLargeClean'