Merge branch 'main' into readaware

This commit is contained in:
Xiaoxi Wang 2022-04-12 16:47:15 -07:00
commit ed97a35dc0
77 changed files with 4455 additions and 1797 deletions

View File

@ -466,6 +466,27 @@ extern "C" DLLEXPORT FDBFuture* fdb_database_get_server_protocol(FDBDatabase* db
}).extractPtr());
}
extern "C" DLLEXPORT FDBFuture* fdb_database_purge_blob_granules(FDBDatabase* db,
                                                                 uint8_t const* begin_key_name,
                                                                 int begin_key_name_length,
                                                                 uint8_t const* end_key_name,
                                                                 int end_key_name_length,
                                                                 int64_t purge_version,
                                                                 fdb_bool_t force) {
	// Purge blob granule data in [begin, end) up to purge_version. The returned
	// future resolves to a purge key that can be handed to
	// fdb_database_wait_purge_granules_complete to wait for completion.
	KeyRangeRef purgeRange(StringRef(begin_key_name, begin_key_name_length),
	                       StringRef(end_key_name, end_key_name_length));
	return (FDBFuture*)(DB(db)->purgeBlobGranules(purgeRange, purge_version, force).extractPtr());
}
extern "C" DLLEXPORT FDBFuture* fdb_database_wait_purge_granules_complete(FDBDatabase* db,
                                                                          uint8_t const* purge_key_name,
                                                                          int purge_key_name_length) {
	// Wait for the purge identified by this purge key (as returned by
	// fdb_database_purge_blob_granules) to finish.
	StringRef purgeKey(purge_key_name, purge_key_name_length);
	return (FDBFuture*)(DB(db)->waitPurgeGranulesComplete(purgeKey).extractPtr());
}
// Creates a new transaction on the given tenant. On success *out_transaction
// receives the new transaction handle; any thrown error is converted into the
// returned fdb_error_t by CATCH_AND_RETURN.
extern "C" DLLEXPORT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant, FDBTransaction** out_transaction) {
	CATCH_AND_RETURN(*out_transaction = (FDBTransaction*)TENANT(tenant)->createTransaction().extractPtr(););
}
@ -619,23 +640,23 @@ FDBFuture* fdb_transaction_get_range_impl(FDBTransaction* tr,
.extractPtr());
}
FDBFuture* fdb_transaction_get_mapped_range_impl(FDBTransaction* tr,
uint8_t const* begin_key_name,
int begin_key_name_length,
fdb_bool_t begin_or_equal,
int begin_offset,
uint8_t const* end_key_name,
int end_key_name_length,
fdb_bool_t end_or_equal,
int end_offset,
uint8_t const* mapper_name,
int mapper_name_length,
int limit,
int target_bytes,
FDBStreamingMode mode,
int iteration,
fdb_bool_t snapshot,
fdb_bool_t reverse) {
extern "C" DLLEXPORT FDBFuture* fdb_transaction_get_mapped_range(FDBTransaction* tr,
uint8_t const* begin_key_name,
int begin_key_name_length,
fdb_bool_t begin_or_equal,
int begin_offset,
uint8_t const* end_key_name,
int end_key_name_length,
fdb_bool_t end_or_equal,
int end_offset,
uint8_t const* mapper_name,
int mapper_name_length,
int limit,
int target_bytes,
FDBStreamingMode mode,
int iteration,
fdb_bool_t snapshot,
fdb_bool_t reverse) {
FDBFuture* r = validate_and_update_parameters(limit, target_bytes, mode, iteration, reverse);
if (r != nullptr)
return r;
@ -651,25 +672,24 @@ FDBFuture* fdb_transaction_get_mapped_range_impl(FDBTransaction* tr,
.extractPtr());
}
// TODO: Support FDB_API_ADDED in generate_asm.py and then this can be replaced with fdb_api_ptr_unimpl.
FDBFuture* fdb_transaction_get_mapped_range_v699(FDBTransaction* tr,
uint8_t const* begin_key_name,
int begin_key_name_length,
fdb_bool_t begin_or_equal,
int begin_offset,
uint8_t const* end_key_name,
int end_key_name_length,
fdb_bool_t end_or_equal,
int end_offset,
uint8_t const* mapper_name,
int mapper_name_length,
int limit,
int target_bytes,
FDBStreamingMode mode,
int iteration,
fdb_bool_t snapshot,
fdb_bool_t reverse) {
fprintf(stderr, "UNIMPLEMENTED FDB API FUNCTION\n");
// Stub for the API removed after 7.0 (registered via FDB_API_REMOVED below).
// Intentionally aborts with an upgrade message; callers must migrate to
// fdb_transaction_get_mapped_range. All parameters are deliberately unused.
FDBFuture* fdb_transaction_get_range_and_flat_map_v709(FDBTransaction* tr,
                                                       uint8_t const* begin_key_name,
                                                       int begin_key_name_length,
                                                       fdb_bool_t begin_or_equal,
                                                       int begin_offset,
                                                       uint8_t const* end_key_name,
                                                       int end_key_name_length,
                                                       fdb_bool_t end_or_equal,
                                                       int end_offset,
                                                       uint8_t const* mapper_name,
                                                       int mapper_name_length,
                                                       int limit,
                                                       int target_bytes,
                                                       FDBStreamingMode mode,
                                                       int iteration,
                                                       fdb_bool_t snapshot,
                                                       fdb_bool_t reverse) {
	fprintf(stderr, "GetRangeAndFlatMap is removed from 7.0. Please upgrade to 7.1 and use GetMappedRange\n");
	abort();
}
@ -900,13 +920,13 @@ extern "C" DLLEXPORT fdb_error_t fdb_select_api_version_impl(int runtime_version
// Versioned API changes -- descending order by version (new changes at top)
// FDB_API_CHANGED( function, ver ) means there is a new implementation as of ver, and a function function_(ver-1)
// is the old implementation FDB_API_REMOVED( function, ver ) means the function was removed as of ver, and
// is the old implementation. FDB_API_REMOVED( function, ver ) means the function was removed as of ver, and
// function_(ver-1) is the old implementation
//
// WARNING: use caution when implementing removed functions by calling public API functions. This can lead to
// undesired behavior when using the multi-version API. Instead, it is better to have both the removed and public
// functions call an internal implementation function. See fdb_create_database_impl for an example.
FDB_API_CHANGED(fdb_transaction_get_mapped_range, 700);
FDB_API_REMOVED(fdb_transaction_get_range_and_flat_map, 710);
FDB_API_REMOVED(fdb_future_get_version, 620);
FDB_API_REMOVED(fdb_create_cluster, 610);
FDB_API_REMOVED(fdb_cluster_create_database, 610);

View File

@ -299,6 +299,18 @@ DLLEXPORT WARN_UNUSED_RESULT double fdb_database_get_main_thread_busyness(FDBDat
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_get_server_protocol(FDBDatabase* db, uint64_t expected_version);
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_purge_blob_granules(FDBDatabase* db,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length,
int64_t purge_version,
fdb_bool_t force);
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_database_wait_purge_granules_complete(FDBDatabase* db,
uint8_t const* purge_key_name,
int purge_key_name_length);
DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_tenant_create_transaction(FDBTenant* tenant,
FDBTransaction** out_transaction);

View File

@ -130,6 +130,25 @@ EmptyFuture Database::create_snapshot(FDBDatabase* db,
return EmptyFuture(fdb_database_create_snapshot(db, uid, uid_length, snap_command, snap_command_length));
}
KeyFuture Database::purge_blob_granules(FDBDatabase* db,
                                        std::string_view begin_key,
                                        std::string_view end_key,
                                        int64_t purge_version,
                                        fdb_bool_t force) {
	// Thin wrapper translating the string_views into the pointer/length pairs
	// expected by the C API.
	const uint8_t* beginPtr = (const uint8_t*)begin_key.data();
	const uint8_t* endPtr = (const uint8_t*)end_key.data();
	return KeyFuture(fdb_database_purge_blob_granules(
	    db, beginPtr, begin_key.size(), endPtr, end_key.size(), purge_version, force));
}
EmptyFuture Database::wait_purge_granules_complete(FDBDatabase* db, std::string_view purge_key) {
	// Forward the purge key (as returned by purge_blob_granules) to the C API.
	const uint8_t* keyPtr = (const uint8_t*)purge_key.data();
	return EmptyFuture(fdb_database_wait_purge_granules_complete(db, keyPtr, purge_key.size()));
}
// Tenant
Tenant::Tenant(FDBDatabase* db, const uint8_t* name, int name_length) {
if (fdb_error_t err = fdb_database_open_tenant(db, name, name_length, &tenant)) {

View File

@ -97,6 +97,7 @@ public:
private:
friend class Transaction;
friend class Database;
KeyFuture(FDBFuture* f) : Future(f) {}
};
@ -201,6 +202,14 @@ public:
int uid_length,
const uint8_t* snap_command,
int snap_command_length);
static KeyFuture purge_blob_granules(FDBDatabase* db,
std::string_view begin_key,
std::string_view end_key,
int64_t purge_version,
fdb_bool_t force);
static EmptyFuture wait_purge_granules_complete(FDBDatabase* db, std::string_view purge_key);
};
class Tenant final {

View File

@ -2592,7 +2592,6 @@ TEST_CASE("Blob Granule Functions") {
}
// write some data
insert_data(db, create_data({ { "bg1", "a" }, { "bg2", "b" }, { "bg3", "c" } }));
// because wiring up files is non-trivial, just test the calls complete with the expected no_materialize error
@ -2709,6 +2708,42 @@ TEST_CASE("Blob Granule Functions") {
tr.reset();
break;
}
// do a purge + wait at that version to purge everything before originalReadVersion
fdb::KeyFuture purgeKeyFuture =
fdb::Database::purge_blob_granules(db, key("bg"), key("bh"), originalReadVersion, false);
fdb_check(wait_future(purgeKeyFuture));
const uint8_t* purgeKeyData;
int purgeKeyLen;
fdb_check(purgeKeyFuture.get(&purgeKeyData, &purgeKeyLen));
std::string purgeKey((const char*)purgeKeyData, purgeKeyLen);
fdb::EmptyFuture waitPurgeFuture = fdb::Database::wait_purge_granules_complete(db, purgeKey);
fdb_check(wait_future(waitPurgeFuture));
// re-read again at the purge version to make sure it is still valid
while (1) {
fdb_check(tr.set_option(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE, nullptr, 0));
fdb::KeyValueArrayResult r =
tr.read_blob_granules(key("bg"), key("bh"), 0, originalReadVersion, granuleContext);
fdb_error_t err = r.get(&out_kv, &out_count, &out_more);
if (err && err != 2037 /* blob_granule_not_materialized */) {
fdb::EmptyFuture f2 = tr.on_error(err);
fdb_check(wait_future(f2));
continue;
}
CHECK(err == 2037 /* blob_granule_not_materialized */);
tr.reset();
break;
}
}
int main(int argc, char** argv) {

View File

@ -260,6 +260,45 @@ def suspend(logger):
assert get_value_from_status_json(False, 'client', 'database_status', 'available')
def extract_version_epoch(cli_output):
    # The epoch is the last whitespace-separated token on the last line of the
    # fdbcli output.
    last_line = cli_output.split("\n")[-1]
    return int(last_line.split(" ")[-1])
@enable_logging()
def targetversion(logger):
    # The version epoch should start out unset.
    assert run_fdbcli_command('targetversion getepoch') == "Version epoch is unset"
    initial_version = int(run_fdbcli_command('getversion'))
    logger.debug("read version: {}".format(initial_version))
    assert initial_version >= 0
    # Install the default version epoch.
    logger.debug("setting version epoch to default")
    run_fdbcli_command('targetversion add 0')
    # Read back the epoch that was just installed.
    epoch_after_add = extract_version_epoch(run_fdbcli_command('targetversion getepoch'))
    logger.debug("version epoch: {}".format(epoch_after_add))
    # Read versions must not go backwards after enabling the epoch.
    version_after_add = int(run_fdbcli_command('getversion'))
    logger.debug("read version: {}".format(version_after_add))
    assert version_after_add >= initial_version
    # Bump the epoch forward by one million versions.
    epoch_after_set = extract_version_epoch(
        run_fdbcli_command("targetversion setepoch {}".format(epoch_after_add + 1000000)))
    logger.debug("version epoch: {}".format(epoch_after_set))
    assert epoch_after_set == epoch_after_add + 1000000
    # Shift it back by the same amount; it should land on the original epoch.
    epoch_after_sub = extract_version_epoch(run_fdbcli_command("targetversion add {}".format(-1000000)))
    logger.debug("version epoch: {}".format(epoch_after_sub))
    assert epoch_after_sub == epoch_after_set - 1000000 == epoch_after_add
    # Versions should still be monotonically increasing.
    final_version = int(run_fdbcli_command('getversion'))
    logger.debug("read version: {}".format(final_version))
    assert final_version >= version_after_add
    # Clear the epoch and verify it reads back as unset.
    run_fdbcli_command("targetversion clearepoch")
    assert run_fdbcli_command('targetversion getepoch') == "Version epoch is unset"
def get_value_from_status_json(retry, *args):
while True:
result = json.loads(run_fdbcli_command('status', 'json'))
@ -685,6 +724,9 @@ if __name__ == '__main__':
throttle()
triggerddteaminfolog()
tenants()
# TODO: similar to advanceversion, this seems to cause some issues, so disable for now
# This must go last, otherwise the version advancement can mess with the other tests
# targetversion()
else:
assert args.process_number > 1, "Process number should be positive"
coordinators()

View File

@ -52,7 +52,6 @@ mark_as_advanced(
if (GPERFTOOLS_FOUND)
add_library(gperftools UNKNOWN IMPORTED)
target_compile_definitions(gperftools PUBLIC USE_GPERFTOOLS)
set_target_properties(gperftools PROPERTIES
IMPORTED_LOCATION ${GPERFTOOLS_TCMALLOC_AND_PROFILER}
INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")

View File

@ -3,6 +3,40 @@
.. code-block:: javascript
"cluster":{
"storage_wiggler": {
"wiggle_server_ids":["0ccb4e0feddb55"],
"wiggle_server_addresses": ["127.0.0.1"],
"primary": { // primary DC storage wiggler stats
// One StorageServer wiggle round is considered 'complete', when all StorageServers with creationTime < T are wiggled
"last_round_start_datetime": "2022-04-02 00:05:05.123 +0000",
"last_round_start_timestamp": 1648857905.123, // when did the latest round start
"last_round_finish_datetime": "1970-01-01 00:00:00.000 +0000",
"last_round_finish_timestamp": 0, // when did the latest finished round finish
"smoothed_round_seconds": 1, // moving average duration of a wiggle round
"finished_round": 1,
// 1 wiggle step as 1 storage server is wiggled in the current round
"last_wiggle_start_datetime": "2022-04-02 00:05:05.123 +0000",
"last_wiggle_start_timestamp": 1648857905.123, // when did the latest wiggle step start
"last_wiggle_finish_datetime": "1970-01-01 00:00:00.000 +0000",
"last_wiggle_finish_timestamp": 0,
"smoothed_wiggle_seconds": 1,
"finished_wiggle": 1
},
"remote": { // remote DC storage wiggler stats
"last_round_start_datetime": "2022-04-02 00:05:05.123 +0000",
"last_round_start_timestamp": 1648857905.123,
"last_round_finish_datetime": "1970-01-01 00:00:00.000 +0000",
"last_round_finish_timestamp": 0,
"smoothed_round_seconds": 1,
"finished_round": 1,
"last_wiggle_start_datetime": "2022-04-02 00:05:05.123 +0000",
"last_wiggle_start_timestamp": 1648857905.123,
"last_wiggle_finish_datetime": "1970-01-01 00:00:00.000 +0000",
"last_wiggle_finish_timestamp": 0,
"smoothed_wiggle_seconds": 1,
"finished_wiggle": 1
}
},
"layers":{
"_valid":true,
"_error":"some error description"

View File

@ -14,7 +14,7 @@ Summary
============
Perpetual storage wiggle is a feature that forces the data distributor to constantly build new storage teams when the cluster is healthy. On a high-level note, the process is like this:
Order storage servers by process id. For each storage server n:
Order storage servers by their creation time, from oldest to newest. For each storage server n:
a. Exclude storage server n.
@ -22,7 +22,7 @@ b. Wait until all data has been moved off the storage server.
c. Include storage n
Goto a to wiggle the next storage process with different process id.
Goto step a to wiggle the next storage server.
With a perpetual wiggle, storage migrations will be much less impactful. The wiggler will detect the healthy status based on healthy teams, available disk space and the number of unhealthy relocations. It will pause the wiggle until the cluster is healthy again.
@ -47,7 +47,8 @@ Disable perpetual storage wiggle locality matching filter, which wiggles all the
Monitor
=======
The ``status`` command in the FDB :ref:`command line interface <command-line-interface>` will show the current perpetual_storage_wiggle value.
* The ``status`` command will report the IP address of the Storage Server under wiggling.
* The ``status json`` command in the FDB :ref:`command line interface <command-line-interface>` will show the current `perpetual_storage_wiggle` value. Plus, the ``cluster.storage_wiggler`` field reports storage wiggle details.
Trace Events
----------------------

View File

@ -28,7 +28,6 @@ Features
* Improved the efficiency with which storage servers replicate data between themselves. `(PR #5017) <https://github.com/apple/foundationdb/pull/5017>`_
* Added support to ``exclude command`` to exclude based on locality match. `(PR #5113) <https://github.com/apple/foundationdb/pull/5113>`_
* Add the ``trace_partial_file_suffix`` network option. This option will give unfinished trace files a special suffix to indicate they're not complete yet. When the trace file is complete, it is renamed to remove the suffix. `(PR #5328) <https://github.com/apple/foundationdb/pull/5328>`_
* Added "get range and flat map" feature with new APIs (see Bindings section). Storage servers are able to generate the keys in the queries based on another query. With this, upper layer can push some computations down to FDB, to improve latency and bandwidth when read. `(PR #5609) <https://github.com/apple/foundationdb/pull/5609>`_
Performance
-----------
@ -85,8 +84,6 @@ Bindings
* C: Added a function, ``fdb_database_create_snapshot``, to create a snapshot of the database. `(PR #4241) <https://github.com/apple/foundationdb/pull/4241/files>`_
* C: Added ``fdb_database_get_main_thread_busyness`` function to report how busy a client's main thread is. `(PR #4504) <https://github.com/apple/foundationdb/pull/4504>`_
* Java: Added ``Database.getMainThreadBusyness`` function to report how busy a client's main thread is. `(PR #4564) <https://github.com/apple/foundationdb/pull/4564>`_
* C: Added ``fdb_transaction_get_range_and_flat_map`` function to support running queries based on another query in one request. `(PR #5609) <https://github.com/apple/foundationdb/pull/5609>`_
* Java: Added ``Transaction.getRangeAndFlatMap`` function to support running queries based on another query in one request. `(PR #5609) <https://github.com/apple/foundationdb/pull/5609>`_
Other Changes
-------------

View File

@ -10,6 +10,7 @@ Release Notes
Features
--------
* Added ``USE_GRV_CACHE`` transaction option to allow read versions to be locally cached on the client side for latency optimizations. `(PR #5725) <https://github.com/apple/foundationdb/pull/5725>`_ `(PR #6664) <https://github.com/apple/foundationdb/pull/6664>`_
* Added "get range and flat map" feature with new APIs (see Bindings section). Storage servers are able to generate the keys in the queries based on another query. With this, upper layers can push some computations down to FDB to improve latency and bandwidth when reading. `(PR #5609) <https://github.com/apple/foundationdb/pull/5609>`_, `(PR #6181) <https://github.com/apple/foundationdb/pull/6181>`_, etc.
Performance
-----------
@ -22,14 +23,18 @@ Fixes
Status
------
* Added ``cluster.storage_wiggler`` field to report storage wiggle stats `(PR #6219) <https://github.com/apple/foundationdb/pull/6219>`_
Bindings
--------
* C: Added ``fdb_transaction_get_range_and_flat_map`` function to support running queries based on another query in one request. `(PR #5609) <https://github.com/apple/foundationdb/pull/5609>`_
* Java: Added ``Transaction.getRangeAndFlatMap`` function to support running queries based on another query in one request. `(PR #5609) <https://github.com/apple/foundationdb/pull/5609>`_
Other Changes
-------------
* OpenTracing support is now deprecated in favor of OpenTelemetry tracing, which will be enabled in a future release. `(PR #6478) <https://github.com/apple/foundationdb/pull/6478/files>`_
* Changed ``memory`` option to limit resident memory instead of virtual memory. Added a new ``memory_vsize`` option if limiting virtual memory is desired. `(PR #6719) <https://github.com/apple/foundationdb/pull/6719>`_
* Changed ``perpetual storage wiggle`` to wiggle the storage servers based on their creation time. `(PR #6219) <https://github.com/apple/foundationdb/pull/6219>`_
Earlier release notes
---------------------

View File

@ -29,6 +29,7 @@ set(FDBCLI_SRCS
TriggerDDTeamInfoLogCommand.actor.cpp
TssqCommand.actor.cpp
Util.actor.cpp
VersionEpochCommand.actor.cpp
linenoise/linenoise.h)
if(NOT WIN32)

View File

@ -190,6 +190,11 @@ ACTOR Future<bool> configureCommandActor(Reference<IDatabase> db,
case ConfigurationResult::DATABASE_CREATED:
printf("Database created\n");
break;
case ConfigurationResult::DATABASE_CREATED_WARN_ROCKSDB_EXPERIMENTAL:
printf("Database created\n");
fprintf(stderr,
"WARN: RocksDB storage engine type is still in experimental stage, not yet production tested.\n");
break;
case ConfigurationResult::DATABASE_UNAVAILABLE:
fprintf(stderr, "ERROR: The database is unavailable\n");
fprintf(stderr, "Type `configure FORCE <TOKEN...>' to configure without this check\n");
@ -250,6 +255,11 @@ ACTOR Future<bool> configureCommandActor(Reference<IDatabase> db,
"storage_migration_type=gradual' to set the gradual migration type.\n");
ret = false;
break;
case ConfigurationResult::SUCCESS_WARN_ROCKSDB_EXPERIMENTAL:
printf("Configuration changed\n");
fprintf(stderr,
"WARN: RocksDB storage engine type is still in experimental stage, not yet production tested.\n");
break;
default:
ASSERT(false);
ret = false;

View File

@ -0,0 +1,174 @@
/*
* VersionEpochCommand.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "boost/lexical_cast.hpp"
#include "fdbcli/fdbcli.actor.h"
#include "fdbclient/IClientApi.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "flow/Arena.h"
#include "flow/FastRef.h"
#include "flow/ThreadHelper.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
namespace fdb_cli {
// Special-key-space location used by this command to read and write the
// version epoch (writes require SPECIAL_KEY_SPACE_ENABLE_WRITES; see below).
const KeyRef versionEpochSpecialKey = LiteralStringRef("\xff\xff/management/version_epoch");
// Pairs the cluster's current read version with the version we would expect
// it to have given the configured version epoch and the wall clock.
struct VersionInfo {
	int64_t version; // read version obtained from the cluster
	int64_t expectedVersion; // version implied by the epoch and current time
};
// Reads the cluster's current read version and, when a version epoch is set,
// the version the cluster would be expected to have based on the wall clock.
// Returns an empty Optional when no version epoch is set. Retries via
// onError on transient failures.
ACTOR static Future<Optional<VersionInfo>> getVersionInfo(Reference<IDatabase> db) {
	state Reference<ITransaction> tr = db->createTransaction();
	loop {
		try {
			tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
			state Version rv = wait(safeThreadFutureToFuture(tr->getReadVersion()));
			state ThreadFuture<Optional<Value>> versionEpochValFuture = tr->get(versionEpochKey);
			Optional<Value> versionEpochVal = wait(safeThreadFutureToFuture(versionEpochValFuture));
			if (!versionEpochVal.present()) {
				return Optional<VersionInfo>();
			}
			int64_t versionEpoch = BinaryReader::fromStringRef<int64_t>(versionEpochVal.get(), Unversioned());
			// Expected version = seconds since the epoch, scaled to versions per second.
			// NOTE(review): computed with CORE_VERSIONSPERSECOND here but the
			// difference is displayed using VERSIONS_PER_SECOND in the command
			// actor — confirm the two knobs are intended to agree.
			int64_t expected = g_network->timer() * CLIENT_KNOBS->CORE_VERSIONSPERSECOND - versionEpoch;
			return VersionInfo{ rv, expected };
		} catch (Error& e) {
			wait(safeThreadFutureToFuture(tr->onError(e)));
		}
	}
}
// Reads the version epoch through the special key space using the supplied
// transaction. Returns an empty Optional when the epoch is unset; retries via
// onError on transient failures.
ACTOR static Future<Optional<int64_t>> getVersionEpoch(Reference<ITransaction> tr) {
	loop {
		try {
			state ThreadFuture<Optional<Value>> versionEpochValFuture = tr->get(versionEpochSpecialKey);
			Optional<Value> versionEpochVal = wait(safeThreadFutureToFuture(versionEpochValFuture));
			// The special key exposes the epoch as a decimal string.
			return versionEpochVal.present() ? boost::lexical_cast<int64_t>(versionEpochVal.get().toString())
			                                 : Optional<int64_t>();
		} catch (Error& e) {
			wait(safeThreadFutureToFuture(tr->onError(e)));
		}
	}
}
// Implements the fdbcli `versionepoch` command. Subcommands handled here:
//   (none)    - report actual vs expected version and their difference
//   get       - print the current version epoch, if set
//   enable    - set the version epoch to the default (0) if unset
//   set EPOCH - set the version epoch to EPOCH
//   disable   - clear the version epoch
//   commit    - advance the cluster to the expected version
// Returns true on success; prints usage and returns false on malformed input.
ACTOR Future<bool> versionEpochCommandActor(Reference<IDatabase> db, Database cx, std::vector<StringRef> tokens) {
	if (tokens.size() <= 3) {
		state Reference<ITransaction> tr = db->createTransaction();
		if (tokens.size() == 1) {
			// No subcommand: report the offset between the expected and actual version.
			Optional<VersionInfo> versionInfo = wait(getVersionInfo(db));
			if (versionInfo.present()) {
				int64_t diff = versionInfo.get().expectedVersion - versionInfo.get().version;
				printf("Version: %" PRId64 "\n", versionInfo.get().version);
				printf("Expected: %" PRId64 "\n", versionInfo.get().expectedVersion);
				printf("Difference: %" PRId64 " (%.2fs)\n", diff, 1.0 * diff / CLIENT_KNOBS->VERSIONS_PER_SECOND);
			} else {
				printf("Version epoch is unset\n");
			}
			return true;
		} else if (tokens.size() == 2 && tokencmp(tokens[1], "get")) {
			// Print the raw epoch value without computing version expectations.
			Optional<int64_t> versionEpoch = wait(getVersionEpoch(db->createTransaction()));
			if (versionEpoch.present()) {
				printf("Current version epoch is %" PRId64 "\n", versionEpoch.get());
			} else {
				printf("Version epoch is unset\n");
			}
			return true;
		} else if (tokens.size() == 2 && tokencmp(tokens[1], "disable")) {
			// Clearing the version epoch means versions will no longer attempt
			// to advance at the same rate as the clock. The current version
			// will remain unchanged.
			loop {
				try {
					tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
					// NOTE(review): the epoch is read through a freshly created
					// transaction while the clear is issued on `tr` — confirm the
					// read and the clear are not meant to share one transaction.
					Optional<int64_t> versionEpoch = wait(getVersionEpoch(db->createTransaction()));
					if (!versionEpoch.present()) {
						// Already unset; nothing to do.
						return true;
					} else {
						tr->clear(versionEpochSpecialKey);
						wait(safeThreadFutureToFuture(tr->commit()));
					}
				} catch (Error& e) {
					wait(safeThreadFutureToFuture(tr->onError(e)));
				}
			}
		} else if ((tokens.size() == 2 && tokencmp(tokens[1], "enable")) ||
		           (tokens.size() == 3 && tokencmp(tokens[1], "set"))) {
			state int64_t v; // the target epoch value
			if (tokens.size() == 3) {
				int n = 0;
				// Require the whole token to parse as a signed 64-bit integer.
				if (sscanf(tokens[2].toString().c_str(), "%" SCNd64 "%n", &v, &n) != 1 || n != tokens[2].size()) {
					printUsage(tokens[0]);
					return false;
				}
			} else {
				v = 0; // default version epoch
			}
			loop {
				try {
					tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
					Optional<int64_t> versionEpoch = wait(getVersionEpoch(tr));
					// Write only if unset, or if `set` was given a different value;
					// otherwise report that the epoch is already in effect.
					if (!versionEpoch.present() || (versionEpoch.get() != v && tokens.size() == 3)) {
						tr->set(versionEpochSpecialKey, BinaryWriter::toValue(v, Unversioned()));
						wait(safeThreadFutureToFuture(tr->commit()));
					} else {
						printf("Version epoch enabled. Run `versionepoch commit` to irreversibly jump to the target "
						       "version\n");
						return true;
					}
				} catch (Error& e) {
					wait(safeThreadFutureToFuture(tr->onError(e)));
				}
			}
		} else if (tokens.size() == 2 && tokencmp(tokens[1], "commit")) {
			// One-time jump of the cluster version to the expected version.
			Optional<VersionInfo> versionInfo = wait(getVersionInfo(db));
			if (versionInfo.present()) {
				wait(advanceVersion(cx, versionInfo.get().expectedVersion));
			} else {
				printf("Must set the version epoch before committing it (see `versionepoch enable`)\n");
			}
			return true;
		}
	}
	printUsage(tokens[0]);
	return false;
}
// Registers the `versionepoch` command with fdbcli. The usage line lists every
// subcommand handled by versionEpochCommandActor, including `get`.
CommandFactory versionEpochFactory(
    "versionepoch",
    CommandHelp("versionepoch [<enable|commit|get|set|disable> [EPOCH]]",
                "Read or write the version epoch",
                "If no arguments are specified, reports the offset between the expected version "
                "and the actual version. Otherwise, enables, disables, or commits the version epoch. "
                "Setting the version epoch can be irreversible since it can cause a large version jump. "
                "Thus, the version epoch must first be enabled with the enable or set command. This "
                "causes a recovery. Once the version epoch has been set, versions may be given out at "
                "a faster or slower rate to attempt to match the actual version to the expected version, "
                "based on the version epoch. After setting the version, run the commit command to perform "
                "a one time jump to the expected version. This is useful when there is a very large gap "
                "between the current version and the expected version. Note that once a version jump has "
                "occurred, it cannot be undone. Run this command without any arguments to see the current "
                "and expected version."));
} // namespace fdb_cli

View File

@ -1646,6 +1646,13 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
continue;
}
if (tokencmp(tokens[0], "versionepoch")) {
bool _result = wait(makeInterruptable(versionEpochCommandActor(db, localDb, tokens)));
if (!_result)
is_error = true;
continue;
}
if (tokencmp(tokens[0], "kill")) {
getTransaction(db, managementTenant, tr, options, intrans);
bool _result = wait(makeInterruptable(killCommandActor(db, tr, tokens, &address_interface)));

View File

@ -210,6 +210,10 @@ ACTOR Future<bool> throttleCommandActor(Reference<IDatabase> db, std::vector<Str
ACTOR Future<bool> triggerddteaminfologCommandActor(Reference<IDatabase> db);
// tssq command
ACTOR Future<bool> tssqCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
// versionepoch command
ACTOR Future<bool> versionEpochCommandActor(Reference<IDatabase> db, Database cx, std::vector<StringRef> tokens);
// targetversion command
ACTOR Future<bool> targetVersionCommandActor(Reference<IDatabase> db, std::vector<StringRef> tokens);
} // namespace fdb_cli

View File

@ -44,7 +44,18 @@ struct BlobWorkerInterface {
BlobWorkerInterface() {}
explicit BlobWorkerInterface(const struct LocalityData& l, UID id) : locality(l), myId(id) {}
void initEndpoints() {}
void initEndpoints() {
	// TODO: specify endpoint priorities?
	// Register all request streams as one endpoint group. The insertion order
	// here is significant: serialize() reconstructs each stream from
	// waitFailure's endpoint via getAdjustedEndpoint(index), so the indices
	// there must match this ordering exactly.
	std::vector<std::pair<FlowReceiver*, TaskPriority>> streams;
	streams.push_back(waitFailure.getReceiver());
	streams.push_back(blobGranuleFileRequest.getReceiver());
	streams.push_back(assignBlobRangeRequest.getReceiver());
	streams.push_back(revokeBlobRangeRequest.getReceiver());
	streams.push_back(granuleAssignmentsRequest.getReceiver());
	streams.push_back(granuleStatusStreamRequest.getReceiver());
	streams.push_back(haltBlobWorker.getReceiver());
	FlowTransport::transport().addEndpoints(streams);
}
UID id() const { return myId; }
NetworkAddress address() const { return blobGranuleFileRequest.getEndpoint().getPrimaryAddress(); }
NetworkAddress stableAddress() const { return blobGranuleFileRequest.getEndpoint().getStableAddress(); }
@ -54,16 +65,22 @@ struct BlobWorkerInterface {
template <class Archive>
void serialize(Archive& ar) {
serializer(ar,
waitFailure,
blobGranuleFileRequest,
assignBlobRangeRequest,
revokeBlobRangeRequest,
granuleAssignmentsRequest,
granuleStatusStreamRequest,
haltBlobWorker,
locality,
myId);
// use adjusted endpoints
serializer(ar, myId, locality, waitFailure);
if (Archive::isDeserializing) {
blobGranuleFileRequest =
RequestStream<struct BlobGranuleFileRequest>(waitFailure.getEndpoint().getAdjustedEndpoint(1));
assignBlobRangeRequest =
RequestStream<struct AssignBlobRangeRequest>(waitFailure.getEndpoint().getAdjustedEndpoint(2));
revokeBlobRangeRequest =
RequestStream<struct RevokeBlobRangeRequest>(waitFailure.getEndpoint().getAdjustedEndpoint(3));
granuleAssignmentsRequest =
RequestStream<struct GetGranuleAssignmentsRequest>(waitFailure.getEndpoint().getAdjustedEndpoint(4));
granuleStatusStreamRequest =
RequestStream<struct GranuleStatusStreamRequest>(waitFailure.getEndpoint().getAdjustedEndpoint(5));
haltBlobWorker =
RequestStream<struct HaltBlobWorkerRequest>(waitFailure.getEndpoint().getAdjustedEndpoint(6));
}
}
};

View File

@ -372,6 +372,9 @@ public:
Future<std::vector<OverlappingChangeFeedEntry>> getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion);
Future<Void> popChangeFeedMutations(Key rangeID, Version version);
Future<Key> purgeBlobGranules(KeyRange keyRange, Version purgeVersion, bool force = false);
Future<Void> waitPurgeGranulesComplete(Key purgeKey);
// private:
explicit DatabaseContext(Reference<AsyncVar<Reference<IClusterConnectionRecord>>> connectionRecord,
Reference<AsyncVar<ClientDBInfo>> clientDBInfo,

View File

@ -22,6 +22,7 @@
#define FDBCLIENT_FDBTYPES_H
#include <algorithm>
#include <cinttypes>
#include <set>
#include <string>
#include <vector>

View File

@ -65,6 +65,8 @@ enum class ConfigurationResult {
LOCKED_NOT_NEW,
SUCCESS_WARN_PPW_GRADUAL,
SUCCESS,
SUCCESS_WARN_ROCKSDB_EXPERIMENTAL,
DATABASE_CREATED_WARN_ROCKSDB_EXPERIMENTAL,
};
enum class CoordinatorsResult {
@ -290,6 +292,7 @@ Future<ConfigurationResult> changeConfig(Reference<DB> db, std::map<std::string,
state bool oldReplicationUsesDcId = false;
state bool warnPPWGradual = false;
state bool warnChangeStorageNoMigrate = false;
state bool warnRocksDBIsExperimental = false;
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
@ -477,6 +480,9 @@ Future<ConfigurationResult> changeConfig(Reference<DB> db, std::map<std::string,
} else if (newConfig.storageMigrationType == StorageMigrationType::GRADUAL &&
newConfig.perpetualStorageWiggleSpeed == 0) {
warnPPWGradual = true;
} else if (newConfig.storageServerStoreType != oldConfig.storageServerStoreType &&
newConfig.storageServerStoreType == KeyValueStoreType::SSD_ROCKSDB_V1) {
warnRocksDBIsExperimental = true;
}
}
}
@ -525,6 +531,9 @@ Future<ConfigurationResult> changeConfig(Reference<DB> db, std::map<std::string,
Optional<Value> v = wait(safeThreadFutureToFuture(vF));
if (v != m[initIdKey.toString()])
return ConfigurationResult::DATABASE_ALREADY_CREATED;
else if (m[configKeysPrefix.toString() + "storage_engine"] ==
std::to_string(KeyValueStoreType::SSD_ROCKSDB_V1))
return ConfigurationResult::DATABASE_CREATED_WARN_ROCKSDB_EXPERIMENTAL;
else
return ConfigurationResult::DATABASE_CREATED;
} catch (Error& e2) {
@ -538,6 +547,8 @@ Future<ConfigurationResult> changeConfig(Reference<DB> db, std::map<std::string,
if (warnPPWGradual) {
return ConfigurationResult::SUCCESS_WARN_PPW_GRADUAL;
} else if (warnRocksDBIsExperimental) {
return ConfigurationResult::SUCCESS_WARN_ROCKSDB_EXPERIMENTAL;
} else {
return ConfigurationResult::SUCCESS;
}

View File

@ -152,6 +152,11 @@ public:
// Management API, create snapshot
virtual ThreadFuture<Void> createSnapshot(const StringRef& uid, const StringRef& snapshot_command) = 0;
// Purge blob granules API. purgeBlobGranules is asynchronous; calling waitPurgeGranulesComplete afterwards
// guarantees completion.
virtual ThreadFuture<Key> purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) = 0;
virtual ThreadFuture<Void> waitPurgeGranulesComplete(const KeyRef& purgeKey) = 0;
// Interface to manage shared state across multiple connections to the same Database
virtual ThreadFuture<DatabaseSharedState*> createSharedState() = 0;
virtual void setSharedState(DatabaseSharedState* p) = 0;

View File

@ -516,6 +516,38 @@ ThreadFuture<ProtocolVersion> DLDatabase::getServerProtocol(Optional<ProtocolVer
});
}
// Registers a purge of the given key range at purgeVersion via the loaded C client library
// and returns the purge key to pass to waitPurgeGranulesComplete().
// Fails with unsupported_operation when the loaded library predates this API.
ThreadFuture<Key> DLDatabase::purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) {
    if (!api->purgeBlobGranules) {
        return unsupported_operation();
    }

    FdbCApi::FDBFuture* future = api->purgeBlobGranules(db,
                                                        keyRange.begin.begin(),
                                                        keyRange.begin.size(),
                                                        keyRange.end.begin(),
                                                        keyRange.end.size(),
                                                        purgeVersion,
                                                        force);

    return toThreadFuture<Key>(api, future, [](FdbCApi::FDBFuture* f, FdbCApi* api) {
        const uint8_t* keyBytes = nullptr;
        int keyLen = 0;
        FdbCApi::fdb_error_t err = api->futureGetKey(f, &keyBytes, &keyLen);
        ASSERT(!err);
        // The returned bytes are owned by the FDBFuture and freed with it, so copy them
        // into a fresh Arena to keep the Key valid afterwards.
        return Key(KeyRef(keyBytes, keyLen), Arena());
    });
}
// Returns a future that resolves once the purge identified by purgeKey has completed.
// Fails with unsupported_operation when the loaded library predates this API.
ThreadFuture<Void> DLDatabase::waitPurgeGranulesComplete(const KeyRef& purgeKey) {
    if (!api->waitPurgeGranulesComplete) {
        return unsupported_operation();
    }

    FdbCApi::FDBFuture* future = api->waitPurgeGranulesComplete(db, purgeKey.begin(), purgeKey.size());
    // The future carries no payload; map success to Void.
    return toThreadFuture<Void>(api, future, [](FdbCApi::FDBFuture*, FdbCApi*) { return Void(); });
}
// DLApi
// Loads the specified function from a dynamic library
@ -590,6 +622,15 @@ void DLApi::init() {
loadClientFunction(
&api->databaseCreateSnapshot, lib, fdbCPath, "fdb_database_create_snapshot", headerVersion >= 700);
loadClientFunction(
&api->purgeBlobGranules, lib, fdbCPath, "fdb_database_purge_blob_granules", headerVersion >= 710);
loadClientFunction(&api->waitPurgeGranulesComplete,
lib,
fdbCPath,
"fdb_database_wait_purge_granules_complete",
headerVersion >= 710);
loadClientFunction(
&api->tenantCreateTransaction, lib, fdbCPath, "fdb_tenant_create_transaction", headerVersion >= 710);
loadClientFunction(&api->tenantDestroy, lib, fdbCPath, "fdb_tenant_destroy", headerVersion >= 710);
@ -609,7 +650,7 @@ void DLApi::init() {
headerVersion >= 0);
loadClientFunction(&api->transactionGetRange, lib, fdbCPath, "fdb_transaction_get_range", headerVersion >= 0);
loadClientFunction(
&api->transactionGetMappedRange, lib, fdbCPath, "fdb_transaction_get_mapped_range", headerVersion >= 700);
&api->transactionGetMappedRange, lib, fdbCPath, "fdb_transaction_get_mapped_range", headerVersion >= 710);
loadClientFunction(
&api->transactionGetVersionstamp, lib, fdbCPath, "fdb_transaction_get_versionstamp", headerVersion >= 410);
loadClientFunction(&api->transactionSet, lib, fdbCPath, "fdb_transaction_set", headerVersion >= 0);
@ -667,7 +708,7 @@ void DLApi::init() {
loadClientFunction(
&api->futureGetKeyValueArray, lib, fdbCPath, "fdb_future_get_keyvalue_array", headerVersion >= 0);
loadClientFunction(
&api->futureGetMappedKeyValueArray, lib, fdbCPath, "fdb_future_get_mappedkeyvalue_array", headerVersion >= 700);
&api->futureGetMappedKeyValueArray, lib, fdbCPath, "fdb_future_get_mappedkeyvalue_array", headerVersion >= 710);
loadClientFunction(&api->futureGetSharedState, lib, fdbCPath, "fdb_future_get_shared_state", headerVersion >= 710);
loadClientFunction(&api->futureSetCallback, lib, fdbCPath, "fdb_future_set_callback", headerVersion >= 0);
loadClientFunction(&api->futureCancel, lib, fdbCPath, "fdb_future_cancel", headerVersion >= 0);
@ -1442,6 +1483,17 @@ double MultiVersionDatabase::getMainThreadBusyness() {
return localClientBusyness;
}
// Issues a blob granule purge through the currently selected client, if one is connected;
// otherwise the operation stays pending. The result is made abortable on a database state change.
ThreadFuture<Key> MultiVersionDatabase::purgeBlobGranules(const KeyRangeRef& keyRange,
                                                          Version purgeVersion,
                                                          bool force) {
    ThreadFuture<Key> purgeFuture;
    if (dbState->db) {
        purgeFuture = dbState->db->purgeBlobGranules(keyRange, purgeVersion, force);
    } else {
        purgeFuture = ThreadFuture<Key>(Never());
    }
    return abortableFuture(purgeFuture, dbState->dbVar->get().onChange);
}
// Waits for a previously issued purge through the currently selected client, if one is
// connected; otherwise stays pending. Made abortable on a database state change.
ThreadFuture<Void> MultiVersionDatabase::waitPurgeGranulesComplete(const KeyRef& purgeKey) {
    ThreadFuture<Void> waitFuture;
    if (dbState->db) {
        waitFuture = dbState->db->waitPurgeGranulesComplete(purgeKey);
    } else {
        waitFuture = ThreadFuture<Void>(Never());
    }
    return abortableFuture(waitFuture, dbState->dbVar->get().onChange);
}
// Returns the protocol version reported by the coordinator this client is connected to
// If an expected version is given, the future won't return until the protocol version is different than expected
// Note: this will never return if the server is running a protocol from FDB 5.0 or older
@ -1536,7 +1588,7 @@ void MultiVersionDatabase::DatabaseState::protocolVersionChanged(ProtocolVersion
.detail("OldProtocolVersion", dbProtocolVersion);
// When the protocol version changes, clear the corresponding entry in the shared state map
// so it can be re-initialized. Only do so if there was a valid previous protocol version.
if (dbProtocolVersion.present()) {
if (dbProtocolVersion.present() && MultiVersionApi::apiVersionAtLeast(710)) {
MultiVersionApi::api->clearClusterSharedStateMapEntry(clusterFilePath);
}
@ -2333,9 +2385,14 @@ ThreadFuture<Void> MultiVersionApi::updateClusterSharedStateMap(std::string clus
// Removes the shared-state entry for the given cluster file path, releasing this client's
// reference on the DatabaseSharedState.
//
// Fix: the merge left both the old and new lookup in place — the old unconditional
// clusterSharedStateMap[clusterFilePath].get() (which default-inserts on a missing key and
// redeclares ssPtr) and the old erase-by-key. Keep only the find-based version, which logs
// an error and returns instead of crashing when the entry is absent.
void MultiVersionApi::clearClusterSharedStateMapEntry(std::string clusterFilePath) {
    MutexHolder holder(lock);
    auto mapEntry = clusterSharedStateMap.find(clusterFilePath);
    // The entry may already have been cleared (e.g. by a concurrent protocol version change).
    if (mapEntry == clusterSharedStateMap.end()) {
        TraceEvent(SevError, "ClusterSharedStateMapEntryNotFound").detail("ClusterFilePath", clusterFilePath);
        return;
    }
    auto ssPtr = mapEntry->second.get();
    ssPtr->delRef(ssPtr);
    clusterSharedStateMap.erase(mapEntry);
}
std::vector<std::string> parseOptionValues(std::string valueStr) {

View File

@ -156,6 +156,16 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
double (*databaseGetMainThreadBusyness)(FDBDatabase* database);
FDBFuture* (*databaseGetServerProtocol)(FDBDatabase* database, uint64_t expectedVersion);
FDBFuture* (*purgeBlobGranules)(FDBDatabase* db,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length,
int64_t purge_version,
fdb_bool_t force);
FDBFuture* (*waitPurgeGranulesComplete)(FDBDatabase* db, uint8_t const* purge_key_name, int purge_key_name_length);
// Tenant
fdb_error_t (*tenantCreateTransaction)(FDBTenant* tenant, FDBTransaction** outTransaction);
void (*tenantDestroy)(FDBTenant* tenant);
@ -438,6 +448,9 @@ public:
ThreadFuture<Void> forceRecoveryWithDataLoss(const StringRef& dcid) override;
ThreadFuture<Void> createSnapshot(const StringRef& uid, const StringRef& snapshot_command) override;
ThreadFuture<Key> purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override;
ThreadFuture<Void> waitPurgeGranulesComplete(const KeyRef& purgeKey) override;
ThreadFuture<DatabaseSharedState*> createSharedState() override;
void setSharedState(DatabaseSharedState* p) override;
@ -716,6 +729,9 @@ public:
ThreadFuture<Void> forceRecoveryWithDataLoss(const StringRef& dcid) override;
ThreadFuture<Void> createSnapshot(const StringRef& uid, const StringRef& snapshot_command) override;
ThreadFuture<Key> purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override;
ThreadFuture<Void> waitPurgeGranulesComplete(const KeyRef& purgeKey) override;
ThreadFuture<DatabaseSharedState*> createSharedState() override;
void setSharedState(DatabaseSharedState* p) override;

View File

@ -1519,6 +1519,12 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<IClusterConnection
std::make_unique<AdvanceVersionImpl>(
singleKeyRange(LiteralStringRef("min_required_commit_version"))
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
registerSpecialKeySpaceModule(
SpecialKeySpace::MODULE::MANAGEMENT,
SpecialKeySpace::IMPLTYPE::READWRITE,
std::make_unique<VersionEpochImpl>(
singleKeyRange(LiteralStringRef("version_epoch"))
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
registerSpecialKeySpaceModule(
SpecialKeySpace::MODULE::MANAGEMENT,
SpecialKeySpace::IMPLTYPE::READWRITE,
@ -9178,3 +9184,86 @@ Future<Void> DatabaseContext::popChangeFeedMutations(Key rangeID, Version versio
Reference<DatabaseContext::TransactionT> DatabaseContext::createTransaction() {
return makeReference<ReadYourWritesTransaction>(Database(Reference<DatabaseContext>::addRef(this)));
}
// Registers a purge request for `range` at `purgeVersion` in the system keyspace and returns
// the versionstamped purge key. The purge itself happens asynchronously; callers can pass the
// returned key to waitPurgeGranulesCompleteActor to wait for completion.
// NOTE(review): only force==true is implemented — non-forced purge throws unsupported_operation.
ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
                                         KeyRange range,
                                         Version purgeVersion,
                                         bool force) {
    state Database cx(db);
    state Transaction tr(cx);
    state Key purgeKey;

    // FIXME: implement force
    if (!force) {
        throw unsupported_operation();
    }

    loop {
        try {
            tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
            // Record the purge intent under a versionstamped key (so concurrent purge requests
            // get distinct keys), then bump the change key to signal the new request.
            Value purgeValue = blobGranulePurgeValueFor(purgeVersion, range, force);
            tr.atomicOp(
                addVersionStampAtEnd(blobGranulePurgeKeys.begin), purgeValue, MutationRef::SetVersionstampedKey);
            tr.set(blobGranulePurgeChangeKey, deterministicRandom()->randomUniqueID().toString());
            // The versionstamp future must be obtained before commit; it resolves only after
            // the commit succeeds, yielding the stamp used in the written key.
            state Future<Standalone<StringRef>> fTrVs = tr.getVersionstamp();
            wait(tr.commit());
            Standalone<StringRef> vs = wait(fTrVs);

            // Reconstruct the exact key the atomic op produced: prefix + versionstamp.
            purgeKey = blobGranulePurgeKeys.begin.withSuffix(vs);
            if (BG_REQUEST_DEBUG) {
                fmt::print("purgeBlobGranules for range [{0} - {1}) at version {2} registered {3}\n",
                           range.begin.printable(),
                           range.end.printable(),
                           purgeVersion,
                           purgeKey.printable());
            }
            break;
        } catch (Error& e) {
            if (BG_REQUEST_DEBUG) {
                fmt::print("purgeBlobGranules for range [{0} - {1}) at version {2} encountered error {3}\n",
                           range.begin.printable(),
                           range.end.printable(),
                           purgeVersion,
                           e.name());
            }
            // Standard retry loop: onError backs off / rethrows non-retryable errors.
            wait(tr.onError(e));
        }
    }
    return purgeKey;
}
// Public entry point for blob granule purges; see purgeBlobGranulesActor for the implementation.
Future<Key> DatabaseContext::purgeBlobGranules(KeyRange range, Version purgeVersion, bool force) {
    Reference<DatabaseContext> self = Reference<DatabaseContext>::addRef(this);
    return purgeBlobGranulesActor(self, range, purgeVersion, force);
}
// Waits until the purge registered under `purgeKey` (see purgeBlobGranulesActor) has
// completed, which is signaled by the purge key being cleared from the system keyspace.
// Uses a watch on the key to avoid busy polling.
ACTOR Future<Void> waitPurgeGranulesCompleteActor(Reference<DatabaseContext> db, Key purgeKey) {
    state Database cx(db);
    state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);

    loop {
        try {
            tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);

            Optional<Value> purgeVal = wait(tr->get(purgeKey));
            // Key absent => the purge request has been processed and removed.
            if (!purgeVal.present()) {
                if (BG_REQUEST_DEBUG) {
                    fmt::print("purgeBlobGranules for {0} succeeded\n", purgeKey.printable());
                }
                return Void();
            }
            if (BG_REQUEST_DEBUG) {
                fmt::print("purgeBlobGranules for {0} watching\n", purgeKey.printable());
            }
            // Register the watch, then commit to arm it; only wait on it after a successful
            // commit. The watch fires when the key's value changes (including being cleared).
            state Future<Void> watchFuture = tr->watch(purgeKey);
            wait(tr->commit());
            wait(watchFuture);
            // Start a fresh transaction for the re-check of the key.
            tr->reset();
        } catch (Error& e) {
            wait(tr->onError(e));
        }
    }
}
// Public entry point for waiting on a purge; see waitPurgeGranulesCompleteActor.
Future<Void> DatabaseContext::waitPurgeGranulesComplete(Key purgeKey) {
    Reference<DatabaseContext> self = Reference<DatabaseContext>::addRef(this);
    return waitPurgeGranulesCompleteActor(self, purgeKey);
}

View File

@ -36,6 +36,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( MAX_WRITE_TRANSACTION_LIFE_VERSIONS, 5 * VERSIONS_PER_SECOND ); if (randomize && BUGGIFY) MAX_WRITE_TRANSACTION_LIFE_VERSIONS=std::max<int>(1, 1 * VERSIONS_PER_SECOND);
init( MAX_COMMIT_BATCH_INTERVAL, 2.0 ); if( randomize && BUGGIFY ) MAX_COMMIT_BATCH_INTERVAL = 0.5; // Each commit proxy generates a CommitTransactionBatchRequest at least this often, so that versions always advance smoothly
MAX_COMMIT_BATCH_INTERVAL = std::min(MAX_COMMIT_BATCH_INTERVAL, MAX_READ_TRANSACTION_LIFE_VERSIONS/double(2*VERSIONS_PER_SECOND)); // Ensure that the proxy commits 2 times every MAX_READ_TRANSACTION_LIFE_VERSIONS, otherwise the master will not give out versions fast enough
init( MAX_VERSION_RATE_MODIFIER, 0.1 );
init( MAX_VERSION_RATE_OFFSET, VERSIONS_PER_SECOND ); // If the calculated version is more than this amount away from the expected version, it will be clamped to this value. This prevents huge version jumps.
// TLogs
init( TLOG_TIMEOUT, 0.4 ); //cannot buggify because of availability
@ -109,6 +111,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// disk snapshot max timeout, to be put in TLog, storage and coordinator nodes
init( MAX_FORKED_PROCESS_OUTPUT, 1024 );
init( SNAP_CREATE_MAX_TIMEOUT, 300.0 );
init( MAX_STORAGE_SNAPSHOT_FAULT_TOLERANCE, 1 );
init( MAX_COORDINATOR_SNAPSHOT_FAULT_TOLERANCE, 1 );
// Data distribution queue
init( HEALTH_POLL_TIME, 1.0 );
@ -529,6 +533,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( CC_HEALTH_TRIGGER_FAILOVER, false );
init( CC_FAILOVER_DUE_TO_HEALTH_MIN_DEGRADATION, 5 );
init( CC_FAILOVER_DUE_TO_HEALTH_MAX_DEGRADATION, 10 );
init( CC_ENABLE_ENTIRE_SATELLITE_MONITORING, false );
init( CC_SATELLITE_DEGRADATION_MIN_COMPLAINER, 3 );
init( CC_SATELLITE_DEGRADATION_MIN_BAD_SERVER, 3 );
init( INCOMPATIBLE_PEERS_LOGGING_INTERVAL, 600 ); if( randomize && BUGGIFY ) INCOMPATIBLE_PEERS_LOGGING_INTERVAL = 60.0;
init( EXPECTED_MASTER_FITNESS, ProcessClass::UnsetFit );
@ -719,6 +726,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( PEER_LATENCY_CHECK_MIN_POPULATION, 30 );
init( PEER_LATENCY_DEGRADATION_PERCENTILE, 0.90 );
init( PEER_LATENCY_DEGRADATION_THRESHOLD, 0.05 );
init( PEER_LATENCY_DEGRADATION_PERCENTILE_SATELLITE, 0.90 );
init( PEER_LATENCY_DEGRADATION_THRESHOLD_SATELLITE, 0.1 );
init( PEER_TIMEOUT_PERCENTAGE_DEGRADATION_THRESHOLD, 0.1 );
init( PEER_DEGRADATION_CONNECTION_FAILURE_COUNT, 1 );
init( WORKER_HEALTH_REPORT_RECENT_DESTROYED_PEER, true );
@ -827,6 +836,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( REDWOOD_METRICS_INTERVAL, 5.0 );
init( REDWOOD_HISTOGRAM_INTERVAL, 30.0 );
init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; }
init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); }
// Server request latency measurement
init( LATENCY_SAMPLE_SIZE, 100000 );

View File

@ -39,6 +39,8 @@ public:
int64_t MAX_WRITE_TRANSACTION_LIFE_VERSIONS;
double MAX_COMMIT_BATCH_INTERVAL; // Each commit proxy generates a CommitTransactionBatchRequest at least this
// often, so that versions always advance smoothly
double MAX_VERSION_RATE_MODIFIER;
int64_t MAX_VERSION_RATE_OFFSET;
// TLogs
bool PEEK_USING_STREAMING;
@ -466,6 +468,14 @@ public:
// failover.
int CC_FAILOVER_DUE_TO_HEALTH_MAX_DEGRADATION; // The maximum number of degraded servers that can trigger a
// failover.
bool CC_ENABLE_ENTIRE_SATELLITE_MONITORING; // When enabled, gray failure tries to detect whether the entire
// satellite DC is degraded.
int CC_SATELLITE_DEGRADATION_MIN_COMPLAINER; // When the network between primary and satellite becomes bad, all the
// workers in primary may have bad network talking to the satellite.
// This is the minimum amount of complainer for a satellite worker to
// be determined as degraded worker.
int CC_SATELLITE_DEGRADATION_MIN_BAD_SERVER; // The minimum amount of degraded server in satellite DC to be
// determined as degraded satellite.
// Knobs used to select the best policy (via monte carlo)
int POLICY_RATING_TESTS; // number of tests per policy (in order to compare)
@ -575,6 +585,12 @@ public:
// disk snapshot
int64_t MAX_FORKED_PROCESS_OUTPUT;
double SNAP_CREATE_MAX_TIMEOUT;
// Maximum number of storage servers a snapshot can fail to
// capture while still succeeding
int64_t MAX_STORAGE_SNAPSHOT_FAULT_TOLERANCE;
// Maximum number of coordinators a snapshot can fail to
// capture while still succeeding
int64_t MAX_COORDINATOR_SNAPSHOT_FAULT_TOLERANCE;
// Storage Metrics
double STORAGE_METRICS_AVERAGE_INTERVAL;
@ -659,8 +675,12 @@ public:
bool ENABLE_WORKER_HEALTH_MONITOR;
double WORKER_HEALTH_MONITOR_INTERVAL; // Interval between two health monitor health check.
int PEER_LATENCY_CHECK_MIN_POPULATION; // The minimum number of latency samples required to check a peer.
double PEER_LATENCY_DEGRADATION_PERCENTILE; // The percentile latency used to check peer health.
double PEER_LATENCY_DEGRADATION_PERCENTILE; // The percentile latency used to check peer health among workers inside
// primary or remote DC.
double PEER_LATENCY_DEGRADATION_THRESHOLD; // The latency threshold to consider a peer degraded.
double PEER_LATENCY_DEGRADATION_PERCENTILE_SATELLITE; // The percentile latency used to check peer health between
// primary and primary satellite.
double PEER_LATENCY_DEGRADATION_THRESHOLD_SATELLITE; // The latency threshold to consider a peer degraded.
double PEER_TIMEOUT_PERCENTAGE_DEGRADATION_THRESHOLD; // The percentage of timeout to consider a peer degraded.
int PEER_DEGRADATION_CONNECTION_FAILURE_COUNT; // The number of connection failures experienced during measurement
// period to consider a peer degraded.
@ -784,6 +804,7 @@ public:
double REDWOOD_METRICS_INTERVAL;
double REDWOOD_HISTOGRAM_INTERVAL;
bool REDWOOD_EVICT_UPDATED_PAGES; // Whether to prioritize eviction of updated pages from cache.
int REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT; // Minimum height for which to keep and reuse page decode caches
// Server request latency measurement
int LATENCY_SAMPLE_SIZE;

View File

@ -106,6 +106,8 @@ std::unordered_map<std::string, KeyRange> SpecialKeySpace::managementApiCommandT
{ "advanceversion",
singleKeyRange(LiteralStringRef("min_required_commit_version"))
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
{ "versionepoch",
singleKeyRange(LiteralStringRef("version_epoch")).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
{ "profile",
KeyRangeRef(LiteralStringRef("profiling/"), LiteralStringRef("profiling0"))
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
@ -1909,6 +1911,42 @@ Future<Optional<std::string>> AdvanceVersionImpl::commit(ReadYourWritesTransacti
return Optional<std::string>();
}
// Reads \xff/versionEpoch and, when it is set, renders its int64 value as a decimal string
// under the special key kr.begin. Returns an empty RangeResult when no epoch is set.
ACTOR static Future<RangeResult> getVersionEpochActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) {
    ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
    ryw->getTransaction().setOption(FDBTransactionOptions::RAW_ACCESS);
    Optional<Value> val = wait(ryw->getTransaction().get(versionEpochKey));
    RangeResult result;
    if (val.present()) {
        // Stored as a raw (Unversioned) int64; expose it as human-readable decimal text.
        int64_t versionEpoch = BinaryReader::fromStringRef<int64_t>(val.get(), Unversioned());
        ValueRef version(result.arena(), boost::lexical_cast<std::string>(versionEpoch));
        result.push_back_deep(result.arena(), KeyValueRef(kr.begin, version));
    }
    return result;
}
// kr is the single-key "version_epoch" range this module is registered for in the
// management special key space.
VersionEpochImpl::VersionEpochImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
// Serves reads of the version_epoch special key by delegating to getVersionEpochActor.
Future<RangeResult> VersionEpochImpl::getRange(ReadYourWritesTransaction* ryw,
                                               KeyRangeRef kr,
                                               GetRangeLimits limitsHint) const {
    // This impl covers exactly one key, so the requested range must equal the registered range.
    ASSERT(kr == getKeyRange());
    return getVersionEpochActor(ryw, kr);
}
// Applies a buffered write to the version_epoch special key: a present value sets
// \xff/versionEpoch to the given int64 (Unversioned encoding); a clear removes it.
// Returns no error message (commit proceeds normally).
Future<Optional<std::string>> VersionEpochImpl::commit(ReadYourWritesTransaction* ryw) {
    auto versionEpoch =
        ryw->getSpecialKeySpaceWriteMap()[SpecialKeySpace::getManagementApiCommandPrefix("versionepoch")].second;
    if (versionEpoch.present()) {
        // Decode then re-encode to store a canonical Unversioned int64 value.
        int64_t epoch = BinaryReader::fromStringRef<int64_t>(versionEpoch.get(), Unversioned());
        ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
        ryw->getTransaction().setOption(FDBTransactionOptions::RAW_ACCESS);
        ryw->getTransaction().set(versionEpochKey, BinaryWriter::toValue(epoch, Unversioned()));
    } else {
        // An explicit clear of the special key clears the underlying system key.
        ryw->getTransaction().clear(versionEpochKey);
    }
    return Optional<std::string>();
}
ClientProfilingImpl::ClientProfilingImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
ACTOR static Future<RangeResult> ClientProfilingGetRangeActor(ReadYourWritesTransaction* ryw,

View File

@ -476,6 +476,15 @@ public:
Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override;
};
// Read/write special-key-space module backing the management key "version_epoch"
// (stored at \xff/versionEpoch): reads render the stored epoch, commits set or clear it.
class VersionEpochImpl : public SpecialKeyRangeRWImpl {
public:
    explicit VersionEpochImpl(KeyRangeRef kr);
    Future<RangeResult> getRange(ReadYourWritesTransaction* ryw,
                                 KeyRangeRef kr,
                                 GetRangeLimits limitsHint) const override;
    Future<Optional<std::string>> commit(ReadYourWritesTransaction* ryw) override;
};
class ClientProfilingImpl : public SpecialKeyRangeRWImpl {
public:
explicit ClientProfilingImpl(KeyRangeRef kr);

View File

@ -838,6 +838,7 @@ std::vector<std::pair<UID, Version>> decodeBackupStartedValue(const ValueRef& va
const KeyRef coordinatorsKey = LiteralStringRef("\xff/coordinators");
const KeyRef logsKey = LiteralStringRef("\xff/logs");
const KeyRef minRequiredCommitVersionKey = LiteralStringRef("\xff/minRequiredCommitVersion");
const KeyRef versionEpochKey = LiteralStringRef("\xff/versionEpoch");
const KeyRef globalKeysPrefix = LiteralStringRef("\xff/globals");
const KeyRef lastEpochEndKey = LiteralStringRef("\xff/globals/lastEpochEnd");
@ -1170,9 +1171,9 @@ const KeyRangeRef blobGranuleMappingKeys(LiteralStringRef("\xff\x02/bgm/"), Lite
const KeyRangeRef blobGranuleLockKeys(LiteralStringRef("\xff\x02/bgl/"), LiteralStringRef("\xff\x02/bgl0"));
const KeyRangeRef blobGranuleSplitKeys(LiteralStringRef("\xff\x02/bgs/"), LiteralStringRef("\xff\x02/bgs0"));
const KeyRangeRef blobGranuleHistoryKeys(LiteralStringRef("\xff\x02/bgh/"), LiteralStringRef("\xff\x02/bgh0"));
const KeyRangeRef blobGranulePruneKeys(LiteralStringRef("\xff\x02/bgp/"), LiteralStringRef("\xff\x02/bgp0"));
const KeyRangeRef blobGranulePurgeKeys(LiteralStringRef("\xff\x02/bgp/"), LiteralStringRef("\xff\x02/bgp0"));
const KeyRangeRef blobGranuleVersionKeys(LiteralStringRef("\xff\x02/bgv/"), LiteralStringRef("\xff\x02/bgv0"));
const KeyRef blobGranulePruneChangeKey = LiteralStringRef("\xff\x02/bgpChange");
const KeyRef blobGranulePurgeChangeKey = LiteralStringRef("\xff\x02/bgpChange");
const uint8_t BG_FILE_TYPE_DELTA = 'D';
const uint8_t BG_FILE_TYPE_SNAPSHOT = 'S';
@ -1229,7 +1230,7 @@ std::tuple<Standalone<StringRef>, int64_t, int64_t, int64_t> decodeBlobGranuleFi
return std::tuple(filename, offset, length, fullFileLength);
}
const Value blobGranulePruneValueFor(Version version, KeyRange range, bool force) {
const Value blobGranulePurgeValueFor(Version version, KeyRange range, bool force) {
BinaryWriter wr(IncludeVersion(ProtocolVersion::withBlobGranule()));
wr << version;
wr << range;
@ -1237,7 +1238,7 @@ const Value blobGranulePruneValueFor(Version version, KeyRange range, bool force
return wr.toValue();
}
std::tuple<Version, KeyRange, bool> decodeBlobGranulePruneValue(ValueRef const& value) {
std::tuple<Version, KeyRange, bool> decodeBlobGranulePurgeValue(ValueRef const& value) {
Version version;
KeyRange range;
bool force;

View File

@ -350,6 +350,11 @@ extern const KeyRef logsKey;
// Used during backup/recovery to restrict version requirements
extern const KeyRef minRequiredCommitVersionKey;
// "\xff/versionEpochKey" = "[[uint64_t]]"
// Defines the base epoch representing version 0. The value itself is the
// number of microseconds since the Unix epoch.
extern const KeyRef versionEpochKey;
const Value logsValue(const std::vector<std::pair<UID, NetworkAddress>>& logs,
const std::vector<std::pair<UID, NetworkAddress>>& oldLogs);
std::pair<std::vector<std::pair<UID, NetworkAddress>>, std::vector<std::pair<UID, NetworkAddress>>> decodeLogsValue(
@ -567,9 +572,9 @@ extern const KeyRangeRef blobGranuleSplitKeys;
extern const KeyRangeRef blobGranuleHistoryKeys;
// \xff\x02/bgp/(start,end) = (version, force)
extern const KeyRangeRef blobGranulePruneKeys;
extern const KeyRangeRef blobGranulePurgeKeys;
extern const KeyRangeRef blobGranuleVersionKeys;
extern const KeyRef blobGranulePruneChangeKey;
extern const KeyRef blobGranulePurgeChangeKey;
const Key blobGranuleFileKeyFor(UID granuleID, Version fileVersion, uint8_t fileType);
std::tuple<UID, Version, uint8_t> decodeBlobGranuleFileKey(KeyRef const& key);
@ -578,8 +583,8 @@ const KeyRange blobGranuleFileKeyRangeFor(UID granuleID);
const Value blobGranuleFileValueFor(StringRef const& filename, int64_t offset, int64_t length, int64_t fullFileLength);
std::tuple<Standalone<StringRef>, int64_t, int64_t, int64_t> decodeBlobGranuleFileValue(ValueRef const& value);
const Value blobGranulePruneValueFor(Version version, KeyRange range, bool force);
std::tuple<Version, KeyRange, bool> decodeBlobGranulePruneValue(ValueRef const& value);
const Value blobGranulePurgeValueFor(Version version, KeyRange range, bool force);
std::tuple<Version, KeyRange, bool> decodeBlobGranulePurgeValue(ValueRef const& value);
const Value blobGranuleMappingValueFor(UID const& workerID);
UID decodeBlobGranuleMappingValue(ValueRef const& value);

View File

@ -127,6 +127,20 @@ ThreadFuture<ProtocolVersion> ThreadSafeDatabase::getServerProtocol(Optional<Pro
[db, expectedVersion]() -> Future<ProtocolVersion> { return db->getClusterProtocol(expectedVersion); });
}
// Dispatches a blob granule purge to the network thread and returns the resulting purge key.
ThreadFuture<Key> ThreadSafeDatabase::purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) {
    DatabaseContext* dbPtr = this->db;
    // Take an owning copy of the range: the caller's KeyRangeRef may not outlive the
    // deferred execution on the main thread.
    KeyRange ownedRange = keyRange;
    return onMainThread([dbPtr, ownedRange, purgeVersion, force]() -> Future<Key> {
        return dbPtr->purgeBlobGranules(ownedRange, purgeVersion, force);
    });
}
// Dispatches a purge-completion wait to the network thread.
ThreadFuture<Void> ThreadSafeDatabase::waitPurgeGranulesComplete(const KeyRef& purgeKey) {
    DatabaseContext* dbPtr = this->db;
    // Owning copy: the caller's KeyRef may be freed before the lambda runs on the main thread.
    Key ownedKey = purgeKey;
    return onMainThread([dbPtr, ownedKey]() -> Future<Void> { return dbPtr->waitPurgeGranulesComplete(ownedKey); });
}
ThreadSafeDatabase::ThreadSafeDatabase(std::string connFilename, int apiVersion) {
ClusterConnectionFile* connFile =
new ClusterConnectionFile(ClusterConnectionFile::lookupClusterFileName(connFilename).first);

View File

@ -59,6 +59,9 @@ public:
ThreadFuture<Void> forceRecoveryWithDataLoss(const StringRef& dcid) override;
ThreadFuture<Void> createSnapshot(const StringRef& uid, const StringRef& snapshot_command) override;
ThreadFuture<Key> purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) override;
ThreadFuture<Void> waitPurgeGranulesComplete(const KeyRef& purgeKey) override;
ThreadFuture<DatabaseSharedState*> createSharedState() override;
void setSharedState(DatabaseSharedState* p) override;

View File

@ -35,11 +35,10 @@ void HealthMonitor::purgeOutdatedHistory() {
auto& count = peerClosedNum[p.second];
--count;
ASSERT(count >= 0);
peerClosedHistory.pop_front();
if (count == 0) {
peerClosedNum.erase(p.second);
}
peerClosedHistory.pop_front();
} else {
break;
}

View File

@ -326,6 +326,7 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue<T>,
AcknowledgementReceiver acknowledgements;
Endpoint requestStreamEndpoint;
bool sentError = false;
bool notifiedFailed = false;
Promise<Void> onConnect;
NetNotifiedQueueWithAcknowledgements(int futures, int promises)
@ -402,14 +403,20 @@ struct NetNotifiedQueueWithAcknowledgements final : NotifiedQueue<T>,
return res;
}
~NetNotifiedQueueWithAcknowledgements() {
if (acknowledgements.getRawEndpoint().isValid() && acknowledgements.isRemoteEndpoint() && !this->hasError()) {
// Best-effort notification to the server's acknowledgement endpoint that this client no
// longer consumes the stream, sent at most once (guarded by notifiedFailed). Skipped when
// the endpoint was never set up, is local, or the queue already carries an error.
void notifyFailed() {
    if (!notifiedFailed && acknowledgements.getRawEndpoint().isValid() && acknowledgements.isRemoteEndpoint() &&
        !this->hasError()) {
        // Notify the server that a client is not using this ReplyPromiseStream anymore
        FlowTransport::transport().sendUnreliable(
            SerializeSource<ErrorOr<AcknowledgementReply>>(operation_obsolete()),
            acknowledgements.getEndpoint(TaskPriority::ReadSocket),
            false);
        notifiedFailed = true;
    }
}
~NetNotifiedQueueWithAcknowledgements() {
notifyFailed();
if (isRemoteEndpoint() && !sentError && !acknowledgements.failures.isReady()) {
// Notify the client ReplyPromiseStream was cancelled before sending an error, so the storage server must
// have died
@ -505,6 +512,8 @@ public:
return queue->onConnect.getFuture();
}
void notifyFailed() { queue->notifyFailed(); }
~ReplyPromiseStream() {
if (queue)
queue->delPromiseRef();

View File

@ -321,6 +321,8 @@ void endStreamOnDisconnect(Future<Void> signal,
wait(signal || stream.onConnected());
}
}
// Notify BEFORE dropping last reference, causing broken_promise to send on stream before destructor is called
stream.notifyFailed();
}
}

View File

@ -590,6 +590,18 @@ private:
TEST(true); // Recovering at a higher version.
}
// Handles a set of \xff/versionEpoch: mirrors the mutation into txnStateStore (skipped
// during the initial commit, which is already loading the store) and flags a configuration
// change (confChange) unconditionally.
void checkSetVersionEpochKey(MutationRef m) {
    if (m.param1 != versionEpochKey) {
        return;
    }
    // Decoded only for tracing; the raw value is stored as-is.
    int64_t versionEpoch = BinaryReader::fromStringRef<int64_t>(m.param2, Unversioned());
    TraceEvent("VersionEpoch", dbgid).detail("Epoch", versionEpoch);
    if (!initialCommit)
        txnStateStore->set(KeyValueRef(m.param1, m.param2));
    confChange = true;
    TEST(true); // Setting version epoch
}
void checkSetWriteRecoverKey(MutationRef m) {
if (m.param1 != writeRecoveryKey) {
return;
@ -957,6 +969,16 @@ private:
}
}
// Handles clear ranges that cover \xff/versionEpoch: removes the key from txnStateStore
// (skipped during the initial commit) and flags a configuration change (confChange).
void checkClearVersionEpochKeys(MutationRef m, KeyRangeRef range) {
    if (!range.contains(versionEpochKey)) {
        return;
    }
    if (!initialCommit)
        txnStateStore->clear(singleKeyRange(versionEpochKey));
    TraceEvent("MutationRequiresRestart", dbgid).detail("M", m);
    confChange = true;
}
void checkClearTenantMapPrefix(KeyRangeRef range) {
if (tenantMapKeys.intersects(range)) {
if (tenantMap) {
@ -1119,6 +1141,7 @@ public:
checkSetGlobalKeys(m);
checkSetWriteRecoverKey(m);
checkSetMinRequiredCommitVersionKey(m);
checkSetVersionEpochKey(m);
checkSetTenantMapPrefix(m);
checkSetOtherKeys(m);
} else if (m.type == MutationRef::ClearRange && isSystemKey(m.param2)) {
@ -1135,6 +1158,7 @@ public:
checkClearLogRangesRange(range);
checkClearTssMappingKeys(m, range);
checkClearTssQuarantineKeys(m, range);
checkClearVersionEpochKeys(m, range);
checkClearTenantMapPrefix(range);
checkClearMiscRangeKeys(range);
}

View File

@ -216,7 +216,7 @@ struct SplitEvaluation {
struct BlobManagerStats {
CounterCollection cc;
// FIXME: purging stats
Counter granuleSplits;
Counter granuleWriteHotSplits;
@ -226,6 +226,10 @@ struct BlobManagerStats {
Counter ccMismatches;
Counter ccTimeouts;
Counter ccErrors;
Counter purgesProcessed;
Counter granulesFullyPurged;
Counter granulesPartiallyPurged;
Counter filesPurged;
Future<Void> logger;
// Current stats maintained for a given blob worker process
@ -233,7 +237,9 @@ struct BlobManagerStats {
: cc("BlobManagerStats", id.toString()), granuleSplits("GranuleSplits", cc),
granuleWriteHotSplits("GranuleWriteHotSplits", cc), ccGranulesChecked("CCGranulesChecked", cc),
ccRowsChecked("CCRowsChecked", cc), ccBytesChecked("CCBytesChecked", cc), ccMismatches("CCMismatches", cc),
ccTimeouts("CCTimeouts", cc), ccErrors("CCErrors", cc) {
ccTimeouts("CCTimeouts", cc), ccErrors("CCErrors", cc), purgesProcessed("PurgesProcessed", cc),
granulesFullyPurged("GranulesFullyPurged", cc), granulesPartiallyPurged("GranulesPartiallyPurged", cc),
filesPurged("FilesPurged", cc) {
specialCounter(cc, "WorkerCount", [workers]() { return workers->size(); });
logger = traceCounters("BlobManagerMetrics", id, interval, &cc, "BlobManagerMetrics");
}
@ -438,6 +444,7 @@ ACTOR Future<UID> pickWorkerForAssign(Reference<BlobManagerData> bmData) {
ACTOR Future<Void> doRangeAssignment(Reference<BlobManagerData> bmData,
RangeAssignment assignment,
Optional<UID> workerID,
int64_t epoch,
int64_t seqNo) {
// WorkerId is set, except in case of assigning to any worker. Then we pick the worker to assign to in here
@ -468,7 +475,7 @@ ACTOR Future<Void> doRangeAssignment(Reference<BlobManagerData> bmData,
assignment.isAssign ? "assigning" : "revoking",
assignment.keyRange.begin.printable(),
assignment.keyRange.end.printable(),
bmData->epoch,
epoch,
seqNo,
workerID.get().toString());
}
@ -481,7 +488,7 @@ ACTOR Future<Void> doRangeAssignment(Reference<BlobManagerData> bmData,
AssignBlobRangeRequest req;
req.keyRange = KeyRangeRef(StringRef(req.arena, assignment.keyRange.begin),
StringRef(req.arena, assignment.keyRange.end));
req.managerEpoch = bmData->epoch;
req.managerEpoch = epoch;
req.managerSeqno = seqNo;
req.type = assignment.assign.get().type;
@ -497,7 +504,7 @@ ACTOR Future<Void> doRangeAssignment(Reference<BlobManagerData> bmData,
RevokeBlobRangeRequest req;
req.keyRange = KeyRangeRef(StringRef(req.arena, assignment.keyRange.begin),
StringRef(req.arena, assignment.keyRange.end));
req.managerEpoch = bmData->epoch;
req.managerEpoch = epoch;
req.managerSeqno = seqNo;
req.dispose = assignment.revoke.get().dispose;
@ -637,10 +644,10 @@ ACTOR Future<Void> rangeAssigner(Reference<BlobManagerData> bmData) {
}
count++;
}
ASSERT(count == 1);
if (skip) {
continue;
}
ASSERT(count == 1);
if (assignment.worker.present() && assignment.worker.get().isValid()) {
if (BM_DEBUG) {
@ -653,7 +660,7 @@ ACTOR Future<Void> rangeAssigner(Reference<BlobManagerData> bmData) {
bmData->workerAssignments.insert(assignment.keyRange, workerId);
bmData->assignsInProgress.insert(assignment.keyRange,
doRangeAssignment(bmData, assignment, workerId, seqNo));
doRangeAssignment(bmData, assignment, workerId, bmData->epoch, seqNo));
// If we know about the worker and this is not a continue, then this is a new range for the worker
if (bmData->workerStats.count(workerId) &&
assignment.assign.get().type != AssignRequestType::Continue) {
@ -662,8 +669,8 @@ ACTOR Future<Void> rangeAssigner(Reference<BlobManagerData> bmData) {
} else {
// Ensure the key boundaries are updated before we pick a worker
bmData->workerAssignments.insert(assignment.keyRange, UID());
bmData->assignsInProgress.insert(assignment.keyRange,
doRangeAssignment(bmData, assignment, Optional<UID>(), seqNo));
bmData->assignsInProgress.insert(
assignment.keyRange, doRangeAssignment(bmData, assignment, Optional<UID>(), bmData->epoch, seqNo));
}
} else {
@ -677,7 +684,8 @@ ACTOR Future<Void> rangeAssigner(Reference<BlobManagerData> bmData) {
if (existingRange.range() == assignment.keyRange && existingRange.cvalue() == assignment.worker.get()) {
bmData->workerAssignments.insert(assignment.keyRange, UID());
}
bmData->addActor.send(doRangeAssignment(bmData, assignment, assignment.worker.get(), seqNo));
bmData->addActor.send(
doRangeAssignment(bmData, assignment, assignment.worker.get(), bmData->epoch, seqNo));
} else {
auto currentAssignments = bmData->workerAssignments.intersectingRanges(assignment.keyRange);
for (auto& it : currentAssignments) {
@ -693,7 +701,7 @@ ACTOR Future<Void> rangeAssigner(Reference<BlobManagerData> bmData) {
}
// revoke the range for the worker that owns it, not the worker specified in the revoke
bmData->addActor.send(doRangeAssignment(bmData, assignment, it.value(), seqNo));
bmData->addActor.send(doRangeAssignment(bmData, assignment, it.value(), bmData->epoch, seqNo));
}
bmData->workerAssignments.insert(assignment.keyRange, UID());
}
@ -1356,26 +1364,6 @@ ACTOR Future<Void> monitorBlobWorkerStatus(Reference<BlobManagerData> bmData, Bl
// back is to split the range.
ASSERT(rep.doSplit);
// only evaluate for split if this worker currently owns the granule in this blob manager's mapping
auto currGranuleAssignment = bmData->workerAssignments.rangeContaining(rep.granuleRange.begin);
if (!(currGranuleAssignment.begin() == rep.granuleRange.begin &&
currGranuleAssignment.end() == rep.granuleRange.end &&
currGranuleAssignment.cvalue() == bwInterf.id())) {
if (BM_DEBUG) {
fmt::print("Manager {0} ignoring status from BW {1} for granule [{2} - {3}) since BW {4} owns "
"[{5} - {6}).\n",
bmData->epoch,
bwInterf.id().toString().substr(0, 5),
rep.granuleRange.begin.printable(),
rep.granuleRange.end.printable(),
currGranuleAssignment.cvalue().toString().substr(0, 5),
currGranuleAssignment.begin().printable(),
currGranuleAssignment.end().printable());
}
// FIXME: could send revoke request
continue;
}
// FIXME: We will need to go over all splits in the range once we're doing merges, instead of first one
auto lastSplitEval = bmData->splitEvaluations.rangeContaining(rep.granuleRange.begin);
if (rep.granuleRange.begin == lastSplitEval.begin() && rep.granuleRange.end == lastSplitEval.end() &&
@ -1386,46 +1374,67 @@ ACTOR Future<Void> monitorBlobWorkerStatus(Reference<BlobManagerData> bmData, Bl
rep.granuleRange.begin.printable(),
rep.granuleRange.end.printable());
}
} else {
ASSERT(lastSplitEval.cvalue().epoch < rep.epoch ||
(lastSplitEval.cvalue().epoch == rep.epoch && lastSplitEval.cvalue().seqno < rep.seqno));
if (lastSplitEval.cvalue().inProgress.isValid() && !lastSplitEval.cvalue().inProgress.isReady()) {
TEST(true); // racing BM splits
// For example, one worker asked BM to split, then died, granule was moved, new worker asks to
// split on recovery. We need to ensure that they are semantically the same split.
// We will just rely on the in-progress split to finish
if (BM_DEBUG) {
fmt::print("Manager {0} got split request for [{1} - {2}) @ ({3}, {4}), but already in "
"progress from [{5} - {6}) @ ({7}, {8})\n",
bmData->epoch,
rep.granuleRange.begin.printable().c_str(),
rep.granuleRange.end.printable().c_str(),
rep.epoch,
rep.seqno,
lastSplitEval.begin().printable().c_str(),
lastSplitEval.end().printable().c_str(),
lastSplitEval.cvalue().epoch,
lastSplitEval.cvalue().seqno);
}
// ignore the request, they will retry
} else {
if (BM_DEBUG) {
fmt::print("Manager {0} evaluating [{1} - {2}) @ ({3}, {4}) for split\n",
bmData->epoch,
rep.granuleRange.begin.printable().c_str(),
rep.granuleRange.end.printable().c_str(),
rep.epoch,
rep.seqno);
}
Future<Void> doSplitEval = maybeSplitRange(bmData,
bwInterf.id(),
rep.granuleRange,
rep.granuleID,
rep.startVersion,
rep.writeHotSplit);
bmData->splitEvaluations.insert(rep.granuleRange,
SplitEvaluation(rep.epoch, rep.seqno, doSplitEval));
} else if (!(lastSplitEval.cvalue().epoch < rep.epoch ||
(lastSplitEval.cvalue().epoch == rep.epoch && lastSplitEval.cvalue().seqno < rep.seqno))) {
TEST(true); // BM got out-of-date split request
if (BM_DEBUG) {
fmt::print(
"Manager {0} ignoring status from BW {1} for granule [{2} - {3}) since it already processed"
"[{4} - {5}) @ ({6}, {7}).\n",
bmData->epoch,
bwInterf.id().toString().substr(0, 5),
rep.granuleRange.begin.printable(),
rep.granuleRange.end.printable(),
lastSplitEval.begin().printable(),
lastSplitEval.end().printable(),
lastSplitEval.cvalue().epoch,
lastSplitEval.cvalue().seqno);
}
// revoke range from out-of-date worker, but bypass rangeAssigner and hack (epoch, seqno) to be
// (requesting epoch, requesting seqno + 1) to ensure no race with then reassigning the range to the
// worker at a later version
RangeAssignment revokeOld;
revokeOld.isAssign = false;
revokeOld.worker = bwInterf.id();
revokeOld.keyRange = rep.granuleRange;
revokeOld.revoke = RangeRevokeData(false);
bmData->addActor.send(
doRangeAssignment(bmData, revokeOld, bwInterf.id(), rep.epoch, rep.seqno + 1));
} else if (lastSplitEval.cvalue().inProgress.isValid() &&
!lastSplitEval.cvalue().inProgress.isReady()) {
TEST(true); // racing BM splits
// For example, one worker asked BM to split, then died, granule was moved, new worker asks to
// split on recovery. We need to ensure that they are semantically the same split.
// We will just rely on the in-progress split to finish
if (BM_DEBUG) {
fmt::print("Manager {0} got split request for [{1} - {2}) @ ({3}, {4}), but already in "
"progress from [{5} - {6}) @ ({7}, {8})\n",
bmData->epoch,
rep.granuleRange.begin.printable().c_str(),
rep.granuleRange.end.printable().c_str(),
rep.epoch,
rep.seqno,
lastSplitEval.begin().printable().c_str(),
lastSplitEval.end().printable().c_str(),
lastSplitEval.cvalue().epoch,
lastSplitEval.cvalue().seqno);
}
// ignore the request, they will retry
} else {
if (BM_DEBUG) {
fmt::print("Manager {0} evaluating [{1} - {2}) @ ({3}, {4}) for split\n",
bmData->epoch,
rep.granuleRange.begin.printable().c_str(),
rep.granuleRange.end.printable().c_str(),
rep.epoch,
rep.seqno);
}
Future<Void> doSplitEval = maybeSplitRange(
bmData, bwInterf.id(), rep.granuleRange, rep.granuleID, rep.startVersion, rep.writeHotSplit);
bmData->splitEvaluations.insert(rep.granuleRange,
SplitEvaluation(rep.epoch, rep.seqno, doSplitEval));
}
}
} catch (Error& e) {
@ -2160,23 +2169,84 @@ ACTOR Future<GranuleFiles> loadHistoryFiles(Reference<BlobManagerData> bmData, U
}
}
// FIXME: trace events for pruning
// FIXME: trace events for purging
// Blocks until it is safe to fully delete granule `granuleId`'s files. While a granule
// is splitting, its last snapshot + deltas are still needed by the child granules to
// produce their own first snapshot (see the comment in fullyDeleteGranule), so this
// actor polls the granule's split-state entries until every child has persisted one.
// NOTE(review): assumes split-state keys for this granule enumerate its children —
// consistent with the decode below, but confirm against the key-space definition.
ACTOR Future<Void> canDeleteFullGranule(Reference<BlobManagerData> self, UID granuleId) {
    state Transaction tr(self->db);
    // Key range holding the split-state entries (parent -> child) for this granule.
    state KeyRange splitRange = blobGranuleSplitKeyRangeFor(granuleId);

    loop {
        try {
            tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
            // Read one page of split-state entries (bounded by the max split fanout).
            state RangeResult splitState = wait(tr.getRange(splitRange, SERVER_KNOBS->BG_MAX_SPLIT_FANOUT));
            state int i = 0;
            state bool retry = false;
            for (; i < splitState.size(); i++) {
                UID parent, child;
                BlobGranuleSplitState st;
                Version v;
                std::tie(parent, child) = decodeBlobGranuleSplitKey(splitState[i].key);
                std::tie(st, v) = decodeBlobGranuleSplitValue(splitState[i].value);
                // if split state is done, this granule has definitely persisted a snapshot
                if (st >= BlobGranuleSplitState::Done) {
                    continue;
                }
                // if split state isn't even assigned, this granule has definitely not persisted a snapshot
                if (st <= BlobGranuleSplitState::Initialized) {
                    retry = true;
                    break;
                }

                ASSERT(st == BlobGranuleSplitState::Assigned);
                // if assigned, granule may or may not have snapshotted. Check files to confirm. Since a re-snapshot is
                // the first file written for a new granule, any files present mean it has re-snapshotted from this
                // granule
                KeyRange granuleFileRange = blobGranuleFileKeyRangeFor(child);
                RangeResult files = wait(tr.getRange(granuleFileRange, 1));
                if (files.empty()) {
                    retry = true;
                    break;
                }
            }
            if (retry) {
                // Some child hasn't snapshotted yet: back off and re-check with a fresh
                // transaction (reset so the retry reads at a newer version).
                tr.reset();
                wait(delay(1.0));
            } else {
                // No blocker found in this page; if there are no further pages, deletion is safe.
                if (splitState.empty() || !splitState.more) {
                    break;
                }
                // Otherwise continue the scan from just past the last key of this page.
                splitRange = KeyRangeRef(keyAfter(splitState.back().key), splitRange.end);
            }
        } catch (Error& e) {
            wait(tr.onError(e));
        }
    }
    return Void();
}
/*
* Deletes all files pertaining to the granule with id granuleId and
* also removes the history entry for this granule from the system keyspace
* TODO: ensure cannot fully delete granule that is still splitting!
*/
ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self, UID granuleId, Key historyKey) {
ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
UID granuleId,
Key historyKey,
Version purgeVersion) {
if (BM_DEBUG) {
fmt::print("Fully deleting granule {0}: init\n", granuleId.toString());
}
// if granule is still splitting and files are needed for new sub-granules to re-snapshot, we can only partially
// delete the granule, since we need to keep the last snapshot and deltas for splitting
wait(canDeleteFullGranule(self, granuleId));
// get files
GranuleFiles files = wait(loadHistoryFiles(self->db, granuleId));
std::vector<Future<Void>> deletions;
std::vector<std::string> filesToDelete; // TODO: remove, just for debugging
state std::vector<std::string> filesToDelete; // TODO: remove, just for debugging
for (auto snapshotFile : files.snapshotFiles) {
std::string fname = snapshotFile.filename;
@ -2191,7 +2261,7 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self, UID granu
}
if (BM_DEBUG) {
fmt::print("Fully deleting granule {0}: deleting {1} files\n", granuleId.toString(), deletions.size());
fmt::print("Fully deleting granule {0}: deleting {1} files\n", granuleId.toString(), filesToDelete.size());
for (auto filename : filesToDelete) {
fmt::print(" - {}\n", filename.c_str());
}
@ -2228,18 +2298,27 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self, UID granu
fmt::print("Fully deleting granule {0}: success\n", granuleId.toString());
}
TraceEvent("GranuleFullPurge", self->id)
.detail("Epoch", self->epoch)
.detail("GranuleID", granuleId)
.detail("PurgeVersion", purgeVersion)
.detail("FilesPurged", filesToDelete.size());
++self->stats.granulesFullyPurged;
self->stats.filesPurged += filesToDelete.size();
return Void();
}
/*
* For the granule with id granuleId, finds the first snapshot file at a
* version <= pruneVersion and deletes all files older than it.
* version <= purgeVersion and deletes all files older than it.
*
* Assumption: this granule's startVersion might change because the first snapshot
* file might be deleted. We will need to ensure we don't rely on the granule's startVersion
* (that's persisted as part of the key), but rather use the granule's first snapshot's version when needed
*/
ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID granuleId, Version pruneVersion) {
ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID granuleId, Version purgeVersion) {
if (BM_DEBUG) {
fmt::print("Partially deleting granule {0}: init\n", granuleId.toString());
}
@ -2247,7 +2326,7 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID g
// get files
GranuleFiles files = wait(loadHistoryFiles(self->db, granuleId));
// represents the version of the latest snapshot file in this granule with G.version < pruneVersion
// represents the version of the latest snapshot file in this granule with G.version < purgeVersion
Version latestSnapshotVersion = invalidVersion;
state std::vector<Future<Void>> deletions; // deletion work per file
@ -2262,8 +2341,8 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID g
deletions.emplace_back(self->bstore->deleteFile(fname));
deletedFileKeys.emplace_back(blobGranuleFileKeyFor(granuleId, files.snapshotFiles[idx].version, 'S'));
filesToDelete.emplace_back(fname);
} else if (files.snapshotFiles[idx].version <= pruneVersion) {
// otherwise if this is the FIRST snapshot file with version < pruneVersion,
} else if (files.snapshotFiles[idx].version <= purgeVersion) {
// otherwise if this is the FIRST snapshot file with version < purgeVersion,
// then we found our latestSnapshotVersion (FIRST since we are traversing in reverse)
latestSnapshotVersion = files.snapshotFiles[idx].version;
}
@ -2289,19 +2368,19 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID g
}
if (BM_DEBUG) {
fmt::print("Partially deleting granule {0}: deleting {1} files\n", granuleId.toString(), deletions.size());
fmt::print("Partially deleting granule {0}: deleting {1} files\n", granuleId.toString(), filesToDelete.size());
for (auto filename : filesToDelete) {
fmt::print(" - {0}\n", filename);
}
}
// TODO: the following comment relies on the assumption that BWs will not get requests to
// read data that was already pruned. confirm assumption is fine. otherwise, we'd need
// to communicate with BWs here and have them ack the pruneVersion
// read data that was already purged. confirm assumption is fine. otherwise, we'd need
// to communicate with BWs here and have them ack the purgeVersion
// delete the files before the corresponding metadata.
// this could lead to dangling pointers in fdb, but we should never read data older than
// pruneVersion anyways, and we can clean up the keys the next time around.
// purgeVersion anyways, and we can clean up the keys the next time around.
// deleting files before corresponding metadata reduces the # of orphaned files.
wait(waitForAll(deletions));
@ -2329,26 +2408,41 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID g
if (BM_DEBUG) {
fmt::print("Partially deleting granule {0}: success\n", granuleId.toString());
}
TraceEvent("GranulePartialPurge", self->id)
.detail("Epoch", self->epoch)
.detail("GranuleID", granuleId)
.detail("PurgeVersion", purgeVersion)
.detail("FilesPurged", filesToDelete.size());
++self->stats.granulesPartiallyPurged;
self->stats.filesPurged += filesToDelete.size();
return Void();
}
/*
* This method is used to prune the range [startKey, endKey) at (and including) pruneVersion.
* This method is used to purge the range [startKey, endKey) at (and including) purgeVersion.
* To do this, we do a BFS traversal starting at the active granules. Then we classify granules
* in the history as nodes that can be fully deleted (i.e. their files and history can be deleted)
* and nodes that can be partially deleted (i.e. some of their files can be deleted).
* Once all this is done, we finally clear the pruneIntent key, if possible, to indicate we are done
* processing this prune intent.
* Once all this is done, we finally clear the purgeIntent key, if possible, to indicate we are done
* processing this purge intent.
*/
ACTOR Future<Void> pruneRange(Reference<BlobManagerData> self, KeyRangeRef range, Version pruneVersion, bool force) {
ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range, Version purgeVersion, bool force) {
if (BM_DEBUG) {
fmt::print("pruneRange starting for range [{0} - {1}) @ pruneVersion={2}, force={3}\n",
fmt::print("purgeRange starting for range [{0} - {1}) @ purgeVersion={2}, force={3}\n",
range.begin.printable(),
range.end.printable(),
pruneVersion,
purgeVersion,
force);
}
TraceEvent("PurgeGranulesBegin", self->id)
.detail("Epoch", self->epoch)
.detail("Range", range)
.detail("PurgeVersion", purgeVersion)
.detail("Force", force);
// queue of <range, startVersion, endVersion> for BFS traversal of history
state std::queue<std::tuple<KeyRange, Version, Version>> historyEntryQueue;
@ -2371,18 +2465,18 @@ ACTOR Future<Void> pruneRange(Reference<BlobManagerData> self, KeyRangeRef range
state KeyRangeMap<UID>::iterator activeRange;
for (activeRange = activeRanges.begin(); activeRange != activeRanges.end(); ++activeRange) {
if (BM_DEBUG) {
fmt::print("Checking if active range [{0} - {1}), owned by BW {2}, should be pruned\n",
fmt::print("Checking if active range [{0} - {1}), owned by BW {2}, should be purged\n",
activeRange.begin().printable(),
activeRange.end().printable(),
activeRange.value().toString());
}
// assumption: prune boundaries must respect granule boundaries
// assumption: purge boundaries must respect granule boundaries
if (activeRange.begin() < range.begin || activeRange.end() > range.end) {
continue;
}
// TODO: if this is a force prune, then revoke the assignment from the corresponding BW first
// TODO: if this is a force purge, then revoke the assignment from the corresponding BW first
// so that it doesn't try to interact with the granule (i.e. force it to give up gLock).
// we'll need some way to ack that the revoke was successful
@ -2456,17 +2550,17 @@ ACTOR Future<Void> pruneRange(Reference<BlobManagerData> self, KeyRangeRef range
}
// There are three cases this granule can fall into:
// - if the granule's end version is at or before the prune version or this is a force delete,
// - if the granule's end version is at or before the purge version or this is a force delete,
// this granule should be completely deleted
// - else if the startVersion <= pruneVersion, then G.startVersion < pruneVersion < G.endVersion
// - else if the startVersion <= purgeVersion, then G.startVersion < purgeVersion < G.endVersion
// and so this granule should be partially deleted
// - otherwise, this granule is active, so don't schedule it for deletion
if (force || endVersion <= pruneVersion) {
if (force || endVersion <= purgeVersion) {
if (BM_DEBUG) {
fmt::print("Granule {0} will be FULLY deleted\n", currHistoryNode.granuleID.toString());
}
toFullyDelete.push_back({ currHistoryNode.granuleID, historyKey });
} else if (startVersion < pruneVersion) {
} else if (startVersion < purgeVersion) {
if (BM_DEBUG) {
fmt::print("Granule {0} will be partially deleted\n", currHistoryNode.granuleID.toString());
}
@ -2513,70 +2607,79 @@ ACTOR Future<Void> pruneRange(Reference<BlobManagerData> self, KeyRangeRef range
// we won't run into any issues with trying to "re-delete" a blob file since deleting
// a file that doesn't exist is considered successful
state std::vector<Future<Void>> partialDeletions;
state int i;
if (BM_DEBUG) {
fmt::print("{0} granules to fully delete\n", toFullyDelete.size());
}
for (i = toFullyDelete.size() - 1; i >= 0; --i) {
UID granuleId;
state UID granuleId;
Key historyKey;
std::tie(granuleId, historyKey) = toFullyDelete[i];
// FIXME: consider batching into a single txn (need to take care of txn size limit)
if (BM_DEBUG) {
fmt::print("About to fully delete granule {0}\n", granuleId.toString());
}
wait(fullyDeleteGranule(self, granuleId, historyKey));
wait(fullyDeleteGranule(self, granuleId, historyKey, purgeVersion));
}
if (BM_DEBUG) {
fmt::print("{0} granules to partially delete\n", toPartiallyDelete.size());
}
std::vector<Future<Void>> partialDeletions;
for (i = toPartiallyDelete.size() - 1; i >= 0; --i) {
UID granuleId = toPartiallyDelete[i];
if (BM_DEBUG) {
fmt::print("About to partially delete granule {0}\n", granuleId.toString());
}
partialDeletions.emplace_back(partiallyDeleteGranule(self, granuleId, pruneVersion));
partialDeletions.emplace_back(partiallyDeleteGranule(self, granuleId, purgeVersion));
}
wait(waitForAll(partialDeletions));
// Now that all the necessary granules and their files have been deleted, we can
// clear the pruneIntent key to signify that the work is done. However, there could have been
// another pruneIntent that got written for this table while we were processing this one.
// clear the purgeIntent key to signify that the work is done. However, there could have been
// another purgeIntent that got written for this table while we were processing this one.
// If that is the case, we should not clear the key. Otherwise, we can just clear the key.
if (BM_DEBUG) {
fmt::print("Successfully pruned range [{0} - {1}) at pruneVersion={2}\n",
fmt::print("Successfully purged range [{0} - {1}) at purgeVersion={2}\n",
range.begin.printable(),
range.end.printable(),
pruneVersion);
purgeVersion);
}
TraceEvent("PurgeGranulesComplete", self->id)
.detail("Epoch", self->epoch)
.detail("Range", range)
.detail("PurgeVersion", purgeVersion)
.detail("Force", force);
++self->stats.purgesProcessed;
return Void();
}
/*
* This monitor watches for changes to a key K that gets updated whenever there is a new prune intent.
* On this change, we scan through all blobGranulePruneKeys (which look like <startKey, endKey>=<prune_version,
* force>) and prune any intents.
* This monitor watches for changes to a key K that gets updated whenever there is a new purge intent.
* On this change, we scan through all blobGranulePurgeKeys (which look like <startKey, endKey>=<purge_version,
* force>) and purge any intents.
*
* Once the prune has succeeded, we clear the key IF the version is still the same one that was pruned.
* That way, if another prune intent arrived for the same range while we were working on an older one,
* Once the purge has succeeded, we clear the key IF the version is still the same one that was purged.
* That way, if another purge intent arrived for the same range while we were working on an older one,
* we wouldn't end up clearing the intent.
*
* When watching for changes, we might end up in scenarios where we failed to do the work
* for a prune intent even though the watch was triggered (maybe the BM had a blip). This is problematic
* if the intent is a force and there isn't another prune intent for quite some time. To remedy this,
* if we don't see a watch change in X (configurable) seconds, we will just sweep through the prune intents,
* for a purge intent even though the watch was triggered (maybe the BM had a blip). This is problematic
* if the intent is a force and there isn't another purge intent for quite some time. To remedy this,
* if we don't see a watch change in X (configurable) seconds, we will just sweep through the purge intents,
* consolidating any work we might have missed before.
*
* Note: we could potentially use a changefeed here to get the exact pruneIntent that was added
* Note: we could potentially use a changefeed here to get the exact purgeIntent that was added
* rather than iterating through all of them, but this might have too much overhead for latency
* improvements we don't really need here (also we need to go over all prune intents anyways in the
* case that the timer is up before any new prune intents arrive).
* improvements we don't really need here (also we need to go over all purge intents anyways in the
* case that the timer is up before any new purge intents arrive).
*/
ACTOR Future<Void> monitorPruneKeys(Reference<BlobManagerData> self) {
ACTOR Future<Void> monitorPurgeKeys(Reference<BlobManagerData> self) {
self->initBStore();
loop {
@ -2585,35 +2688,35 @@ ACTOR Future<Void> monitorPruneKeys(Reference<BlobManagerData> self) {
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
// Wait for the watch to change, or some time to expire (whichever comes first)
// before checking through the prune intents. We write a UID into the change key value
// before checking through the purge intents. We write a UID into the change key value
// so that we can still recognize when the watch key has been changed while we weren't
// monitoring it
state Key lastPruneKey = blobGranulePruneKeys.begin;
state Key lastPurgeKey = blobGranulePurgeKeys.begin;
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
state std::vector<Future<Void>> prunes;
state CoalescedKeyRangeMap<std::pair<Version, bool>> pruneMap;
pruneMap.insert(allKeys, std::make_pair<Version, bool>(0, false));
state std::vector<Future<Void>> purges;
state CoalescedKeyRangeMap<std::pair<Version, bool>> purgeMap;
purgeMap.insert(allKeys, std::make_pair<Version, bool>(0, false));
try {
// TODO: replace 10000 with a knob
state RangeResult pruneIntents = wait(tr->getRange(blobGranulePruneKeys, BUGGIFY ? 1 : 10000));
if (pruneIntents.size()) {
state RangeResult purgeIntents = wait(tr->getRange(blobGranulePurgeKeys, BUGGIFY ? 1 : 10000));
if (purgeIntents.size()) {
int rangeIdx = 0;
for (; rangeIdx < pruneIntents.size(); ++rangeIdx) {
Version pruneVersion;
for (; rangeIdx < purgeIntents.size(); ++rangeIdx) {
Version purgeVersion;
KeyRange range;
bool force;
std::tie(pruneVersion, range, force) =
decodeBlobGranulePruneValue(pruneIntents[rangeIdx].value);
auto ranges = pruneMap.intersectingRanges(range);
std::tie(purgeVersion, range, force) =
decodeBlobGranulePurgeValue(purgeIntents[rangeIdx].value);
auto ranges = purgeMap.intersectingRanges(range);
bool foundConflict = false;
for (auto it : ranges) {
if ((it.value().second && !force && it.value().first < pruneVersion) ||
(!it.value().second && force && pruneVersion < it.value().first)) {
if ((it.value().second && !force && it.value().first < purgeVersion) ||
(!it.value().second && force && purgeVersion < it.value().first)) {
foundConflict = true;
break;
}
@ -2621,39 +2724,41 @@ ACTOR Future<Void> monitorPruneKeys(Reference<BlobManagerData> self) {
if (foundConflict) {
break;
}
pruneMap.insert(range, std::make_pair(pruneVersion, force));
purgeMap.insert(range, std::make_pair(purgeVersion, force));
fmt::print("about to prune range [{0} - {1}) @ {2}, force={3}\n",
range.begin.printable(),
range.end.printable(),
pruneVersion,
force ? "T" : "F");
if (BM_DEBUG) {
fmt::print("about to purge range [{0} - {1}) @ {2}, force={3}\n",
range.begin.printable(),
range.end.printable(),
purgeVersion,
force ? "T" : "F");
}
}
lastPruneKey = pruneIntents[rangeIdx - 1].key;
lastPurgeKey = purgeIntents[rangeIdx - 1].key;
for (auto it : pruneMap.ranges()) {
for (auto it : purgeMap.ranges()) {
if (it.value().first > 0) {
prunes.emplace_back(pruneRange(self, it.range(), it.value().first, it.value().second));
purges.emplace_back(purgeRange(self, it.range(), it.value().first, it.value().second));
}
}
// wait for this set of prunes to complete before starting the next ones since if we
// prune a range R at version V and while we are doing that, the time expires, we will
// end up trying to prune the same range again since the work isn't finished and the
// prunes will race
// wait for this set of purges to complete before starting the next ones since if we
// purge a range R at version V and while we are doing that, the time expires, we will
// end up trying to purge the same range again since the work isn't finished and the
// purges will race
//
// TODO: this isn't that efficient though. Instead we could keep metadata as part of the
// BM's memory that tracks which prunes are active. Once done, we can mark that work as
// done. If the BM fails then all prunes will fail and so the next BM will have a clear
// BM's memory that tracks which purges are active. Once done, we can mark that work as
// done. If the BM fails then all purges will fail and so the next BM will have a clear
// set of metadata (i.e. no work in progress) so we will end up doing the work in the
// new BM
wait(waitForAll(prunes));
wait(waitForAll(purges));
break;
} else {
state Future<Void> watchPruneIntentsChange = tr->watch(blobGranulePruneChangeKey);
state Future<Void> watchPurgeIntentsChange = tr->watch(blobGranulePurgeChangeKey);
wait(tr->commit());
wait(watchPruneIntentsChange);
wait(watchPurgeIntentsChange);
tr->reset();
}
} catch (Error& e) {
@ -2666,7 +2771,7 @@ ACTOR Future<Void> monitorPruneKeys(Reference<BlobManagerData> self) {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr->clear(KeyRangeRef(blobGranulePruneKeys.begin, keyAfter(lastPruneKey)));
tr->clear(KeyRangeRef(blobGranulePurgeKeys.begin, keyAfter(lastPurgeKey)));
wait(tr->commit());
break;
} catch (Error& e) {
@ -2675,7 +2780,7 @@ ACTOR Future<Void> monitorPruneKeys(Reference<BlobManagerData> self) {
}
if (BM_DEBUG) {
printf("Done pruning current set of prune intents.\n");
printf("Done clearing current set of purge intents.\n");
}
}
}
@ -2876,7 +2981,7 @@ ACTOR Future<Void> blobManager(BlobManagerInterface bmInterf,
self->addActor.send(doLockChecks(self));
self->addActor.send(monitorClientRanges(self));
self->addActor.send(monitorPruneKeys(self));
self->addActor.send(monitorPurgeKeys(self));
if (SERVER_KNOBS->BG_CONSISTENCY_CHECK_ENABLED) {
self->addActor.send(bgConsistencyCheck(self));
}

View File

@ -86,6 +86,7 @@ struct GranuleMetadata : NonCopyable, ReferenceCounted<GranuleMetadata> {
NotifiedVersion durableSnapshotVersion; // same as delta vars, except for snapshots
Version pendingSnapshotVersion = 0;
Version initialSnapshotVersion = invalidVersion;
Version historyVersion = invalidVersion;
Version knownCommittedVersion;
int64_t originalEpoch;
@ -756,7 +757,11 @@ ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData>
bytesRead);
}
state Error err = e;
wait(tr->onError(e));
if (e.code() == error_code_server_overloaded) {
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
} else {
wait(tr->onError(e));
}
retries++;
TEST(true); // Granule initial snapshot failed
// FIXME: why can't we supress error event?
@ -935,13 +940,8 @@ ACTOR Future<BlobFileIndex> checkSplitAndReSnapshot(Reference<BlobWorkerData> bw
break;
}
bwData->currentManagerStatusStream.get().send(GranuleStatusReply(metadata->keyRange,
true,
writeHot,
statusEpoch,
statusSeqno,
granuleID,
metadata->initialSnapshotVersion));
bwData->currentManagerStatusStream.get().send(GranuleStatusReply(
metadata->keyRange, true, writeHot, statusEpoch, statusSeqno, granuleID, metadata->historyVersion));
break;
} catch (Error& e) {
if (e.code() == error_code_operation_cancelled) {
@ -1037,10 +1037,14 @@ static void handleCompletedDeltaFile(Reference<BlobWorkerData> bwData,
// if we get an i/o error updating files, or a rollback, reassign the granule to ourselves and start fresh
static bool granuleCanRetry(const Error& e) {
switch (e.code()) {
case error_code_please_reboot:
case error_code_io_error:
case error_code_io_timeout:
// FIXME: handle connection errors in tighter retry loop around individual files.
// FIXME: if these requests fail at a high enough rate, the whole worker should be marked as unhealthy and its
// granules should be moved away, as there may be some problem with this host contacting blob storage
case error_code_http_request_failed:
case error_code_connection_failed:
case error_code_lookup_failed: // dns
return true;
default:
return false;
@ -1119,10 +1123,15 @@ static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
}
metadata->pendingDeltaVersion = cfRollbackVersion;
if (BW_DEBUG) {
fmt::print("[{0} - {1}) rollback discarding all {2} in-memory mutations\n",
fmt::print("[{0} - {1}) rollback discarding all {2} in-memory mutations",
metadata->keyRange.begin.printable(),
metadata->keyRange.end.printable(),
metadata->currentDeltas.size());
if (metadata->currentDeltas.size()) {
fmt::print(
" {0} - {1}", metadata->currentDeltas.front().version, metadata->currentDeltas.back().version);
}
fmt::print("\n");
}
// discard all in-memory mutations
@ -1150,6 +1159,8 @@ static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
// FIXME: could binary search?
int mIdx = metadata->currentDeltas.size() - 1;
Version firstDiscarded = invalidVersion;
Version lastDiscarded = invalidVersion;
while (mIdx >= 0) {
if (metadata->currentDeltas[mIdx].version <= rollbackVersion) {
break;
@ -1157,19 +1168,37 @@ static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
for (auto& m : metadata->currentDeltas[mIdx].mutations) {
metadata->bufferedDeltaBytes -= m.totalSize();
}
if (firstDiscarded == invalidVersion) {
firstDiscarded = metadata->currentDeltas[mIdx].version;
}
lastDiscarded = metadata->currentDeltas[mIdx].version;
mIdx--;
}
mIdx++;
if (BW_DEBUG) {
fmt::print("[{0} - {1}) rollback discarding {2} in-memory mutations, {3} mutations and {4} bytes left\n",
fmt::print("[{0} - {1}) rollback discarding {2} in-memory mutations",
metadata->keyRange.begin.printable(),
metadata->keyRange.end.printable(),
metadata->currentDeltas.size() - mIdx,
mIdx,
metadata->bufferedDeltaBytes);
metadata->currentDeltas.size() - mIdx - 1);
if (firstDiscarded != invalidVersion) {
fmt::print(" {0} - {1}", lastDiscarded, firstDiscarded);
}
fmt::print(", {0} mutations", mIdx);
if (mIdx >= 0) {
fmt::print(
" ({0} - {1})", metadata->currentDeltas.front().version, metadata->currentDeltas[mIdx].version);
}
fmt::print(" and {0} bytes left\n", metadata->bufferedDeltaBytes);
}
metadata->currentDeltas.resize(metadata->currentDeltas.arena(), mIdx);
if (mIdx < 0) {
metadata->currentDeltas = Standalone<GranuleDeltas>();
metadata->bufferedDeltaBytes = 0;
} else {
metadata->currentDeltas.resize(metadata->currentDeltas.arena(), mIdx + 1);
}
// delete all deltas in rollback range, but we can optimize here to just skip the uncommitted mutations
// directly and immediately pop the rollback out of inProgress to completed
@ -1328,6 +1357,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
metadata->pendingSnapshotVersion = metadata->files.snapshotFiles.back().version;
metadata->durableSnapshotVersion.set(metadata->pendingSnapshotVersion);
metadata->initialSnapshotVersion = metadata->files.snapshotFiles.front().version;
metadata->historyVersion = startState.history.get().version;
} else {
if (startState.blobFilesToSnapshot.present()) {
startVersion = startState.previousDurableVersion;
@ -1350,6 +1380,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
}
metadata->initialSnapshotVersion = startVersion;
metadata->pendingSnapshotVersion = startVersion;
metadata->historyVersion = startState.history.present() ? startState.history.get().version : startVersion;
}
metadata->durableDeltaVersion.set(startVersion);
@ -1459,8 +1490,16 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
}
ASSERT(mutations.front().version > metadata->bufferedDeltaVersion);
// If this assert trips we should have gotten change_feed_popped from SS and didn't
ASSERT(mutations.front().version >= metadata->activeCFData.get()->popVersion);
// Rare race from merge cursor where no individual server detected popped in their response
if (mutations.front().version < metadata->activeCFData.get()->popVersion) {
TEST(true); // Blob Worker detected popped instead of change feed
TraceEvent("BlobWorkerChangeFeedPopped", bwData->id)
.detail("Granule", metadata->keyRange)
.detail("GranuleID", startState.granuleID)
.detail("MutationVersion", mutations.front().version)
.detail("PopVersion", metadata->activeCFData.get()->popVersion);
throw change_feed_popped();
}
}
when(wait(inFlightFiles.empty() ? Never() : success(inFlightFiles.front().future))) {}
}
@ -1623,6 +1662,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
metadata->activeCFData.set(cfData);
justDidRollback = true;
lastDeltaVersion = cfRollbackVersion;
break;
}
}
@ -1841,6 +1881,12 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
}
}
} catch (Error& e) {
if (BW_DEBUG) {
fmt::print("Granule file updater for [{0} - {1}) got error {2}, exiting\n",
metadata->keyRange.begin.printable(),
metadata->keyRange.end.printable(),
e.name());
}
// Free last change feed data
metadata->activeCFData.set(Reference<ChangeFeedData>());
@ -1871,12 +1917,6 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
return Void();
}
++bwData->stats.granuleUpdateErrors;
if (BW_DEBUG) {
fmt::print("Granule file updater for [{0} - {1}) got error {2}, exiting\n",
metadata->keyRange.begin.printable(),
metadata->keyRange.end.printable(),
e.name());
}
if (granuleCanRetry(e)) {
TEST(true); // Granule close and re-open on error
@ -2002,6 +2042,14 @@ ACTOR Future<Void> blobGranuleLoadHistory(Reference<BlobWorkerData> bwData,
int skipped = historyEntryStack.size() - 1 - i;
while (i >= 0) {
auto intersectingRanges = bwData->granuleHistory.intersectingRanges(historyEntryStack[i]->range);
std::vector<std::pair<KeyRange, Reference<GranuleHistoryEntry>>> newerHistory;
for (auto& r : intersectingRanges) {
if (r.value().isValid() && r.value()->endVersion >= historyEntryStack[i]->endVersion) {
newerHistory.push_back(std::make_pair(r.range(), r.value()));
}
}
auto prevRanges = bwData->granuleHistory.rangeContaining(historyEntryStack[i]->range.begin);
if (prevRanges.value().isValid() &&
@ -2012,6 +2060,9 @@ ACTOR Future<Void> blobGranuleLoadHistory(Reference<BlobWorkerData> bwData,
}
bwData->granuleHistory.insert(historyEntryStack[i]->range, historyEntryStack[i]);
for (auto& it : newerHistory) {
bwData->granuleHistory.insert(it.first, it.second);
}
i--;
}
@ -2137,7 +2188,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
if (req.beginVersion > 0) {
fmt::print("{0} - {1}\n", req.beginVersion, req.readVersion);
} else {
fmt::print("{}", req.readVersion);
fmt::print("{}\n", req.readVersion);
}
}
@ -2210,7 +2261,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
state KeyRange chunkRange;
state GranuleFiles chunkFiles;
if (metadata->initialSnapshotVersion > req.readVersion) {
if (req.readVersion < metadata->historyVersion) {
TEST(true); // Granule Time Travel Read
// this is a time travel query, find previous granule
if (metadata->historyLoaded.canBeSet()) {
@ -2226,7 +2277,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
Reference<GranuleHistoryEntry> cur = bwData->granuleHistory.rangeContaining(historySearchKey).value();
// FIXME: use skip pointers here
Version expectedEndVersion = metadata->initialSnapshotVersion;
Version expectedEndVersion = metadata->historyVersion;
if (cur.isValid()) {
ASSERT(cur->endVersion == expectedEndVersion);
}
@ -2269,17 +2320,22 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
}
if (chunkFiles.snapshotFiles.empty()) {
// a snapshot file must have been pruned
// a snapshot file must have been purged
throw blob_granule_transaction_too_old();
}
ASSERT(!chunkFiles.deltaFiles.empty());
ASSERT(chunkFiles.deltaFiles.back().version > req.readVersion);
if (chunkFiles.snapshotFiles.front().version > req.readVersion) {
// a snapshot file must have been pruned
// a snapshot file must have been purged
throw blob_granule_transaction_too_old();
}
} else {
if (req.readVersion < metadata->initialSnapshotVersion) {
// a snapshot file must have been pruned
throw blob_granule_transaction_too_old();
}
TEST(true); // Granule Active Read
// this is an active granule query
loop {
@ -2287,7 +2343,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
throw wrong_shard_server();
}
Future<Void> waitForVersionFuture = waitForVersion(metadata, req.readVersion);
if (waitForVersionFuture.isReady()) {
if (waitForVersionFuture.isReady() && !waitForVersionFuture.isError()) {
// didn't wait, so no need to check rollback stuff
break;
}

View File

@ -2410,24 +2410,26 @@ ACTOR Future<Void> workerHealthMonitor(ClusterControllerData* self) {
wait(lowPriorityDelay(SERVER_KNOBS->CC_WORKER_HEALTH_CHECKING_INTERVAL));
}
self->degradedServers = self->getServersWithDegradedLink();
self->degradationInfo = self->getDegradationInfo();
// Compare `self->degradedServers` with `self->excludedDegradedServers` and remove those that have
// recovered.
for (auto it = self->excludedDegradedServers.begin(); it != self->excludedDegradedServers.end();) {
if (self->degradedServers.find(*it) == self->degradedServers.end()) {
if (self->degradationInfo.degradedServers.find(*it) == self->degradationInfo.degradedServers.end()) {
self->excludedDegradedServers.erase(it++);
} else {
++it;
}
}
if (!self->degradedServers.empty()) {
if (!self->degradationInfo.degradedServers.empty() || self->degradationInfo.degradedSatellite) {
std::string degradedServerString;
for (const auto& server : self->degradedServers) {
for (const auto& server : self->degradationInfo.degradedServers) {
degradedServerString += server.toString() + " ";
}
TraceEvent("ClusterControllerHealthMonitor").detail("DegradedServers", degradedServerString);
TraceEvent("ClusterControllerHealthMonitor")
.detail("DegradedServers", degradedServerString)
.detail("DegradedSatellite", self->degradationInfo.degradedSatellite);
// Check if the cluster controller should trigger a recovery to exclude any degraded servers from
// the transaction system.
@ -2435,7 +2437,7 @@ ACTOR Future<Void> workerHealthMonitor(ClusterControllerData* self) {
if (SERVER_KNOBS->CC_HEALTH_TRIGGER_RECOVERY) {
if (self->recentRecoveryCountDueToHealth() < SERVER_KNOBS->CC_MAX_HEALTH_RECOVERY_COUNT) {
self->recentHealthTriggeredRecoveryTime.push(now());
self->excludedDegradedServers = self->degradedServers;
self->excludedDegradedServers = self->degradationInfo.degradedServers;
TraceEvent("DegradedServerDetectedAndTriggerRecovery")
.detail("RecentRecoveryCountDueToHealth", self->recentRecoveryCountDueToHealth());
self->db.forceMasterFailure.trigger();
@ -2784,7 +2786,7 @@ TEST_CASE("/fdbserver/clustercontroller/updateRecoveredWorkers") {
return Void();
}
TEST_CASE("/fdbserver/clustercontroller/getServersWithDegradedLink") {
TEST_CASE("/fdbserver/clustercontroller/getDegradationInfo") {
// Create a testing ClusterControllerData. Most of the internal states do not matter in this test.
ClusterControllerData data(ClusterControllerFullInterface(),
LocalityData(),
@ -2800,18 +2802,18 @@ TEST_CASE("/fdbserver/clustercontroller/getServersWithDegradedLink") {
// cluster controller.
{
data.workerHealth[worker].degradedPeers[badPeer1] = { now(), now() };
ASSERT(data.getServersWithDegradedLink().empty());
ASSERT(data.getDegradationInfo().degradedServers.empty());
data.workerHealth.clear();
}
// Test that when there is only one reported degraded link, getServersWithDegradedLink can return correct
// Test that when there is only one reported degraded link, getDegradationInfo can return correct
// degraded server.
{
data.workerHealth[worker].degradedPeers[badPeer1] = { now() - SERVER_KNOBS->CC_MIN_DEGRADATION_INTERVAL - 1,
now() };
auto degradedServers = data.getServersWithDegradedLink();
ASSERT(degradedServers.size() == 1);
ASSERT(degradedServers.find(badPeer1) != degradedServers.end());
auto degradationInfo = data.getDegradationInfo();
ASSERT(degradationInfo.degradedServers.size() == 1);
ASSERT(degradationInfo.degradedServers.find(badPeer1) != degradationInfo.degradedServers.end());
data.workerHealth.clear();
}
@ -2821,10 +2823,10 @@ TEST_CASE("/fdbserver/clustercontroller/getServersWithDegradedLink") {
now() };
data.workerHealth[badPeer1].degradedPeers[worker] = { now() - SERVER_KNOBS->CC_MIN_DEGRADATION_INTERVAL - 1,
now() };
auto degradedServers = data.getServersWithDegradedLink();
ASSERT(degradedServers.size() == 1);
ASSERT(degradedServers.find(worker) != degradedServers.end() ||
degradedServers.find(badPeer1) != degradedServers.end());
auto degradationInfo = data.getDegradationInfo();
ASSERT(degradationInfo.degradedServers.size() == 1);
ASSERT(degradationInfo.degradedServers.find(worker) != degradationInfo.degradedServers.end() ||
degradationInfo.degradedServers.find(badPeer1) != degradationInfo.degradedServers.end());
data.workerHealth.clear();
}
@ -2839,9 +2841,9 @@ TEST_CASE("/fdbserver/clustercontroller/getServersWithDegradedLink") {
now() };
data.workerHealth[badPeer2].degradedPeers[worker] = { now() - SERVER_KNOBS->CC_MIN_DEGRADATION_INTERVAL - 1,
now() };
auto degradedServers = data.getServersWithDegradedLink();
ASSERT(degradedServers.size() == 1);
ASSERT(degradedServers.find(worker) != degradedServers.end());
auto degradationInfo = data.getDegradationInfo();
ASSERT(degradationInfo.degradedServers.size() == 1);
ASSERT(degradationInfo.degradedServers.find(worker) != degradationInfo.degradedServers.end());
data.workerHealth.clear();
}
@ -2856,7 +2858,7 @@ TEST_CASE("/fdbserver/clustercontroller/getServersWithDegradedLink") {
now() };
data.workerHealth[badPeer4].degradedPeers[worker] = { now() - SERVER_KNOBS->CC_MIN_DEGRADATION_INTERVAL - 1,
now() };
ASSERT(data.getServersWithDegradedLink().empty());
ASSERT(data.getDegradationInfo().degradedServers.empty());
data.workerHealth.clear();
}
@ -2880,7 +2882,7 @@ TEST_CASE("/fdbserver/clustercontroller/getServersWithDegradedLink") {
now() };
data.workerHealth[badPeer4].degradedPeers[worker] = { now() - SERVER_KNOBS->CC_MIN_DEGRADATION_INTERVAL - 1,
now() };
ASSERT(data.getServersWithDegradedLink().empty());
ASSERT(data.getDegradationInfo().degradedServers.empty());
data.workerHealth.clear();
}
@ -2977,42 +2979,42 @@ TEST_CASE("/fdbserver/clustercontroller/shouldTriggerRecoveryDueToDegradedServer
ASSERT(!data.shouldTriggerRecoveryDueToDegradedServers());
// Trigger recovery when master is degraded.
data.degradedServers.insert(master);
data.degradationInfo.degradedServers.insert(master);
ASSERT(data.shouldTriggerRecoveryDueToDegradedServers());
data.degradedServers.clear();
data.degradationInfo.degradedServers.clear();
// Trigger recovery when primary TLog is degraded.
data.degradedServers.insert(tlog);
data.degradationInfo.degradedServers.insert(tlog);
ASSERT(data.shouldTriggerRecoveryDueToDegradedServers());
data.degradedServers.clear();
data.degradationInfo.degradedServers.clear();
// No recovery when satellite Tlog is degraded.
data.degradedServers.insert(satelliteTlog);
data.degradationInfo.degradedServers.insert(satelliteTlog);
ASSERT(!data.shouldTriggerRecoveryDueToDegradedServers());
data.degradedServers.clear();
data.degradationInfo.degradedServers.clear();
// No recovery when remote tlog is degraded.
data.degradedServers.insert(remoteTlog);
data.degradationInfo.degradedServers.insert(remoteTlog);
ASSERT(!data.shouldTriggerRecoveryDueToDegradedServers());
data.degradedServers.clear();
data.degradationInfo.degradedServers.clear();
// No recovery when log router is degraded.
data.degradedServers.insert(logRouter);
data.degradationInfo.degradedServers.insert(logRouter);
ASSERT(!data.shouldTriggerRecoveryDueToDegradedServers());
data.degradedServers.clear();
data.degradationInfo.degradedServers.clear();
// No recovery when backup worker is degraded.
data.degradedServers.insert(backup);
data.degradationInfo.degradedServers.insert(backup);
ASSERT(!data.shouldTriggerRecoveryDueToDegradedServers());
data.degradedServers.clear();
data.degradationInfo.degradedServers.clear();
// Trigger recovery when proxy is degraded.
data.degradedServers.insert(proxy);
data.degradationInfo.degradedServers.insert(proxy);
ASSERT(data.shouldTriggerRecoveryDueToDegradedServers());
data.degradedServers.clear();
data.degradationInfo.degradedServers.clear();
// Trigger recovery when resolver is degraded.
data.degradedServers.insert(resolver);
data.degradationInfo.degradedServers.insert(resolver);
ASSERT(data.shouldTriggerRecoveryDueToDegradedServers());
return Void();
@ -3090,16 +3092,16 @@ TEST_CASE("/fdbserver/clustercontroller/shouldTriggerFailoverDueToDegradedServer
ASSERT(!data.shouldTriggerFailoverDueToDegradedServers());
// No failover when small number of degraded servers
data.degradedServers.insert(master);
data.degradationInfo.degradedServers.insert(master);
ASSERT(!data.shouldTriggerFailoverDueToDegradedServers());
data.degradedServers.clear();
data.degradationInfo.degradedServers.clear();
// Trigger failover when enough servers in the txn system are degraded.
data.degradedServers.insert(master);
data.degradedServers.insert(tlog);
data.degradedServers.insert(proxy);
data.degradedServers.insert(proxy2);
data.degradedServers.insert(resolver);
data.degradationInfo.degradedServers.insert(master);
data.degradationInfo.degradedServers.insert(tlog);
data.degradationInfo.degradedServers.insert(proxy);
data.degradationInfo.degradedServers.insert(proxy2);
data.degradationInfo.degradedServers.insert(resolver);
ASSERT(data.shouldTriggerFailoverDueToDegradedServers());
// No failover when usable region is 1.
@ -3108,18 +3110,29 @@ TEST_CASE("/fdbserver/clustercontroller/shouldTriggerFailoverDueToDegradedServer
data.db.config.usableRegions = 2;
// No failover when remote is also degraded.
data.degradedServers.insert(remoteTlog);
data.degradationInfo.degradedServers.insert(remoteTlog);
ASSERT(!data.shouldTriggerFailoverDueToDegradedServers());
data.degradedServers.clear();
data.degradationInfo.degradedServers.clear();
// No failover when some are not from transaction system
data.degradedServers.insert(NetworkAddress(IPAddress(0x13131313), 1));
data.degradedServers.insert(NetworkAddress(IPAddress(0x13131313), 2));
data.degradedServers.insert(NetworkAddress(IPAddress(0x13131313), 3));
data.degradedServers.insert(NetworkAddress(IPAddress(0x13131313), 4));
data.degradedServers.insert(NetworkAddress(IPAddress(0x13131313), 5));
data.degradationInfo.degradedServers.insert(NetworkAddress(IPAddress(0x13131313), 1));
data.degradationInfo.degradedServers.insert(NetworkAddress(IPAddress(0x13131313), 2));
data.degradationInfo.degradedServers.insert(NetworkAddress(IPAddress(0x13131313), 3));
data.degradationInfo.degradedServers.insert(NetworkAddress(IPAddress(0x13131313), 4));
data.degradationInfo.degradedServers.insert(NetworkAddress(IPAddress(0x13131313), 5));
ASSERT(!data.shouldTriggerFailoverDueToDegradedServers());
data.degradedServers.clear();
data.degradationInfo.degradedServers.clear();
// Trigger failover when satellite is degraded.
data.degradationInfo.degradedSatellite = true;
ASSERT(data.shouldTriggerFailoverDueToDegradedServers());
data.degradationInfo.degradedServers.clear();
// No failover when satellite is degraded, but remote is not healthy.
data.degradationInfo.degradedSatellite = true;
data.degradationInfo.degradedServers.insert(remoteTlog);
ASSERT(!data.shouldTriggerFailoverDueToDegradedServers());
data.degradationInfo.degradedServers.clear();
return Void();
}

View File

@ -2981,9 +2981,16 @@ public:
}
}
struct DegradationInfo {
std::unordered_set<NetworkAddress>
degradedServers; // The servers that the cluster controller is considered as degraded. The servers in this
// list are not excluded unless they are added to `excludedDegradedServers`.
bool degradedSatellite = false; // Indicates that the entire satellite DC is degraded.
};
// Returns a list of servers who are experiencing degraded links. These are candidates to perform exclusion. Note
// that only one endpoint of a bad link will be included in this list.
std::unordered_set<NetworkAddress> getServersWithDegradedLink() {
DegradationInfo getDegradationInfo() {
updateRecoveredWorkers();
// Build a map keyed by measured degraded peer. This map gives the info that who complains a particular server.
@ -3014,7 +3021,11 @@ public:
//
// For example, if server A is already considered as a degraded server, and A complains B, we won't add B as
// degraded since A is already considered as degraded.
//
// In the meantime, we also count the number of satellite workers got complained. If enough number of satellite
// workers are degraded, this may indicates that the whole network between primary and satellite is bad.
std::unordered_set<NetworkAddress> currentDegradedServers;
int satelliteBadServerCount = 0;
for (const auto& [complainerCount, badServer] : count2DegradedPeer) {
for (const auto& complainer : degradedLinkDst2Src[badServer]) {
if (currentDegradedServers.find(complainer) == currentDegradedServers.end()) {
@ -3022,23 +3033,36 @@ public:
break;
}
}
if (SERVER_KNOBS->CC_ENABLE_ENTIRE_SATELLITE_MONITORING &&
addressInDbAndPrimarySatelliteDc(badServer, db.serverInfo) &&
complainerCount >= SERVER_KNOBS->CC_SATELLITE_DEGRADATION_MIN_COMPLAINER) {
++satelliteBadServerCount;
}
}
// For degraded server that are complained by more than SERVER_KNOBS->CC_DEGRADED_PEER_DEGREE_TO_EXCLUDE, we
// don't know if it is a hot server, or the network is bad. We remove from the returned degraded server list.
std::unordered_set<NetworkAddress> currentDegradedServersWithinLimit;
DegradationInfo currentDegradationInfo;
for (const auto& badServer : currentDegradedServers) {
if (degradedLinkDst2Src[badServer].size() <= SERVER_KNOBS->CC_DEGRADED_PEER_DEGREE_TO_EXCLUDE) {
currentDegradedServersWithinLimit.insert(badServer);
currentDegradationInfo.degradedServers.insert(badServer);
}
}
return currentDegradedServersWithinLimit;
// If enough number of satellite workers are bad, we mark the entire satellite is bad. Note that this needs to
// be used with caution (controlled by CC_ENABLE_ENTIRE_SATELLITE_MONITORING knob), since the slow workers may
// also be caused by workload.
if (satelliteBadServerCount >= SERVER_KNOBS->CC_SATELLITE_DEGRADATION_MIN_BAD_SERVER) {
currentDegradationInfo.degradedSatellite = true;
}
return currentDegradationInfo;
}
// Whether the transaction system (in primary DC if in HA setting) contains degraded servers.
bool transactionSystemContainsDegradedServers() {
const ServerDBInfo dbi = db.serverInfo->get();
for (const auto& excludedServer : degradedServers) {
for (const auto& excludedServer : degradationInfo.degradedServers) {
if (dbi.master.addresses().contains(excludedServer)) {
return true;
}
@ -3083,7 +3107,7 @@ public:
return false;
}
for (const auto& excludedServer : degradedServers) {
for (const auto& excludedServer : degradationInfo.degradedServers) {
if (addressInDbAndRemoteDc(excludedServer, db.serverInfo)) {
return true;
}
@ -3121,7 +3145,7 @@ public:
// Returns true when the cluster controller should trigger a recovery due to degraded servers used in the
// transaction system in the primary data center.
bool shouldTriggerRecoveryDueToDegradedServers() {
if (degradedServers.size() > SERVER_KNOBS->CC_MAX_EXCLUSION_DUE_TO_HEALTH) {
if (degradationInfo.degradedServers.size() > SERVER_KNOBS->CC_MAX_EXCLUSION_DUE_TO_HEALTH) {
return false;
}
@ -3154,8 +3178,14 @@ public:
return false;
}
if (degradedServers.size() < SERVER_KNOBS->CC_FAILOVER_DUE_TO_HEALTH_MIN_DEGRADATION ||
degradedServers.size() > SERVER_KNOBS->CC_FAILOVER_DUE_TO_HEALTH_MAX_DEGRADATION) {
bool remoteIsHealthy = !remoteTransactionSystemContainsDegradedServers();
if (degradationInfo.degradedSatellite && remoteIsHealthy) {
// If the satellite DC is bad, a failover is desired despite the number of degraded servers.
return true;
}
if (degradationInfo.degradedServers.size() < SERVER_KNOBS->CC_FAILOVER_DUE_TO_HEALTH_MIN_DEGRADATION ||
degradationInfo.degradedServers.size() > SERVER_KNOBS->CC_FAILOVER_DUE_TO_HEALTH_MAX_DEGRADATION) {
return false;
}
@ -3165,7 +3195,7 @@ public:
return false;
}
return transactionSystemContainsDegradedServers() && !remoteTransactionSystemContainsDegradedServers();
return transactionSystemContainsDegradedServers() && remoteIsHealthy;
}
int recentRecoveryCountDueToHealth() {
@ -3248,9 +3278,7 @@ public:
// TODO(zhewu): Include disk and CPU signals.
};
std::unordered_map<NetworkAddress, WorkerHealth> workerHealth;
std::unordered_set<NetworkAddress>
degradedServers; // The servers that the cluster controller is considered as degraded. The servers in this list
// are not excluded unless they are added to `excludedDegradedServers`.
DegradationInfo degradationInfo;
std::unordered_set<NetworkAddress>
excludedDegradedServers; // The degraded servers to be excluded when assigning workers to roles.
std::queue<double> recentHealthTriggeredRecoveryTime;

View File

@ -342,6 +342,7 @@ ACTOR Future<Void> newSeedServers(Reference<ClusterRecoveryData> self,
isr.reqId = deterministicRandom()->randomUniqueID();
isr.interfaceId = deterministicRandom()->randomUniqueID();
isr.clusterId = self->clusterId;
isr.initialClusterVersion = self->recoveryTransactionVersion;
ErrorOr<InitializeStorageReply> newServer = wait(recruits.storageServers[idx].storage.tryGetReply(isr));
@ -989,8 +990,12 @@ ACTOR Future<std::vector<Standalone<CommitTransactionRef>>> recruitEverything(
newTLogServers(self, recruits, oldLogSystem, &confChanges));
// Update recovery related information to the newly elected sequencer (master) process.
wait(brokenPromiseToNever(self->masterInterface.updateRecoveryData.getReply(UpdateRecoveryDataRequest(
self->recoveryTransactionVersion, self->lastEpochEnd, self->commitProxies, self->resolvers))));
wait(brokenPromiseToNever(
self->masterInterface.updateRecoveryData.getReply(UpdateRecoveryDataRequest(self->recoveryTransactionVersion,
self->lastEpochEnd,
self->commitProxies,
self->resolvers,
self->versionEpoch))));
return confChanges;
}
@ -1036,6 +1041,14 @@ ACTOR Future<Void> readTransactionSystemState(Reference<ClusterRecoveryData> sel
self->txnStateStore =
keyValueStoreLogSystem(self->txnStateLogAdapter, self->dbgid, self->memoryLimit, false, false, true);
// Version 0 occurs at the version epoch. The version epoch is the number
// of microseconds since the Unix epoch. It can be set through fdbcli.
self->versionEpoch.reset();
Optional<Standalone<StringRef>> versionEpochValue = wait(self->txnStateStore->readValue(versionEpochKey));
if (versionEpochValue.present()) {
self->versionEpoch = BinaryReader::fromStringRef<int64_t>(versionEpochValue.get(), Unversioned());
}
// Versionstamped operations (particularly those applied from DR) define a minimum commit version
// that we may recover to, as they embed the version in user-readable data and require that no
// transactions will be committed at a lower version.
@ -1046,6 +1059,11 @@ ACTOR Future<Void> readTransactionSystemState(Reference<ClusterRecoveryData> sel
if (requiredCommitVersion.present()) {
minRequiredCommitVersion = BinaryReader::fromStringRef<Version>(requiredCommitVersion.get(), Unversioned());
}
if (g_network->isSimulated() && self->versionEpoch.present()) {
minRequiredCommitVersion = std::max(
minRequiredCommitVersion,
static_cast<Version>(g_network->timer() * SERVER_KNOBS->VERSIONS_PER_SECOND - self->versionEpoch.get()));
}
// Recover version info
self->lastEpochEnd = oldLogSystem->getEnd() - 1;
@ -1058,14 +1076,14 @@ ACTOR Future<Void> readTransactionSystemState(Reference<ClusterRecoveryData> sel
self->recoveryTransactionVersion = self->lastEpochEnd + SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT;
}
if (BUGGIFY) {
self->recoveryTransactionVersion +=
deterministicRandom()->randomInt64(0, SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT);
}
if (self->recoveryTransactionVersion < minRequiredCommitVersion)
self->recoveryTransactionVersion = minRequiredCommitVersion;
}
if (BUGGIFY) {
self->recoveryTransactionVersion += deterministicRandom()->randomInt64(0, 10000000);
}
TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_RECOVERED_EVENT_NAME).c_str(),
self->dbgid)
.detail("LastEpochEnd", self->lastEpochEnd)

View File

@ -169,6 +169,7 @@ struct ClusterRecoveryData : NonCopyable, ReferenceCounted<ClusterRecoveryData>
AsyncTrigger registrationTrigger;
Version lastEpochEnd, // The last version in the old epoch not (to be) rolled back in this recovery
recoveryTransactionVersion; // The first version in this epoch
Optional<int64_t> versionEpoch; // The epoch which all versions are based off of
double lastCommitTime;
Version liveCommittedVersion; // The largest live committed version reported by commit proxies.
@ -209,6 +210,7 @@ struct ClusterRecoveryData : NonCopyable, ReferenceCounted<ClusterRecoveryData>
std::map<UID, CommitProxyVersionReplies> lastCommitProxyVersionReplies;
UID clusterId;
Version initialClusterVersion = -1;
Standalone<StringRef> dbId;
MasterInterface masterInterface;

View File

@ -42,6 +42,7 @@
#include "flow/ActorCollection.h"
#include "flow/Arena.h"
#include "flow/BooleanParam.h"
#include "flow/genericactors.actor.h"
#include "flow/serialize.h"
#include "flow/Trace.h"
#include "flow/UnitTest.h"
@ -754,7 +755,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributorData> self,
shardsAffectedByTeamFailure,
lock,
getAverageShardBytes,
getUnhealthyRelocationCount,
getUnhealthyRelocationCount.getFuture(),
self->ddId,
storageTeamSize,
configuration.storageTeamSize,
@ -902,8 +903,39 @@ Future<Void> sendSnapReq(RequestStream<Req> stream, Req req, Error e) {
return Void();
}
ACTOR template <class Req>
Future<ErrorOr<Void>> trySendSnapReq(RequestStream<Req> stream, Req req) {
ErrorOr<REPLY_TYPE(Req)> reply = wait(stream.tryGetReply(req));
if (reply.isError()) {
TraceEvent("SnapDataDistributor_ReqError")
.errorUnsuppressed(reply.getError())
.detail("Peer", stream.getEndpoint().getPrimaryAddress());
return ErrorOr<Void>(reply.getError());
}
return ErrorOr<Void>(Void());
}
ACTOR static Future<Void> waitForMost(std::vector<Future<ErrorOr<Void>>> futures,
int faultTolerance,
Error e,
double waitMultiplierForSlowFutures = 1.0) {
state std::vector<Future<bool>> successFutures;
state double startTime = now();
successFutures.reserve(futures.size());
for (const auto& future : futures) {
successFutures.push_back(fmap([](auto const& result) { return result.present(); }, future));
}
bool success = wait(quorumEqualsTrue(successFutures, successFutures.size() - faultTolerance));
if (!success) {
throw e;
}
wait(delay((now() - startTime) * waitMultiplierForSlowFutures) || waitForAll(successFutures));
return Void();
}
ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<AsyncVar<ServerDBInfo> const> db) {
state Database cx = openDBOnServer(db, TaskPriority::DefaultDelay, LockAware::True);
state ReadYourWritesTransaction tr(cx);
loop {
try {
@ -938,19 +970,29 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
// snap local storage nodes
std::vector<WorkerInterface> storageWorkers =
// TODO: Atomically read configuration and storage worker list in a single transaction
state DatabaseConfiguration configuration = wait(getDatabaseConfiguration(cx));
std::pair<std::vector<WorkerInterface>, int> storageWorkersAndFailures =
wait(transformErrors(getStorageWorkers(cx, db, true /* localOnly */), snap_storage_failed()));
const auto& [storageWorkers, storageFailures] = storageWorkersAndFailures;
auto const storageFaultTolerance =
std::min(static_cast<int>(SERVER_KNOBS->MAX_STORAGE_SNAPSHOT_FAULT_TOLERANCE),
configuration.storageTeamSize - 1) -
storageFailures;
if (storageFaultTolerance < 0) {
TEST(true); // Too many failed storage servers to complete snapshot
throw snap_storage_failed();
}
TraceEvent("SnapDataDistributor_GotStorageWorkers")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
std::vector<Future<Void>> storageSnapReqs;
std::vector<Future<ErrorOr<Void>>> storageSnapReqs;
storageSnapReqs.reserve(storageWorkers.size());
for (const auto& worker : storageWorkers) {
storageSnapReqs.push_back(sendSnapReq(worker.workerSnapReq,
WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "storage"_sr),
snap_storage_failed()));
storageSnapReqs.push_back(trySendSnapReq(
worker.workerSnapReq, WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "storage"_sr)));
}
wait(waitForAll(storageSnapReqs));
wait(waitForMost(storageSnapReqs, storageFaultTolerance, snap_storage_failed()));
TraceEvent("SnapDataDistributor_AfterSnapStorage")
.detail("SnapPayload", snapReq.snapPayload)
@ -985,14 +1027,15 @@ ACTOR Future<Void> ddSnapCreateCore(DistributorSnapRequest snapReq, Reference<As
TraceEvent("SnapDataDistributor_GotCoordWorkers")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
std::vector<Future<Void>> coordSnapReqs;
std::vector<Future<ErrorOr<Void>>> coordSnapReqs;
coordSnapReqs.reserve(coordWorkers.size());
for (const auto& worker : coordWorkers) {
coordSnapReqs.push_back(sendSnapReq(worker.workerSnapReq,
WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "coord"_sr),
snap_coord_failed()));
coordSnapReqs.push_back(trySendSnapReq(
worker.workerSnapReq, WorkerSnapRequest(snapReq.snapPayload, snapReq.snapUID, "coord"_sr)));
}
wait(waitForAll(coordSnapReqs));
auto const coordFaultTolerance = std::min<int>(std::max<int>(0, coordSnapReqs.size() / 2 - 1),
SERVER_KNOBS->MAX_COORDINATOR_SNAPSHOT_FAULT_TOLERANCE);
wait(waitForMost(coordSnapReqs, coordFaultTolerance, snap_coord_failed()));
TraceEvent("SnapDataDistributor_AfterSnapCoords")
.detail("SnapPayload", snapReq.snapPayload)
.detail("SnapUID", snapReq.snapUID);
@ -1262,3 +1305,44 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
return Void();
}
// Test helper: a future that becomes ready after `duration` seconds holding a
// successful ErrorOr<Void> result.
static Future<ErrorOr<Void>> goodTestFuture(double duration) {
return tag(delay(duration), ErrorOr<Void>(Void()));
}
// Test helper: a future that becomes ready after `duration` seconds holding the
// error `e` inside its ErrorOr<Void> result (the future itself does not throw).
static Future<ErrorOr<Void>> badTestFuture(double duration, Error e) {
return tag(delay(duration), ErrorOr<Void>(e));
}
// Unit test for waitForMost(): checks that the tolerance parameter controls how many
// of the input futures must complete, that the final float parameter controls whether
// stragglers are waited on, and that errors beyond the tolerance are rethrown as the
// supplied error.
TEST_CASE("/DataDistribution/WaitForMost") {
state std::vector<Future<ErrorOr<Void>>> futures;
{
// Tolerance 1 of 3: may return before the slowest future is ready.
futures = { goodTestFuture(1), goodTestFuture(2), goodTestFuture(3) };
wait(waitForMost(futures, 1, operation_failed(), 0.0)); // Don't wait for slowest future
ASSERT(!futures[2].isReady());
}
{
// Tolerance 0: every future must be ready before returning.
futures = { goodTestFuture(1), goodTestFuture(2), goodTestFuture(3) };
wait(waitForMost(futures, 0, operation_failed(), 0.0)); // Wait for all futures
ASSERT(futures[2].isReady());
}
{
// With a nonzero final parameter the slowest future is waited on even with tolerance 1.
futures = { goodTestFuture(1), goodTestFuture(2), goodTestFuture(3) };
wait(waitForMost(futures, 1, operation_failed(), 1.0)); // Wait for slowest future
ASSERT(futures[2].isReady());
}
{
// A single failing future is within tolerance 1, so no error is thrown.
futures = { goodTestFuture(1), goodTestFuture(2), badTestFuture(1, success()) };
wait(waitForMost(futures, 1, operation_failed(), 1.0)); // Error ignored
}
{
// A single failing future exceeds tolerance 0: waitForMost must throw the supplied
// error (operation_failed), not the error held by the failing future.
futures = { goodTestFuture(1), goodTestFuture(2), badTestFuture(1, success()) };
try {
wait(waitForMost(futures, 0, operation_failed(), 1.0));
ASSERT(false);
} catch (Error& e) {
ASSERT_EQ(e.code(), error_code_operation_failed);
}
}
return Void();
}

View File

@ -308,7 +308,7 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure,
MoveKeysLock lock,
PromiseStream<Promise<int64_t>> getAverageShardBytes,
PromiseStream<Promise<int>> getUnhealthyRelocationCount,
FutureStream<Promise<int>> getUnhealthyRelocationCount,
UID distributorId,
int teamSize,
int singleRegionTeamSize,

View File

@ -1059,6 +1059,16 @@ struct DDQueueData {
validate();
}
// Returns the highest priority among relocations that are currently in progress
// (i.e. have a nonzero count), or 0 if no relocations are active.
int getHighestPriorityRelocation() const {
	int best = 0;
	for (auto it = priority_relocations.begin(); it != priority_relocations.end(); ++it) {
		if (it->second > 0 && it->first > best) {
			best = it->first;
		}
	}
	return best;
}
};
// return -1 if a.readload > b.readload
@ -1987,7 +1997,7 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure,
MoveKeysLock lock,
PromiseStream<Promise<int64_t>> getAverageShardBytes,
PromiseStream<Promise<int>> getUnhealthyRelocationCount,
FutureStream<Promise<int>> getUnhealthyRelocationCount,
UID distributorId,
int teamSize,
int singleRegionTeamSize,
@ -2090,12 +2100,7 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
recordMetrics = delay(SERVER_KNOBS->DD_QUEUE_LOGGING_INTERVAL, TaskPriority::FlushTrace);
int highestPriorityRelocation = 0;
for (auto it = self.priority_relocations.begin(); it != self.priority_relocations.end(); ++it) {
if (it->second) {
highestPriorityRelocation = std::max(highestPriorityRelocation, it->first);
}
}
auto const highestPriorityRelocation = self.getHighestPriorityRelocation();
TraceEvent("MovingData", distributorId)
.detail("InFlight", self.activeRelocations)
@ -2135,9 +2140,7 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
}
when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator
when(wait(waitForAll(balancingFutures))) {}
when(Promise<int> r = waitNext(getUnhealthyRelocationCount.getFuture())) {
r.send(self.unhealthyRelocations);
}
when(Promise<int> r = waitNext(getUnhealthyRelocationCount)) { r.send(self.unhealthyRelocations); }
}
}
} catch (Error& e) {

View File

@ -1177,15 +1177,16 @@ public:
struct Cursor {
Cursor() : cache(nullptr), nodeIndex(-1) {}
Cursor(DecodeCache* cache, DeltaTree2* tree) : tree(tree), cache(cache), nodeIndex(-1) {}
Cursor(Reference<DecodeCache> cache, DeltaTree2* tree) : tree(tree), cache(cache), nodeIndex(-1) {}
Cursor(DecodeCache* cache, DeltaTree2* tree, int nodeIndex) : tree(tree), cache(cache), nodeIndex(nodeIndex) {}
Cursor(Reference<DecodeCache> cache, DeltaTree2* tree, int nodeIndex)
: tree(tree), cache(cache), nodeIndex(nodeIndex) {}
// Copy constructor does not copy item because normally a copied cursor will be immediately moved.
Cursor(const Cursor& c) : tree(c.tree), cache(c.cache), nodeIndex(c.nodeIndex) {}
~Cursor() {
if (cache != nullptr) {
if (cache.isValid()) {
cache->updateUsedMemory();
}
}
@ -1212,7 +1213,7 @@ public:
}
DeltaTree2* tree;
DecodeCache* cache;
Reference<DecodeCache> cache;
int nodeIndex;
mutable Optional<T> item;
@ -1274,6 +1275,7 @@ public:
return item.get();
}
// Switch the cursor to point to a new DeltaTree
void switchTree(DeltaTree2* newTree) {
tree = newTree;
// Reset item because it may point into tree memory
@ -1709,7 +1711,13 @@ public:
} else {
nodeBytesUsed = 0;
}
ASSERT(size() <= spaceAvailable);
nodeBytesFree = spaceAvailable - size();
// Zero unused available space
memset((uint8_t*)this + size(), 0, nodeBytesFree);
return size();
}
@ -1782,8 +1790,15 @@ private:
node.setLeftChildOffset(largeNodes, leftChildOffset);
node.setRightChildOffset(largeNodes, rightChildOffset);
deltatree_printf("%p: Serialized %s as %s\n", this, item.toString().c_str(), node.toString(this).c_str());
int written = wptr - (uint8_t*)&node;
deltatree_printf("Built subtree tree=%p subtreeRoot=%p written=%d end=%p serialized subtreeRoot %s as %s \n",
this,
&node,
written,
(uint8_t*)&node + written,
item.toString().c_str(),
node.toString(this).c_str());
return wptr - (uint8_t*)&node;
return written;
}
};

View File

@ -20,22 +20,24 @@
#ifndef FDBSERVER_IPAGER_H
#define FDBSERVER_IPAGER_H
#include "flow/Error.h"
#include "flow/FastAlloc.h"
#include "flow/ProtocolVersion.h"
#include <cstddef>
#include <stdint.h>
#pragma once
#include "fdbserver/IKeyValueStore.h"
#include "flow/flow.h"
#include "fdbclient/FDBTypes.h"
#define XXH_INLINE_ALL
#include "flow/xxhash.h"
#ifndef VALGRIND
#define VALGRIND_MAKE_MEM_UNDEFINED(x, y)
#define VALGRIND_MAKE_MEM_DEFINED(x, y)
#endif
typedef uint32_t LogicalPageID;
typedef uint32_t PhysicalPageID;
#define invalidLogicalPageID std::numeric_limits<LogicalPageID>::max()
#define invalidPhysicalPageID std::numeric_limits<PhysicalPageID>::max()
typedef uint32_t QueueID;
#define invalidQueueID std::numeric_limits<QueueID>::max()
@ -76,90 +78,509 @@ static const std::vector<std::pair<PagerEvents, PagerEventReasons>> L0PossibleEv
{ PagerEvents::PageWrite, PagerEventReasons::MetaData },
};
// Represents a block of memory in a 4096-byte aligned location held by an Arena.
enum EncodingType : uint8_t {
XXHash64 = 0,
// For testing purposes
XOREncryption = 1
};
enum PageType : uint8_t {
HeaderPage = 0,
BackupHeaderPage = 1,
BTreeNode = 2,
BTreeSuperNode = 3,
QueuePageStandalone = 4,
QueuePageInExtent = 5
};
// Encryption key ID
typedef uint64_t KeyID;
// EncryptionKeyRef is somewhat multi-variant: it contains members representing the
// union of all fields relevant to any implemented encryption scheme. They generally
// fall into
//   Page Fields   - fields which come from or are stored in the Page
//   Secret Fields - fields which are only known by the Key Provider
// but it is up to each encoding and provider which fields are which and which are used.
struct EncryptionKeyRef {
	EncryptionKeyRef() = default;
	EncryptionKeyRef(Arena& arena, const EncryptionKeyRef& toCopy) : secret(arena, toCopy.secret), id(toCopy.id) {}

	// Size of the out-of-line data referenced by this struct.
	int expectedSize() const { return secret.size(); }

	StringRef secret;
	Optional<KeyID> id;
};
typedef Standalone<EncryptionKeyRef> EncryptionKey;
// Interface used by pager to get encryption keys by ID when reading pages from disk
// and by the BTree to get encryption keys to use for new pages
class IEncryptionKeyProvider {
public:
virtual ~IEncryptionKeyProvider() {}

// Get an EncryptionKey with Secret Fields populated based on the given Page Fields.
// It is up to the implementation which fields those are.
// The output Page Fields must match the input Page Fields.
virtual Future<EncryptionKey> getSecrets(const EncryptionKeyRef& key) = 0;

// Get the encryption key that should be used for a given user Key-Value range
virtual Future<EncryptionKey> getByRange(const KeyRef& begin, const KeyRef& end) = 0;
};
// This is a hacky way to attach an additional object of an arbitrary type at runtime to another object.
// It stores an arbitrary void pointer and a void pointer function to call when the ArbitraryObject
// is destroyed.
// It has helper operator= methods for storing heap-allocated T's or Reference<T>'s in into it via
// x = thing;
// Examples:
// ArbitraryObject x;
// x.set(new Widget()); // x owns the new object
// x.set(Reference<SomeClass>(new SomeClass()); // x holds a reference now too
// x.setReference(new SomeReferenceCountedType()); //
struct ArbitraryObject {
	ArbitraryObject() : ptr(nullptr), onDestruct(nullptr) {}
	ArbitraryObject(const ArbitraryObject&) = delete;

	// BUGFIX: The implicitly-declared copy assignment operator is still generated even
	// though the copy constructor is deleted (the templated operator= overloads below do
	// not suppress it). It would shallow-copy ptr/onDestruct and cause the held object
	// to be destructed twice, so delete it explicitly.
	ArbitraryObject& operator=(const ArbitraryObject&) = delete;

	~ArbitraryObject() { destructOnly(); }

	// True if an object is currently attached.
	bool valid() const { return ptr != nullptr; }

	// Take ownership of a heap-allocated T; it is deleted on reset() or destruction.
	template <typename T>
	void operator=(T* p) {
		destructOnly();
		ptr = p;
		onDestruct = [](void* ptr) { delete (T*)ptr; };
	}

	// Hold an additional reference to r's target; released on reset() or destruction.
	template <typename T>
	void operator=(Reference<T>& r) {
		destructOnly();
		ptr = r.getPtr();
		r.getPtr()->addref();
		onDestruct = [](void* ptr) { ((T*)ptr)->delref(); };
	}

	// Steal the reference from a temporary r; released on reset() or destruction.
	template <typename T>
	void operator=(Reference<T>&& r) {
		destructOnly();
		ptr = r.extractPtr();
		onDestruct = [](void* ptr) { ((T*)ptr)->delref(); };
	}

	// Unchecked access to the stored pointer as a T*.
	template <typename T>
	T* getPtr() {
		return (T*)ptr;
	}

	// Get an additional Reference to the stored object (caller shares ownership).
	template <typename T>
	Reference<T> getReference() {
		return Reference<T>::addRef((T*)ptr);
	}

	// Destroy/release the held object (if any) and return to the empty state.
	void reset() {
		destructOnly();
		ptr = nullptr;
		onDestruct = nullptr;
	}

	// ptr can be set to any arbitrary thing. If it is not null at destruct time then
	// onDestruct(ptr) will be called if onDestruct is not null.
	void* ptr = nullptr;
	void (*onDestruct)(void*) = nullptr;

private:
	// Call onDestruct(ptr) if needed but don't reset any state
	void destructOnly() {
		if (ptr != nullptr && onDestruct != nullptr) {
			onDestruct(ptr);
		}
	}
};
// ArenaPage represents a data page meant to be stored on disk, located in a block of
// 4k-aligned memory held by an Arena
//
// Page Format:
// PageHeader - describes main header version, encoding type, and offsets of subheaders and payload.
// MainHeader - structure based on header version. It is responsible for protecting all bytes
// of PageHeader, MainHeader, and EncodingHeader with some sort of checksum.
// EncodingHeader - structure based on encoding type. It is responsible for protecting and
// possibly encrypting all payload bytes.
// Payload - User accessible bytes, protected and possibly encrypted based on the encoding
//
// preWrite() must be called before writing a page to disk to update checksums and encrypt as needed
// After reading a page from disk,
// postReadHeader() must be called to verify the version, main, and encoding headers
// postReadPayload() must be called, after potentially setting encryption secret, to verify and possibly
// decrypt the payload
class ArenaPage : public ReferenceCounted<ArenaPage>, public FastAllocated<ArenaPage> {
public:
// The page's logical size includes an opaque checksum, use size() to get usable size
ArenaPage(int logicalSize, int bufferSize) : logicalSize(logicalSize), bufferSize(bufferSize), userData(nullptr) {
// This is the header version that new page init() calls will use.
// It is not necessarily the latest header version, as read/modify support for
// a new header version may be added prior to using that version as the default
// for new pages as part of downgrade support.
static constexpr uint8_t HEADER_WRITE_VERSION = 1;
ArenaPage(int logicalSize, int bufferSize) : logicalSize(logicalSize), bufferSize(bufferSize), pPayload(nullptr) {
if (bufferSize > 0) {
buffer = (uint8_t*)arena.allocate4kAlignedBuffer(bufferSize);
// Mark any unused page portion defined
VALGRIND_MAKE_MEM_DEFINED(buffer + logicalSize, bufferSize - logicalSize);
// Zero unused region
memset(buffer + logicalSize, 0, bufferSize - logicalSize);
} else {
buffer = nullptr;
}
};
~ArenaPage() {
if (userData != nullptr && userDataDestructor != nullptr) {
userDataDestructor(userData);
~ArenaPage() {}
// Before using these, either init() or postReadHeader and postReadPayload() must be called
const uint8_t* data() const { return pPayload; }
uint8_t* mutateData() const { return (uint8_t*)pPayload; }
int dataSize() const { return payloadSize; }
StringRef dataAsStringRef() const { return StringRef((uint8_t*)pPayload, payloadSize); }
const uint8_t* rawData() const { return buffer; }
uint8_t* rawData() { return buffer; }
int rawSize() const { return bufferSize; }
#pragma pack(push, 1)
// The next few structs describe the byte-packed physical structure. The fields of Page
// cannot change, but new header versions and encoding types can be added and existing
// header versions and encoding type headers could change size as offset information
// is stored to enable efficient jumping to the encoding header or payload.
// Page members are only initialized in init()
struct PageHeader {
// Layout version of the headers; selects which MainHeader struct follows.
uint8_t headerVersion;
EncodingType encodingType;

// Encoding header comes after main header
uint8_t encodingHeaderOffset;

// Payload comes after encoding header
uint8_t payloadOffset;

// Get main header pointer, casting to its type. The main header is located
// immediately after this struct.
template <typename T>
T* getMainHeader() const {
return (T*)(this + 1);
}

// Get encoding header pointer, casting to its type
template <typename T>
T* getEncodingHeader() const {
return (T*)((uint8_t*)this + encodingHeaderOffset);
}

// Get payload pointer
uint8_t* getPayload() const { return (uint8_t*)this + payloadOffset; }
};
// Redwood header version 1
// Protects all headers with a 64-bit XXHash checksum
// Most other fields are forensic in nature and are not required to be set for correct
// behavior but they can facilitate forensic investigation of data on disk. Some of them
// could be used for sanity checks at runtime.
struct RedwoodHeaderV1 {
PageType pageType;

// The meaning of pageSubType is based on pageType
// For Queue pages, pageSubType is the QueueID
// For BTree nodes, pageSubType is Height (also stored in BTreeNode)
uint8_t pageSubType;

// Format identifier, normally specific to the page Type and SubType
uint8_t pageFormat;

// XXHash64 checksum over the header bytes passed to updateChecksum()/verifyChecksum().
XXH64_hash_t checksum;

// Physical page ID of first block on disk of the ArenaPage
PhysicalPageID firstPhysicalPageID;

// The first logical page ID the ArenaPage was referenced by when last written
LogicalPageID lastKnownLogicalPageID;

// The first logical page ID of the parent of this ArenaPage when last written
LogicalPageID lastKnownParentLogicalPageID;

// Time and write version as of the last update to this page.
// Note that for relocated pages, writeVersion should not be updated.
double writeTime;
Version writeVersion;

// Recompute and store the checksum over headerBytes (which must include this struct;
// the checksum field is zeroed before hashing so it does not affect its own value).
void updateChecksum(uint8_t* headerBytes, int len) {
// Checksum is within the checksum input so clear it first
checksum = 0;
checksum = XXH3_64bits(headerBytes, len);
}

// Verify headerBytes against the stored checksum; the stored value is saved and
// restored around hashing. Throws page_header_checksum_failed() on mismatch.
void verifyChecksum(uint8_t* headerBytes, int len) {
// Checksum is within the checksum input so save it and restore it afterwards
XXH64_hash_t saved = checksum;
checksum = 0;
XXH64_hash_t calculated = XXH3_64bits(headerBytes, len);
checksum = saved;
if (saved != calculated) {
throw page_header_checksum_failed();
}
}
};
// An encoding that validates the payload with an XXHash checksum
struct XXHashEncodingHeader {
// XXHash64 checksum of the payload, seeded with the page's physical page ID.
XXH64_hash_t checksum;

// Compute and store the checksum of payload using the physical page ID as the seed.
void encode(uint8_t* payload, int len, PhysicalPageID seed) {
checksum = XXH3_64bits_withSeed(payload, len, seed);
}

// Verify payload against the stored checksum using the same seed;
// throws page_decoding_failed() on mismatch.
void decode(uint8_t* payload, int len, PhysicalPageID seed) {
if (checksum != XXH3_64bits_withSeed(payload, len, seed)) {
throw page_decoding_failed();
}
}
};
// A dummy "encrypting" encoding which uses XOR with a 1 byte secret key on
// the payload to obfuscate it and protects the payload with an XXHash checksum.
struct XOREncryptionEncodingHeader {
// Checksum is on unencrypted payload
XXH64_hash_t checksum;
// ID of the 1-byte XOR secret used to obfuscate the payload.
uint8_t keyID;

// Checksum the plaintext payload, then obfuscate it by XORing each byte with secret.
void encode(uint8_t secret, uint8_t* payload, int len, PhysicalPageID seed) {
checksum = XXH3_64bits_withSeed(payload, len, seed);
for (int i = 0; i < len; ++i) {
payload[i] ^= secret;
}
}

// De-obfuscate the payload with secret, then verify the plaintext against the stored
// checksum; throws page_decoding_failed() on mismatch.
void decode(uint8_t secret, uint8_t* payload, int len, PhysicalPageID seed) {
for (int i = 0; i < len; ++i) {
payload[i] ^= secret;
}
if (checksum != XXH3_64bits_withSeed(payload, len, seed)) {
throw page_decoding_failed();
}
}
};
#pragma pack(pop)
// Get the size of the encoding header based on type
// Note that this is only to be used in operations involving new pages to calculate the payload offset. For
// existing pages, the payload offset is stored in the page.
static int encodingHeaderSize(EncodingType t) {
	// Dispatch on the encoding type; any unrecognized type is an error.
	switch (t) {
	case EncodingType::XXHash64:
		return sizeof(XXHashEncodingHeader);
	case EncodingType::XOREncryption:
		return sizeof(XOREncryptionEncodingHeader);
	default:
		throw page_encoding_not_supported();
	}
}
uint8_t const* begin() const { return (uint8_t*)buffer; }
// Get the usable (payload) size for a new page of pageSize using HEADER_WRITE_VERSION
// with encoding type t: the page size minus the fixed PageHeader, the v1 main header,
// and the encoding-specific header.
static int getUsableSize(int pageSize, EncodingType t) {
return pageSize - sizeof(PageHeader) - sizeof(RedwoodHeaderV1) - encodingHeaderSize(t);
}
uint8_t* mutate() { return (uint8_t*)buffer; }
// Initialize the header for a new page so that the payload can be written to
// Pre: Buffer is allocated and logical size is set
// Post: Page header is initialized and space is reserved for subheaders for
// HEADER_WRITE_VERSION main header and the given encoding type.
// Payload can be written to with mutateData() and dataSize()
void init(EncodingType t, PageType pageType, uint8_t pageSubType, uint8_t pageFormat = 0) {
// Carefully cast away constness to modify page header
PageHeader* p = const_cast<PageHeader*>(page);
p->headerVersion = HEADER_WRITE_VERSION;
p->encodingHeaderOffset = sizeof(PageHeader) + sizeof(RedwoodHeaderV1);
p->encodingType = t;
p->payloadOffset = page->encodingHeaderOffset + encodingHeaderSize(t);
typedef XXH64_hash_t Checksum;
pPayload = page->getPayload();
payloadSize = logicalSize - (pPayload - buffer);
// Usable size, without checksum
int size() const { return logicalSize - sizeof(Checksum); }
RedwoodHeaderV1* h = page->getMainHeader<RedwoodHeaderV1>();
h->pageType = pageType;
h->pageSubType = pageSubType;
h->pageFormat = pageFormat;
Standalone<StringRef> asStringRef() const { return Standalone<StringRef>(StringRef(begin(), size()), arena); }
// Write dummy values for these in new pages. They should be updated when possible before calling preWrite()
// when modifying existing pages
h->lastKnownLogicalPageID = invalidLogicalPageID;
h->lastKnownParentLogicalPageID = invalidLogicalPageID;
h->writeVersion = invalidVersion;
}
// Get an ArenaPage which is a copy of this page, in its own Arena
Reference<ArenaPage> cloneContents() const {
// Get the logical page buffer as a StringRef
Standalone<StringRef> asStringRef() const { return Standalone<StringRef>(StringRef(buffer, logicalSize)); }
// Get a new ArenaPage that contains a copy of this page's data.
// extra is not copied to the returned page
Reference<ArenaPage> clone() const {
ArenaPage* p = new ArenaPage(logicalSize, bufferSize);
memcpy(p->buffer, buffer, logicalSize);
// Non-verifying header parse just to initialize members
p->postReadHeader(invalidPhysicalPageID, false);
p->encryptionKey = encryptionKey;
return Reference<ArenaPage>(p);
}
// Get an ArenaPage which depends on this page's Arena and references some of its memory
Reference<ArenaPage> subPage(int offset, int len) const {
Reference<ArenaPage> getSubPage(int offset, int len) const {
ASSERT(offset + len <= logicalSize);
ArenaPage* p = new ArenaPage(len, 0);
p->buffer = buffer + offset;
p->arena.dependsOn(arena);
// Non-verifying header parse just to initialize component pointers
p->postReadHeader(invalidPhysicalPageID, false);
p->encryptionKey = encryptionKey;
return Reference<ArenaPage>(p);
}
// Given a vector of pages with the same ->size(), create a new ArenaPage with a ->size() that is
// equivalent to all of the input pages and has all of their contents copied into it.
static Reference<ArenaPage> concatPages(const std::vector<Reference<const ArenaPage>>& pages) {
int usableSize = pages.front()->size();
int totalUsableSize = pages.size() * usableSize;
int totalBufferSize = pages.front()->bufferSize * pages.size();
ArenaPage* superpage = new ArenaPage(totalUsableSize + sizeof(Checksum), totalBufferSize);
uint8_t* wptr = superpage->mutate();
for (auto& p : pages) {
ASSERT(p->size() == usableSize);
memcpy(wptr, p->begin(), usableSize);
wptr += usableSize;
// The next two functions set mostly forensic info that may help in an investigation to identify data on disk. The
// exception is pageID which must be set to the physical page ID on disk where the page is written or post-read
// verification will fail.
void setWriteInfo(PhysicalPageID pageID, Version writeVersion) {
if (page->headerVersion == 1) {
RedwoodHeaderV1* h = page->getMainHeader<RedwoodHeaderV1>();
h->firstPhysicalPageID = pageID;
h->writeVersion = writeVersion;
h->writeTime = now();
}
return Reference<ArenaPage>(superpage);
}
Checksum& getChecksum() { return *(Checksum*)(buffer + size()); }
// These should be updated before writing a BTree page. Note that the logical ID that refers to a page can change
// after the page is written, if its parent is updated to point directly to its physical page ID. Therefore, the
// last known logical page ID should always be updated before writing an updated version of a BTree page.
void setLogicalPageInfo(LogicalPageID lastKnownLogicalPageID, LogicalPageID lastKnownParentLogicalPageID) {
if (page->headerVersion == 1) {
RedwoodHeaderV1* h = page->getMainHeader<RedwoodHeaderV1>();
h->lastKnownLogicalPageID = lastKnownLogicalPageID;
h->lastKnownParentLogicalPageID = lastKnownParentLogicalPageID;
}
}
Checksum calculateChecksum(LogicalPageID pageID) { return XXH3_64bits_withSeed(buffer, size(), pageID); }
// Must be called before writing to disk to update headers and encrypt page
// Pre: Encoding-specific header fields are set if needed
// Secret is set if needed
// Post: Main and Encoding subheaders are updated
// Payload is possibly encrypted
void preWrite(PhysicalPageID pageID) const {
// Explicitly check payload definedness to make the source of valgrind errors more clear.
// Without this check, calculating a checksum on a payload with undefined bytes does not
// cause a valgrind error but the resulting checksum is undefined which causes errors later.
ASSERT(VALGRIND_CHECK_MEM_IS_DEFINED(pPayload, payloadSize) == 0);
void updateChecksum(LogicalPageID pageID) { getChecksum() = calculateChecksum(pageID); }
if (page->encodingType == EncodingType::XXHash64) {
page->getEncodingHeader<XXHashEncodingHeader>()->encode(pPayload, payloadSize, pageID);
} else if (page->encodingType == EncodingType::XOREncryption) {
ASSERT(encryptionKey.secret.size() == 1);
XOREncryptionEncodingHeader* xh = page->getEncodingHeader<XOREncryptionEncodingHeader>();
xh->keyID = encryptionKey.id.orDefault(0);
xh->encode(encryptionKey.secret[0], pPayload, payloadSize, pageID);
} else {
throw page_encoding_not_supported();
}
bool verifyChecksum(LogicalPageID pageID) { return getChecksum() == calculateChecksum(pageID); }
if (page->headerVersion == 1) {
page->getMainHeader<RedwoodHeaderV1>()->updateChecksum(buffer, pPayload - buffer);
} else {
throw page_header_version_not_supported();
}
}
// Must be called after reading from disk to verify all non-payload bytes
// Pre: Bytes from storage medium copied into raw buffer space
// Post: Page headers outside of payload are verified (unless verify is false)
// encryptionKey is updated with information from encoding header if needed
// Payload is accessible via data(), dataSize(), etc.
//
// Exceptions are thrown for unknown header types or pages which fail verification
void postReadHeader(PhysicalPageID pageID, bool verify = true) {
pPayload = page->getPayload();
payloadSize = logicalSize - (pPayload - buffer);
// Populate encryption key with relevant fields from page
if (page->encodingType == EncodingType::XOREncryption) {
encryptionKey.id = page->getEncodingHeader<XOREncryptionEncodingHeader>()->keyID;
}
if (page->headerVersion == 1) {
if (verify) {
RedwoodHeaderV1* h = page->getMainHeader<RedwoodHeaderV1>();
h->verifyChecksum(buffer, pPayload - buffer);
if (pageID != h->firstPhysicalPageID) {
throw page_header_wrong_page_id();
}
}
} else {
throw page_header_version_not_supported();
}
}
// Pre: postReadHeader has been called, encoding-specific parameters (such as the encryption secret) have been set
// Post: Payload has been verified and decrypted if necessary
void postReadPayload(PhysicalPageID pageID) {
if (page->encodingType == EncodingType::XXHash64) {
page->getEncodingHeader<XXHashEncodingHeader>()->decode(pPayload, payloadSize, pageID);
} else if (page->encodingType == EncodingType::XOREncryption) {
ASSERT(encryptionKey.secret.size() == 1);
page->getEncodingHeader<XOREncryptionEncodingHeader>()->decode(
encryptionKey.secret[0], pPayload, payloadSize, pageID);
} else {
throw page_encoding_not_supported();
}
}
const Arena& getArena() const { return arena; }
static bool isEncodingTypeEncrypted(EncodingType t) { return t == EncodingType::XOREncryption; }
// Returns true if the page's encoding type employs encryption
bool isEncrypted() const { return isEncodingTypeEncrypted(getEncodingType()); }
private:
Arena arena;
// The logical size of the page, which can be smaller than bufferSize, which is only of
// practical purpose in simulation to use arbitrarily small page sizes to test edge cases
// with shorter execution time
int logicalSize;
// The 4k-aligned physical size of allocated memory for the page which also represents the
// block size to be written to disk
int bufferSize;
uint8_t* buffer;
// buffer is a pointer to the page's memory
// For convenience, it is unioned with a Page pointer which defines the page structure
union {
uint8_t* buffer;
const PageHeader* page;
};
// Pointer and length of page space available to the user
// These are accessed very often so they are stored directly
uint8_t* pPayload;
int payloadSize;
public:
mutable void* userData;
mutable void (*userDataDestructor)(void*);
EncodingType getEncodingType() const { return page->encodingType; }
PhysicalPageID getPhysicalPageID() const {
if (page->headerVersion == 1) {
return page->getMainHeader<RedwoodHeaderV1>()->firstPhysicalPageID;
} else {
throw page_header_version_not_supported();
}
}
// Used by encodings that do encryption
EncryptionKey encryptionKey;
mutable ArbitraryObject extra;
};
class IPagerSnapshot {
@ -184,18 +605,21 @@ public:
virtual void addref() = 0;
virtual void delref() = 0;
ArbitraryObject extra;
};
// This API is probably too customized to the behavior of DWALPager and probably needs some changes to be more generic.
class IPager2 : public IClosable {
public:
virtual std::string getName() const = 0;
// Returns an ArenaPage that can be passed to writePage. The data in the returned ArenaPage might not be zeroed.
virtual Reference<ArenaPage> newPageBuffer(size_t size = 1) = 0;
virtual Reference<ArenaPage> newPageBuffer(size_t blocks = 1) = 0;
// Returns the usable size of pages returned by the pager (i.e. the size of the page that isn't pager overhead).
// For a given pager instance, separate calls to this function must return the same value.
// Only valid to call after recovery is complete.
virtual int getUsablePageSize() const = 0;
virtual int getPhysicalPageSize() const = 0;
virtual int getLogicalPageSize() const = 0;
virtual int getPagesPerExtent() const = 0;
@ -251,7 +675,7 @@ public:
bool noHit) = 0;
virtual Future<Reference<ArenaPage>> readMultiPage(PagerEventReasons reason,
unsigned int level,
Standalone<VectorRef<PhysicalPageID>> pageIDs,
VectorRef<PhysicalPageID> pageIDs,
int priority,
bool cacheable,
bool noHit) = 0;
@ -271,16 +695,13 @@ public:
// The snapshot shall be usable until setOldVersion() is called with a version > v.
virtual Reference<IPagerSnapshot> getReadSnapshot(Version v) = 0;
// Atomically make durable all pending page writes, page frees, and update the metadata string,
// setting the committed version to v
// v must be >= the highest versioned page write.
virtual Future<Void> commit(Version v) = 0;
// Atomically make durable all pending page writes, page frees, and update the user commit
// record at version v
// v must be higher than the highest committed version
virtual Future<Void> commit(Version v, Value commitRecord) = 0;
// Get the latest meta key set or committed
virtual Key getMetaKey() const = 0;
// Set the metakey which will be stored in the next commit
virtual void setMetaKey(KeyRef metaKey) = 0;
// Get the latest committed user commit record
virtual Value getCommitRecord() const = 0;
virtual StorageBytes getStorageBytes() const = 0;
@ -318,4 +739,52 @@ protected:
~IPager2() {} // Destruction should be done using close()/dispose() from the IClosable interface
};
// A key provider that never provides keys: any request for key information throws
// encryption_key_not_found(). Exists to simplify page decoding paths that are known
// not to require encryption.
class NullKeyProvider : public IEncryptionKeyProvider {
public:
	virtual ~NullKeyProvider() {}
	Future<EncryptionKey> getByRange(const KeyRef& begin, const KeyRef& end) override {
		throw encryption_key_not_found();
	}
	Future<EncryptionKey> getSecrets(const EncryptionKeyRef& key) override {
		throw encryption_key_not_found();
	}
};
// Key provider for dummy XOR encryption scheme
class XOREncryptionKeyProvider : public IEncryptionKeyProvider {
public:
// Derives a single deterministic XOR byte from the file name. Simulation-only.
XOREncryptionKeyProvider(std::string filename) {
ASSERT(g_network->isSimulated());

// Choose a deterministic random filename (without path) byte for secret generation
// Remove any leading directory names
// NOTE(review): erase(0, lastSlash) keeps the trailing path separator at the front of
// filename — confirm whether lastSlash + 1 was intended by "without path".
size_t lastSlash = filename.find_last_of("\\/");
if (lastSlash != filename.npos) {
filename.erase(0, lastSlash);
}
xorWith = filename.empty() ? 0x5e
: (uint8_t)filename[XXH3_64bits(filename.data(), filename.size()) % filename.size()];
}

virtual ~XOREncryptionKeyProvider() {}

// Returns a key whose 1-byte secret is derived from the key's ID combined with
// xorWith. Throws encryption_key_not_found() if the key has no ID.
virtual Future<EncryptionKey> getSecrets(const EncryptionKeyRef& key) override {
if (!key.id.present()) {
throw encryption_key_not_found();
}
EncryptionKey s = key;
uint8_t secret = ~(uint8_t)key.id.get() ^ xorWith;
s.secret = StringRef(s.arena(), &secret, 1);
return s;
}

// Chooses a key ID from the last byte of the range's end key (0 for an empty end key)
// and returns the corresponding secret.
virtual Future<EncryptionKey> getByRange(const KeyRef& begin, const KeyRef& end) override {
EncryptionKeyRef k;
k.id = end.empty() ? 0 : *(end.end() - 1);
return getSecrets(k);
}

// The single obfuscation byte derived from the filename.
uint8_t xorWith;
};
#endif

View File

@ -790,6 +790,7 @@ ACTOR Future<Void> rocksDBMetricLogger(std::shared_ptr<rocksdb::Statistics> stat
{ "EstPendCompactBytes", rocksdb::DB::Properties::kEstimatePendingCompactionBytes },
{ "BlockCacheUsage", rocksdb::DB::Properties::kBlockCacheUsage },
{ "BlockCachePinnedUsage", rocksdb::DB::Properties::kBlockCachePinnedUsage },
{ "LiveSstFilesSize", rocksdb::DB::Properties::kLiveSstFilesSize },
};
state std::unordered_map<std::string, uint64_t> readIteratorPoolStats = {
@ -811,7 +812,8 @@ ACTOR Future<Void> rocksDBMetricLogger(std::shared_ptr<rocksdb::Statistics> stat
for (auto& p : propertyStats) {
auto& [name, property] = p;
stat = 0;
ASSERT(db->GetIntProperty(property, &stat));
// GetAggregatedIntProperty gets the aggregated int property from all column families.
ASSERT(db->GetAggregatedIntProperty(property, &stat));
e.detail(name, stat);
}
@ -1933,7 +1935,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
StorageBytes getStorageBytes() const override {
uint64_t live = 0;
ASSERT(db->GetIntProperty(rocksdb::DB::Properties::kLiveSstFilesSize, &live));
ASSERT(db->GetAggregatedIntProperty(rocksdb::DB::Properties::kLiveSstFilesSize, &live));
int64_t free;
int64_t total;

View File

@ -157,19 +157,21 @@ struct UpdateRecoveryDataRequest {
Version lastEpochEnd;
std::vector<CommitProxyInterface> commitProxies;
std::vector<ResolverInterface> resolvers;
Optional<int64_t> versionEpoch;
ReplyPromise<Void> reply;
UpdateRecoveryDataRequest() = default;
UpdateRecoveryDataRequest(Version recoveryTransactionVersion,
Version lastEpochEnd,
const std::vector<CommitProxyInterface>& commitProxies,
const std::vector<ResolverInterface>& resolvers)
const std::vector<ResolverInterface>& resolvers,
Optional<int64_t> versionEpoch)
: recoveryTransactionVersion(recoveryTransactionVersion), lastEpochEnd(lastEpochEnd),
commitProxies(commitProxies), resolvers(resolvers) {}
commitProxies(commitProxies), resolvers(resolvers), versionEpoch(versionEpoch) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, recoveryTransactionVersion, lastEpochEnd, commitProxies, resolvers, reply);
serializer(ar, recoveryTransactionVersion, lastEpochEnd, commitProxies, resolvers, versionEpoch, reply);
}
};

View File

@ -277,9 +277,8 @@ ACTOR Future<std::vector<StorageServerInterface>> getStorageServers(Database cx,
}
}
ACTOR Future<std::vector<WorkerInterface>> getStorageWorkers(Database cx,
Reference<AsyncVar<ServerDBInfo> const> dbInfo,
bool localOnly) {
ACTOR Future<std::pair<std::vector<WorkerInterface>, int>>
getStorageWorkers(Database cx, Reference<AsyncVar<ServerDBInfo> const> dbInfo, bool localOnly) {
state std::vector<StorageServerInterface> servers = wait(getStorageServers(cx));
state std::map<NetworkAddress, WorkerInterface> workersMap;
std::vector<WorkerDetails> workers = wait(getWorkers(dbInfo));
@ -299,7 +298,9 @@ ACTOR Future<std::vector<WorkerInterface>> getStorageWorkers(Database cx,
}
auto masterDcId = dbInfo->get().master.locality.dcId();
std::vector<WorkerInterface> result;
std::pair<std::vector<WorkerInterface>, int> result;
auto& [workerInterfaces, failures] = result;
failures = 0;
for (const auto& server : servers) {
TraceEvent(SevDebug, "DcIdInfo")
.detail("ServerLocalityID", server.locality.dcId())
@ -310,9 +311,10 @@ ACTOR Future<std::vector<WorkerInterface>> getStorageWorkers(Database cx,
TraceEvent(SevWarn, "GetStorageWorkers")
.detail("Reason", "Could not find worker for storage server")
.detail("SS", server.id());
throw operation_failed();
++failures;
} else {
workerInterfaces.push_back(itr->second);
}
result.push_back(itr->second);
}
}
return result;
@ -598,6 +600,31 @@ ACTOR Future<bool> getStorageServersRecruiting(Database cx, WorkerInterface dist
}
}
// Gets the difference between the expected version (based on the version
// epoch) and the actual version.
ACTOR Future<int64_t> getVersionOffset(Database cx,
WorkerInterface distributorWorker,
Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
loop {
state Transaction tr(cx);
try {
TraceEvent("GetVersionOffset").detail("Stage", "ReadingVersionEpoch");
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
state Version rv = wait(tr.getReadVersion());
Optional<Standalone<StringRef>> versionEpochValue = wait(tr.get(versionEpochKey));
if (!versionEpochValue.present()) {
return 0;
}
int64_t versionEpoch = BinaryReader::fromStringRef<int64_t>(versionEpochValue.get(), Unversioned());
int64_t versionOffset = abs(rv - (g_network->timer() * SERVER_KNOBS->VERSIONS_PER_SECOND - versionEpoch));
return versionOffset;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
ACTOR Future<Void> repairDeadDatacenter(Database cx,
Reference<AsyncVar<ServerDBInfo> const> dbInfo,
std::string context) {
@ -652,7 +679,8 @@ ACTOR Future<Void> waitForQuietDatabase(Database cx,
int64_t maxTLogQueueGate = 5e6,
int64_t maxStorageServerQueueGate = 5e6,
int64_t maxDataDistributionQueueSize = 0,
int64_t maxPoppedVersionLag = 30e6) {
int64_t maxPoppedVersionLag = 30e6,
int64_t maxVersionOffset = 1e6) {
state Future<Void> reconfig =
reconfigureAfter(cx, 100 + (deterministicRandom()->random01() * 100), dbInfo, "QuietDatabase");
state Future<int64_t> dataInFlight;
@ -662,6 +690,7 @@ ACTOR Future<Void> waitForQuietDatabase(Database cx,
state Future<int64_t> storageQueueSize;
state Future<bool> dataDistributionActive;
state Future<bool> storageServersRecruiting;
state Future<int64_t> versionOffset;
auto traceMessage = "QuietDatabase" + phase + "Begin";
TraceEvent(traceMessage.c_str()).log();
@ -698,10 +727,11 @@ ACTOR Future<Void> waitForQuietDatabase(Database cx,
storageQueueSize = getMaxStorageServerQueueSize(cx, dbInfo);
dataDistributionActive = getDataDistributionActive(cx, distributorWorker);
storageServersRecruiting = getStorageServersRecruiting(cx, distributorWorker, distributorUID);
versionOffset = getVersionOffset(cx, distributorWorker, dbInfo);
wait(success(dataInFlight) && success(tLogQueueInfo) && success(dataDistributionQueueSize) &&
success(teamCollectionValid) && success(storageQueueSize) && success(dataDistributionActive) &&
success(storageServersRecruiting));
success(storageServersRecruiting) && success(versionOffset));
TraceEvent(("QuietDatabase" + phase).c_str())
.detail("DataInFlight", dataInFlight.get())
@ -717,13 +747,17 @@ ACTOR Future<Void> waitForQuietDatabase(Database cx,
.detail("MaxStorageServerQueueGate", maxStorageServerQueueGate)
.detail("DataDistributionActive", dataDistributionActive.get())
.detail("StorageServersRecruiting", storageServersRecruiting.get())
.detail("RecoveryCount", dbInfo->get().recoveryCount)
.detail("VersionOffset", versionOffset.get())
.detail("NumSuccesses", numSuccesses);
maxVersionOffset += dbInfo->get().recoveryCount * SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT;
if (dataInFlight.get() > dataInFlightGate || tLogQueueInfo.get().first > maxTLogQueueGate ||
tLogQueueInfo.get().second > maxPoppedVersionLag ||
dataDistributionQueueSize.get() > maxDataDistributionQueueSize ||
storageQueueSize.get() > maxStorageServerQueueGate || !dataDistributionActive.get() ||
storageServersRecruiting.get() || !teamCollectionValid.get()) {
storageServersRecruiting.get() || versionOffset.get() > maxVersionOffset ||
!teamCollectionValid.get()) {
wait(delay(1.0));
numSuccesses = 0;
@ -779,6 +813,10 @@ ACTOR Future<Void> waitForQuietDatabase(Database cx,
auto key = "NotReady" + std::to_string(notReadyCount++);
evt.detail(key.c_str(), "storageServersRecruiting");
}
if (versionOffset.isReady() && versionOffset.isError()) {
auto key = "NotReady" + std::to_string(notReadyCount++);
evt.detail(key.c_str(), "versionOffset");
}
wait(delay(1.0));
numSuccesses = 0;
}
@ -794,7 +832,8 @@ Future<Void> quietDatabase(Database const& cx,
int64_t maxTLogQueueGate,
int64_t maxStorageServerQueueGate,
int64_t maxDataDistributionQueueSize,
int64_t maxPoppedVersionLag) {
int64_t maxPoppedVersionLag,
int64_t maxVersionOffset) {
return waitForQuietDatabase(cx,
dbInfo,
phase,
@ -802,5 +841,6 @@ Future<Void> quietDatabase(Database const& cx,
maxTLogQueueGate,
maxStorageServerQueueGate,
maxDataDistributionQueueSize,
maxPoppedVersionLag);
maxPoppedVersionLag,
maxVersionOffset);
}

View File

@ -46,9 +46,11 @@ Future<WorkerInterface> getMasterWorker(Database const& cx, Reference<AsyncVar<S
Future<Void> repairDeadDatacenter(Database const& cx,
Reference<AsyncVar<ServerDBInfo> const> const& dbInfo,
std::string const& context);
Future<std::vector<WorkerInterface>> getStorageWorkers(Database const& cx,
Reference<AsyncVar<ServerDBInfo> const> const& dbInfo,
bool const& localOnly);
// Returns list of worker interfaces for available storage servers and the number of unavailable
// storage servers
Future<std::pair<std::vector<WorkerInterface>, int>>
getStorageWorkers(Database const& cx, Reference<AsyncVar<ServerDBInfo> const> const& dbInfo, bool const& localOnly);
Future<std::vector<WorkerInterface>> getCoordWorkers(Database const& cx,
Reference<AsyncVar<ServerDBInfo> const> const& dbInfo);

File diff suppressed because it is too large Load Diff

View File

@ -767,11 +767,13 @@ struct InitializeStorageRequest {
Optional<std::pair<UID, Version>>
tssPairIDAndVersion; // Only set if recruiting a tss. Will be the UID and Version of its SS pair.
UID clusterId; // Unique cluster identifier. Only needed at recruitment, will be read from txnStateStore on recovery
Version initialClusterVersion;
ReplyPromise<InitializeStorageReply> reply;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, seedTag, reqId, interfaceId, storeType, reply, tssPairIDAndVersion, clusterId);
serializer(
ar, seedTag, reqId, interfaceId, storeType, reply, tssPairIDAndVersion, clusterId, initialClusterVersion);
}
};
@ -1086,6 +1088,7 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
StorageServerInterface ssi,
Tag seedTag,
UID clusterId,
Version startVersion,
Version tssSeedVersion,
ReplyPromise<InitializeStorageReply> recruitReply,
Reference<AsyncVar<ServerDBInfo> const> db,
@ -1142,6 +1145,10 @@ ACTOR Future<Void> backupWorker(BackupInterface bi,
void registerThreadForProfiling();
// Returns true if `address` is used in the db (indicated by `dbInfo`) transaction system and in the db's primary
// satellite DC.
bool addressInDbAndPrimarySatelliteDc(const NetworkAddress& address, Reference<AsyncVar<ServerDBInfo> const> dbInfo);
// Returns true if `address` is used in the db (indicated by `dbInfo`) transaction system and in the db's remote DC.
bool addressInDbAndRemoteDc(const NetworkAddress& address, Reference<AsyncVar<ServerDBInfo> const> dbInfo);

View File

@ -18,6 +18,7 @@
* limitations under the License.
*/
#include <algorithm>
#include <iterator>
#include "fdbrpc/sim_validation.h"
@ -47,6 +48,7 @@ struct MasterData : NonCopyable, ReferenceCounted<MasterData> {
Version version; // The last version assigned to a proxy by getVersion()
double lastVersionTime;
Optional<Version> referenceVersion;
std::map<UID, CommitProxyVersionReplies> lastCommitProxyVersionReplies;
@ -125,12 +127,36 @@ ACTOR Future<Void> getVersion(Reference<MasterData> self, GetCommitVersionReques
if (BUGGIFY) {
t1 = self->lastVersionTime;
}
rep.prevVersion = self->version;
self->version +=
// Versions should roughly follow wall-clock time, based on the
// system clock of the current machine and an FDB-specific epoch.
// Calculate the expected version and determine whether we need to
// hand out versions faster or slower to stay in sync with the
// clock.
Version toAdd =
std::max<Version>(1,
std::min<Version>(SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS,
SERVER_KNOBS->VERSIONS_PER_SECOND * (t1 - self->lastVersionTime)));
rep.prevVersion = self->version;
if (self->referenceVersion.present()) {
Version expected =
g_network->timer() * SERVER_KNOBS->VERSIONS_PER_SECOND - self->referenceVersion.get();
// Attempt to jump directly to the expected version. But make
// sure that versions are still being handed out at a rate
// around VERSIONS_PER_SECOND. This rate is scaled depending on
// how far off the calculated version is from the expected
// version.
int64_t maxOffset = std::min(static_cast<int64_t>(toAdd * SERVER_KNOBS->MAX_VERSION_RATE_MODIFIER),
SERVER_KNOBS->MAX_VERSION_RATE_OFFSET);
self->version =
std::clamp(expected, self->version + toAdd - maxOffset, self->version + toAdd + maxOffset);
ASSERT_GT(self->version, rep.prevVersion);
} else {
self->version = self->version + toAdd;
}
TEST(self->version - rep.prevVersion == 1); // Minimum possible version gap
bool maxVersionGap = self->version - rep.prevVersion == SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS;
@ -214,7 +240,8 @@ ACTOR Future<Void> updateRecoveryData(Reference<MasterData> self) {
TraceEvent("UpdateRecoveryData", self->dbgid)
.detail("RecoveryTxnVersion", req.recoveryTransactionVersion)
.detail("LastEpochEnd", req.lastEpochEnd)
.detail("NumCommitProxies", req.commitProxies.size());
.detail("NumCommitProxies", req.commitProxies.size())
.detail("VersionEpoch", req.versionEpoch);
if (self->recoveryTransactionVersion == invalidVersion ||
req.recoveryTransactionVersion > self->recoveryTransactionVersion) {
@ -230,6 +257,16 @@ ACTOR Future<Void> updateRecoveryData(Reference<MasterData> self) {
self->lastCommitProxyVersionReplies[p.id()] = CommitProxyVersionReplies();
}
}
if (req.versionEpoch.present()) {
self->referenceVersion = req.versionEpoch.get();
} else if (BUGGIFY) {
// Cannot use a positive version epoch in simulation because of the
// clock starting at 0. A positive version epoch would mean the initial
// cluster version was negative.
// TODO: Increase the size of this interval after fixing the issue
// with restoring ranges with large version gaps.
self->referenceVersion = deterministicRandom()->randomInt64(-1e6, 0);
}
self->resolutionBalancer.setCommitProxies(req.commitProxies);
self->resolutionBalancer.setResolvers(req.resolvers);

View File

@ -440,6 +440,8 @@ struct ChangeFeedInfo : ReferenceCounted<ChangeFeedInfo> {
Version metadataCreateVersion = invalidVersion;
bool removing = false;
bool destroyed = false;
bool possiblyDestroyed = false;
KeyRangeMap<std::unordered_map<UID, Promise<Void>>> moveTriggers;
@ -472,6 +474,13 @@ struct ChangeFeedInfo : ReferenceCounted<ChangeFeedInfo> {
}
// TODO: may be more cleanup possible here
}
void destroy(Version destroyVersion) {
removing = true;
destroyed = true;
moved(range);
newMutations.trigger();
}
};
class ServerWatchMetadata : public ReferenceCounted<ServerWatchMetadata> {
@ -795,6 +804,9 @@ public:
Reference<ILogSystem::IPeekCursor> logCursor;
Promise<UID> clusterId;
// The version the cluster starts on. This value is not persisted and may
// not be valid after a recovery.
Version initialClusterVersion = invalidVersion;
UID thisServerID;
Optional<UID> tssPairID; // if this server is a tss, this is the id of its (ss) pair
Optional<UID> ssPairID; // if this server is an ss, this is the id of its (tss) pair
@ -1908,6 +1920,12 @@ ACTOR Future<Void> overlappingChangeFeedsQ(StorageServer* data, OverlappingChang
for (auto& it : rangeIds) {
reply.rangeIds.push_back(OverlappingChangeFeedEntry(
it.first, std::get<0>(it.second), std::get<1>(it.second), std::get<2>(it.second)));
TraceEvent(SevDebug, "OverlappingChangeFeedEntry", data->thisServerID)
.detail("MinVersion", req.minVersion)
.detail("FeedID", it.first)
.detail("Range", std::get<0>(it.second))
.detail("EmptyVersion", std::get<1>(it.second))
.detail("StopVersion", std::get<2>(it.second));
}
// Make sure all of the metadata we are sending won't get rolled back
@ -4699,6 +4717,9 @@ ACTOR Future<Void> tryGetRange(PromiseStream<RangeResult> results, Transaction*
}
}
// global validation that missing refreshed feeds were previously destroyed
static std::unordered_set<Key> allDestroyedChangeFeeds;
// We have to store the version the change feed was stopped at in the SS instead of just the stopped status
// In addition to simplifying stopping logic, it enables communicating stopped status when fetching change feeds
// from other SS correctly
@ -4739,33 +4760,35 @@ ACTOR Future<Void> changeFeedPopQ(StorageServer* self, ChangeFeedPopRequest req)
.detail("RangeID", req.rangeID.printable())
.detail("Version", req.version)
.detail("SSVersion", self->version.get())
.detail("Range", req.range.toString());
.detail("Range", req.range);
if (req.version - 1 > feed->second->emptyVersion) {
feed->second->emptyVersion = req.version - 1;
while (!feed->second->mutations.empty() && feed->second->mutations.front().version < req.version) {
feed->second->mutations.pop_front();
}
Version durableVersion = self->data().getLatestVersion();
auto& mLV = self->addVersionToMutationLog(durableVersion);
self->addMutationToMutationLog(
mLV,
MutationRef(
MutationRef::SetValue,
persistChangeFeedKeys.begin.toString() + feed->second->id.toString(),
changeFeedSSValue(feed->second->range, feed->second->emptyVersion + 1, feed->second->stopVersion)));
if (feed->second->storageVersion != invalidVersion) {
++self->counters.kvSystemClearRanges;
self->addMutationToMutationLog(mLV,
MutationRef(MutationRef::ClearRange,
changeFeedDurableKey(feed->second->id, 0),
changeFeedDurableKey(feed->second->id, req.version)));
if (req.version > feed->second->storageVersion) {
feed->second->storageVersion = invalidVersion;
feed->second->durableVersion = invalidVersion;
if (!feed->second->destroyed) {
Version durableVersion = self->data().getLatestVersion();
auto& mLV = self->addVersionToMutationLog(durableVersion);
self->addMutationToMutationLog(
mLV,
MutationRef(
MutationRef::SetValue,
persistChangeFeedKeys.begin.toString() + feed->second->id.toString(),
changeFeedSSValue(feed->second->range, feed->second->emptyVersion + 1, feed->second->stopVersion)));
if (feed->second->storageVersion != invalidVersion) {
++self->counters.kvSystemClearRanges;
self->addMutationToMutationLog(mLV,
MutationRef(MutationRef::ClearRange,
changeFeedDurableKey(feed->second->id, 0),
changeFeedDurableKey(feed->second->id, req.version)));
if (req.version > feed->second->storageVersion) {
feed->second->storageVersion = invalidVersion;
feed->second->durableVersion = invalidVersion;
}
}
wait(self->durableVersion.whenAtLeast(durableVersion));
}
wait(self->durableVersion.whenAtLeast(durableVersion));
}
req.reply.send(Void());
return Void();
@ -4944,7 +4967,9 @@ ACTOR Future<Version> fetchChangeFeedApplier(StorageServer* data,
.errorUnsuppressed(e)
.detail("RangeID", rangeId.printable())
.detail("Range", range.toString())
.detail("EndVersion", endVersion);
.detail("EndVersion", endVersion)
.detail("Removing", changeFeedInfo->removing)
.detail("Destroyed", changeFeedInfo->destroyed);
throw;
}
}
@ -5041,6 +5066,7 @@ ACTOR Future<Version> fetchChangeFeed(StorageServer* data,
}
}
state bool seenNotRegistered = false;
loop {
try {
Version maxFetched = wait(fetchChangeFeedApplier(data,
@ -5057,19 +5083,110 @@ ACTOR Future<Version> fetchChangeFeed(StorageServer* data,
throw;
}
}
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
// TODO REMOVE
fmt::print("DBG: SS {} Feed {} possibly destroyed {}, {} metadata create, {} desired committed\n",
data->thisServerID.toString().substr(0, 4),
changeFeedInfo->id.printable(),
changeFeedInfo->possiblyDestroyed,
changeFeedInfo->metadataCreateVersion,
data->desiredOldestVersion.get());
// There are two reasons for change_feed_not_registered:
// 1. The feed was just created, but the ss mutation stream is ahead of the GRV that fetchChangeFeedApplier
// uses to read the change feed data from the database. In this case we need to wait and retry
// 2. The feed was destroyed, but we missed a metadata update telling us this. In this case we need to destroy
// the feed
// endVersion >= the metadata create version, so we can safely use it as a proxy
if (beginVersion != 0 || seenNotRegistered || endVersion <= data->desiredOldestVersion.get()) {
// If any of these are true, the feed must be destroyed.
Version cleanupVersion = data->data().getLatestVersion();
TraceEvent(SevDebug, "DestroyingChangeFeedFromFetch", data->thisServerID)
.detail("RangeID", changeFeedInfo->id.printable())
.detail("Range", changeFeedInfo->range.toString())
.detail("Version", cleanupVersion);
if (g_network->isSimulated()) {
ASSERT(allDestroyedChangeFeeds.count(changeFeedInfo->id));
}
Key beginClearKey = changeFeedInfo->id.withPrefix(persistChangeFeedKeys.begin);
auto& mLV = data->addVersionToMutationLog(cleanupVersion);
data->addMutationToMutationLog(
mLV, MutationRef(MutationRef::ClearRange, beginClearKey, keyAfter(beginClearKey)));
++data->counters.kvSystemClearRanges;
data->addMutationToMutationLog(mLV,
MutationRef(MutationRef::ClearRange,
changeFeedDurableKey(changeFeedInfo->id, 0),
changeFeedDurableKey(changeFeedInfo->id, cleanupVersion)));
++data->counters.kvSystemClearRanges;
changeFeedInfo->destroy(cleanupVersion);
data->changeFeedCleanupDurable[changeFeedInfo->id] = cleanupVersion;
for (auto& it : data->changeFeedRemovals) {
it.second.send(changeFeedInfo->id);
}
return invalidVersion;
}
// otherwise assume the feed just hasn't been created on the SS we tried to read it from yet, wait for it to
// definitely be committed and retry
seenNotRegistered = true;
wait(data->desiredOldestVersion.whenAtLeast(endVersion));
}
}
ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
KeyRange keys,
Version fetchVersion,
PromiseStream<Key> removals) {
PromiseStream<Key> removals,
UID fetchKeysID) {
// Wait for current TLog batch to finish to ensure that we're fetching metadata at a version >= the version of the
// ChangeServerKeys mutation. This guarantees we don't miss any metadata between the previous batch's version
// (data->version) and the mutation version.
wait(data->version.whenAtLeast(data->version.get() + 1));
state Version fetchVersion = data->version.get();
TraceEvent(SevDebug, "FetchChangeFeedMetadata", data->thisServerID)
.detail("Range", keys.toString())
.detail("FetchVersion", fetchVersion);
state std::vector<OverlappingChangeFeedEntry> feeds =
wait(data->cx->getOverlappingChangeFeeds(keys, fetchVersion + 1));
.detail("Range", keys)
.detail("FetchVersion", fetchVersion)
.detail("FKID", fetchKeysID);
state std::set<Key> refreshedFeedIds;
state std::set<Key> destroyedFeedIds;
// before fetching feeds from other SS's, refresh any feeds we already have that are being marked as removed
auto ranges = data->keyChangeFeed.intersectingRanges(keys);
for (auto& r : ranges) {
for (auto& cfInfo : r.value()) {
auto feedCleanup = data->changeFeedCleanupDurable.find(cfInfo->id);
if (feedCleanup != data->changeFeedCleanupDurable.end() && cfInfo->removing && !cfInfo->destroyed) {
TEST(true); // re-fetching feed scheduled for deletion! Un-mark it as removing
destroyedFeedIds.insert(cfInfo->id);
cfInfo->removing = false;
// because we now have a gap in the metadata, it's possible this feed was destroyed
cfInfo->possiblyDestroyed = true;
// reset fetch versions because everything previously fetched was cleaned up
cfInfo->fetchVersion = invalidVersion;
cfInfo->durableFetchVersion = NotifiedVersion();
TraceEvent(SevDebug, "ResetChangeFeedInfo", data->thisServerID)
.detail("RangeID", cfInfo->id.printable())
.detail("Range", cfInfo->range)
.detail("FetchVersion", fetchVersion)
.detail("EmptyVersion", cfInfo->emptyVersion)
.detail("StopVersion", cfInfo->stopVersion)
.detail("FKID", fetchKeysID);
}
}
}
state std::vector<OverlappingChangeFeedEntry> feeds = wait(data->cx->getOverlappingChangeFeeds(keys, fetchVersion));
// handle change feeds removed while fetching overlapping
while (removals.getFuture().isReady()) {
Key remove = waitNext(removals.getFuture());
for (int i = 0; i < feeds.size(); i++) {
@ -5078,6 +5195,7 @@ ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
}
}
}
std::vector<Key> feedIds;
feedIds.reserve(feeds.size());
// create change feed metadata if it does not exist
@ -5090,16 +5208,23 @@ ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
TraceEvent(SevDebug, "FetchedChangeFeedInfo", data->thisServerID)
.detail("RangeID", cfEntry.rangeId.printable())
.detail("Range", cfEntry.range.toString())
.detail("Range", cfEntry.range)
.detail("FetchVersion", fetchVersion)
.detail("EmptyVersion", cfEntry.emptyVersion)
.detail("StopVersion", cfEntry.stopVersion)
.detail("Existing", existing)
.detail("CleanupPendingVersion", cleanupPending ? cleanupEntry->second : invalidVersion);
.detail("CleanupPendingVersion", cleanupPending ? cleanupEntry->second : invalidVersion)
.detail("FKID", fetchKeysID);
bool addMutationToLog = false;
Reference<ChangeFeedInfo> changeFeedInfo;
auto fid = destroyedFeedIds.find(cfEntry.rangeId);
if (fid != destroyedFeedIds.end()) {
refreshedFeedIds.insert(cfEntry.rangeId);
destroyedFeedIds.erase(fid);
}
if (!existing) {
TEST(cleanupPending); // Fetch change feed which is cleanup pending. This means there was a move away and a
// move back, this will remake the metadata
@ -5120,30 +5245,26 @@ ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
addMutationToLog = true;
} else {
changeFeedInfo = existingEntry->second;
auto feedCleanup = data->changeFeedCleanupDurable.find(cfEntry.rangeId);
if (changeFeedInfo->destroyed) {
// race where multiple feeds fetched overlapping change feed, one realized feed was missing and marked
// it removed+destroyed, then this one fetched the same info
continue;
}
// we checked all feeds we already owned in this range at the start to reset them if they were removing, and
// this actor would have been cancelled if a later remove happened
ASSERT(!changeFeedInfo->removing);
if (cfEntry.stopVersion < changeFeedInfo->stopVersion) {
TEST(true); // Change feed updated stop version from fetch metadata
changeFeedInfo->stopVersion = cfEntry.stopVersion;
addMutationToLog = true;
}
if (feedCleanup != data->changeFeedCleanupDurable.end() && changeFeedInfo->removing) {
TEST(true); // re-fetching feed scheduled for deletion! Un-mark it as removing
if (cfEntry.emptyVersion < data->version.get()) {
changeFeedInfo->emptyVersion = cfEntry.emptyVersion;
}
changeFeedInfo->removing = false;
// reset fetch versions because everything previously fetched was cleaned up
changeFeedInfo->fetchVersion = invalidVersion;
changeFeedInfo->durableFetchVersion = NotifiedVersion();
// Since cleanup put a mutation in the log to delete the change feed data, put one in the log to restore
// it
// We may just want to refactor this so updateStorage does explicit deletes based on
// changeFeedCleanupDurable and not use the mutation log at all for the change feed metadata cleanup.
// Then we wouldn't have to reset anything here
// don't update empty version past SS version if SS is behind, it can cause issues
if (cfEntry.emptyVersion < data->version.get() && cfEntry.emptyVersion > changeFeedInfo->emptyVersion) {
TEST(true); // Change feed updated empty version from fetch metadata
changeFeedInfo->emptyVersion = cfEntry.emptyVersion;
addMutationToLog = true;
}
}
@ -5163,6 +5284,84 @@ ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
}
}
}
TEST(!refreshedFeedIds.empty()); // Feed refreshed between move away and move back
TEST(!destroyedFeedIds.empty()); // Feed destroyed between move away and move back
for (auto& feedId : refreshedFeedIds) {
auto existingEntry = data->uidChangeFeed.find(feedId);
if (existingEntry == data->uidChangeFeed.end() || existingEntry->second->destroyed) {
TEST(true); // feed refreshed
continue;
}
// Since cleanup put a mutation in the log to delete the change feed data, put one in the log to restore
// it
// We may just want to refactor this so updateStorage does explicit deletes based on
// changeFeedCleanupDurable and not use the mutation log at all for the change feed metadata cleanup.
// Then we wouldn't have to reset anything here or above
// Do the mutation log update here instead of above to ensure we only add it back to the mutation log if we're
// sure it wasn't deleted in the metadata gap
Version metadataVersion = data->data().getLatestVersion();
auto& mLV = data->addVersionToMutationLog(metadataVersion);
data->addMutationToMutationLog(
mLV,
MutationRef(MutationRef::SetValue,
persistChangeFeedKeys.begin.toString() + existingEntry->second->id.toString(),
changeFeedSSValue(existingEntry->second->range,
existingEntry->second->emptyVersion + 1,
existingEntry->second->stopVersion)));
TraceEvent(SevDebug, "PersistingResetChangeFeedInfo", data->thisServerID)
.detail("RangeID", existingEntry->second->id.printable())
.detail("Range", existingEntry->second->range)
.detail("FetchVersion", fetchVersion)
.detail("EmptyVersion", existingEntry->second->emptyVersion)
.detail("StopVersion", existingEntry->second->stopVersion)
.detail("FKID", fetchKeysID)
.detail("MetadataVersion", metadataVersion);
}
for (auto& feedId : destroyedFeedIds) {
auto existingEntry = data->uidChangeFeed.find(feedId);
if (existingEntry == data->uidChangeFeed.end() || existingEntry->second->destroyed) {
TEST(true); // feed refreshed but then destroyed elsewhere
continue;
}
// TODO REMOVE print
fmt::print("DBG: SS {} fetching feed {} was refreshed but not present!! assuming destroyed\n",
data->thisServerID.toString().substr(0, 4),
feedId.printable());
Version cleanupVersion = data->data().getLatestVersion();
TraceEvent(SevDebug, "DestroyingChangeFeedFromFetchMetadata", data->thisServerID)
.detail("RangeID", feedId.printable())
.detail("Range", existingEntry->second->range)
.detail("Version", cleanupVersion)
.detail("FKID", fetchKeysID);
if (g_network->isSimulated()) {
ASSERT(allDestroyedChangeFeeds.count(feedId));
}
Key beginClearKey = feedId.withPrefix(persistChangeFeedKeys.begin);
auto& mLV = data->addVersionToMutationLog(cleanupVersion);
data->addMutationToMutationLog(mLV,
MutationRef(MutationRef::ClearRange, beginClearKey, keyAfter(beginClearKey)));
++data->counters.kvSystemClearRanges;
data->addMutationToMutationLog(mLV,
MutationRef(MutationRef::ClearRange,
changeFeedDurableKey(feedId, 0),
changeFeedDurableKey(feedId, cleanupVersion)));
++data->counters.kvSystemClearRanges;
existingEntry->second->destroy(cleanupVersion);
data->changeFeedCleanupDurable[feedId] = cleanupVersion;
for (auto& it : data->changeFeedRemovals) {
it.second.send(feedId);
}
}
return feedIds;
}
@ -5218,7 +5417,6 @@ ACTOR Future<std::unordered_map<Key, Version>> dispatchChangeFeeds(StorageServer
}
}
if (done) {
data->changeFeedRemovals.erase(fetchKeysID);
return feedMaxFetched;
}
}
@ -5283,8 +5481,7 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
state PromiseStream<Key> removals;
data->changeFeedRemovals[fetchKeysID] = removals;
state Future<std::vector<Key>> fetchCFMetadata =
fetchChangeFeedMetadata(data, keys, data->version.get(), removals);
state Future<std::vector<Key>> fetchCFMetadata = fetchChangeFeedMetadata(data, keys, removals, fetchKeysID);
validate(data);
@ -5629,6 +5826,8 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
}
}
data->changeFeedRemovals.erase(fetchKeysID);
shard->phase = AddingShard::Waiting;
// Similar to transferred version, but wait for all feed data and
@ -5849,7 +6048,8 @@ void changeServerKeys(StorageServer* data,
data->watches.triggerRange(range.begin, range.end);
} else if (!dataAvailable) {
// SOMEDAY: Avoid restarting adding/transferred shards
if (version == 0) { // bypass fetchkeys; shard is known empty at version 0
// bypass fetchkeys; shard is known empty at initial cluster version
if (version == data->initialClusterVersion - 1) {
TraceEvent("ChangeServerKeysInitialRange", data->thisServerID)
.detail("Begin", range.begin)
.detail("End", range.end);
@ -5940,7 +6140,6 @@ void changeServerKeys(StorageServer* data,
auto feed = data->uidChangeFeed.find(f.first);
if (feed != data->uidChangeFeed.end()) {
feed->second->emptyVersion = version - 1;
feed->second->removing = true;
feed->second->moved(feed->second->range);
feed->second->newMutations.trigger();
@ -6242,7 +6441,10 @@ private:
feed->second->durableVersion = invalidVersion;
}
}
addMutationToLog = true;
if (!feed->second->destroyed) {
// if feed is destroyed, adding an extra mutation here would re-create it if SS restarted
addMutationToLog = true;
}
}
} else if (status == ChangeFeedStatus::CHANGE_FEED_CREATE && createdFeed) {
@ -6278,13 +6480,12 @@ private:
changeFeedDurableKey(feed->second->id, currentVersion)));
++data->counters.kvSystemClearRanges;
feed->second->emptyVersion = currentVersion - 1;
feed->second->stopVersion = currentVersion;
feed->second->removing = true;
feed->second->moved(feed->second->range);
feed->second->newMutations.trigger();
feed->second->destroy(currentVersion);
data->changeFeedCleanupDurable[feed->first] = cleanupVersion;
if (g_network->isSimulated()) {
allDestroyedChangeFeeds.insert(changeFeedId);
}
}
if (status == ChangeFeedStatus::CHANGE_FEED_DESTROY) {
@ -6734,7 +6935,7 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
.detail("Version", cloneCursor2->version().toString());
} else if (ver != invalidVersion) { // This change belongs to a version < minVersion
DEBUG_MUTATION("SSPeek", ver, msg, data->thisServerID);
if (ver == 1) {
if (ver == data->initialClusterVersion) {
//TraceEvent("SSPeekMutation", data->thisServerID).log();
// The following trace event may produce a value with special characters
TraceEvent("SSPeekMutation", data->thisServerID)
@ -6850,6 +7051,7 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
proposedOldestVersion = std::min(proposedOldestVersion, data->version.get() - 1);
proposedOldestVersion = std::max(proposedOldestVersion, data->oldestVersion.get());
proposedOldestVersion = std::max(proposedOldestVersion, data->desiredOldestVersion.get());
proposedOldestVersion = std::max(proposedOldestVersion, data->initialClusterVersion);
//TraceEvent("StorageServerUpdated", data->thisServerID).detail("Ver", ver).detail("DataVersion", data->version.get())
// .detail("LastTLogVersion", data->lastTLogVersion).detail("NewOldest",
@ -8715,6 +8917,7 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
StorageServerInterface ssi,
Tag seedTag,
UID clusterId,
Version startVersion,
Version tssSeedVersion,
ReplyPromise<InitializeStorageReply> recruitReply,
Reference<AsyncVar<ServerDBInfo> const> db,
@ -8722,6 +8925,7 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
state StorageServer self(persistentData, db, ssi);
state Future<Void> ssCore;
self.clusterId.send(clusterId);
self.initialClusterVersion = startVersion;
if (ssi.isTss()) {
self.setTssPair(ssi.tssPairID.get());
ASSERT(self.isTss());

View File

@ -778,6 +778,82 @@ TEST_CASE("/fdbserver/worker/addressInDbAndPrimaryDc") {
} // namespace
// Returns true if `address` is used in the db (indicated by `dbInfo`) transaction system and in the db's primary
// satellite DC.
// Returns true if `address` is used in the db (indicated by `dbInfo`) transaction system and belongs to
// the db's primary satellite DC, i.e. it is the address of a tlog in a local log set with satellite locality.
bool addressInDbAndPrimarySatelliteDc(const NetworkAddress& address, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
    for (const auto& tLogSet : dbInfo->get().logSystemConfig.tLogs) {
        // Only local log sets tagged with satellite locality are relevant here.
        if (!tLogSet.isLocal || tLogSet.locality != tagLocalitySatellite) {
            continue;
        }
        for (const auto& tLogInterf : tLogSet.tLogs) {
            if (!tLogInterf.present()) {
                continue;
            }
            if (tLogInterf.interf().addresses().contains(address)) {
                return true;
            }
        }
    }
    return false;
}
// Returns true if any address in `addresses` (primary, or the optional secondary) belongs to the
// db's primary satellite DC transaction system.
bool addressesInDbAndPrimarySatelliteDc(const NetworkAddressList& addresses,
                                        Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
    // Check the primary address first; fall back to the secondary address when present.
    if (addressInDbAndPrimarySatelliteDc(addresses.address, dbInfo)) {
        return true;
    }
    if (!addresses.secondaryAddress.present()) {
        return false;
    }
    return addressInDbAndPrimarySatelliteDc(addresses.secondaryAddress.get(), dbInfo);
}
namespace {
// Unit test for addressInDbAndPrimarySatelliteDc: builds a synthetic ServerDBInfo and checks that
// only tlog addresses registered in a local, satellite-locality log set are reported as being in
// the primary satellite DC.
TEST_CASE("/fdbserver/worker/addressInDbAndPrimarySatelliteDc") {
    // Setup a ServerDBInfo for test.
    ServerDBInfo testDbInfo;
    LocalityData testLocal;
    testLocal.set(LiteralStringRef("dcid"), StringRef(std::to_string(1)));
    testDbInfo.master.locality = testLocal;

    // First, create an empty TLogInterface, and check that it shouldn't be considered as in satellite DC.
    testDbInfo.logSystemConfig.tLogs.push_back(TLogSet());
    testDbInfo.logSystemConfig.tLogs.back().isLocal = true;
    testDbInfo.logSystemConfig.tLogs.back().locality = tagLocalitySatellite;
    testDbInfo.logSystemConfig.tLogs.back().tLogs.push_back(OptionalInterface<TLogInterface>());
    ASSERT(!addressInDbAndPrimarySatelliteDc(g_network->getLocalAddress(),
                                             makeReference<AsyncVar<ServerDBInfo>>(testDbInfo)));

    // Create a satellite tlog, and it should be considered as in primary satellite DC.
    NetworkAddress satelliteTLogAddress(IPAddress(0x13131313), 1);
    TLogInterface satelliteTLog(testLocal);
    satelliteTLog.initEndpoints();
    satelliteTLog.peekMessages = RequestStream<struct TLogPeekRequest>(Endpoint({ satelliteTLogAddress }, UID(1, 2)));
    testDbInfo.logSystemConfig.tLogs.back().tLogs.push_back(OptionalInterface(satelliteTLog));
    ASSERT(addressInDbAndPrimarySatelliteDc(satelliteTLogAddress, makeReference<AsyncVar<ServerDBInfo>>(testDbInfo)));

    // Create a primary TLog (local log set without satellite locality), and it shouldn't be considered
    // as in primary Satellite DC.
    NetworkAddress primaryTLogAddress(IPAddress(0x26262626), 1);
    testDbInfo.logSystemConfig.tLogs.push_back(TLogSet());
    testDbInfo.logSystemConfig.tLogs.back().isLocal = true;
    TLogInterface primaryTLog(testLocal);
    primaryTLog.initEndpoints();
    primaryTLog.peekMessages = RequestStream<struct TLogPeekRequest>(Endpoint({ primaryTLogAddress }, UID(1, 2)));
    testDbInfo.logSystemConfig.tLogs.back().tLogs.push_back(OptionalInterface(primaryTLog));
    ASSERT(!addressInDbAndPrimarySatelliteDc(primaryTLogAddress, makeReference<AsyncVar<ServerDBInfo>>(testDbInfo)));

    // Create a remote TLog (non-local log set), which is in the remote DC, so it should not be treated
    // as in the primary satellite DC.
    NetworkAddress remoteTLogAddress(IPAddress(0x37373737), 1);
    LocalityData fakeRemote;
    fakeRemote.set(LiteralStringRef("dcid"), StringRef(std::to_string(2)));
    TLogInterface remoteTLog(fakeRemote);
    remoteTLog.initEndpoints();
    remoteTLog.peekMessages = RequestStream<struct TLogPeekRequest>(Endpoint({ remoteTLogAddress }, UID(1, 2)));
    testDbInfo.logSystemConfig.tLogs.push_back(TLogSet());
    testDbInfo.logSystemConfig.tLogs.back().isLocal = false;
    testDbInfo.logSystemConfig.tLogs.back().tLogs.push_back(OptionalInterface(remoteTLog));
    ASSERT(!addressInDbAndPrimarySatelliteDc(remoteTLogAddress, makeReference<AsyncVar<ServerDBInfo>>(testDbInfo)));
    return Void();
}
} // namespace
bool addressInDbAndRemoteDc(const NetworkAddress& address, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
const auto& dbi = dbInfo->get();
@ -872,17 +948,15 @@ ACTOR Future<Void> healthMonitor(Reference<AsyncVar<Optional<ClusterControllerFu
const auto& allPeers = FlowTransport::transport().getAllPeers();
UpdateWorkerHealthRequest req;
bool workerInDb = false;
bool workerInPrimary = false;
enum WorkerLocation { None, Primary, Remote };
WorkerLocation workerLocation = None;
if (addressesInDbAndPrimaryDc(interf.addresses(), dbInfo)) {
workerInDb = true;
workerInPrimary = true;
workerLocation = Primary;
} else if (addressesInDbAndRemoteDc(interf.addresses(), dbInfo)) {
workerInDb = true;
workerInPrimary = false;
workerLocation = Remote;
}
if (workerInDb) {
if (workerLocation != None) {
for (const auto& [address, peer] : allPeers) {
if (peer->connectFailedCount == 0 &&
peer->pingLatencies.getPopulationSize() < SERVER_KNOBS->PEER_LATENCY_CHECK_MIN_POPULATION) {
@ -895,37 +969,50 @@ ACTOR Future<Void> healthMonitor(Reference<AsyncVar<Optional<ClusterControllerFu
// last ping latencies logged.
continue;
}
if ((workerInPrimary && addressInDbAndPrimaryDc(address, dbInfo)) ||
(!workerInPrimary && addressInDbAndRemoteDc(address, dbInfo))) {
                    // Only monitor the servers that are in the primary or remote DC's transaction systems.
                    // Note that we currently do not monitor storage servers, since lagging storage servers
                    // today can already trigger server exclusion by the data distributor.
bool degradedPeer = false;
if ((workerLocation == Primary && addressInDbAndPrimaryDc(address, dbInfo)) ||
(workerLocation == Remote && addressInDbAndRemoteDc(address, dbInfo))) {
                    // Monitors intra-DC latencies between servers that are in the primary or remote DC's transaction
                    // systems. Note that we currently do not monitor storage servers, since lagging storage
                    // servers today can already trigger server exclusion by the data distributor.
if (peer->connectFailedCount >= SERVER_KNOBS->PEER_DEGRADATION_CONNECTION_FAILURE_COUNT ||
peer->pingLatencies.percentile(SERVER_KNOBS->PEER_LATENCY_DEGRADATION_PERCENTILE) >
SERVER_KNOBS->PEER_LATENCY_DEGRADATION_THRESHOLD ||
peer->timeoutCount / (double)(peer->pingLatencies.getPopulationSize()) >
SERVER_KNOBS->PEER_TIMEOUT_PERCENTAGE_DEGRADATION_THRESHOLD) {
// This is a degraded peer.
TraceEvent("HealthMonitorDetectDegradedPeer")
.suppressFor(30)
.detail("Peer", address)
.detail("Elapsed", now() - peer->lastLoggedTime)
.detail("MinLatency", peer->pingLatencies.min())
.detail("MaxLatency", peer->pingLatencies.max())
.detail("MeanLatency", peer->pingLatencies.mean())
.detail("MedianLatency", peer->pingLatencies.median())
.detail("CheckedPercentile", SERVER_KNOBS->PEER_LATENCY_DEGRADATION_PERCENTILE)
.detail(
"CheckedPercentileLatency",
peer->pingLatencies.percentile(SERVER_KNOBS->PEER_LATENCY_DEGRADATION_PERCENTILE))
.detail("PingCount", peer->pingLatencies.getPopulationSize())
.detail("PingTimeoutCount", peer->timeoutCount)
.detail("ConnectionFailureCount", peer->connectFailedCount);
req.degradedPeers.push_back(address);
degradedPeer = true;
}
} else if (workerLocation == Primary && addressInDbAndPrimarySatelliteDc(address, dbInfo)) {
// Monitors inter DC latencies between servers in primary and primary satellite DC. Note that
// TLog workers in primary satellite DC are on the critical path of serving a commit.
if (peer->connectFailedCount >= SERVER_KNOBS->PEER_DEGRADATION_CONNECTION_FAILURE_COUNT ||
peer->pingLatencies.percentile(
SERVER_KNOBS->PEER_LATENCY_DEGRADATION_PERCENTILE_SATELLITE) >
SERVER_KNOBS->PEER_LATENCY_DEGRADATION_THRESHOLD_SATELLITE ||
peer->timeoutCount / (double)(peer->pingLatencies.getPopulationSize()) >
SERVER_KNOBS->PEER_TIMEOUT_PERCENTAGE_DEGRADATION_THRESHOLD) {
degradedPeer = true;
}
}
if (degradedPeer) {
TraceEvent("HealthMonitorDetectDegradedPeer")
.suppressFor(30)
.detail("Peer", address)
.detail("Elapsed", now() - peer->lastLoggedTime)
.detail("MinLatency", peer->pingLatencies.min())
.detail("MaxLatency", peer->pingLatencies.max())
.detail("MeanLatency", peer->pingLatencies.mean())
.detail("MedianLatency", peer->pingLatencies.median())
.detail("CheckedPercentile", SERVER_KNOBS->PEER_LATENCY_DEGRADATION_PERCENTILE)
.detail("CheckedPercentileLatency",
peer->pingLatencies.percentile(SERVER_KNOBS->PEER_LATENCY_DEGRADATION_PERCENTILE))
.detail("PingCount", peer->pingLatencies.getPopulationSize())
.detail("PingTimeoutCount", peer->timeoutCount)
.detail("ConnectionFailureCount", peer->connectFailedCount);
req.degradedPeers.push_back(address);
}
}
@ -941,8 +1028,9 @@ ACTOR Future<Void> healthMonitor(Reference<AsyncVar<Optional<ClusterControllerFu
continue;
}
if ((workerInPrimary && addressInDbAndPrimaryDc(address, dbInfo)) ||
(!workerInPrimary && addressInDbAndRemoteDc(address, dbInfo))) {
if ((workerLocation == Primary && addressInDbAndPrimaryDc(address, dbInfo)) ||
(workerLocation == Remote && addressInDbAndRemoteDc(address, dbInfo)) ||
(workerLocation == Primary && addressInDbAndPrimarySatelliteDc(address, dbInfo))) {
TraceEvent("HealthMonitorDetectRecentClosedPeer").suppressFor(30).detail("Peer", address);
req.degradedPeers.push_back(address);
}
@ -2095,6 +2183,7 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
[&req](const auto& p) { return p.second != req.storeType; }) ||
req.seedTag != invalidTag)) {
ASSERT(req.clusterId.isValid());
ASSERT(req.initialClusterVersion >= 0);
LocalLineage _;
getCurrentLineage()->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Storage;
bool isTss = req.tssPairIDAndVersion.present();
@ -2156,6 +2245,7 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
recruited,
req.seedTag,
req.clusterId,
req.initialClusterVersion,
isTss ? req.tssPairIDAndVersion.get().second : 0,
storageReady,
dbInfo,

View File

@ -62,8 +62,9 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
int64_t timeTravelTooOld = 0;
int64_t rowsRead = 0;
int64_t bytesRead = 0;
int64_t purges = 0;
std::vector<Future<Void>> clients;
bool enablePruning;
bool enablePurging;
DatabaseConfiguration config;
@ -79,7 +80,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
timeTravelLimit = getOption(options, LiteralStringRef("timeTravelLimit"), testDuration);
timeTravelBufferSize = getOption(options, LiteralStringRef("timeTravelBufferSize"), 100000000);
threads = getOption(options, LiteralStringRef("threads"), 1);
enablePruning = getOption(options, LiteralStringRef("enablePruning"), false /*sharedRandomNumber % 2 == 0*/);
enablePurging = getOption(options, LiteralStringRef("enablePurging"), false /*sharedRandomNumber % 2 == 0*/);
ASSERT(threads >= 1);
if (BGV_DEBUG) {
@ -177,60 +178,6 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
OldRead(KeyRange range, Version v, RangeResult oldResult) : range(range), v(v), oldResult(oldResult) {}
};
// utility to prune <range> at pruneVersion=<version> with the <force> flag
// Requests a prune of <range> at pruneVersion=<version> with the <force> flag by writing the
// versionstamped prune request into the blob granule prune system keyspace, then blocks until the
// request key disappears (i.e. the prune has been processed).
ACTOR Future<Void> pruneAtVersion(Database cx, KeyRange range, Version version, bool force) {
    state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
    state Key pruneKey;
    // Phase 1: register the prune request. Retried via tr->onError until the commit succeeds.
    loop {
        try {
            tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
            Value pruneValue = blobGranulePruneValueFor(version, range, force);
            // Versionstamped key makes each prune request unique and commit-ordered.
            tr->atomicOp(
                addVersionStampAtEnd(blobGranulePruneKeys.begin), pruneValue, MutationRef::SetVersionstampedKey);
            // Touch the change key so the consumer notices a new prune request.
            tr->set(blobGranulePruneChangeKey, deterministicRandom()->randomUniqueID().toString());
            // Must request the versionstamp before commit; it resolves after the commit succeeds.
            state Future<Standalone<StringRef>> fTrVs = tr->getVersionstamp();
            wait(tr->commit());
            Standalone<StringRef> vs = wait(fTrVs);
            // Reconstruct the exact key our request was written under, to watch it below.
            pruneKey = blobGranulePruneKeys.begin.withSuffix(vs);
            if (BGV_DEBUG) {
                fmt::print("pruneAtVersion for range [{0} - {1}) at version {2} succeeded\n",
                           range.begin.printable(),
                           range.end.printable(),
                           version);
            }
            break;
        } catch (Error& e) {
            if (BGV_DEBUG) {
                fmt::print("pruneAtVersion for range [{0} - {1}) at version {2} encountered error {3}\n",
                           range.begin.printable(),
                           range.end.printable(),
                           version,
                           e.name());
            }
            wait(tr->onError(e));
        }
    }
    tr->reset();
    // Phase 2: wait for the prune request key to be cleared, signalling completion. Uses a watch to
    // avoid polling; loops because the watch may fire spuriously or the transaction may need retry.
    loop {
        try {
            tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
            Optional<Value> pruneVal = wait(tr->get(pruneKey));
            if (!pruneVal.present()) {
                // Request key removed => prune completed.
                return Void();
            }
            state Future<Void> watchFuture = tr->watch(pruneKey);
            wait(tr->commit());
            wait(watchFuture);
        } catch (Error& e) {
            wait(tr->onError(e));
        }
    }
}
ACTOR Future<Void> killBlobWorkers(Database cx, BlobGranuleVerifierWorkload* self) {
state Transaction tr(cx);
state std::set<UID> knownWorkers;
@ -272,12 +219,12 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
}
}
ACTOR Future<Void> verifyGranules(Database cx, BlobGranuleVerifierWorkload* self, bool allowPruning) {
ACTOR Future<Void> verifyGranules(Database cx, BlobGranuleVerifierWorkload* self, bool allowPurging) {
state double last = now();
state double endTime = last + self->testDuration;
state std::map<double, OldRead> timeTravelChecks;
state int64_t timeTravelChecksMemory = 0;
state Version prevPruneVersion = -1;
state Version prevPurgeVersion = -1;
state UID dbgId = debugRandom()->randomUniqueID();
TraceEvent("BlobGranuleVerifierStart");
@ -300,25 +247,27 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
state OldRead oldRead = timeTravelIt->second;
timeTravelChecksMemory -= oldRead.oldResult.expectedSize();
timeTravelIt = timeTravelChecks.erase(timeTravelIt);
if (prevPruneVersion == -1) {
prevPruneVersion = oldRead.v;
if (prevPurgeVersion == -1) {
prevPurgeVersion = oldRead.v;
}
// advance iterator before doing read, so if it gets error we don't retry it
try {
state Version newPruneVersion = 0;
state bool doPruning = allowPruning && deterministicRandom()->random01() < 0.5;
if (doPruning) {
Version maxPruneVersion = oldRead.v;
state Version newPurgeVersion = 0;
state bool doPurging = allowPurging && deterministicRandom()->random01() < 0.5;
if (doPurging) {
Version maxPurgeVersion = oldRead.v;
for (auto& it : timeTravelChecks) {
maxPruneVersion = std::min(it.second.v, maxPruneVersion);
maxPurgeVersion = std::min(it.second.v, maxPurgeVersion);
}
if (prevPruneVersion < maxPruneVersion) {
newPruneVersion = deterministicRandom()->randomInt64(prevPruneVersion, maxPruneVersion);
prevPruneVersion = std::max(prevPruneVersion, newPruneVersion);
wait(self->pruneAtVersion(cx, normalKeys, newPruneVersion, false));
if (prevPurgeVersion < maxPurgeVersion) {
newPurgeVersion = deterministicRandom()->randomInt64(prevPurgeVersion, maxPurgeVersion);
prevPurgeVersion = std::max(prevPurgeVersion, newPurgeVersion);
Key purgeKey = wait(cx->purgeBlobGranules(normalKeys, newPurgeVersion, false));
wait(cx->waitPurgeGranulesComplete(purgeKey));
self->purges++;
} else {
doPruning = false;
doPurging = false;
}
}
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> reReadResult =
@ -328,12 +277,12 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
}
self->timeTravelReads++;
if (doPruning) {
if (doPurging) {
wait(self->killBlobWorkers(cx, self));
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> versionRead =
wait(readFromBlob(cx, self->bstore, oldRead.range, 0, prevPruneVersion));
wait(readFromBlob(cx, self->bstore, oldRead.range, 0, prevPurgeVersion));
try {
Version minSnapshotVersion = newPruneVersion;
Version minSnapshotVersion = newPurgeVersion;
for (auto& it : versionRead.second) {
minSnapshotVersion = std::min(minSnapshotVersion, it.snapshotVersion);
}
@ -395,10 +344,10 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
Future<Void> start(Database const& cx) override {
clients.reserve(threads + 1);
clients.push_back(timeout(findGranules(cx, this), testDuration, Void()));
if (enablePruning && clientId == 0) {
if (enablePurging && clientId == 0) {
clients.push_back(
timeout(reportErrors(verifyGranules(cx, this, true), "BlobGranuleVerifier"), testDuration, Void()));
} else if (!enablePruning) {
} else if (!enablePurging) {
for (int i = 0; i < threads; i++) {
clients.push_back(timeout(
reportErrors(verifyGranules(cx, this, false), "BlobGranuleVerifier"), testDuration, Void()));
@ -518,6 +467,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
fmt::print(" {} time travel reads\n", self->timeTravelReads);
fmt::print(" {} rows\n", self->rowsRead);
fmt::print(" {} bytes\n", self->bytesRead);
fmt::print(" {} purges\n", self->purges);
// FIXME: add above as details to trace event
TraceEvent("BlobGranuleVerifierChecked").detail("Result", result);

View File

@ -227,7 +227,8 @@ struct ConfigureDatabaseWorkload : TestWorkload {
double testDuration;
int additionalDBs;
bool allowDescriptorChange;
bool allowTestStorageMigration;
bool allowTestStorageMigration; // allow change storage migration and perpetual wiggle conf
bool storageMigrationCompatibleConf; // only allow generating configuration suitable for storage migration test
bool waitStoreTypeCheck;
bool downgradeTest1; // if this is true, don't pick up downgrade incompatible config
std::vector<Future<Void>> clients;
@ -239,6 +240,7 @@ struct ConfigureDatabaseWorkload : TestWorkload {
getOption(options, LiteralStringRef("allowDescriptorChange"), SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT);
allowTestStorageMigration =
getOption(options, "allowTestStorageMigration"_sr, false) && g_simulator.allowStorageMigrationTypeChange;
storageMigrationCompatibleConf = getOption(options, "storageMigrationCompatibleConf"_sr, false);
waitStoreTypeCheck = getOption(options, "waitStoreTypeCheck"_sr, false);
downgradeTest1 = getOption(options, "downgradeTest1"_sr, false);
g_simulator.usableRegions = 1;
@ -349,7 +351,11 @@ struct ConfigureDatabaseWorkload : TestWorkload {
}
state int randomChoice;
if (self->allowTestStorageMigration) {
randomChoice = deterministicRandom()->randomInt(4, 9);
randomChoice = (deterministicRandom()->random01() < 0.375) ? deterministicRandom()->randomInt(0, 3)
: deterministicRandom()->randomInt(4, 9);
} else if (self->storageMigrationCompatibleConf) {
randomChoice = (deterministicRandom()->random01() < 3.0 / 7) ? deterministicRandom()->randomInt(0, 3)
: deterministicRandom()->randomInt(5, 9);
} else {
randomChoice = deterministicRandom()->randomInt(0, 8);
}

View File

@ -154,9 +154,13 @@ struct DiskFailureInjectionWorkload : TestWorkload {
loop {
wait(poisson(&lastTime, 1));
try {
wait(store(machines, getStorageWorkers(cx, self->dbInfo, false)));
std::pair<std::vector<W>, int> m = wait(getStorageWorkers(cx, self->dbInfo, false));
if (m.second > 0) {
throw operation_failed();
}
machines = std::move(m.first);
} catch (Error& e) {
// If we failed to get a list of storage servers, we can't inject failure events
// If we failed to get a complete list of storage servers, we can't inject failure events
// But don't throw the error in that case
continue;
}

View File

@ -20,6 +20,7 @@
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/NativeAPI.actor.h"
#include "flow/EncryptUtils.h"
#include "flow/IRandom.h"
#include "flow/BlobCipher.h"
#include "fdbserver/workloads/workloads.actor.h"
@ -116,9 +117,10 @@ struct EncryptionOpsWorkload : TestWorkload {
Arena arena;
std::unique_ptr<WorkloadMetrics> metrics;
BlobCipherDomainId minDomainId;
BlobCipherDomainId maxDomainId;
BlobCipherBaseKeyId minBaseCipherId;
EncryptCipherDomainId minDomainId;
EncryptCipherDomainId maxDomainId;
EncryptCipherBaseKeyId minBaseCipherId;
EncryptCipherBaseKeyId headerBaseCipherId;
EncryptionOpsWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
mode = getOption(options, LiteralStringRef("fixedSize"), 1);
@ -131,6 +133,7 @@ struct EncryptionOpsWorkload : TestWorkload {
minDomainId = wcx.clientId * 100 + mode * 30 + 1;
maxDomainId = deterministicRandom()->randomInt(minDomainId, minDomainId + 10) + 5;
minBaseCipherId = 100;
headerBaseCipherId = wcx.clientId * 100 + 1;
metrics = std::make_unique<WorkloadMetrics>();
@ -167,17 +170,21 @@ struct EncryptionOpsWorkload : TestWorkload {
uint8_t buff[AES_256_KEY_LENGTH];
std::vector<Reference<BlobCipherKey>> cipherKeys;
for (BlobCipherDomainId id = minDomainId; id <= maxDomainId; id++) {
int cipherLen = 0;
int cipherLen = 0;
for (EncryptCipherDomainId id = minDomainId; id <= maxDomainId; id++) {
generateRandomBaseCipher(AES_256_KEY_LENGTH, &buff[0], &cipherLen);
cipherKeyCache.insertCipherKey(id, minBaseCipherId, buff, cipherLen);
ASSERT(cipherLen > 0 && cipherLen <= AES_256_KEY_LENGTH);
cipherKeys = cipherKeyCache.getAllCiphers(id);
ASSERT(cipherKeys.size() == 1);
ASSERT_EQ(cipherKeys.size(), 1);
}
// insert the Encrypt Header cipherKey
generateRandomBaseCipher(AES_256_KEY_LENGTH, &buff[0], &cipherLen);
cipherKeyCache.insertCipherKey(ENCRYPT_HEADER_DOMAIN_ID, headerBaseCipherId, buff, cipherLen);
TraceEvent("SetupCipherEssentials_Done").detail("MinDomainId", minDomainId).detail("MaxDomainId", maxDomainId);
}
@ -188,10 +195,10 @@ struct EncryptionOpsWorkload : TestWorkload {
TraceEvent("ResetCipherEssentials_Done").log();
}
void updateLatestBaseCipher(const BlobCipherDomainId encryptDomainId,
void updateLatestBaseCipher(const EncryptCipherDomainId encryptDomainId,
uint8_t* baseCipher,
int* baseCipherLen,
BlobCipherBaseKeyId* nextBaseCipherId) {
EncryptCipherBaseKeyId* nextBaseCipherId) {
auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getLatestCipherKey(encryptDomainId);
*nextBaseCipherId = cipherKey->getBaseCipherId() + 1;
@ -202,22 +209,24 @@ struct EncryptionOpsWorkload : TestWorkload {
TraceEvent("UpdateBaseCipher").detail("DomainId", encryptDomainId).detail("BaseCipherId", *nextBaseCipherId);
}
Reference<EncryptBuf> doEncryption(Reference<BlobCipherKey> key,
Reference<EncryptBuf> doEncryption(Reference<BlobCipherKey> textCipherKey,
Reference<BlobCipherKey> headerCipherKey,
uint8_t* payload,
int len,
const EncryptAuthTokenMode authMode,
BlobCipherEncryptHeader* header) {
uint8_t iv[AES_256_IV_LENGTH];
generateRandomData(&iv[0], AES_256_IV_LENGTH);
EncryptBlobCipherAes265Ctr encryptor(key, &iv[0], AES_256_IV_LENGTH);
EncryptBlobCipherAes265Ctr encryptor(textCipherKey, headerCipherKey, &iv[0], AES_256_IV_LENGTH, authMode);
auto start = std::chrono::high_resolution_clock::now();
Reference<EncryptBuf> encrypted = encryptor.encrypt(payload, len, header, arena);
auto end = std::chrono::high_resolution_clock::now();
// validate encrypted buffer size and contents (not matching with plaintext)
ASSERT(encrypted->getLogicalSize() == len);
ASSERT(memcmp(encrypted->begin(), payload, len) != 0);
ASSERT(header->flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
ASSERT_EQ(encrypted->getLogicalSize(), len);
ASSERT_NE(memcmp(encrypted->begin(), payload, len), 0);
ASSERT_EQ(header->flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
metrics->updateEncryptionTime(std::chrono::duration<double, std::nano>(end - start).count());
return encrypted;
@ -228,23 +237,30 @@ struct EncryptionOpsWorkload : TestWorkload {
const BlobCipherEncryptHeader& header,
uint8_t* originalPayload,
Reference<BlobCipherKey> orgCipherKey) {
ASSERT(header.flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
ASSERT(header.flags.encryptMode == BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR);
ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR);
auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getCipherKey(header.encryptDomainId, header.baseCipherId);
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getCipherKey(header.cipherTextDetails.encryptDomainId,
header.cipherTextDetails.baseCipherId);
Reference<BlobCipherKey> headerCipherKey = cipherKeyCache.getCipherKey(
header.cipherHeaderDetails.encryptDomainId, header.cipherHeaderDetails.baseCipherId);
ASSERT(cipherKey.isValid());
ASSERT(cipherKey->isEqual(orgCipherKey));
DecryptBlobCipherAes256Ctr decryptor(cipherKey, &header.iv[0]);
DecryptBlobCipherAes256Ctr decryptor(cipherKey, headerCipherKey, &header.cipherTextDetails.iv[0]);
const bool validateHeaderAuthToken = deterministicRandom()->randomInt(0, 100) < 65;
auto start = std::chrono::high_resolution_clock::now();
if (validateHeaderAuthToken) {
decryptor.verifyHeaderAuthToken(header, arena);
}
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), len, header, arena);
auto end = std::chrono::high_resolution_clock::now();
// validate decrypted buffer size and contents (matching with original plaintext)
ASSERT(decrypted->getLogicalSize() == len);
ASSERT(memcmp(decrypted->begin(), originalPayload, len) == 0);
ASSERT_EQ(decrypted->getLogicalSize(), len);
ASSERT_EQ(memcmp(decrypted->begin(), originalPayload, len), 0);
metrics->updateDecryptionTime(std::chrono::duration<double, std::nano>(end - start).count());
}
@ -256,7 +272,7 @@ struct EncryptionOpsWorkload : TestWorkload {
Future<Void> start(Database const& cx) override {
uint8_t baseCipher[AES_256_KEY_LENGTH];
int baseCipherLen = 0;
BlobCipherBaseKeyId nextBaseCipherId;
EncryptCipherBaseKeyId nextBaseCipherId;
// Setup encryptDomainIds and corresponding baseCipher details
setupCipherEssentials();
@ -268,7 +284,7 @@ struct EncryptionOpsWorkload : TestWorkload {
auto& cipherKeyCache = BlobCipherKeyCache::getInstance();
// randomly select a domainId
const BlobCipherDomainId encryptDomainId = deterministicRandom()->randomInt(minDomainId, maxDomainId);
const EncryptCipherDomainId encryptDomainId = deterministicRandom()->randomInt(minDomainId, maxDomainId);
ASSERT(encryptDomainId >= minDomainId && encryptDomainId <= maxDomainId);
if (updateBaseCipher) {
@ -279,14 +295,17 @@ struct EncryptionOpsWorkload : TestWorkload {
auto start = std::chrono::high_resolution_clock::now();
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getLatestCipherKey(encryptDomainId);
// Each client working with their own version of encryptHeaderCipherKey, avoid using getLatest()
Reference<BlobCipherKey> headerCipherKey =
cipherKeyCache.getCipherKey(ENCRYPT_HEADER_DOMAIN_ID, headerBaseCipherId);
auto end = std::chrono::high_resolution_clock::now();
metrics->updateKeyDerivationTime(std::chrono::duration<double, std::nano>(end - start).count());
// Validate sanity of "getLatestCipher", especially when baseCipher gets updated
if (updateBaseCipher) {
ASSERT(cipherKey->getBaseCipherId() == nextBaseCipherId);
ASSERT(cipherKey->getBaseCipherLen() == baseCipherLen);
ASSERT(memcmp(cipherKey->rawBaseCipher(), baseCipher, baseCipherLen) == 0);
ASSERT_EQ(cipherKey->getBaseCipherId(), nextBaseCipherId);
ASSERT_EQ(cipherKey->getBaseCipherLen(), baseCipherLen);
ASSERT_EQ(memcmp(cipherKey->rawBaseCipher(), baseCipher, baseCipherLen), 0);
}
int dataLen = isFixedSizePayload() ? pageSize : deterministicRandom()->randomInt(100, maxBufSize);
@ -294,8 +313,12 @@ struct EncryptionOpsWorkload : TestWorkload {
// Encrypt the payload - generates BlobCipherEncryptHeader to assist decryption later
BlobCipherEncryptHeader header;
const EncryptAuthTokenMode authMode = deterministicRandom()->randomInt(0, 100) < 50
? ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE
: ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI;
try {
Reference<EncryptBuf> encrypted = doEncryption(cipherKey, buff.get(), dataLen, &header);
Reference<EncryptBuf> encrypted =
doEncryption(cipherKey, headerCipherKey, buff.get(), dataLen, authMode, &header);
// Decrypt the payload - parses the BlobCipherEncryptHeader, fetch corresponding cipherKey and
// decrypt
@ -303,7 +326,8 @@ struct EncryptionOpsWorkload : TestWorkload {
} catch (Error& e) {
TraceEvent("Failed")
.detail("DomainId", encryptDomainId)
.detail("BaseCipherId", cipherKey->getBaseCipherId());
.detail("BaseCipherId", cipherKey->getBaseCipherId())
.detail("AuthMode", authMode);
throw;
}

View File

@ -236,7 +236,8 @@ Future<Void> quietDatabase(Database const& cx,
int64_t maxTLogQueueGate = 5e6,
int64_t maxStorageServerQueueGate = 5e6,
int64_t maxDataDistributionQueueSize = 0,
int64_t maxPoppedVersionLag = 30e6);
int64_t maxPoppedVersionLag = 30e6,
int64_t maxVersionOffset = 1e6);
/**
* A utility function for testing error situations. It succeeds if the given test

View File

@ -19,6 +19,7 @@
*/
#include "flow/BlobCipher.h"
#include "flow/EncryptUtils.h"
#include "flow/Error.h"
#include "flow/FastRef.h"
#include "flow/IRandom.h"
@ -29,21 +30,23 @@
#include <cstring>
#include <memory>
#include <string>
#if ENCRYPTION_ENABLED
// BlobCipherEncryptHeader
BlobCipherEncryptHeader::BlobCipherEncryptHeader() {
flags.encryptMode = BLOB_CIPHER_ENCRYPT_MODE_NONE;
namespace {
// Validates that `mode` is a defined EncryptAuthTokenMode value, i.e. lies in the half-open range
// [ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE, ENCRYPT_HEADER_AUTH_TOKEN_LAST).
bool isEncryptHeaderAuthTokenModeValid(const EncryptAuthTokenMode mode) {
    if (mode < ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) {
        return false;
    }
    return mode < ENCRYPT_HEADER_AUTH_TOKEN_LAST;
}
} // namespace
// BlobCipherKey class methods
BlobCipherKey::BlobCipherKey(const BlobCipherDomainId& domainId,
const BlobCipherBaseKeyId& baseCiphId,
BlobCipherKey::BlobCipherKey(const EncryptCipherDomainId& domainId,
const EncryptCipherBaseKeyId& baseCiphId,
const uint8_t* baseCiph,
int baseCiphLen) {
BlobCipherRandomSalt salt;
EncryptCipherRandomSalt salt;
if (g_network->isSimulated()) {
salt = deterministicRandom()->randomUInt64();
} else {
@ -58,11 +61,11 @@ BlobCipherKey::BlobCipherKey(const BlobCipherDomainId& domainId,
.detail("CreationTime", creationTime);*/
}
void BlobCipherKey::initKey(const BlobCipherDomainId& domainId,
void BlobCipherKey::initKey(const EncryptCipherDomainId& domainId,
const uint8_t* baseCiph,
int baseCiphLen,
const BlobCipherBaseKeyId& baseCiphId,
const BlobCipherRandomSalt& salt) {
const EncryptCipherBaseKeyId& baseCiphId,
const EncryptCipherRandomSalt& salt) {
// Set the base encryption key properties
baseCipher = std::make_unique<uint8_t[]>(AES_256_KEY_LENGTH);
memset(baseCipher.get(), 0, AES_256_KEY_LENGTH);
@ -82,11 +85,11 @@ void BlobCipherKey::initKey(const BlobCipherDomainId& domainId,
void BlobCipherKey::applyHmacSha256Derivation() {
Arena arena;
uint8_t buf[baseCipherLen + sizeof(BlobCipherRandomSalt)];
uint8_t buf[baseCipherLen + sizeof(EncryptCipherRandomSalt)];
memcpy(&buf[0], baseCipher.get(), baseCipherLen);
memcpy(&buf[0] + baseCipherLen, &randomSalt, sizeof(BlobCipherRandomSalt));
memcpy(&buf[0] + baseCipherLen, &randomSalt, sizeof(EncryptCipherRandomSalt));
HmacSha256DigestGen hmacGen(baseCipher.get(), baseCipherLen);
StringRef digest = hmacGen.digest(&buf[0], baseCipherLen + sizeof(BlobCipherRandomSalt), arena);
StringRef digest = hmacGen.digest(&buf[0], baseCipherLen + sizeof(EncryptCipherRandomSalt), arena);
std::copy(digest.begin(), digest.end(), cipher.get());
if (digest.size() < AES_256_KEY_LENGTH) {
memcpy(cipher.get() + digest.size(), buf, AES_256_KEY_LENGTH - digest.size());
@ -101,10 +104,10 @@ void BlobCipherKey::reset() {
// BlobKeyIdCache class methods
BlobCipherKeyIdCache::BlobCipherKeyIdCache()
: domainId(INVALID_DOMAIN_ID), latestBaseCipherKeyId(INVALID_CIPHER_KEY_ID) {}
: domainId(ENCRYPT_INVALID_DOMAIN_ID), latestBaseCipherKeyId(ENCRYPT_INVALID_CIPHER_KEY_ID) {}
BlobCipherKeyIdCache::BlobCipherKeyIdCache(BlobCipherDomainId dId)
: domainId(dId), latestBaseCipherKeyId(INVALID_CIPHER_KEY_ID) {
BlobCipherKeyIdCache::BlobCipherKeyIdCache(EncryptCipherDomainId dId)
: domainId(dId), latestBaseCipherKeyId(ENCRYPT_INVALID_CIPHER_KEY_ID) {
TraceEvent("Init_BlobCipherKeyIdCache").detail("DomainId", domainId);
}
@ -112,7 +115,7 @@ Reference<BlobCipherKey> BlobCipherKeyIdCache::getLatestCipherKey() {
return getCipherByBaseCipherId(latestBaseCipherKeyId);
}
Reference<BlobCipherKey> BlobCipherKeyIdCache::getCipherByBaseCipherId(BlobCipherBaseKeyId baseCipherKeyId) {
Reference<BlobCipherKey> BlobCipherKeyIdCache::getCipherByBaseCipherId(EncryptCipherBaseKeyId baseCipherKeyId) {
BlobCipherKeyIdCacheMapCItr itr = keyIdCache.find(baseCipherKeyId);
if (itr == keyIdCache.end()) {
throw encrypt_key_not_found();
@ -120,10 +123,10 @@ Reference<BlobCipherKey> BlobCipherKeyIdCache::getCipherByBaseCipherId(BlobCiphe
return itr->second;
}
void BlobCipherKeyIdCache::insertBaseCipherKey(BlobCipherBaseKeyId baseCipherId,
void BlobCipherKeyIdCache::insertBaseCipherKey(EncryptCipherBaseKeyId baseCipherId,
const uint8_t* baseCipher,
int baseCipherLen) {
ASSERT(baseCipherId > INVALID_CIPHER_KEY_ID);
ASSERT_GT(baseCipherId, ENCRYPT_INVALID_CIPHER_KEY_ID);
// BaseCipherKeys are immutable, ensure that cached value doesn't get updated.
BlobCipherKeyIdCacheMapCItr itr = keyIdCache.find(baseCipherId);
@ -165,11 +168,11 @@ std::vector<Reference<BlobCipherKey>> BlobCipherKeyIdCache::getAllCipherKeys() {
// BlobCipherKeyCache class methods
void BlobCipherKeyCache::insertCipherKey(const BlobCipherDomainId& domainId,
const BlobCipherBaseKeyId& baseCipherId,
void BlobCipherKeyCache::insertCipherKey(const EncryptCipherDomainId& domainId,
const EncryptCipherBaseKeyId& baseCipherId,
const uint8_t* baseCipher,
int baseCipherLen) {
if (domainId == INVALID_DOMAIN_ID || baseCipherId == INVALID_CIPHER_KEY_ID) {
if (domainId == ENCRYPT_INVALID_DOMAIN_ID || baseCipherId == ENCRYPT_INVALID_CIPHER_KEY_ID) {
throw encrypt_invalid_id();
}
@ -193,7 +196,7 @@ void BlobCipherKeyCache::insertCipherKey(const BlobCipherDomainId& domainId,
}
}
Reference<BlobCipherKey> BlobCipherKeyCache::getLatestCipherKey(const BlobCipherDomainId& domainId) {
Reference<BlobCipherKey> BlobCipherKeyCache::getLatestCipherKey(const EncryptCipherDomainId& domainId) {
auto domainItr = domainCacheMap.find(domainId);
if (domainItr == domainCacheMap.end()) {
TraceEvent("GetLatestCipherKey_DomainNotFound").detail("DomainId", domainId);
@ -212,8 +215,8 @@ Reference<BlobCipherKey> BlobCipherKeyCache::getLatestCipherKey(const BlobCipher
return cipherKey;
}
Reference<BlobCipherKey> BlobCipherKeyCache::getCipherKey(const BlobCipherDomainId& domainId,
const BlobCipherBaseKeyId& baseCipherId) {
Reference<BlobCipherKey> BlobCipherKeyCache::getCipherKey(const EncryptCipherDomainId& domainId,
const EncryptCipherBaseKeyId& baseCipherId) {
auto domainItr = domainCacheMap.find(domainId);
if (domainItr == domainCacheMap.end()) {
throw encrypt_key_not_found();
@ -223,7 +226,7 @@ Reference<BlobCipherKey> BlobCipherKeyCache::getCipherKey(const BlobCipherDomain
return keyIdCache->getCipherByBaseCipherId(baseCipherId);
}
void BlobCipherKeyCache::resetEncyrptDomainId(const BlobCipherDomainId domainId) {
void BlobCipherKeyCache::resetEncyrptDomainId(const EncryptCipherDomainId domainId) {
auto domainItr = domainCacheMap.find(domainId);
if (domainItr == domainCacheMap.end()) {
throw encrypt_key_not_found();
@ -245,7 +248,7 @@ void BlobCipherKeyCache::cleanup() noexcept {
instance.domainCacheMap.clear();
}
std::vector<Reference<BlobCipherKey>> BlobCipherKeyCache::getAllCiphers(const BlobCipherDomainId& domainId) {
std::vector<Reference<BlobCipherKey>> BlobCipherKeyCache::getAllCiphers(const EncryptCipherDomainId& domainId) {
auto domainItr = domainCacheMap.find(domainId);
if (domainItr == domainCacheMap.end()) {
return {};
@ -255,13 +258,17 @@ std::vector<Reference<BlobCipherKey>> BlobCipherKeyCache::getAllCiphers(const Bl
return keyIdCache->getAllCipherKeys();
}
// EncryptBlobCipher class methods
// EncryptBlobCipherAes265Ctr class methods
EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey> key,
EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey> tCipherKey,
Reference<BlobCipherKey> hCipherKey,
const uint8_t* cipherIV,
const int ivLen)
: ctx(EVP_CIPHER_CTX_new()), cipherKey(key) {
ASSERT(ivLen == AES_256_IV_LENGTH);
const int ivLen,
const EncryptAuthTokenMode mode)
: ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey), authTokenMode(mode) {
ASSERT(isEncryptHeaderAuthTokenModeValid(mode));
ASSERT_EQ(ivLen, AES_256_IV_LENGTH);
memcpy(&iv[0], cipherIV, ivLen);
if (ctx == nullptr) {
@ -270,7 +277,7 @@ EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey>
if (EVP_EncryptInit_ex(ctx, EVP_aes_256_ctr(), nullptr, nullptr, nullptr) != 1) {
throw encrypt_ops_error();
}
if (EVP_EncryptInit_ex(ctx, nullptr, nullptr, key.getPtr()->data(), cipherIV) != 1) {
if (EVP_EncryptInit_ex(ctx, nullptr, nullptr, textCipherKey.getPtr()->data(), cipherIV) != 1) {
throw encrypt_ops_error();
}
}
@ -281,21 +288,29 @@ Reference<EncryptBuf> EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plainte
Arena& arena) {
TEST(true); // Encrypting data with BlobCipher
Reference<EncryptBuf> encryptBuf = makeReference<EncryptBuf>(plaintextLen + AES_BLOCK_SIZE, arena);
memset(reinterpret_cast<uint8_t*>(header), 0, sizeof(BlobCipherEncryptHeader));
// Alloc buffer computation accounts for 'header authentication' generation scheme. If single-auth-token needs to be
// generated, allocate buffer sufficient to append header to the cipherText optimizing memcpy cost.
const int allocSize = authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE
? plaintextLen + AES_BLOCK_SIZE + sizeof(BlobCipherEncryptHeader)
: plaintextLen + AES_BLOCK_SIZE;
Reference<EncryptBuf> encryptBuf = makeReference<EncryptBuf>(allocSize, arena);
uint8_t* ciphertext = encryptBuf->begin();
int bytes{ 0 };
if (EVP_EncryptUpdate(ctx, ciphertext, &bytes, plaintext, plaintextLen) != 1) {
TraceEvent("Encrypt_UpdateFailed")
.detail("BaseCipherId", cipherKey->getBaseCipherId())
.detail("EncryptDomainId", cipherKey->getDomainId());
.detail("BaseCipherId", textCipherKey->getBaseCipherId())
.detail("EncryptDomainId", textCipherKey->getDomainId());
throw encrypt_ops_error();
}
int finalBytes{ 0 };
if (EVP_EncryptFinal_ex(ctx, ciphertext + bytes, &finalBytes) != 1) {
TraceEvent("Encrypt_FinalFailed")
.detail("BaseCipherId", cipherKey->getBaseCipherId())
.detail("EncryptDomainId", cipherKey->getDomainId());
.detail("BaseCipherId", textCipherKey->getBaseCipherId())
.detail("EncryptDomainId", textCipherKey->getDomainId());
throw encrypt_ops_error();
}
@ -306,19 +321,57 @@ Reference<EncryptBuf> EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plainte
throw encrypt_ops_error();
}
// populate header details for the encrypted blob.
// Populate encryption header flags details
header->flags.size = sizeof(BlobCipherEncryptHeader);
header->flags.headerVersion = EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION;
header->flags.encryptMode = BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR;
header->baseCipherId = cipherKey->getBaseCipherId();
header->encryptDomainId = cipherKey->getDomainId();
header->salt = cipherKey->getSalt();
memcpy(&header->iv[0], &iv[0], AES_256_IV_LENGTH);
header->flags.encryptMode = ENCRYPT_CIPHER_MODE_AES_256_CTR;
header->flags.authTokenMode = authTokenMode;
// Preserve checksum of encrypted bytes in the header; approach protects against disk induced bit-rot/flip
// scenarios. AES CTR mode doesn't generate 'tag' by default as with schemes such as: AES 256 GCM.
// Populate cipherText encryption-key details
header->cipherTextDetails.baseCipherId = textCipherKey->getBaseCipherId();
header->cipherTextDetails.encryptDomainId = textCipherKey->getDomainId();
header->cipherTextDetails.salt = textCipherKey->getSalt();
memcpy(&header->cipherTextDetails.iv[0], &iv[0], AES_256_IV_LENGTH);
header->ciphertextChecksum = computeEncryptChecksum(ciphertext, bytes + finalBytes, cipherKey->getSalt(), arena);
if (authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) {
// No header 'authToken' generation needed.
} else {
// Populate header encryption-key details
header->cipherHeaderDetails.encryptDomainId = headerCipherKey->getDomainId();
header->cipherHeaderDetails.baseCipherId = headerCipherKey->getBaseCipherId();
// Populate header authToken details
if (header->flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE) {
ASSERT_GE(allocSize, (bytes + finalBytes + sizeof(BlobCipherEncryptHeader)));
ASSERT_GE(encryptBuf->getLogicalSize(), (bytes + finalBytes + sizeof(BlobCipherEncryptHeader)));
memcpy(&ciphertext[bytes + finalBytes],
reinterpret_cast<const uint8_t*>(header),
sizeof(BlobCipherEncryptHeader));
StringRef authToken = computeAuthToken(ciphertext,
bytes + finalBytes + sizeof(BlobCipherEncryptHeader),
headerCipherKey->rawCipher(),
AES_256_KEY_LENGTH,
arena);
memcpy(&header->singleAuthToken.authToken[0], authToken.begin(), AUTH_TOKEN_SIZE);
} else {
ASSERT_EQ(header->flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);
StringRef cipherTextAuthToken =
computeAuthToken(ciphertext,
bytes + finalBytes,
reinterpret_cast<const uint8_t*>(&header->cipherTextDetails.salt),
sizeof(EncryptCipherRandomSalt),
arena);
memcpy(&header->multiAuthTokens.cipherTextAuthToken[0], cipherTextAuthToken.begin(), AUTH_TOKEN_SIZE);
StringRef headerAuthToken = computeAuthToken(reinterpret_cast<const uint8_t*>(header),
sizeof(BlobCipherEncryptHeader),
headerCipherKey->rawCipher(),
AES_256_KEY_LENGTH,
arena);
memcpy(&header->multiAuthTokens.headerAuthToken[0], headerAuthToken.begin(), AUTH_TOKEN_SIZE);
}
}
encryptBuf->setLogicalSize(plaintextLen);
return encryptBuf;
@ -330,45 +383,137 @@ EncryptBlobCipherAes265Ctr::~EncryptBlobCipherAes265Ctr() {
}
}
// DecryptBlobCipher class methods
// DecryptBlobCipherAes256Ctr class methods
DecryptBlobCipherAes256Ctr::DecryptBlobCipherAes256Ctr(Reference<BlobCipherKey> key, const uint8_t* iv)
: ctx(EVP_CIPHER_CTX_new()) {
DecryptBlobCipherAes256Ctr::DecryptBlobCipherAes256Ctr(Reference<BlobCipherKey> tCipherKey,
Reference<BlobCipherKey> hCipherKey,
const uint8_t* iv)
: ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey),
headerAuthTokenValidationDone(false), authTokensValidationDone(false) {
if (ctx == nullptr) {
throw encrypt_ops_error();
}
if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_ctr(), nullptr, nullptr, nullptr)) {
throw encrypt_ops_error();
}
if (!EVP_DecryptInit_ex(ctx, nullptr, nullptr, key.getPtr()->data(), iv)) {
if (!EVP_DecryptInit_ex(ctx, nullptr, nullptr, tCipherKey.getPtr()->data(), iv)) {
throw encrypt_ops_error();
}
}
void DecryptBlobCipherAes256Ctr::verifyEncryptBlobHeader(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
Arena& arena) {
// validate header flag sanity
if (header.flags.headerVersion != EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION ||
header.flags.encryptMode != BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR) {
TraceEvent("VerifyEncryptBlobHeader")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderMode", header.flags.encryptMode)
.detail("ExpectedVersion", EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION)
.detail("ExpectedMode", BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR);
throw encrypt_header_metadata_mismatch();
void DecryptBlobCipherAes256Ctr::verifyHeaderAuthToken(const BlobCipherEncryptHeader& header, Arena& arena) {
if (header.flags.authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI) {
// NoneAuthToken mode; no authToken is generated; nothing to do
// SingleAuthToken mode; verification will happen as part of decryption.
return;
}
// encrypted byte checksum sanity; protection against data bit-rot/flip.
BlobCipherChecksum computed = computeEncryptChecksum(ciphertext, ciphertextLen, header.salt, arena);
if (computed != header.ciphertextChecksum) {
TraceEvent("VerifyEncryptBlobHeader_ChecksumMismatch")
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);
BlobCipherEncryptHeader headerCopy;
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
memset(reinterpret_cast<uint8_t*>(&headerCopy.multiAuthTokens.headerAuthToken), 0, AUTH_TOKEN_SIZE);
StringRef computedHeaderAuthToken = computeAuthToken(reinterpret_cast<const uint8_t*>(&headerCopy),
sizeof(BlobCipherEncryptHeader),
headerCipherKey->rawCipher(),
AES_256_KEY_LENGTH,
arena);
if (memcmp(&header.multiAuthTokens.headerAuthToken[0], computedHeaderAuthToken.begin(), AUTH_TOKEN_SIZE) != 0) {
TraceEvent("VerifyEncryptBlobHeader_AuthTokenMismatch")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderMode", header.flags.encryptMode)
.detail("CiphertextChecksum", header.ciphertextChecksum)
.detail("ComputedCiphertextChecksum", computed);
throw encrypt_header_checksum_mismatch();
.detail("MultiAuthHeaderAuthToken",
StringRef(arena, &header.multiAuthTokens.headerAuthToken[0], AUTH_TOKEN_SIZE).toString())
.detail("ComputedHeaderAuthToken", computedHeaderAuthToken.toString());
throw encrypt_header_authtoken_mismatch();
}
headerAuthTokenValidationDone = true;
}
void DecryptBlobCipherAes256Ctr::verifyHeaderSingleAuthToken(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
uint8_t* buff,
Arena& arena) {
// Header authToken not set for single auth-token mode.
ASSERT(!headerAuthTokenValidationDone);
// prepare the payload {cipherText + encryptionHeader}
memcpy(&buff[0], ciphertext, ciphertextLen);
memcpy(&buff[ciphertextLen], reinterpret_cast<const uint8_t*>(&header), sizeof(BlobCipherEncryptHeader));
// ensure the 'authToken' is reset before computing the 'authentication token'
BlobCipherEncryptHeader* eHeader = (BlobCipherEncryptHeader*)(&buff[ciphertextLen]);
memset(reinterpret_cast<uint8_t*>(&eHeader->singleAuthToken), 0, 2 * AUTH_TOKEN_SIZE);
StringRef computed = computeAuthToken(
buff, ciphertextLen + sizeof(BlobCipherEncryptHeader), headerCipherKey->rawCipher(), AES_256_KEY_LENGTH, arena);
if (memcmp(&header.singleAuthToken.authToken[0], computed.begin(), AUTH_TOKEN_SIZE) != 0) {
TraceEvent("VerifyEncryptBlobHeader_AuthTokenMismatch")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderMode", header.flags.encryptMode)
.detail("SingleAuthToken",
StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_SIZE).toString())
.detail("ComputedSingleAuthToken", computed.toString());
throw encrypt_header_authtoken_mismatch();
}
}
void DecryptBlobCipherAes256Ctr::verifyHeaderMultiAuthToken(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
uint8_t* buff,
Arena& arena) {
if (!headerAuthTokenValidationDone) {
verifyHeaderAuthToken(header, arena);
}
StringRef computedCipherTextAuthToken =
computeAuthToken(ciphertext,
ciphertextLen,
reinterpret_cast<const uint8_t*>(&header.cipherTextDetails.salt),
sizeof(EncryptCipherRandomSalt),
arena);
if (memcmp(&header.multiAuthTokens.cipherTextAuthToken[0], computedCipherTextAuthToken.begin(), AUTH_TOKEN_SIZE) !=
0) {
TraceEvent("VerifyEncryptBlobHeader_AuthTokenMismatch")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderMode", header.flags.encryptMode)
.detail("MultiAuthCipherTextAuthToken",
StringRef(arena, &header.multiAuthTokens.cipherTextAuthToken[0], AUTH_TOKEN_SIZE).toString())
.detail("ComputedCipherTextAuthToken", computedCipherTextAuthToken.toString());
throw encrypt_header_authtoken_mismatch();
}
}
void DecryptBlobCipherAes256Ctr::verifyAuthTokens(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
uint8_t* buff,
Arena& arena) {
if (header.flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE) {
verifyHeaderSingleAuthToken(ciphertext, ciphertextLen, header, buff, arena);
} else {
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);
verifyHeaderMultiAuthToken(ciphertext, ciphertextLen, header, buff, arena);
}
authTokensValidationDone = true;
}
void DecryptBlobCipherAes256Ctr::verifyEncryptHeaderMetadata(const BlobCipherEncryptHeader& header) {
// validate header flag sanity
if (header.flags.headerVersion != EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION ||
header.flags.encryptMode != ENCRYPT_CIPHER_MODE_AES_256_CTR ||
!isEncryptHeaderAuthTokenModeValid((EncryptAuthTokenMode)header.flags.authTokenMode)) {
TraceEvent("VerifyEncryptBlobHeader")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("ExpectedVersion", EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION)
.detail("EncryptCipherMode", header.flags.encryptMode)
.detail("ExpectedCipherMode", ENCRYPT_CIPHER_MODE_AES_256_CTR)
.detail("EncryptHeaderAuthTokenMode", header.flags.authTokenMode);
throw encrypt_header_metadata_mismatch();
}
}
@ -378,23 +523,37 @@ Reference<EncryptBuf> DecryptBlobCipherAes256Ctr::decrypt(const uint8_t* ciphert
Arena& arena) {
TEST(true); // Decrypting data with BlobCipher
verifyEncryptBlobHeader(ciphertext, ciphertextLen, header, arena);
verifyEncryptHeaderMetadata(header);
if (header.flags.authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE && !headerCipherKey.isValid()) {
TraceEvent("Decrypt_InvalidHeaderCipherKey").detail("AuthTokenMode", header.flags.authTokenMode);
throw encrypt_ops_error();
}
const int allocSize = header.flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE
? ciphertextLen + AES_BLOCK_SIZE + sizeof(BlobCipherEncryptHeader)
: ciphertextLen + AES_BLOCK_SIZE;
Reference<EncryptBuf> decrypted = makeReference<EncryptBuf>(allocSize, arena);
if (header.flags.authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) {
verifyAuthTokens(ciphertext, ciphertextLen, header, decrypted->begin(), arena);
ASSERT(authTokensValidationDone);
}
Reference<EncryptBuf> decrypted = makeReference<EncryptBuf>(ciphertextLen + AES_BLOCK_SIZE, arena);
uint8_t* plaintext = decrypted->begin();
int bytesDecrypted{ 0 };
if (!EVP_DecryptUpdate(ctx, plaintext, &bytesDecrypted, ciphertext, ciphertextLen)) {
TraceEvent("Decrypt_UpdateFailed")
.detail("BaseCipherId", header.baseCipherId)
.detail("EncryptDomainId", header.encryptDomainId);
.detail("BaseCipherId", header.cipherTextDetails.baseCipherId)
.detail("EncryptDomainId", header.cipherTextDetails.encryptDomainId);
throw encrypt_ops_error();
}
int finalBlobBytes{ 0 };
if (EVP_DecryptFinal_ex(ctx, plaintext + bytesDecrypted, &finalBlobBytes) <= 0) {
TraceEvent("Decrypt_FinalFailed")
.detail("BaseCipherId", header.baseCipherId)
.detail("EncryptDomainId", header.encryptDomainId);
.detail("BaseCipherId", header.cipherTextDetails.baseCipherId)
.detail("EncryptDomainId", header.cipherTextDetails.encryptDomainId);
throw encrypt_ops_error();
}
@ -443,6 +602,18 @@ StringRef HmacSha256DigestGen::digest(const unsigned char* data, size_t len, Are
return StringRef(digest, digestLen);
}
StringRef computeAuthToken(const uint8_t* payload,
const int payloadLen,
const uint8_t* key,
const int keyLen,
Arena& arena) {
HmacSha256DigestGen hmacGenerator(key, keyLen);
StringRef digest = hmacGenerator.digest(payload, payloadLen, arena);
ASSERT_GE(digest.size(), AUTH_TOKEN_SIZE);
return digest;
}
// Only used to link unit tests
void forceLinkBlobCipherTests() {}
@ -453,41 +624,42 @@ void forceLinkBlobCipherTests() {}
// 4. Inserting of 'non-identical' cipherKey (already cached) more than once works as desired.
// 5. Validation encryption ops (correctness):
// 5.1. Encyrpt a buffer followed by decryption of the buffer, validate the contents.
// 5.2. Simulate anomolies such as: EncyrptionHeader corruption, checkSum mismatch / encryptionMode mismatch etc.
// 5.2. Simulate anomalies such as: EncyrptionHeader corruption, authToken mismatch / encryptionMode mismatch etc.
// 6. Cache cleanup
// 6.1 cleanup cipherKeys by given encryptDomainId
// 6.2. Cleanup all cached cipherKeys
TEST_CASE("flow/BlobCipher") {
TraceEvent("BlobCipherTest_Start").log();
// Construct a dummy External Key Manager representation and populate with some keys
class BaseCipher : public ReferenceCounted<BaseCipher>, NonCopyable {
public:
BlobCipherDomainId domainId;
EncryptCipherDomainId domainId;
int len;
BlobCipherBaseKeyId keyId;
EncryptCipherBaseKeyId keyId;
std::unique_ptr<uint8_t[]> key;
BaseCipher(const BlobCipherDomainId& dId, const BlobCipherBaseKeyId& kId)
BaseCipher(const EncryptCipherDomainId& dId, const EncryptCipherBaseKeyId& kId)
: domainId(dId), len(deterministicRandom()->randomInt(AES_256_KEY_LENGTH / 2, AES_256_KEY_LENGTH + 1)),
keyId(kId), key(std::make_unique<uint8_t[]>(len)) {
generateRandomData(key.get(), len);
}
};
using BaseKeyMap = std::unordered_map<BlobCipherBaseKeyId, Reference<BaseCipher>>;
using DomainKeyMap = std::unordered_map<BlobCipherDomainId, BaseKeyMap>;
using BaseKeyMap = std::unordered_map<EncryptCipherBaseKeyId, Reference<BaseCipher>>;
using DomainKeyMap = std::unordered_map<EncryptCipherDomainId, BaseKeyMap>;
DomainKeyMap domainKeyMap;
const BlobCipherDomainId minDomainId = 1;
const BlobCipherDomainId maxDomainId = deterministicRandom()->randomInt(minDomainId, minDomainId + 10) + 5;
const BlobCipherBaseKeyId minBaseCipherKeyId = 100;
const BlobCipherBaseKeyId maxBaseCipherKeyId =
const EncryptCipherDomainId minDomainId = 1;
const EncryptCipherDomainId maxDomainId = deterministicRandom()->randomInt(minDomainId, minDomainId + 10) + 5;
const EncryptCipherBaseKeyId minBaseCipherKeyId = 100;
const EncryptCipherBaseKeyId maxBaseCipherKeyId =
deterministicRandom()->randomInt(minBaseCipherKeyId, minBaseCipherKeyId + 50) + 15;
for (int dId = minDomainId; dId <= maxDomainId; dId++) {
for (int kId = minBaseCipherKeyId; kId <= maxBaseCipherKeyId; kId++) {
domainKeyMap[dId].emplace(kId, makeReference<BaseCipher>(dId, kId));
}
}
ASSERT(domainKeyMap.size() == maxDomainId);
ASSERT_EQ(domainKeyMap.size(), maxDomainId);
// insert BlobCipher keys into BlobCipherKeyCache map and validate
TraceEvent("BlobCipherTest_InsertKeys").log();
@ -500,6 +672,11 @@ TEST_CASE("flow/BlobCipher") {
baseCipher->domainId, baseCipher->keyId, baseCipher->key.get(), baseCipher->len);
}
}
// insert EncryptHeader BlobCipher key
Reference<BaseCipher> headerBaseCipher = makeReference<BaseCipher>(ENCRYPT_HEADER_DOMAIN_ID, 1);
cipherKeyCache.insertCipherKey(
headerBaseCipher->domainId, headerBaseCipher->keyId, headerBaseCipher->key.get(), headerBaseCipher->len);
TraceEvent("BlobCipherTest_InsertKeysDone").log();
// validate the cipherKey lookups work as desired
@ -509,13 +686,13 @@ TEST_CASE("flow/BlobCipher") {
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getCipherKey(baseCipher->domainId, baseCipher->keyId);
ASSERT(cipherKey.isValid());
// validate common cipher properties - domainId, baseCipherId, baseCipherLen, rawBaseCipher
ASSERT(cipherKey->getBaseCipherId() == baseCipher->keyId);
ASSERT(cipherKey->getDomainId() == baseCipher->domainId);
ASSERT(cipherKey->getBaseCipherLen() == baseCipher->len);
ASSERT_EQ(cipherKey->getBaseCipherId(), baseCipher->keyId);
ASSERT_EQ(cipherKey->getDomainId(), baseCipher->domainId);
ASSERT_EQ(cipherKey->getBaseCipherLen(), baseCipher->len);
// ensure that baseCipher matches with the cached information
ASSERT(std::memcmp(cipherKey->rawBaseCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()) == 0);
ASSERT_EQ(std::memcmp(cipherKey->rawBaseCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()), 0);
// validate the encryption derivation
ASSERT(std::memcmp(cipherKey->rawCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()) != 0);
ASSERT_NE(std::memcmp(cipherKey->rawCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()), 0);
}
}
TraceEvent("BlobCipherTest_LooksupDone").log();
@ -548,6 +725,7 @@ TEST_CASE("flow/BlobCipher") {
// Validate Encyrption ops
Reference<BlobCipherKey> cipherKey = cipherKeyCache.getLatestCipherKey(minDomainId);
Reference<BlobCipherKey> headerCipherKey = cipherKeyCache.getLatestCipherKey(ENCRYPT_HEADER_DOMAIN_ID);
const int bufLen = deterministicRandom()->randomInt(786, 2127) + 512;
uint8_t orgData[bufLen];
generateRandomData(&orgData[0], bufLen);
@ -556,68 +734,317 @@ TEST_CASE("flow/BlobCipher") {
uint8_t iv[AES_256_IV_LENGTH];
generateRandomData(&iv[0], AES_256_IV_LENGTH);
// validate basic encrypt followed by decrypt operation
EncryptBlobCipherAes265Ctr encryptor(cipherKey, iv, AES_256_IV_LENGTH);
BlobCipherEncryptHeader header;
Reference<EncryptBuf> encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
BlobCipherEncryptHeader headerCopy;
// validate basic encrypt followed by decrypt operation for AUTH_MODE_NONE
{
TraceEvent("NoneAuthMode_Start").log();
ASSERT(encrypted->getLogicalSize() == bufLen);
ASSERT(memcmp(&orgData[0], encrypted->begin(), bufLen) != 0);
ASSERT(header.flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
ASSERT(header.flags.encryptMode == BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR);
EncryptBlobCipherAes265Ctr encryptor(
cipherKey, Reference<BlobCipherKey>(), iv, AES_256_IV_LENGTH, ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE);
BlobCipherEncryptHeader header;
Reference<EncryptBuf> encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
TraceEvent("BlobCipherTest_EncryptDone")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderEncryptMode", header.flags.encryptMode)
.detail("DomainId", header.encryptDomainId)
.detail("BaseCipherId", header.baseCipherId)
.detail("HeaderChecksum", header.ciphertextChecksum);
ASSERT_EQ(encrypted->getLogicalSize(), bufLen);
ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0);
ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR);
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE);
Reference<BlobCipherKey> encyrptKey = cipherKeyCache.getCipherKey(header.encryptDomainId, header.baseCipherId);
ASSERT(encyrptKey->isEqual(cipherKey));
DecryptBlobCipherAes256Ctr decryptor(encyrptKey, &header.iv[0]);
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
TraceEvent("BlobCipherTest_EncryptDone")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderEncryptMode", header.flags.encryptMode)
.detail("DomainId", header.cipherTextDetails.encryptDomainId)
.detail("BaseCipherId", header.cipherTextDetails.baseCipherId);
ASSERT(decrypted->getLogicalSize() == bufLen);
ASSERT(memcmp(decrypted->begin(), &orgData[0], bufLen) == 0);
Reference<BlobCipherKey> tCipherKeyKey = cipherKeyCache.getCipherKey(header.cipherTextDetails.encryptDomainId,
header.cipherTextDetails.baseCipherId);
ASSERT(tCipherKeyKey->isEqual(cipherKey));
DecryptBlobCipherAes256Ctr decryptor(
tCipherKeyKey, Reference<BlobCipherKey>(), &header.cipherTextDetails.iv[0]);
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
TraceEvent("BlobCipherTest_DecryptDone").log();
ASSERT_EQ(decrypted->getLogicalSize(), bufLen);
ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0);
// induce encryption header corruption - headerVersion corrupted
header.flags.headerVersion += 1;
try {
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
TraceEvent("BlobCipherTest_DecryptDone").log();
// induce encryption header corruption - headerVersion corrupted
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
headerCopy.flags.headerVersion += 1;
try {
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
DecryptBlobCipherAes256Ctr decryptor(
tCipherKeyKey, Reference<BlobCipherKey>(), &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
}
}
header.flags.headerVersion -= 1;
// induce encryption header corruption - encryptionMode corrupted
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
headerCopy.flags.encryptMode += 1;
try {
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
DecryptBlobCipherAes256Ctr decryptor(
tCipherKeyKey, Reference<BlobCipherKey>(), &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
}
}
// induce encrypted buffer payload corruption
try {
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
uint8_t temp[bufLen];
memcpy(encrypted->begin(), &temp[0], bufLen);
int tIdx = deterministicRandom()->randomInt(0, bufLen - 1);
temp[tIdx] += 1;
DecryptBlobCipherAes256Ctr decryptor(
tCipherKeyKey, Reference<BlobCipherKey>(), &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena);
} catch (Error& e) {
// No authToken, hence, no corruption detection supported
ASSERT(false);
}
TraceEvent("NoneAuthMode_Done").log();
}
// induce encryption header corruption - encryptionMode corrupted
header.flags.encryptMode += 1;
try {
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
// validate basic encrypt followed by decrypt operation for AUTH_TOKEN_MODE_SINGLE
{
TraceEvent("SingleAuthMode_Start").log();
EncryptBlobCipherAes265Ctr encryptor(
cipherKey, headerCipherKey, iv, AES_256_IV_LENGTH, ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE);
BlobCipherEncryptHeader header;
Reference<EncryptBuf> encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
ASSERT_EQ(encrypted->getLogicalSize(), bufLen);
ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0);
ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR);
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE);
TraceEvent("BlobCipherTest_EncryptDone")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderEncryptMode", header.flags.encryptMode)
.detail("DomainId", header.cipherTextDetails.encryptDomainId)
.detail("BaseCipherId", header.cipherTextDetails.baseCipherId)
.detail("HeaderAuthToken",
StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_SIZE).toString());
Reference<BlobCipherKey> tCipherKeyKey = cipherKeyCache.getCipherKey(header.cipherTextDetails.encryptDomainId,
header.cipherTextDetails.baseCipherId);
Reference<BlobCipherKey> hCipherKey = cipherKeyCache.getCipherKey(header.cipherHeaderDetails.encryptDomainId,
header.cipherHeaderDetails.baseCipherId);
ASSERT(tCipherKeyKey->isEqual(cipherKey));
DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &header.cipherTextDetails.iv[0]);
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
ASSERT_EQ(decrypted->getLogicalSize(), bufLen);
ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0);
TraceEvent("BlobCipherTest_DecryptDone").log();
// induce encryption header corruption - headerVersion corrupted
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
headerCopy.flags.headerVersion += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
}
}
header.flags.encryptMode -= 1;
// induce encryption header corruption - encryptionMode corrupted
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
headerCopy.flags.encryptMode += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
}
}
// induce encryption header corruption - authToken mismatch
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
int hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_SIZE - 1);
headerCopy.singleAuthToken.authToken[hIdx] += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
throw;
}
}
// induce encrypted buffer payload corruption
try {
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
uint8_t temp[bufLen];
memcpy(encrypted->begin(), &temp[0], bufLen);
int tIdx = deterministicRandom()->randomInt(0, bufLen - 1);
temp[tIdx] += 1;
DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena);
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
throw;
}
}
TraceEvent("SingleAuthMode_Done").log();
}
// induce encryption header corruption - checksum mismatch
header.ciphertextChecksum += 1;
try {
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_checksum_mismatch) {
throw;
// validate basic encrypt followed by decrypt operation for AUTH_TOKEN_MODE_MULTI
{
TraceEvent("MultiAuthMode_Start").log();
EncryptBlobCipherAes265Ctr encryptor(
cipherKey, headerCipherKey, iv, AES_256_IV_LENGTH, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);
BlobCipherEncryptHeader header;
Reference<EncryptBuf> encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
ASSERT_EQ(encrypted->getLogicalSize(), bufLen);
ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0);
ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION);
ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR);
ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI);
TraceEvent("BlobCipherTest_EncryptDone")
.detail("HeaderVersion", header.flags.headerVersion)
.detail("HeaderEncryptMode", header.flags.encryptMode)
.detail("DomainId", header.cipherTextDetails.encryptDomainId)
.detail("BaseCipherId", header.cipherTextDetails.baseCipherId)
.detail("HeaderAuthToken",
StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_SIZE).toString());
Reference<BlobCipherKey> tCipherKey = cipherKeyCache.getCipherKey(header.cipherTextDetails.encryptDomainId,
header.cipherTextDetails.baseCipherId);
Reference<BlobCipherKey> hCipherKey = cipherKeyCache.getCipherKey(header.cipherHeaderDetails.encryptDomainId,
header.cipherHeaderDetails.baseCipherId);
ASSERT(tCipherKey->isEqual(cipherKey));
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]);
Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena);
ASSERT_EQ(decrypted->getLogicalSize(), bufLen);
ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0);
TraceEvent("BlobCipherTest_DecryptDone").log();
// induce encryption header corruption - headerVersion corrupted
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
headerCopy.flags.headerVersion += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
}
}
header.ciphertextChecksum -= 1;
// induce encryption header corruption - encryptionMode corrupted
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
headerCopy.flags.encryptMode += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_metadata_mismatch) {
throw;
}
}
// induce encryption header corruption - cipherText authToken mismatch
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
int hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_SIZE - 1);
headerCopy.multiAuthTokens.cipherTextAuthToken[hIdx] += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
throw;
}
}
// induce encryption header corruption - header authToken mismatch
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
memcpy(reinterpret_cast<uint8_t*>(&headerCopy),
reinterpret_cast<const uint8_t*>(&header),
sizeof(BlobCipherEncryptHeader));
hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_SIZE - 1);
headerCopy.multiAuthTokens.headerAuthToken[hIdx] += 1;
try {
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena);
ASSERT(false); // error expected
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
throw;
}
}
try {
encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena);
uint8_t temp[bufLen];
memcpy(encrypted->begin(), &temp[0], bufLen);
int tIdx = deterministicRandom()->randomInt(0, bufLen - 1);
temp[tIdx] += 1;
DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]);
decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena);
} catch (Error& e) {
if (e.code() != error_code_encrypt_header_authtoken_mismatch) {
throw;
}
}
TraceEvent("MultiAuthMode_Done").log();
}
// Validate dropping encyrptDomainId cached keys
const BlobCipherDomainId candidate = deterministicRandom()->randomInt(minDomainId, maxDomainId);
const EncryptCipherDomainId candidate = deterministicRandom()->randomInt(minDomainId, maxDomainId);
cipherKeyCache.resetEncyrptDomainId(candidate);
std::vector<Reference<BlobCipherKey>> cachedKeys = cipherKeyCache.getAllCiphers(candidate);
ASSERT(cachedKeys.empty());
@ -633,20 +1060,4 @@ TEST_CASE("flow/BlobCipher") {
return Void();
}
BlobCipherChecksum computeEncryptChecksum(const uint8_t* payload,
const int payloadLen,
const BlobCipherRandomSalt& salt,
Arena& arena) {
// FIPS compliance recommendation is to leverage cryptographic digest mechanism to generate checksum
// Leverage HMAC_SHA256 using header.randomSalt as the initialization 'key' for the hmac digest.
HmacSha256DigestGen hmacGenerator((const uint8_t*)&salt, sizeof(salt));
StringRef digest = hmacGenerator.digest(payload, payloadLen, arena);
ASSERT(digest.size() >= sizeof(BlobCipherChecksum));
BlobCipherChecksum checksum;
memcpy((uint8_t*)&checksum, digest.begin(), sizeof(BlobCipherChecksum));
return checksum;
}
#endif // ENCRYPTION_ENABLED

View File

@ -33,6 +33,7 @@
#if ENCRYPTION_ENABLED
#include "flow/Arena.h"
#include "flow/EncryptUtils.h"
#include "flow/FastRef.h"
#include "flow/flow.h"
#include "flow/xxhash.h"
@ -45,15 +46,6 @@
#define AES_256_KEY_LENGTH 32
#define AES_256_IV_LENGTH 16
#define INVALID_DOMAIN_ID 0
#define INVALID_CIPHER_KEY_ID 0
using BlobCipherDomainId = uint64_t;
using BlobCipherRandomSalt = uint64_t;
using BlobCipherBaseKeyId = uint64_t;
using BlobCipherChecksum = uint64_t;
typedef enum { BLOB_CIPHER_ENCRYPT_MODE_NONE = 0, BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR = 1 } BlockCipherEncryptMode;
// Encryption operations buffer management
// Approach limits number of copies needed during encryption or decryption operations.
@ -89,51 +81,94 @@ private:
// This header is persisted along with encrypted buffer, it contains information necessary
// to assist decrypting the buffers to serve read requests.
//
// The total space overhead is 56 bytes.
// The total space overhead is 96 bytes.
#pragma pack(push, 1) // exact fit - no padding
typedef struct BlobCipherEncryptHeader {
static constexpr int headerSize = 96;
union {
struct {
uint8_t size; // reading first byte is sufficient to determine header
// length. ALWAYS THE FIRST HEADER ELEMENT.
uint8_t headerVersion{};
uint8_t encryptMode{};
uint8_t _reserved[5]{};
uint8_t authTokenMode{};
uint8_t _reserved[4]{};
} flags;
uint64_t _padding{};
};
// Encyrption domain boundary identifier.
BlobCipherDomainId encryptDomainId{};
// BaseCipher encryption key identifier
BlobCipherBaseKeyId baseCipherId{};
// Random salt
BlobCipherRandomSalt salt{};
// Checksum of the encrypted buffer. It protects against 'tampering' of ciphertext as well 'bit rots/flips'.
BlobCipherChecksum ciphertextChecksum{};
// Initialization vector used to encrypt the payload.
uint8_t iv[AES_256_IV_LENGTH];
BlobCipherEncryptHeader();
// Cipher text encryption information
struct {
// Encyrption domain boundary identifier.
EncryptCipherDomainId encryptDomainId{};
// BaseCipher encryption key identifier
EncryptCipherBaseKeyId baseCipherId{};
// Random salt
EncryptCipherRandomSalt salt{};
// Initialization vector used to encrypt the payload.
uint8_t iv[AES_256_IV_LENGTH];
} cipherTextDetails;
struct {
// Encryption domainId for the header
EncryptCipherDomainId encryptDomainId{};
// BaseCipher encryption key identifier.
EncryptCipherBaseKeyId baseCipherId{};
} cipherHeaderDetails;
// Encryption header is stored as plaintext on a persistent storage to assist reconstruction of cipher-key(s) for
// reads. FIPS compliance recommendation is to leverage cryptographic digest mechanism to generate 'authentication
// token' (crypto-secure) to protect against malicious tampering and/or bit rot/flip scenarios.
union {
// Encryption header support two modes of generation 'authentication tokens':
// 1) SingleAuthTokenMode: the scheme generates single crypto-secrure auth token to protect {cipherText +
// header} payload. Scheme is geared towards optimizing cost due to crypto-secure auth-token generation,
// however, on decryption client needs to be read 'header' + 'encrypted-buffer' to validate the 'auth-token'.
// The scheme is ideal for usecases where payload represented by the encryptionHeader is not large and it is
// desirable to minimize CPU/latency penalty due to crypto-secure ops, such as: CommitProxies encrypted inline
// transactions, StorageServer encrypting pages etc. 2) MultiAuthTokenMode: Scheme generates separate authTokens
// for 'encrypted buffer' & 'encryption-header'. The scheme is ideal where payload represented by
// encryptionHeader is large enough such that it is desirable to optimize cost of upfront reading full
// 'encrypted buffer', compared to reading only encryptionHeader and ensuring its sanity; for instance:
// backup-files.
struct {
// Cipher text authentication token
uint8_t cipherTextAuthToken[AUTH_TOKEN_SIZE]{};
uint8_t headerAuthToken[AUTH_TOKEN_SIZE]{};
} multiAuthTokens;
struct {
uint8_t authToken[AUTH_TOKEN_SIZE]{};
uint8_t _reserved[AUTH_TOKEN_SIZE]{};
} singleAuthToken;
};
BlobCipherEncryptHeader() {}
} BlobCipherEncryptHeader;
#pragma pack(pop)
// Ensure no struct-packing issues
static_assert(sizeof(BlobCipherEncryptHeader) == BlobCipherEncryptHeader::headerSize,
"BlobCipherEncryptHeader size mismatch");
// This interface is in-memory representation of CipherKey used for encryption/decryption information.
// It caches base encryption key properties as well as caches the 'derived encryption' key obtained by applying
// HMAC-SHA-256 derivation technique.
class BlobCipherKey : public ReferenceCounted<BlobCipherKey>, NonCopyable {
public:
BlobCipherKey(const BlobCipherDomainId& domainId,
const BlobCipherBaseKeyId& baseCiphId,
BlobCipherKey(const EncryptCipherDomainId& domainId,
const EncryptCipherBaseKeyId& baseCiphId,
const uint8_t* baseCiph,
int baseCiphLen);
uint8_t* data() const { return cipher.get(); }
uint64_t getCreationTime() const { return creationTime; }
BlobCipherDomainId getDomainId() const { return encryptDomainId; }
BlobCipherRandomSalt getSalt() const { return randomSalt; }
BlobCipherBaseKeyId getBaseCipherId() const { return baseCipherId; }
EncryptCipherDomainId getDomainId() const { return encryptDomainId; }
EncryptCipherRandomSalt getSalt() const { return randomSalt; }
EncryptCipherBaseKeyId getBaseCipherId() const { return baseCipherId; }
int getBaseCipherLen() const { return baseCipherLen; }
uint8_t* rawCipher() const { return cipher.get(); }
uint8_t* rawBaseCipher() const { return baseCipher.get(); }
@ -147,23 +182,23 @@ public:
private:
// Encryption domain boundary identifier
BlobCipherDomainId encryptDomainId;
EncryptCipherDomainId encryptDomainId;
// Base encryption cipher key properties
std::unique_ptr<uint8_t[]> baseCipher;
int baseCipherLen;
BlobCipherBaseKeyId baseCipherId;
EncryptCipherBaseKeyId baseCipherId;
// Random salt used for encryption cipher key derivation
BlobCipherRandomSalt randomSalt;
EncryptCipherRandomSalt randomSalt;
// Creation timestamp for the derived encryption cipher key
uint64_t creationTime;
// Derived encryption cipher key
std::unique_ptr<uint8_t[]> cipher;
void initKey(const BlobCipherDomainId& domainId,
void initKey(const EncryptCipherDomainId& domainId,
const uint8_t* baseCiph,
int baseCiphLen,
const BlobCipherBaseKeyId& baseCiphId,
const BlobCipherRandomSalt& salt);
const EncryptCipherBaseKeyId& baseCiphId,
const EncryptCipherRandomSalt& salt);
void applyHmacSha256Derivation();
};
@ -190,37 +225,45 @@ private:
// required encryption key, however, CPs/SSs cache-miss would result in RPC to
// EncryptKeyServer to refresh the desired encryption key.
using BlobCipherKeyIdCacheMap = std::unordered_map<BlobCipherBaseKeyId, Reference<BlobCipherKey>>;
using BlobCipherKeyIdCacheMapCItr = std::unordered_map<BlobCipherBaseKeyId, Reference<BlobCipherKey>>::const_iterator;
using BlobCipherKeyIdCacheMap = std::unordered_map<EncryptCipherBaseKeyId, Reference<BlobCipherKey>>;
using BlobCipherKeyIdCacheMapCItr =
std::unordered_map<EncryptCipherBaseKeyId, Reference<BlobCipherKey>>::const_iterator;
struct BlobCipherKeyIdCache : ReferenceCounted<BlobCipherKeyIdCache> {
public:
BlobCipherKeyIdCache();
explicit BlobCipherKeyIdCache(BlobCipherDomainId dId);
explicit BlobCipherKeyIdCache(EncryptCipherDomainId dId);
// API returns the last inserted cipherKey.
// If none exists, 'encrypt_key_not_found' is thrown.
Reference<BlobCipherKey> getLatestCipherKey();
// API returns cipherKey corresponding to input 'baseCipherKeyId'.
// If none exists, 'encrypt_key_not_found' is thrown.
Reference<BlobCipherKey> getCipherByBaseCipherId(BlobCipherBaseKeyId baseCipherKeyId);
Reference<BlobCipherKey> getCipherByBaseCipherId(EncryptCipherBaseKeyId baseCipherKeyId);
// API enables inserting base encryption cipher details to the BlobCipherKeyIdCache.
// Given cipherKeys are immutable, attempting to re-insert same 'identical' cipherKey
// is treated as a NOP (success), however, an attempt to update cipherKey would throw
// 'encrypt_update_cipher' exception.
void insertBaseCipherKey(BlobCipherBaseKeyId baseCipherId, const uint8_t* baseCipher, int baseCipherLen);
void insertBaseCipherKey(EncryptCipherBaseKeyId baseCipherId, const uint8_t* baseCipher, int baseCipherLen);
// API cleanup the cache by dropping all cached cipherKeys
void cleanup();
// API returns list of all 'cached' cipherKeys
std::vector<Reference<BlobCipherKey>> getAllCipherKeys();
private:
BlobCipherDomainId domainId;
EncryptCipherDomainId domainId;
BlobCipherKeyIdCacheMap keyIdCache;
BlobCipherBaseKeyId latestBaseCipherKeyId;
EncryptCipherBaseKeyId latestBaseCipherKeyId;
};
using BlobCipherDomainCacheMap = std::unordered_map<BlobCipherDomainId, Reference<BlobCipherKeyIdCache>>;
using BlobCipherDomainCacheMap = std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKeyIdCache>>;
class BlobCipherKeyCache : NonCopyable {
public:
@ -228,21 +271,28 @@ public:
// The cipherKeys are indexed using 'baseCipherId', given cipherKeys are immutable,
// attempting to re-insert same 'identical' cipherKey is treated as a NOP (success),
// however, an attempt to update cipherKey would throw 'encrypt_update_cipher' exception.
void insertCipherKey(const BlobCipherDomainId& domainId,
const BlobCipherBaseKeyId& baseCipherId,
void insertCipherKey(const EncryptCipherDomainId& domainId,
const EncryptCipherBaseKeyId& baseCipherId,
const uint8_t* baseCipher,
int baseCipherLen);
// API returns the last insert cipherKey for a given encyryption domain Id.
// If none exists, it would throw 'encrypt_key_not_found' exception.
Reference<BlobCipherKey> getLatestCipherKey(const BlobCipherDomainId& domainId);
Reference<BlobCipherKey> getLatestCipherKey(const EncryptCipherDomainId& domainId);
// API returns cipherKey corresponding to {encryptionDomainId, baseCipherId} tuple.
// If none exists, it would throw 'encrypt_key_not_found' exception.
Reference<BlobCipherKey> getCipherKey(const BlobCipherDomainId& domainId, const BlobCipherBaseKeyId& baseCipherId);
Reference<BlobCipherKey> getCipherKey(const EncryptCipherDomainId& domainId,
const EncryptCipherBaseKeyId& baseCipherId);
// API returns point in time list of all 'cached' cipherKeys for a given encryption domainId.
std::vector<Reference<BlobCipherKey>> getAllCiphers(const BlobCipherDomainId& domainId);
std::vector<Reference<BlobCipherKey>> getAllCiphers(const EncryptCipherDomainId& domainId);
// API enables dropping all 'cached' cipherKeys for a given encryption domain Id.
// Useful to cleanup cache if an encryption domain gets removed/destroyed etc.
void resetEncyrptDomainId(const BlobCipherDomainId domainId);
void resetEncyrptDomainId(const EncryptCipherDomainId domainId);
static BlobCipherKeyCache& getInstance() {
static BlobCipherKeyCache instance;
@ -262,14 +312,19 @@ private:
// This interface enables data block encryption. An invocation to encrypt() will
// do two things:
// 1) generate encrypted ciphertext for given plaintext input.
// 2) generate BlobCipherEncryptHeader (including the 'header checksum') and persit for decryption on reads.
// 2) generate BlobCipherEncryptHeader (including the 'header authTokens') and persit for decryption on reads.
class EncryptBlobCipherAes265Ctr final : NonCopyable, public ReferenceCounted<EncryptBlobCipherAes265Ctr> {
public:
static constexpr uint8_t ENCRYPT_HEADER_VERSION = 1;
EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey> key, const uint8_t* iv, const int ivLen);
EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey> tCipherKey,
Reference<BlobCipherKey> hCipherKey,
const uint8_t* iv,
const int ivLen,
const EncryptAuthTokenMode mode);
~EncryptBlobCipherAes265Ctr();
Reference<EncryptBuf> encrypt(const uint8_t* plaintext,
const int plaintextLen,
BlobCipherEncryptHeader* header,
@ -277,7 +332,9 @@ public:
private:
EVP_CIPHER_CTX* ctx;
Reference<BlobCipherKey> cipherKey;
Reference<BlobCipherKey> textCipherKey;
Reference<BlobCipherKey> headerCipherKey;
EncryptAuthTokenMode authTokenMode;
uint8_t iv[AES_256_IV_LENGTH];
};
@ -286,20 +343,44 @@ private:
class DecryptBlobCipherAes256Ctr final : NonCopyable, public ReferenceCounted<DecryptBlobCipherAes256Ctr> {
public:
DecryptBlobCipherAes256Ctr(Reference<BlobCipherKey> key, const uint8_t* iv);
DecryptBlobCipherAes256Ctr(Reference<BlobCipherKey> tCipherKey,
Reference<BlobCipherKey> hCipherKey,
const uint8_t* iv);
~DecryptBlobCipherAes256Ctr();
Reference<EncryptBuf> decrypt(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
Arena&);
// Enable caller to validate encryption header auth-token (if available) without needing to read the full encyrpted
// payload. The call is NOP unless header.flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI.
void verifyHeaderAuthToken(const BlobCipherEncryptHeader& header, Arena& arena);
private:
EVP_CIPHER_CTX* ctx;
Reference<BlobCipherKey> textCipherKey;
Reference<BlobCipherKey> headerCipherKey;
bool headerAuthTokenValidationDone;
bool authTokensValidationDone;
void verifyEncryptBlobHeader(const uint8_t* cipherText,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
Arena& arena);
void verifyEncryptHeaderMetadata(const BlobCipherEncryptHeader& header);
void verifyAuthTokens(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
uint8_t* buff,
Arena& arena);
void verifyHeaderSingleAuthToken(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
uint8_t* buff,
Arena& arena);
void verifyHeaderMultiAuthToken(const uint8_t* ciphertext,
const int ciphertextLen,
const BlobCipherEncryptHeader& header,
uint8_t* buff,
Arena& arena);
};
class HmacSha256DigestGen final : NonCopyable {
@ -313,9 +394,10 @@ private:
HMAC_CTX* ctx;
};
BlobCipherChecksum computeEncryptChecksum(const uint8_t* payload,
const int payloadLen,
const BlobCipherRandomSalt& salt,
Arena& arena);
StringRef computeAuthToken(const uint8_t* payload,
const int payloadLen,
const uint8_t* key,
const int keyLen,
Arena& arena);
#endif // ENCRYPTION_ENABLED

66
flow/EncryptUtils.h Normal file
View File

@ -0,0 +1,66 @@
/*
* EncryptUtils.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef ENCRYPT_UTILS_H
#define ENCRYPT_UTILS_H
#pragma once
#include <cstdint>
#include <limits>
#define ENCRYPT_INVALID_DOMAIN_ID 0
#define ENCRYPT_INVALID_CIPHER_KEY_ID 0
#define AUTH_TOKEN_SIZE 16
#define SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID -1
#define ENCRYPT_HEADER_DOMAIN_ID -2
using EncryptCipherDomainId = int64_t;
using EncryptCipherBaseKeyId = uint64_t;
using EncryptCipherRandomSalt = uint64_t;
typedef enum {
ENCRYPT_CIPHER_MODE_NONE = 0,
ENCRYPT_CIPHER_MODE_AES_256_CTR = 1,
ENCRYPT_CIPHER_MODE_LAST = 2
} EncryptCipherMode;
static_assert(EncryptCipherMode::ENCRYPT_CIPHER_MODE_LAST <= std::numeric_limits<uint8_t>::max(),
"EncryptCipherMode value overflow");
// EncryptionHeader authentication modes
// 1. NONE - No 'authentication token' generation needed for EncryptionHeader i.e. no protection against header OR
// cipherText 'tampering' and/or bit rot/flip corruptions.
// 2. Single/Multi - Encyrption header would generate one or more 'authentication tokens' to protect the header against
// 'tempering' and/or bit rot/flip corruptions. Refer to BlobCipher.h for detailed usage recommendations.
// 3. LAST - Invalid mode, used for static asserts.
typedef enum {
ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE = 0,
ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE = 1,
ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI = 2,
ENCRYPT_HEADER_AUTH_TOKEN_LAST = 3 // Always the last element
} EncryptAuthTokenMode;
static_assert(EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_LAST <= std::numeric_limits<uint8_t>::max(),
"EncryptHeaderAuthToken value overflow");
#endif

View File

@ -963,7 +963,7 @@ struct DynamicFieldBase {
if (getDerivedTypeName() == metricTypeName<T>())
return (DynamicField<T>*)this;
TraceEvent(SevWarnAlways, "ScopeEventFieldTypeMismatch")
TraceEvent(g_network->isSimulated() ? SevError : SevWarnAlways, "ScopeEventFieldTypeMismatch")
.detail("EventType", eventType.toString())
.detail("FieldName", fieldName().toString())
.detail("OldType", getDerivedTypeName().toString())

View File

@ -35,6 +35,7 @@
#define TRACE_DEFAULT_ROLL_SIZE (10 << 20)
#define TRACE_DEFAULT_MAX_LOGS_SIZE (10 * TRACE_DEFAULT_ROLL_SIZE)
#define PRINTABLE_COMPRESS_NULLS 0
inline int fastrand() {
static int g_seed = 0;
@ -343,20 +344,37 @@ struct TraceableStringImpl : std::true_type {
}
std::string result;
result.reserve(size - nonPrintables + (nonPrintables * 4) + numBackslashes);
int numNull = 0;
for (auto iter = TraceableString<T>::begin(value); !TraceableString<T>::atEnd(value, iter); ++iter) {
if (*iter == '\\') {
if (numNull > 0) {
result += format("[%d]", numNull);
numNull = 0;
}
result.push_back('\\');
result.push_back('\\');
} else if (isPrintable(*iter)) {
if (numNull > 0) {
result += format("[%d]", numNull);
numNull = 0;
}
result.push_back(*iter);
} else {
const uint8_t byte = *iter;
result.push_back('\\');
result.push_back('x');
result.push_back(base16Char(byte / 16));
result.push_back(base16Char(byte));
if (PRINTABLE_COMPRESS_NULLS && byte == 0) {
numNull++;
} else {
result.push_back('\\');
result.push_back('x');
result.push_back(base16Char(byte / 16));
result.push_back(base16Char(byte));
}
}
}
if (numNull > 0) {
result += format("[%d]", numNull);
numNull = 0;
}
return result;
}
};

View File

@ -690,7 +690,7 @@ TEST_CASE("/flow/Tracing/AddLinks") {
return Void();
};
uint64_t swapUint16BE(uint8_t* index) {
uint16_t swapUint16BE(uint8_t* index) {
uint16_t value;
memcpy(&value, index, sizeof(value));
return fromBigEndian16(value);
@ -718,6 +718,26 @@ std::string readMPString(uint8_t* index, int len) {
return reinterpret_cast<char*>(data);
}
std::string readMPString(uint8_t* index) {
auto len = 0;
switch (*index) {
case 0xda:
index++; // read the size in the next 2 bytes
len = swapUint16BE(index);
index += 2; // move index past the size bytes
break;
default:
// We & out the bits here that contain the length the initial 3 higher order bits are
// to signify this is a string of len <= 31 chars.
len = static_cast<uint8_t>(*index & 0b00011111);
index++;
}
uint8_t data[len + 1];
std::copy(index, index + len, data);
data[len] = '\0';
return reinterpret_cast<char*>(data);
}
// Windows doesn't like lack of header and declaration of constructor for FastUDPTracer
#ifndef WIN32
TEST_CASE("/flow/Tracing/FastUDPMessagePackEncoding") {
@ -754,9 +774,7 @@ TEST_CASE("/flow/Tracing/FastUDPMessagePackEncoding") {
ASSERT(data[46] == 0xcf);
ASSERT(swapUint64BE(&data[47]) == 1);
// Read and verify span name
ASSERT(data[55] == (0b10100000 | strlen("encoded_span")));
ASSERT(strncmp(readMPString(&data[56], strlen("encoded_span")).c_str(), "encoded_span", strlen("encoded_span")) ==
0);
ASSERT(readMPString(&data[55]) == "encoded_span");
// Verify begin/end is encoded, we don't care about the values
ASSERT(data[68] == 0xcb);
ASSERT(data[77] == 0xcb);
@ -795,10 +813,7 @@ TEST_CASE("/flow/Tracing/FastUDPMessagePackEncoding") {
ASSERT(data[0] == 0b10011110); // 14 element array.
// We don't care about the next 54 bytes as there is no parent and a randomly assigned Trace and SpanID
// Read and verify span name
ASSERT(data[55] == (0b10100000 | strlen("encoded_span_3")));
ASSERT(strncmp(readMPString(&data[56], strlen("encoded_span_3")).c_str(),
"encoded_span_3",
strlen("encoded_span_3")) == 0);
ASSERT(readMPString(&data[55]) == "encoded_span_3");
// Verify begin/end is encoded, we don't care about the values
ASSERT(data[70] == 0xcb);
ASSERT(data[79] == 0xcb);
@ -818,43 +833,32 @@ TEST_CASE("/flow/Tracing/FastUDPMessagePackEncoding") {
ASSERT(swapUint64BE(&data[112]) == 400);
// Events
ASSERT(data[120] == 0b10010001); // empty
ASSERT(data[121] == (0b10100000 | strlen("event1")));
ASSERT(strncmp(readMPString(&data[122], strlen("event1")).c_str(), "event1", strlen("event1")) == 0);
ASSERT(readMPString(&data[121]) == "event1");
ASSERT(data[128] == 0xcb);
ASSERT(swapDoubleBE(&data[129]) == 100.101);
// Events Attributes
ASSERT(data[137] == 0b10000001); // single k/v pair
ASSERT(data[138] == 0b10100011); // length of key string "foo" == 3
ASSERT(strncmp(readMPString(&data[139], strlen("foo")).c_str(), "foo", strlen("foo")) == 0);
ASSERT(data[142] == 0b10100011); // length of key string "bar" == 3
ASSERT(strncmp(readMPString(&data[143], strlen("bar")).c_str(), "bar", strlen("bar")) == 0);
ASSERT(readMPString(&data[138]) == "foo");
ASSERT(readMPString(&data[142]) == "bar");
// Attributes
ASSERT(data[146] == 0b10000010); // two k/v pair
// Reconstruct map from MessagePack wire format data and verify.
std::unordered_map<std::string, std::string> attributes;
auto index = 147;
// We & out the bits here that contain the length the initial 4 higher order bits are
// to signify this is a string of len <= 31 chars.
auto firstKeyLength = static_cast<uint8_t>(data[index] & 0b00011111);
index++;
auto firstKey = readMPString(&data[index], firstKeyLength);
index += firstKeyLength;
auto firstValueLength = static_cast<uint8_t>(data[index] & 0b00011111);
index++;
auto firstValue = readMPString(&data[index], firstValueLength);
index += firstValueLength;
auto firstKey = readMPString(&data[index]);
index += firstKey.length() + 1; // +1 for control byte
auto firstValue = readMPString(&data[index]);
index += firstValue.length() + 1; // +1 for control byte
attributes[firstKey] = firstValue;
auto secondKeyLength = static_cast<uint8_t>(data[index] & 0b00011111);
index++;
auto secondKey = readMPString(&data[index], secondKeyLength);
index += secondKeyLength;
auto secondValueLength = static_cast<uint8_t>(data[index] & 0b00011111);
index++;
auto secondValue = readMPString(&data[index], secondValueLength);
auto secondKey = readMPString(&data[index]);
index += secondKey.length() + 1; // +1 for control byte
auto secondValue = readMPString(&data[index]);
attributes[secondKey] = secondValue;
// We don't know what the value for address will be, so just verify it is in the map.
ASSERT(attributes.find("address") != attributes.end());
ASSERT(strncmp(attributes["operation"].c_str(), "grv", strlen("grv")) == 0);
ASSERT(attributes["operation"] == "grv");
request.reset();
@ -876,9 +880,7 @@ TEST_CASE("/flow/Tracing/FastUDPMessagePackEncoding") {
// We don't care about the next 54 bytes as there is no parent and a randomly assigned Trace and SpanID
// Read and verify span name
ASSERT(data[55] == 0xda);
auto locationLength = swapUint16BE(&data[56]);
ASSERT(locationLength == strlen(longString));
ASSERT(strncmp(readMPString(&data[58], locationLength).c_str(), longString, strlen(longString)) == 0);
ASSERT(readMPString(&data[55]) == longString);
return Void();
};
#endif

View File

@@ -73,3 +73,11 @@ T waitNext(const FutureStream<T>&);
#ifdef _MSC_VER
#pragma warning(disable : 4355) // 'this' : used in base member initializer list
#endif
// Currently, #ifdef can't be used inside actors, so define no-op versions of these valgrind
// functions if valgrind is not defined
#ifndef VALGRIND
#define VALGRIND_MAKE_MEM_UNDEFINED(x, y)
#define VALGRIND_MAKE_MEM_DEFINED(x, y)
#define VALGRIND_CHECK_MEM_IS_DEFINED(x, y) 0
#endif

View File

@@ -88,6 +88,13 @@ ERROR( blob_granule_transaction_too_old, 1064, "Read version is older than blob
ERROR( blob_manager_replaced, 1065, "This blob manager has been replaced." )
ERROR( change_feed_popped, 1066, "Tried to read a version older than what has been popped from the change feed" )
ERROR( remote_kvs_cancelled, 1067, "The remote key-value store is cancelled" )
ERROR( page_header_wrong_page_id, 1068, "Page header does not match location on disk" )
ERROR( page_header_checksum_failed, 1069, "Page header checksum failed" )
ERROR( page_header_version_not_supported, 1070, "Page header version is not supported" )
ERROR( page_encoding_not_supported, 1071, "Page encoding type is not supported or not valid" )
ERROR( page_decoding_failed, 1072, "Page content decoding failed" )
ERROR( unexpected_encoding_type, 1073, "Page content decoding failed" )
ERROR( encryption_key_not_found, 1074, "Encryption key not found" )
ERROR( broken_promise, 1100, "Broken promise" )
ERROR( operation_cancelled, 1101, "Asynchronous operation cancelled" )
@@ -290,14 +297,14 @@ ERROR( snap_log_anti_quorum_unsupported, 2507, "Unsupported when log anti quorum
ERROR( snap_with_recovery_unsupported, 2508, "Cluster recovery during snapshot operation not supported")
ERROR( snap_invalid_uid_string, 2509, "The given uid string is not a 32-length hex string")
// 3XXX - Encryption operations errors
ERROR( encrypt_ops_error, 3000, "Encryption operation error")
ERROR( encrypt_header_metadata_mismatch, 3001, "Encryption header metadata mismatch")
ERROR( encrypt_key_not_found, 3002, "Expected encryption key is missing")
ERROR( encrypt_key_ttl_expired, 3003, "Expected encryption key TTL has expired")
ERROR( encrypt_header_checksum_mismatch, 3004, "Encryption header checksum mismatch")
ERROR( encrypt_update_cipher, 3005, "Attempt to update encryption cipher key")
ERROR( encrypt_invalid_id, 3006, "Invalid encryption domainId or encryption cipher key id")
// 27XX - Encryption operations errors
ERROR( encrypt_ops_error, 2700, "Encryption operation error")
ERROR( encrypt_header_metadata_mismatch, 2701, "Encryption header metadata mismatch")
ERROR( encrypt_key_not_found, 2702, "Expected encryption key is missing")
ERROR( encrypt_key_ttl_expired, 2703, "Expected encryption key TTL has expired")
ERROR( encrypt_header_authtoken_mismatch, 2704, "Encryption header authentication token mismatch")
ERROR( encrypt_update_cipher, 2705, "Attempt to update encryption cipher key")
ERROR( encrypt_invalid_id, 2706, "Invalid encryption domainId or encryption cipher key id")
// 4xxx Internal errors (those that should be generated only by bugs) are decimal 4xxx
ERROR( unknown_error, 4000, "An unknown error occurred" ) // C++ exception not of type Error

View File

@@ -1,3 +1,5 @@
storageEngineExcludeTypes=3
logAntiQuorum = 0
testTitle=SubmitBackup

View File

@@ -1,3 +1,5 @@
storageEngineExcludeTypes=3
;write 1000 Keys ending with even numbers
testTitle=SnapTestPre
clearAfterTest=false

View File

@@ -1,3 +1,5 @@
storageEngineExcludeTypes=3
;write 1000 Keys ending with even numbers
testTitle=SnapTestPre
clearAfterTest=false

View File

@@ -1,3 +1,5 @@
storageEngineExcludeTypes=3
;write 1000 Keys ending with even number
testTitle=SnapSimplePre
clearAfterTest=false

View File

@@ -1,3 +1,5 @@
storageEngineExcludeTypes=3
[[test]]
testTitle = 'SubmitBackup'
simBackupAgents= 'BackupToFile'

View File

@@ -10,6 +10,7 @@ waitForQuiescenceBegin=false
testName = 'ConfigureDatabase'
testDuration = 300.0
waitStoreTypeCheck = true
storageMigrationCompatibleConf = true
[[test.workload]]
testName = 'RandomClogging'

View File

@@ -1,4 +1,4 @@
storageEngineExcludeTypes=-1,-2
storageEngineExcludeTypes=-1,-2,3
maxTLogVersion=6
disableTss=true
disableHostname=true

View File

@@ -10,6 +10,7 @@ waitForQuiescenceBegin=false
testName = 'ConfigureDatabase'
testDuration = 300.0
waitStoreTypeCheck = true
storageMigrationCompatibleConf = true
[[test.workload]]
testName = 'RandomClogging'