Merge branch 'master' of https://github.com/apple/foundationdb into feature-sim-time-batching

# Conflicts:
#	fdbserver/DataDistribution.actor.cpp
This commit is contained in:
Evan Tschannen 2021-06-08 10:04:06 -07:00
commit 08a5f17660
186 changed files with 5046 additions and 1105 deletions

3
.gitignore vendored
View File

@ -7,7 +7,8 @@ bindings/java/foundationdb-client*.jar
bindings/java/foundationdb-tests*.jar
bindings/java/fdb-java-*-sources.jar
packaging/msi/FDBInstaller.msi
builds/
cmake-build-debug/
# Generated source, build, and packaging files
*.g.cpp
*.g.h

View File

@ -78,6 +78,8 @@ if(NOT WIN32)
test/unit/fdb_api.cpp
test/unit/fdb_api.hpp)
set(UNIT_TEST_VERSION_510_SRCS test/unit/unit_tests_version_510.cpp)
if(OPEN_FOR_IDE)
add_library(fdb_c_performance_test OBJECT test/performance_test.c test/test.h)
add_library(fdb_c_ryw_benchmark OBJECT test/ryw_benchmark.c test/test.h)
@ -85,6 +87,7 @@ if(NOT WIN32)
add_library(mako OBJECT ${MAKO_SRCS})
add_library(fdb_c_setup_tests OBJECT test/unit/setup_tests.cpp)
add_library(fdb_c_unit_tests OBJECT ${UNIT_TEST_SRCS})
add_library(fdb_c_unit_tests_version_510 OBJECT ${UNIT_TEST_VERSION_510_SRCS})
else()
add_executable(fdb_c_performance_test test/performance_test.c test/test.h)
add_executable(fdb_c_ryw_benchmark test/ryw_benchmark.c test/test.h)
@ -92,6 +95,7 @@ if(NOT WIN32)
add_executable(mako ${MAKO_SRCS})
add_executable(fdb_c_setup_tests test/unit/setup_tests.cpp)
add_executable(fdb_c_unit_tests ${UNIT_TEST_SRCS})
add_executable(fdb_c_unit_tests_version_510 ${UNIT_TEST_VERSION_510_SRCS})
strip_debug_symbols(fdb_c_performance_test)
strip_debug_symbols(fdb_c_ryw_benchmark)
strip_debug_symbols(fdb_c_txn_size_test)
@ -104,8 +108,10 @@ if(NOT WIN32)
add_dependencies(fdb_c_unit_tests doctest)
target_include_directories(fdb_c_setup_tests PUBLIC ${DOCTEST_INCLUDE_DIR})
target_include_directories(fdb_c_unit_tests PUBLIC ${DOCTEST_INCLUDE_DIR})
target_include_directories(fdb_c_unit_tests_version_510 PUBLIC ${DOCTEST_INCLUDE_DIR})
target_link_libraries(fdb_c_setup_tests PRIVATE fdb_c Threads::Threads)
target_link_libraries(fdb_c_unit_tests PRIVATE fdb_c Threads::Threads)
target_link_libraries(fdb_c_unit_tests_version_510 PRIVATE fdb_c Threads::Threads)
# do not set RPATH for mako
set_property(TARGET mako PROPERTY SKIP_BUILD_RPATH TRUE)
@ -135,6 +141,11 @@ if(NOT WIN32)
COMMAND $<TARGET_FILE:fdb_c_unit_tests>
@CLUSTER_FILE@
fdb)
add_fdbclient_test(
NAME fdb_c_unit_tests_version_510
COMMAND $<TARGET_FILE:fdb_c_unit_tests_version_510>
@CLUSTER_FILE@
fdb)
add_fdbclient_test(
NAME fdb_c_external_client_unit_tests
COMMAND $<TARGET_FILE:fdb_c_unit_tests>
@ -158,6 +169,10 @@ set_target_properties(c_workloads PROPERTIES
LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/share/foundationdb")
target_link_libraries(c_workloads PUBLIC fdb_c)
if (NOT WIN32 AND NOT APPLE AND NOT OPEN_FOR_IDE)
target_link_options(c_workloads PRIVATE "LINKER:--version-script=${CMAKE_CURRENT_SOURCE_DIR}/external_workload.map,-z,nodelete")
endif()
# TODO: re-enable once the old vcxproj-based build system is removed.
#generate_export_header(fdb_c EXPORT_MACRO_NAME "DLLEXPORT"
# EXPORT_FILE_NAME ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/fdb_c_export.h)

View File

@ -0,0 +1,7 @@
{
global:
workloadFactory;
local:
*;
};

View File

@ -74,10 +74,41 @@ def write_unix_asm(asmfile, functions, prefix):
for f in functions:
asmfile.write("\n.globl %s%s\n" % (prefix, f))
asmfile.write("%s%s:\n" % (prefix, f))
# These assembly implementations of versioned fdb c api functions must have the following properties.
#
# 1. Don't require dynamic relocation.
#
# 2. Perform a tail-call to the function pointer that works for a
# function with any number of arguments. For example, since registers x0-x7 are used
# to pass arguments in the Arm calling convention we must not use x0-x7
# here.
#
# You can compile this example c program to get a rough idea of how to
# load the extern symbol and make a tail call.
#
# $ cat test.c
# typedef int (*function)();
# extern function f;
# int g() { return f(); }
# $ cc -S -O3 -fPIC test.c && grep -A 10 '^g:' test.[sS]
# g:
# .LFB0:
# .cfi_startproc
# adrp x0, :got:f
# ldr x0, [x0, #:got_lo12:f]
# ldr x0, [x0]
# br x0
# .cfi_endproc
# .LFE0:
# .size g, .-g
# .ident "GCC: (GNU) 8.3.1 20190311 (Red Hat 8.3.1-3)"
if platform == "linux-aarch64":
asmfile.write("\tldr x16, =fdb_api_ptr_%s\n" % (f))
asmfile.write("\tldr x16, [x16]\n")
asmfile.write("\tbr x16\n")
asmfile.write("\tadrp x8, :got:fdb_api_ptr_%s\n" % (f))
asmfile.write("\tldr x8, [x8, #:got_lo12:fdb_api_ptr_%s]\n" % (f))
asmfile.write("\tldr x8, [x8]\n")
asmfile.write("\tbr x8\n")
else:
asmfile.write(
"\tmov r11, qword ptr [%sfdb_api_ptr_%s@GOTPCREL+rip]\n" % (prefix, f))

View File

@ -219,7 +219,7 @@ GetRangeResult get_range(fdb::Transaction& tr,
for (int i = 0; i < out_count; ++i) {
std::string key((const char*)out_kv[i].key, out_kv[i].key_length);
std::string value((const char*)out_kv[i].value, out_kv[i].value_length);
results.push_back(std::make_pair(key, value));
results.emplace_back(key, value);
}
return GetRangeResult{ results, out_more != 0, 0 };
}
@ -263,13 +263,15 @@ TEST_CASE("fdb_future_set_callback") {
&context));
fdb_error_t err = wait_future(f1);
context.event.wait(); // Wait until callback is called
if (err) {
fdb::EmptyFuture f2 = tr.on_error(err);
fdb_check(wait_future(f2));
continue;
}
context.event.wait();
break;
}
}
@ -515,10 +517,10 @@ TEST_CASE("write system key") {
fdb::Transaction tr(db);
std::string syskey("\xff\x02");
fdb_check(tr.set_option(FDB_TR_OPTION_ACCESS_SYSTEM_KEYS, nullptr, 0));
tr.set(syskey, "bar");
while (1) {
fdb_check(tr.set_option(FDB_TR_OPTION_ACCESS_SYSTEM_KEYS, nullptr, 0));
tr.set(syskey, "bar");
fdb::EmptyFuture f1 = tr.commit();
fdb_error_t err = wait_future(f1);
@ -949,16 +951,25 @@ TEST_CASE("fdb_transaction_clear") {
}
TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_ADD") {
insert_data(db, create_data({ { "foo", "a" } }));
insert_data(db, create_data({ { "foo", "\x00" } }));
fdb::Transaction tr(db);
int8_t param = 1;
int potentialCommitCount = 0;
while (1) {
tr.atomic_op(key("foo"), (const uint8_t*)&param, sizeof(param), FDB_MUTATION_TYPE_ADD);
if (potentialCommitCount + 1 == 256) {
// Trying to commit again might overflow the one unsigned byte we're looking at
break;
}
++potentialCommitCount;
fdb::EmptyFuture f1 = tr.commit();
fdb_error_t err = wait_future(f1);
if (err) {
if (fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE_NOT_COMMITTED, err)) {
--potentialCommitCount;
}
fdb::EmptyFuture f2 = tr.on_error(err);
fdb_check(wait_future(f2));
continue;
@ -969,7 +980,8 @@ TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_ADD") {
auto value = get_value(key("foo"), /* snapshot */ false, {});
REQUIRE(value.has_value());
CHECK(value->size() == 1);
CHECK(value->data()[0] == 'b'); // incrementing 'a' results in 'b'
CHECK(uint8_t(value->data()[0]) > 0);
CHECK(uint8_t(value->data()[0]) <= potentialCommitCount);
}
TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_BIT_AND") {
@ -1139,14 +1151,19 @@ TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_BIT_XOR") {
fdb::Transaction tr(db);
char param[] = { 'a', 'd' };
int potentialCommitCount = 0;
while (1) {
tr.atomic_op(key("foo"), (const uint8_t*)"b", 1, FDB_MUTATION_TYPE_BIT_XOR);
tr.atomic_op(key("bar"), (const uint8_t*)param, 2, FDB_MUTATION_TYPE_BIT_XOR);
tr.atomic_op(key("baz"), (const uint8_t*)"d", 1, FDB_MUTATION_TYPE_BIT_XOR);
++potentialCommitCount;
fdb::EmptyFuture f1 = tr.commit();
fdb_error_t err = wait_future(f1);
if (err) {
if (fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE_NOT_COMMITTED, err)) {
--potentialCommitCount;
}
fdb::EmptyFuture f2 = tr.on_error(err);
fdb_check(wait_future(f2));
continue;
@ -1154,6 +1171,11 @@ TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_BIT_XOR") {
break;
}
if (potentialCommitCount != 1) {
MESSAGE("Transaction may not have committed exactly once. Suppressing assertions");
return;
}
auto value = get_value(key("foo"), /* snapshot */ false, {});
REQUIRE(value.has_value());
CHECK(value->size() == 1);
@ -1204,13 +1226,18 @@ TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_APPEND_IF_FITS") {
insert_data(db, create_data({ { "foo", "f" } }));
fdb::Transaction tr(db);
int potentialCommitCount = 0;
while (1) {
tr.atomic_op(key("foo"), (const uint8_t*)"db", 2, FDB_MUTATION_TYPE_APPEND_IF_FITS);
tr.atomic_op(key("bar"), (const uint8_t*)"foundation", 10, FDB_MUTATION_TYPE_APPEND_IF_FITS);
++potentialCommitCount;
fdb::EmptyFuture f1 = tr.commit();
fdb_error_t err = wait_future(f1);
if (err) {
if (fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE_NOT_COMMITTED, err)) {
--potentialCommitCount;
}
fdb::EmptyFuture f2 = tr.on_error(err);
fdb_check(wait_future(f2));
continue;
@ -1218,13 +1245,18 @@ TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_APPEND_IF_FITS") {
break;
}
auto value = get_value(key("foo"), /* snapshot */ false, {});
REQUIRE(value.has_value());
CHECK(value->compare("fdb") == 0);
auto value_foo = get_value(key("foo"), /* snapshot */ false, {});
REQUIRE(value_foo.has_value());
value = get_value(key("bar"), /* snapshot */ false, {});
REQUIRE(value.has_value());
CHECK(value->compare("foundation") == 0);
auto value_bar = get_value(key("bar"), /* snapshot */ false, {});
REQUIRE(value_bar.has_value());
if (potentialCommitCount != 1) {
MESSAGE("Transaction may not have committed exactly once. Suppressing assertions");
} else {
CHECK(value_foo.value() == "fdb");
CHECK(value_bar.value() == "foundation");
}
}
TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_MAX") {
@ -1576,7 +1608,7 @@ TEST_CASE("fdb_transaction_watch max watches") {
fdb_check(f1.set_callback(
+[](FDBFuture* f, void* param) {
fdb_error_t err = fdb_future_get_error(f);
if (err != 1101) { // operation_cancelled
if (err != /*operation_cancelled*/ 1101 && !fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE, err)) {
CHECK(err == 1032); // too_many_watches
}
auto* event = static_cast<std::shared_ptr<FdbEvent>*>(param);
@ -1587,7 +1619,7 @@ TEST_CASE("fdb_transaction_watch max watches") {
fdb_check(f2.set_callback(
+[](FDBFuture* f, void* param) {
fdb_error_t err = fdb_future_get_error(f);
if (err != 1101) { // operation_cancelled
if (err != /*operation_cancelled*/ 1101 && !fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE, err)) {
CHECK(err == 1032); // too_many_watches
}
auto* event = static_cast<std::shared_ptr<FdbEvent>*>(param);
@ -1598,7 +1630,7 @@ TEST_CASE("fdb_transaction_watch max watches") {
fdb_check(f3.set_callback(
+[](FDBFuture* f, void* param) {
fdb_error_t err = fdb_future_get_error(f);
if (err != 1101) { // operation_cancelled
if (err != /*operation_cancelled*/ 1101 && !fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE, err)) {
CHECK(err == 1032); // too_many_watches
}
auto* event = static_cast<std::shared_ptr<FdbEvent>*>(param);
@ -1609,7 +1641,7 @@ TEST_CASE("fdb_transaction_watch max watches") {
fdb_check(f4.set_callback(
+[](FDBFuture* f, void* param) {
fdb_error_t err = fdb_future_get_error(f);
if (err != 1101) { // operation_cancelled
if (err != /*operation_cancelled*/ 1101 && !fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE, err)) {
CHECK(err == 1032); // too_many_watches
}
auto* event = static_cast<std::shared_ptr<FdbEvent>*>(param);
@ -1671,7 +1703,7 @@ TEST_CASE("fdb_transaction_cancel") {
// ... until the transaction has been reset.
tr.reset();
fdb::ValueFuture f2 = tr.get("foo", /* snapshot */ false);
fdb_check(wait_future(f2));
CHECK(wait_future(f2) != 1025); // transaction_cancelled
}
TEST_CASE("fdb_transaction_add_conflict_range") {
@ -2146,22 +2178,29 @@ TEST_CASE("monitor_network_busyness") {
}
int main(int argc, char** argv) {
if (argc != 3 && argc != 4) {
if (argc < 3) {
std::cout << "Unit tests for the FoundationDB C API.\n"
<< "Usage: fdb_c_unit_tests /path/to/cluster_file key_prefix [externalClient]" << std::endl;
<< "Usage: fdb_c_unit_tests /path/to/cluster_file key_prefix [externalClient] [doctest args]"
<< std::endl;
return 1;
}
fdb_check(fdb_select_api_version(710));
if (argc == 4) {
if (argc >= 4) {
std::string externalClientLibrary = argv[3];
if (externalClientLibrary.substr(0, 2) != "--") {
fdb_check(fdb_network_set_option(
FDBNetworkOption::FDB_NET_OPTION_DISABLE_LOCAL_CLIENT, reinterpret_cast<const uint8_t*>(""), 0));
fdb_check(fdb_network_set_option(FDBNetworkOption::FDB_NET_OPTION_EXTERNAL_CLIENT_LIBRARY,
reinterpret_cast<const uint8_t*>(externalClientLibrary.c_str()),
externalClientLibrary.size()));
}
}
/* fdb_check(fdb_network_set_option( */
/* FDBNetworkOption::FDB_NET_OPTION_CLIENT_BUGGIFY_ENABLE, reinterpret_cast<const uint8_t*>(""), 0)); */
doctest::Context context;
context.applyCommandLine(argc, argv);
fdb_check(fdb_setup_network());
std::thread network_thread{ &fdb_run_network };

View File

@ -0,0 +1,118 @@
/*
* unit_tests_header_510.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Unit tests for the FoundationDB C API, at api header version 510
#include "fdb_c_options.g.h"
#include <thread>
#define FDB_API_VERSION 510
static_assert(FDB_API_VERSION == 510, "Don't change this! This test intentionally tests an old api header version");
#include <foundationdb/fdb_c.h>
#define DOCTEST_CONFIG_IMPLEMENT
#include "doctest.h"
#include "flow/config.h"
// Abort the process with a readable diagnostic when an fdb C API call fails.
// Used instead of doctest assertions for calls made outside test bodies.
void fdb_check(fdb_error_t e) {
	if (!e)
		return;
	std::cerr << fdb_get_error(e) << std::endl;
	std::abort();
}
std::string clusterFilePath;
std::string prefix;
FDBDatabase* db;
struct Future {
FDBFuture* f = nullptr;
Future() = default;
explicit Future(FDBFuture* f) : f(f) {}
~Future() {
if (f)
fdb_future_destroy(f);
}
};
struct Transaction {
FDBTransaction* tr = nullptr;
Transaction() = default;
explicit Transaction(FDBTransaction* tr) : tr(tr) {}
~Transaction() {
if (tr)
fdb_transaction_destroy(tr);
}
};
// TODO add more tests. The motivation for this test for now is to test the
// assembly code that handles emulating older api versions, but there's no
// reason why this shouldn't also test api version 510 specific behavior.
TEST_CASE("GRV") {
	// Smoke test: fetching a read version drives a round trip through the
	// api-version-510 entry points (the point of this test binary is to
	// exercise the versioned shim, not GRV semantics themselves).
	Transaction tr;
	fdb_check(fdb_database_create_transaction(db, &tr.tr));
	Future grv{ fdb_transaction_get_read_version(tr.tr) };
	// Block until the version is ready; any error aborts the process.
	fdb_check(fdb_future_block_until_ready(grv.f));
}
// Entry point: usage is `<binary> /path/to/cluster_file key_prefix
// [doctest args]`. Connects via the api-510 cluster/database handshake,
// runs the doctest suite, then tears the client network down.
int main(int argc, char** argv) {
	if (argc < 3) {
		std::cout << "Unit tests for the FoundationDB C API.\n"
		          << "Usage: " << argv[0] << " /path/to/cluster_file key_prefix [doctest args]" << std::endl;
		return 1;
	}
	fdb_check(fdb_select_api_version(FDB_API_VERSION));
	doctest::Context context;
	context.applyCommandLine(argc, argv);
	fdb_check(fdb_setup_network());
	std::thread network_thread{ &fdb_run_network };
	{
		// api version 510 requires the two-step cluster -> database setup;
		// later API versions replaced this with fdb_create_database.
		FDBCluster* cluster;
		Future clusterFuture{ fdb_create_cluster(argv[1]) };
		fdb_check(fdb_future_block_until_ready(clusterFuture.f));
		fdb_check(fdb_future_get_cluster(clusterFuture.f, &cluster));
		Future databaseFuture{ fdb_cluster_create_database(cluster, (const uint8_t*)"DB", 2) };
		fdb_check(fdb_future_block_until_ready(databaseFuture.f));
		fdb_check(fdb_future_get_database(databaseFuture.f, &db));
		fdb_cluster_destroy(cluster);
	}
	clusterFilePath = std::string(argv[1]);
	prefix = argv[2];
	int res = context.run();
	fdb_database_destroy(db);
	// Single teardown path. The previous version duplicated these three
	// statements inside an `if (context.shouldExit())` branch whose body was
	// identical to the fall-through code, so the branch was dead weight.
	fdb_check(fdb_stop_network());
	network_thread.join();
	return res;
}

View File

@ -138,6 +138,11 @@ else()
add_library(fdb_java SHARED fdbJNI.cpp)
add_library(java_workloads SHARED JavaWorkload.cpp)
endif()
if (NOT WIN32 AND NOT APPLE AND NOT OPEN_FOR_IDE)
target_link_options(java_workloads PRIVATE "LINKER:--version-script=${CMAKE_SOURCE_DIR}/bindings/c/external_workload.map,-z,nodelete")
endif()
target_include_directories(fdb_java PRIVATE ${JNI_INCLUDE_DIRS})
# libfdb_java.so is loaded by fdb-java.jar and doesn't need to depend on jvm shared libraries.
target_link_libraries(fdb_java PRIVATE fdb_c)

View File

@ -74,3 +74,12 @@ add_custom_command(OUTPUT ${package_file}
add_custom_target(python_package DEPENDS ${package_file})
add_dependencies(python_package python_binding)
add_dependencies(packages python_package)
if (NOT WIN32 AND NOT OPEN_FOR_IDE)
add_fdbclient_test(
NAME fdbcli_tests
COMMAND ${CMAKE_SOURCE_DIR}/bindings/python/tests/fdbcli_tests.py
${CMAKE_BINARY_DIR}/bin/fdbcli
@CLUSTER_FILE@
)
endif()

View File

@ -0,0 +1,93 @@
#!/usr/bin/env python3
import sys
import subprocess
import logging
import functools
def enable_logging(level=logging.ERROR):
    """Decorator factory: pass a configured Logger as the wrapped function's
    first positional argument.

    Args:
        level (logging.<level>, optional): logging level for the decorated function. Defaults to logging.ERROR.

    The logger is named after the decorated function. Because
    logging.getLogger() caches loggers by name, the stream handler is attached
    only if the logger has none yet; attaching one on every invocation (as the
    previous version did) duplicated every log line on repeated calls.
    """
    def func_decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # initialize (or re-fetch the cached) logger
            logger = logging.getLogger(func.__name__)
            logger.setLevel(level)
            if not logger.handlers:
                # set logging format on a single handler per logger name
                handler = logging.StreamHandler()
                handler_format = logging.Formatter(
                    '[%(asctime)s] - %(filename)s:%(lineno)d - %(levelname)s - %(name)s - %(message)s')
                handler.setFormatter(handler_format)
                handler.setLevel(level)
                logger.addHandler(handler)
            # pass the logger to the decorated function
            return func(logger, *args, **kwargs)
        return wrapper
    return func_decorator
def run_fdbcli_command(*args):
    """run the fdbcli statement: fdbcli --exec '<arg1> <arg2> ... <argN>'.

    Returns:
        string: Console output from fdbcli
    """
    # command_template is set up by the __main__ block; the joined args form
    # the single string handed to fdbcli's --exec option.
    full_command = command_template + [' '.join(args)]
    completed = subprocess.run(full_command, stdout=subprocess.PIPE)
    return completed.stdout.decode('utf-8').strip()
@enable_logging()
def advanceversion(logger):
    # current read version of the cluster
    before = int(run_fdbcli_command('getversion'))
    logger.debug("Read version: {}".format(before))
    # push the version far beyond the current one
    target = before * 10000
    logger.debug("Advanced to version: " + str(target))
    run_fdbcli_command('advanceversion', str(target))
    # the read version must now have caught up to the requested target
    after = int(run_fdbcli_command('getversion'))
    logger.debug("Read version: {}".format(after))
    assert after >= target
    # advancing backwards (to the old, smaller version) must be a no-op
    run_fdbcli_command('advanceversion', str(before))
    # confirm the version did not decrease
    final = int(run_fdbcli_command('getversion'))
    logger.debug("Read version: {}".format(final))
    assert final >= after
@enable_logging()
def maintenance(logger):
    # fdbcli prints this when no maintenance zone is active
    idle_message = 'No ongoing maintenance.'
    assert run_fdbcli_command('maintenance') == idle_message
    # put a made-up zone id under maintenance for 10 seconds
    run_fdbcli_command('maintenance', 'on', 'fake_zone_id', '10')
    # show current maintenance status
    status = run_fdbcli_command('maintenance')
    logger.debug("Maintenance status: " + status)
    fields = status.split(' ')
    # the fake zone id must be reported as under maintenance
    assert 'fake_zone_id' in fields
    logger.debug("Remaining time(seconds): " + fields[-2])
    # the countdown must lie strictly between 0 and the 10s requested
    assert 0 < int(fields[-2]) < 10
    # clear the maintenance and verify we are back to the idle state
    run_fdbcli_command('maintenance', 'off')
    assert run_fdbcli_command('maintenance') == idle_message
if __name__ == '__main__':
    # fdbcli_tests.py <path_to_fdbcli_binary> <path_to_fdb_cluster_file>
    assert len(sys.argv) == 3, "Please pass arguments: <path_to_fdbcli_binary> <path_to_fdb_cluster_file>"
    # shell command template (module-level global read by run_fdbcli_command);
    # each test appends its statement string after '--exec'
    command_template = [sys.argv[1], '-C', sys.argv[2], '--exec']
    # tests for fdbcli commands
    # assertions will fail if fdbcli does not work as expected
    advanceversion()
    maintenance()

View File

@ -971,7 +971,7 @@ For example, you can change a process type or update coordinators by manipulatin
#. ``\xff\xff/configuration/process/class_type/<address> := <class_type>`` Read/write. Reading keys in the range will retrieve processes' class types. Setting keys in the range will update processes' class types. The process matching ``<address>`` will be assigned to the given class type if the commit is successful. The valid class types are ``storage``, ``transaction``, ``resolution``, etc. A full list of class type can be found via ``fdbcli`` command ``help setclass``. Clearing keys is forbidden in the range. Instead, you can set the type as ``default``, which will clear the assigned class type if existing. For more details, see help text of ``fdbcli`` command ``setclass``.
#. ``\xff\xff/configuration/process/class_source/<address> := <class_source>`` Read-only. Reading keys in the range will retrieve processes' class source. The class source is one of ``command_line``, ``configure_auto``, ``set_class`` and ``invalid``, indicating the source that the process's class type comes from.
#. ``\xff\xff/configuration/coordinators/processes := <ip:port>,<ip:port>,...,<ip:port>`` Read/write. A single key, if read, will return a comma delimited string of coordinators's network addresses. Thus to provide a new set of cooridinators, set the key with a correct formatted string of new coordinators' network addresses. As there's always the need to have coordinators, clear on the key is forbidden and a transaction will fail with the ``special_keys_api_failure`` error if the clear is committed. For more details, see help text of ``fdbcli`` command ``coordinators``.
#. ``\xff\xff/configuration/coordinators/processes := <ip:port>,<ip:port>,...,<ip:port>`` Read/write. A single key, if read, will return a comma delimited string of coordinators' network addresses. Thus to provide a new set of coordinators, set the key with a correct formatted string of new coordinators' network addresses. As there's always the need to have coordinators, clear on the key is forbidden and a transaction will fail with the ``special_keys_api_failure`` error if the clear is committed. For more details, see help text of ``fdbcli`` command ``coordinators``.
#. ``\xff\xff/configuration/coordinators/cluster_description := <new_description>`` Read/write. A single key, if read, will return the cluster description. Thus modifying the key will update the cluster description. The new description needs to match ``[A-Za-z0-9_]+``, otherwise, the ``special_keys_api_failure`` error will be thrown. In addition, clear on the key is meaningless thus forbidden. For more details, see help text of ``fdbcli`` command ``coordinators``.
The ``<address>`` here is the network address of the corresponding process. Thus the general form is ``ip:port``.

View File

@ -121,6 +121,16 @@
"counter":0,
"roughness":0.0
},
"fetched_versions":{
"hz":0.0,
"counter":0,
"roughness":0.0
},
"fetches_from_logs":{
"hz":0.0,
"counter":0,
"roughness":0.0
},
"grv_latency_statistics":{ // GRV Latency metrics are grouped according to priority (currently batch or default).
"default":{
"count":0,
@ -604,6 +614,10 @@
"data_distribution_disabled_for_rebalance":true,
"data_distribution_disabled":true,
"active_primary_dc":"pv",
"bounce_impact":{
"can_clean_bounce":true,
"reason":""
},
"configuration":{
"log_anti_quorum":0,
"log_replicas":2,
@ -668,6 +682,16 @@
"ssd-rocksdb-experimental",
"memory"
]},
"tss_count":1,
"tss_storage_engine":{
"$enum":[
"ssd",
"ssd-1",
"ssd-2",
"ssd-redwood-experimental",
"ssd-rocksdb-experimental",
"memory"
]},
"coordinators_count":1,
"excluded_servers":[
{

View File

@ -3,16 +3,29 @@ Release Notes
#############
6.3.14
======
* Fixed fdbbackup start command that automatically configures database with backup workers to only do so when using partitioned logs. `(PR #4863) <https://github.com/apple/foundationdb/pull/4863>`_
* Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) <https://github.com/apple/foundationdb/pull/4770>`_
* Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) <https://github.com/apple/foundationdb/pull/4770>`_
* Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) <https://github.com/apple/foundationdb/pull/4770>`_
* Added the ``bypass_unreadable`` transaction option which allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. `(PR #4774) <https://github.com/apple/foundationdb/pull/4774>`_
* Fix several packaging issues. The osx package should now install successfully, and the structure of the RPM and DEB packages should match that of 6.2. `(PR #4810) <https://github.com/apple/foundationdb/pull/4810>`_
* Fix an accounting error that could potentially result in inaccuracies in priority busyness metrics. `(PR #4824) <https://github.com/apple/foundationdb/pull/4824>`_
6.3.13
======
* Added ``commit_batching_window_size`` to the proxy roles section of status to record statistics about commit batching window size on each proxy. `(PR #4736) <https://github.com/apple/foundationdb/pull/4736>`_
* The multi-version client now requires at most two client connections with version 6.2 or larger, regardless of how many external clients are configured. Clients older than 6.2 will continue to create an additional connection each. `(PR #4667) <https://github.com/apple/foundationdb/pull/4667>`_
* Fix an accounting error that could potentially result in inaccuracies in priority busyness metrics. `(PR #4824) <https://github.com/apple/foundationdb/pull/4824>`_
6.3.12
======
* Change the default for --knob_tls_server_handshake_threads to 64. The previous was 1000. This avoids starting 1000 threads by default, but may adversely affect recovery time for large clusters using tls. Users with large tls clusters should consider explicitly setting this knob in their foundationdb.conf file. `(PR #4421) <https://github.com/apple/foundationdb/pull/4421>`_
* Fix accounting error that could cause commits to incorrectly fail with ``proxy_memory_limit_exceeded``. `(PR #4526) <https://github.com/apple/foundationdb/pull/4526>`_
* As an optimization, partial restore using target key ranges now filters backup log data prior to loading it into the database. `(PR #4554) <https://github.com/apple/foundationdb/pull/4554>`_
* Fix fault tolerance calculation when there are no tLogs in LogSet. `(PR #4454) <https://github.com/apple/foundationdb/pull/4454>`_
* Change client's ``iteration_progression`` size defaults from 256 to 4096 bytes for better performance. `(PR #4416) <https://github.com/apple/foundationdb/pull/4416>`_
* Add the ability to instrument java driver actions, such as ``FDBTransaction`` and ``RangeQuery``. `(PR #4385) <https://github.com/apple/foundationdb/pull/4385>`_
6.3.11
======

View File

@ -31,7 +31,9 @@ Fixes
Status
------
* Added ``commit_batching_window_size`` to the proxy roles section of status to record statistics about commit batching window size on each proxy. `(PR #4735) <https://github.com/apple/foundationdb/pull/4735>`_
* Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) <https://github.com/apple/foundationdb/pull/4770>`_
* Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) <https://github.com/apple/foundationdb/pull/4770>`_
* Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) <https://github.com/apple/foundationdb/pull/4770>`_
Bindings
--------

View File

@ -3357,7 +3357,7 @@ int main(int argc, char* argv[]) {
deleteData = true;
break;
case OPT_MIN_CLEANUP_SECONDS:
knobs.push_back(std::make_pair("min_cleanup_seconds", args->OptionArg()));
knobs.emplace_back("min_cleanup_seconds", args->OptionArg());
break;
case OPT_FORCE:
forceAction = true;
@ -3452,7 +3452,7 @@ int main(int argc, char* argv[]) {
return FDB_EXIT_ERROR;
}
syn = syn.substr(7);
knobs.push_back(std::make_pair(syn, args->OptionArg()));
knobs.emplace_back(syn, args->OptionArg());
break;
}
case OPT_BACKUPKEYS:
@ -4212,7 +4212,7 @@ int main(int argc, char* argv[]) {
s = s.substr(LiteralStringRef("struct ").size());
#endif
typeNames.push_back(std::make_pair(s, i->first));
typeNames.emplace_back(s, i->first);
}
std::sort(typeNames.begin(), typeNames.end());
for (int i = 0; i < typeNames.size(); i++) {

View File

@ -35,6 +35,7 @@
#include "fdbclient/CoordinationInterface.h"
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/TagThrottle.h"
#include "fdbclient/Tuple.h"
#include "fdbclient/ThreadSafeTransaction.h"
#include "flow/DeterministicRandom.h"
@ -496,11 +497,15 @@ void initHelp() {
helpMap["configure"] = CommandHelp(
"configure [new] "
"<single|double|triple|three_data_hall|three_datacenter|ssd|memory|memory-radixtree-beta|proxies=<PROXIES>|"
"commit_proxies=<COMMIT_PROXIES>|grv_proxies=<GRV_PROXIES>|logs=<LOGS>|resolvers=<RESOLVERS>>*",
"commit_proxies=<COMMIT_PROXIES>|grv_proxies=<GRV_PROXIES>|logs=<LOGS>|resolvers=<RESOLVERS>>*|"
"perpetual_storage_wiggle=<WIGGLE_SPEED>",
"change the database configuration",
"The `new' option, if present, initializes a new database with the given configuration rather than changing "
"the configuration of an existing one. When used, both a redundancy mode and a storage engine must be "
"specified.\n\nRedundancy mode:\n single - one copy of the data. Not fault tolerant.\n double - two copies "
"specified.\n\ntss: when enabled, configures the testing storage server for the cluster instead."
"When used with new to set up tss for the first time, it requires both a count and a storage engine."
"To disable the testing storage server, run \"configure tss count=0\"\n\n"
"Redundancy mode:\n single - one copy of the data. Not fault tolerant.\n double - two copies "
"of data (survive one failure).\n triple - three copies of data (survive two failures).\n three_data_hall - "
"See the Admin Guide.\n three_datacenter - See the Admin Guide.\n\nStorage engine:\n ssd - B-Tree storage "
"engine optimized for solid state disks.\n memory - Durable in-memory storage engine for small "
@ -517,8 +522,11 @@ void initHelp() {
"1, or set to -1 which restores the number of GRV proxies to the default value.\n\nlogs=<LOGS>: Sets the "
"desired number of log servers in the cluster. Must be at least 1, or set to -1 which restores the number of "
"logs to the default value.\n\nresolvers=<RESOLVERS>: Sets the desired number of resolvers in the cluster. "
"Must be at least 1, or set to -1 which restores the number of resolvers to the default value.\n\nSee the "
"FoundationDB Administration Guide for more information.");
"Must be at least 1, or set to -1 which restores the number of resolvers to the default value.\n\n"
"perpetual_storage_wiggle=<WIGGLE_SPEED>: Set the value speed (a.k.a., the number of processes that the Data "
"Distributor should wiggle at a time). Currently, only 0 and 1 are supported. The value 0 means to disable the "
"perpetual storage wiggle.\n\n"
"See the FoundationDB Administration Guide for more information.");
helpMap["fileconfigure"] = CommandHelp(
"fileconfigure [new] <FILENAME>",
"change the database configuration from a file",
@ -1124,6 +1132,17 @@ void printStatus(StatusObjectReader statusObj,
if (statusObjConfig.get("log_routers", intVal))
outputString += format("\n Desired Log Routers - %d", intVal);
if (statusObjConfig.get("tss_count", intVal) && intVal > 0) {
int activeTss = 0;
if (statusObjCluster.has("active_tss_count")) {
statusObjCluster.get("active_tss_count", activeTss);
}
outputString += format("\n TSS - %d/%d", activeTss, intVal);
if (statusObjConfig.get("tss_storage_engine", strVal))
outputString += format("\n TSS Storage Engine - %s", strVal.c_str());
}
outputString += "\n Usable Regions - ";
if (statusObjConfig.get("usable_regions", intVal)) {
outputString += std::to_string(intVal);
@ -2766,6 +2785,7 @@ void configureGenerator(const char* text, const char* line, std::vector<std::str
"grv_proxies=",
"logs=",
"resolvers=",
"perpetual_storage_wiggle=",
nullptr };
arrayGenerator(text, line, opts, lc);
}
@ -3088,7 +3108,7 @@ struct CLIOptions {
return FDB_EXIT_ERROR;
}
syn = syn.substr(7);
knobs.push_back(std::make_pair(syn, args.OptionArg()));
knobs.emplace_back(syn, args.OptionArg());
break;
}
case OPT_DEBUG_TLS:

View File

@ -404,8 +404,14 @@ ACTOR Future<Void> readCommitted(Database cx,
state RangeResult values = wait(tr.getRange(begin, end, limits));
// When this buggify line is enabled, if there are more than 1 result then use half of the results
// Copy the data instead of messing with the results directly to avoid TSS issues.
if (values.size() > 1 && BUGGIFY) {
values.resize(values.arena(), values.size() / 2);
RangeResult copy;
// only copy first half of values into copy
for (int i = 0; i < values.size() / 2; i++) {
copy.push_back_deep(copy.arena(), values[i]);
}
values = copy;
values.more = true;
// Half of the time wait for this tr to expire so that the next read is at a different version
if (deterministicRandom()->random01() < 0.5)
@ -469,9 +475,15 @@ ACTOR Future<Void> readCommitted(Database cx,
state RangeResult rangevalue = wait(tr.getRange(nextKey, end, limits));
// When this buggify line is enabled, if there are more than 1 result then use half of the results
// When this buggify line is enabled, if there are more than 1 result then use half of the results.
// Copy the data instead of messing with the results directly to avoid TSS issues.
if (rangevalue.size() > 1 && BUGGIFY) {
rangevalue.resize(rangevalue.arena(), rangevalue.size() / 2);
RangeResult copy;
// only copy first half of rangevalue into copy
for (int i = 0; i < rangevalue.size() / 2; i++) {
copy.push_back_deep(copy.arena(), rangevalue[i]);
}
rangevalue = copy;
rangevalue.more = true;
// Half of the time wait for this tr to expire so that the next read is at a different version
if (deterministicRandom()->random01() < 0.5)

View File

@ -57,7 +57,8 @@ set(FDBCLIENT_SRCS
SpecialKeySpace.actor.h
ReadYourWrites.actor.cpp
ReadYourWrites.h
RestoreWorkerInterface.actor.h
RestoreInterface.cpp
RestoreInterface.h
RunTransaction.actor.h
RYWIterator.cpp
RYWIterator.h
@ -68,6 +69,7 @@ set(FDBCLIENT_SRCS
Status.h
StatusClient.actor.cpp
StatusClient.h
StorageServerInterface.cpp
StorageServerInterface.h
Subspace.cpp
Subspace.h

View File

@ -29,7 +29,6 @@
#include "fdbclient/FDBTypes.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/CommitTransaction.h"
#include "fdbserver/RatekeeperInterface.h"
#include "fdbclient/TagThrottle.h"
#include "fdbclient/GlobalConfig.h"
@ -288,9 +287,12 @@ struct GetKeyServerLocationsReply {
Arena arena;
std::vector<std::pair<KeyRangeRef, vector<StorageServerInterface>>> results;
// if any storage servers in results have a TSS pair, that mapping is in here
std::vector<std::pair<UID, StorageServerInterface>> resultsTssMapping;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, results, arena);
serializer(ar, results, resultsTssMapping, arena);
}
};

View File

@ -23,7 +23,7 @@
#pragma once
#include "fdbclient/FDBTypes.h"
#include "fdbserver/Knobs.h"
#include "fdbclient/Knobs.h"
// The versioned message has wire format : -1, version, messages
static const int32_t VERSION_HEADER = -1;
@ -95,7 +95,7 @@ struct MutationRef {
// Amplify atomicOp size to consider such extra workload.
// A good value for FASTRESTORE_ATOMICOP_WEIGHT needs experimental evaluations.
if (isAtomicOp()) {
return totalSize() * SERVER_KNOBS->FASTRESTORE_ATOMICOP_WEIGHT;
return totalSize() * CLIENT_KNOBS->FASTRESTORE_ATOMICOP_WEIGHT;
} else {
return totalSize();
}

View File

@ -33,12 +33,15 @@ const int MAX_CLUSTER_FILE_BYTES = 60000;
constexpr UID WLTOKEN_CLIENTLEADERREG_GETLEADER(-1, 2);
constexpr UID WLTOKEN_CLIENTLEADERREG_OPENDATABASE(-1, 3);
// the value of this endpoint should be stable and not change.
constexpr UID WLTOKEN_PROTOCOL_INFO(-1, 10);
constexpr UID WLTOKEN_CLIENTLEADERREG_DESCRIPTOR_MUTABLE(-1, 11);
// The coordinator interface as exposed to clients
// well known endpoints published to the client.
struct ClientLeaderRegInterface {
RequestStream<struct GetLeaderRequest> getLeader;
RequestStream<struct OpenDatabaseCoordRequest> openDatabase;
RequestStream<struct CheckDescriptorMutableRequest> checkDescriptorMutable;
ClientLeaderRegInterface() {}
ClientLeaderRegInterface(NetworkAddress remote);
@ -236,4 +239,28 @@ struct ProtocolInfoRequest {
}
};
// Returns true if the cluster descriptor may be modified.
struct CheckDescriptorMutableReply {
	constexpr static FileIdentifier file_identifier = 7784299;
	CheckDescriptorMutableReply() = default;
	explicit CheckDescriptorMutableReply(bool isMutable) : isMutable(isMutable) {}
	// True if the coordinator permits the cluster descriptor to be modified.
	// NOTE(review): left uninitialized by the default constructor — read only
	// after deserialization or explicit construction.
	bool isMutable;
	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, isMutable);
	}
};
// Allows client to check if allowed to change the cluster descriptor.
struct CheckDescriptorMutableRequest {
	constexpr static FileIdentifier file_identifier = 214729;
	// Completed by the coordinator with whether the descriptor may be changed.
	ReplyPromise<CheckDescriptorMutableReply> reply;
	CheckDescriptorMutableRequest() {}
	// The request carries no payload; only the reply promise is serialized.
	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, reply);
	}
};
#endif

View File

@ -31,7 +31,8 @@ void DatabaseConfiguration::resetInternal() {
commitProxyCount = grvProxyCount = resolverCount = desiredTLogCount = tLogWriteAntiQuorum = tLogReplicationFactor =
storageTeamSize = desiredLogRouterCount = -1;
tLogVersion = TLogVersion::DEFAULT;
tLogDataStoreType = storageServerStoreType = KeyValueStoreType::END;
tLogDataStoreType = storageServerStoreType = testingStorageServerStoreType = KeyValueStoreType::END;
desiredTSSCount = 0;
tLogSpillType = TLogSpillType::DEFAULT;
autoCommitProxyCount = CLIENT_KNOBS->DEFAULT_AUTO_COMMIT_PROXIES;
autoGrvProxyCount = CLIENT_KNOBS->DEFAULT_AUTO_GRV_PROXIES;
@ -43,6 +44,7 @@ void DatabaseConfiguration::resetInternal() {
remoteDesiredTLogCount = -1;
remoteTLogReplicationFactor = repopulateRegionAntiQuorum = 0;
backupWorkerEnabled = false;
perpetualStorageWiggleSpeed = 0;
}
void parse(int* i, ValueRef const& v) {
@ -194,9 +196,9 @@ bool DatabaseConfiguration::isValid() const {
getDesiredRemoteLogs() >= 1 && remoteTLogReplicationFactor >= 0 && repopulateRegionAntiQuorum >= 0 &&
repopulateRegionAntiQuorum <= 1 && usableRegions >= 1 && usableRegions <= 2 && regions.size() <= 2 &&
(usableRegions == 1 || regions.size() == 2) && (regions.size() == 0 || regions[0].priority >= 0) &&
(regions.size() == 0 ||
tLogPolicy->info() !=
"dcid^2 x zoneid^2 x 1"))) { // We cannot specify regions with three_datacenter replication
(regions.size() == 0 || tLogPolicy->info() != "dcid^2 x zoneid^2 x 1") &&
// We cannot specify regions with three_datacenter replication
(perpetualStorageWiggleSpeed == 0 || perpetualStorageWiggleSpeed == 1))) {
return false;
}
std::set<Key> dcIds;
@ -298,6 +300,25 @@ StatusObject DatabaseConfiguration::toJSON(bool noPolicies) const {
result["storage_engine"] = "custom";
}
if (desiredTSSCount > 0) {
result["tss_count"] = desiredTSSCount;
if (testingStorageServerStoreType == KeyValueStoreType::SSD_BTREE_V1) {
result["tss_storage_engine"] = "ssd-1";
} else if (testingStorageServerStoreType == KeyValueStoreType::SSD_BTREE_V2) {
result["tss_storage_engine"] = "ssd-2";
} else if (testingStorageServerStoreType == KeyValueStoreType::SSD_REDWOOD_V1) {
result["tss_storage_engine"] = "ssd-redwood-experimental";
} else if (testingStorageServerStoreType == KeyValueStoreType::SSD_ROCKSDB_V1) {
result["tss_storage_engine"] = "ssd-rocksdb-experimental";
} else if (testingStorageServerStoreType == KeyValueStoreType::MEMORY_RADIXTREE) {
result["tss_storage_engine"] = "memory-radixtree-beta";
} else if (testingStorageServerStoreType == KeyValueStoreType::MEMORY) {
result["tss_storage_engine"] = "memory-2";
} else {
result["tss_storage_engine"] = "custom";
}
}
result["log_spill"] = (int)tLogSpillType;
if (remoteTLogReplicationFactor == 1) {
@ -352,7 +373,7 @@ StatusObject DatabaseConfiguration::toJSON(bool noPolicies) const {
}
result["backup_worker_enabled"] = (int32_t)backupWorkerEnabled;
result["perpetual_storage_wiggle"] = perpetualStorageWiggleSpeed;
return result;
}
@ -448,6 +469,8 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) {
}
} else if (ck == LiteralStringRef("storage_replicas")) {
parse(&storageTeamSize, value);
} else if (ck == LiteralStringRef("tss_count")) {
parse(&desiredTSSCount, value);
} else if (ck == LiteralStringRef("log_version")) {
parse((&type), value);
type = std::max((int)TLogVersion::MIN_RECRUITABLE, type);
@ -470,6 +493,9 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) {
} else if (ck == LiteralStringRef("storage_engine")) {
parse((&type), value);
storageServerStoreType = (KeyValueStoreType::StoreType)type;
} else if (ck == LiteralStringRef("tss_storage_engine")) {
parse((&type), value);
testingStorageServerStoreType = (KeyValueStoreType::StoreType)type;
} else if (ck == LiteralStringRef("auto_commit_proxies")) {
parse(&autoCommitProxyCount, value);
} else if (ck == LiteralStringRef("auto_grv_proxies")) {
@ -499,6 +525,8 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) {
parse(&repopulateRegionAntiQuorum, value);
} else if (ck == LiteralStringRef("regions")) {
parse(&regions, value);
} else if (ck == LiteralStringRef("perpetual_storage_wiggle")) {
parse(&perpetualStorageWiggleSpeed, value);
} else {
return false;
}

View File

@ -225,6 +225,10 @@ struct DatabaseConfiguration {
int32_t storageTeamSize;
KeyValueStoreType storageServerStoreType;
// Testing StorageServers
int32_t desiredTSSCount;
KeyValueStoreType testingStorageServerStoreType;
// Remote TLogs
int32_t desiredLogRouterCount;
int32_t remoteDesiredTLogCount;
@ -239,6 +243,9 @@ struct DatabaseConfiguration {
int32_t repopulateRegionAntiQuorum;
std::vector<RegionInfo> regions;
// Perpetual Storage Setting
int32_t perpetualStorageWiggleSpeed;
// Excluded servers (no state should be here)
bool isExcludedServer(NetworkAddressList) const;
std::set<AddressExclusion> getExcludedServers() const;

View File

@ -273,6 +273,9 @@ public:
Reference<AsyncVar<Reference<ClusterConnectionFile>>> connectionFile;
AsyncTrigger proxiesChangeTrigger;
Future<Void> monitorProxiesInfoChange;
Future<Void> monitorTssInfoChange;
Future<Void> tssMismatchHandler;
PromiseStream<UID> tssMismatchStream;
Reference<CommitProxyInfo> commitProxies;
Reference<GrvProxyInfo> grvProxies;
bool proxyProvisional; // Provisional commit proxy and grv proxy are used at the same time.
@ -320,6 +323,11 @@ public:
std::map<UID, StorageServerInfo*> server_interf;
// map from ssid -> tss interface
std::unordered_map<UID, StorageServerInterface> tssMapping;
// map from tssid -> metrics for that tss pair
std::unordered_map<UID, Reference<TSSMetrics>> tssMetrics;
UID dbId;
bool internal; // Only contexts created through the C client and fdbcli are non-internal
@ -419,6 +427,14 @@ public:
static bool debugUseTags;
static const std::vector<std::string> debugTransactionTagChoices;
std::unordered_map<KeyRef, Reference<WatchMetadata>> watchMap;
// Adds or updates the specified (SS, TSS) pair in the TSS mapping (if not already present).
// Requests to the storage server will be duplicated to the TSS.
void addTssMapping(StorageServerInterface const& ssi, StorageServerInterface const& tssi);
// Removes the storage server and its TSS pair from the TSS mapping (if present).
// Requests to the storage server will no longer be duplicated to its pair TSS.
void removeTssMapping(StorageServerInterface const& ssi);
};
#endif

View File

@ -95,7 +95,7 @@ public:
if (itr != optionsIndexMap.end()) {
options.erase(itr->second);
}
options.push_back(std::make_pair(option, value));
options.emplace_back(option, value);
optionsIndexMap[option] = --options.end();
}

View File

@ -483,7 +483,9 @@ inline Key keyAfter(const KeyRef& key) {
Standalone<StringRef> r;
uint8_t* s = new (r.arena()) uint8_t[key.size() + 1];
if (key.size() > 0) {
memcpy(s, key.begin(), key.size());
}
s[key.size()] = 0;
((StringRef&)r) = StringRef(s, key.size() + 1);
return r;

View File

@ -23,6 +23,7 @@
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/RestoreInterface.h"
#include "fdbclient/Status.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/KeyBackedTypes.h"
@ -2705,13 +2706,17 @@ struct StartFullBackupTaskFunc : BackupTaskFuncBase {
wait(checkTaskVersion(cx, task, StartFullBackupTaskFunc::name, StartFullBackupTaskFunc::version));
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
state BackupConfig config(task);
state Future<Optional<bool>> partitionedLog;
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
Version startVersion = wait(tr->getReadVersion());
partitionedLog = config.partitionedLogEnabled().get(tr);
state Future<Version> startVersionFuture = tr->getReadVersion();
wait(success(partitionedLog) && success(startVersionFuture));
Params.beginVersion().set(task, startVersion);
Params.beginVersion().set(task, startVersionFuture.get());
break;
} catch (Error& e) {
wait(tr->onError(e));
@ -2721,14 +2726,15 @@ struct StartFullBackupTaskFunc : BackupTaskFuncBase {
// Check if backup worker is enabled
DatabaseConfiguration dbConfig = wait(getDatabaseConfiguration(cx));
state bool backupWorkerEnabled = dbConfig.backupWorkerEnabled;
if (!backupWorkerEnabled) {
if (!backupWorkerEnabled && partitionedLog.get().present() && partitionedLog.get().get()) {
// Change configuration only when we set to use partitioned logs and
// the flag was not set before.
wait(success(changeConfig(cx, "backup_worker_enabled:=1", true)));
backupWorkerEnabled = true;
}
// Set the "backupStartedKey" and wait for all backup worker started
tr->reset();
state BackupConfig config(task);
loop {
state Future<Void> watchFuture;
try {
@ -2738,7 +2744,7 @@ struct StartFullBackupTaskFunc : BackupTaskFuncBase {
state Future<Optional<Value>> started = tr->get(backupStartedKey);
state Future<Optional<Value>> taskStarted = tr->get(config.allWorkerStarted().key);
state Future<Optional<bool>> partitionedLog = config.partitionedLogEnabled().get(tr);
partitionedLog = config.partitionedLogEnabled().get(tr);
wait(success(started) && success(taskStarted) && success(partitionedLog));
if (!partitionedLog.get().present() || !partitionedLog.get().get()) {

View File

@ -34,16 +34,7 @@ const KeyRef fdbClientInfoTxnSizeLimit = LiteralStringRef("config/fdb_client_inf
const KeyRef transactionTagSampleRate = LiteralStringRef("config/transaction_tag_sample_rate");
const KeyRef transactionTagSampleCost = LiteralStringRef("config/transaction_tag_sample_cost");
// Default-constructs an empty global-configuration cache; lastUpdate == 0
// means no configuration history has been applied yet.
GlobalConfig::GlobalConfig() : lastUpdate(0) {}
void GlobalConfig::create(DatabaseContext* cx, Reference<AsyncVar<ClientDBInfo>> dbInfo) {
if (g_network->global(INetwork::enGlobalConfig) == nullptr) {
auto config = new GlobalConfig{};
config->cx = Database(cx);
g_network->setGlobal(INetwork::enGlobalConfig, config);
config->_updater = updater(config, dbInfo);
}
}
// Constructs a global-configuration cache bound to the given database, which
// it uses to run transactions; starts with no history applied (lastUpdate == 0).
GlobalConfig::GlobalConfig(Database& cx) : cx(cx), lastUpdate(0) {}
GlobalConfig& GlobalConfig::globalConfig() {
void* res = g_network->global(INetwork::enGlobalConfig);
@ -77,6 +68,14 @@ Future<Void> GlobalConfig::onInitialized() {
return initialized.getFuture();
}
// Returns a future that fires the next time the local copy of the global
// configuration changes (the updater triggers configChanged after it applies
// a batch of history updates).
Future<Void> GlobalConfig::onChange() {
	return configChanged.onTrigger();
}
// Registers fn to be invoked when the value stored under key changes in the
// local cache; fn receives the new value, or an empty optional when the key
// is erased. NOTE(review): unordered_map::emplace is a no-op if a callback is
// already registered for key — a second registration is silently dropped.
void GlobalConfig::trigger(KeyRef key, std::function<void(std::optional<std::any>)> fn) {
	callbacks.emplace(key, std::move(fn));
}
void GlobalConfig::insert(KeyRef key, ValueRef value) {
data.erase(key);
@ -89,6 +88,8 @@ void GlobalConfig::insert(KeyRef key, ValueRef value) {
any = StringRef(arena, t.getString(0).contents());
} else if (t.getType(0) == Tuple::ElementType::INT) {
any = t.getInt(0);
} else if (t.getType(0) == Tuple::ElementType::BOOL) {
any = t.getBool(0);
} else if (t.getType(0) == Tuple::ElementType::FLOAT) {
any = t.getFloat(0);
} else if (t.getType(0) == Tuple::ElementType::DOUBLE) {
@ -97,19 +98,26 @@ void GlobalConfig::insert(KeyRef key, ValueRef value) {
ASSERT(false);
}
data[stableKey] = makeReference<ConfigValue>(std::move(arena), std::move(any));
if (callbacks.find(stableKey) != callbacks.end()) {
callbacks[stableKey](data[stableKey]->value);
}
} catch (Error& e) {
TraceEvent("GlobalConfigTupleParseError").detail("What", e.what());
TraceEvent(SevWarn, "GlobalConfigTupleParseError").detail("What", e.what());
}
}
void GlobalConfig::erase(KeyRef key) {
data.erase(key);
// Removes a single key (and its value) from the local copy of the global
// configuration keyspace by delegating to the range-erase overload with the
// half-open range [key, keyAfter(key)).
void GlobalConfig::erase(Key key) {
	KeyRangeRef singleKeyRange(key, keyAfter(key));
	erase(singleKeyRange);
}
void GlobalConfig::erase(KeyRangeRef range) {
auto it = data.begin();
while (it != data.end()) {
if (range.contains(it->first)) {
if (callbacks.find(it->first) != callbacks.end()) {
callbacks[it->first](std::nullopt);
}
it = data.erase(it);
} else {
++it;
@ -134,7 +142,6 @@ ACTOR Future<Void> GlobalConfig::migrate(GlobalConfig* self) {
state Optional<Value> sampleRate = wait(tr->get(Key("\xff\x02/fdbClientInfo/client_txn_sample_rate/"_sr)));
state Optional<Value> sizeLimit = wait(tr->get(Key("\xff\x02/fdbClientInfo/client_txn_size_limit/"_sr)));
loop {
try {
tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
// The value doesn't matter too much, as long as the key is set.
@ -153,17 +160,21 @@ ACTOR Future<Void> GlobalConfig::migrate(GlobalConfig* self) {
}
wait(tr->commit());
return Void();
} catch (Error& e) {
throw;
}
// If multiple fdbserver processes are started at once, they will all
// attempt this migration at the same time, sometimes resulting in
// aborts due to conflicts. Purposefully avoid retrying, making this
// migration best-effort.
TraceEvent(SevInfo, "GlobalConfigMigrationError").detail("What", e.what());
}
return Void();
}
// Updates local copy of global configuration by reading the entire key-range
// from storage.
ACTOR Future<Void> GlobalConfig::refresh(GlobalConfig* self) {
self->data.clear();
self->erase(KeyRangeRef(""_sr, "\xff"_sr));
Transaction tr(self->cx);
RangeResult result = wait(tr.getRange(globalConfigDataKeys, CLIENT_KNOBS->TOO_MANY));
@ -176,7 +187,8 @@ ACTOR Future<Void> GlobalConfig::refresh(GlobalConfig* self) {
// Applies updates to the local copy of the global configuration when this
// process receives an updated history.
ACTOR Future<Void> GlobalConfig::updater(GlobalConfig* self, Reference<AsyncVar<ClientDBInfo>> dbInfo) {
ACTOR Future<Void> GlobalConfig::updater(GlobalConfig* self, const ClientDBInfo* dbInfo) {
wait(self->cx->onConnected());
wait(self->migrate(self));
wait(self->refresh(self));
@ -184,9 +196,9 @@ ACTOR Future<Void> GlobalConfig::updater(GlobalConfig* self, Reference<AsyncVar<
loop {
try {
wait(dbInfo->onChange());
wait(self->dbInfoChanged.onTrigger());
auto& history = dbInfo->get().history;
auto& history = dbInfo->history;
if (history.size() == 0) {
continue;
}
@ -196,8 +208,8 @@ ACTOR Future<Void> GlobalConfig::updater(GlobalConfig* self, Reference<AsyncVar<
// history updates or the protocol version changed, so it
// must re-read the entire configuration range.
wait(self->refresh(self));
if (dbInfo->get().history.size() > 0) {
self->lastUpdate = dbInfo->get().history.back().version;
if (dbInfo->history.size() > 0) {
self->lastUpdate = dbInfo->history.back().version;
}
} else {
// Apply history in order, from lowest version to highest
@ -222,6 +234,8 @@ ACTOR Future<Void> GlobalConfig::updater(GlobalConfig* self, Reference<AsyncVar<
self->lastUpdate = vh.version;
}
}
self->configChanged.trigger();
} catch (Error& e) {
throw;
}

View File

@ -62,10 +62,28 @@ struct ConfigValue : ReferenceCounted<ConfigValue> {
class GlobalConfig : NonCopyable {
public:
// Creates a GlobalConfig singleton, accessed by calling GlobalConfig().
// This function should only be called once by each process (however, it is
// idempotent and calling it multiple times will have no effect).
static void create(DatabaseContext* cx, Reference<AsyncVar<ClientDBInfo>> dbInfo);
// Creates a GlobalConfig singleton, accessed by calling
// GlobalConfig::globalConfig(). This function requires a database object
// to allow global configuration to run transactions on the database, and
// an AsyncVar object to watch for changes on. The ClientDBInfo pointer
// should point to a ClientDBInfo object which will contain the updated
// global configuration history when the given AsyncVar changes. This
// function should be called whenever the database object changes, in order
// to allow global configuration to run transactions on the latest
// database.
template <class T>
static void create(Database& cx, Reference<AsyncVar<T>> db, const ClientDBInfo* dbInfo) {
if (g_network->global(INetwork::enGlobalConfig) == nullptr) {
auto config = new GlobalConfig{ cx };
g_network->setGlobal(INetwork::enGlobalConfig, config);
config->_updater = updater(config, dbInfo);
// Bind changes in `db` to the `dbInfoChanged` AsyncTrigger.
forward(db, std::addressof(config->dbInfoChanged));
} else {
GlobalConfig* config = reinterpret_cast<GlobalConfig*>(g_network->global(INetwork::enGlobalConfig));
config->cx = cx;
}
}
// Returns a reference to the global GlobalConfig object. Clients should
// call this function whenever they need to read a value out of the global
@ -114,8 +132,18 @@ public:
// been created and is ready.
Future<Void> onInitialized();
// Triggers the returned future when any key-value pair in the global
// configuration changes.
Future<Void> onChange();
// Calls \ref fn when the value associated with \ref key is changed. \ref
// fn is passed the updated value for the key, or an empty optional if the
// key has been cleared. If the value is an allocated object, its memory
// remains in the control of the global configuration.
void trigger(KeyRef key, std::function<void(std::optional<std::any>)> fn);
private:
GlobalConfig();
GlobalConfig(Database& cx);
// The functions below only affect the local copy of the global
// configuration keyspace! To insert or remove values across all nodes you
@ -127,20 +155,23 @@ private:
void insert(KeyRef key, ValueRef value);
// Removes the given key (and associated value) from the local copy of the
// global configuration keyspace.
void erase(KeyRef key);
void erase(Key key);
// Removes the given key range (and associated values) from the local copy
// of the global configuration keyspace.
void erase(KeyRangeRef range);
ACTOR static Future<Void> migrate(GlobalConfig* self);
ACTOR static Future<Void> refresh(GlobalConfig* self);
ACTOR static Future<Void> updater(GlobalConfig* self, Reference<AsyncVar<ClientDBInfo>> dbInfo);
ACTOR static Future<Void> updater(GlobalConfig* self, const ClientDBInfo* dbInfo);
Database cx;
AsyncTrigger dbInfoChanged;
Future<Void> _updater;
Promise<Void> initialized;
AsyncTrigger configChanged;
std::unordered_map<StringRef, Reference<ConfigValue>> data;
Version lastUpdate;
std::unordered_map<KeyRef, std::function<void(std::optional<std::any>)>> callbacks;
};
#endif

View File

@ -173,6 +173,7 @@ void ClientKnobs::initialize(bool randomize) {
init( BACKUP_STATUS_DELAY, 40.0 );
init( BACKUP_STATUS_JITTER, 0.05 );
init( MIN_CLEANUP_SECONDS, 3600.0 );
init( FASTRESTORE_ATOMICOP_WEIGHT, 1 ); if( randomize && BUGGIFY ) { FASTRESTORE_ATOMICOP_WEIGHT = deterministicRandom()->random01() * 200 + 1; }
// Configuration
init( DEFAULT_AUTO_COMMIT_PROXIES, 3 );

View File

@ -168,6 +168,7 @@ public:
double BACKUP_STATUS_DELAY;
double BACKUP_STATUS_JITTER;
double MIN_CLEANUP_SECONDS;
int64_t FASTRESTORE_ATOMICOP_WEIGHT; // workload amplication factor for atomic op
// Configuration
int32_t DEFAULT_AUTO_COMMIT_PROXIES;

View File

@ -60,6 +60,13 @@ std::map<std::string, std::string> configForToken(std::string const& mode) {
return out;
}
if (mode == "tss") {
// Set temporary marker in config map to mark that this is a tss configuration and not a normal storage/log
// configuration. A bit of a hack but reuses the parsing code nicely.
out[p + "istss"] = "1";
return out;
}
if (mode == "locked") {
// Setting this key is interpreted as an instruction to use the normal version-stamp-based mechanism for locking
// the database.
@ -119,7 +126,7 @@ std::map<std::string, std::string> configForToken(std::string const& mode) {
if ((key == "logs" || key == "commit_proxies" || key == "grv_proxies" || key == "resolvers" ||
key == "remote_logs" || key == "log_routers" || key == "usable_regions" ||
key == "repopulate_anti_quorum") &&
key == "repopulate_anti_quorum" || key == "count") &&
isInteger(value)) {
out[p + key] = value;
}
@ -134,6 +141,14 @@ std::map<std::string, std::string> configForToken(std::string const& mode) {
BinaryWriter::toValue(regionObj, IncludeVersion(ProtocolVersion::withRegionConfiguration())).toString();
}
if (key == "perpetual_storage_wiggle" && isInteger(value)) {
int ppWiggle = atoi(value.c_str());
if (ppWiggle >= 2 || ppWiggle < 0) {
printf("Error: Only 0 and 1 are valid values of perpetual_storage_wiggle at present.\n");
return out;
}
out[p + key] = value;
}
return out;
}
@ -326,6 +341,35 @@ ConfigurationResult buildConfiguration(std::vector<StringRef> const& modeTokens,
serializeReplicationPolicy(policyWriter, logPolicy);
outConf[p + "log_replication_policy"] = policyWriter.toValue().toString();
}
if (outConf.count(p + "istss")) {
// redo config parameters to be tss config instead of normal config
// save param values from parsing as a normal config
bool isNew = outConf.count(p + "initialized");
Optional<std::string> count;
Optional<std::string> storageEngine;
if (outConf.count(p + "count")) {
count = Optional<std::string>(outConf[p + "count"]);
}
if (outConf.count(p + "storage_engine")) {
storageEngine = Optional<std::string>(outConf[p + "storage_engine"]);
}
// A new tss setup must have count + storage engine. An adjustment must have at least one.
if ((isNew && (!count.present() || !storageEngine.present())) ||
(!isNew && !count.present() && !storageEngine.present())) {
return ConfigurationResult::INCOMPLETE_CONFIGURATION;
}
// clear map and only reset tss parameters
outConf.clear();
if (count.present()) {
outConf[p + "tss_count"] = count.get();
}
if (storageEngine.present()) {
outConf[p + "tss_storage_engine"] = storageEngine.get();
}
}
return ConfigurationResult::SUCCESS;
}
@ -741,7 +785,7 @@ ConfigureAutoResult parseConfig(StatusObject const& status) {
}
if (processClass.classType() != ProcessClass::TesterClass) {
machine_processes[machineId].push_back(std::make_pair(addr, processClass));
machine_processes[machineId].emplace_back(addr, processClass);
processCount++;
}
}
@ -1105,6 +1149,7 @@ ACTOR Future<Optional<CoordinatorsResult>> changeQuorumChecker(Transaction* tr,
vector<Future<Optional<LeaderInfo>>> leaderServers;
ClientCoordinators coord(Reference<ClusterConnectionFile>(new ClusterConnectionFile(conn)));
leaderServers.reserve(coord.clientLeaderServers.size());
for (int i = 0; i < coord.clientLeaderServers.size(); i++)
leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader,
@ -1188,14 +1233,20 @@ ACTOR Future<CoordinatorsResult> changeQuorum(Database cx, Reference<IQuorumChan
TEST(old.clusterKeyName() != conn.clusterKeyName()); // Quorum change with new name
TEST(old.clusterKeyName() == conn.clusterKeyName()); // Quorum change with unchanged name
vector<Future<Optional<LeaderInfo>>> leaderServers;
ClientCoordinators coord(Reference<ClusterConnectionFile>(new ClusterConnectionFile(conn)));
state vector<Future<Optional<LeaderInfo>>> leaderServers;
state ClientCoordinators coord(Reference<ClusterConnectionFile>(new ClusterConnectionFile(conn)));
// check if allowed to modify the cluster descriptor
if (!change->getDesiredClusterKeyName().empty()) {
CheckDescriptorMutableReply mutabilityReply =
wait(coord.clientLeaderServers[0].checkDescriptorMutable.getReply(CheckDescriptorMutableRequest()));
if (!mutabilityReply.isMutable)
return CoordinatorsResult::BAD_DATABASE_STATE;
}
leaderServers.reserve(coord.clientLeaderServers.size());
for (int i = 0; i < coord.clientLeaderServers.size(); i++)
leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader,
GetLeaderRequest(coord.clusterKey, UID()),
TaskPriority::CoordinationReply));
choose {
when(wait(waitForAll(leaderServers))) {}
when(wait(delay(5.0))) { return CoordinatorsResult::COORDINATOR_UNREACHABLE; }
@ -1264,7 +1315,7 @@ struct AutoQuorumChange final : IQuorumChange {
vector<NetworkAddress> oldCoordinators,
Reference<ClusterConnectionFile> ccf,
CoordinatorsResult& err) override {
return getDesired(this, tr, oldCoordinators, ccf, &err);
return getDesired(Reference<AutoQuorumChange>::addRef(this), tr, oldCoordinators, ccf, &err);
}
ACTOR static Future<int> getRedundancy(AutoQuorumChange* self, Transaction* tr) {
@ -1327,7 +1378,7 @@ struct AutoQuorumChange final : IQuorumChange {
return true; // The status quo seems fine
}
ACTOR static Future<vector<NetworkAddress>> getDesired(AutoQuorumChange* self,
ACTOR static Future<vector<NetworkAddress>> getDesired(Reference<AutoQuorumChange> self,
Transaction* tr,
vector<NetworkAddress> oldCoordinators,
Reference<ClusterConnectionFile> ccf,
@ -1335,7 +1386,7 @@ struct AutoQuorumChange final : IQuorumChange {
state int desiredCount = self->desired;
if (desiredCount == -1) {
int redundancy = wait(getRedundancy(self, tr));
int redundancy = wait(getRedundancy(self.getPtr(), tr));
desiredCount = redundancy * 2 - 1;
}
@ -1364,7 +1415,7 @@ struct AutoQuorumChange final : IQuorumChange {
}
if (checkAcceptable) {
bool ok = wait(isAcceptable(self, tr, oldCoordinators, ccf, desiredCount, &excluded));
bool ok = wait(isAcceptable(self.getPtr(), tr, oldCoordinators, ccf, desiredCount, &excluded));
if (ok)
return oldCoordinators;
}

View File

@ -380,11 +380,14 @@ ClientCoordinators::ClientCoordinators(Key clusterKey, std::vector<NetworkAddres
ClientLeaderRegInterface::ClientLeaderRegInterface(NetworkAddress remote)
: getLeader(Endpoint({ remote }, WLTOKEN_CLIENTLEADERREG_GETLEADER)),
openDatabase(Endpoint({ remote }, WLTOKEN_CLIENTLEADERREG_OPENDATABASE)) {}
openDatabase(Endpoint({ remote }, WLTOKEN_CLIENTLEADERREG_OPENDATABASE)),
checkDescriptorMutable(Endpoint({ remote }, WLTOKEN_CLIENTLEADERREG_DESCRIPTOR_MUTABLE)) {}
// Constructs the interface for a locally hosted coordinator: publishes each
// client-facing request stream at its stable, well-known endpoint token so
// clients can reach it without prior discovery.
ClientLeaderRegInterface::ClientLeaderRegInterface(INetwork* local) {
	getLeader.makeWellKnownEndpoint(WLTOKEN_CLIENTLEADERREG_GETLEADER, TaskPriority::Coordination);
	openDatabase.makeWellKnownEndpoint(WLTOKEN_CLIENTLEADERREG_OPENDATABASE, TaskPriority::Coordination);
	checkDescriptorMutable.makeWellKnownEndpoint(WLTOKEN_CLIENTLEADERREG_DESCRIPTOR_MUTABLE,
	                                             TaskPriority::Coordination);
}
// Nominee is the worker among all workers that are considered as leader by a coordinator
@ -431,9 +434,9 @@ Optional<std::pair<LeaderInfo, bool>> getLeader(const vector<Optional<LeaderInfo
maskedNominees.reserve(nominees.size());
for (int i = 0; i < nominees.size(); i++) {
if (nominees[i].present()) {
maskedNominees.push_back(std::make_pair(
maskedNominees.emplace_back(
UID(nominees[i].get().changeID.first() & LeaderInfo::changeIDMask, nominees[i].get().changeID.second()),
i));
i);
}
}
@ -496,7 +499,8 @@ ACTOR Future<MonitorLeaderInfo> monitorLeaderOneGeneration(Reference<ClusterConn
if (leader.get().first.forward) {
TraceEvent("MonitorLeaderForwarding")
.detail("NewConnStr", leader.get().first.serializedInfo.toString())
.detail("OldConnStr", info.intermediateConnFile->getConnectionString().toString()).trackLatest("MonitorLeaderForwarding");
.detail("OldConnStr", info.intermediateConnFile->getConnectionString().toString())
.trackLatest("MonitorLeaderForwarding");
info.intermediateConnFile = makeReference<ClusterConnectionFile>(
connFile->getFilename(), ClusterConnectionString(leader.get().first.serializedInfo.toString()));
return info;
@ -582,7 +586,7 @@ OpenDatabaseRequest ClientData::getRequest() {
auto& entry = issueMap[it];
entry.count++;
if (entry.examples.size() < CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT) {
entry.examples.push_back(std::make_pair(ci.first, ci.second.traceLogGroup));
entry.examples.emplace_back(ci.first, ci.second.traceLogGroup);
}
}
if (ci.second.versions.size()) {
@ -593,19 +597,19 @@ OpenDatabaseRequest ClientData::getRequest() {
auto& entry = versionMap[it];
entry.count++;
if (entry.examples.size() < CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT) {
entry.examples.push_back(std::make_pair(ci.first, ci.second.traceLogGroup));
entry.examples.emplace_back(ci.first, ci.second.traceLogGroup);
}
}
auto& maxEntry = maxProtocolMap[maxProtocol];
maxEntry.count++;
if (maxEntry.examples.size() < CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT) {
maxEntry.examples.push_back(std::make_pair(ci.first, ci.second.traceLogGroup));
maxEntry.examples.emplace_back(ci.first, ci.second.traceLogGroup);
}
} else {
auto& entry = versionMap[ClientVersionRef()];
entry.count++;
if (entry.examples.size() < CLIENT_KNOBS->CLIENT_EXAMPLE_AMOUNT) {
entry.examples.push_back(std::make_pair(ci.first, ci.second.traceLogGroup));
entry.examples.emplace_back(ci.first, ci.second.traceLogGroup);
}
}
}

View File

@ -595,7 +595,7 @@ Reference<IDatabase> DLApi::createDatabase(const char* clusterFilePath) {
void DLApi::addNetworkThreadCompletionHook(void (*hook)(void*), void* hookParameter) {
MutexHolder holder(lock);
threadCompletionHooks.push_back(std::make_pair(hook, hookParameter));
threadCompletionHooks.emplace_back(hook, hookParameter);
}
// MultiVersionTransaction
@ -947,7 +947,7 @@ void MultiVersionDatabase::setOption(FDBDatabaseOptions::Option option, Optional
value.castTo<Standalone<StringRef>>());
}
dbState->options.push_back(std::make_pair(option, value.castTo<Standalone<StringRef>>()));
dbState->options.emplace_back(option, value.castTo<Standalone<StringRef>>());
if (dbState->db) {
dbState->db->setOption(option, value);
@ -1559,7 +1559,7 @@ void MultiVersionApi::setNetworkOptionInternal(FDBNetworkOptions::Option option,
runOnExternalClientsAllThreads(
[option, value](Reference<ClientInfo> client) { client->api->setNetworkOption(option, value); });
} else {
options.push_back(std::make_pair(option, value.castTo<Standalone<StringRef>>()));
options.emplace_back(option, value.castTo<Standalone<StringRef>>());
}
}
}

View File

@ -38,6 +38,7 @@
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/GlobalConfig.actor.h"
#include "fdbclient/JsonBuilder.h"
#include "fdbclient/KeyBackedTypes.h"
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/ManagementAPI.actor.h"
@ -121,6 +122,52 @@ NetworkOptions::NetworkOptions()
static const Key CLIENT_LATENCY_INFO_PREFIX = LiteralStringRef("client_latency/");
static const Key CLIENT_LATENCY_INFO_CTR_PREFIX = LiteralStringRef("client_latency_counter/");
// Registers (or refreshes) the pairing between a storage server (SS) and its
// testing storage server (TSS). Keeps tssMapping/tssMetrics in sync and points
// the queue model's mirrored-read endpoints at the TSS so reads can be
// duplicated and compared.
void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageServerInterface const& tssi) {
    auto result = tssMapping.find(ssi.id());
    // Update tss endpoint mapping if ss isn't in mapping, or the interface it mapped to changed
    if (result == tssMapping.end() ||
        result->second.getValue.getEndpoint().token.first() != tssi.getValue.getEndpoint().token.first()) {
        Reference<TSSMetrics> metrics;
        if (result == tssMapping.end()) {
            // new TSS pairing
            metrics = makeReference<TSSMetrics>();
            tssMetrics[tssi.id()] = metrics;
            tssMapping[ssi.id()] = tssi;
        } else {
            if (result->second.id() == tssi.id()) {
                // Same TSS, but its endpoint token changed: keep the accumulated metrics.
                metrics = tssMetrics[tssi.id()];
            } else {
                TEST(true); // SS now maps to new TSS! This will probably never happen in practice
                // Different TSS: discard the old TSS's metrics and start fresh.
                tssMetrics.erase(result->second.id());
                metrics = makeReference<TSSMetrics>();
                tssMetrics[tssi.id()] = metrics;
            }
            result->second = tssi;
        }

        // Route each read operation's SS endpoint token to the corresponding TSS
        // endpoint (getValue, getKey, getKeyValues, watchValue).
        queueModel.updateTssEndpoint(ssi.getValue.getEndpoint().token.first(),
                                     TSSEndpointData(tssi.id(), tssi.getValue.getEndpoint(), metrics));
        queueModel.updateTssEndpoint(ssi.getKey.getEndpoint().token.first(),
                                     TSSEndpointData(tssi.id(), tssi.getKey.getEndpoint(), metrics));
        queueModel.updateTssEndpoint(ssi.getKeyValues.getEndpoint().token.first(),
                                     TSSEndpointData(tssi.id(), tssi.getKeyValues.getEndpoint(), metrics));
        queueModel.updateTssEndpoint(ssi.watchValue.getEndpoint().token.first(),
                                     TSSEndpointData(tssi.id(), tssi.watchValue.getEndpoint(), metrics));
    }
}
// Removes the TSS pairing for the given storage server, if one exists: drops
// the SS -> TSS interface entry and its metrics, and deregisters every
// mirrored read endpoint from the queue model.
void DatabaseContext::removeTssMapping(StorageServerInterface const& ssi) {
    auto mappingIt = tssMapping.find(ssi.id());
    if (mappingIt == tssMapping.end()) {
        return; // no TSS pair recorded for this SS
    }

    tssMetrics.erase(ssi.id());
    tssMapping.erase(mappingIt);

    // Stop duplicating each read operation that was being mirrored to the TSS.
    queueModel.removeTssEndpoint(ssi.getValue.getEndpoint().token.first());
    queueModel.removeTssEndpoint(ssi.getKey.getEndpoint().token.first());
    queueModel.removeTssEndpoint(ssi.getKeyValues.getEndpoint().token.first());
    queueModel.removeTssEndpoint(ssi.watchValue.getEndpoint().token.first());
}
Reference<StorageServerInfo> StorageServerInfo::getInterface(DatabaseContext* cx,
StorageServerInterface const& ssi,
LocalityData const& locality) {
@ -133,6 +180,7 @@ Reference<StorageServerInfo> StorageServerInfo::getInterface(DatabaseContext* cx
// pointing to. This is technically correct, but is very unnatural. We may want to refactor load
// balance to take an AsyncVar<Reference<Interface>> so that it is notified when the interface
// changes.
it->second->interf = ssi;
} else {
it->second->notifyContextDestroyed();
@ -285,6 +333,13 @@ void delref(DatabaseContext* ptr) {
ptr->delref();
}
// Emits a single trace event named `name` for the given TSS, with one detail
// field per error code ("E<code>" -> count).
void traceTSSErrors(const char* name, UID tssId, const std::unordered_map<int, uint64_t>& errorsByCode) {
    TraceEvent ev(name, tssId);
    for (const auto& [errorCode, count] : errorsByCode) {
        ev.detail("E" + std::to_string(errorCode), count);
    }
}
ACTOR Future<Void> databaseLogger(DatabaseContext* cx) {
state double lastLogged = 0;
loop {
@ -327,6 +382,62 @@ ACTOR Future<Void> databaseLogger(DatabaseContext* cx) {
cx->mutationsPerCommit.clear();
cx->bytesPerCommit.clear();
for (const auto& it : cx->tssMetrics) {
// TODO could skip this tss if request counter is zero? would potentially complicate elapsed calculation
// though
if (it.second->mismatches.getIntervalDelta()) {
cx->tssMismatchStream.send(it.first);
}
// do error histograms as separate event
if (it.second->ssErrorsByCode.size()) {
traceTSSErrors("TSS_SSErrors", it.first, it.second->ssErrorsByCode);
}
if (it.second->tssErrorsByCode.size()) {
traceTSSErrors("TSS_TSSErrors", it.first, it.second->tssErrorsByCode);
}
TraceEvent tssEv("TSSClientMetrics", cx->dbId);
tssEv.detail("TSSID", it.first)
.detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged)
.detail("Internal", cx->internal);
it.second->cc.logToTraceEvent(tssEv);
tssEv.detail("MeanSSGetValueLatency", it.second->SSgetValueLatency.mean())
.detail("MedianSSGetValueLatency", it.second->SSgetValueLatency.median())
.detail("SSGetValueLatency90", it.second->SSgetValueLatency.percentile(0.90))
.detail("SSGetValueLatency99", it.second->SSgetValueLatency.percentile(0.99));
tssEv.detail("MeanTSSGetValueLatency", it.second->TSSgetValueLatency.mean())
.detail("MedianTSSGetValueLatency", it.second->TSSgetValueLatency.median())
.detail("TSSGetValueLatency90", it.second->TSSgetValueLatency.percentile(0.90))
.detail("TSSGetValueLatency99", it.second->TSSgetValueLatency.percentile(0.99));
tssEv.detail("MeanSSGetKeyLatency", it.second->SSgetKeyLatency.mean())
.detail("MedianSSGetKeyLatency", it.second->SSgetKeyLatency.median())
.detail("SSGetKeyLatency90", it.second->SSgetKeyLatency.percentile(0.90))
.detail("SSGetKeyLatency99", it.second->SSgetKeyLatency.percentile(0.99));
tssEv.detail("MeanTSSGetKeyLatency", it.second->TSSgetKeyLatency.mean())
.detail("MedianTSSGetKeyLatency", it.second->TSSgetKeyLatency.median())
.detail("TSSGetKeyLatency90", it.second->TSSgetKeyLatency.percentile(0.90))
.detail("TSSGetKeyLatency99", it.second->TSSgetKeyLatency.percentile(0.99));
tssEv.detail("MeanSSGetKeyValuesLatency", it.second->SSgetKeyLatency.mean())
.detail("MedianSSGetKeyValuesLatency", it.second->SSgetKeyLatency.median())
.detail("SSGetKeyValuesLatency90", it.second->SSgetKeyLatency.percentile(0.90))
.detail("SSGetKeyValuesLatency99", it.second->SSgetKeyLatency.percentile(0.99));
tssEv.detail("MeanTSSGetKeyValuesLatency", it.second->TSSgetKeyValuesLatency.mean())
.detail("MedianTSSGetKeyValuesLatency", it.second->TSSgetKeyValuesLatency.median())
.detail("TSSGetKeyValuesLatency90", it.second->TSSgetKeyValuesLatency.percentile(0.90))
.detail("TSSGetKeyValuesLatency99", it.second->TSSgetKeyValuesLatency.percentile(0.99));
it.second->clear();
}
lastLogged = now();
}
}
@ -711,6 +822,59 @@ ACTOR Future<Void> monitorCacheList(DatabaseContext* self) {
}
}
// Background actor: for each TSS id arriving on cx->tssMismatchStream (sent by
// databaseLogger when a TSS produced mismatched reads), remove that TSS from
// the cluster by clearing its server tag key and erasing its entry from the
// SS -> TSS mapping in the system keyspace.
ACTOR static Future<Void> handleTssMismatches(DatabaseContext* cx) {
    state Reference<ReadYourWritesTransaction> tr;
    state KeyBackedMap<UID, UID> tssMapDB = KeyBackedMap<UID, UID>(tssMappingKeys.begin);
    loop {
        state UID tssID = waitNext(cx->tssMismatchStream.getFuture());
        // find ss pair id so we can remove it from the mapping
        state UID tssPairID;
        bool found = false;
        for (const auto& it : cx->tssMapping) {
            if (it.second.id() == tssID) {
                tssPairID = it.first;
                found = true;
                break;
            }
        }
        if (found) {
            TraceEvent(SevWarnAlways, "TSS_KillMismatch").detail("TSSID", tssID.toString());
            TEST(true); // killing TSS because it got mismatch

            // TODO we could write something to the system keyspace and then have DD listen to that keyspace and then DD
            // do exactly this, so why not just cut out the middle man (or the middle system keys, as it were)
            tr = makeReference<ReadYourWritesTransaction>(Database(Reference<DatabaseContext>::addRef(cx)));
            state int tries = 0;
            loop {
                try {
                    tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
                    tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
                    // Removing the server tag is what actually kills the TSS process.
                    tr->clear(serverTagKeyFor(tssID));
                    tssMapDB.erase(tr, tssPairID);

                    wait(tr->commit());

                    break;
                } catch (Error& e) {
                    wait(tr->onError(e));
                }
                tries++;
                if (tries > 10) {
                    // Give up on trying to kill the tss, it'll get another mismatch or a human will investigate
                    // eventually
                    TraceEvent("TSS_KillMismatchGaveUp").detail("TSSID", tssID.toString());
                    break;
                }
            }
            // clear out txn so that the extra DatabaseContext ref gets decref'd and we can free cx
            tr = makeReference<ReadYourWritesTransaction>();
        } else {
            TEST(true); // Not killing TSS with mismatch because it's already gone
        }
    }
}
ACTOR static Future<HealthMetrics> getHealthMetricsActor(DatabaseContext* cx, bool detailed) {
if (now() - cx->healthMetricsLastUpdated < CLIENT_KNOBS->AGGREGATE_HEALTH_METRICS_MAX_STALENESS) {
if (detailed) {
@ -957,9 +1121,8 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionF
getValueSubmitted.init(LiteralStringRef("NativeAPI.GetValueSubmitted"));
getValueCompleted.init(LiteralStringRef("NativeAPI.GetValueCompleted"));
GlobalConfig::create(this, clientInfo);
monitorProxiesInfoChange = monitorProxiesChange(clientInfo, &proxiesChangeTrigger);
tssMismatchHandler = handleTssMismatches(this);
clientStatusUpdater.actor = clientStatusUpdateActor(this);
cacheListMonitor = monitorCacheList(this);
@ -1053,12 +1216,14 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<ClusterConnectionF
KeyRangeRef(LiteralStringRef("profiling/"), LiteralStringRef("profiling0"))
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
registerSpecialKeySpaceModule(
SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE,
SpecialKeySpace::MODULE::MANAGEMENT,
SpecialKeySpace::IMPLTYPE::READWRITE,
std::make_unique<MaintenanceImpl>(
KeyRangeRef(LiteralStringRef("maintenance/"), LiteralStringRef("maintenance0"))
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
registerSpecialKeySpaceModule(
SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE,
SpecialKeySpace::MODULE::MANAGEMENT,
SpecialKeySpace::IMPLTYPE::READWRITE,
std::make_unique<DataDistributionImpl>(
KeyRangeRef(LiteralStringRef("data_distribution/"), LiteralStringRef("data_distribution0"))
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)));
@ -1199,6 +1364,8 @@ Database DatabaseContext::create(Reference<AsyncVar<ClientDBInfo>> clientInfo,
DatabaseContext::~DatabaseContext() {
cacheListMonitor.cancel();
monitorProxiesInfoChange.cancel();
monitorTssInfoChange.cancel();
tssMismatchHandler.cancel();
for (auto it = server_interf.begin(); it != server_interf.end(); it = server_interf.erase(it))
it->second->notifyContextDestroyed();
ASSERT_ABORT(server_interf.empty());
@ -1553,7 +1720,9 @@ Database Database::createDatabase(Reference<ClusterConnectionFile> connFile,
/*switchable*/ true);
}
return Database(db);
auto database = Database(db);
GlobalConfig::create(database, clientInfo, std::addressof(clientInfo->get()));
return database;
}
Database Database::createDatabase(std::string connFileName,
@ -2015,6 +2184,29 @@ ACTOR Future<Optional<vector<StorageServerInterface>>> transactionalGetServerInt
return serverInterfaces;
}
// Reconciles the client's SS -> TSS pairings with a GetKeyServerLocations
// reply. A storage server appearing in reply.results with no entry in
// reply.resultsTssMapping is guaranteed not to have a TSS pair, so any stale
// local mapping for it is removed.
void updateTssMappings(Database cx, const GetKeyServerLocationsReply& reply) {
    // Index every storage server interface in the reply by its id.
    std::unordered_map<UID, const StorageServerInterface*> unmatched;
    for (const auto& [range, servers] : reply.results) {
        for (const auto& ssi : servers) {
            unmatched[ssi.id()] = &ssi;
        }
    }

    // Register (or refresh) the mapping for every SS that has a TSS pair.
    for (const auto& [ssId, tssi] : reply.resultsTssMapping) {
        auto found = unmatched.find(ssId);
        ASSERT(found != unmatched.end());
        cx->addTssMapping(*found->second, tssi);
        unmatched.erase(found);
    }

    // Whatever is left has no TSS pair; drop any stale local mapping for it.
    for (const auto& [ssId, ssi] : unmatched) {
        cx->removeTssMapping(*ssi);
    }
}
// If isBackward == true, returns the shard containing the key before 'key' (an infinitely long, inexpressible key).
// Otherwise returns the shard containing key
ACTOR Future<pair<KeyRange, Reference<LocationInfo>>> getKeyLocation_internal(Database cx,
@ -2047,6 +2239,7 @@ ACTOR Future<pair<KeyRange, Reference<LocationInfo>>> getKeyLocation_internal(Da
ASSERT(rep.results.size() == 1);
auto locationInfo = cx->setCachedLocation(rep.results[0].first, rep.results[0].second);
updateTssMappings(cx, rep);
return std::make_pair(KeyRange(rep.results[0].first, rep.arena), locationInfo);
}
}
@ -2110,6 +2303,7 @@ ACTOR Future<vector<pair<KeyRange, Reference<LocationInfo>>>> getKeyRangeLocatio
cx->setCachedLocation(rep.results[shard].first, rep.results[shard].second));
wait(yield());
}
updateTssMappings(cx, rep);
return results;
}
@ -2235,7 +2429,7 @@ ACTOR Future<Optional<Value>> getValue(Future<Version> version,
state GetValueReply reply;
try {
if (CLIENT_BUGGIFY) {
if (CLIENT_BUGGIFY_WITH_PROB(.01)) {
throw deterministicRandom()->randomChoice(
std::vector<Error>{ transaction_too_old(), future_version() });
}
@ -2345,6 +2539,11 @@ ACTOR Future<Key> getKey(Database cx, KeySelector k, Future<Version> version, Tr
"NativeAPI.getKey.Before"); //.detail("StartKey",
// k.getKey()).detail("Offset",k.offset).detail("OrEqual",k.orEqual);
++cx->transactionPhysicalReads;
GetKeyRequest req(
span.context, k, version.get(), cx->sampleReadTags() ? tags : Optional<TagSet>(), getKeyID);
req.arena.dependsOn(k.arena());
state GetKeyReply reply;
try {
choose {
@ -2353,11 +2552,7 @@ ACTOR Future<Key> getKey(Database cx, KeySelector k, Future<Version> version, Tr
wait(loadBalance(cx.getPtr(),
ssi.second,
&StorageServerInterface::getKey,
GetKeyRequest(span.context,
k,
version.get(),
cx->sampleReadTags() ? tags : Optional<TagSet>(),
getKeyID),
req,
TaskPriority::DefaultPromiseEndpoint,
false,
cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr))) {
@ -2718,6 +2913,9 @@ ACTOR Future<RangeResult> getExactRange(Database cx,
req.end = firstGreaterOrEqual(range.end);
req.spanContext = span.context;
// keep shard's arena around in case of async tss comparison
req.arena.dependsOn(locations[shard].first.arena());
transformRangeLimits(limits, reverse, req);
ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse);
@ -3034,6 +3232,9 @@ ACTOR Future<RangeResult> getRange(Database cx,
req.isFetchKeys = (info.taskID == TaskPriority::FetchKeys);
req.version = readVersion;
// In case of async tss comparison, also make req arena depend on begin, end, and/or shard's arena depending
// on which is used
bool dependOnShard = false;
if (reverse && (begin - 1).isDefinitelyLess(shard.begin) &&
(!begin.isFirstGreaterOrEqual() ||
begin.getKey() != shard.begin)) { // In this case we would be setting modifiedSelectors to true, but
@ -3041,14 +3242,23 @@ ACTOR Future<RangeResult> getRange(Database cx,
req.begin = firstGreaterOrEqual(shard.begin);
modifiedSelectors = true;
} else
req.arena.dependsOn(shard.arena());
dependOnShard = true;
} else {
req.begin = begin;
req.arena.dependsOn(begin.arena());
}
if (!reverse && end.isDefinitelyGreater(shard.end)) {
req.end = firstGreaterOrEqual(shard.end);
modifiedSelectors = true;
} else
if (!dependOnShard) {
req.arena.dependsOn(shard.arena());
}
} else {
req.end = end;
req.arena.dependsOn(end.arena());
}
transformRangeLimits(limits, reverse, req);
ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse);
@ -3078,7 +3288,7 @@ ACTOR Future<RangeResult> getRange(Database cx,
++cx->transactionPhysicalReads;
state GetKeyValuesReply rep;
try {
if (CLIENT_BUGGIFY) {
if (CLIENT_BUGGIFY_WITH_PROB(.01)) {
throw deterministicRandom()->randomChoice(
std::vector<Error>{ transaction_too_old(), future_version() });
}
@ -3133,10 +3343,17 @@ ACTOR Future<RangeResult> getRange(Database cx,
output.readThroughEnd = readThroughEnd;
if (BUGGIFY && limits.hasByteLimit() && output.size() > std::max(1, originalLimits.minRows)) {
// Copy instead of resizing because TSS maybe be using output's arena for comparison. This only
// happens in simulation so it's fine
RangeResult copy;
int newSize =
deterministicRandom()->randomInt(std::max(1, originalLimits.minRows), output.size());
for (int i = 0; i < newSize; i++) {
copy.push_back_deep(copy.arena(), output[i]);
}
output = copy;
output.more = true;
output.resize(
output.arena(),
deterministicRandom()->randomInt(std::max(1, originalLimits.minRows), output.size()));
getRangeFinished(cx,
trLogInfo,
startTime,
@ -5659,3 +5876,23 @@ Future<Void> DatabaseContext::createSnapshot(StringRef uid, StringRef snapshot_c
}
return createSnapshotActor(this, UID::fromString(uid_str), snapshot_command);
}
// Writes the system key \xff/conf/perpetual_storage_wiggle: "1" when enable is
// true, "0" otherwise. Retries via the standard onError loop until the commit
// succeeds; optionally sets LOCK_AWARE so the write works on a locked database.
ACTOR Future<Void> setPerpetualStorageWiggle(Database cx, bool enable, bool lock_aware) {
    state ReadYourWritesTransaction tr(cx);
    state Value wiggleValue = enable ? LiteralStringRef("1") : LiteralStringRef("0");
    loop {
        try {
            tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            if (lock_aware) {
                tr.setOption(FDBTransactionOptions::LOCK_AWARE);
            }
            tr.set(perpetualStorageWiggleKey, wiggleValue);
            wait(tr.commit());
            return Void();
        } catch (Error& e) {
            wait(tr.onError(e));
        }
    }
}

View File

@ -407,5 +407,10 @@ ACTOR Future<bool> checkSafeExclusions(Database cx, vector<AddressExclusion> exc
inline uint64_t getWriteOperationCost(uint64_t bytes) {
return bytes / std::max(1, CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR) + 1;
}
// Create a transaction to set the value of system key \xff/conf/perpetual_storage_wiggle. If enable == true, the value
// will be 1. Otherwise, the value will be 0.
ACTOR Future<Void> setPerpetualStorageWiggle(Database cx, bool enable, bool lock_aware = false);
#include "flow/unactorcompiler.h"
#endif

View File

@ -0,0 +1,56 @@
/*
* RestoreInterface.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/RestoreInterface.h"
#include "flow/serialize.h"
const KeyRef restoreRequestDoneKey = "\xff\x02/restoreRequestDone"_sr;
const KeyRef restoreRequestTriggerKey = "\xff\x02/restoreRequestTrigger"_sr;
const KeyRangeRef restoreRequestKeys("\xff\x02/restoreRequests/"_sr, "\xff\x02/restoreRequests0"_sr);
// Encode and decode restore request value
// Encodes the restoreRequestTriggerKey value: the request count followed by
// the requester's random UID, versioned with RestoreRequestTriggerValue.
Value restoreRequestTriggerValue(UID randomID, int numRequests) {
    BinaryWriter writer(IncludeVersion(ProtocolVersion::withRestoreRequestTriggerValue()));
    writer << numRequests;
    writer << randomID;
    return writer.toValue();
}
// Decodes the value written by restoreRequestTriggerValue(), returning the
// number of restore requests. The trailing UID is read and discarded.
int decodeRestoreRequestTriggerValue(ValueRef const& value) {
    BinaryReader reader(value, IncludeVersion());
    int numRequests;
    UID requesterID;
    reader >> numRequests;
    reader >> requesterID;
    return numRequests;
}
// Builds the key for one restore request: restoreRequestKeys.begin followed by
// the serialized index.
Key restoreRequestKeyFor(int index) {
    BinaryWriter writer(Unversioned());
    writer.serializeBytes(restoreRequestKeys.begin);
    writer << index;
    return writer.toValue();
}
// Serializes a RestoreRequest, versioned with RestoreRequestValue.
Value restoreRequestValue(RestoreRequest const& request) {
    BinaryWriter writer(IncludeVersion(ProtocolVersion::withRestoreRequestValue()));
    writer << request;
    return writer.toValue();
}

View File

@ -0,0 +1,99 @@
/*
* RestoreInterface.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "fdbclient/FDBTypes.h"
#include "fdbrpc/fdbrpc.h"
// Generic acknowledgement reply sent by restore roles.
struct RestoreCommonReply {
    constexpr static FileIdentifier file_identifier = 5808787;
    UID id; // unique ID of the server who sends the reply
    bool isDuplicated; // whether the request this answers was already processed

    RestoreCommonReply() = default;
    explicit RestoreCommonReply(UID id, bool isDuplicated = false) : id(id), isDuplicated(isDuplicated) {}

    // Human-readable summary for trace/debug output.
    std::string toString() const {
        std::stringstream ss;
        ss << "ServerNodeID:" << id.toString() << " isDuplicated:" << isDuplicated;
        return ss.str();
    }

    template <class Ar>
    void serialize(Ar& ar) {
        serializer(ar, id, isDuplicated);
    }
};
// A request to restore one key range of a backup, written into the system
// keyspace (see restoreRequestKeyFor / restoreRequestValue) and answered with
// a RestoreCommonReply.
struct RestoreRequest {
    constexpr static FileIdentifier file_identifier = 16035338;

    int index; // position of this request within the batch of restore requests
    Key tagName; // backup tag being restored
    Key url; // backup container URL
    Version targetVersion; // version to restore to
    KeyRange range; // key range to restore
    UID randomUid; // unique id of the requester

    // Every key in backup will first removePrefix and then addPrefix;
    // Simulation testing does not cover when both addPrefix and removePrefix exist yet.
    Key addPrefix;
    Key removePrefix;

    ReplyPromise<struct RestoreCommonReply> reply;

    RestoreRequest() = default;
    explicit RestoreRequest(const int index,
                            const Key& tagName,
                            const Key& url,
                            Version targetVersion,
                            const KeyRange& range,
                            const UID& randomUid,
                            Key& addPrefix,
                            Key removePrefix)
      : index(index), tagName(tagName), url(url), targetVersion(targetVersion), range(range), randomUid(randomUid),
        addPrefix(addPrefix), removePrefix(removePrefix) {}

    // To change this serialization, ProtocolVersion::RestoreRequestValue must be updated, and downgrades need to be
    // considered
    template <class Ar>
    void serialize(Ar& ar) {
        serializer(ar, index, tagName, url, targetVersion, range, randomUid, addPrefix, removePrefix, reply);
    }

    // Human-readable summary for trace/debug output.
    std::string toString() const {
        std::stringstream ss;
        ss << "index:" << std::to_string(index) << " tagName:" << tagName.contents().toString()
           << " url:" << url.contents().toString() << " targetVersion:" << std::to_string(targetVersion)
           << " range:" << range.toString() << " randomUid:" << randomUid.toString()
           << " addPrefix:" << addPrefix.toString() << " removePrefix:" << removePrefix.toString();
        return ss.str();
    }
};
// Well-known system keys used to trigger and acknowledge restore requests.
extern const KeyRef restoreRequestDoneKey;
extern const KeyRef restoreRequestTriggerKey;
extern const KeyRangeRef restoreRequestKeys;

Value restoreRequestTriggerValue(UID randomID, int numRequests);
// Fixed: was declared as decodeRequestRequestTriggerValue, which does not match
// the definition decodeRestoreRequestTriggerValue in RestoreInterface.cpp (the
// defined function was undeclared and the declared one undefined).
int decodeRestoreRequestTriggerValue(ValueRef const&);
Key restoreRequestKeyFor(int index);
Value restoreRequestValue(RestoreRequest const&);

View File

@ -144,6 +144,16 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"counter":0,
"roughness":0.0
},
"fetched_versions":{
"hz":0.0,
"counter":0,
"roughness":0.0
},
"fetches_from_logs":{
"hz":0.0,
"counter":0,
"roughness":0.0
},
"grv_latency_statistics":{
"default":{
"count":0,
@ -421,6 +431,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"seconds" : 1.0,
"versions" : 1000000
},
"active_tss_count":0,
"degraded_processes":0,
"database_available":true,
"database_lock_state": {
@ -648,6 +659,10 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"data_distribution_disabled_for_rebalance":true,
"data_distribution_disabled":true,
"active_primary_dc":"pv",
"bounce_impact":{
"can_clean_bounce":true,
"reason":""
},
"configuration":{
"log_anti_quorum":0,
"log_replicas":2,
@ -715,6 +730,19 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"memory-2",
"memory-radixtree-beta"
]},
"tss_count":1,
"tss_storage_engine":{
"$enum":[
"ssd",
"ssd-1",
"ssd-2",
"ssd-redwood-experimental",
"ssd-rocksdb-experimental",
"memory",
"memory-1",
"memory-2",
"memory-radixtree-beta"
]},
"coordinators_count":1,
"excluded_servers":[
{
@ -727,7 +755,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
"auto_logs":3,
"commit_proxies":5,
"grv_proxies":1,
"backup_worker_enabled":1
"backup_worker_enabled":1,
"perpetual_storage_wiggle":0
},
"data":{
"least_operating_space_bytes_log_server":0,
@ -787,7 +816,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema(
}
}
],
"least_operating_space_bytes_storage_server":0
"least_operating_space_bytes_storage_server":0,
"max_machine_failures_without_losing_data":0
},
"machines":{
"$map":{

View File

@ -1384,6 +1384,9 @@ Future<RangeResult> GlobalConfigImpl::getRange(ReadYourWritesTransaction* ryw, K
} else if (config->value.type() == typeid(int64_t)) {
result.push_back_deep(result.arena(),
KeyValueRef(prefixedKey, std::to_string(std::any_cast<int64_t>(config->value))));
} else if (config->value.type() == typeid(bool)) {
result.push_back_deep(result.arena(),
KeyValueRef(prefixedKey, std::to_string(std::any_cast<bool>(config->value))));
} else if (config->value.type() == typeid(float)) {
result.push_back_deep(result.arena(),
KeyValueRef(prefixedKey, std::to_string(std::any_cast<float>(config->value))));
@ -2058,9 +2061,20 @@ Future<Optional<std::string>> DataDistributionImpl::commit(ReadYourWritesTransac
try {
int mode = boost::lexical_cast<int>(iter->value().second.get().toString());
Value modeVal = BinaryWriter::toValue(mode, Unversioned());
if (mode == 0 || mode == 1)
if (mode == 0 || mode == 1) {
// Whenever configuration changes or DD related system keyspace is changed,
// actor must grab the moveKeysLockOwnerKey and update moveKeysLockWriteKey.
// This prevents concurrent write to the same system keyspace.
// When the owner of the DD related system keyspace changes, DD will reboot
BinaryWriter wrMyOwner(Unversioned());
wrMyOwner << dataDistributionModeLock;
ryw->getTransaction().set(moveKeysLockOwnerKey, wrMyOwner.toValue());
BinaryWriter wrLastWrite(Unversioned());
wrLastWrite << deterministicRandom()->randomUniqueID();
ryw->getTransaction().set(moveKeysLockWriteKey, wrLastWrite.toValue());
// set mode
ryw->getTransaction().set(dataDistributionModeKey, modeVal);
else
} else
msg = ManagementAPIError::toJsonString(false,
"datadistribution",
"Please set the value of the data_distribution/mode to "

View File

@ -0,0 +1,385 @@
/*
* StorageServerInterface.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/StorageServerInterface.h"
#include "flow/crc32c.h" // for crc32c_append, to checksum values in tss trace events
// Includes template specializations for all tss operations on storage server types.
// New StorageServerInterface reply types must be added here or it won't compile.
// if size + hex of checksum is shorter than value, record that instead of actual value. break-even point is 12
// characters
// Renders a value for trace output. Values longer than 12 bytes are replaced
// by "(<size>)<crc32c hex>", which is the break-even length where the
// size+checksum form becomes shorter than the value itself.
std::string traceChecksumValue(ValueRef s) {
    if (s.size() > 12) {
        return format("(%d)%08x", s.size(), crc32c_append(0, s.begin(), s.size()));
    }
    return s.toString();
}
// Compares a storage server's GetValue reply with its TSS pair's reply.
// Returns true iff both agree on presence and, when present, on the value.
// On mismatch, logs a TSSMismatchGetValue event with checksummed values.
template <>
bool TSS_doCompare(const GetValueRequest& req,
                   const GetValueReply& src,
                   const GetValueReply& tss,
                   Severity traceSeverity,
                   UID tssId) {
    // Short-circuit keeps us from calling get() on an absent Optional.
    bool matches = src.value.present() == tss.value.present() &&
                   (!src.value.present() || src.value.get() == tss.value.get());
    if (matches) {
        return true;
    }

    TraceEvent(traceSeverity, "TSSMismatchGetValue")
        .suppressFor(1.0)
        .detail("TSSID", tssId)
        .detail("Key", req.key.printable())
        .detail("Version", req.version)
        .detail("SSReply", src.value.present() ? traceChecksumValue(src.value.get()) : "missing")
        .detail("TSSReply", tss.value.present() ? traceChecksumValue(tss.value.get()) : "missing");
    return false;
}
// Compares a storage server's GetKey reply with its TSS pair's reply. Returns
// true when the replies are consistent; logs a TSSMismatchGetKey event and
// returns false otherwise.
template <>
bool TSS_doCompare(const GetKeyRequest& req,
                   const GetKeyReply& src,
                   const GetKeyReply& tss,
                   Severity traceSeverity,
                   UID tssId) {
    // This process is a bit complicated. Since the tss and ss can return different results if neighboring shards to
    // req.sel.key are currently being moved, We validate that the results are the same IF the returned key selectors
    // are final. Otherwise, we only mark the request as a mismatch if the difference between the two returned key
    // selectors could ONLY be because of different results from the storage engines. We can afford to only partially
    // check key selectors that start in a TSS shard and end in a non-TSS shard because the other read queries and the
    // consistency check will eventually catch a misbehaving storage engine.
    bool matches = true;
    if (src.sel.orEqual == tss.sel.orEqual && src.sel.offset == tss.sel.offset) {
        // full matching case
        if (src.sel.offset == 0 && src.sel.orEqual) {
            // found exact key, should be identical
            matches = src.sel.getKey() == tss.sel.getKey();
        }
        // if the query doesn't return the final key, there is an edge case where the ss and tss have different shard
        // boundaries, so they pass different shard boundary keys back for the same offset
    } else if (src.sel.getKey() == tss.sel.getKey()) {
        // There is one case with a positive offset where the shard boundary the incomplete query stopped at is the next
        // key in the shard that the complete query returned. This is not possible with a negative offset because the
        // shard boundary is exclusive backwards
        if (src.sel.offset == 0 && src.sel.orEqual && tss.sel.offset == 1 && !tss.sel.orEqual) {
            // case where ss was complete and tss was incomplete
        } else if (tss.sel.offset == 0 && tss.sel.orEqual && src.sel.offset == 1 && !src.sel.orEqual) {
            // case where tss was complete and ss was incomplete
        } else {
            matches = false;
        }
    } else {
        // ss/tss returned different keys, and different offsets and/or orEqual
        // here we just validate that ordering of the keys matches the ordering of the offsets
        bool tssKeyLarger = src.sel.getKey() < tss.sel.getKey();
        // the only case offsets are equal and orEqual aren't equal is the case with a negative offset,
        // where one response has <=0 with the actual result and the other has <0 with the shard upper boundary.
        // So whichever one has the actual result should have the lower key.
        bool tssOffsetLarger = (src.sel.offset == tss.sel.offset) ? tss.sel.orEqual : src.sel.offset < tss.sel.offset;
        matches = tssKeyLarger != tssOffsetLarger;
    }
    if (!matches) {
        // Log request, SS reply, and TSS reply selectors in "[=]key:offset" form.
        TraceEvent(traceSeverity, "TSSMismatchGetKey")
            .suppressFor(1.0)
            .detail("TSSID", tssId)
            .detail("KeySelector",
                    format("%s%s:%d", req.sel.orEqual ? "=" : "", req.sel.getKey().printable().c_str(), req.sel.offset))
            .detail("Version", req.version)
            .detail("SSReply",
                    format("%s%s:%d", src.sel.orEqual ? "=" : "", src.sel.getKey().printable().c_str(), src.sel.offset))
            .detail(
                "TSSReply",
                format("%s%s:%d", tss.sel.orEqual ? "=" : "", tss.sel.getKey().printable().c_str(), tss.sel.offset));
    }
    return matches;
}
// Compares the GetKeyValues (range read) replies of a storage server (src) and its
// TSS pair (tss). Any difference in the returned rows or the "more" flag is treated
// as a mismatch: it is logged as a TSSMismatchGetKeyValues trace event (with each
// row's value checksummed for readability) and false is returned.
template <>
bool TSS_doCompare(const GetKeyValuesRequest& req,
                   const GetKeyValuesReply& src,
                   const GetKeyValuesReply& tss,
                   Severity traceSeverity,
                   UID tssId) {
	if (src.more == tss.more && src.data == tss.data) {
		return true;
	}

	// Renders a reply as "(rowCount)[+]:" (the '+' marks more == true) followed by one
	// "key=checksummedValue" line per row.
	auto renderReply = [](const GetKeyValuesReply& reply) {
		std::string rendered = format("(%d)%s:\n", reply.data.size(), reply.more ? "+" : "");
		for (auto& kv : reply.data) {
			rendered += "\n" + kv.key.printable() + "=" + traceChecksumValue(kv.value);
		}
		return rendered;
	};

	TraceEvent(traceSeverity, "TSSMismatchGetKeyValues")
	    .suppressFor(1.0)
	    .detail("TSSID", tssId)
	    .detail(
	        "Begin",
	        format(
	            "%s%s:%d", req.begin.orEqual ? "=" : "", req.begin.getKey().printable().c_str(), req.begin.offset))
	    .detail("End",
	            format("%s%s:%d", req.end.orEqual ? "=" : "", req.end.getKey().printable().c_str(), req.end.offset))
	    .detail("Version", req.version)
	    .detail("Limit", req.limit)
	    .detail("LimitBytes", req.limitBytes)
	    .detail("SSReply", renderReply(src))
	    .detail("TSSReply", renderReply(tss));
	return false;
}
// Watches are duplicated to the TSS purely to generate comparable load, so the
// replies are never validated against each other.
template <>
bool TSS_doCompare(const WatchValueRequest& req,
const WatchValueReply& src,
const WatchValueReply& tss,
Severity traceSeverity,
UID tssId) {
// We duplicate watches just for load, no need to validate replies.
return true;
}
// no-op template specializations for metrics replies.
// These requests return statistics (metrics, split points, read-hot ranges) rather
// than stored data, so the SS and TSS replies are presumably expected to differ and
// are never treated as a mismatch.
template <>
bool TSS_doCompare(const WaitMetricsRequest& req,
const StorageMetrics& src,
const StorageMetrics& tss,
Severity traceSeverity,
UID tssId) {
return true;
}
template <>
bool TSS_doCompare(const SplitMetricsRequest& req,
const SplitMetricsReply& src,
const SplitMetricsReply& tss,
Severity traceSeverity,
UID tssId) {
return true;
}
template <>
bool TSS_doCompare(const ReadHotSubRangeRequest& req,
const ReadHotSubRangeReply& src,
const ReadHotSubRangeReply& tss,
Severity traceSeverity,
UID tssId) {
return true;
}
template <>
bool TSS_doCompare(const SplitRangeRequest& req,
const SplitRangeReply& src,
const SplitRangeReply& tss,
Severity traceSeverity,
UID tssId) {
return true;
}
// only record metrics for data reads
// Each specialization feeds the SS latency and the TSS latency of the same request
// into their respective samplers so the two latency distributions can be compared.
template <>
void TSSMetrics::recordLatency(const GetValueRequest& req, double ssLatency, double tssLatency) {
SSgetValueLatency.addSample(ssLatency);
TSSgetValueLatency.addSample(tssLatency);
}
template <>
void TSSMetrics::recordLatency(const GetKeyRequest& req, double ssLatency, double tssLatency) {
SSgetKeyLatency.addSample(ssLatency);
TSSgetKeyLatency.addSample(tssLatency);
}
template <>
void TSSMetrics::recordLatency(const GetKeyValuesRequest& req, double ssLatency, double tssLatency) {
SSgetKeyValuesLatency.addSample(ssLatency);
TSSgetKeyValuesLatency.addSample(tssLatency);
}
// No latency samples are recorded for watches or metrics-style requests; only the
// data-read specializations above collect latency.
template <>
void TSSMetrics::recordLatency(const WatchValueRequest& req, double ssLatency, double tssLatency) {}
template <>
void TSSMetrics::recordLatency(const WaitMetricsRequest& req, double ssLatency, double tssLatency) {}
template <>
void TSSMetrics::recordLatency(const SplitMetricsRequest& req, double ssLatency, double tssLatency) {}
template <>
void TSSMetrics::recordLatency(const ReadHotSubRangeRequest& req, double ssLatency, double tssLatency) {}
template <>
void TSSMetrics::recordLatency(const SplitRangeRequest& req, double ssLatency, double tssLatency) {}
// -------------------
// Unit test for the TSS_doCompare specializations above: exercises matching and
// mismatching replies for getValue, getKeyValues, and getKey, plus the value
// checksum helper. Mismatch cases use SevInfo so the test does not pollute logs
// with error-severity events.
TEST_CASE("/StorageServerInterface/TSSCompare/TestComparison") {
printf("testing tss comparisons\n");
// use std::string instead of string literals to avoid the deprecated StringRef(char*) constructor
std::string s_a = "a";
std::string s_b = "b";
std::string s_c = "c";
std::string s_d = "d";
std::string s_e = "e";
// test getValue
GetValueRequest gvReq;
gvReq.key = StringRef(s_a);
gvReq.version = 5;
UID tssId;
GetValueReply gvReplyMissing;
GetValueReply gvReplyA(Optional<Value>(StringRef(s_a)), false);
GetValueReply gvReplyB(Optional<Value>(StringRef(s_b)), false);
ASSERT(TSS_doCompare(gvReq, gvReplyMissing, gvReplyMissing, SevInfo, tssId));
ASSERT(TSS_doCompare(gvReq, gvReplyA, gvReplyA, SevInfo, tssId));
ASSERT(TSS_doCompare(gvReq, gvReplyB, gvReplyB, SevInfo, tssId));
ASSERT(!TSS_doCompare(gvReq, gvReplyMissing, gvReplyA, SevInfo, tssId));
ASSERT(!TSS_doCompare(gvReq, gvReplyA, gvReplyB, SevInfo, tssId));
// test GetKeyValues
Arena a; // for all of the refs. ASAN complains if this isn't done. Could also make them all standalone i guess
GetKeyValuesRequest gkvReq;
gkvReq.begin = firstGreaterOrEqual(StringRef(a, s_a));
gkvReq.end = firstGreaterOrEqual(StringRef(a, s_b));
gkvReq.version = 5;
gkvReq.limit = 100;
gkvReq.limitBytes = 1000;
GetKeyValuesReply gkvReplyEmpty;
GetKeyValuesReply gkvReplyOne;
KeyValueRef v;
v.key = StringRef(a, s_a);
v.value = StringRef(a, s_b);
gkvReplyOne.data.push_back_deep(gkvReplyOne.arena, v);
GetKeyValuesReply gkvReplyOneMore;
gkvReplyOneMore.data.push_back_deep(gkvReplyOneMore.arena, v);
gkvReplyOneMore.more = true;
ASSERT(TSS_doCompare(gkvReq, gkvReplyEmpty, gkvReplyEmpty, SevInfo, tssId));
ASSERT(TSS_doCompare(gkvReq, gkvReplyOne, gkvReplyOne, SevInfo, tssId));
ASSERT(TSS_doCompare(gkvReq, gkvReplyOneMore, gkvReplyOneMore, SevInfo, tssId));
ASSERT(!TSS_doCompare(gkvReq, gkvReplyEmpty, gkvReplyOne, SevInfo, tssId));
ASSERT(!TSS_doCompare(gkvReq, gkvReplyOne, gkvReplyOneMore, SevInfo, tssId));
// test GetKey
GetKeyRequest gkReq;
gkReq.sel = KeySelectorRef(StringRef(a, s_a), false, 1);
gkReq.version = 5;
GetKeyReply gkReplyA(KeySelectorRef(StringRef(a, s_a), false, 20), false);
GetKeyReply gkReplyB(KeySelectorRef(StringRef(a, s_b), false, 10), false);
GetKeyReply gkReplyC(KeySelectorRef(StringRef(a, s_c), true, 0), false);
GetKeyReply gkReplyD(KeySelectorRef(StringRef(a, s_d), false, -10), false);
GetKeyReply gkReplyE(KeySelectorRef(StringRef(a, s_e), false, -20), false);
// identical cases
ASSERT(TSS_doCompare(gkReq, gkReplyA, gkReplyA, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyB, gkReplyB, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyC, gkReplyC, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyD, gkReplyD, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyE, gkReplyE, SevInfo, tssId));
// relative offset cases
// (key and offset ordering agree in both directions, so none of these are mismatches)
ASSERT(TSS_doCompare(gkReq, gkReplyA, gkReplyB, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyB, gkReplyA, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyA, gkReplyC, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyC, gkReplyA, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyB, gkReplyC, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyC, gkReplyB, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyC, gkReplyD, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyD, gkReplyC, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyC, gkReplyE, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyE, gkReplyC, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyD, gkReplyE, SevInfo, tssId));
ASSERT(TSS_doCompare(gkReq, gkReplyE, gkReplyD, SevInfo, tssId));
// test same offset/orEqual wrong key
ASSERT(!TSS_doCompare(gkReq,
GetKeyReply(KeySelectorRef(StringRef(a, s_a), true, 0), false),
GetKeyReply(KeySelectorRef(StringRef(a, s_b), true, 0), false),
SevInfo,
tssId));
// this could be from different shard boundaries, so don't say it's a mismatch
ASSERT(TSS_doCompare(gkReq,
GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, 10), false),
GetKeyReply(KeySelectorRef(StringRef(a, s_b), false, 10), false),
SevInfo,
tssId));
// test offsets and key difference don't match
ASSERT(!TSS_doCompare(gkReq,
GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, 0), false),
GetKeyReply(KeySelectorRef(StringRef(a, s_b), false, 10), false),
SevInfo,
tssId));
ASSERT(!TSS_doCompare(gkReq,
GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, -10), false),
GetKeyReply(KeySelectorRef(StringRef(a, s_b), false, 0), false),
SevInfo,
tssId));
// test key is next over in one shard, one found it and other didn't
// positive
// one that didn't find is +1
ASSERT(TSS_doCompare(gkReq,
GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, 1), false),
GetKeyReply(KeySelectorRef(StringRef(a, s_b), true, 0), false),
SevInfo,
tssId));
ASSERT(!TSS_doCompare(gkReq,
GetKeyReply(KeySelectorRef(StringRef(a, s_a), true, 0), false),
GetKeyReply(KeySelectorRef(StringRef(a, s_b), false, 1), false),
SevInfo,
tssId));
// negative will have zero offset but not equal set
ASSERT(TSS_doCompare(gkReq,
GetKeyReply(KeySelectorRef(StringRef(a, s_a), true, 0), false),
GetKeyReply(KeySelectorRef(StringRef(a, s_b), false, 0), false),
SevInfo,
tssId));
ASSERT(!TSS_doCompare(gkReq,
GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, 0), false),
GetKeyReply(KeySelectorRef(StringRef(a, s_b), true, 0), false),
SevInfo,
tssId));
// test shard boundary key returned by incomplete query is the same as the key found by the other (only possible in
// positive direction)
ASSERT(TSS_doCompare(gkReq,
GetKeyReply(KeySelectorRef(StringRef(a, s_a), true, 0), false),
GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, 1), false),
SevInfo,
tssId));
// explicitly test checksum function
std::string s12 = "ABCDEFGHIJKL";
std::string s13 = "ABCDEFGHIJKLO";
std::string checksumStart13 = "(13)";
// short values pass through unchanged; longer values are prefixed with "(length)"
ASSERT(s_a == traceChecksumValue(StringRef(s_a)));
ASSERT(s12 == traceChecksumValue(StringRef(s12)));
ASSERT(checksumStart13 == traceChecksumValue(StringRef(s13)).substr(0, 4));
return Void();
}

View File

@ -29,7 +29,9 @@
#include "fdbrpc/LoadBalance.actor.h"
#include "fdbrpc/Stats.h"
#include "fdbrpc/TimedRequest.h"
#include "fdbrpc/TSSComparison.h"
#include "fdbclient/TagThrottle.h"
#include "flow/UnitTest.h"
// Dead code, removed in the next protocol version
struct VersionReply {
@ -54,6 +56,7 @@ struct StorageServerInterface {
LocalityData locality;
UID uniqueID;
Optional<UID> tssPairID;
RequestStream<struct GetValueRequest> getValue;
RequestStream<struct GetKeyRequest> getKey;
@ -80,6 +83,7 @@ struct StorageServerInterface {
NetworkAddress stableAddress() const { return getValue.getEndpoint().getStableAddress(); }
Optional<NetworkAddress> secondaryAddress() const { return getValue.getEndpoint().addresses.secondaryAddress; }
UID id() const { return uniqueID; }
bool isTss() const { return tssPairID.present(); }
std::string toString() const { return id().shortString(); }
template <class Ar>
void serialize(Ar& ar) {
@ -88,7 +92,11 @@ struct StorageServerInterface {
// considered
if (ar.protocolVersion().hasSmallEndpoints()) {
if (ar.protocolVersion().hasTSS()) {
serializer(ar, uniqueID, locality, getValue, tssPairID);
} else {
serializer(ar, uniqueID, locality, getValue);
}
if (Ar::isDeserializing) {
getKey = RequestStream<struct GetKeyRequest>(getValue.getEndpoint().getAdjustedEndpoint(1));
getKeyValues = RequestStream<struct GetKeyValuesRequest>(getValue.getEndpoint().getAdjustedEndpoint(2));
@ -127,10 +135,11 @@ struct StorageServerInterface {
waitFailure,
getQueuingMetrics,
getKeyValueStoreType);
if (ar.protocolVersion().hasWatches())
if (ar.protocolVersion().hasWatches()) {
serializer(ar, watchValue);
}
}
}
bool operator==(StorageServerInterface const& s) const { return uniqueID == s.uniqueID; }
bool operator<(StorageServerInterface const& s) const { return uniqueID < s.uniqueID; }
void initEndpoints() {

View File

@ -25,6 +25,7 @@
#include "flow/Arena.h"
#include "flow/TDMetric.actor.h"
#include "flow/serialize.h"
#include "flow/UnitTest.h"
const KeyRef systemKeysPrefix = LiteralStringRef("\xff");
const KeyRangeRef normalKeys(KeyRef(), systemKeysPrefix);
@ -345,7 +346,10 @@ uint16_t cacheChangeKeyDecodeIndex(const KeyRef& key) {
return idx;
}
const KeyRangeRef tssMappingKeys(LiteralStringRef("\xff/tss/"), LiteralStringRef("\xff/tss0"));
const KeyRangeRef serverTagKeys(LiteralStringRef("\xff/serverTag/"), LiteralStringRef("\xff/serverTag0"));
const KeyRef serverTagPrefix = serverTagKeys.begin;
const KeyRangeRef serverTagConflictKeys(LiteralStringRef("\xff/serverTagConflict/"),
LiteralStringRef("\xff/serverTagConflict0"));
@ -532,6 +536,7 @@ const Key serverListKeyFor(UID serverID) {
return wr.toValue();
}
// TODO use flatbuffers depending on version
const Value serverListValue(StorageServerInterface const& server) {
BinaryWriter wr(IncludeVersion(ProtocolVersion::withServerListValue()));
wr << server;
@ -550,6 +555,17 @@ StorageServerInterface decodeServerListValue(ValueRef const& value) {
return s;
}
const Value serverListValueFB(StorageServerInterface const& server) {
return ObjectWriter::toValue(server, IncludeVersion());
}
StorageServerInterface decodeServerListValueFB(ValueRef const& value) {
StorageServerInterface s;
ObjectReader reader(value.begin(), IncludeVersion());
reader.deserialize(s);
return s;
}
// processClassKeys.contains(k) iff k.startsWith( processClassKeys.begin ) because '/'+1 == '0'
const KeyRangeRef processClassKeys(LiteralStringRef("\xff/processClass/"), LiteralStringRef("\xff/processClass0"));
const KeyRef processClassPrefix = processClassKeys.begin;
@ -594,6 +610,9 @@ ProcessClass decodeProcessClassValue(ValueRef const& value) {
const KeyRangeRef configKeys(LiteralStringRef("\xff/conf/"), LiteralStringRef("\xff/conf0"));
const KeyRef configKeysPrefix = configKeys.begin;
const KeyRef perpetualStorageWiggleKey(LiteralStringRef("\xff/conf/perpetual_storage_wiggle"));
const KeyRef wigglingStorageServerKey(LiteralStringRef("\xff/storageWigglePID"));
const KeyRef triggerDDTeamInfoPrintKey(LiteralStringRef("\xff/triggerDDTeamInfoPrint"));
const KeyRangeRef excludedServersKeys(LiteralStringRef("\xff/conf/excluded/"), LiteralStringRef("\xff/conf/excluded0"));
@ -633,15 +652,17 @@ std::string encodeFailedServersKey(AddressExclusion const& addr) {
// const KeyRangeRef globalConfigKeys( LiteralStringRef("\xff/globalConfig/"), LiteralStringRef("\xff/globalConfig0") );
// const KeyRef globalConfigPrefix = globalConfigKeys.begin;
const KeyRangeRef globalConfigDataKeys( LiteralStringRef("\xff/globalConfig/k/"), LiteralStringRef("\xff/globalConfig/k0") );
const KeyRangeRef globalConfigDataKeys(LiteralStringRef("\xff/globalConfig/k/"),
LiteralStringRef("\xff/globalConfig/k0"));
const KeyRef globalConfigKeysPrefix = globalConfigDataKeys.begin;
const KeyRangeRef globalConfigHistoryKeys( LiteralStringRef("\xff/globalConfig/h/"), LiteralStringRef("\xff/globalConfig/h0") );
const KeyRangeRef globalConfigHistoryKeys(LiteralStringRef("\xff/globalConfig/h/"),
LiteralStringRef("\xff/globalConfig/h0"));
const KeyRef globalConfigHistoryPrefix = globalConfigHistoryKeys.begin;
const KeyRef globalConfigVersionKey = LiteralStringRef("\xff/globalConfig/v");
const KeyRangeRef workerListKeys( LiteralStringRef("\xff/worker/"), LiteralStringRef("\xff/worker0") );
const KeyRangeRef workerListKeys(LiteralStringRef("\xff/worker/"), LiteralStringRef("\xff/worker0"));
const KeyRef workerListPrefix = workerListKeys.begin;
const Key workerListKeyFor(StringRef processID) {
@ -939,124 +960,8 @@ const KeyRef mustContainSystemMutationsKey = LiteralStringRef("\xff/mustContainS
const KeyRangeRef monitorConfKeys(LiteralStringRef("\xff\x02/monitorConf/"), LiteralStringRef("\xff\x02/monitorConf0"));
const KeyRef restoreLeaderKey = LiteralStringRef("\xff\x02/restoreLeader");
const KeyRangeRef restoreWorkersKeys(LiteralStringRef("\xff\x02/restoreWorkers/"),
LiteralStringRef("\xff\x02/restoreWorkers0"));
const KeyRef restoreStatusKey = LiteralStringRef("\xff\x02/restoreStatus/");
const KeyRef restoreRequestTriggerKey = LiteralStringRef("\xff\x02/restoreRequestTrigger");
const KeyRef restoreRequestDoneKey = LiteralStringRef("\xff\x02/restoreRequestDone");
const KeyRangeRef restoreRequestKeys(LiteralStringRef("\xff\x02/restoreRequests/"),
LiteralStringRef("\xff\x02/restoreRequests0"));
const KeyRangeRef restoreApplierKeys(LiteralStringRef("\xff\x02/restoreApplier/"),
LiteralStringRef("\xff\x02/restoreApplier0"));
const KeyRef restoreApplierTxnValue = LiteralStringRef("1");
// restoreApplierKeys: track atomic transaction progress to ensure applying atomicOp exactly once
// Version and batchIndex are passed in as LittleEndian,
// they must be converted to BigEndian to maintain ordering in lexical order
const Key restoreApplierKeyFor(UID const& applierID, int64_t batchIndex, Version version) {
BinaryWriter wr(Unversioned());
wr.serializeBytes(restoreApplierKeys.begin);
wr << applierID << bigEndian64(batchIndex) << bigEndian64(version);
return wr.toValue();
}
std::tuple<UID, int64_t, Version> decodeRestoreApplierKey(ValueRef const& key) {
BinaryReader rd(key, Unversioned());
UID applierID;
int64_t batchIndex;
Version version;
rd >> applierID >> batchIndex >> version;
return std::make_tuple(applierID, bigEndian64(batchIndex), bigEndian64(version));
}
// Encode restore worker key for workerID
const Key restoreWorkerKeyFor(UID const& workerID) {
BinaryWriter wr(Unversioned());
wr.serializeBytes(restoreWorkersKeys.begin);
wr << workerID;
return wr.toValue();
}
// Encode restore agent value
const Value restoreWorkerInterfaceValue(RestoreWorkerInterface const& cmdInterf) {
BinaryWriter wr(IncludeVersion(ProtocolVersion::withRestoreWorkerInterfaceValue()));
wr << cmdInterf;
return wr.toValue();
}
RestoreWorkerInterface decodeRestoreWorkerInterfaceValue(ValueRef const& value) {
RestoreWorkerInterface s;
BinaryReader reader(value, IncludeVersion());
reader >> s;
return s;
}
// Encode and decode restore request value
// restoreRequestTrigger key
const Value restoreRequestTriggerValue(UID randomID, int const numRequests) {
BinaryWriter wr(IncludeVersion(ProtocolVersion::withRestoreRequestTriggerValue()));
wr << numRequests;
wr << randomID;
return wr.toValue();
}
int decodeRestoreRequestTriggerValue(ValueRef const& value) {
int s;
UID randomID;
BinaryReader reader(value, IncludeVersion());
reader >> s;
reader >> randomID;
return s;
}
// restoreRequestDone key
const Value restoreRequestDoneVersionValue(Version readVersion) {
BinaryWriter wr(IncludeVersion(ProtocolVersion::withRestoreRequestDoneVersionValue()));
wr << readVersion;
return wr.toValue();
}
Version decodeRestoreRequestDoneVersionValue(ValueRef const& value) {
Version v;
BinaryReader reader(value, IncludeVersion());
reader >> v;
return v;
}
const Key restoreRequestKeyFor(int const& index) {
BinaryWriter wr(Unversioned());
wr.serializeBytes(restoreRequestKeys.begin);
wr << index;
return wr.toValue();
}
const Value restoreRequestValue(RestoreRequest const& request) {
BinaryWriter wr(IncludeVersion(ProtocolVersion::withRestoreRequestValue()));
wr << request;
return wr.toValue();
}
RestoreRequest decodeRestoreRequestValue(ValueRef const& value) {
RestoreRequest s;
BinaryReader reader(value, IncludeVersion());
reader >> s;
return s;
}
// TODO: Register restore performance data to restoreStatus key
const Key restoreStatusKeyFor(StringRef statusType) {
BinaryWriter wr(Unversioned());
wr.serializeBytes(restoreStatusKey);
wr << statusType;
return wr.toValue();
}
const Value restoreStatusValue(double val) {
BinaryWriter wr(IncludeVersion(ProtocolVersion::withRestoreStatusValue()));
wr << StringRef(std::to_string(val));
return wr.toValue();
}
const KeyRef healthyZoneKey = LiteralStringRef("\xff\x02/healthyZone");
const StringRef ignoreSSFailuresZoneString = LiteralStringRef("IgnoreSSFailures");
const KeyRef rebalanceDDIgnoreKey = LiteralStringRef("\xff\x02/rebalanceDDIgnored");
@ -1082,3 +987,60 @@ const KeyRangeRef testOnlyTxnStateStorePrefixRange(LiteralStringRef("\xff/TESTON
const KeyRef writeRecoveryKey = LiteralStringRef("\xff/writeRecovery");
const ValueRef writeRecoveryKeyTrue = LiteralStringRef("1");
const KeyRef snapshotEndVersionKey = LiteralStringRef("\xff/snapshotEndVersion");
// For tests: round-trips a StorageServerInterface through the serverList value
// encoding — BinaryWriter-based when useFB is false, ObjectWriter/flatbuffers-based
// when useFB is true — printing both interfaces and asserting that the decoded copy
// matches the original (id, locality, tss pairing, address, and getValue endpoint).
void testSSISerdes(StorageServerInterface const& ssi, bool useFB) {
	printf("ssi=\nid=%s\nlocality=%s\nisTss=%s\ntssId=%s\naddress=%s\ngetValue=%s\n\n\n",
	       ssi.id().toString().c_str(),
	       ssi.locality.toString().c_str(),
	       ssi.isTss() ? "true" : "false",
	       ssi.isTss() ? ssi.tssPairID.get().toString().c_str() : "",
	       ssi.address().toString().c_str(),
	       ssi.getValue.getEndpoint().token.toString().c_str());

	StorageServerInterface ssi2 =
	    (useFB) ? decodeServerListValueFB(serverListValueFB(ssi)) : decodeServerListValue(serverListValue(ssi));

	printf("ssi2=\nid=%s\nlocality=%s\nisTss=%s\ntssId=%s\naddress=%s\ngetValue=%s\n\n\n",
	       ssi2.id().toString().c_str(),
	       ssi2.locality.toString().c_str(),
	       ssi2.isTss() ? "true" : "false",
	       ssi2.isTss() ? ssi2.tssPairID.get().toString().c_str() : "",
	       ssi2.address().toString().c_str(),
	       ssi2.getValue.getEndpoint().token.toString().c_str());

	ASSERT(ssi.id() == ssi2.id());
	ASSERT(ssi.locality == ssi2.locality);
	ASSERT(ssi.isTss() == ssi2.isTss());
	if (ssi.isTss()) {
		// Bug fix: this previously compared ssi2.tssPairID against itself, which is
		// vacuously true and could not detect a corrupted tssPairID round-trip.
		ASSERT(ssi.tssPairID.get() == ssi2.tssPairID.get());
	}
	ASSERT(ssi.address() == ssi2.address());
	ASSERT(ssi.getValue.getEndpoint().token == ssi2.getValue.getEndpoint().token);
}
// unit test for serialization since tss stuff had bugs
// Round-trips a StorageServerInterface through both serverList encodings, first
// without and then with a tssPairID set, via testSSISerdes above.
TEST_CASE("/SystemData/SerDes/SSI") {
printf("testing ssi serdes\n");
LocalityData localityData(Optional<Standalone<StringRef>>(),
Standalone<StringRef>(deterministicRandom()->randomUniqueID().toString()),
Standalone<StringRef>(deterministicRandom()->randomUniqueID().toString()),
Optional<Standalone<StringRef>>());
// non-tss
StorageServerInterface ssi;
ssi.uniqueID = UID(0x1234123412341234, 0x5678567856785678);
ssi.locality = localityData;
ssi.initEndpoints();
testSSISerdes(ssi, false);
testSSISerdes(ssi, true);
// tss: setting tssPairID makes isTss() true, exercising the tss serialization path
ssi.tssPairID = UID(0x2345234523452345, 0x1238123812381238);
testSSISerdes(ssi, false);
testSSISerdes(ssi, true);
printf("ssi serdes test complete\n");
return Void();
}

View File

@ -26,7 +26,6 @@
#include "fdbclient/FDBTypes.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/RestoreWorkerInterface.actor.h"
// Don't warn on constants being defined in this file.
#pragma clang diagnostic push
@ -115,6 +114,9 @@ extern const KeyRef cacheChangePrefix;
const Key cacheChangeKeyFor(uint16_t idx);
uint16_t cacheChangeKeyDecodeIndex(const KeyRef& key);
// "\xff/tss/[[serverId]]" := "[[tssId]]"
extern const KeyRangeRef tssMappingKeys;
// "\xff/serverTag/[[serverID]]" = "[[Tag]]"
// Provides the Tag for the given serverID. Used to access a
// storage server's corresponding TLog in order to apply mutations.
@ -196,6 +198,8 @@ UID decodeProcessClassKeyOld(KeyRef const& key);
extern const KeyRangeRef configKeys;
extern const KeyRef configKeysPrefix;
extern const KeyRef perpetualStorageWiggleKey;
extern const KeyRef wigglingStorageServerKey;
// Change the value of this key to anything and that will trigger detailed data distribution team info log.
extern const KeyRef triggerDDTeamInfoPrintKey;
@ -442,31 +446,6 @@ extern const KeyRef mustContainSystemMutationsKey;
// Key range reserved for storing changes to monitor conf files
extern const KeyRangeRef monitorConfKeys;
// Fast restore
extern const KeyRef restoreLeaderKey;
extern const KeyRangeRef restoreWorkersKeys;
extern const KeyRef restoreStatusKey; // To be used when we measure fast restore performance
extern const KeyRef restoreRequestTriggerKey;
extern const KeyRef restoreRequestDoneKey;
extern const KeyRangeRef restoreRequestKeys;
extern const KeyRangeRef restoreApplierKeys;
extern const KeyRef restoreApplierTxnValue;
const Key restoreApplierKeyFor(UID const& applierID, int64_t batchIndex, Version version);
std::tuple<UID, int64_t, Version> decodeRestoreApplierKey(ValueRef const& key);
const Key restoreWorkerKeyFor(UID const& workerID);
const Value restoreWorkerInterfaceValue(RestoreWorkerInterface const& server);
RestoreWorkerInterface decodeRestoreWorkerInterfaceValue(ValueRef const& value);
const Value restoreRequestTriggerValue(UID randomUID, int const numRequests);
int decodeRestoreRequestTriggerValue(ValueRef const& value);
const Value restoreRequestDoneVersionValue(Version readVersion);
Version decodeRestoreRequestDoneVersionValue(ValueRef const& value);
const Key restoreRequestKeyFor(int const& index);
const Value restoreRequestValue(RestoreRequest const& server);
RestoreRequest decodeRestoreRequestValue(ValueRef const& value);
const Key restoreStatusKeyFor(StringRef statusType);
const Value restoreStatusValue(double val);
extern const KeyRef healthyZoneKey;
extern const StringRef ignoreSSFailuresZoneString;
extern const KeyRef rebalanceDDIgnoreKey;

View File

@ -194,6 +194,40 @@ struct TagThrottleInfo {
}
};
// Throttle limit for a client tag: a transactions-per-second rate and the wall-clock
// time at which the limit expires.
struct ClientTagThrottleLimits {
double tpsRate;
// Absolute local time (now()-based) at which this limit expires.
double expiration;
ClientTagThrottleLimits() : tpsRate(0), expiration(0) {}
ClientTagThrottleLimits(double tpsRate, double expiration) : tpsRate(tpsRate), expiration(expiration) {}
template <class Archive>
void serialize(Archive& ar) {
// Convert expiration time to a duration to avoid clock differences
// (the absolute expiration is rebuilt against the receiver's clock below)
double duration = 0;
if (!ar.isDeserializing) {
duration = expiration - now();
}
serializer(ar, tpsRate, duration);
if (ar.isDeserializing) {
expiration = now() + duration;
}
}
};
// Client-side estimate of a transaction's commit cost, shipped with the commit
// for throttling/accounting purposes.
struct ClientTrCommitCostEstimation {
int opsCount = 0;
uint64_t writeCosts = 0;
// Pairs of (index, cost) for clear operations whose cost was estimated.
std::deque<std::pair<int, uint64_t>> clearIdxCosts;
uint32_t expensiveCostEstCount = 0;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, opsCount, writeCosts, clearIdxCosts, expensiveCostEstCount);
}
};
namespace ThrottleApi {
Future<std::vector<TagThrottleInfo>> getThrottledTags(Database const& db,
int const& limit,

View File

@ -474,7 +474,7 @@ void ThreadSafeApi::addNetworkThreadCompletionHook(void (*hook)(void*), void* ho
MutexHolder holder(lock); // We could use the network thread to protect this action, but then we can't guarantee
// upon return that the hook is set.
threadCompletionHooks.push_back(std::make_pair(hook, hookParameter));
threadCompletionHooks.emplace_back(hook, hookParameter);
}
IClientApi* ThreadSafeApi::api = new ThreadSafeApi();

View File

@ -71,6 +71,8 @@ Tuple::Tuple(StringRef const& str, bool exclude_incomplete) {
i += sizeof(float) + 1;
} else if (data[i] == 0x21) {
i += sizeof(double) + 1;
} else if (data[i] == 0x26 || data[i] == 0x27) {
i += 1;
} else if (data[i] == '\x00') {
i += 1;
} else {
@ -144,6 +146,16 @@ Tuple& Tuple::append(int64_t value) {
return *this;
}
// Appends a boolean element to the tuple, using type code 0x27 for true and
// 0x26 for false. Returns *this for chaining.
Tuple& Tuple::appendBool(bool value) {
	offsets.push_back(data.size());
	data.push_back(data.arena(), value ? 0x27 : 0x26);
	return *this;
}
Tuple& Tuple::appendFloat(float value) {
offsets.push_back(data.size());
float swap = bigEndianFloat(value);
@ -192,6 +204,8 @@ Tuple::ElementType Tuple::getType(size_t index) const {
return ElementType::FLOAT;
} else if (code == 0x21) {
return ElementType::DOUBLE;
} else if (code == 0x26 || code == 0x27) {
return ElementType::BOOL;
} else {
throw invalid_tuple_data_type();
}
@ -287,6 +301,21 @@ int64_t Tuple::getInt(size_t index, bool allow_incomplete) const {
}
// TODO: Combine with bindings/flow/Tuple.*. This code is copied from there.
// Returns the boolean element at the given position. Throws invalid_tuple_index if
// index is out of range and invalid_tuple_data_type if the element's type code is
// not a boolean (0x26 = false, 0x27 = true).
bool Tuple::getBool(size_t index) const {
	if (index >= offsets.size()) {
		throw invalid_tuple_index();
	}
	ASSERT_LT(offsets[index], data.size());
	switch (data[offsets[index]]) {
	case 0x26:
		return false;
	case 0x27:
		return true;
	default:
		throw invalid_tuple_data_type();
	}
}
float Tuple::getFloat(size_t index) const {
if (index >= offsets.size()) {
throw invalid_tuple_index();

View File

@ -40,6 +40,7 @@ struct Tuple {
Tuple& append(int64_t);
// There are some ambiguous append calls in fdbclient, so to make it easier
// to add append for floats and doubles, name them differently for now.
Tuple& appendBool(bool);
Tuple& appendFloat(float);
Tuple& appendDouble(double);
Tuple& appendNull();
@ -51,7 +52,7 @@ struct Tuple {
return append(t);
}
enum ElementType { NULL_TYPE, INT, BYTES, UTF8, FLOAT, DOUBLE };
enum ElementType { NULL_TYPE, INT, BYTES, UTF8, BOOL, FLOAT, DOUBLE };
// this is number of elements, not length of data
size_t size() const { return offsets.size(); }
@ -59,6 +60,7 @@ struct Tuple {
ElementType getType(size_t index) const;
Standalone<StringRef> getString(size_t index) const;
int64_t getInt(size_t index, bool allow_incomplete = false) const;
bool getBool(size_t index) const;
float getFloat(size_t index) const;
double getDouble(size_t index) const;

View File

@ -856,7 +856,7 @@ void load_conf(const char* confpath, uid_t& uid, gid_t& gid, sigset_t* mask, fdb
if (id_command[i.first]->kill_on_configuration_change) {
kill_ids.push_back(i.first);
start_ids.push_back(std::make_pair(i.first, cmd));
start_ids.emplace_back(i.first, cmd);
}
} else {
log_msg(SevInfo, "Updated configuration for %s\n", id_command[i.first]->ssection.c_str());

View File

@ -46,7 +46,8 @@ EvictablePage::~EvictablePage() {
}
}
std::map<std::string, OpenFileInfo> AsyncFileCached::openFiles;
// A map of filename to the file handle for all opened cached files
std::map<std::string, UnsafeWeakFutureReference<IAsyncFile>> AsyncFileCached::openFiles;
void AsyncFileCached::remove_page(AFCPage* page) {
pages.erase(page->pageOffset);

View File

@ -132,39 +132,32 @@ struct EvictablePageCache : ReferenceCounted<EvictablePageCache> {
const CacheEvictionType cacheEvictionType;
};
struct OpenFileInfo : NonCopyable {
IAsyncFile* f;
Future<Reference<IAsyncFile>> opened; // Only valid until the file is fully opened
OpenFileInfo() : f(0) {}
OpenFileInfo(OpenFileInfo&& r) noexcept : f(r.f), opened(std::move(r.opened)) { r.f = 0; }
Future<Reference<IAsyncFile>> get() {
if (f)
return Reference<IAsyncFile>::addRef(f);
else
return opened;
}
};
struct AFCPage;
class AsyncFileCached final : public IAsyncFile, public ReferenceCounted<AsyncFileCached> {
friend struct AFCPage;
public:
// Opens a file that uses the FDB in-memory page cache
static Future<Reference<IAsyncFile>> open(std::string filename, int flags, int mode) {
//TraceEvent("AsyncFileCachedOpen").detail("Filename", filename);
if (openFiles.find(filename) == openFiles.end()) {
auto itr = openFiles.find(filename);
if (itr == openFiles.end()) {
auto f = open_impl(filename, flags, mode);
if (f.isReady() && f.isError())
return f;
if (!f.isReady())
openFiles[filename].opened = f;
else
return f.get();
auto result = openFiles.try_emplace(filename, f);
// This should be inserting a new entry
ASSERT(result.second);
itr = result.first;
// We return here instead of falling through to the outer scope so that we don't delete all references to
// the underlying file before returning
return itr->second.get();
}
return openFiles[filename].get();
return itr->second.get();
}
Future<int> read(void* data, int length, int64_t offset) override {
@ -263,7 +256,9 @@ public:
~AsyncFileCached() override;
private:
static std::map<std::string, OpenFileInfo> openFiles;
// A map of filename to the file handle for all opened cached files
static std::map<std::string, UnsafeWeakFutureReference<IAsyncFile>> openFiles;
std::string filename;
Reference<IAsyncFile> uncached;
int64_t length;
@ -330,6 +325,7 @@ private:
static Future<Reference<IAsyncFile>> open_impl(std::string filename, int flags, int mode);
// Opens a file that uses the FDB in-memory page cache
ACTOR static Future<Reference<IAsyncFile>> open_impl(std::string filename,
int flags,
int mode,
@ -345,10 +341,7 @@ private:
TraceEvent("AFCUnderlyingOpenEnd").detail("Filename", filename);
int64_t l = wait(f->size());
TraceEvent("AFCUnderlyingSize").detail("Filename", filename).detail("Size", l);
auto& of = openFiles[filename];
of.f = new AsyncFileCached(f, filename, l, pageCache);
of.opened = Future<Reference<IAsyncFile>>();
return Reference<IAsyncFile>(of.f);
return new AsyncFileCached(f, filename, l, pageCache);
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled)
openFiles.erase(filename);

View File

@ -130,6 +130,9 @@ public:
UID id;
std::string filename;
// For files that use atomic write and create, they are initially created with an extra suffix
std::string initialFilename;
// An approximation of the size of the file; .size() should be used instead of this variable in most cases
mutable int64_t approximateSize;
@ -182,11 +185,13 @@ private:
reponses; // cannot call getResult on this actor collection, since the actors will be on different processes
AsyncFileNonDurable(const std::string& filename,
const std::string& initialFilename,
Reference<IAsyncFile> file,
Reference<DiskParameters> diskParameters,
NetworkAddress openedAddress,
bool aio)
: openedAddress(openedAddress), pendingModifications(uint64_t(-1)), approximateSize(0), reponses(false),
: filename(filename), initialFilename(initialFilename), file(file), diskParameters(diskParameters),
openedAddress(openedAddress), pendingModifications(uint64_t(-1)), approximateSize(0), reponses(false),
aio(aio) {
// This is only designed to work in simulation
@ -194,9 +199,6 @@ private:
this->id = deterministicRandom()->randomUniqueID();
//TraceEvent("AsyncFileNonDurable_Create", id).detail("Filename", filename);
this->file = file;
this->filename = filename;
this->diskParameters = diskParameters;
maxWriteDelay = FLOW_KNOBS->NON_DURABLE_MAX_WRITE_DELAY;
hasBeenSynced = false;
@ -236,10 +238,11 @@ public:
//TraceEvent("AsyncFileNonDurableOpenWaitOnDelete2").detail("Filename", filename);
if (shutdown.isReady())
throw io_error().asInjectedFault();
wait(g_simulator.onProcess(currentProcess, currentTaskID));
}
state Reference<AsyncFileNonDurable> nonDurableFile(
new AsyncFileNonDurable(filename, file, diskParameters, currentProcess->address, aio));
new AsyncFileNonDurable(filename, actualFilename, file, diskParameters, currentProcess->address, aio));
// Causes the approximateSize member to be set
state Future<int64_t> sizeFuture = nonDurableFile->size();
@ -269,14 +272,39 @@ public:
}
void addref() override { ReferenceCounted<AsyncFileNonDurable>::addref(); }
void delref() override {
if (delref_no_destroy()) {
ASSERT(filesBeingDeleted.count(filename) == 0);
if (filesBeingDeleted.count(filename) == 0) {
//TraceEvent("AsyncFileNonDurable_StartDelete", id).detail("Filename", filename);
Future<Void> deleteFuture = deleteFile(this);
if (!deleteFuture.isReady())
filesBeingDeleted[filename] = deleteFuture;
}
removeOpenFile(filename, this);
if (initialFilename != filename) {
removeOpenFile(initialFilename, this);
}
}
}
// Removes this file's entry from the machine's openFiles map, provided the entry
// still exists and still refers to this AsyncFileNonDurable instance.
static void removeOpenFile(std::string filename, AsyncFileNonDurable* file) {
	auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles;
	auto entry = openFiles.find(filename);

	// Various actions (e.g. simulated delete) can remove a file from openFiles prematurely, so it
	// may already be gone. Renamed files (from atomic write and create) will also be present under
	// only one of the two names.
	if (entry == openFiles.end()) {
		return;
	}

	// Even if the filename exists, it doesn't mean the entry references the same file: the file
	// could have been renamed and a different file later opened under the same name. Only erase
	// the entry when it points at this exact instance.
	if (entry->second.getPtrIfReady().orDefault(nullptr) == file) {
		openFiles.erase(entry);
	}
}
// Passes along reads straight to the underlying file, waiting for any outstanding changes that could affect the
@ -832,11 +860,9 @@ private:
//TraceEvent("AsyncFileNonDurable_FinishDelete", self->id).detail("Filename", self->filename);
delete self;
wait(g_simulator.onProcess(currentProcess, currentTaskID));
return Void();
} catch (Error& e) {
state Error err = e;
wait(g_simulator.onProcess(currentProcess, currentTaskID));
throw err;
}
}

View File

@ -29,7 +29,8 @@ set(FDBRPC_SRCS
sim2.actor.cpp
sim_validation.cpp
TimedRequest.h
TraceFileIO.cpp)
TraceFileIO.cpp
TSSComparison.h)
set(COMPILE_EIO OFF)

View File

@ -51,6 +51,8 @@ constexpr UID WLTOKEN_PING_PACKET(-1, 1);
constexpr int PACKET_LEN_WIDTH = sizeof(uint32_t);
const uint64_t TOKEN_STREAM_FLAG = 1;
const int WLTOKEN_COUNTS = 12; // number of wellKnownEndpoints
class EndpointMap : NonCopyable {
public:
// Reserve space for this many wellKnownEndpoints
@ -96,6 +98,7 @@ void EndpointMap::realloc() {
void EndpointMap::insertWellKnown(NetworkMessageReceiver* r, const Endpoint::Token& token, TaskPriority priority) {
int index = token.second();
ASSERT(index <= WLTOKEN_COUNTS);
ASSERT(data[index].receiver == nullptr);
data[index].receiver = r;
data[index].token() =
@ -334,7 +337,7 @@ ACTOR Future<Void> pingLatencyLogger(TransportData* self) {
}
TransportData::TransportData(uint64_t transportId)
: endpoints(/*wellKnownTokenCount*/ 11), endpointNotFoundReceiver(endpoints), pingReceiver(endpoints),
: endpoints(WLTOKEN_COUNTS), endpointNotFoundReceiver(endpoints), pingReceiver(endpoints),
warnAlwaysForLargePacket(true), lastIncompatibleMessage(0), transportId(transportId),
numIncompatibleConnections(0) {
degraded = makeReference<AsyncVar<bool>>(false);
@ -1215,7 +1218,7 @@ ACTOR static Future<Void> connectionReader(TransportData* transport,
}
compatible = false;
if (!protocolVersion.hasInexpensiveMultiVersionClient()) {
if(peer) {
if (peer) {
peer->protocolVersion->set(protocolVersion);
}

View File

@ -24,7 +24,7 @@
void HealthMonitor::reportPeerClosed(const NetworkAddress& peerAddress) {
purgeOutdatedHistory();
peerClosedHistory.push_back(std::make_pair(now(), peerAddress));
peerClosedHistory.emplace_back(now(), peerAddress);
peerClosedNum[peerAddress] += 1;
}

View File

@ -36,6 +36,8 @@
#include "fdbrpc/Locality.h"
#include "fdbrpc/QueueModel.h"
#include "fdbrpc/MultiInterface.h"
#include "fdbrpc/simulator.h" // for checking tss simulation mode
#include "fdbrpc/TSSComparison.h"
#include "flow/actorcompiler.h" // This must be the last #include.
using std::vector;
@ -75,6 +77,97 @@ struct LoadBalancedReply {
Optional<LoadBalancedReply> getLoadBalancedReply(const LoadBalancedReply* reply);
Optional<LoadBalancedReply> getLoadBalancedReply(const void*);
// Waits for both the storage server (SS) response and the TSS (testing storage server)
// response to the same request, records metrics about them (request count, errors,
// timeouts, latency), and - when both replies succeeded - compares them via
// TSS_doCompare, counting and tracing any mismatch. The TSS side is bounded by
// LOAD_BALANCE_TSS_TIMEOUT so this actor always terminates even if the TSS hangs.
ACTOR template <class Req, class Resp>
Future<Void> tssComparison(Req req,
                           Future<ErrorOr<Resp>> fSource,
                           Future<ErrorOr<Resp>> fTss,
                           TSSEndpointData tssData) {
	state double startTime = now();
	state Future<Optional<ErrorOr<Resp>>> fTssWithTimeout = timeout(fTss, FLOW_KNOBS->LOAD_BALANCE_TSS_TIMEOUT);
	state int finished = 0;
	state double srcEndTime;
	state double tssEndTime;

	// Wait until both the SS result and the (timeout-bounded) TSS result have arrived.
	// Each branch replaces its future with Never() so it cannot fire twice.
	loop {
		choose {
			when(state ErrorOr<Resp> src = wait(fSource)) {
				srcEndTime = now();
				fSource = Never();
				finished++;
				if (finished == 2) {
					break;
				}
			}
			when(state Optional<ErrorOr<Resp>> tss = wait(fTssWithTimeout)) {
				tssEndTime = now();
				fTssWithTimeout = Never();
				finished++;
				if (finished == 2) {
					break;
				}
			}
		}
	}

	// we want to record ss/tss errors to metrics
	int srcErrorCode = error_code_success;
	int tssErrorCode = error_code_success;

	++tssData.metrics->requests;

	if (src.isError()) {
		srcErrorCode = src.getError().code();
		tssData.metrics->ssError(srcErrorCode);
	}
	// tss is not present when fTssWithTimeout fired due to timeout rather than a reply
	if (!tss.present()) {
		++tssData.metrics->tssTimeouts;
	} else if (tss.get().isError()) {
		tssErrorCode = tss.get().getError().code();
		tssData.metrics->tssError(tssErrorCode);
	}
	// Only attempt a comparison when both sides produced a non-error reply
	if (!src.isError() && tss.present() && !tss.get().isError()) {
		Optional<LoadBalancedReply> srcLB = getLoadBalancedReply(&src.get());
		Optional<LoadBalancedReply> tssLB = getLoadBalancedReply(&tss.get().get());
		ASSERT(srcLB.present() ==
		       tssLB.present()); // getLoadBalancedReply returned different responses for same templated type

		// if Resp is a LoadBalancedReply, only compare if both replies are non-error
		if (!srcLB.present() || (!srcLB.get().error.present() && !tssLB.get().error.present())) {
			// only record latency difference if both requests actually succeeded, so that we're comparing apples to
			// apples
			tssData.metrics->recordLatency(req, srcEndTime - startTime, tssEndTime - startTime);

			// expect mismatches in drop mutations mode.
			Severity traceSeverity =
			    (g_network->isSimulated() && g_simulator.tssMode == ISimulator::TSSMode::EnabledDropMutations)
			        ? SevWarnAlways
			        : SevError;

			if (!TSS_doCompare(req, src.get(), tss.get().get(), traceSeverity, tssData.tssId)) {
				TEST(true); // TSS Mismatch
				++tssData.metrics->mismatches;
			}
		} else if (tssLB.present() && tssLB.get().error.present()) {
			tssErrorCode = tssLB.get().error.get().code();
			tssData.metrics->tssError(tssErrorCode);
		} else if (srcLB.present() && srcLB.get().error.present()) {
			srcErrorCode = srcLB.get().error.get().code();
			tssData.metrics->ssError(srcErrorCode);
		}
	}
	if (srcErrorCode != error_code_success && tssErrorCode != error_code_success && srcErrorCode != tssErrorCode) {
		// if ss and tss both got different errors, record them
		TraceEvent("TSSErrorMismatch")
		    .suppressFor(1.0)
		    .detail("TSSID", tssData.tssId)
		    .detail("SSError", srcErrorCode)
		    .detail("TSSError", tssErrorCode);
	}
	return Void();
}
// Stores state for a request made by the load balancer
template <class Request>
struct RequestData : NonCopyable {
@ -91,11 +184,30 @@ struct RequestData : NonCopyable {
// This is true once setupRequest is called, even though at that point the response is Never().
bool isValid() { return response.isValid(); }
// If the endpoint this request targets has a TSS (testing storage server) pair registered
// in the queue model, send a duplicate copy of the request to the TSS and enqueue an actor
// that compares the TSS reply against the storage server reply. Does nothing when there is
// no model or no TSS mapping for the endpoint.
static void maybeDuplicateTSSRequest(RequestStream<Request> const* stream,
                                     Request& request,
                                     QueueModel* model,
                                     Future<Reply> ssResponse) {
	if (!model) {
		return;
	}

	// Send parallel request to TSS pair, if it exists
	Optional<TSSEndpointData> tssData = model->getTssData(stream->getEndpoint().token.first());
	if (!tssData.present()) {
		return;
	}

	TEST(true); // duplicating request to TSS
	resetReply(request);
	// FIXME: optimize to avoid creating new netNotifiedQueue for each message
	RequestStream<Request> tssStream(tssData.get().endpoint);
	Future<ErrorOr<REPLY_TYPE(Request)>> tssReply = tssStream.tryGetReply(request);
	model->addActor.send(tssComparison(request, ssResponse, tssReply, tssData.get()));
}
// Initializes the request state and starts it, possibly after a backoff delay
void startRequest(double backoff,
bool triedAllOptions,
RequestStream<Request> const* stream,
Request const& request,
Request& request,
QueueModel* model) {
modelHolder = Reference<ModelHolder>();
requestStarted = false;
@ -105,12 +217,15 @@ struct RequestData : NonCopyable {
delay(backoff), [this, stream, &request, model](Void _) {
requestStarted = true;
modelHolder = Reference<ModelHolder>(new ModelHolder(model, stream->getEndpoint().token.first()));
return stream->tryGetReply(request);
Future<Reply> resp = stream->tryGetReply(request);
maybeDuplicateTSSRequest(stream, request, model, resp);
return resp;
});
} else {
requestStarted = true;
modelHolder = Reference<ModelHolder>(new ModelHolder(model, stream->getEndpoint().token.first()));
response = stream->tryGetReply(request);
maybeDuplicateTSSRequest(stream, request, model, response);
}
requestProcessed = false;

View File

@ -60,6 +60,20 @@ double QueueModel::addRequest(uint64_t id) {
return d.penalty;
}
// Registers (or replaces) the TSS pair data for the given endpoint, so that requests sent
// through this endpoint are duplicated to the TSS.
void QueueModel::updateTssEndpoint(uint64_t endpointId, const TSSEndpointData& tssData) {
	data[endpointId].tssData = tssData;
}
// Clears any TSS pair data for the given endpoint, stopping duplication of requests to the TSS.
void QueueModel::removeTssEndpoint(uint64_t endpointId) {
	data[endpointId].tssData = Optional<TSSEndpointData>();
}
// Returns the TSS pair data registered for this endpoint, if any. Note that, like
// addRequest, this creates a default QueueData entry for ids not seen before.
Optional<TSSEndpointData> QueueModel::getTssData(uint64_t id) {
	auto& d = data[id];
	return d.tssData;
}
// Overload chosen for reply types derived from LoadBalancedReply: wraps the reply's
// LoadBalancedReply portion in a present Optional.
Optional<LoadBalancedReply> getLoadBalancedReply(const LoadBalancedReply* reply) {
	return Optional<LoadBalancedReply>(*reply);
}

View File

@ -26,6 +26,17 @@
#include "fdbrpc/Smoother.h"
#include "flow/Knobs.h"
#include "flow/ActorCollection.h"
#include "fdbrpc/TSSComparison.h" // For TSS Metrics
#include "fdbrpc/FlowTransport.h" // For Endpoint
// Everything a client-side endpoint needs in order to duplicate requests to a storage
// server's TSS (testing storage server) pair: the pair's identity, where to send the
// duplicate requests, and where to accumulate comparison metrics.
struct TSSEndpointData {
	UID tssId; // ID of the TSS paired with this endpoint's storage server
	Endpoint endpoint; // The TSS endpoint to which duplicated requests are sent
	Reference<TSSMetrics> metrics; // Shared, reference-counted comparison metrics for this pair

	TSSEndpointData(UID tssId, Endpoint endpoint, Reference<TSSMetrics> metrics)
	    : tssId(tssId), endpoint(endpoint), metrics(metrics) {}
};
// The data structure used for the client-side load balancing algorithm to
// decide which storage server to read data from. Conceptually, it tracks the
@ -59,6 +70,10 @@ struct QueueData {
// hasn't returned a valid result, increase above `futureVersionBackoff`
// to increase the future backoff amount.
double increaseBackoffTime;
// a bit of a hack to store this here, but it's the only centralized place for per-endpoint tracking
Optional<TSSEndpointData> tssData;
QueueData()
: latency(0.001), penalty(1.0), smoothOutstanding(FLOW_KNOBS->QUEUE_MODEL_SMOOTHING_AMOUNT), failedUntil(0),
futureVersionBackoff(FLOW_KNOBS->FUTURE_VERSION_INITIAL_BACKOFF), increaseBackoffTime(0) {}
@ -89,13 +104,29 @@ public:
double secondBudget;
PromiseStream<Future<Void>> addActor;
Future<Void> laggingRequests; // requests for which a different recipient already answered
PromiseStream<Future<Void>> addTSSActor;
Future<Void> tssComparisons; // requests for which a different recipient already answered
int laggingRequestCount;
int laggingTSSCompareCount;
// Updates this endpoint data to duplicate requests to the specified TSS endpoint
void updateTssEndpoint(uint64_t endpointId, const TSSEndpointData& endpointData);
// Removes the TSS mapping from this endpoint to stop duplicating requests to a TSS endpoint
void removeTssEndpoint(uint64_t endpointId);
// Retrieves the data for this endpoint's pair TSS endpoint, if present
Optional<TSSEndpointData> getTssData(uint64_t endpointId);
// Starts the actor collections that track lagging requests and outstanding TSS
// comparisons. laggingTSSCompareCount is explicitly zero-initialized (matching
// laggingRequestCount) so the counter actorCollection updates through a raw
// pointer is well-defined before any comparison actor has been added.
QueueModel() : secondMultiplier(1.0), secondBudget(0), laggingRequestCount(0), laggingTSSCompareCount(0) {
	laggingRequests = actorCollection(addActor.getFuture(), &laggingRequestCount);
	tssComparisons = actorCollection(addTSSActor.getFuture(), &laggingTSSCompareCount);
}
~QueueModel() { laggingRequests.cancel(); }
~QueueModel() {
laggingRequests.cancel();
tssComparisons.cancel();
}
private:
std::unordered_map<uint64_t, QueueData> data;

View File

@ -20,6 +20,7 @@
#ifndef FDBRPC_STATS_H
#define FDBRPC_STATS_H
#include <type_traits>
#pragma once
// Yet another performance statistics interface
@ -136,7 +137,15 @@ struct SpecialCounter final : ICounter, FastAllocated<SpecialCounter<F>>, NonCop
void remove() override { delete this; }
std::string const& getName() const override { return name; }
int64_t getValue() const override { return f(); }
int64_t getValue() const override {
	auto value = f();
	// Reject floating-point counters at compile time: this has been a source of
	// confusion - e.g. a percentage represented as a fraction between 0 and 1 is
	// not meaningful after conversion to int64_t.
	static_assert(!std::is_floating_point_v<decltype(value)>);
	return value;
}
void resetInterval() override {}

89
fdbrpc/TSSComparison.h Normal file
View File

@ -0,0 +1,89 @@
/*
* TSSComparison.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This header is to declare the tss comparison function that LoadBalance.Actor.h needs to be aware of to call,
* But StorageServerInterface.h needs to implement on the types defined in SSI.h.
*/
#ifndef FDBRPC_TSS_COMPARISON_H
#define FDBRPC_TSS_COMPARISON_H
#include "fdbrpc/ContinuousSample.h"
#include "fdbrpc/Stats.h"
// refcounted + noncopyable because both DatabaseContext and individual endpoints share ownership
// Accumulates client-side metrics comparing a storage server (SS) with its TSS
// (testing storage server) pair: request/error/timeout/mismatch counters, latency
// samples for common read operations, and per-error-code tallies.
// refcounted + noncopyable because both DatabaseContext and individual endpoints share ownership
struct TSSMetrics : ReferenceCounted<TSSMetrics>, NonCopyable {
	CounterCollection cc;
	Counter requests; // total requests duplicated to the TSS
	Counter ssErrors; // errors recorded from the storage server
	Counter tssErrors; // errors recorded from the TSS
	Counter tssTimeouts; // TSS replies that did not arrive in time
	Counter mismatches; // replies where SS and TSS disagreed

	// We could probably just ignore getKey as it's seldom used?
	ContinuousSample<double> SSgetValueLatency;
	ContinuousSample<double> SSgetKeyLatency;
	ContinuousSample<double> SSgetKeyValuesLatency;
	ContinuousSample<double> TSSgetValueLatency;
	ContinuousSample<double> TSSgetKeyLatency;
	ContinuousSample<double> TSSgetKeyValuesLatency;

	// Per-error-code counts, tracked separately for the SS and the TSS
	std::unordered_map<int, uint64_t> ssErrorsByCode;
	std::unordered_map<int, uint64_t> tssErrorsByCode;

	// Records one storage server error with the given error code
	void ssError(int code) {
		++ssErrors;
		ssErrorsByCode[code]++;
	}

	// Records one TSS error with the given error code
	void tssError(int code) {
		++tssErrors;
		tssErrorsByCode[code]++;
	}

	// Records matched SS/TSS latencies for a request; declared here, implemented
	// elsewhere for the specific request types that are compared.
	template <class Req>
	void recordLatency(const Req& req, double ssLatency, double tssLatency);

	// Resets latency samples and per-error-code tallies; Counter values are not reset here.
	void clear() {
		SSgetValueLatency.clear();
		SSgetKeyLatency.clear();
		SSgetKeyValuesLatency.clear();
		TSSgetValueLatency.clear();
		TSSgetKeyLatency.clear();
		TSSgetKeyValuesLatency.clear();
		tssErrorsByCode.clear();
		ssErrorsByCode.clear();
	}

	TSSMetrics()
	  : cc("TSSClientMetrics"), requests("Requests", cc), ssErrors("SSErrors", cc), tssErrors("TSSErrors", cc),
	    tssTimeouts("TSSTimeouts", cc), mismatches("Mismatches", cc), SSgetValueLatency(1000), SSgetKeyLatency(1000),
	    SSgetKeyValuesLatency(1000), TSSgetValueLatency(1000), TSSgetKeyLatency(1000), TSSgetKeyValuesLatency(1000) {}
};
// part of the contract of this function is that if there is a mismatch, the implementation needs to record a trace
// event with the specified severity and tssId in the event.
template <class Req, class Rep>
bool TSS_doCompare(const Req& req, const Rep& src, const Rep& tss, Severity traceSeverity, UID tssId);
#endif

View File

@ -537,7 +537,10 @@ public:
std::string getFilename() const override { return actualFilename; }
~SimpleFile() override { _close(h); }
// Closes the underlying OS file handle and decrements the open-file count
// (NOTE(review): openCount presumably tracks simulated open files - confirm against declaration)
~SimpleFile() override {
	_close(h);
	--openCount;
}
private:
int h;
@ -1028,8 +1031,8 @@ public:
// Get the size of all files we've created on the server and subtract them from the free space
for (auto file = proc->machine->openFiles.begin(); file != proc->machine->openFiles.end(); ++file) {
if (file->second.isReady()) {
totalFileSize += ((AsyncFileNonDurable*)file->second.get().getPtr())->approximateSize;
if (file->second.get().isReady()) {
totalFileSize += ((AsyncFileNonDurable*)file->second.get().get().getPtr())->approximateSize;
}
numFiles++;
}
@ -2490,7 +2493,7 @@ Future<Reference<class IAsyncFile>> Sim2FileSystem::open(const std::string& file
actualFilename = filename + ".part";
auto partFile = machineCache.find(actualFilename);
if (partFile != machineCache.end()) {
Future<Reference<IAsyncFile>> f = AsyncFileDetachable::open(partFile->second);
Future<Reference<IAsyncFile>> f = AsyncFileDetachable::open(partFile->second.get());
if (FLOW_KNOBS->PAGE_WRITE_CHECKSUM_HISTORY > 0)
f = map(f, [=](Reference<IAsyncFile> r) {
return Reference<IAsyncFile>(new AsyncFileWriteChecker(r));
@ -2498,19 +2501,26 @@ Future<Reference<class IAsyncFile>> Sim2FileSystem::open(const std::string& file
return f;
}
}
if (machineCache.find(actualFilename) == machineCache.end()) {
Future<Reference<IAsyncFile>> f;
auto itr = machineCache.find(actualFilename);
if (itr == machineCache.end()) {
// Simulated disk parameters are shared by the AsyncFileNonDurable and the underlying SimpleFile.
// This way, they can both keep up with the time to start the next operation
auto diskParameters =
makeReference<DiskParameters>(FLOW_KNOBS->SIM_DISK_IOPS, FLOW_KNOBS->SIM_DISK_BANDWIDTH);
machineCache[actualFilename] =
AsyncFileNonDurable::open(filename,
f = AsyncFileNonDurable::open(filename,
actualFilename,
SimpleFile::open(filename, flags, mode, diskParameters, false),
diskParameters,
(flags & IAsyncFile::OPEN_NO_AIO) == 0);
machineCache[actualFilename] = UnsafeWeakFutureReference<IAsyncFile>(f);
} else {
f = itr->second.get();
}
Future<Reference<IAsyncFile>> f = AsyncFileDetachable::open(machineCache[actualFilename]);
f = AsyncFileDetachable::open(f);
if (FLOW_KNOBS->PAGE_WRITE_CHECKSUM_HISTORY > 0)
f = map(f, [=](Reference<IAsyncFile> r) { return Reference<IAsyncFile>(new AsyncFileWriteChecker(r)); });
return f;

View File

@ -41,7 +41,7 @@ public:
: desiredCoordinators(1), physicalDatacenters(1), processesPerMachine(0), listenersPerProcess(1),
isStopped(false), lastConnectionFailure(0), connectionFailuresDisableDuration(0), speedUpSimulation(false),
allSwapsDisabled(false), backupAgents(BackupAgentType::WaitForType), drAgents(BackupAgentType::WaitForType),
extraDB(nullptr), allowLogSetKills(true), usableRegions(1) {}
extraDB(nullptr), allowLogSetKills(true), usableRegions(1), tssMode(TSSMode::Disabled) {}
// Order matters!
enum KillType {
@ -55,6 +55,9 @@ public:
None
};
// Order matters! all modes >= 2 are fault injection modes
enum TSSMode { Disabled, EnabledNormal, EnabledAddDelay, EnabledDropMutations };
enum class BackupAgentType { NoBackupAgents, WaitForType, BackupToFile, BackupToDB };
// Subclasses may subclass ProcessInfo as well
@ -188,10 +191,14 @@ public:
Promise<KillType> shutdownSignal;
};
// A set of data associated with a simulated machine
struct MachineInfo {
ProcessInfo* machineProcess;
std::vector<ProcessInfo*> processes;
std::map<std::string, Future<Reference<IAsyncFile>>> openFiles;
// A map from filename to file handle for all open files on a machine
std::map<std::string, UnsafeWeakFutureReference<IAsyncFile>> openFiles;
std::set<std::string> deletingFiles;
std::set<std::string> closingFiles;
Optional<Standalone<StringRef>> machineId;
@ -401,6 +408,7 @@ public:
int32_t satelliteTLogWriteAntiQuorumFallback;
std::vector<Optional<Standalone<StringRef>>> primarySatelliteDcIds;
std::vector<Optional<Standalone<StringRef>>> remoteSatelliteDcIds;
TSSMode tssMode;
// Used by workloads that perform reconfigurations
int testerCount;

View File

@ -19,6 +19,7 @@
*/
#include "fdbclient/MutationList.h"
#include "fdbclient/KeyBackedTypes.h" // for key backed map codecs for tss mapping
#include "fdbclient/SystemData.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/Notified.h"
@ -64,10 +65,19 @@ void applyMetadataMutations(SpanID const& spanContext,
NotifiedVersion* commitVersion,
std::map<UID, Reference<StorageInfo>>* storageCache,
std::map<Tag, Version>* tag_popped,
std::unordered_map<UID, StorageServerInterface>* tssMapping,
bool initialCommit) {
// std::map<keyRef, vector<uint16_t>> cacheRangeInfo;
std::map<KeyRef, MutationRef> cachedRangeInfo;
// Testing Storage Server removal (clearing serverTagKey) needs to read tss server list value to determine it is a
// tss + find partner's tag to send the private mutation. Since the removeStorageServer transaction clears both the
// storage list and server tag, we have to enforce ordering, processing the server tag first, and postpone the
// server list clear until the end;
// Similarly, the TSS mapping change key needs to read the server list at the end of the commit
std::vector<KeyRangeRef> tssServerListToRemove;
std::vector<std::pair<UID, UID>> tssMappingToAdd;
for (auto const& m : mutations) {
//TraceEvent("MetadataMutation", dbgid).detail("M", m.toString());
if (toCommit) {
@ -95,12 +105,14 @@ void applyMetadataMutations(SpanID const& spanContext,
for (const auto& id : src) {
auto storageInfo = getStorageInfo(id, storageCache, txnStateStore);
ASSERT(!storageInfo->interf.isTss());
ASSERT(storageInfo->tag != invalidTag);
info.tags.push_back(storageInfo->tag);
info.src_info.push_back(storageInfo);
}
for (const auto& id : dest) {
auto storageInfo = getStorageInfo(id, storageCache, txnStateStore);
ASSERT(!storageInfo->interf.isTss());
ASSERT(storageInfo->tag != invalidTag);
info.tags.push_back(storageInfo->tag);
info.dest_info.push_back(storageInfo);
@ -113,6 +125,8 @@ void applyMetadataMutations(SpanID const& spanContext,
txnStateStore->set(KeyValueRef(m.param1, m.param2));
} else if (m.param1.startsWith(serverKeysPrefix)) {
if (toCommit) {
Tag tag = decodeServerTagValue(
txnStateStore->readValue(serverTagKeyFor(serverKeysDecodeServer(m.param1))).get().get());
MutationRef privatized = m;
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
TraceEvent(SevDebug, "SendingPrivateMutation", dbgid)
@ -120,14 +134,9 @@ void applyMetadataMutations(SpanID const& spanContext,
.detail("Privatized", privatized.toString())
.detail("Server", serverKeysDecodeServer(m.param1))
.detail("TagKey", serverTagKeyFor(serverKeysDecodeServer(m.param1)))
.detail(
"Tag",
decodeServerTagValue(
txnStateStore->readValue(serverTagKeyFor(serverKeysDecodeServer(m.param1))).get().get())
.toString());
.detail("Tag", tag.toString());
toCommit->addTag(decodeServerTagValue(
txnStateStore->readValue(serverTagKeyFor(serverKeysDecodeServer(m.param1))).get().get()));
toCommit->addTag(tag);
toCommit->writeTypedMessage(privatized);
}
} else if (m.param1.startsWith(serverTagPrefix)) {
@ -235,6 +244,29 @@ void applyMetadataMutations(SpanID const& spanContext,
}
}
}
} else if (m.param1.startsWith(tssMappingKeys.begin)) {
if (!initialCommit) {
txnStateStore->set(KeyValueRef(m.param1, m.param2));
if (tssMapping) {
// Normally uses key backed map, so have to use same unpacking code here.
UID ssId = Codec<UID>::unpack(Tuple::unpack(m.param1.removePrefix(tssMappingKeys.begin)));
UID tssId = Codec<UID>::unpack(Tuple::unpack(m.param2));
tssMappingToAdd.push_back(std::pair(ssId, tssId));
// send private mutation to SS that it now has a TSS pair
if (toCommit) {
MutationRef privatized = m;
privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena);
Optional<Value> tagV = txnStateStore->readValue(serverTagKeyFor(ssId)).get();
if (tagV.present()) {
toCommit->addTag(decodeServerTagValue(tagV.get()));
toCommit->writeTypedMessage(privatized);
}
}
}
}
} else if (m.param1 == databaseLockedKey || m.param1 == metadataVersionKey ||
m.param1 == mustContainSystemMutationsKey ||
m.param1.startsWith(applyMutationsBeginRange.begin) ||
@ -379,8 +411,20 @@ void applyMetadataMutations(SpanID const& spanContext,
}
}
if (serverListKeys.intersects(range)) {
if (!initialCommit)
txnStateStore->clear(range & serverListKeys);
if (!initialCommit) {
KeyRangeRef rangeToClear = range & serverListKeys;
if (rangeToClear.singleKeyRange()) {
UID id = decodeServerListKey(rangeToClear.begin);
Optional<Value> ssiV = txnStateStore->readValue(serverListKeyFor(id)).get();
if (ssiV.present() && decodeServerListValue(ssiV.get()).isTss()) {
tssServerListToRemove.push_back(rangeToClear);
} else {
txnStateStore->clear(rangeToClear);
}
} else {
txnStateStore->clear(rangeToClear);
}
}
}
if (tagLocalityListKeys.intersects(range)) {
if (!initialCommit)
@ -411,6 +455,32 @@ void applyMetadataMutations(SpanID const& spanContext,
toCommit->writeTypedMessage(privatized);
}
}
// Might be a tss removal, which doesn't store a tag there.
// Chained if is a little verbose, but avoids unnecessary work
if (toCommit && !initialCommit && !serverKeysCleared.size()) {
KeyRangeRef maybeTssRange = range & serverTagKeys;
if (maybeTssRange.singleKeyRange()) {
UID id = decodeServerTagKey(maybeTssRange.begin);
Optional<Value> ssiV = txnStateStore->readValue(serverListKeyFor(id)).get();
if (ssiV.present()) {
StorageServerInterface ssi = decodeServerListValue(ssiV.get());
if (ssi.isTss()) {
Optional<Value> tagV =
txnStateStore->readValue(serverTagKeyFor(ssi.tssPairID.get())).get();
if (tagV.present()) {
MutationRef privatized = m;
privatized.param1 = maybeTssRange.begin.withPrefix(systemKeys.begin, arena);
privatized.param2 =
keyAfter(maybeTssRange.begin, arena).withPrefix(systemKeys.begin, arena);
toCommit->addTag(decodeServerTagValue(tagV.get()));
toCommit->writeTypedMessage(privatized);
}
}
}
}
}
}
if (!initialCommit) {
KeyRangeRef clearRange = range & serverTagKeys;
@ -439,6 +509,19 @@ void applyMetadataMutations(SpanID const& spanContext,
if (!initialCommit)
txnStateStore->clear(range & serverTagHistoryKeys);
}
if (tssMappingKeys.intersects(range)) {
if (!initialCommit) {
KeyRangeRef rangeToClear = range & tssMappingKeys;
ASSERT(rangeToClear.singleKeyRange());
txnStateStore->clear(rangeToClear);
if (tssMapping) {
// Normally uses key backed map, so have to use same unpacking code here.
UID ssId =
Codec<UID>::unpack(Tuple::unpack(rangeToClear.begin.removePrefix(tssMappingKeys.begin)));
tssMapping->erase(ssId);
}
}
}
if (range.contains(coordinatorsKey)) {
if (!initialCommit)
txnStateStore->clear(singleKeyRange(coordinatorsKey));
@ -568,6 +651,17 @@ void applyMetadataMutations(SpanID const& spanContext,
}
}
for (KeyRangeRef& range : tssServerListToRemove) {
txnStateStore->clear(range);
}
for (auto& tssPair : tssMappingToAdd) {
// read tss server list from txn state store and add it to tss mapping
StorageServerInterface tssi =
decodeServerListValue(txnStateStore->readValue(serverListKeyFor(tssPair.second)).get().get());
(*tssMapping)[tssPair.first] = tssi;
}
// If we accumulated private mutations for cached key-ranges, we also need to
// tag them with the relevant storage servers. This is done to make the storage
// servers aware of the cached key-ranges
@ -666,6 +760,7 @@ void applyMetadataMutations(SpanID const& spanContext,
&proxyCommitData.committedVersion,
&proxyCommitData.storageCache,
&proxyCommitData.tag_popped,
&proxyCommitData.tssMapping,
initialCommit);
}
@ -695,5 +790,6 @@ void applyMetadataMutations(SpanID const& spanContext,
/* commitVersion= */ nullptr,
/* storageCache= */ nullptr,
/* tag_popped= */ nullptr,
/* tssMapping= */ nullptr,
/* initialCommit= */ false);
}

View File

@ -25,6 +25,7 @@
#include "fdbclient/SystemData.h"
#include "fdbserver/BackupInterface.h"
#include "fdbserver/BackupProgress.actor.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/LogProtocolMessage.h"
#include "fdbserver/LogSystem.h"
#include "fdbserver/ServerDBInfo.h"

View File

@ -83,6 +83,8 @@ set(FDBSERVER_SRCS
RestoreLoader.actor.cpp
RestoreWorker.actor.h
RestoreWorker.actor.cpp
RestoreWorkerInterface.actor.cpp
RestoreWorkerInterface.actor.h
Resolver.actor.cpp
ResolverInterface.h
ServerDBInfo.actor.h
@ -103,6 +105,8 @@ set(FDBSERVER_SRCS
TesterInterface.actor.h
TLogInterface.h
TLogServer.actor.cpp
TSSMappingUtil.actor.h
TSSMappingUtil.actor.cpp
VersionedBTree.actor.cpp
VFSAsync.h
VFSAsync.cpp

View File

@ -599,8 +599,8 @@ public:
std::vector<std::tuple<ProcessClass::Fitness, int, bool, int, Field>> orderedFields;
for (auto& it : fieldsWithMin) {
auto& fitness = field_fitness[it];
orderedFields.push_back(std::make_tuple(
std::get<0>(fitness), std::get<1>(fitness), std::get<2>(fitness), field_count[it], it));
orderedFields.emplace_back(
std::get<0>(fitness), std::get<1>(fitness), std::get<2>(fitness), field_count[it], it);
}
std::sort(orderedFields.begin(), orderedFields.end());
int totalFields = desired / minPerField;
@ -1692,20 +1692,37 @@ public:
if (req.configuration.regions.size() > 1) {
std::vector<RegionInfo> regions = req.configuration.regions;
if (regions[0].priority == regions[1].priority && regions[1].dcId == clusterControllerDcId.get()) {
TraceEvent("CCSwitchPrimaryDc", id)
.detail("CCDcId", clusterControllerDcId.get())
.detail("OldPrimaryDcId", regions[0].dcId)
.detail("NewPrimaryDcId", regions[1].dcId);
std::swap(regions[0], regions[1]);
}
if (regions[1].dcId == clusterControllerDcId.get() &&
(!versionDifferenceUpdated || datacenterVersionDifference >= SERVER_KNOBS->MAX_VERSION_DIFFERENCE)) {
if (regions[1].priority >= 0) {
TraceEvent("CCSwitchPrimaryDcVersionDifference", id)
.detail("CCDcId", clusterControllerDcId.get())
.detail("OldPrimaryDcId", regions[0].dcId)
.detail("NewPrimaryDcId", regions[1].dcId);
std::swap(regions[0], regions[1]);
} else {
TraceEvent(SevWarnAlways, "CCDcPriorityNegative")
.detail("DcId", regions[1].dcId)
.detail("Priority", regions[1].priority);
.detail("Priority", regions[1].priority)
.detail("FindWorkersInDc", regions[0].dcId)
.detail("Warning", "Failover did not happen but CC is in remote DC");
}
}
TraceEvent("CCFindWorkersForConfiguration", id)
.detail("CCDcId", clusterControllerDcId.get())
.detail("Region0DcId", regions[0].dcId)
.detail("Region1DcId", regions[1].dcId)
.detail("DatacenterVersionDifference", datacenterVersionDifference)
.detail("VersionDifferenceUpdated", versionDifferenceUpdated);
bool setPrimaryDesired = false;
try {
auto reply = findWorkersForConfigurationFromDC(req, regions[0].dcId);
@ -1719,6 +1736,10 @@ public:
} else if (regions[0].dcId == clusterControllerDcId.get()) {
return reply.get();
}
TraceEvent(SevWarn, "CCRecruitmentFailed", id)
.detail("Reason", "Recruited Txn system and CC are in different DCs")
.detail("CCDcId", clusterControllerDcId.get())
.detail("RecruitedTxnSystemDcId", regions[0].dcId);
throw no_more_servers();
} catch (Error& e) {
if (!goodRemoteRecruitmentTime.isReady() && regions[1].dcId != clusterControllerDcId.get()) {
@ -1728,7 +1749,9 @@ public:
if (e.code() != error_code_no_more_servers || regions[1].priority < 0) {
throw;
}
TraceEvent(SevWarn, "AttemptingRecruitmentInRemoteDC", id).error(e);
TraceEvent(SevWarn, "AttemptingRecruitmentInRemoteDc", id)
.detail("SetPrimaryDesired", setPrimaryDesired)
.error(e);
auto reply = findWorkersForConfigurationFromDC(req, regions[1].dcId);
if (!setPrimaryDesired) {
vector<Optional<Key>> dcPriority;
@ -3382,6 +3405,7 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co
if (db->clientInfo->get().commitProxies != req.commitProxies ||
db->clientInfo->get().grvProxies != req.grvProxies) {
isChanged = true;
// TODO why construct a new one and not just copy the old one and change proxies + id?
ClientDBInfo clientInfo;
clientInfo.id = deterministicRandom()->randomUniqueID();
clientInfo.commitProxies = req.commitProxies;
@ -3874,7 +3898,7 @@ ACTOR Future<Void> monitorGlobalConfig(ClusterControllerData::DBInfo* db) {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
state Optional<Value> globalConfigVersion = wait(tr.get(globalConfigVersionKey));
state ClientDBInfo clientInfo = db->clientInfo->get();
state ClientDBInfo clientInfo = db->serverInfo->get().client;
if (globalConfigVersion.present()) {
// Since the history keys end with versionstamps, they
@ -3932,6 +3956,14 @@ ACTOR Future<Void> monitorGlobalConfig(ClusterControllerData::DBInfo* db) {
}
clientInfo.id = deterministicRandom()->randomUniqueID();
// Update ServerDBInfo so fdbserver processes receive updated history.
ServerDBInfo serverInfo = db->serverInfo->get();
serverInfo.id = deterministicRandom()->randomUniqueID();
serverInfo.infoGeneration = ++db->dbInfoCount;
serverInfo.client = clientInfo;
db->serverInfo->set(serverInfo);
// Update ClientDBInfo so client processes receive updated history.
db->clientInfo->set(clientInfo);
}
@ -4411,6 +4443,7 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
self.addActor.send(handleForcedRecoveries(&self, interf));
self.addActor.send(monitorDataDistributor(&self));
self.addActor.send(monitorRatekeeper(&self));
// self.addActor.send(monitorTSSMapping(&self));
self.addActor.send(dbInfoUpdater(&self));
self.addActor.send(traceCounters("ClusterControllerMetrics",
self.id,

View File

@ -42,6 +42,7 @@
#include "fdbserver/ProxyCommitData.actor.h"
#include "fdbserver/RatekeeperInterface.h"
#include "fdbserver/RecoveryState.h"
#include "fdbserver/RestoreUtil.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "flow/ActorCollection.h"
@ -1431,11 +1432,26 @@ ACTOR Future<Void> commitBatch(ProxyCommitData* self,
return Void();
}
// Add tss mapping data to the reply, if any of the included storage servers have a TSS pair.
// `included` tracks which storage-server UIDs already have their mapping in the reply so a
// pair is appended at most once per response.
void maybeAddTssMapping(GetKeyServerLocationsReply& reply,
                        ProxyCommitData* commitData,
                        std::unordered_set<UID>& included,
                        UID ssId) {
	// Mapping for this storage server was already appended to the reply.
	if (included.count(ssId)) {
		return;
	}
	auto mappingItr = commitData->tssMapping.find(ssId);
	// This storage server has no TSS pair.
	if (mappingItr == commitData->tssMapping.end()) {
		return;
	}
	included.insert(ssId);
	reply.resultsTssMapping.push_back(*mappingItr);
}
ACTOR static Future<Void> doKeyServerLocationRequest(GetKeyServerLocationsRequest req, ProxyCommitData* commitData) {
// We can't respond to these requests until we have valid txnStateStore
wait(commitData->validState.getFuture());
wait(delay(0, TaskPriority::DefaultEndpoint));
std::unordered_set<UID> tssMappingsIncluded;
GetKeyServerLocationsReply rep;
if (!req.end.present()) {
auto r = req.reverse ? commitData->keyInfo.rangeContainingKeyBefore(req.begin)
@ -1444,8 +1460,9 @@ ACTOR static Future<Void> doKeyServerLocationRequest(GetKeyServerLocationsReques
ssis.reserve(r.value().src_info.size());
for (auto& it : r.value().src_info) {
ssis.push_back(it->interf);
maybeAddTssMapping(rep, commitData, tssMappingsIncluded, it->interf.id());
}
rep.results.push_back(std::make_pair(r.range(), ssis));
rep.results.emplace_back(r.range(), ssis);
} else if (!req.reverse) {
int count = 0;
for (auto r = commitData->keyInfo.rangeContaining(req.begin);
@ -1455,8 +1472,9 @@ ACTOR static Future<Void> doKeyServerLocationRequest(GetKeyServerLocationsReques
ssis.reserve(r.value().src_info.size());
for (auto& it : r.value().src_info) {
ssis.push_back(it->interf);
maybeAddTssMapping(rep, commitData, tssMappingsIncluded, it->interf.id());
}
rep.results.push_back(std::make_pair(r.range(), ssis));
rep.results.emplace_back(r.range(), ssis);
count++;
}
} else {
@ -1467,8 +1485,9 @@ ACTOR static Future<Void> doKeyServerLocationRequest(GetKeyServerLocationsReques
ssis.reserve(r.value().src_info.size());
for (auto& it : r.value().src_info) {
ssis.push_back(it->interf);
maybeAddTssMapping(rep, commitData, tssMappingsIncluded, it->interf.id());
}
rep.results.push_back(std::make_pair(r.range(), ssis));
rep.results.emplace_back(r.range(), ssis);
if (r == commitData->keyInfo.ranges().begin()) {
break;
}

View File

@ -406,8 +406,8 @@ ACTOR Future<Void> leaderRegister(LeaderElectionRegInterface interf, Key key) {
// If the current leader's priority became worse, we still need to notified all clients because now one
// of them might be better than the leader. In addition, even though FitnessRemote is better than
// FitnessUnknown, we still need to notified clients so that monitorLeaderRemotely has a chance to switch
// from passively monitoring the leader to actively attempting to become the leader.
// FitnessUnknown, we still need to notified clients so that monitorLeaderRemotely has a chance to
// switch from passively monitoring the leader to actively attempting to become the leader.
if (!currentNominee.present() || !nextNominee.present() ||
!currentNominee.get().equalInternalId(nextNominee.get()) ||
nextNominee.get() > currentNominee.get() ||
@ -545,15 +545,30 @@ struct LeaderRegisterCollection {
}
};
// extract the prefix descriptor from cluster id
// For a cluster key of the form "<description>:<id>", returns the "<description>" part.
// NOTE(review): relies on StringRef::eat(":") returning the bytes preceding the first ":"
// (consuming through it) — semantics inferred from usage; confirm against flow's StringRef.
StringRef getClusterDescriptor(Key key) {
	StringRef str = key.contents();
	return str.eat(":");
}
// leaderServer multiplexes multiple leaderRegisters onto a single LeaderElectionRegInterface,
// creating and destroying them on demand.
ACTOR Future<Void> leaderServer(LeaderElectionRegInterface interf, OnDemandStore* pStore, UID id) {
ACTOR Future<Void> leaderServer(LeaderElectionRegInterface interf,
OnDemandStore* pStore,
UID id,
Reference<ClusterConnectionFile> ccf) {
state LeaderRegisterCollection regs(pStore);
state ActorCollection forwarders(false);
wait(LeaderRegisterCollection::init(&regs));
loop choose {
when(CheckDescriptorMutableRequest req = waitNext(interf.checkDescriptorMutable.getFuture())) {
// Note the response returns the value of a knob enforced by checking only one coordinator. It is not
// quorum based.
CheckDescriptorMutableReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT);
req.reply.send(rep);
}
when(OpenDatabaseCoordRequest req = waitNext(interf.openDatabase.getFuture())) {
Optional<LeaderInfo> forward = regs.getForward(req.clusterKey);
if (forward.present()) {
@ -561,49 +576,111 @@ ACTOR Future<Void> leaderServer(LeaderElectionRegInterface interf, OnDemandStore
info.id = deterministicRandom()->randomUniqueID();
info.forward = forward.get().serializedInfo;
req.reply.send(CachedSerialization<ClientDBInfo>(info));
} else {
StringRef clusterName = ccf->getConnectionString().clusterKeyName();
if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT &&
getClusterDescriptor(req.clusterKey).compare(clusterName)) {
TraceEvent(SevWarn, "CCFMismatch")
.detail("RequestType", "OpenDatabaseCoordRequest")
.detail("LocalCS", ccf->getConnectionString().toString())
.detail("IncomingClusterKey", req.clusterKey)
.detail("IncomingCoordinators", describeList(req.coordinators, req.coordinators.size()));
req.reply.sendError(wrong_connection_file());
} else {
regs.getInterface(req.clusterKey, id).openDatabase.send(req);
}
}
}
when(ElectionResultRequest req = waitNext(interf.electionResult.getFuture())) {
Optional<LeaderInfo> forward = regs.getForward(req.key);
if (forward.present()) {
req.reply.send(forward.get());
} else {
StringRef clusterName = ccf->getConnectionString().clusterKeyName();
if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && getClusterDescriptor(req.key).compare(clusterName)) {
TraceEvent(SevWarn, "CCFMismatch")
.detail("RequestType", "ElectionResultRequest")
.detail("LocalCS", ccf->getConnectionString().toString())
.detail("IncomingClusterKey", req.key)
.detail("ClusterKey", ccf->getConnectionString().clusterKey())
.detail("IncomingCoordinators", describeList(req.coordinators, req.coordinators.size()));
req.reply.sendError(wrong_connection_file());
} else {
regs.getInterface(req.key, id).electionResult.send(req);
}
}
}
when(GetLeaderRequest req = waitNext(interf.getLeader.getFuture())) {
Optional<LeaderInfo> forward = regs.getForward(req.key);
if (forward.present())
req.reply.send(forward.get());
else
else {
StringRef clusterName = ccf->getConnectionString().clusterKeyName();
if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && getClusterDescriptor(req.key).compare(clusterName)) {
TraceEvent(SevWarn, "CCFMismatch")
.detail("RequestType", "GetLeaderRequest")
.detail("LocalCS", ccf->getConnectionString().toString())
.detail("IncomingClusterKey", req.key)
.detail("ClusterKey", ccf->getConnectionString().clusterKey());
req.reply.sendError(wrong_connection_file());
} else {
regs.getInterface(req.key, id).getLeader.send(req);
}
}
}
when(CandidacyRequest req = waitNext(interf.candidacy.getFuture())) {
Optional<LeaderInfo> forward = regs.getForward(req.key);
if (forward.present())
req.reply.send(forward.get());
else
else {
StringRef clusterName = ccf->getConnectionString().clusterKeyName();
if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && getClusterDescriptor(req.key).compare(clusterName)) {
TraceEvent(SevWarn, "CCFMismatch")
.detail("RequestType", "CandidacyRequest")
.detail("LocalCS", ccf->getConnectionString().toString())
.detail("IncomingClusterKey", req.key);
req.reply.sendError(wrong_connection_file());
} else {
regs.getInterface(req.key, id).candidacy.send(req);
}
}
}
when(LeaderHeartbeatRequest req = waitNext(interf.leaderHeartbeat.getFuture())) {
Optional<LeaderInfo> forward = regs.getForward(req.key);
if (forward.present())
req.reply.send(LeaderHeartbeatReply{ false });
else
else {
StringRef clusterName = ccf->getConnectionString().clusterKeyName();
if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && getClusterDescriptor(req.key).compare(clusterName)) {
TraceEvent(SevWarn, "CCFMismatch")
.detail("RequestType", "LeaderHeartbeatRequest")
.detail("LocalCS", ccf->getConnectionString().toString())
.detail("IncomingClusterKey", req.key);
req.reply.sendError(wrong_connection_file());
} else {
regs.getInterface(req.key, id).leaderHeartbeat.send(req);
}
}
}
when(ForwardRequest req = waitNext(interf.forward.getFuture())) {
Optional<LeaderInfo> forward = regs.getForward(req.key);
if (forward.present())
req.reply.send(Void());
else {
forwarders.add(
LeaderRegisterCollection::setForward(&regs, req.key, ClusterConnectionString(req.conn.toString())));
StringRef clusterName = ccf->getConnectionString().clusterKeyName();
if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && getClusterDescriptor(req.key).compare(clusterName)) {
TraceEvent(SevWarn, "CCFMismatch")
.detail("RequestType", "ForwardRequest")
.detail("LocalCS", ccf->getConnectionString().toString())
.detail("IncomingClusterKey", req.key);
req.reply.sendError(wrong_connection_file());
} else {
forwarders.add(LeaderRegisterCollection::setForward(
&regs, req.key, ClusterConnectionString(req.conn.toString())));
regs.getInterface(req.key, id).forward.send(req);
}
}
}
when(wait(forwarders.getResult())) {
ASSERT(false);
throw internal_error();
@ -611,7 +688,7 @@ ACTOR Future<Void> leaderServer(LeaderElectionRegInterface interf, OnDemandStore
}
}
ACTOR Future<Void> coordinationServer(std::string dataFolder) {
ACTOR Future<Void> coordinationServer(std::string dataFolder, Reference<ClusterConnectionFile> ccf) {
state UID myID = deterministicRandom()->randomUniqueID();
state LeaderElectionRegInterface myLeaderInterface(g_network);
state GenerationRegInterface myInterface(g_network);
@ -622,7 +699,7 @@ ACTOR Future<Void> coordinationServer(std::string dataFolder) {
.detail("Folder", dataFolder);
try {
wait(localGenerationReg(myInterface, &store) || leaderServer(myLeaderInterface, &store, myID) ||
wait(localGenerationReg(myInterface, &store) || leaderServer(myLeaderInterface, &store, myID, ccf) ||
store.getError());
throw internal_error();
} catch (Error& e) {

View File

@ -225,6 +225,6 @@ public:
vector<GenerationRegInterface> stateServers;
};
Future<Void> coordinationServer(std::string const& dataFolder);
Future<Void> coordinationServer(std::string const& dataFolder, Reference<ClusterConnectionFile> const& ccf);
#endif

File diff suppressed because it is too large Load Diff

View File

@ -263,6 +263,7 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure,
MoveKeysLock lock,
PromiseStream<Promise<int64_t>> getAverageShardBytes,
PromiseStream<Promise<int>> getUnhealthyRelocationCount,
UID distributorId,
int teamSize,
int singleRegionTeamSize,

View File

@ -1032,7 +1032,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueueData* self, RelocateData rd,
anyWithSource = true;
}
bestTeams.push_back(std::make_pair(bestTeam.first.get(), bestTeam.second));
bestTeams.emplace_back(bestTeam.first.get(), bestTeam.second);
tciIndex++;
}
if (foundTeams && anyHealthy) {
@ -1550,6 +1550,7 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure,
MoveKeysLock lock,
PromiseStream<Promise<int64_t>> getAverageShardBytes,
PromiseStream<Promise<int>> getUnhealthyRelocationCount,
UID distributorId,
int teamSize,
int singleRegionTeamSize,
@ -1679,6 +1680,9 @@ ACTOR Future<Void> dataDistributionQueue(Database cx,
}
when(wait(self.error.getFuture())) {} // Propagate errors from dataDistributionRelocator
when(wait(waitForAll(balancingFutures))) {}
when(Promise<int> r = waitNext(getUnhealthyRelocationCount.getFuture())) {
r.send(self.unhealthyRelocations);
}
}
}
} catch (Error& e) {

View File

@ -176,8 +176,8 @@ ShardSizeBounds getShardSizeBounds(KeyRangeRef shard, int64_t maxShardSize) {
}
int64_t getMaxShardSize(double dbSizeEstimate) {
return std::min((SERVER_KNOBS->MIN_SHARD_BYTES +
(int64_t)std::sqrt(dbSizeEstimate) * SERVER_KNOBS->SHARD_BYTES_PER_SQRT_BYTES) *
return std::min((SERVER_KNOBS->MIN_SHARD_BYTES + (int64_t)std::sqrt(std::max<double>(dbSizeEstimate, 0)) *
SERVER_KNOBS->SHARD_BYTES_PER_SQRT_BYTES) *
SERVER_KNOBS->SHARD_BYTES_RATIO,
(int64_t)SERVER_KNOBS->MAX_SHARD_BYTES);
}

View File

@ -832,7 +832,7 @@ public:
int count = end - begin;
numItems = count;
nodeBytesDeleted = 0;
initialHeight = (uint8_t)log2(count) + 1;
initialHeight = count ? (uint8_t)log2(count) + 1 : 0;
maxHeight = 0;
// The boundary leading to the new page acts as the last time we branched right

View File

@ -148,7 +148,10 @@ ACTOR Future<int> spawnProcess(std::string path,
state pid_t pid = pidAndReadFD.first;
state Optional<int> readFD = pidAndReadFD.second;
if (pid == -1) {
TraceEvent(SevWarnAlways, "SpawnProcess: Command failed to spawn").detail("Cmd", path).detail("Args", allArgs);
TraceEvent(SevWarnAlways, "SpawnProcessFailure")
.detail("Reason", "Command failed to spawn")
.detail("Cmd", path)
.detail("Args", allArgs);
return -1;
} else if (pid > 0) {
state int status = -1;
@ -160,7 +163,8 @@ ACTOR Future<int> spawnProcess(std::string path,
if (runTime > maxWaitTime) {
// timing out
TraceEvent(SevWarnAlways, "SpawnProcess : Command failed, timeout")
TraceEvent(SevWarnAlways, "SpawnProcessFailure")
.detail("Reason", "Command failed, timeout")
.detail("Cmd", path)
.detail("Args", allArgs);
return -1;
@ -175,9 +179,10 @@ ACTOR Future<int> spawnProcess(std::string path,
}
if (err < 0) {
TraceEvent event(SevWarnAlways, "SpawnProcess : Command failed");
TraceEvent event(SevWarnAlways, "SpawnProcessFailure");
setupTraceWithOutput(event, bytesRead, outputBuffer);
event.detail("Cmd", path)
event.detail("Reason", "Command failed")
.detail("Cmd", path)
.detail("Args", allArgs)
.detail("Errno", WIFEXITED(status) ? WEXITSTATUS(status) : -1);
return -1;
@ -194,14 +199,15 @@ ACTOR Future<int> spawnProcess(std::string path,
} else {
// child process completed
if (!(WIFEXITED(status) && WEXITSTATUS(status) == 0)) {
TraceEvent event(SevWarnAlways, "SpawnProcess : Command failed");
TraceEvent event(SevWarnAlways, "SpawnProcessFailure");
setupTraceWithOutput(event, bytesRead, outputBuffer);
event.detail("Cmd", path)
event.detail("Reason", "Command failed")
.detail("Cmd", path)
.detail("Args", allArgs)
.detail("Errno", WIFEXITED(status) ? WEXITSTATUS(status) : -1);
return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
}
TraceEvent event("SpawnProcess : Command status");
TraceEvent event("SpawnProcessCommandStatus");
setupTraceWithOutput(event, bytesRead, outputBuffer);
event.detail("Cmd", path)
.detail("Args", allArgs)

View File

@ -109,15 +109,18 @@ struct GrvProxyStats {
SERVER_KNOBS->LATENCY_SAMPLE_SIZE),
grvLatencyBands("GRVLatencyMetrics", id, SERVER_KNOBS->STORAGE_LOGGING_DELAY) {
// The rate at which the limit(budget) is allowed to grow.
specialCounter(cc, "SystemAndDefaultTxnRateAllowed", [this]() { return this->transactionRateAllowed; });
specialCounter(cc, "BatchTransactionRateAllowed", [this]() { return this->batchTransactionRateAllowed; });
specialCounter(cc, "SystemAndDefaultTxnLimit", [this]() { return this->transactionLimit; });
specialCounter(cc, "BatchTransactionLimit", [this]() { return this->batchTransactionLimit; });
specialCounter(cc, "PercentageOfDefaultGRVQueueProcessed", [this]() {
return this->percentageOfDefaultGRVQueueProcessed;
});
specialCounter(
cc, "PercentageOfBatchGRVQueueProcessed", [this]() { return this->percentageOfBatchGRVQueueProcessed; });
cc, "SystemAndDefaultTxnRateAllowed", [this]() { return int64_t(this->transactionRateAllowed); });
specialCounter(
cc, "BatchTransactionRateAllowed", [this]() { return int64_t(this->batchTransactionRateAllowed); });
specialCounter(cc, "SystemAndDefaultTxnLimit", [this]() { return int64_t(this->transactionLimit); });
specialCounter(cc, "BatchTransactionLimit", [this]() { return int64_t(this->batchTransactionLimit); });
specialCounter(cc, "PercentageOfDefaultGRVQueueProcessed", [this]() {
return int64_t(100 * this->percentageOfDefaultGRVQueueProcessed);
});
specialCounter(cc, "PercentageOfBatchGRVQueueProcessed", [this]() {
return int64_t(100 * this->percentageOfBatchGRVQueueProcessed);
});
logger = traceCounters("GrvProxyMetrics", id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "GrvProxyMetrics");
for (int i = 0; i < FLOW_KNOBS->BASIC_LOAD_BALANCE_BUCKETS; i++) {
@ -831,8 +834,10 @@ ACTOR static Future<Void> transactionStarter(GrvProxyInterface proxy,
}
span = Span(span.location);
grvProxyData->stats.percentageOfDefaultGRVQueueProcessed = (double)defaultGRVProcessed / defaultQueueSize;
grvProxyData->stats.percentageOfBatchGRVQueueProcessed = (double)batchGRVProcessed / batchQueueSize;
grvProxyData->stats.percentageOfDefaultGRVQueueProcessed =
defaultQueueSize ? (double)defaultGRVProcessed / defaultQueueSize : 1;
grvProxyData->stats.percentageOfBatchGRVQueueProcessed =
batchQueueSize ? (double)batchGRVProcessed / batchQueueSize : 1;
}
}

View File

@ -56,9 +56,6 @@ public:
if (userData != nullptr && userDataDestructor != nullptr) {
userDataDestructor(userData);
}
if (buffer != nullptr) {
VALGRIND_MAKE_MEM_UNDEFINED(buffer, bufferSize);
}
}
uint8_t const* begin() const { return (uint8_t*)buffer; }

View File

@ -401,7 +401,7 @@ private:
if (o->op == OpSet) {
if (sequential) {
KeyValueMapPair pair(o->p1, o->p2);
dataSets.push_back(std::make_pair(pair, pair.arena.getSize() + data.getElementBytes()));
dataSets.emplace_back(pair, pair.arena.getSize() + data.getElementBytes());
} else {
data.insert(o->p1, o->p2);
}

View File

@ -131,6 +131,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( PRIORITY_RECOVER_MOVE, 110 );
init( PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, 120 );
init( PRIORITY_REBALANCE_OVERUTILIZED_TEAM, 121 );
init( PRIORITY_PERPETUAL_STORAGE_WIGGLE, 140 );
init( PRIORITY_TEAM_HEALTHY, 140 );
init( PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER, 150 );
init( PRIORITY_TEAM_REDUNDANT, 200 );
@ -217,6 +218,9 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( SERVER_LIST_DELAY, 1.0 );
init( RECRUITMENT_IDLE_DELAY, 1.0 );
init( STORAGE_RECRUITMENT_DELAY, 10.0 );
init( TSS_HACK_IDENTITY_MAPPING, false ); // THIS SHOULD NEVER BE SET IN PROD. Only for performance testing
init( TSS_RECRUITMENT_TIMEOUT, 3*STORAGE_RECRUITMENT_DELAY ); if (randomize && BUGGIFY ) TSS_RECRUITMENT_TIMEOUT = 1.0; // Super low timeout should cause tss recruitments to fail
init( TSS_DD_CHECK_INTERVAL, 60.0 ); if (randomize && BUGGIFY ) TSS_DD_CHECK_INTERVAL = 1.0; // May kill all TSS quickly
init( DATA_DISTRIBUTION_LOGGING_INTERVAL, 5.0 );
init( DD_ENABLED_CHECK_DELAY, 1.0 );
init( DD_STALL_CHECK_DELAY, 0.4 ); //Must be larger than 2*MAX_BUGGIFIED_DELAY
@ -250,6 +254,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
init( DD_TEAMS_INFO_PRINT_INTERVAL, 60 ); if( randomize && BUGGIFY ) DD_TEAMS_INFO_PRINT_INTERVAL = 10;
init( DD_TEAMS_INFO_PRINT_YIELD_COUNT, 100 ); if( randomize && BUGGIFY ) DD_TEAMS_INFO_PRINT_YIELD_COUNT = deterministicRandom()->random01() * 1000 + 1;
init( DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY, 120 ); if( randomize && BUGGIFY ) DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY = 5;
init( DD_STORAGE_WIGGLE_PAUSE_THRESHOLD, 1 ); if( randomize && BUGGIFY ) DD_STORAGE_WIGGLE_PAUSE_THRESHOLD = 10;
// TeamRemover
init( TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
@ -631,6 +636,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi
// Coordination
init( COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL, 1.0 ); if( randomize && BUGGIFY ) COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL = 10.0;
init( ENABLE_CROSS_CLUSTER_SUPPORT, true ); if( randomize && BUGGIFY ) ENABLE_CROSS_CLUSTER_SUPPORT = false;
// Buggification
init( BUGGIFIED_EVENTUAL_CONSISTENCY, 1.0 );

View File

@ -133,6 +133,7 @@ public:
int PRIORITY_RECOVER_MOVE;
int PRIORITY_REBALANCE_UNDERUTILIZED_TEAM;
int PRIORITY_REBALANCE_OVERUTILIZED_TEAM;
int PRIORITY_PERPETUAL_STORAGE_WIGGLE;
int PRIORITY_TEAM_HEALTHY;
int PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER;
int PRIORITY_TEAM_REDUNDANT;
@ -167,6 +168,9 @@ public:
double SERVER_LIST_DELAY;
double RECRUITMENT_IDLE_DELAY;
double STORAGE_RECRUITMENT_DELAY;
bool TSS_HACK_IDENTITY_MAPPING;
double TSS_RECRUITMENT_TIMEOUT;
double TSS_DD_CHECK_INTERVAL;
double DATA_DISTRIBUTION_LOGGING_INTERVAL;
double DD_ENABLED_CHECK_DELAY;
double DD_STALL_CHECK_DELAY;
@ -200,6 +204,7 @@ public:
int DD_TEAMS_INFO_PRINT_INTERVAL;
int DD_TEAMS_INFO_PRINT_YIELD_COUNT;
int DD_TEAM_ZERO_SERVER_LEFT_LOG_DELAY;
int DD_STORAGE_WIGGLE_PAUSE_THRESHOLD; // How many unhealthy relocations are ongoing will pause storage wiggle
// TeamRemover to remove redundant teams
bool TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER; // disable the machineTeamRemover actor
@ -559,6 +564,8 @@ public:
// Coordination
double COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL;
bool ENABLE_CROSS_CLUSTER_SUPPORT; // Allow a coordinator to serve requests whose connection string does not match
// the local descriptor
// Buggification
double BUGGIFIED_EVENTUAL_CONSISTENCY;

View File

@ -21,6 +21,7 @@
#include "fdbrpc/FailureMonitor.h"
#include "fdbrpc/Locality.h"
#include "fdbserver/CoordinationInterface.h"
#include "fdbserver/Knobs.h"
#include "fdbclient/MonitorLeader.h"
#include "flow/actorcompiler.h" // This must be the last #include.

View File

@ -175,22 +175,22 @@ struct LogRouterData {
specialCounter(cc, "WaitForVersionMS", [this]() {
double val = this->waitForVersionTime;
this->waitForVersionTime = 0;
return 1000 * val;
return int64_t(1000 * val);
});
specialCounter(cc, "WaitForVersionMaxMS", [this]() {
double val = this->maxWaitForVersionTime;
this->maxWaitForVersionTime = 0;
return 1000 * val;
return int64_t(1000 * val);
});
specialCounter(cc, "GetMoreMS", [this]() {
double val = this->getMoreTime;
this->getMoreTime = 0;
return 1000 * val;
return int64_t(1000 * val);
});
specialCounter(cc, "GetMoreMaxMS", [this]() {
double val = this->maxGetMoreTime;
this->maxGetMoreTime = 0;
return 1000 * val;
return int64_t(1000 * val);
});
specialCounter(cc, "Generation", [this]() { return this->generation; });
logger = traceCounters("LogRouterMetrics",

View File

@ -410,6 +410,8 @@ struct ILogSystem {
virtual Optional<UID> getPrimaryPeekLocation() const = 0;
virtual Optional<UID> getCurrentPeekLocation() const = 0;
virtual void addref() = 0;
virtual void delref() = 0;
@ -473,6 +475,7 @@ struct ILogSystem {
Version popped() const override;
Version getMinKnownCommittedVersion() const override;
Optional<UID> getPrimaryPeekLocation() const override;
Optional<UID> getCurrentPeekLocation() const override;
void addref() override { ReferenceCounted<ServerPeekCursor>::addref(); }
@ -534,6 +537,7 @@ struct ILogSystem {
Version popped() const override;
Version getMinKnownCommittedVersion() const override;
Optional<UID> getPrimaryPeekLocation() const override;
Optional<UID> getCurrentPeekLocation() const override;
void addref() override { ReferenceCounted<MergedPeekCursor>::addref(); }
@ -589,6 +593,7 @@ struct ILogSystem {
Version popped() const override;
Version getMinKnownCommittedVersion() const override;
Optional<UID> getPrimaryPeekLocation() const override;
Optional<UID> getCurrentPeekLocation() const override;
void addref() override { ReferenceCounted<SetPeekCursor>::addref(); }
@ -620,6 +625,7 @@ struct ILogSystem {
Version popped() const override;
Version getMinKnownCommittedVersion() const override;
Optional<UID> getPrimaryPeekLocation() const override;
Optional<UID> getCurrentPeekLocation() const override;
void addref() override { ReferenceCounted<MultiCursor>::addref(); }
@ -698,6 +704,7 @@ struct ILogSystem {
Version popped() const override;
Version getMinKnownCommittedVersion() const override;
Optional<UID> getPrimaryPeekLocation() const override;
Optional<UID> getCurrentPeekLocation() const override;
void addref() override { ReferenceCounted<BufferedCursor>::addref(); }

View File

@ -393,12 +393,16 @@ Version ILogSystem::ServerPeekCursor::getMinKnownCommittedVersion() const {
}
Optional<UID> ILogSystem::ServerPeekCursor::getPrimaryPeekLocation() const {
if (interf) {
if (interf && interf->get().present()) {
return interf->get().id();
}
return Optional<UID>();
}
// A ServerPeekCursor peeks from a single server, so the current peek location is the
// same as the primary one. The fully-qualified call deliberately avoids virtual dispatch.
Optional<UID> ILogSystem::ServerPeekCursor::getCurrentPeekLocation() const {
	return ILogSystem::ServerPeekCursor::getPrimaryPeekLocation();
}
// Returns the cached popped version tracked by this cursor.
Version ILogSystem::ServerPeekCursor::popped() const {
	return poppedVersion;
}
@ -673,6 +677,13 @@ Optional<UID> ILogSystem::MergedPeekCursor::getPrimaryPeekLocation() const {
return Optional<UID>();
}
// Reports the peek location of the cursor the merge is currently reading from,
// or an empty Optional if no cursor is currently selected.
Optional<UID> ILogSystem::MergedPeekCursor::getCurrentPeekLocation() const {
	if (currentCursor < 0) {
		return Optional<UID>();
	}
	return serverCursors[currentCursor]->getPrimaryPeekLocation();
}
Version ILogSystem::MergedPeekCursor::popped() const {
Version poppedVersion = 0;
for (auto& c : serverCursors)
@ -1023,6 +1034,13 @@ Optional<UID> ILogSystem::SetPeekCursor::getPrimaryPeekLocation() const {
return Optional<UID>();
}
// Reports the peek location of the currently-selected cursor. Both a set and a
// cursor within that set must be selected for a location to exist.
Optional<UID> ILogSystem::SetPeekCursor::getCurrentPeekLocation() const {
	if (currentCursor < 0 || currentSet < 0) {
		return Optional<UID>();
	}
	return serverCursors[currentSet][currentCursor]->getPrimaryPeekLocation();
}
Version ILogSystem::SetPeekCursor::popped() const {
Version poppedVersion = 0;
for (auto& cursors : serverCursors) {
@ -1123,6 +1141,10 @@ Optional<UID> ILogSystem::MultiCursor::getPrimaryPeekLocation() const {
return cursors.back()->getPrimaryPeekLocation();
}
// Delegates to the active (last) cursor in the chain.
Optional<UID> ILogSystem::MultiCursor::getCurrentPeekLocation() const {
	const auto& activeCursor = cursors.back();
	return activeCursor->getCurrentPeekLocation();
}
Version ILogSystem::MultiCursor::popped() const {
return std::max(poppedVersion, cursors.back()->popped());
}
@ -1403,6 +1425,10 @@ Optional<UID> ILogSystem::BufferedCursor::getPrimaryPeekLocation() const {
return Optional<UID>();
}
// A buffered cursor aggregates messages from multiple sources, so no single
// current peek location can be reported.
Optional<UID> ILogSystem::BufferedCursor::getCurrentPeekLocation() const {
	return {};
}
Version ILogSystem::BufferedCursor::popped() const {
if (initialPoppedVersion == poppedVersion) {
return 0;

View File

@ -20,9 +20,11 @@
#include "flow/Util.h"
#include "fdbrpc/FailureMonitor.h"
#include "fdbclient/KeyBackedTypes.h"
#include "fdbclient/SystemData.h"
#include "fdbserver/MoveKeys.actor.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/TSSMappingUtil.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
using std::max;
@ -158,7 +160,7 @@ ACTOR Future<Optional<UID>> checkReadWrite(Future<ErrorOr<GetShardStateReply>> f
return Optional<UID>(uid);
}
Future<Void> removeOldDestinations(Transaction* tr,
Future<Void> removeOldDestinations(Reference<ReadYourWritesTransaction> tr,
UID oldDest,
VectorRef<KeyRangeRef> shards,
KeyRangeRef currentKeys) {
@ -235,7 +237,7 @@ ACTOR Future<vector<UID>> addReadWriteDestinations(KeyRangeRef shard,
}
ACTOR Future<vector<vector<UID>>> additionalSources(RangeResult shards,
Transaction* tr,
Reference<ReadYourWritesTransaction> tr,
int desiredHealthy,
int maxServers) {
state RangeResult UIDtoTagMap = wait(tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY));
@ -320,6 +322,7 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
MoveKeysLock lock,
FlowLock* startMoveKeysLock,
UID relocationIntervalId,
std::map<UID, StorageServerInterface>* tssMapping,
const DDEnabledState* ddEnabledState) {
state TraceInterval interval("RelocateShard_StartMoveKeys");
state Future<Void> warningLogger = logWarningAfter("StartMoveKeysTooLong", 600, servers);
@ -327,6 +330,7 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
wait(startMoveKeysLock->take(TaskPriority::DataDistributionLaunch));
state FlowLock::Releaser releaser(*startMoveKeysLock);
state bool loadedTssMapping = false;
TraceEvent(SevDebug, interval.begin(), relocationIntervalId);
@ -343,7 +347,8 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
TEST(begin > keys.begin); // Multi-transactional startMoveKeys
batches++;
state Transaction tr(occ);
// RYW to optimize re-reading the same key ranges
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(occ);
state int retries = 0;
loop {
@ -356,15 +361,22 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
// Keep track of shards for all src servers so that we can preserve their values in serverKeys
state Map<UID, VectorRef<KeyRangeRef>> shardMap;
tr.info.taskID = TaskPriority::MoveKeys;
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr->getTransaction().info.taskID = TaskPriority::MoveKeys;
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
wait(checkMoveKeysLock(&tr, lock, ddEnabledState));
wait(checkMoveKeysLock(&(tr->getTransaction()), lock, ddEnabledState));
if (!loadedTssMapping) {
// share transaction for loading tss mapping with the rest of start move keys
wait(readTSSMappingRYW(tr, tssMapping));
loadedTssMapping = true;
}
vector<Future<Optional<Value>>> serverListEntries;
serverListEntries.reserve(servers.size());
for (int s = 0; s < servers.size(); s++)
serverListEntries.push_back(tr.get(serverListKeyFor(servers[s])));
serverListEntries.push_back(tr->get(serverListKeyFor(servers[s])));
state vector<Optional<Value>> serverListValues = wait(getAll(serverListEntries));
for (int s = 0; s < serverListValues.size(); s++) {
@ -380,7 +392,8 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
// Get all existing shards overlapping keys (exclude any that have been processed in a previous
// iteration of the outer loop)
state KeyRange currentKeys = KeyRangeRef(begin, keys.end);
state RangeResult old = wait(krmGetRanges(&tr,
state RangeResult old = wait(krmGetRanges(tr,
keyServersPrefix,
currentKeys,
SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT,
@ -399,10 +412,10 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
// printf("'%s': '%s'\n", old[i].key.toString().c_str(), old[i].value.toString().c_str());
// Check that enough servers for each shard are in the correct state
state RangeResult UIDtoTagMap = wait(tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY));
state RangeResult UIDtoTagMap = wait(tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT(!UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY);
vector<vector<UID>> addAsSource = wait(additionalSources(
old, &tr, servers.size(), SERVER_KNOBS->MAX_ADDED_SOURCES_MULTIPLIER * servers.size()));
old, tr, servers.size(), SERVER_KNOBS->MAX_ADDED_SOURCES_MULTIPLIER * servers.size()));
// For each intersecting range, update keyServers[range] dest to be servers and clear existing dest
// servers from serverKeys
@ -417,7 +430,7 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
// .detail("KeyEnd", rangeIntersectKeys.end.toString())
// .detail("OldSrc", describe(src))
// .detail("OldDest", describe(dest))
// .detail("ReadVersion", tr.getReadVersion().get());
// .detail("ReadVersion", tr->getReadVersion().get());
for (auto& uid : addAsSource[i]) {
src.push_back(uid);
@ -425,7 +438,7 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
uniquify(src);
// Update dest servers for this range to be equal to servers
krmSetPreviouslyEmptyRange(&tr,
krmSetPreviouslyEmptyRange(&(tr->getTransaction()),
keyServersPrefix,
rangeIntersectKeys,
keyServersValue(UIDtoTagMap, src, servers),
@ -455,7 +468,7 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
vector<Future<Void>> actors;
for (oldDest = oldDests.begin(); oldDest != oldDests.end(); ++oldDest)
if (std::find(servers.begin(), servers.end(), *oldDest) == servers.end())
actors.push_back(removeOldDestinations(&tr, *oldDest, shardMap[*oldDest], currentKeys));
actors.push_back(removeOldDestinations(tr, *oldDest, shardMap[*oldDest], currentKeys));
// Update serverKeys to include keys (or the currently processed subset of keys) for each SS in
// servers
@ -464,12 +477,12 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
// to have the same shard boundaries If that invariant was important, we would have to move this
// inside the loop above and also set it for the src servers
actors.push_back(krmSetRangeCoalescing(
&tr, serverKeysPrefixFor(servers[i]), currentKeys, allKeys, serverKeysTrue));
tr, serverKeysPrefixFor(servers[i]), currentKeys, allKeys, serverKeysTrue));
}
wait(waitForAll(actors));
wait(tr.commit());
wait(tr->commit());
/*TraceEvent("StartMoveKeysCommitDone", relocationIntervalId)
.detail("CommitVersion", tr.getCommittedVersion())
@ -481,7 +494,7 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
state Error err = e;
if (err.code() == error_code_move_to_removed_server)
throw;
wait(tr.onError(e));
wait(tr->onError(e));
if (retries % 10 == 0) {
TraceEvent(
@ -500,7 +513,7 @@ ACTOR static Future<Void> startMoveKeys(Database occ,
}
// printf("Committed moving '%s'-'%s' (version %lld)\n", keys.begin.toString().c_str(),
// keys.end.toString().c_str(), tr.getCommittedVersion());
// keys.end.toString().c_str(), tr->getCommittedVersion());
TraceEvent(SevDebug, interval.end(), relocationIntervalId)
.detail("Batches", batches)
.detail("Shards", shards)
@ -536,11 +549,14 @@ ACTOR Future<Void> waitForShardReady(StorageServerInterface server,
}
}
// best effort to also wait for TSS on data move
ACTOR Future<Void> checkFetchingState(Database cx,
vector<UID> dest,
KeyRange keys,
Promise<Void> dataMovementComplete,
UID relocationIntervalId) {
UID relocationIntervalId,
std::map<UID, StorageServerInterface> tssMapping) {
state Transaction tr(cx);
loop {
@ -557,6 +573,7 @@ ACTOR Future<Void> checkFetchingState(Database cx,
serverListEntries.push_back(tr.get(serverListKeyFor(dest[s])));
state vector<Optional<Value>> serverListValues = wait(getAll(serverListEntries));
vector<Future<Void>> requests;
state vector<Future<Void>> tssRequests;
for (int s = 0; s < serverListValues.size(); s++) {
if (!serverListValues[s].present()) {
// FIXME: Is this the right behavior? dataMovementComplete will never be sent!
@ -567,10 +584,25 @@ ACTOR Future<Void> checkFetchingState(Database cx,
ASSERT(si.id() == dest[s]);
requests.push_back(
waitForShardReady(si, keys, tr.getReadVersion().get(), GetShardStateRequest::FETCHING));
auto tssPair = tssMapping.find(si.id());
if (tssPair != tssMapping.end()) {
tssRequests.push_back(waitForShardReady(
tssPair->second, keys, tr.getReadVersion().get(), GetShardStateRequest::FETCHING));
}
}
wait(timeoutError(waitForAll(requests), SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, TaskPriority::MoveKeys));
// If normal servers return normally, give TSS data movement a bit of a chance, but don't block on it, and
// ignore errors in tss requests
if (tssRequests.size()) {
wait(timeout(waitForAllReady(tssRequests),
SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT / 2,
Void(),
TaskPriority::MoveKeys));
}
dataMovementComplete.send(Void());
return Void();
} catch (Error& e) {
@ -593,6 +625,7 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
FlowLock* finishMoveKeysParallelismLock,
bool hasRemote,
UID relocationIntervalId,
std::map<UID, StorageServerInterface> tssMapping,
const DDEnabledState* ddEnabledState) {
state TraceInterval interval("RelocateShard_FinishMoveKeys");
state TraceInterval waitInterval("");
@ -602,6 +635,11 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
state int retries = 0;
state FlowLock::Releaser releaser;
state std::vector<std::pair<UID, UID>> tssToKill;
state std::unordered_set<UID> tssToIgnore;
// try waiting for tss for a 2 loops, give up if they're stuck to not affect the rest of the cluster
state int waitForTSSCounter = 2;
ASSERT(!destinationTeam.empty());
try {
@ -616,9 +654,26 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
state Transaction tr(occ);
// printf("finishMoveKeys( '%s'-'%s' )\n", keys.begin.toString().c_str(), keys.end.toString().c_str());
// printf("finishMoveKeys( '%s'-'%s' )\n", begin.toString().c_str(), keys.end.toString().c_str());
loop {
try {
if (tssToKill.size()) {
TEST(true); // killing TSS because they were unavailable for movekeys
// Kill tss BEFORE committing main txn so that client requests don't make it to the tss when it
// has a different shard set than its pair use a different RYW transaction since i'm too lazy
// (and don't want to add bugs) by changing whole method to RYW. Also, using a different
// transaction makes it commit earlier which we may need to guarantee causality of tss getting
// removed before client sends a request to this key range on the new SS
wait(removeTSSPairsFromCluster(occ, tssToKill));
for (auto& tssPair : tssToKill) {
TraceEvent(SevWarnAlways, "TSS_KillMoveKeys").detail("TSSID", tssPair.second);
tssToIgnore.insert(tssPair.second);
}
tssToKill.clear();
}
tr.info.taskID = TaskPriority::MoveKeys;
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
@ -763,6 +818,8 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
// between
// now and when this transaction commits.
state vector<Future<Void>> serverReady; // only for count below
state vector<Future<Void>> tssReady; // for waiting in parallel with tss
state vector<StorageServerInterface> tssReadyInterfs;
state vector<UID> newDestinations;
std::set<UID> completeSrcSet(completeSrc.begin(), completeSrc.end());
for (auto& it : dest) {
@ -789,22 +846,95 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
storageServerInterfaces.push_back(si);
}
// update client info in case tss mapping changed or server got updated
// Wait for new destination servers to fetch the keys
serverReady.reserve(storageServerInterfaces.size());
for (int s = 0; s < storageServerInterfaces.size(); s++)
tssReady.reserve(storageServerInterfaces.size());
tssReadyInterfs.reserve(storageServerInterfaces.size());
for (int s = 0; s < storageServerInterfaces.size(); s++) {
serverReady.push_back(waitForShardReady(storageServerInterfaces[s],
keys,
tr.getReadVersion().get(),
GetShardStateRequest::READABLE));
wait(timeout(waitForAll(serverReady),
auto tssPair = tssMapping.find(storageServerInterfaces[s].id());
if (tssPair != tssMapping.end() && waitForTSSCounter > 0 &&
!tssToIgnore.count(tssPair->second.id())) {
tssReadyInterfs.push_back(tssPair->second);
tssReady.push_back(waitForShardReady(
tssPair->second, keys, tr.getReadVersion().get(), GetShardStateRequest::READABLE));
}
}
// Wait for all storage server moves, and explicitly swallow errors for tss ones with
// waitForAllReady If this takes too long the transaction will time out and retry, which is ok
wait(timeout(waitForAll(serverReady) && waitForAllReady(tssReady),
SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT,
Void(),
TaskPriority::MoveKeys));
// Check to see if we're waiting only on tss. If so, decrement the waiting counter.
// If the waiting counter is zero, kill the slow/non-responsive tss processes before finalizing the
// data move.
if (tssReady.size()) {
bool allSSDone = true;
for (auto& f : serverReady) {
allSSDone &= f.isReady() && !f.isError();
if (!allSSDone) {
break;
}
}
if (allSSDone) {
bool anyTssNotDone = false;
for (auto& f : tssReady) {
if (!f.isReady() || f.isError()) {
anyTssNotDone = true;
waitForTSSCounter--;
break;
}
}
if (anyTssNotDone && waitForTSSCounter == 0) {
for (int i = 0; i < tssReady.size(); i++) {
if (!tssReady[i].isReady() || tssReady[i].isError()) {
tssToKill.push_back(
std::pair(tssReadyInterfs[i].tssPairID.get(), tssReadyInterfs[i].id()));
}
}
// repeat loop and go back to start to kill tss' before continuing on
continue;
}
}
}
int count = dest.size() - newDestinations.size();
for (int s = 0; s < serverReady.size(); s++)
count += serverReady[s].isReady() && !serverReady[s].isError();
// printf(" fMK: moved data to %d/%d servers\n", count, serverReady.size());
int tssCount = 0;
for (int s = 0; s < tssReady.size(); s++)
tssCount += tssReady[s].isReady() && !tssReady[s].isError();
/*if (tssReady.size()) {
printf(" fMK: [%s - %s) moved data to %d/%d servers and %d/%d tss\n",
begin.toString().c_str(),
keys.end.toString().c_str(),
count,
serverReady.size(),
tssCount,
tssReady.size());
} else {
printf(" fMK: [%s - %s) moved data to %d/%d servers\n",
begin.toString().c_str(),
keys.end.toString().c_str(),
count,
serverReady.size());
}*/
TraceEvent(SevDebug, waitInterval.end(), relocationIntervalId).detail("ReadyServers", count);
if (count == dest.size()) {
@ -862,43 +992,48 @@ ACTOR static Future<Void> finishMoveKeys(Database occ,
}
ACTOR Future<std::pair<Version, Tag>> addStorageServer(Database cx, StorageServerInterface server) {
state Transaction tr(cx);
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
state KeyBackedMap<UID, UID> tssMapDB = KeyBackedMap<UID, UID>(tssMappingKeys.begin);
state int maxSkipTags = 1;
loop {
try {
state Future<RangeResult> fTagLocalities = tr.getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY);
state Future<Optional<Value>> fv = tr.get(serverListKeyFor(server.id()));
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
state Future<Optional<Value>> fExclProc = tr.get(
// FIXME: don't fetch tag localities, all tags, and history tags if tss. Just fetch pair's tag
state Future<RangeResult> fTagLocalities = tr->getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY);
state Future<Optional<Value>> fv = tr->get(serverListKeyFor(server.id()));
state Future<Optional<Value>> fExclProc = tr->get(
StringRef(encodeExcludedServersKey(AddressExclusion(server.address().ip, server.address().port))));
state Future<Optional<Value>> fExclIP =
tr.get(StringRef(encodeExcludedServersKey(AddressExclusion(server.address().ip))));
state Future<Optional<Value>> fFailProc =
tr.get(StringRef(encodeFailedServersKey(AddressExclusion(server.address().ip, server.address().port))));
tr->get(StringRef(encodeExcludedServersKey(AddressExclusion(server.address().ip))));
state Future<Optional<Value>> fFailProc = tr->get(
StringRef(encodeFailedServersKey(AddressExclusion(server.address().ip, server.address().port))));
state Future<Optional<Value>> fFailIP =
tr.get(StringRef(encodeFailedServersKey(AddressExclusion(server.address().ip))));
tr->get(StringRef(encodeFailedServersKey(AddressExclusion(server.address().ip))));
state Future<Optional<Value>> fExclProc2 =
server.secondaryAddress().present()
? tr.get(StringRef(encodeExcludedServersKey(
? tr->get(StringRef(encodeExcludedServersKey(
AddressExclusion(server.secondaryAddress().get().ip, server.secondaryAddress().get().port))))
: Future<Optional<Value>>(Optional<Value>());
state Future<Optional<Value>> fExclIP2 =
server.secondaryAddress().present()
? tr.get(StringRef(encodeExcludedServersKey(AddressExclusion(server.secondaryAddress().get().ip))))
? tr->get(StringRef(encodeExcludedServersKey(AddressExclusion(server.secondaryAddress().get().ip))))
: Future<Optional<Value>>(Optional<Value>());
state Future<Optional<Value>> fFailProc2 =
server.secondaryAddress().present()
? tr.get(StringRef(encodeFailedServersKey(
? tr->get(StringRef(encodeFailedServersKey(
AddressExclusion(server.secondaryAddress().get().ip, server.secondaryAddress().get().port))))
: Future<Optional<Value>>(Optional<Value>());
state Future<Optional<Value>> fFailIP2 =
server.secondaryAddress().present()
? tr.get(StringRef(encodeFailedServersKey(AddressExclusion(server.secondaryAddress().get().ip))))
? tr->get(StringRef(encodeFailedServersKey(AddressExclusion(server.secondaryAddress().get().ip))))
: Future<Optional<Value>>(Optional<Value>());
state Future<RangeResult> fTags = tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY, true);
state Future<RangeResult> fHistoryTags = tr.getRange(serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY, true);
state Future<RangeResult> fTags = tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY, true);
state Future<RangeResult> fHistoryTags = tr->getRange(serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY, true);
wait(success(fTagLocalities) && success(fv) && success(fTags) && success(fHistoryTags) &&
success(fExclProc) && success(fExclIP) && success(fFailProc) && success(fFailIP) &&
@ -914,6 +1049,24 @@ ACTOR Future<std::pair<Version, Tag>> addStorageServer(Database cx, StorageServe
if (fTagLocalities.get().more || fTags.get().more || fHistoryTags.get().more)
ASSERT(false);
state Tag tag;
if (server.isTss()) {
bool foundTag = false;
for (auto& it : fTags.get()) {
UID key = decodeServerTagKey(it.key);
if (key == server.tssPairID.get()) {
tag = decodeServerTagValue(it.value);
foundTag = true;
break;
}
}
if (!foundTag) {
throw recruitment_failed();
}
tssMapDB.set(tr, server.tssPairID.get(), server.id());
} else {
int8_t maxTagLocality = 0;
state int8_t locality = -1;
for (auto& kv : fTagLocalities.get()) {
@ -927,9 +1080,10 @@ ACTOR Future<std::pair<Version, Tag>> addStorageServer(Database cx, StorageServe
if (locality == -1) {
locality = maxTagLocality + 1;
if (locality < 0)
if (locality < 0) {
throw recruitment_failed();
tr.set(tagLocalityListKeyFor(server.locality.dcId()), tagLocalityListValue(locality));
}
tr->set(tagLocalityListKeyFor(server.locality.dcId()), tagLocalityListValue(locality));
}
int skipTags = deterministicRandom()->randomInt(0, maxSkipTags);
@ -962,15 +1116,23 @@ ACTOR Future<std::pair<Version, Tag>> addStorageServer(Database cx, StorageServe
}
tagId += skipTags;
state Tag tag(locality, tagId);
tr.set(serverTagKeyFor(server.id()), serverTagValue(tag));
tr.set(serverListKeyFor(server.id()), serverListValue(server));
KeyRange conflictRange = singleKeyRange(serverTagConflictKeyFor(tag));
tr.addReadConflictRange(conflictRange);
tr.addWriteConflictRange(conflictRange);
tag = Tag(locality, tagId);
wait(tr.commit());
return std::make_pair(tr.getCommittedVersion(), tag);
tr->set(serverTagKeyFor(server.id()), serverTagValue(tag));
KeyRange conflictRange = singleKeyRange(serverTagConflictKeyFor(tag));
tr->addReadConflictRange(conflictRange);
tr->addWriteConflictRange(conflictRange);
if (SERVER_KNOBS->TSS_HACK_IDENTITY_MAPPING) {
// THIS SHOULD NEVER BE ENABLED IN ANY NON-TESTING ENVIRONMENT
TraceEvent(SevError, "TSSIdentityMappingEnabled");
tssMapDB.set(tr, server.id(), server.id());
}
}
tr->set(serverListKeyFor(server.id()), serverListValue(server));
wait(tr->commit());
return std::make_pair(tr->getCommittedVersion(), tag);
} catch (Error& e) {
if (e.code() == error_code_commit_unknown_result)
throw recruitment_failed(); // There is a remote possibility that we successfully added ourselves and
@ -980,12 +1142,12 @@ ACTOR Future<std::pair<Version, Tag>> addStorageServer(Database cx, StorageServe
maxSkipTags = SERVER_KNOBS->MAX_SKIP_TAGS;
}
wait(tr.onError(e));
wait(tr->onError(e));
}
}
}
// A SS can be removed only if all data (shards) on the SS have been moved away from the SS.
ACTOR Future<bool> canRemoveStorageServer(Transaction* tr, UID serverID) {
ACTOR Future<bool> canRemoveStorageServer(Reference<ReadYourWritesTransaction> tr, UID serverID) {
RangeResult keys = wait(krmGetRanges(tr, serverKeysPrefixFor(serverID), allKeys, 2));
ASSERT(keys.size() >= 2);
@ -1005,34 +1167,37 @@ ACTOR Future<bool> canRemoveStorageServer(Transaction* tr, UID serverID) {
ACTOR Future<Void> removeStorageServer(Database cx,
UID serverID,
Optional<UID> tssPairID,
MoveKeysLock lock,
const DDEnabledState* ddEnabledState) {
state Transaction tr(cx);
state KeyBackedMap<UID, UID> tssMapDB = KeyBackedMap<UID, UID>(tssMappingKeys.begin);
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
state bool retry = false;
state int noCanRemoveCount = 0;
loop {
try {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
wait(checkMoveKeysLock(&tr, lock, ddEnabledState));
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
wait(checkMoveKeysLock(&(tr->getTransaction()), lock, ddEnabledState));
TraceEvent("RemoveStorageServerLocked")
.detail("ServerID", serverID)
.detail("Version", tr.getReadVersion().get());
.detail("Version", tr->getReadVersion().get());
state bool canRemove = wait(canRemoveStorageServer(&tr, serverID));
state bool canRemove = wait(canRemoveStorageServer(tr, serverID));
if (!canRemove) {
TEST(true); // The caller had a transaction in flight that assigned keys to the server. Wait for it to
// reverse its mistake.
TraceEvent(SevWarn, "NoCanRemove").detail("Count", noCanRemoveCount++).detail("ServerID", serverID);
wait(delayJittered(SERVER_KNOBS->REMOVE_RETRY_DELAY, TaskPriority::DataDistributionLaunch));
tr.reset();
tr->reset();
TraceEvent("RemoveStorageServerRetrying").detail("CanRemove", canRemove);
} else {
state Future<Optional<Value>> fListKey = tr.get(serverListKeyFor(serverID));
state Future<RangeResult> fTags = tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> fHistoryTags = tr.getRange(serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> fTagLocalities = tr.getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> fTLogDatacenters = tr.getRange(tLogDatacentersKeys, CLIENT_KNOBS->TOO_MANY);
state Future<Optional<Value>> fListKey = tr->get(serverListKeyFor(serverID));
state Future<RangeResult> fTags = tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> fHistoryTags = tr->getRange(serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> fTagLocalities = tr->getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY);
state Future<RangeResult> fTLogDatacenters = tr->getRange(tLogDatacentersKeys, CLIENT_KNOBS->TOO_MANY);
wait(success(fListKey) && success(fTags) && success(fHistoryTags) && success(fTagLocalities) &&
success(fTLogDatacenters));
@ -1072,22 +1237,32 @@ ACTOR Future<Void> removeStorageServer(Database cx,
if (locality >= 0 && !allLocalities.count(locality)) {
for (auto& it : fTagLocalities.get()) {
if (locality == decodeTagLocalityListValue(it.value)) {
tr.clear(it.key);
tr->clear(it.key);
break;
}
}
}
tr.clear(serverListKeyFor(serverID));
tr.clear(serverTagKeyFor(serverID));
tr.clear(serverTagHistoryRangeFor(serverID));
tr->clear(serverListKeyFor(serverID));
tr->clear(serverTagKeyFor(serverID)); // A tss uses this to communicate shutdown but it never has a
// server tag key set in the first place
tr->clear(serverTagHistoryRangeFor(serverID));
if (SERVER_KNOBS->TSS_HACK_IDENTITY_MAPPING) {
// THIS SHOULD NEVER BE ENABLED IN ANY NON-TESTING ENVIRONMENT
TraceEvent(SevError, "TSSIdentityMappingEnabled");
tssMapDB.erase(tr, serverID);
} else if (tssPairID.present()) {
tssMapDB.erase(tr, tssPairID.get());
}
retry = true;
wait(tr.commit());
wait(tr->commit());
return Void();
}
} catch (Error& e) {
state Error err = e;
wait(tr.onError(e));
wait(tr->onError(e));
TraceEvent("RemoveStorageServerRetrying").error(err);
}
}
@ -1180,11 +1355,20 @@ ACTOR Future<Void> moveKeys(Database cx,
const DDEnabledState* ddEnabledState) {
ASSERT(destinationTeam.size());
std::sort(destinationTeam.begin(), destinationTeam.end());
wait(startMoveKeys(
cx, keys, destinationTeam, lock, startMoveKeysParallelismLock, relocationIntervalId, ddEnabledState));
state std::map<UID, StorageServerInterface> tssMapping;
wait(startMoveKeys(cx,
keys,
destinationTeam,
lock,
startMoveKeysParallelismLock,
relocationIntervalId,
&tssMapping,
ddEnabledState));
state Future<Void> completionSignaller =
checkFetchingState(cx, healthyDestinations, keys, dataMovementComplete, relocationIntervalId);
checkFetchingState(cx, healthyDestinations, keys, dataMovementComplete, relocationIntervalId, tssMapping);
wait(finishMoveKeys(cx,
keys,
@ -1193,6 +1377,7 @@ ACTOR Future<Void> moveKeys(Database cx,
finishMoveKeysParallelismLock,
hasRemote,
relocationIntervalId,
tssMapping,
ddEnabledState));
// This is defensive, but make sure that we always say that the movement is complete before moveKeys completes
@ -1228,6 +1413,13 @@ void seedShardServers(Arena& arena, CommitTransactionRef& tr, vector<StorageServ
for (auto& s : servers) {
tr.set(arena, serverTagKeyFor(s.id()), serverTagValue(server_tag[s.id()]));
tr.set(arena, serverListKeyFor(s.id()), serverListValue(s));
if (SERVER_KNOBS->TSS_HACK_IDENTITY_MAPPING) {
// THIS SHOULD NEVER BE ENABLED IN ANY NON-TESTING ENVIRONMENT
TraceEvent(SevError, "TSSIdentityMappingEnabled");
// hack key-backed map here since we can't really change CommitTransactionRef to a RYW transaction
Key uidRef = Codec<UID>::pack(s.id()).pack();
tr.set(arena, uidRef.withPrefix(tssMappingKeys.begin), uidRef);
}
}
std::vector<Tag> serverTags;

View File

@ -89,13 +89,14 @@ ACTOR Future<std::pair<Version, Tag>> addStorageServer(Database cx, StorageServe
ACTOR Future<Void> removeStorageServer(Database cx,
UID serverID,
Optional<UID> tssPairID, // if serverID is a tss, set to its ss pair id
MoveKeysLock lock,
const DDEnabledState* ddEnabledState);
// Removes the given storage server permanently from the database. It must already
// have no shards assigned to it. The storage server MUST NOT be added again after this
// (though a new storage server with a new unique ID may be recruited from the same fdbserver).
ACTOR Future<bool> canRemoveStorageServer(Transaction* tr, UID serverID);
ACTOR Future<bool> canRemoveStorageServer(Reference<ReadYourWritesTransaction> tr, UID serverID);
// Returns true if the given storage server has no keys assigned to it and may be safely removed
// Obviously that could change later!
ACTOR Future<Void> removeKeysFromFailedServer(Database cx,

View File

@ -842,7 +842,7 @@ void commitMessages(Reference<LogData> self,
TEST(true); // Splitting commit messages across multiple blocks
messages1 = StringRef(block.end(), bytes);
block.append(block.arena(), messages.begin(), bytes);
self->messageBlocks.push_back(std::make_pair(version, block));
self->messageBlocks.emplace_back(version, block);
addedBytes += int64_t(block.size()) * SERVER_KNOBS->TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR;
messages = messages.substr(bytes);
}
@ -855,7 +855,7 @@ void commitMessages(Reference<LogData> self,
// Copy messages into block
ASSERT(messages.size() <= block.capacity() - block.size());
block.append(block.arena(), messages.begin(), messages.size());
self->messageBlocks.push_back(std::make_pair(version, block));
self->messageBlocks.emplace_back(version, block);
addedBytes += int64_t(block.size()) * SERVER_KNOBS->TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR;
messages = StringRef(block.end() - messages.size(), messages.size());
@ -873,7 +873,7 @@ void commitMessages(Reference<LogData> self,
int offs = tag->messageOffsets[m];
uint8_t const* p =
offs < messages1.size() ? messages1.begin() + offs : messages.begin() + offs - messages1.size();
tsm->value.version_messages.push_back(std::make_pair(version, LengthPrefixedStringRef((uint32_t*)p)));
tsm->value.version_messages.emplace_back(version, LengthPrefixedStringRef((uint32_t*)p));
if (tsm->value.version_messages.back().second.expectedSize() > SERVER_KNOBS->MAX_MESSAGE_SIZE) {
TraceEvent(SevWarnAlways, "LargeMessage")
.detail("Size", tsm->value.version_messages.back().second.expectedSize());

View File

@ -158,6 +158,7 @@ struct ProxyCommitData {
EventMetricHandle<SingleKeyMutation> singleKeyMutationEvent;
std::map<UID, Reference<StorageInfo>> storageCache;
std::unordered_map<UID, StorageServerInterface> tssMapping;
std::map<Tag, Version> tag_popped;
Deque<std::pair<Version, Version>> txsPopVersions;
Version lastTxsPop;

View File

@ -26,6 +26,7 @@
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/RunTransaction.actor.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/ServerDBInfo.h"
@ -308,10 +309,14 @@ ACTOR Future<int64_t> getMaxStorageServerQueueSize(Database cx, Reference<AsyncV
.detail("SS", servers[i].id());
throw attribute_not_found();
}
messages.push_back(timeoutError(itr->second.eventLogRequest.getReply(
EventLogRequest(StringRef(servers[i].id().toString() + "/StorageMetrics"))),
// Ignore TSS in add delay mode since it can purposefully freeze forever
if (!servers[i].isTss() || !g_network->isSimulated() ||
g_simulator.tssMode != ISimulator::TSSMode::EnabledAddDelay) {
messages.push_back(timeoutError(itr->second.eventLogRequest.getReply(EventLogRequest(
StringRef(servers[i].id().toString() + "/StorageMetrics"))),
1.0));
}
}
wait(waitForAll(messages));
@ -516,7 +521,15 @@ ACTOR Future<bool> getStorageServersRecruiting(Database cx, WorkerInterface dist
1.0));
TraceEvent("StorageServersRecruiting").detail("Message", recruitingMessage.toString());
return recruitingMessage.getValue("State") == "Recruiting";
if (recruitingMessage.getValue("State") == "Recruiting") {
std::string tssValue;
// if we're tss recruiting, that's fine because that can block indefinitely if only 1 free storage process
if (!recruitingMessage.tryGetValue("IsTSS", tssValue) || tssValue == "False") {
return true;
}
}
return false;
} catch (Error& e) {
TraceEvent("QuietDatabaseFailure", distributorWorker.id())
.detail("Reason", "Failed to extract StorageServersRecruiting")
@ -586,6 +599,10 @@ ACTOR Future<Void> waitForQuietDatabase(Database cx,
if (g_network->isSimulated())
wait(delay(5.0));
// The quiet database check (which runs at the end of every test) will always time out due to active data movement.
// To get around this, quiet Database will disable the perpetual wiggle in the setup phase.
wait(setPerpetualStorageWiggle(cx, false, true));
// Require 3 consecutive successful quiet database checks spaced 2 second apart
state int numSuccesses = 0;

View File

@ -54,7 +54,9 @@ StringRef radix_join(const StringRef& key1, const StringRef& key2, Arena& arena)
uint8_t* s = new (arena) uint8_t[rsize];
memcpy(s, key1.begin(), key1.size());
if (key2.size() > 0) {
memcpy(s + key1.size(), key2.begin(), key2.size());
}
return StringRef(s, rsize);
}
@ -591,7 +593,9 @@ StringRef radix_tree::iterator::getKey(uint8_t* content) const {
auto node = m_pointee;
uint32_t pos = m_pointee->m_depth;
while (true) {
if (node->getKeySize() > 0) {
memcpy(content + pos, node->getKey().begin(), node->getKeySize());
}
node = node->m_parent;
if (node == nullptr || pos <= 0)
break;

View File

@ -719,9 +719,11 @@ ACTOR Future<Void> trackEachStorageServer(
when(state std::pair<UID, Optional<StorageServerInterface>> change = waitNext(serverChanges)) {
wait(delay(0)); // prevent storageServerTracker from getting cancelled while on the call stack
if (change.second.present()) {
if (!change.second.get().isTss()) {
auto& a = actors[change.first];
a = Future<Void>();
a = splitError(trackStorageServerQueueInfo(self, change.second.get()), err);
}
} else
actors.erase(change.first);
}

View File

@ -21,6 +21,7 @@
#ifndef FDBSERVER_RATEKEEPERINTERFACE_H
#define FDBSERVER_RATEKEEPERINTERFACE_H
#include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/FDBTypes.h"
#include "fdbrpc/fdbrpc.h"
#include "fdbrpc/Locality.h"
@ -49,29 +50,6 @@ struct RatekeeperInterface {
}
};
struct ClientTagThrottleLimits {
double tpsRate;
double expiration;
ClientTagThrottleLimits() : tpsRate(0), expiration(0) {}
ClientTagThrottleLimits(double tpsRate, double expiration) : tpsRate(tpsRate), expiration(expiration) {}
template <class Archive>
void serialize(Archive& ar) {
// Convert expiration time to a duration to avoid clock differences
double duration = 0;
if (!ar.isDeserializing) {
duration = expiration - now();
}
serializer(ar, tpsRate, duration);
if (ar.isDeserializing) {
expiration = now() + duration;
}
}
};
struct TransactionCommitCostEstimation {
int opsSum = 0;
uint64_t costSum = 0;
@ -91,17 +69,6 @@ struct TransactionCommitCostEstimation {
}
};
// Client-side estimate of a transaction's commit cost, reported for
// tag-based cost accounting.
struct ClientTrCommitCostEstimation {
	int opsCount = 0; // number of counted operations in the transaction
	uint64_t writeCosts = 0; // accumulated write cost estimate
	// (mutation index, cost) entries for clear ranges — presumably refined
	// later where actual range sizes are known; verify against callers.
	std::deque<std::pair<int, uint64_t>> clearIdxCosts;
	uint32_t expensiveCostEstCount = 0; // count of operations flagged as expensive

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, opsCount, writeCosts, clearIdxCosts, expensiveCostEstCount);
	}
};
struct GetRateInfoReply {
constexpr static FileIdentifier file_identifier = 7845006;
double transactionRate;

View File

@ -233,7 +233,7 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self, ResolveTransactionBatc
self->resolvedStateBytes += stateBytes;
if (stateBytes > 0)
self->recentStateTransactionSizes.push_back(std::make_pair(req.version, stateBytes));
self->recentStateTransactionSizes.emplace_back(req.version, stateBytes);
ASSERT(req.version >= firstUnseenVersion);
ASSERT(firstUnseenVersion >= self->debugMinRecentStateVersion);

View File

@ -35,10 +35,10 @@
#include "fdbrpc/Locality.h"
#include "fdbrpc/Stats.h"
#include "fdbserver/CoordinationInterface.h"
#include "fdbclient/RestoreWorkerInterface.actor.h"
#include "fdbserver/MutationTracking.h"
#include "fdbserver/RestoreUtil.h"
#include "fdbserver/RestoreRoleCommon.actor.h"
#include "fdbserver/RestoreWorkerInterface.actor.h"
#include "flow/actorcompiler.h" // has to be last include

View File

@ -35,6 +35,7 @@
#include "fdbclient/NativeAPI.actor.h"
#include "fdbrpc/IAsyncFile.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbserver/Knobs.h"
#include "flow/actorcompiler.h" // has to be last include

View File

@ -34,10 +34,10 @@
#include "fdbrpc/Stats.h"
#include "fdbserver/CoordinationInterface.h"
#include "fdbrpc/Locality.h"
#include "fdbclient/RestoreWorkerInterface.actor.h"
#include "fdbserver/RestoreUtil.h"
#include "fdbserver/RestoreCommon.actor.h"
#include "fdbserver/RestoreRoleCommon.actor.h"
#include "fdbserver/RestoreWorkerInterface.actor.h"
#include "fdbclient/BackupContainer.h"
#include "flow/actorcompiler.h" // has to be last include

View File

@ -37,7 +37,7 @@
#include "fdbrpc/Locality.h"
#include "fdbrpc/Stats.h"
#include "fdbserver/CoordinationInterface.h"
#include "fdbclient/RestoreWorkerInterface.actor.h"
#include "fdbserver/RestoreWorkerInterface.actor.h"
#include "fdbserver/RestoreUtil.h"
#include "flow/actorcompiler.h" // has to be last include

View File

@ -28,6 +28,7 @@
#include "fdbclient/Tuple.h"
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/RestoreInterface.h"
#include "flow/flow.h"
#include "fdbrpc/TimedRequest.h"
#include "fdbrpc/fdbrpc.h"
@ -88,26 +89,6 @@ std::string getHexString(StringRef input);
bool debugFRMutation(const char* context, Version version, MutationRef const& mutation);
// Generic acknowledgement reply used by the fast-restore roles: identifies
// the replying server and whether the request had already been processed.
struct RestoreCommonReply {
	constexpr static FileIdentifier file_identifier = 5808787;
	UID id; // unique ID of the server who sends the reply
	bool isDuplicated; // true when the request was recognized as a repeat

	RestoreCommonReply() = default;
	explicit RestoreCommonReply(UID id, bool isDuplicated = false) : id(id), isDuplicated(isDuplicated) {}

	// Human-readable form, e.g. "ServerNodeID:<uid> isDuplicated:0".
	std::string toString() const {
		std::string result = "ServerNodeID:";
		result += id.toString();
		result += " isDuplicated:";
		result += isDuplicated ? "1" : "0";
		return result;
	}

	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, id, isDuplicated);
	}
};
struct RestoreSimpleRequest : TimedRequest {
constexpr static FileIdentifier file_identifier = 16448937;

View File

@ -189,7 +189,7 @@ ACTOR Future<Void> monitorWorkerLiveness(Reference<RestoreWorkerData> self) {
loop {
std::vector<std::pair<UID, RestoreSimpleRequest>> requests;
for (auto& worker : self->workerInterfaces) {
requests.push_back(std::make_pair(worker.first, RestoreSimpleRequest()));
requests.emplace_back(worker.first, RestoreSimpleRequest());
}
wait(sendBatchRequests(&RestoreWorkerInterface::heartbeat, self->workerInterfaces, requests));
wait(delay(60.0));

View File

@ -33,12 +33,12 @@
#include <cstdint>
#include <cstdarg>
#include "fdbclient/RestoreWorkerInterface.actor.h"
#include "fdbserver/RestoreUtil.h"
#include "fdbserver/RestoreCommon.actor.h"
#include "fdbserver/RestoreRoleCommon.actor.h"
#include "fdbserver/RestoreLoader.actor.h"
#include "fdbserver/RestoreApplier.actor.h"
#include "fdbserver/RestoreWorkerInterface.actor.h"
// Each restore worker (a process) is assigned for a role.
// MAYBE Later: We will support multiple restore roles on a worker

View File

@ -0,0 +1,102 @@
/*
* RestoreWorkerInterface.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbserver/RestoreWorkerInterface.actor.h"
#include "flow/actorcompiler.h" // must be last include
const KeyRef restoreLeaderKey = "\xff\x02/restoreLeader"_sr;
const KeyRangeRef restoreWorkersKeys("\xff\x02/restoreWorkers/"_sr, "\xff\x02/restoreWorkers0"_sr);
const KeyRef restoreStatusKey = "\xff\x02/restoreStatus/"_sr;
const KeyRangeRef restoreApplierKeys("\xff\x02/restoreApplier/"_sr, "\xff\x02/restoreApplier0"_sr);
const KeyRef restoreApplierTxnValue = "1"_sr;
// restoreApplierKeys: track atomic transaction progress to ensure applying atomicOp exactly once
// Builds the per-applier progress key: prefix + applierID + batchIndex + version.
// batchIndex and version arrive in host (little-endian) order and are converted
// to big-endian so the keys sort lexicographically in numeric order.
const Key restoreApplierKeyFor(UID const& applierID, int64_t batchIndex, Version version) {
	BinaryWriter writer(Unversioned());
	writer.serializeBytes(restoreApplierKeys.begin);
	writer << applierID;
	writer << bigEndian64(batchIndex);
	writer << bigEndian64(version);
	return writer.toValue();
}
// Inverse of restoreApplierKeyFor: extracts (applierID, batchIndex, version),
// converting batchIndex and version back from big-endian.
// NOTE(review): the reader starts at the beginning of `key` without skipping
// the restoreApplierKeys prefix — confirm callers strip it first.
std::tuple<UID, int64_t, Version> decodeRestoreApplierKey(ValueRef const& key) {
	BinaryReader reader(key, Unversioned());
	UID applierID;
	int64_t batchIndex;
	Version version;
	reader >> applierID;
	reader >> batchIndex;
	reader >> version;
	return std::make_tuple(applierID, bigEndian64(batchIndex), bigEndian64(version));
}
// Encodes the registration key for a restore worker: prefix + workerID.
const Key restoreWorkerKeyFor(UID const& workerID) {
	BinaryWriter writer(Unversioned());
	writer.serializeBytes(restoreWorkersKeys.begin);
	writer << workerID;
	return writer.toValue();
}
// Serializes a restore worker's interface as a versioned value for storage
// under its restoreWorkersKeys entry.
const Value restoreWorkerInterfaceValue(RestoreWorkerInterface const& cmdInterf) {
	BinaryWriter writer(IncludeVersion(ProtocolVersion::withRestoreWorkerInterfaceValue()));
	writer << cmdInterf;
	return writer.toValue();
}
// Deserializes a RestoreWorkerInterface previously written by
// restoreWorkerInterfaceValue.
RestoreWorkerInterface decodeRestoreWorkerInterfaceValue(ValueRef const& value) {
	BinaryReader reader(value, IncludeVersion());
	RestoreWorkerInterface interf;
	reader >> interf;
	return interf;
}
// Serializes the read version at which a restore request completed.
Value restoreRequestDoneVersionValue(Version readVersion) {
	BinaryWriter writer(IncludeVersion(ProtocolVersion::withRestoreRequestDoneVersionValue()));
	writer << readVersion;
	return writer.toValue();
}
// Deserializes the completion version written by restoreRequestDoneVersionValue.
Version decodeRestoreRequestDoneVersionValue(ValueRef const& value) {
	BinaryReader reader(value, IncludeVersion());
	Version doneVersion;
	reader >> doneVersion;
	return doneVersion;
}
// Deserializes a RestoreRequest from its versioned value encoding.
RestoreRequest decodeRestoreRequestValue(ValueRef const& value) {
	BinaryReader reader(value, IncludeVersion());
	RestoreRequest request;
	reader >> request;
	return request;
}
// Encodes a status key under restoreStatusKey for the given status type.
// TODO: Register restore performance data to restoreStatus key
const Key restoreStatusKeyFor(StringRef statusType) {
	BinaryWriter writer(Unversioned());
	writer.serializeBytes(restoreStatusKey);
	writer << statusType;
	return writer.toValue();
}
// Serializes a status metric as its decimal string form in a versioned value.
const Value restoreStatusValue(double val) {
	BinaryWriter writer(IncludeVersion(ProtocolVersion::withRestoreStatusValue()));
	const std::string text = std::to_string(val);
	writer << StringRef(text); // StringRef borrows `text`; bytes are copied into the writer here
	return writer.toValue();
}

Some files were not shown because too many files have changed in this diff Show More